diff --git a/research/mlperf_object_detection/Mask_RCNN/README.md b/research/mlperf_object_detection/Mask_RCNN/README.md new file mode 100644 index 0000000000000000000000000000000000000000..16bdc70c4bf9152560215bdd8a435940d20051df --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/README.md @@ -0,0 +1 @@ +Mask RCNN Implimentation adopted from models/research/object_detection/ diff --git a/research/mlperf_object_detection/Mask_RCNN/configs/e2e_mask_rcnn_R-50-C4_atrous.config b/research/mlperf_object_detection/Mask_RCNN/configs/e2e_mask_rcnn_R-50-C4_atrous.config new file mode 100644 index 0000000000000000000000000000000000000000..6085c7838582260fad17858ab4f65bd08e6ea14f --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/configs/e2e_mask_rcnn_R-50-C4_atrous.config @@ -0,0 +1,170 @@ +# Mask R-CNN with Resnet-50 (v1), Atrous version +# Configured for MSCOCO Dataset. +# Users should configure the fine_tune_checkpoint field in the train config as +# well as the label_map_path and input_path fields in the train_input_reader and +# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that +# should be configured. + +model { + faster_rcnn { + num_classes: 81 + image_resizer { + keep_aspect_ratio_resizer { + min_dimension: 800 + max_dimension: 1365 + } + } + number_of_stages: 3 + feature_extractor { + type: 'faster_rcnn_resnet50' + first_stage_features_stride: 8 + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.125, 0.25, 0.5, 1.0, 2.0] # base size=256**2 => anchor sizes=32 64 128 256 512 + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 8 + width_stride: 8 + } + } + first_stage_atrous_rate: 2 + first_stage_box_predictor_conv_hyperparams { + op: CONV + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.01 + } + } + } + first_stage_nms_score_threshold: 0.0 + first_stage_nms_iou_threshold: 0.7 + first_stage_max_proposals: 512 + first_stage_localization_loss_weight: 2.0 + first_stage_objectness_loss_weight: 1.0 + initial_crop_size: 14 + maxpool_kernel_size: 2 + maxpool_stride: 2 + second_stage_batch_size: 512 + second_stage_box_predictor { + mask_rcnn_box_predictor { + use_dropout: false + dropout_keep_probability: 1.0 + predict_instance_masks: true + mask_height: 14 + mask_width: 14 + mask_prediction_conv_depth: 0 + mask_prediction_num_conv_layers: 3 #from mask rcnn heads + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + variance_scaling_initializer { + factor: 1.0 + uniform: true + mode: FAN_AVG + } + } + } + conv_hyperparams { + op: CONV + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.01 + } + } + } + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.0 + iou_threshold: 0.6 + max_detections_per_class: 2000 + max_total_detections: 2000 + } + score_converter: SOFTMAX + } + second_stage_localization_loss_weight: 2.0 + second_stage_classification_loss_weight: 1.0 + second_stage_mask_prediction_loss_weight: 4.0 + } +} + +train_config: { + batch_size: 4 + optimizer { + momentum_optimizer: { + learning_rate: { + manual_step_learning_rate { + initial_learning_rate: 0.01 + schedule { + step: 120000 + learning_rate: .001 + } + schedule { + step: 160000 + learning_rate: .0001 + } + } + } + momentum_optimizer_value: 0.9 + } + use_moving_average: false + } + gradient_clipping_by_norm: 10.0 + #fine_tune_checkpoint: 
"/home/mehdisharif/data/coco/resnet_v1_50.ckpt" + #from_detection_checkpoint: True + # Note: The below line limits the training process to 200K steps, which we + # empirically found to be sufficient enough to train the pets dataset. This + # effectively bypasses the learning rate schedule (the learning rate will + # never decay). Remove the below line to train indefinitely. + num_steps: 20000000 + data_augmentation_options { + random_horizontal_flip { + } + } +} + +train_input_reader: { + tf_record_input_reader { + input_path: "/home/mehdisharif/data/coco/output2017/coco_train.record" + } + label_map_path: "/home/mehdisharif/data/coco/output2017/mscoco_label_map.pbtxt" + load_instance_masks: true + mask_type: PNG_MASKS +} + +eval_config: { + metrics_set: ['coco_detection_metrics', 'coco_mask_metrics'] + num_examples: 50 + # Note: The below line limits the evaluation process to 10 evaluations. + # Remove the below line to evaluate indefinitely. + max_evals: 1 + num_visualizations: 50 + eval_interval_secs: 120 +} + +eval_input_reader: { + tf_record_input_reader { + input_path: "/home/mehdisharif/data/coco/output2017/coco_val.record" + } + label_map_path: "/home/mehdisharif/data/coco/output2017/mscoco_label_map.pbtxt" + load_instance_masks: true + mask_type: PNG_MASKS + shuffle: false + num_readers: 1 +} diff --git a/research/mlperf_object_detection/Mask_RCNN/configs/mask_rcnn_resnet50_atrous_coco.config b/research/mlperf_object_detection/Mask_RCNN/configs/mask_rcnn_resnet50_atrous_coco.config new file mode 100644 index 0000000000000000000000000000000000000000..17c8bd5e7c1400938f14eb068bb6ff60f445f109 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/configs/mask_rcnn_resnet50_atrous_coco.config @@ -0,0 +1,169 @@ +# Mask R-CNN with Resnet-50 (v1), Atrous version +# Configured for MSCOCO Dataset. +# Users should configure the fine_tune_checkpoint field in the train config as +# well as the label_map_path and input_path fields in the train_input_reader and +# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that +# should be configured. 
+ +model { + faster_rcnn { + num_classes: 90 + image_resizer { + keep_aspect_ratio_resizer { + min_dimension: 800 + max_dimension: 1365 + } + } + number_of_stages: 3 + feature_extractor { + type: 'faster_rcnn_resnet50' + first_stage_features_stride: 8 + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 8 + width_stride: 8 + } + } + first_stage_atrous_rate: 2 + first_stage_box_predictor_conv_hyperparams { + op: CONV + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.01 + } + } + } + first_stage_nms_score_threshold: 0.0 + first_stage_nms_iou_threshold: 0.7 + first_stage_max_proposals: 300 + first_stage_localization_loss_weight: 2.0 + first_stage_objectness_loss_weight: 1.0 + initial_crop_size: 14 + maxpool_kernel_size: 2 + maxpool_stride: 2 + second_stage_box_predictor { + mask_rcnn_box_predictor { + use_dropout: false + dropout_keep_probability: 1.0 + predict_instance_masks: true + mask_height: 33 + mask_width: 33 + mask_prediction_conv_depth: 0 + mask_prediction_num_conv_layers: 4 + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + variance_scaling_initializer { + factor: 1.0 + uniform: true + mode: FAN_AVG + } + } + } + conv_hyperparams { + op: CONV + regularizer { + l2_regularizer { + weight: 0.0 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.01 + } + } + } + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.0 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 300 + } + score_converter: SOFTMAX + } + second_stage_localization_loss_weight: 2.0 + second_stage_classification_loss_weight: 1.0 + second_stage_mask_prediction_loss_weight: 4.0 + } +} + +train_config: { + batch_size: 2 + optimizer { + momentum_optimizer: { + learning_rate: { + manual_step_learning_rate { + initial_learning_rate: 0.0003 + schedule { + step: 900000 + learning_rate: .00003 + } + schedule { + step: 1200000 + learning_rate: .000003 + } + } + } + momentum_optimizer_value: 0.9 + } + use_moving_average: false + } + gradient_clipping_by_norm: 10.0 + #fine_tune_checkpoint: "" + from_detection_checkpoint: false + # Note: The below line limits the training process to 200K steps, which we + # empirically found to be sufficient enough to train the pets dataset. This + # effectively bypasses the learning rate schedule (the learning rate will + # never decay). Remove the below line to train indefinitely. + #num_steps: 200000 + data_augmentation_options { + random_horizontal_flip { + } + } +} + +train_input_reader: { + tf_record_input_reader { + input_path: "PATH_TO_BE_CONFIGURED/coco_train.record" + } + label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt" + load_instance_masks: true + mask_type: PNG_MASKS +} + +eval_config: { + metrics_set: ['coco_detection_metrics', 'coco_mask_metrics'] + num_examples: 50 + # Note: The below line limits the evaluation process to 10 evaluations. + # Remove the below line to evaluate indefinitely. 
+ max_evals: 1 + num_visualizations: 50 + eval_interval_secs: 120 +} + +eval_input_reader: { + tf_record_input_reader { + input_path: "PATH_TO_BE_CONFIGURED/coco_val.record" + } + label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt" + load_instance_masks: true + mask_type: PNG_MASKS + shuffle: true + num_readers: 1 +} diff --git a/research/mlperf_object_detection/Mask_RCNN/mask_rcnn_run_loop.py b/research/mlperf_object_detection/Mask_RCNN/mask_rcnn_run_loop.py new file mode 100644 index 0000000000000000000000000000000000000000..70c9185c94d25b39c2f778ed6c6a769d8aa3108c --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/mask_rcnn_run_loop.py @@ -0,0 +1,243 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Training and evaluation for Mask_RCNN. + + This module repeatedly runs 1 training epoch and then evaluation + ##add explanation for all the options!!!!!!! +""" + +import functools +import json +import os + +from object_detection import evaluator +from object_detection import trainer +from object_detection.builders import dataset_builder +from object_detection.builders import graph_rewriter_builder +from object_detection.builders import model_builder +from object_detection.utils import config_util +from object_detection.utils import dataset_util +from object_detection.utils import label_map_util + +import tensorflow as tf + +tf.logging.set_verbosity(tf.logging.INFO) + +flags = tf.app.flags +flags.DEFINE_string('master', '', 'Name of the TensorFlow master to use.') +flags.DEFINE_integer('task', 0, 'task id') +flags.DEFINE_integer('num_clones', 1, 'Number of clones to deploy per worker.') +flags.DEFINE_boolean('clone_on_cpu', False, + 'Force clones to be deployed on CPU. Note that even if ' + 'set to False (allowing ops to run on gpu), some ops may ' + 'still be run on the CPU if they have no GPU kernel.') +flags.DEFINE_integer('worker_replicas', 1, 'Number of worker+trainer ' + 'replicas.') +flags.DEFINE_integer('parameter_server_tasks', 0, + 'Number of parameter server tasks. If None, does not use ' + 'a parameter server.') +flags.DEFINE_string('train_dir', '', + 'Directory to save the checkpoints and training summaries.') + +flags.DEFINE_string('pipeline_config_path', '', + 'Path to a pipeline_pb2.TrainEvalPipelineConfig config ' + 'file. If provided, other configs are ignored') + +flags.DEFINE_boolean('eval_training_data', False, + 'If training data should be evaluated for this job.') + +flags.DEFINE_string('eval_dir', '', + 'Directory to write eval summaries to.') + +flags.DEFINE_boolean('run_once', False, 'Option to only run a single pass of ' + 'evaluation. 
Overrides the `max_evals`' + ' parameter in the provided config.') +flags.DEFINE_float('box_min_ap', -1, 'Option to run until the box average' + 'precision reaches this number') +flags.DEFINE_float('mask_min_ap', -1, 'Option to run until the mask average' + 'precision reaches this number') +flags.DEFINE_integer('epochs_between_evals', 1, 'Number of training epochs to ' + 'run before running eval.') +FLAGS = flags.FLAGS + + +def stopping_criteria_met(eval_metrics, mask_min_ap, box_min_ap): + """Returns true if both of the min precision criteria are met in the given + evaluation metrics. + + Args: + eval_metrics: dict of metrics names as keys and their corresponding values, + containing "DetectionMasks_Precision/mAP", and + "DetectionBoxes_Precision/mAP" fields. + mask_min_ap: minimum desired mask average precision, will be ignored if -1 + box_min_ap: minimum desired box average precision, will be ignored if -1 + + Returns: + True if non -1 criteria are met, false o.w. + """ + assert mask_min_ap == -1 or 0 < mask_min_ap < 1 + assert box_min_ap == -1 or 0 < box_min_ap < 1 + try: + mask_mAP_reached = eval_metrics['DetectionMasks_Precision/mAP'] + box_mAP_reached = eval_metrics['DetectionBoxes_Precision/mAP'] + except KeyError as err: + raise Exception('eval_metrics dict does not contain the mAP field') from err + + return (mask_min_ap == -1 or mask_mAP_reached > mask_min_ap) & \ + (box_min_ap == -1 or box_mAP_reached > box_min_ap) & \ + (mask_min_ap != -1 or box_min_ap != -1) + + +def main(_): + assert FLAGS.train_dir, '`train_dir` is missing.' + assert FLAGS.pipeline_config_path, '`pipeline_config_path` is missing' + assert FLAGS.eval_dir, '`eval_dir` is missing.' + + configs = config_util.get_configs_from_pipeline_file( + FLAGS.pipeline_config_path) + if FLAGS.task == 0: + tf.gfile.MakeDirs(FLAGS.train_dir) + tf.gfile.Copy(FLAGS.pipeline_config_path, + os.path.join(FLAGS.train_dir, 'pipeline.config'), + overwrite=True) + + tf.gfile.MakeDirs(FLAGS.eval_dir) + tf.gfile.Copy(FLAGS.pipeline_config_path, + os.path.join(FLAGS.eval_dir, 'pipeline.config'), + overwrite=True) + + model_config = configs['model'] + + train_config = configs['train_config'] + train_input_config = configs['train_input_config'] + + eval_config = configs['eval_config'] + if FLAGS.eval_training_data: + eval_input_config = configs['train_input_config'] + else: + eval_input_config = configs['eval_input_config'] + + # setting to run evaluation after EPOCHS_BETWEEN_EVALS epochs of training. 
+ # total number of training is set to total_num_epochs provided in the config + if train_config.num_steps: + total_num_epochs = train_config.num_steps + train_config.num_steps = FLAGS.epochs_between_evals + total_training_cycle = total_num_epochs // train_config.num_steps + else: + # TODO(mehdi): make it run indef + total_num_epochs = 20000000 + train_config.num_steps = FLAGS.epochs_between_evals + total_training_cycle = total_num_epochs // train_config.num_steps + + train_model_fn = functools.partial(model_builder.build, + model_config=model_config, + is_training=True) + eval_model_fn = functools.partial(model_builder.build, + model_config=model_config, + is_training=False) + + def get_next(config): + return dataset_util.make_initializable_iterator( + dataset_builder.build(config)).get_next() + + # functions to create a tensor input dictionary for both training & evaluation + train_input_dict_fn = functools.partial(get_next, train_input_config) + eval_input_dict_fn = functools.partial(get_next, eval_input_config) + + # If not explicitly specified in the constructor and the TF_CONFIG + # environment variable is present, load cluster_spec from TF_CONFIG. + env = json.loads(os.environ.get('TF_CONFIG', '{}')) + cluster_data = env.get('cluster', None) + cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None + task_data = env.get('task', {'type': 'master', 'index': 0}) + task_info = type('TaskSpec', (object,), task_data) + + # Parameters for a single worker. + parameter_server_tasks = 0 + worker_replicas = 1 + worker_job_name = 'lonely_worker' + task = 0 + is_chief = True + master = '' + + if cluster_data and 'worker' in cluster_data: + # Number of total worker replicas include "worker"s and the "master". + worker_replicas = len(cluster_data['worker']) + 1 + if cluster_data and 'ps' in cluster_data: + parameter_server_tasks = len(cluster_data['ps']) + + if worker_replicas > 1 and parameter_server_tasks < 1: + raise ValueError('At least 1 ps task is needed for distributed training.') + + if worker_replicas >= 1 and parameter_server_tasks > 0: + # Set up distributed training. 
+ server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', + job_name=task_info.type, + task_index=task_info.index) + if task_info.type == 'ps': + server.join() + return + + worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) + task = task_info.index + is_chief = (task_info.type == 'master') + master = server.target + + label_map = label_map_util.load_labelmap(eval_input_config.label_map_path) + max_num_classes = max([item.id for item in label_map.item]) + categories = label_map_util.convert_label_map_to_categories(label_map, + max_num_classes) + + if FLAGS.run_once: + eval_config.max_evals = 1 + + train_graph_rewriter_fn = eval_graph_rewriter_fn = None + if 'graph_rewriter_config' in configs: + train_graph_rewriter_fn = graph_rewriter_builder.build( + configs['graph_rewriter_config'], is_training=True) + eval_graph_rewriter_fn = graph_rewriter_builder.build( + configs['eval_rewriter_config'], is_training=False) + + def train(): + return trainer.train(create_tensor_dict_fn=train_input_dict_fn, + create_model_fn=train_model_fn, + train_config=train_config, master=master, task=task, + num_clones=FLAGS.num_clones, + worker_replicas=worker_replicas, + clone_on_cpu=FLAGS.clone_on_cpu, + ps_tasks=parameter_server_tasks, + worker_job_name=worker_job_name, + is_chief=is_chief, train_dir=FLAGS.train_dir, + graph_hook_fn=train_graph_rewriter_fn) + + def evaluate(): + return evaluator.evaluate(eval_input_dict_fn, eval_model_fn, eval_config, + categories, FLAGS.train_dir, FLAGS.eval_dir, + graph_hook_fn=eval_graph_rewriter_fn) + + for cycle_index in range(total_training_cycle): + tf.logging.info('Starting a training cycle: %d/%d', + cycle_index, total_training_cycle) + train() + tf.logging.info('Starting to evaluate.') + eval_metrics = evaluate() + if stopping_criteria_met(eval_metrics, FLAGS.mask_min_ap, FLAGS.box_min_ap): + tf.logging.info('Stopping criteria met. Training stopped') + break + + +if __name__ == '__main__': + tf.app.run() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/CONTRIBUTING.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..e3d87e3ce90fb4dd22b00a2c5368bf17c3610661 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/CONTRIBUTING.md @@ -0,0 +1,13 @@ +# Contributing to the Tensorflow Object Detection API + +Patches to Tensorflow Object Detection API are welcome! + +We require contributors to fill out either the individual or corporate +Contributor License Agreement (CLA). + + * If you are an individual writing original source code and you're sure you own the intellectual property, then you'll need to sign an [individual CLA](http://code.google.com/legal/individual-cla-v1.0.html). + * If you work for a company that wants to allow you to contribute your work, then you'll need to sign a [corporate CLA](http://code.google.com/legal/corporate-cla-v1.0.html). + +Please follow the +[Tensorflow contributing guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md) +when submitting pull requests. 
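For reference, the run loop in mask_rcnn_run_loop.py above alternates training and COCO evaluation, and it stops early only when the thresholds passed through --box_min_ap and --mask_min_ap are met; a value of -1 disables the corresponding criterion, and with both at -1 the loop always runs all training cycles. A small sketch of how stopping_criteria_met behaves, using hypothetical metric values (the thresholds below are illustrative only):

```python
# Assumes mask_rcnn_run_loop.py and the object_detection package are importable.
from mask_rcnn_run_loop import stopping_criteria_met

# Hypothetical evaluation output; the keys are the ones stopping_criteria_met
# reads from the metrics returned by evaluator.evaluate().
eval_metrics = {
    'DetectionBoxes_Precision/mAP': 0.38,
    'DetectionMasks_Precision/mAP': 0.33,
}

print(stopping_criteria_met(eval_metrics, mask_min_ap=0.30, box_min_ap=0.35))  # True: both mAPs above their thresholds
print(stopping_criteria_met(eval_metrics, mask_min_ap=0.35, box_min_ap=0.35))  # False: mask mAP is not above 0.35
print(stopping_criteria_met(eval_metrics, mask_min_ap=-1, box_min_ap=0.35))    # True: mask criterion disabled
print(stopping_criteria_met(eval_metrics, mask_min_ap=-1, box_min_ap=-1))      # False: both disabled, never stop early
```

The amount of training between successive evaluations is set by --epochs_between_evals, which the script copies into train_config.num_steps before the training/evaluation cycles start.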
diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/README.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/README.md new file mode 100644 index 0000000000000000000000000000000000000000..52bf3565ede8269b90bd148f86c2bb73b4fc112f --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/README.md @@ -0,0 +1,190 @@ + +# Tensorflow Object Detection API +Creating accurate machine learning models capable of localizing and identifying +multiple objects in a single image remains a core challenge in computer vision. +The TensorFlow Object Detection API is an open source framework built on top of +TensorFlow that makes it easy to construct, train and deploy object detection +models. At Google we’ve certainly found this codebase to be useful for our +computer vision needs, and we hope that you will as well. +

+Contributions to the codebase are welcome and we would love to hear back from +you if you find this API useful. Finally if you use the Tensorflow Object +Detection API for a research publication, please consider citing: + +``` +"Speed/accuracy trade-offs for modern convolutional object detectors." +Huang J, Rathod V, Sun C, Zhu M, Korattikara A, Fathi A, Fischer I, Wojna Z, +Song Y, Guadarrama S, Murphy K, CVPR 2017 +``` +\[[link](https://arxiv.org/abs/1611.10012)\]\[[bibtex]( +https://scholar.googleusercontent.com/scholar.bib?q=info:l291WsrB-hQJ:scholar.google.com/&output=citation&scisig=AAGBfm0AAAAAWUIIlnPZ_L9jxvPwcC49kDlELtaeIyU-&scisf=4&ct=citation&cd=-1&hl=en&scfhb=1)\] + +

+ +## Maintainers + +* Jonathan Huang, github: [jch1](https://github.com/jch1) +* Vivek Rathod, github: [tombstone](https://github.com/tombstone) +* Ronny Votel, github: [ronnyvotel](https://github.com/ronnyvotel) +* Derek Chow, github: [derekjchow](https://github.com/derekjchow) +* Chen Sun, github: [jesu9](https://github.com/jesu9) +* Menglong Zhu, github: [dreamdragon](https://github.com/dreamdragon) +* Alireza Fathi, github: [afathi3](https://github.com/afathi3) +* Zhichao Lu, github: [pkulzc](https://github.com/pkulzc) + + +## Table of contents + +Quick Start: + + * + Quick Start: Jupyter notebook for off-the-shelf inference
+ * Quick Start: Training a pet detector
+ +Setup: + + * Installation
+ * Configuring an object detection pipeline
+ * Preparing inputs
+ +Running: + + * Running locally
+ * Running on the cloud
+ +Extras: + + * Tensorflow detection model zoo
+ * Exporting a trained model for inference
+ * Defining your own model architecture
+ * Bringing in your own dataset
+ * Supported object detection evaluation protocols
+ * Inference and evaluation on the Open Images dataset
+ * Run an instance segmentation model
+ +## Getting Help + +To get help with issues you may encounter using the Tensorflow Object Detection +API, create a new question on [StackOverflow](https://stackoverflow.com/) with +the tags "tensorflow" and "object-detection". + +Please report bugs (actually broken code, not usage questions) to the +tensorflow/models GitHub +[issue tracker](https://github.com/tensorflow/models/issues), prefixing the +issue name with "object_detection". + +Please check [FAQ](g3doc/faq.md) for frequently asked questions before +reporting an issue. + + +## Release information + +### April 30, 2018 + +We have released a Faster R-CNN detector with ResNet-101 feature extractor trained on [AVA](https://research.google.com/ava/) v2.1. +Compared with other commonly used object detectors, it changes the action classification loss function to per-class Sigmoid loss to handle boxes with multiple labels. +The model is trained on the training split of AVA v2.1 for 1.5M iterations, it achieves mean AP of 11.25% over 60 classes on the validation split of AVA v2.1. +For more details please refer to this [paper](https://arxiv.org/abs/1705.08421). + +Thanks to contributors: Chen Sun, David Ross + +### April 2, 2018 + +Supercharge your mobile phones with the next generation mobile object detector! +We are adding support for MobileNet V2 with SSDLite presented in +[MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381). +This model is 35% faster than Mobilenet V1 SSD on a Google Pixel phone CPU (200ms vs. 270ms) at the same accuracy. +Along with the model definition, we are also releasing a model checkpoint trained on the COCO dataset. + +Thanks to contributors: Menglong Zhu, Mark Sandler, Zhichao Lu, Vivek Rathod, Jonathan Huang + +### February 9, 2018 + +We now support instance segmentation!! In this API update we support a number of instance segmentation models similar to those discussed in the [Mask R-CNN paper](https://arxiv.org/abs/1703.06870). For further details refer to +[our slides](http://presentations.cocodataset.org/Places17-GMRI.pdf) from the 2017 Coco + Places Workshop. +Refer to the section on [Running an Instance Segmentation Model](g3doc/instance_segmentation.md) for instructions on how to configure a model +that predicts masks in addition to object bounding boxes. + +Thanks to contributors: Alireza Fathi, Zhichao Lu, Vivek Rathod, Ronny Votel, Jonathan Huang + +### November 17, 2017 + +As a part of the Open Images V3 release we have released: + +* An implementation of the Open Images evaluation metric and the [protocol](g3doc/evaluation_protocols.md#open-images). +* Additional tools to separate inference of detection and evaluation (see [this tutorial](g3doc/oid_inference_and_evaluation.md)). +* A new detection model trained on the Open Images V2 data release (see [Open Images model](g3doc/detection_model_zoo.md#open-images-models)). + +See more information on the [Open Images website](https://github.com/openimages/dataset)! + +Thanks to contributors: Stefan Popov, Alina Kuznetsova + +### November 6, 2017 + +We have re-released faster versions of our (pre-trained) models in the +model zoo. In addition to what +was available before, we are also adding Faster R-CNN models trained on COCO +with Inception V2 and Resnet-50 feature extractors, as well as a Faster R-CNN +with Resnet-101 model trained on the KITTI dataset. + +Thanks to contributors: Jonathan Huang, Vivek Rathod, Derek Chow, +Tal Remez, Chen Sun. 
+ +### October 31, 2017 + +We have released a new state-of-the-art model for object detection using +the Faster-RCNN with the +[NASNet-A image featurization](https://arxiv.org/abs/1707.07012). This +model achieves mAP of 43.1% on the test-dev validation dataset for COCO, +improving on the best available model in the zoo by 6% in terms +of absolute mAP. + +Thanks to contributors: Barret Zoph, Vijay Vasudevan, Jonathon Shlens, Quoc Le + +### August 11, 2017 + +We have released an update to the [Android Detect +demo](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android) +which will now run models trained using the Tensorflow Object +Detection API on an Android device. By default, it currently runs a +frozen SSD w/Mobilenet detector trained on COCO, but we encourage +you to try out other detection models! + +Thanks to contributors: Jonathan Huang, Andrew Harp + + +### June 15, 2017 + +In addition to our base Tensorflow detection model definitions, this +release includes: + +* A selection of trainable detection models, including: + * Single Shot Multibox Detector (SSD) with MobileNet, + * SSD with Inception V2, + * Region-Based Fully Convolutional Networks (R-FCN) with Resnet 101, + * Faster RCNN with Resnet 101, + * Faster RCNN with Inception Resnet v2 +* Frozen weights (trained on the COCO dataset) for each of the above models to + be used for out-of-the-box inference purposes. +* A [Jupyter notebook](object_detection_tutorial.ipynb) for performing + out-of-the-box inference with one of our released models +* Convenient [local training](g3doc/running_locally.md) scripts as well as + distributed training and evaluation pipelines via + [Google Cloud](g3doc/running_on_cloud.md). + + +Thanks to contributors: Jonathan Huang, Vivek Rathod, Derek Chow, +Chen Sun, Menglong Zhu, Matthew Tang, Anoop Korattikara, Alireza Fathi, Ian Fischer, Zbigniew Wojna, Yang Song, Sergio Guadarrama, Jasper Uijlings, +Viacheslav Kovalevskyi, Kevin Murphy + diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/grid_anchor_generator.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/grid_anchor_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..ba43f0135481e433402b77e17a5db39a90ace8be --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/grid_anchor_generator.py @@ -0,0 +1,205 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Generates grid anchors on the fly as used in Faster RCNN. + +Generates grid anchors on the fly as described in: +"Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks" +Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. +""" + +import tensorflow as tf + +from object_detection.core import anchor_generator +from object_detection.core import box_list +from object_detection.utils import ops + + +class GridAnchorGenerator(anchor_generator.AnchorGenerator): + """Generates a grid of anchors at given scales and aspect ratios.""" + + def __init__(self, + scales=(0.5, 1.0, 2.0), + aspect_ratios=(0.5, 1.0, 2.0), + base_anchor_size=None, + anchor_stride=None, + anchor_offset=None): + """Constructs a GridAnchorGenerator. + + Args: + scales: a list of (float) scales, default=(0.5, 1.0, 2.0) + aspect_ratios: a list of (float) aspect ratios, default=(0.5, 1.0, 2.0) + base_anchor_size: base anchor size as height, width ( + (length-2 float32 list or tensor, default=[256, 256]) + anchor_stride: difference in centers between base anchors for adjacent + grid positions (length-2 float32 list or tensor, + default=[16, 16]) + anchor_offset: center of the anchor with scale and aspect ratio 1 for the + upper left element of the grid, this should be zero for + feature networks with only VALID padding and even receptive + field size, but may need additional calculation if other + padding is used (length-2 float32 list or tensor, + default=[0, 0]) + """ + # Handle argument defaults + if base_anchor_size is None: + base_anchor_size = [256, 256] + base_anchor_size = tf.to_float(tf.convert_to_tensor(base_anchor_size)) + if anchor_stride is None: + anchor_stride = [16, 16] + anchor_stride = tf.to_float(tf.convert_to_tensor(anchor_stride)) + if anchor_offset is None: + anchor_offset = [0, 0] + anchor_offset = tf.to_float(tf.convert_to_tensor(anchor_offset)) + + self._scales = scales + self._aspect_ratios = aspect_ratios + self._base_anchor_size = base_anchor_size + self._anchor_stride = anchor_stride + self._anchor_offset = anchor_offset + + def name_scope(self): + return 'GridAnchorGenerator' + + def num_anchors_per_location(self): + """Returns the number of anchors per spatial location. + + Returns: + a list of integers, one for each expected feature map to be passed to + the `generate` function. + """ + return [len(self._scales) * len(self._aspect_ratios)] + + def _generate(self, feature_map_shape_list): + """Generates a collection of bounding boxes to be used as anchors. + + Args: + feature_map_shape_list: list of pairs of convnet layer resolutions in the + format [(height_0, width_0)]. For example, setting + feature_map_shape_list=[(8, 8)] asks for anchors that correspond + to an 8x8 layer. For this anchor generator, only lists of length 1 are + allowed. + + Returns: + boxes_list: a list of BoxLists each holding anchor boxes corresponding to + the input feature map shapes. + + Raises: + ValueError: if feature_map_shape_list, box_specs_list do not have the same + length. 
+ ValueError: if feature_map_shape_list does not consist of pairs of + integers + """ + if not (isinstance(feature_map_shape_list, list) + and len(feature_map_shape_list) == 1): + raise ValueError('feature_map_shape_list must be a list of length 1.') + if not all([isinstance(list_item, tuple) and len(list_item) == 2 + for list_item in feature_map_shape_list]): + raise ValueError('feature_map_shape_list must be a list of pairs.') + grid_height, grid_width = feature_map_shape_list[0] + scales_grid, aspect_ratios_grid = ops.meshgrid(self._scales, + self._aspect_ratios) + scales_grid = tf.reshape(scales_grid, [-1]) + aspect_ratios_grid = tf.reshape(aspect_ratios_grid, [-1]) + anchors = tile_anchors(grid_height, + grid_width, + scales_grid, + aspect_ratios_grid, + self._base_anchor_size, + self._anchor_stride, + self._anchor_offset) + + num_anchors = anchors.num_boxes_static() + if num_anchors is None: + num_anchors = anchors.num_boxes() + anchor_indices = tf.zeros([num_anchors]) + anchors.add_field('feature_map_index', anchor_indices) + return [anchors] + + +def tile_anchors(grid_height, + grid_width, + scales, + aspect_ratios, + base_anchor_size, + anchor_stride, + anchor_offset): + """Create a tiled set of anchors strided along a grid in image space. + + This op creates a set of anchor boxes by placing a "basis" collection of + boxes with user-specified scales and aspect ratios centered at evenly + distributed points along a grid. The basis collection is specified via the + scale and aspect_ratios arguments. For example, setting scales=[.1, .2, .2] + and aspect ratios = [2,2,1/2] means that we create three boxes: one with scale + .1, aspect ratio 2, one with scale .2, aspect ratio 2, and one with scale .2 + and aspect ratio 1/2. Each box is multiplied by "base_anchor_size" before + placing it over its respective center. + + Grid points are specified via grid_height, grid_width parameters as well as + the anchor_stride and anchor_offset parameters. + + Args: + grid_height: size of the grid in the y direction (int or int scalar tensor) + grid_width: size of the grid in the x direction (int or int scalar tensor) + scales: a 1-d (float) tensor representing the scale of each box in the + basis set. + aspect_ratios: a 1-d (float) tensor representing the aspect ratio of each + box in the basis set. The length of the scales and aspect_ratios tensors + must be equal. 
+ base_anchor_size: base anchor size as [height, width] + (float tensor of shape [2]) + anchor_stride: difference in centers between base anchors for adjacent grid + positions (float tensor of shape [2]) + anchor_offset: center of the anchor with scale and aspect ratio 1 for the + upper left element of the grid, this should be zero for + feature networks with only VALID padding and even receptive + field size, but may need some additional calculation if other + padding is used (float tensor of shape [2]) + Returns: + a BoxList holding a collection of N anchor boxes + """ + ratio_sqrts = tf.sqrt(aspect_ratios) + heights = scales / ratio_sqrts * base_anchor_size[0] + widths = scales * ratio_sqrts * base_anchor_size[1] + + # Get a grid of box centers + y_centers = tf.to_float(tf.range(grid_height)) + y_centers = y_centers * anchor_stride[0] + anchor_offset[0] + x_centers = tf.to_float(tf.range(grid_width)) + x_centers = x_centers * anchor_stride[1] + anchor_offset[1] + x_centers, y_centers = ops.meshgrid(x_centers, y_centers) + + widths_grid, x_centers_grid = ops.meshgrid(widths, x_centers) + heights_grid, y_centers_grid = ops.meshgrid(heights, y_centers) + bbox_centers = tf.stack([y_centers_grid, x_centers_grid], axis=3) + bbox_sizes = tf.stack([heights_grid, widths_grid], axis=3) + bbox_centers = tf.reshape(bbox_centers, [-1, 2]) + bbox_sizes = tf.reshape(bbox_sizes, [-1, 2]) + bbox_corners = _center_size_bbox_to_corners_bbox(bbox_centers, bbox_sizes) + return box_list.BoxList(bbox_corners) + + +def _center_size_bbox_to_corners_bbox(centers, sizes): + """Converts bbox center-size representation to corners representation. + + Args: + centers: a tensor with shape [N, 2] representing bounding box centers + sizes: a tensor with shape [N, 2] representing bounding boxes + + Returns: + corners: tensor with shape [N, 4] representing bounding boxes in corners + representation + """ + return tf.concat([centers - .5 * sizes, centers + .5 * sizes], 1) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/grid_anchor_generator_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/grid_anchor_generator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..8de74aa7ede1c5d26bb72cff3d04e1a1a544f4f3 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/grid_anchor_generator_test.py @@ -0,0 +1,104 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for object_detection.grid_anchor_generator.""" +import numpy as np +import tensorflow as tf + +from object_detection.anchor_generators import grid_anchor_generator +from object_detection.utils import test_case + + +class GridAnchorGeneratorTest(test_case.TestCase): + + def test_construct_single_anchor(self): + """Builds a 1x1 anchor grid to test the size of the output boxes.""" + def graph_fn(): + scales = [0.5, 1.0, 2.0] + aspect_ratios = [0.25, 1.0, 4.0] + anchor_offset = [7, -3] + anchor_generator = grid_anchor_generator.GridAnchorGenerator( + scales, aspect_ratios, anchor_offset=anchor_offset) + anchors_list = anchor_generator.generate(feature_map_shape_list=[(1, 1)]) + anchor_corners = anchors_list[0].get() + return (anchor_corners,) + exp_anchor_corners = [[-121, -35, 135, 29], [-249, -67, 263, 61], + [-505, -131, 519, 125], [-57, -67, 71, 61], + [-121, -131, 135, 125], [-249, -259, 263, 253], + [-25, -131, 39, 125], [-57, -259, 71, 253], + [-121, -515, 135, 509]] + anchor_corners_out = self.execute(graph_fn, []) + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_construct_anchor_grid(self): + def graph_fn(): + base_anchor_size = [10, 10] + anchor_stride = [19, 19] + anchor_offset = [0, 0] + scales = [0.5, 1.0, 2.0] + aspect_ratios = [1.0] + + anchor_generator = grid_anchor_generator.GridAnchorGenerator( + scales, + aspect_ratios, + base_anchor_size=base_anchor_size, + anchor_stride=anchor_stride, + anchor_offset=anchor_offset) + + anchors_list = anchor_generator.generate(feature_map_shape_list=[(2, 2)]) + anchor_corners = anchors_list[0].get() + return (anchor_corners,) + exp_anchor_corners = [[-2.5, -2.5, 2.5, 2.5], [-5., -5., 5., 5.], + [-10., -10., 10., 10.], [-2.5, 16.5, 2.5, 21.5], + [-5., 14., 5, 24], [-10., 9., 10, 29], + [16.5, -2.5, 21.5, 2.5], [14., -5., 24, 5], + [9., -10., 29, 10], [16.5, 16.5, 21.5, 21.5], + [14., 14., 24, 24], [9., 9., 29, 29]] + anchor_corners_out = self.execute(graph_fn, []) + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_construct_anchor_grid_with_dynamic_feature_map_shapes(self): + def graph_fn(feature_map_height, feature_map_width): + base_anchor_size = [10, 10] + anchor_stride = [19, 19] + anchor_offset = [0, 0] + scales = [0.5, 1.0, 2.0] + aspect_ratios = [1.0] + anchor_generator = grid_anchor_generator.GridAnchorGenerator( + scales, + aspect_ratios, + base_anchor_size=base_anchor_size, + anchor_stride=anchor_stride, + anchor_offset=anchor_offset) + + anchors_list = anchor_generator.generate( + feature_map_shape_list=[(feature_map_height, feature_map_width)]) + anchor_corners = anchors_list[0].get() + return (anchor_corners,) + + exp_anchor_corners = [[-2.5, -2.5, 2.5, 2.5], [-5., -5., 5., 5.], + [-10., -10., 10., 10.], [-2.5, 16.5, 2.5, 21.5], + [-5., 14., 5, 24], [-10., 9., 10, 29], + [16.5, -2.5, 21.5, 2.5], [14., -5., 24, 5], + [9., -10., 29, 10], [16.5, 16.5, 21.5, 21.5], + [14., 14., 24, 24], [9., 9., 29, 29]] + anchor_corners_out = self.execute_cpu(graph_fn, + [np.array(2, dtype=np.int32), + np.array(2, dtype=np.int32)]) + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/multiple_grid_anchor_generator.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/multiple_grid_anchor_generator.py new file mode 100644 index 
0000000000000000000000000000000000000000..bd785c171f686f1c524b78efbc7d03dbae4f7940 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/multiple_grid_anchor_generator.py @@ -0,0 +1,336 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Generates grid anchors on the fly corresponding to multiple CNN layers. + +Generates grid anchors on the fly corresponding to multiple CNN layers as +described in: +"SSD: Single Shot MultiBox Detector" +Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, +Cheng-Yang Fu, Alexander C. Berg +(see Section 2.2: Choosing scales and aspect ratios for default boxes) +""" + +import numpy as np + +import tensorflow as tf + +from object_detection.anchor_generators import grid_anchor_generator +from object_detection.core import anchor_generator +from object_detection.core import box_list_ops + + +class MultipleGridAnchorGenerator(anchor_generator.AnchorGenerator): + """Generate a grid of anchors for multiple CNN layers.""" + + def __init__(self, + box_specs_list, + base_anchor_size=None, + anchor_strides=None, + anchor_offsets=None, + clip_window=None): + """Constructs a MultipleGridAnchorGenerator. + + To construct anchors, at multiple grid resolutions, one must provide a + list of feature_map_shape_list (e.g., [(8, 8), (4, 4)]), and for each grid + size, a corresponding list of (scale, aspect ratio) box specifications. + + For example: + box_specs_list = [[(.1, 1.0), (.1, 2.0)], # for 8x8 grid + [(.2, 1.0), (.3, 1.0), (.2, 2.0)]] # for 4x4 grid + + To support the fully convolutional setting, we pass grid sizes in at + generation time, while scale and aspect ratios are fixed at construction + time. + + Args: + box_specs_list: list of list of (scale, aspect ratio) pairs with the + outside list having the same number of entries as feature_map_shape_list + (which is passed in at generation time). + base_anchor_size: base anchor size as [height, width] + (length-2 float tensor, default=[1.0, 1.0]). + The height and width values are normalized to the + minimum dimension of the input height and width, so that + when the base anchor height equals the base anchor + width, the resulting anchor is square even if the input + image is not square. + anchor_strides: list of pairs of strides in pixels (in y and x directions + respectively). For example, setting anchor_strides=[(25, 25), (50, 50)] + means that we want the anchors corresponding to the first layer to be + strided by 25 pixels and those in the second layer to be strided by 50 + pixels in both y and x directions. If anchor_strides=None, they are set + to be the reciprocal of the corresponding feature map shapes. + anchor_offsets: list of pairs of offsets in pixels (in y and x directions + respectively). The offset specifies where we want the center of the + (0, 0)-th anchor to lie for each layer. 
For example, setting + anchor_offsets=[(10, 10), (20, 20)]) means that we want the + (0, 0)-th anchor of the first layer to lie at (10, 10) in pixel space + and likewise that we want the (0, 0)-th anchor of the second layer to + lie at (25, 25) in pixel space. If anchor_offsets=None, then they are + set to be half of the corresponding anchor stride. + clip_window: a tensor of shape [4] specifying a window to which all + anchors should be clipped. If clip_window is None, then no clipping + is performed. + + Raises: + ValueError: if box_specs_list is not a list of list of pairs + ValueError: if clip_window is not either None or a tensor of shape [4] + """ + if isinstance(box_specs_list, list) and all( + [isinstance(list_item, list) for list_item in box_specs_list]): + self._box_specs = box_specs_list + else: + raise ValueError('box_specs_list is expected to be a ' + 'list of lists of pairs') + if base_anchor_size is None: + base_anchor_size = tf.constant([256, 256], dtype=tf.float32) + self._base_anchor_size = base_anchor_size + self._anchor_strides = anchor_strides + self._anchor_offsets = anchor_offsets + if clip_window is not None and clip_window.get_shape().as_list() != [4]: + raise ValueError('clip_window must either be None or a shape [4] tensor') + self._clip_window = clip_window + self._scales = [] + self._aspect_ratios = [] + for box_spec in self._box_specs: + if not all([isinstance(entry, tuple) and len(entry) == 2 + for entry in box_spec]): + raise ValueError('box_specs_list is expected to be a ' + 'list of lists of pairs') + scales, aspect_ratios = zip(*box_spec) + self._scales.append(scales) + self._aspect_ratios.append(aspect_ratios) + + for arg, arg_name in zip([self._anchor_strides, self._anchor_offsets], + ['anchor_strides', 'anchor_offsets']): + if arg and not (isinstance(arg, list) and + len(arg) == len(self._box_specs)): + raise ValueError('%s must be a list with the same length ' + 'as self._box_specs' % arg_name) + if arg and not all([ + isinstance(list_item, tuple) and len(list_item) == 2 + for list_item in arg + ]): + raise ValueError('%s must be a list of pairs.' % arg_name) + + def name_scope(self): + return 'MultipleGridAnchorGenerator' + + def num_anchors_per_location(self): + """Returns the number of anchors per spatial location. + + Returns: + a list of integers, one for each expected feature map to be passed to + the Generate function. + """ + return [len(box_specs) for box_specs in self._box_specs] + + def _generate(self, feature_map_shape_list, im_height=1, im_width=1): + """Generates a collection of bounding boxes to be used as anchors. + + The number of anchors generated for a single grid with shape MxM where we + place k boxes over each grid center is k*M^2 and thus the total number of + anchors is the sum over all grids. In our box_specs_list example + (see the constructor docstring), we would place two boxes over each grid + point on an 8x8 grid and three boxes over each grid point on a 4x4 grid and + thus end up with 2*8^2 + 3*4^2 = 176 anchors in total. The layout of the + output anchors follows the order of how the grid sizes and box_specs are + specified (with box_spec index varying the fastest, followed by width + index, then height index, then grid index). + + Args: + feature_map_shape_list: list of pairs of convnet layer resolutions in the + format [(height_0, width_0), (height_1, width_1), ...]. For example, + setting feature_map_shape_list=[(8, 8), (7, 7)] asks for anchors that + correspond to an 8x8 layer followed by a 7x7 layer. 
+ im_height: the height of the image to generate the grid for. If both + im_height and im_width are 1, the generated anchors default to + normalized coordinates, otherwise absolute coordinates are used for the + grid. + im_width: the width of the image to generate the grid for. If both + im_height and im_width are 1, the generated anchors default to + normalized coordinates, otherwise absolute coordinates are used for the + grid. + + Returns: + boxes_list: a list of BoxLists each holding anchor boxes corresponding to + the input feature map shapes. + + Raises: + ValueError: if feature_map_shape_list, box_specs_list do not have the same + length. + ValueError: if feature_map_shape_list does not consist of pairs of + integers + """ + if not (isinstance(feature_map_shape_list, list) + and len(feature_map_shape_list) == len(self._box_specs)): + raise ValueError('feature_map_shape_list must be a list with the same ' + 'length as self._box_specs') + if not all([isinstance(list_item, tuple) and len(list_item) == 2 + for list_item in feature_map_shape_list]): + raise ValueError('feature_map_shape_list must be a list of pairs.') + + im_height = tf.to_float(im_height) + im_width = tf.to_float(im_width) + + if not self._anchor_strides: + anchor_strides = [(1.0 / tf.to_float(pair[0]), 1.0 / tf.to_float(pair[1])) + for pair in feature_map_shape_list] + else: + anchor_strides = [(tf.to_float(stride[0]) / im_height, + tf.to_float(stride[1]) / im_width) + for stride in self._anchor_strides] + if not self._anchor_offsets: + anchor_offsets = [(0.5 * stride[0], 0.5 * stride[1]) + for stride in anchor_strides] + else: + anchor_offsets = [(tf.to_float(offset[0]) / im_height, + tf.to_float(offset[1]) / im_width) + for offset in self._anchor_offsets] + + for arg, arg_name in zip([anchor_strides, anchor_offsets], + ['anchor_strides', 'anchor_offsets']): + if not (isinstance(arg, list) and len(arg) == len(self._box_specs)): + raise ValueError('%s must be a list with the same length ' + 'as self._box_specs' % arg_name) + if not all([isinstance(list_item, tuple) and len(list_item) == 2 + for list_item in arg]): + raise ValueError('%s must be a list of pairs.' 
% arg_name) + + anchor_grid_list = [] + min_im_shape = tf.minimum(im_height, im_width) + scale_height = min_im_shape / im_height + scale_width = min_im_shape / im_width + base_anchor_size = [ + scale_height * self._base_anchor_size[0], + scale_width * self._base_anchor_size[1] + ] + for feature_map_index, (grid_size, scales, aspect_ratios, stride, + offset) in enumerate( + zip(feature_map_shape_list, self._scales, + self._aspect_ratios, anchor_strides, + anchor_offsets)): + tiled_anchors = grid_anchor_generator.tile_anchors( + grid_height=grid_size[0], + grid_width=grid_size[1], + scales=scales, + aspect_ratios=aspect_ratios, + base_anchor_size=base_anchor_size, + anchor_stride=stride, + anchor_offset=offset) + if self._clip_window is not None: + tiled_anchors = box_list_ops.clip_to_window( + tiled_anchors, self._clip_window, filter_nonoverlapping=False) + num_anchors_in_layer = tiled_anchors.num_boxes_static() + if num_anchors_in_layer is None: + num_anchors_in_layer = tiled_anchors.num_boxes() + anchor_indices = feature_map_index * tf.ones([num_anchors_in_layer]) + tiled_anchors.add_field('feature_map_index', anchor_indices) + anchor_grid_list.append(tiled_anchors) + + return anchor_grid_list + + +def create_ssd_anchors(num_layers=6, + min_scale=0.2, + max_scale=0.95, + scales=None, + aspect_ratios=(1.0, 2.0, 3.0, 1.0 / 2, 1.0 / 3), + interpolated_scale_aspect_ratio=1.0, + base_anchor_size=None, + anchor_strides=None, + anchor_offsets=None, + reduce_boxes_in_lowest_layer=True): + """Creates MultipleGridAnchorGenerator for SSD anchors. + + This function instantiates a MultipleGridAnchorGenerator that reproduces + ``default box`` construction proposed by Liu et al in the SSD paper. + See Section 2.2 for details. Grid sizes are assumed to be passed in + at generation time from finest resolution to coarsest resolution --- this is + used to (linearly) interpolate scales of anchor boxes corresponding to the + intermediate grid sizes. + + Anchors that are returned by calling the `generate` method on the returned + MultipleGridAnchorGenerator object are always in normalized coordinates + and clipped to the unit square: (i.e. all coordinates lie in [0, 1]x[0, 1]). + + Args: + num_layers: integer number of grid layers to create anchors for (actual + grid sizes passed in at generation time) + min_scale: scale of anchors corresponding to finest resolution (float) + max_scale: scale of anchors corresponding to coarsest resolution (float) + scales: As list of anchor scales to use. When not None and not empty, + min_scale and max_scale are not used. + aspect_ratios: list or tuple of (float) aspect ratios to place on each + grid point. + interpolated_scale_aspect_ratio: An additional anchor is added with this + aspect ratio and a scale interpolated between the scale for a layer + and the scale for the next layer (1.0 for the last layer). + This anchor is not included if this value is 0. + base_anchor_size: base anchor size as [height, width]. + The height and width values are normalized to the minimum dimension of the + input height and width, so that when the base anchor height equals the + base anchor width, the resulting anchor is square even if the input image + is not square. + anchor_strides: list of pairs of strides in pixels (in y and x directions + respectively). 
For example, setting anchor_strides=[(25, 25), (50, 50)] + means that we want the anchors corresponding to the first layer to be + strided by 25 pixels and those in the second layer to be strided by 50 + pixels in both y and x directions. If anchor_strides=None, they are set to + be the reciprocal of the corresponding feature map shapes. + anchor_offsets: list of pairs of offsets in pixels (in y and x directions + respectively). The offset specifies where we want the center of the + (0, 0)-th anchor to lie for each layer. For example, setting + anchor_offsets=[(10, 10), (20, 20)]) means that we want the + (0, 0)-th anchor of the first layer to lie at (10, 10) in pixel space + and likewise that we want the (0, 0)-th anchor of the second layer to lie + at (25, 25) in pixel space. If anchor_offsets=None, then they are set to + be half of the corresponding anchor stride. + reduce_boxes_in_lowest_layer: a boolean to indicate whether the fixed 3 + boxes per location is used in the lowest layer. + + Returns: + a MultipleGridAnchorGenerator + """ + if base_anchor_size is None: + base_anchor_size = [1.0, 1.0] + base_anchor_size = tf.constant(base_anchor_size, dtype=tf.float32) + box_specs_list = [] + if scales is None or not scales: + scales = [min_scale + (max_scale - min_scale) * i / (num_layers - 1) + for i in range(num_layers)] + [1.0] + else: + # Add 1.0 to the end, which will only be used in scale_next below and used + # for computing an interpolated scale for the largest scale in the list. + scales += [1.0] + + for layer, scale, scale_next in zip( + range(num_layers), scales[:-1], scales[1:]): + layer_box_specs = [] + if layer == 0 and reduce_boxes_in_lowest_layer: + layer_box_specs = [(0.1, 1.0), (scale, 2.0), (scale, 0.5)] + else: + for aspect_ratio in aspect_ratios: + layer_box_specs.append((scale, aspect_ratio)) + # Add one more anchor, with a scale between the current scale, and the + # scale for the next layer, with a specified aspect ratio (1.0 by + # default). + if interpolated_scale_aspect_ratio > 0.0: + layer_box_specs.append((np.sqrt(scale*scale_next), + interpolated_scale_aspect_ratio)) + box_specs_list.append(layer_box_specs) + + return MultipleGridAnchorGenerator(box_specs_list, base_anchor_size, + anchor_strides, anchor_offsets) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/multiple_grid_anchor_generator_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/multiple_grid_anchor_generator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..070d81d36e79368c9fd46c7f3e03df7a93baee76 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/multiple_grid_anchor_generator_test.py @@ -0,0 +1,289 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for anchor_generators.multiple_grid_anchor_generator_test.py.""" + +import numpy as np + +import tensorflow as tf + +from object_detection.anchor_generators import multiple_grid_anchor_generator as ag +from object_detection.utils import test_case + + +class MultipleGridAnchorGeneratorTest(test_case.TestCase): + + def test_construct_single_anchor_grid(self): + """Builds a 1x1 anchor grid to test the size of the output boxes.""" + def graph_fn(): + + box_specs_list = [[(.5, .25), (1.0, .25), (2.0, .25), + (.5, 1.0), (1.0, 1.0), (2.0, 1.0), + (.5, 4.0), (1.0, 4.0), (2.0, 4.0)]] + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, + base_anchor_size=tf.constant([256, 256], dtype=tf.float32), + anchor_strides=[(16, 16)], + anchor_offsets=[(7, -3)]) + anchors_list = anchor_generator.generate(feature_map_shape_list=[(1, 1)]) + return anchors_list[0].get() + exp_anchor_corners = [[-121, -35, 135, 29], [-249, -67, 263, 61], + [-505, -131, 519, 125], [-57, -67, 71, 61], + [-121, -131, 135, 125], [-249, -259, 263, 253], + [-25, -131, 39, 125], [-57, -259, 71, 253], + [-121, -515, 135, 509]] + + anchor_corners_out = self.execute(graph_fn, []) + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_construct_anchor_grid(self): + def graph_fn(): + box_specs_list = [[(0.5, 1.0), (1.0, 1.0), (2.0, 1.0)]] + + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, + base_anchor_size=tf.constant([10, 10], dtype=tf.float32), + anchor_strides=[(19, 19)], + anchor_offsets=[(0, 0)]) + anchors_list = anchor_generator.generate(feature_map_shape_list=[(2, 2)]) + return anchors_list[0].get() + exp_anchor_corners = [[-2.5, -2.5, 2.5, 2.5], [-5., -5., 5., 5.], + [-10., -10., 10., 10.], [-2.5, 16.5, 2.5, 21.5], + [-5., 14., 5, 24], [-10., 9., 10, 29], + [16.5, -2.5, 21.5, 2.5], [14., -5., 24, 5], + [9., -10., 29, 10], [16.5, 16.5, 21.5, 21.5], + [14., 14., 24, 24], [9., 9., 29, 29]] + + anchor_corners_out = self.execute(graph_fn, []) + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_construct_anchor_grid_non_square(self): + + def graph_fn(): + box_specs_list = [[(1.0, 1.0)]] + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, base_anchor_size=tf.constant([1, 1], + dtype=tf.float32)) + anchors_list = anchor_generator.generate(feature_map_shape_list=[( + tf.constant(1, dtype=tf.int32), tf.constant(2, dtype=tf.int32))]) + return anchors_list[0].get() + + exp_anchor_corners = [[0., -0.25, 1., 0.75], [0., 0.25, 1., 1.25]] + anchor_corners_out = self.execute(graph_fn, []) + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_construct_dynamic_size_anchor_grid(self): + + def graph_fn(height, width): + box_specs_list = [[(1.0, 1.0)]] + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, base_anchor_size=tf.constant([1, 1], + dtype=tf.float32)) + anchors_list = anchor_generator.generate(feature_map_shape_list=[(height, + width)]) + return anchors_list[0].get() + + exp_anchor_corners = [[0., -0.25, 1., 0.75], [0., 0.25, 1., 1.25]] + + anchor_corners_out = self.execute_cpu(graph_fn, + [np.array(1, dtype=np.int32), + np.array(2, dtype=np.int32)]) + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_construct_anchor_grid_normalized(self): + def graph_fn(): + box_specs_list = [[(1.0, 1.0)]] + + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, 
base_anchor_size=tf.constant([1, 1], + dtype=tf.float32)) + anchors_list = anchor_generator.generate( + feature_map_shape_list=[(tf.constant(1, dtype=tf.int32), tf.constant( + 2, dtype=tf.int32))], + im_height=320, + im_width=640) + return anchors_list[0].get() + + exp_anchor_corners = [[0., 0., 1., 0.5], [0., 0.5, 1., 1.]] + anchor_corners_out = self.execute(graph_fn, []) + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_construct_multiple_grids(self): + + def graph_fn(): + box_specs_list = [[(1.0, 1.0), (2.0, 1.0), (1.0, 0.5)], + [(1.0, 1.0), (1.0, 0.5)]] + + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, + base_anchor_size=tf.constant([1.0, 1.0], dtype=tf.float32), + anchor_strides=[(.25, .25), (.5, .5)], + anchor_offsets=[(.125, .125), (.25, .25)]) + anchors_list = anchor_generator.generate(feature_map_shape_list=[(4, 4), ( + 2, 2)]) + return [anchors.get() for anchors in anchors_list] + # height and width of box with .5 aspect ratio + h = np.sqrt(2) + w = 1.0/np.sqrt(2) + exp_small_grid_corners = [[-.25, -.25, .75, .75], + [.25-.5*h, .25-.5*w, .25+.5*h, .25+.5*w], + [-.25, .25, .75, 1.25], + [.25-.5*h, .75-.5*w, .25+.5*h, .75+.5*w], + [.25, -.25, 1.25, .75], + [.75-.5*h, .25-.5*w, .75+.5*h, .25+.5*w], + [.25, .25, 1.25, 1.25], + [.75-.5*h, .75-.5*w, .75+.5*h, .75+.5*w]] + # only test first entry of larger set of anchors + exp_big_grid_corners = [[.125-.5, .125-.5, .125+.5, .125+.5], + [.125-1.0, .125-1.0, .125+1.0, .125+1.0], + [.125-.5*h, .125-.5*w, .125+.5*h, .125+.5*w],] + + anchor_corners_out = np.concatenate(self.execute(graph_fn, []), axis=0) + self.assertEquals(anchor_corners_out.shape, (56, 4)) + big_grid_corners = anchor_corners_out[0:3, :] + small_grid_corners = anchor_corners_out[48:, :] + self.assertAllClose(small_grid_corners, exp_small_grid_corners) + self.assertAllClose(big_grid_corners, exp_big_grid_corners) + + def test_construct_multiple_grids_with_clipping(self): + + def graph_fn(): + box_specs_list = [[(1.0, 1.0), (2.0, 1.0), (1.0, 0.5)], + [(1.0, 1.0), (1.0, 0.5)]] + + clip_window = tf.constant([0, 0, 1, 1], dtype=tf.float32) + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, + base_anchor_size=tf.constant([1.0, 1.0], dtype=tf.float32), + clip_window=clip_window) + anchors_list = anchor_generator.generate(feature_map_shape_list=[(4, 4), ( + 2, 2)]) + return [anchors.get() for anchors in anchors_list] + # height and width of box with .5 aspect ratio + h = np.sqrt(2) + w = 1.0/np.sqrt(2) + exp_small_grid_corners = [[0, 0, .75, .75], + [0, 0, .25+.5*h, .25+.5*w], + [0, .25, .75, 1], + [0, .75-.5*w, .25+.5*h, 1], + [.25, 0, 1, .75], + [.75-.5*h, 0, 1, .25+.5*w], + [.25, .25, 1, 1], + [.75-.5*h, .75-.5*w, 1, 1]] + + anchor_corners_out = np.concatenate(self.execute(graph_fn, []), axis=0) + small_grid_corners = anchor_corners_out[48:, :] + self.assertAllClose(small_grid_corners, exp_small_grid_corners) + + def test_invalid_box_specs(self): + # not all box specs are pairs + box_specs_list = [[(1.0, 1.0), (2.0, 1.0), (1.0, 0.5)], + [(1.0, 1.0), (1.0, 0.5, .3)]] + with self.assertRaises(ValueError): + ag.MultipleGridAnchorGenerator(box_specs_list) + + # box_specs_list is not a list of lists + box_specs_list = [(1.0, 1.0), (2.0, 1.0), (1.0, 0.5)] + with self.assertRaises(ValueError): + ag.MultipleGridAnchorGenerator(box_specs_list) + + def test_invalid_generate_arguments(self): + box_specs_list = [[(1.0, 1.0), (2.0, 1.0), (1.0, 0.5)], + [(1.0, 1.0), (1.0, 0.5)]] + + # incompatible lengths with box_specs_list 
+ with self.assertRaises(ValueError): + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, + base_anchor_size=tf.constant([1.0, 1.0], dtype=tf.float32), + anchor_strides=[(.25, .25)], + anchor_offsets=[(.125, .125), (.25, .25)]) + anchor_generator.generate(feature_map_shape_list=[(4, 4), (2, 2)]) + with self.assertRaises(ValueError): + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, + base_anchor_size=tf.constant([1.0, 1.0], dtype=tf.float32), + anchor_strides=[(.25, .25), (.5, .5)], + anchor_offsets=[(.125, .125), (.25, .25)]) + anchor_generator.generate(feature_map_shape_list=[(4, 4), (2, 2), (1, 1)]) + with self.assertRaises(ValueError): + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, + base_anchor_size=tf.constant([1.0, 1.0], dtype=tf.float32), + anchor_strides=[(.5, .5)], + anchor_offsets=[(.25, .25)]) + anchor_generator.generate(feature_map_shape_list=[(4, 4), (2, 2)]) + + # not pairs + with self.assertRaises(ValueError): + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, + base_anchor_size=tf.constant([1.0, 1.0], dtype=tf.float32), + anchor_strides=[(.25, .25), (.5, .5)], + anchor_offsets=[(.125, .125), (.25, .25)]) + anchor_generator.generate(feature_map_shape_list=[(4, 4, 4), (2, 2)]) + with self.assertRaises(ValueError): + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, + base_anchor_size=tf.constant([1.0, 1.0], dtype=tf.float32), + anchor_strides=[(.25, .25, .1), (.5, .5)], + anchor_offsets=[(.125, .125), (.25, .25)]) + anchor_generator.generate(feature_map_shape_list=[(4, 4), (2, 2)]) + with self.assertRaises(ValueError): + anchor_generator = ag.MultipleGridAnchorGenerator( + box_specs_list, + base_anchor_size=tf.constant([1.0, 1.0], dtype=tf.float32), + anchor_strides=[(.25, .25), (.5, .5)], + anchor_offsets=[(.125, .125), (.25, .25)]) + anchor_generator.generate(feature_map_shape_list=[(4), (2, 2)]) + + +class CreateSSDAnchorsTest(test_case.TestCase): + + def test_create_ssd_anchors_returns_correct_shape(self): + + def graph_fn1(): + anchor_generator = ag.create_ssd_anchors( + num_layers=6, + min_scale=0.2, + max_scale=0.95, + aspect_ratios=(1.0, 2.0, 3.0, 1.0 / 2, 1.0 / 3), + reduce_boxes_in_lowest_layer=True) + + feature_map_shape_list = [(38, 38), (19, 19), (10, 10), + (5, 5), (3, 3), (1, 1)] + anchors_list = anchor_generator.generate( + feature_map_shape_list=feature_map_shape_list) + return [anchors.get() for anchors in anchors_list] + anchor_corners_out = np.concatenate(self.execute(graph_fn1, []), axis=0) + self.assertEquals(anchor_corners_out.shape, (7308, 4)) + + def graph_fn2(): + anchor_generator = ag.create_ssd_anchors( + num_layers=6, min_scale=0.2, max_scale=0.95, + aspect_ratios=(1.0, 2.0, 3.0, 1.0/2, 1.0/3), + reduce_boxes_in_lowest_layer=False) + + feature_map_shape_list = [(38, 38), (19, 19), (10, 10), + (5, 5), (3, 3), (1, 1)] + anchors_list = anchor_generator.generate( + feature_map_shape_list=feature_map_shape_list) + return [anchors.get() for anchors in anchors_list] + anchor_corners_out = np.concatenate(self.execute(graph_fn2, []), axis=0) + self.assertEquals(anchor_corners_out.shape, (11640, 4)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/multiscale_grid_anchor_generator.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/multiscale_grid_anchor_generator.py new file mode 100644 index 
0000000000000000000000000000000000000000..a8d227c77155eb45eb737c86c416d2e3d1fdda83 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/multiscale_grid_anchor_generator.py @@ -0,0 +1,138 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Generates grid anchors on the fly corresponding to multiple CNN layers. + +Generates grid anchors on the fly corresponding to multiple CNN layers as +described in: +"Focal Loss for Dense Object Detection" (https://arxiv.org/abs/1708.02002) +T.-Y. Lin, P. Goyal, R. Girshick, K. He, P. Dollar +""" + +from object_detection.anchor_generators import grid_anchor_generator +from object_detection.core import anchor_generator +from object_detection.core import box_list_ops + + +class MultiscaleGridAnchorGenerator(anchor_generator.AnchorGenerator): + """Generate a grid of anchors for multiple CNN layers of different scale.""" + + def __init__(self, min_level, max_level, anchor_scale, aspect_ratios, + scales_per_octave, normalize_coordinates=True): + """Constructs a MultiscaleGridAnchorGenerator. + + To construct anchors, at multiple scale resolutions, one must provide a + the minimum level and maximum levels on a scale pyramid. To define the size + of anchor, the anchor scale is provided to decide the size relatively to the + stride of the corresponding feature map. The generator allows one pixel + location on feature map maps to multiple anchors, that have different aspect + ratios and intermediate scales. + + Args: + min_level: minimum level in feature pyramid. + max_level: maximum level in feature pyramid. + anchor_scale: anchor scale and feature stride define the size of the base + anchor on an image. For example, given a feature pyramid with strides + [2^3, ..., 2^7] and anchor scale 4. The base anchor size is + 4 * [2^3, ..., 2^7]. + aspect_ratios: list or tuple of (float) aspect ratios to place on each + grid point. + scales_per_octave: integer number of intermediate scales per scale octave. + normalize_coordinates: whether to produce anchors in normalized + coordinates. (defaults to True). 
+ """ + self._anchor_grid_info = [] + self._aspect_ratios = aspect_ratios + self._scales_per_octave = scales_per_octave + self._normalize_coordinates = normalize_coordinates + + for level in range(min_level, max_level + 1): + anchor_stride = [2**level, 2**level] + scales = [] + aspects = [] + for scale in range(scales_per_octave): + scales.append(2**(float(scale) / scales_per_octave)) + for aspect_ratio in aspect_ratios: + aspects.append(aspect_ratio) + base_anchor_size = [2**level * anchor_scale, 2**level * anchor_scale] + self._anchor_grid_info.append({ + 'level': level, + 'info': [scales, aspects, base_anchor_size, anchor_stride] + }) + + def name_scope(self): + return 'MultiscaleGridAnchorGenerator' + + def num_anchors_per_location(self): + """Returns the number of anchors per spatial location. + + Returns: + a list of integers, one for each expected feature map to be passed to + the Generate function. + """ + return len(self._anchor_grid_info) * [ + len(self._aspect_ratios) * self._scales_per_octave] + + def _generate(self, feature_map_shape_list, im_height, im_width): + """Generates a collection of bounding boxes to be used as anchors. + + Currently we require the input image shape to be statically defined. That + is, im_height and im_width should be integers rather than tensors. + + Args: + feature_map_shape_list: list of pairs of convnet layer resolutions in the + format [(height_0, width_0), (height_1, width_1), ...]. For example, + setting feature_map_shape_list=[(8, 8), (7, 7)] asks for anchors that + correspond to an 8x8 layer followed by a 7x7 layer. + im_height: the height of the image to generate the grid for. + im_width: the width of the image to generate the grid for. + + Returns: + boxes_list: a list of BoxLists each holding anchor boxes corresponding to + the input feature map shapes. + Raises: + ValueError: if im_height and im_width are not integers. 
+ """ + if not isinstance(im_height, int) or not isinstance(im_width, int): + raise ValueError('MultiscaleGridAnchorGenerator currently requires ' + 'input image shape to be statically defined.') + anchor_grid_list = [] + for feat_shape, grid_info in zip(feature_map_shape_list, + self._anchor_grid_info): + # TODO(rathodv) check the feature_map_shape_list is consistent with + # self._anchor_grid_info + level = grid_info['level'] + stride = 2**level + scales, aspect_ratios, base_anchor_size, anchor_stride = grid_info['info'] + feat_h = feat_shape[0] + feat_w = feat_shape[1] + anchor_offset = [0, 0] + if im_height % 2.0**level == 0: + anchor_offset[0] = stride / 2.0 + if im_width % 2.0**level == 0: + anchor_offset[1] = stride / 2.0 + ag = grid_anchor_generator.GridAnchorGenerator( + scales, + aspect_ratios, + base_anchor_size=base_anchor_size, + anchor_stride=anchor_stride, + anchor_offset=anchor_offset) + (anchor_grid,) = ag.generate(feature_map_shape_list=[(feat_h, feat_w)]) + + if self._normalize_coordinates: + anchor_grid = box_list_ops.to_normalized_coordinates( + anchor_grid, im_height, im_width, check_range=False) + anchor_grid_list.append(anchor_grid) + + return anchor_grid_list diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/multiscale_grid_anchor_generator_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/multiscale_grid_anchor_generator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c96bdae7b9bcb59d295350ac31a5f8f56b720280 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/anchor_generators/multiscale_grid_anchor_generator_test.py @@ -0,0 +1,258 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
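# --- Editor's illustrative sketch (not part of the vendored diff above): the
# per-level quantities MultiscaleGridAnchorGenerator._generate derives for each
# feature pyramid level -- stride 2**level, base anchor size
# anchor_scale * 2**level, and a half-stride grid offset whenever the image
# dimension is divisible by 2**level. Names here are hypothetical.
def sketch_level_params(level, anchor_scale, im_height, im_width):
  stride = 2 ** level
  base_anchor_size = anchor_scale * stride
  offset_y = stride / 2.0 if im_height % stride == 0 else 0.0
  offset_x = stride / 2.0 if im_width % stride == 0 else 0.0
  return stride, base_anchor_size, (offset_y, offset_x)

# e.g. level=5, anchor_scale=4.0 on a 64x64 image gives stride 32, base size
# 128 and offset (16.0, 16.0), which reproduces the expected corners
# [-48, -48, 80, 80], ... in the tests below; on a 65x65 image the offset
# drops to (0.0, 0.0), giving [-64, -64, 64, 64] as the first anchor.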
+# ============================================================================== + +"""Tests for anchor_generators.multiscale_grid_anchor_generator_test.py.""" +import numpy as np +import tensorflow as tf + +from object_detection.anchor_generators import multiscale_grid_anchor_generator as mg +from object_detection.utils import test_case + + +class MultiscaleGridAnchorGeneratorTest(test_case.TestCase): + + def test_construct_single_anchor(self): + min_level = 5 + max_level = 5 + anchor_scale = 4.0 + aspect_ratios = [1.0] + scales_per_octave = 1 + im_height = 64 + im_width = 64 + feature_map_shape_list = [(2, 2)] + exp_anchor_corners = [[-48, -48, 80, 80], + [-48, -16, 80, 112], + [-16, -48, 112, 80], + [-16, -16, 112, 112]] + anchor_generator = mg.MultiscaleGridAnchorGenerator( + min_level, max_level, anchor_scale, aspect_ratios, scales_per_octave, + normalize_coordinates=False) + anchors_list = anchor_generator.generate( + feature_map_shape_list, im_height=im_height, im_width=im_width) + anchor_corners = anchors_list[0].get() + + with self.test_session(): + anchor_corners_out = anchor_corners.eval() + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_construct_single_anchor_in_normalized_coordinates(self): + min_level = 5 + max_level = 5 + anchor_scale = 4.0 + aspect_ratios = [1.0] + scales_per_octave = 1 + im_height = 64 + im_width = 128 + feature_map_shape_list = [(2, 2)] + exp_anchor_corners = [[-48./64, -48./128, 80./64, 80./128], + [-48./64, -16./128, 80./64, 112./128], + [-16./64, -48./128, 112./64, 80./128], + [-16./64, -16./128, 112./64, 112./128]] + anchor_generator = mg.MultiscaleGridAnchorGenerator( + min_level, max_level, anchor_scale, aspect_ratios, scales_per_octave, + normalize_coordinates=True) + anchors_list = anchor_generator.generate( + feature_map_shape_list, im_height=im_height, im_width=im_width) + anchor_corners = anchors_list[0].get() + + with self.test_session(): + anchor_corners_out = anchor_corners.eval() + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_num_anchors_per_location(self): + min_level = 5 + max_level = 6 + anchor_scale = 4.0 + aspect_ratios = [1.0, 2.0] + scales_per_octave = 3 + anchor_generator = mg.MultiscaleGridAnchorGenerator( + min_level, max_level, anchor_scale, aspect_ratios, scales_per_octave, + normalize_coordinates=False) + self.assertEqual(anchor_generator.num_anchors_per_location(), [6, 6]) + + def test_construct_single_anchor_fails_with_tensor_image_size(self): + min_level = 5 + max_level = 5 + anchor_scale = 4.0 + aspect_ratios = [1.0] + scales_per_octave = 1 + im_height = tf.constant(64) + im_width = tf.constant(64) + feature_map_shape_list = [(2, 2)] + anchor_generator = mg.MultiscaleGridAnchorGenerator( + min_level, max_level, anchor_scale, aspect_ratios, scales_per_octave, + normalize_coordinates=False) + with self.assertRaises(ValueError): + anchor_generator.generate( + feature_map_shape_list, im_height=im_height, im_width=im_width) + + def test_construct_single_anchor_with_odd_input_dimension(self): + + def graph_fn(): + min_level = 5 + max_level = 5 + anchor_scale = 4.0 + aspect_ratios = [1.0] + scales_per_octave = 1 + im_height = 65 + im_width = 65 + feature_map_shape_list = [(3, 3)] + anchor_generator = mg.MultiscaleGridAnchorGenerator( + min_level, max_level, anchor_scale, aspect_ratios, scales_per_octave, + normalize_coordinates=False) + anchors_list = anchor_generator.generate( + feature_map_shape_list, im_height=im_height, im_width=im_width) + anchor_corners = 
anchors_list[0].get() + return (anchor_corners,) + anchor_corners_out = self.execute(graph_fn, []) + exp_anchor_corners = [[-64, -64, 64, 64], + [-64, -32, 64, 96], + [-64, 0, 64, 128], + [-32, -64, 96, 64], + [-32, -32, 96, 96], + [-32, 0, 96, 128], + [0, -64, 128, 64], + [0, -32, 128, 96], + [0, 0, 128, 128]] + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_construct_single_anchor_on_two_feature_maps(self): + + def graph_fn(): + min_level = 5 + max_level = 6 + anchor_scale = 4.0 + aspect_ratios = [1.0] + scales_per_octave = 1 + im_height = 64 + im_width = 64 + feature_map_shape_list = [(2, 2), (1, 1)] + anchor_generator = mg.MultiscaleGridAnchorGenerator( + min_level, max_level, anchor_scale, aspect_ratios, scales_per_octave, + normalize_coordinates=False) + anchors_list = anchor_generator.generate(feature_map_shape_list, + im_height=im_height, + im_width=im_width) + anchor_corners = [anchors.get() for anchors in anchors_list] + return anchor_corners + + anchor_corners_out = np.concatenate(self.execute(graph_fn, []), axis=0) + exp_anchor_corners = [[-48, -48, 80, 80], + [-48, -16, 80, 112], + [-16, -48, 112, 80], + [-16, -16, 112, 112], + [-96, -96, 160, 160]] + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_construct_single_anchor_with_two_scales_per_octave(self): + + def graph_fn(): + min_level = 6 + max_level = 6 + anchor_scale = 4.0 + aspect_ratios = [1.0] + scales_per_octave = 2 + im_height = 64 + im_width = 64 + feature_map_shape_list = [(1, 1)] + + anchor_generator = mg.MultiscaleGridAnchorGenerator( + min_level, max_level, anchor_scale, aspect_ratios, scales_per_octave, + normalize_coordinates=False) + anchors_list = anchor_generator.generate(feature_map_shape_list, + im_height=im_height, + im_width=im_width) + anchor_corners = [anchors.get() for anchors in anchors_list] + return anchor_corners + # There are 4 set of anchors in this configuration. The order is: + # [[2**0.0 intermediate scale + 1.0 aspect], + # [2**0.5 intermediate scale + 1.0 aspect]] + exp_anchor_corners = [[-96., -96., 160., 160.], + [-149.0193, -149.0193, 213.0193, 213.0193]] + + anchor_corners_out = self.execute(graph_fn, []) + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_construct_single_anchor_with_two_scales_per_octave_and_aspect(self): + def graph_fn(): + min_level = 6 + max_level = 6 + anchor_scale = 4.0 + aspect_ratios = [1.0, 2.0] + scales_per_octave = 2 + im_height = 64 + im_width = 64 + feature_map_shape_list = [(1, 1)] + anchor_generator = mg.MultiscaleGridAnchorGenerator( + min_level, max_level, anchor_scale, aspect_ratios, scales_per_octave, + normalize_coordinates=False) + anchors_list = anchor_generator.generate(feature_map_shape_list, + im_height=im_height, + im_width=im_width) + anchor_corners = [anchors.get() for anchors in anchors_list] + return anchor_corners + # There are 4 set of anchors in this configuration. 
The order is: + # [[2**0.0 intermediate scale + 1.0 aspect], + # [2**0.5 intermediate scale + 1.0 aspect], + # [2**0.0 intermediate scale + 2.0 aspect], + # [2**0.5 intermediate scale + 2.0 aspect]] + + exp_anchor_corners = [[-96., -96., 160., 160.], + [-149.0193, -149.0193, 213.0193, 213.0193], + [-58.50967, -149.0193, 122.50967, 213.0193], + [-96., -224., 160., 288.]] + anchor_corners_out = self.execute(graph_fn, []) + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + def test_construct_single_anchors_on_feature_maps_with_dynamic_shape(self): + + def graph_fn(feature_map1_height, feature_map1_width, feature_map2_height, + feature_map2_width): + min_level = 5 + max_level = 6 + anchor_scale = 4.0 + aspect_ratios = [1.0] + scales_per_octave = 1 + im_height = 64 + im_width = 64 + feature_map_shape_list = [(feature_map1_height, feature_map1_width), + (feature_map2_height, feature_map2_width)] + anchor_generator = mg.MultiscaleGridAnchorGenerator( + min_level, max_level, anchor_scale, aspect_ratios, scales_per_octave, + normalize_coordinates=False) + anchors_list = anchor_generator.generate(feature_map_shape_list, + im_height=im_height, + im_width=im_width) + anchor_corners = [anchors.get() for anchors in anchors_list] + return anchor_corners + + anchor_corners_out = np.concatenate( + self.execute_cpu(graph_fn, [ + np.array(2, dtype=np.int32), + np.array(2, dtype=np.int32), + np.array(1, dtype=np.int32), + np.array(1, dtype=np.int32) + ]), + axis=0) + exp_anchor_corners = [[-48, -48, 80, 80], + [-48, -16, 80, 112], + [-16, -48, 112, 80], + [-16, -16, 112, 112], + [-96, -96, 160, 160]] + self.assertAllClose(anchor_corners_out, exp_anchor_corners) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/faster_rcnn_box_coder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/faster_rcnn_box_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..af25e21a105ffa85931d3f30a1ca41c89c5dde53 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/faster_rcnn_box_coder.py @@ -0,0 +1,118 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Faster RCNN box coder. + +Faster RCNN box coder follows the coding schema described below: + ty = (y - ya) / ha + tx = (x - xa) / wa + th = log(h / ha) + tw = log(w / wa) + where x, y, w, h denote the box's center coordinates, width and height + respectively. Similarly, xa, ya, wa, ha denote the anchor's center + coordinates, width and height. 
tx, ty, tw and th denote the anchor-encoded + center, width and height respectively. + + See http://arxiv.org/abs/1506.01497 for details. +""" + +import tensorflow as tf + +from object_detection.core import box_coder +from object_detection.core import box_list + +EPSILON = 1e-8 + + +class FasterRcnnBoxCoder(box_coder.BoxCoder): + """Faster RCNN box coder.""" + + def __init__(self, scale_factors=None): + """Constructor for FasterRcnnBoxCoder. + + Args: + scale_factors: List of 4 positive scalars to scale ty, tx, th and tw. + If set to None, does not perform scaling. For Faster RCNN, + the open-source implementation recommends using [10.0, 10.0, 5.0, 5.0]. + """ + if scale_factors: + assert len(scale_factors) == 4 + for scalar in scale_factors: + assert scalar > 0 + self._scale_factors = scale_factors + + @property + def code_size(self): + return 4 + + def _encode(self, boxes, anchors): + """Encode a box collection with respect to anchor collection. + + Args: + boxes: BoxList holding N boxes to be encoded. + anchors: BoxList of anchors. + + Returns: + a tensor representing N anchor-encoded boxes of the format + [ty, tx, th, tw]. + """ + # Convert anchors to the center coordinate representation. + ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes() + ycenter, xcenter, h, w = boxes.get_center_coordinates_and_sizes() + # Avoid NaN in division and log below. + ha += EPSILON + wa += EPSILON + h += EPSILON + w += EPSILON + + tx = (xcenter - xcenter_a) / wa + ty = (ycenter - ycenter_a) / ha + tw = tf.log(w / wa) + th = tf.log(h / ha) + # Scales location targets as used in paper for joint training. + if self._scale_factors: + ty *= self._scale_factors[0] + tx *= self._scale_factors[1] + th *= self._scale_factors[2] + tw *= self._scale_factors[3] + return tf.transpose(tf.stack([ty, tx, th, tw])) + + def _decode(self, rel_codes, anchors): + """Decode relative codes to boxes. + + Args: + rel_codes: a tensor representing N anchor-encoded boxes. + anchors: BoxList of anchors. + + Returns: + boxes: BoxList holding N bounding boxes. + """ + ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes() + + ty, tx, th, tw = tf.unstack(tf.transpose(rel_codes)) + if self._scale_factors: + ty /= self._scale_factors[0] + tx /= self._scale_factors[1] + th /= self._scale_factors[2] + tw /= self._scale_factors[3] + w = tf.exp(tw) * wa + h = tf.exp(th) * ha + ycenter = ty * ha + ycenter_a + xcenter = tx * wa + xcenter_a + ymin = ycenter - h / 2. + xmin = xcenter - w / 2. + ymax = ycenter + h / 2. + xmax = xcenter + w / 2. + return box_list.BoxList(tf.transpose(tf.stack([ymin, xmin, ymax, xmax]))) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/faster_rcnn_box_coder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/faster_rcnn_box_coder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b2135f06eea093110c9da17c1c46b7d247f8e806 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/faster_rcnn_box_coder_test.py @@ -0,0 +1,94 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.box_coder.faster_rcnn_box_coder.""" + +import tensorflow as tf + +from object_detection.box_coders import faster_rcnn_box_coder +from object_detection.core import box_list + + +class FasterRcnnBoxCoderTest(tf.test.TestCase): + + def test_get_correct_relative_codes_after_encoding(self): + boxes = [[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]] + anchors = [[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]] + expected_rel_codes = [[-0.5, -0.416666, -0.405465, -0.182321], + [-0.083333, -0.222222, -0.693147, -1.098612]] + boxes = box_list.BoxList(tf.constant(boxes)) + anchors = box_list.BoxList(tf.constant(anchors)) + coder = faster_rcnn_box_coder.FasterRcnnBoxCoder() + rel_codes = coder.encode(boxes, anchors) + with self.test_session() as sess: + rel_codes_out, = sess.run([rel_codes]) + self.assertAllClose(rel_codes_out, expected_rel_codes) + + def test_get_correct_relative_codes_after_encoding_with_scaling(self): + boxes = [[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]] + anchors = [[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]] + scale_factors = [2, 3, 4, 5] + expected_rel_codes = [[-1., -1.25, -1.62186, -0.911608], + [-0.166667, -0.666667, -2.772588, -5.493062]] + boxes = box_list.BoxList(tf.constant(boxes)) + anchors = box_list.BoxList(tf.constant(anchors)) + coder = faster_rcnn_box_coder.FasterRcnnBoxCoder( + scale_factors=scale_factors) + rel_codes = coder.encode(boxes, anchors) + with self.test_session() as sess: + rel_codes_out, = sess.run([rel_codes]) + self.assertAllClose(rel_codes_out, expected_rel_codes) + + def test_get_correct_boxes_after_decoding(self): + anchors = [[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]] + rel_codes = [[-0.5, -0.416666, -0.405465, -0.182321], + [-0.083333, -0.222222, -0.693147, -1.098612]] + expected_boxes = [[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]] + anchors = box_list.BoxList(tf.constant(anchors)) + coder = faster_rcnn_box_coder.FasterRcnnBoxCoder() + boxes = coder.decode(rel_codes, anchors) + with self.test_session() as sess: + boxes_out, = sess.run([boxes.get()]) + self.assertAllClose(boxes_out, expected_boxes) + + def test_get_correct_boxes_after_decoding_with_scaling(self): + anchors = [[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]] + rel_codes = [[-1., -1.25, -1.62186, -0.911608], + [-0.166667, -0.666667, -2.772588, -5.493062]] + scale_factors = [2, 3, 4, 5] + expected_boxes = [[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]] + anchors = box_list.BoxList(tf.constant(anchors)) + coder = faster_rcnn_box_coder.FasterRcnnBoxCoder( + scale_factors=scale_factors) + boxes = coder.decode(rel_codes, anchors) + with self.test_session() as sess: + boxes_out, = sess.run([boxes.get()]) + self.assertAllClose(boxes_out, expected_boxes) + + def test_very_small_Width_nan_after_encoding(self): + boxes = [[10.0, 10.0, 10.0000001, 20.0]] + anchors = [[15.0, 12.0, 30.0, 18.0]] + expected_rel_codes = [[-0.833333, 0., -21.128731, 0.510826]] + boxes = box_list.BoxList(tf.constant(boxes)) + anchors = box_list.BoxList(tf.constant(anchors)) + 
coder = faster_rcnn_box_coder.FasterRcnnBoxCoder() + rel_codes = coder.encode(boxes, anchors) + with self.test_session() as sess: + rel_codes_out, = sess.run([rel_codes]) + self.assertAllClose(rel_codes_out, expected_rel_codes) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/keypoint_box_coder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/keypoint_box_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..67df3b82ebd83308578bc850ebba2e7c074a9679 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/keypoint_box_coder.py @@ -0,0 +1,171 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Keypoint box coder. + +The keypoint box coder follows the coding schema described below (this is +similar to the FasterRcnnBoxCoder, except that it encodes keypoints in addition +to box coordinates): + ty = (y - ya) / ha + tx = (x - xa) / wa + th = log(h / ha) + tw = log(w / wa) + tky0 = (ky0 - ya) / ha + tkx0 = (kx0 - xa) / wa + tky1 = (ky1 - ya) / ha + tkx1 = (kx1 - xa) / wa + ... + where x, y, w, h denote the box's center coordinates, width and height + respectively. Similarly, xa, ya, wa, ha denote the anchor's center + coordinates, width and height. tx, ty, tw and th denote the anchor-encoded + center, width and height respectively. ky0, kx0, ky1, kx1, ... denote the + keypoints' coordinates, and tky0, tkx0, tky1, tkx1, ... denote the + anchor-encoded keypoint coordinates. +""" + +import tensorflow as tf + +from object_detection.core import box_coder +from object_detection.core import box_list +from object_detection.core import standard_fields as fields + +EPSILON = 1e-8 + + +class KeypointBoxCoder(box_coder.BoxCoder): + """Keypoint box coder.""" + + def __init__(self, num_keypoints, scale_factors=None): + """Constructor for KeypointBoxCoder. + + Args: + num_keypoints: Number of keypoints to encode/decode. + scale_factors: List of 4 positive scalars to scale ty, tx, th and tw. + In addition to scaling ty and tx, the first 2 scalars are used to scale + the y and x coordinates of the keypoints as well. If set to None, does + not perform scaling. + """ + self._num_keypoints = num_keypoints + + if scale_factors: + assert len(scale_factors) == 4 + for scalar in scale_factors: + assert scalar > 0 + self._scale_factors = scale_factors + self._keypoint_scale_factors = None + if scale_factors is not None: + self._keypoint_scale_factors = tf.expand_dims(tf.tile( + [tf.to_float(scale_factors[0]), tf.to_float(scale_factors[1])], + [num_keypoints]), 1) + + @property + def code_size(self): + return 4 + self._num_keypoints * 2 + + def _encode(self, boxes, anchors): + """Encode a box and keypoint collection with respect to anchor collection. 
+ + Args: + boxes: BoxList holding N boxes and keypoints to be encoded. Boxes are + tensors with the shape [N, 4], and keypoints are tensors with the shape + [N, num_keypoints, 2]. + anchors: BoxList of anchors. + + Returns: + a tensor representing N anchor-encoded boxes of the format + [ty, tx, th, tw, tky0, tkx0, tky1, tkx1, ...] where tky0 and tkx0 + represent the y and x coordinates of the first keypoint, tky1 and tkx1 + represent the y and x coordinates of the second keypoint, and so on. + """ + # Convert anchors to the center coordinate representation. + ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes() + ycenter, xcenter, h, w = boxes.get_center_coordinates_and_sizes() + keypoints = boxes.get_field(fields.BoxListFields.keypoints) + keypoints = tf.transpose(tf.reshape(keypoints, + [-1, self._num_keypoints * 2])) + num_boxes = boxes.num_boxes() + + # Avoid NaN in division and log below. + ha += EPSILON + wa += EPSILON + h += EPSILON + w += EPSILON + + tx = (xcenter - xcenter_a) / wa + ty = (ycenter - ycenter_a) / ha + tw = tf.log(w / wa) + th = tf.log(h / ha) + + tiled_anchor_centers = tf.tile( + tf.stack([ycenter_a, xcenter_a]), [self._num_keypoints, 1]) + tiled_anchor_sizes = tf.tile( + tf.stack([ha, wa]), [self._num_keypoints, 1]) + tkeypoints = (keypoints - tiled_anchor_centers) / tiled_anchor_sizes + + # Scales location targets as used in paper for joint training. + if self._scale_factors: + ty *= self._scale_factors[0] + tx *= self._scale_factors[1] + th *= self._scale_factors[2] + tw *= self._scale_factors[3] + tkeypoints *= tf.tile(self._keypoint_scale_factors, [1, num_boxes]) + + tboxes = tf.stack([ty, tx, th, tw]) + return tf.transpose(tf.concat([tboxes, tkeypoints], 0)) + + def _decode(self, rel_codes, anchors): + """Decode relative codes to boxes and keypoints. + + Args: + rel_codes: a tensor with shape [N, 4 + 2 * num_keypoints] representing N + anchor-encoded boxes and keypoints + anchors: BoxList of anchors. + + Returns: + boxes: BoxList holding N bounding boxes and keypoints. + """ + ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes() + + num_codes = tf.shape(rel_codes)[0] + result = tf.unstack(tf.transpose(rel_codes)) + ty, tx, th, tw = result[:4] + tkeypoints = result[4:] + if self._scale_factors: + ty /= self._scale_factors[0] + tx /= self._scale_factors[1] + th /= self._scale_factors[2] + tw /= self._scale_factors[3] + tkeypoints /= tf.tile(self._keypoint_scale_factors, [1, num_codes]) + + w = tf.exp(tw) * wa + h = tf.exp(th) * ha + ycenter = ty * ha + ycenter_a + xcenter = tx * wa + xcenter_a + ymin = ycenter - h / 2. + xmin = xcenter - w / 2. + ymax = ycenter + h / 2. + xmax = xcenter + w / 2. 
+ decoded_boxes_keypoints = box_list.BoxList( + tf.transpose(tf.stack([ymin, xmin, ymax, xmax]))) + + tiled_anchor_centers = tf.tile( + tf.stack([ycenter_a, xcenter_a]), [self._num_keypoints, 1]) + tiled_anchor_sizes = tf.tile( + tf.stack([ha, wa]), [self._num_keypoints, 1]) + keypoints = tkeypoints * tiled_anchor_sizes + tiled_anchor_centers + keypoints = tf.reshape(tf.transpose(keypoints), + [-1, self._num_keypoints, 2]) + decoded_boxes_keypoints.add_field(fields.BoxListFields.keypoints, keypoints) + return decoded_boxes_keypoints diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/keypoint_box_coder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/keypoint_box_coder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..330641e586af98af5f4764fb08f5307458777458 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/keypoint_box_coder_test.py @@ -0,0 +1,140 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.box_coder.keypoint_box_coder.""" + +import tensorflow as tf + +from object_detection.box_coders import keypoint_box_coder +from object_detection.core import box_list +from object_detection.core import standard_fields as fields + + +class KeypointBoxCoderTest(tf.test.TestCase): + + def test_get_correct_relative_codes_after_encoding(self): + boxes = [[10., 10., 20., 15.], + [0.2, 0.1, 0.5, 0.4]] + keypoints = [[[15., 12.], [10., 15.]], + [[0.5, 0.3], [0.2, 0.4]]] + num_keypoints = len(keypoints[0]) + anchors = [[15., 12., 30., 18.], + [0.1, 0.0, 0.7, 0.9]] + expected_rel_codes = [ + [-0.5, -0.416666, -0.405465, -0.182321, + -0.5, -0.5, -0.833333, 0.], + [-0.083333, -0.222222, -0.693147, -1.098612, + 0.166667, -0.166667, -0.333333, -0.055556] + ] + boxes = box_list.BoxList(tf.constant(boxes)) + boxes.add_field(fields.BoxListFields.keypoints, tf.constant(keypoints)) + anchors = box_list.BoxList(tf.constant(anchors)) + coder = keypoint_box_coder.KeypointBoxCoder(num_keypoints) + rel_codes = coder.encode(boxes, anchors) + with self.test_session() as sess: + rel_codes_out, = sess.run([rel_codes]) + self.assertAllClose(rel_codes_out, expected_rel_codes) + + def test_get_correct_relative_codes_after_encoding_with_scaling(self): + boxes = [[10., 10., 20., 15.], + [0.2, 0.1, 0.5, 0.4]] + keypoints = [[[15., 12.], [10., 15.]], + [[0.5, 0.3], [0.2, 0.4]]] + num_keypoints = len(keypoints[0]) + anchors = [[15., 12., 30., 18.], + [0.1, 0.0, 0.7, 0.9]] + scale_factors = [2, 3, 4, 5] + expected_rel_codes = [ + [-1., -1.25, -1.62186, -0.911608, + -1.0, -1.5, -1.666667, 0.], + [-0.166667, -0.666667, -2.772588, -5.493062, + 0.333333, -0.5, -0.666667, -0.166667] + ] + boxes = box_list.BoxList(tf.constant(boxes)) + boxes.add_field(fields.BoxListFields.keypoints, tf.constant(keypoints)) + anchors = 
box_list.BoxList(tf.constant(anchors)) + coder = keypoint_box_coder.KeypointBoxCoder( + num_keypoints, scale_factors=scale_factors) + rel_codes = coder.encode(boxes, anchors) + with self.test_session() as sess: + rel_codes_out, = sess.run([rel_codes]) + self.assertAllClose(rel_codes_out, expected_rel_codes) + + def test_get_correct_boxes_after_decoding(self): + anchors = [[15., 12., 30., 18.], + [0.1, 0.0, 0.7, 0.9]] + rel_codes = [ + [-0.5, -0.416666, -0.405465, -0.182321, + -0.5, -0.5, -0.833333, 0.], + [-0.083333, -0.222222, -0.693147, -1.098612, + 0.166667, -0.166667, -0.333333, -0.055556] + ] + expected_boxes = [[10., 10., 20., 15.], + [0.2, 0.1, 0.5, 0.4]] + expected_keypoints = [[[15., 12.], [10., 15.]], + [[0.5, 0.3], [0.2, 0.4]]] + num_keypoints = len(expected_keypoints[0]) + anchors = box_list.BoxList(tf.constant(anchors)) + coder = keypoint_box_coder.KeypointBoxCoder(num_keypoints) + boxes = coder.decode(rel_codes, anchors) + with self.test_session() as sess: + boxes_out, keypoints_out = sess.run( + [boxes.get(), boxes.get_field(fields.BoxListFields.keypoints)]) + self.assertAllClose(boxes_out, expected_boxes) + self.assertAllClose(keypoints_out, expected_keypoints) + + def test_get_correct_boxes_after_decoding_with_scaling(self): + anchors = [[15., 12., 30., 18.], + [0.1, 0.0, 0.7, 0.9]] + rel_codes = [ + [-1., -1.25, -1.62186, -0.911608, + -1.0, -1.5, -1.666667, 0.], + [-0.166667, -0.666667, -2.772588, -5.493062, + 0.333333, -0.5, -0.666667, -0.166667] + ] + scale_factors = [2, 3, 4, 5] + expected_boxes = [[10., 10., 20., 15.], + [0.2, 0.1, 0.5, 0.4]] + expected_keypoints = [[[15., 12.], [10., 15.]], + [[0.5, 0.3], [0.2, 0.4]]] + num_keypoints = len(expected_keypoints[0]) + anchors = box_list.BoxList(tf.constant(anchors)) + coder = keypoint_box_coder.KeypointBoxCoder( + num_keypoints, scale_factors=scale_factors) + boxes = coder.decode(rel_codes, anchors) + with self.test_session() as sess: + boxes_out, keypoints_out = sess.run( + [boxes.get(), boxes.get_field(fields.BoxListFields.keypoints)]) + self.assertAllClose(boxes_out, expected_boxes) + self.assertAllClose(keypoints_out, expected_keypoints) + + def test_very_small_width_nan_after_encoding(self): + boxes = [[10., 10., 10.0000001, 20.]] + keypoints = [[[10., 10.], [10.0000001, 20.]]] + anchors = [[15., 12., 30., 18.]] + expected_rel_codes = [[-0.833333, 0., -21.128731, 0.510826, + -0.833333, -0.833333, -0.833333, 0.833333]] + boxes = box_list.BoxList(tf.constant(boxes)) + boxes.add_field(fields.BoxListFields.keypoints, tf.constant(keypoints)) + anchors = box_list.BoxList(tf.constant(anchors)) + coder = keypoint_box_coder.KeypointBoxCoder(2) + rel_codes = coder.encode(boxes, anchors) + with self.test_session() as sess: + rel_codes_out, = sess.run([rel_codes]) + self.assertAllClose(rel_codes_out, expected_rel_codes) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/mean_stddev_box_coder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/mean_stddev_box_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..256f53fd036798cd7b3da8fcdd720c7e3c46e2e4 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/mean_stddev_box_coder.py @@ -0,0 +1,79 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Mean stddev box coder. + +This box coder use the following coding schema to encode boxes: +rel_code = (box_corner - anchor_corner_mean) / anchor_corner_stddev. +""" +from object_detection.core import box_coder +from object_detection.core import box_list + + +class MeanStddevBoxCoder(box_coder.BoxCoder): + """Mean stddev box coder.""" + + def __init__(self, stddev=0.01): + """Constructor for MeanStddevBoxCoder. + + Args: + stddev: The standard deviation used to encode and decode boxes. + """ + self._stddev = stddev + + @property + def code_size(self): + return 4 + + def _encode(self, boxes, anchors): + """Encode a box collection with respect to anchor collection. + + Args: + boxes: BoxList holding N boxes to be encoded. + anchors: BoxList of N anchors. + + Returns: + a tensor representing N anchor-encoded boxes + + Raises: + ValueError: if the anchors still have deprecated stddev field. + """ + box_corners = boxes.get() + if anchors.has_field('stddev'): + raise ValueError("'stddev' is a parameter of MeanStddevBoxCoder and " + "should not be specified in the box list.") + means = anchors.get() + return (box_corners - means) / self._stddev + + def _decode(self, rel_codes, anchors): + """Decode. + + Args: + rel_codes: a tensor representing N anchor-encoded boxes. + anchors: BoxList of anchors. + + Returns: + boxes: BoxList holding N bounding boxes + + Raises: + ValueError: if the anchors still have deprecated stddev field and expects + the decode method to use stddev value from that field. + """ + means = anchors.get() + if anchors.has_field('stddev'): + raise ValueError("'stddev' is a parameter of MeanStddevBoxCoder and " + "should not be specified in the box list.") + box_corners = rel_codes * self._stddev + means + return box_list.BoxList(box_corners) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/mean_stddev_box_coder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/mean_stddev_box_coder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..3e0eba936fe5a47e34501af73a926d8f83f9f163 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/mean_stddev_box_coder_test.py @@ -0,0 +1,54 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
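# --- Editor's illustrative sketch (not part of the vendored diff above): the
# MeanStddevBoxCoder transform on plain numpy arrays. The anchor corners act as
# the means and a single scalar stddev rescales the residual, so encode and
# decode are exact inverses. Function names here are hypothetical.
import numpy as np

def sketch_mean_stddev_encode(boxes, anchors, stddev=0.01):
  return (np.asarray(boxes) - np.asarray(anchors)) / stddev

def sketch_mean_stddev_decode(rel_codes, anchors, stddev=0.01):
  return np.asarray(rel_codes) * stddev + np.asarray(anchors)

# e.g. encoding the box [0.0, 0.0, 0.5, 0.5] against the anchor
# [0.5, 0.5, 1.0, 0.8] with stddev=0.1 gives [-5.0, -5.0, -5.0, -3.0], the
# value asserted in the unit test below.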
+# ============================================================================== + +"""Tests for object_detection.box_coder.mean_stddev_boxcoder.""" + +import tensorflow as tf + +from object_detection.box_coders import mean_stddev_box_coder +from object_detection.core import box_list + + +class MeanStddevBoxCoderTest(tf.test.TestCase): + + def testGetCorrectRelativeCodesAfterEncoding(self): + box_corners = [[0.0, 0.0, 0.5, 0.5], [0.0, 0.0, 0.5, 0.5]] + boxes = box_list.BoxList(tf.constant(box_corners)) + expected_rel_codes = [[0.0, 0.0, 0.0, 0.0], [-5.0, -5.0, -5.0, -3.0]] + prior_means = tf.constant([[0.0, 0.0, 0.5, 0.5], [0.5, 0.5, 1.0, 0.8]]) + priors = box_list.BoxList(prior_means) + + coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=0.1) + rel_codes = coder.encode(boxes, priors) + with self.test_session() as sess: + rel_codes_out = sess.run(rel_codes) + self.assertAllClose(rel_codes_out, expected_rel_codes) + + def testGetCorrectBoxesAfterDecoding(self): + rel_codes = tf.constant([[0.0, 0.0, 0.0, 0.0], [-5.0, -5.0, -5.0, -3.0]]) + expected_box_corners = [[0.0, 0.0, 0.5, 0.5], [0.0, 0.0, 0.5, 0.5]] + prior_means = tf.constant([[0.0, 0.0, 0.5, 0.5], [0.5, 0.5, 1.0, 0.8]]) + priors = box_list.BoxList(prior_means) + + coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=0.1) + decoded_boxes = coder.decode(rel_codes, priors) + decoded_box_corners = decoded_boxes.get() + with self.test_session() as sess: + decoded_out = sess.run(decoded_box_corners) + self.assertAllClose(decoded_out, expected_box_corners) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/square_box_coder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/square_box_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..ee46b689524838518182ff0f9208168e78c8b2cf --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/square_box_coder.py @@ -0,0 +1,126 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Square box coder. + +Square box coder follows the coding schema described below: +l = sqrt(h * w) +la = sqrt(ha * wa) +ty = (y - ya) / la +tx = (x - xa) / la +tl = log(l / la) +where x, y, w, h denote the box's center coordinates, width, and height, +respectively. Similarly, xa, ya, wa, ha denote the anchor's center +coordinates, width and height. tx, ty, tl denote the anchor-encoded +center, and length, respectively. Because the encoded box is a square, only +one length is encoded. + +This has shown to provide performance improvements over the Faster RCNN box +coder when the objects being detected tend to be square (e.g. faces) and when +the input images are not distorted via resizing. 
+""" + +import tensorflow as tf + +from object_detection.core import box_coder +from object_detection.core import box_list + +EPSILON = 1e-8 + + +class SquareBoxCoder(box_coder.BoxCoder): + """Encodes a 3-scalar representation of a square box.""" + + def __init__(self, scale_factors=None): + """Constructor for SquareBoxCoder. + + Args: + scale_factors: List of 3 positive scalars to scale ty, tx, and tl. + If set to None, does not perform scaling. For faster RCNN, + the open-source implementation recommends using [10.0, 10.0, 5.0]. + + Raises: + ValueError: If scale_factors is not length 3 or contains values less than + or equal to 0. + """ + if scale_factors: + if len(scale_factors) != 3: + raise ValueError('The argument scale_factors must be a list of length ' + '3.') + if any(scalar <= 0 for scalar in scale_factors): + raise ValueError('The values in scale_factors must all be greater ' + 'than 0.') + self._scale_factors = scale_factors + + @property + def code_size(self): + return 3 + + def _encode(self, boxes, anchors): + """Encodes a box collection with respect to an anchor collection. + + Args: + boxes: BoxList holding N boxes to be encoded. + anchors: BoxList of anchors. + + Returns: + a tensor representing N anchor-encoded boxes of the format + [ty, tx, tl]. + """ + # Convert anchors to the center coordinate representation. + ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes() + la = tf.sqrt(ha * wa) + ycenter, xcenter, h, w = boxes.get_center_coordinates_and_sizes() + l = tf.sqrt(h * w) + # Avoid NaN in division and log below. + la += EPSILON + l += EPSILON + + tx = (xcenter - xcenter_a) / la + ty = (ycenter - ycenter_a) / la + tl = tf.log(l / la) + # Scales location targets for joint training. + if self._scale_factors: + ty *= self._scale_factors[0] + tx *= self._scale_factors[1] + tl *= self._scale_factors[2] + return tf.transpose(tf.stack([ty, tx, tl])) + + def _decode(self, rel_codes, anchors): + """Decodes relative codes to boxes. + + Args: + rel_codes: a tensor representing N anchor-encoded boxes. + anchors: BoxList of anchors. + + Returns: + boxes: BoxList holding N bounding boxes. + """ + ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes() + la = tf.sqrt(ha * wa) + + ty, tx, tl = tf.unstack(tf.transpose(rel_codes)) + if self._scale_factors: + ty /= self._scale_factors[0] + tx /= self._scale_factors[1] + tl /= self._scale_factors[2] + l = tf.exp(tl) * la + ycenter = ty * la + ycenter_a + xcenter = tx * la + xcenter_a + ymin = ycenter - l / 2. + xmin = xcenter - l / 2. + ymax = ycenter + l / 2. + xmax = xcenter + l / 2. + return box_list.BoxList(tf.transpose(tf.stack([ymin, xmin, ymax, xmax]))) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/square_box_coder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/square_box_coder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7f739c6b4f38de3d280cb91e9c8e04a661a621e4 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/box_coders/square_box_coder_test.py @@ -0,0 +1,97 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.box_coder.square_box_coder.""" + +import tensorflow as tf + +from object_detection.box_coders import square_box_coder +from object_detection.core import box_list + + +class SquareBoxCoderTest(tf.test.TestCase): + + def test_correct_relative_codes_with_default_scale(self): + boxes = [[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]] + anchors = [[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]] + scale_factors = None + expected_rel_codes = [[-0.790569, -0.263523, -0.293893], + [-0.068041, -0.272166, -0.89588]] + + boxes = box_list.BoxList(tf.constant(boxes)) + anchors = box_list.BoxList(tf.constant(anchors)) + coder = square_box_coder.SquareBoxCoder(scale_factors=scale_factors) + rel_codes = coder.encode(boxes, anchors) + with self.test_session() as sess: + (rel_codes_out,) = sess.run([rel_codes]) + self.assertAllClose(rel_codes_out, expected_rel_codes) + + def test_correct_relative_codes_with_non_default_scale(self): + boxes = [[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]] + anchors = [[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]] + scale_factors = [2, 3, 4] + expected_rel_codes = [[-1.581139, -0.790569, -1.175573], + [-0.136083, -0.816497, -3.583519]] + boxes = box_list.BoxList(tf.constant(boxes)) + anchors = box_list.BoxList(tf.constant(anchors)) + coder = square_box_coder.SquareBoxCoder(scale_factors=scale_factors) + rel_codes = coder.encode(boxes, anchors) + with self.test_session() as sess: + (rel_codes_out,) = sess.run([rel_codes]) + self.assertAllClose(rel_codes_out, expected_rel_codes) + + def test_correct_relative_codes_with_small_width(self): + boxes = [[10.0, 10.0, 10.0000001, 20.0]] + anchors = [[15.0, 12.0, 30.0, 18.0]] + scale_factors = None + expected_rel_codes = [[-1.317616, 0., -20.670586]] + boxes = box_list.BoxList(tf.constant(boxes)) + anchors = box_list.BoxList(tf.constant(anchors)) + coder = square_box_coder.SquareBoxCoder(scale_factors=scale_factors) + rel_codes = coder.encode(boxes, anchors) + with self.test_session() as sess: + (rel_codes_out,) = sess.run([rel_codes]) + self.assertAllClose(rel_codes_out, expected_rel_codes) + + def test_correct_boxes_with_default_scale(self): + anchors = [[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]] + rel_codes = [[-0.5, -0.416666, -0.405465], + [-0.083333, -0.222222, -0.693147]] + scale_factors = None + expected_boxes = [[14.594306, 7.884875, 20.918861, 14.209432], + [0.155051, 0.102989, 0.522474, 0.470412]] + anchors = box_list.BoxList(tf.constant(anchors)) + coder = square_box_coder.SquareBoxCoder(scale_factors=scale_factors) + boxes = coder.decode(rel_codes, anchors) + with self.test_session() as sess: + (boxes_out,) = sess.run([boxes.get()]) + self.assertAllClose(boxes_out, expected_boxes) + + def test_correct_boxes_with_non_default_scale(self): + anchors = [[15.0, 12.0, 30.0, 18.0], [0.1, 0.0, 0.7, 0.9]] + rel_codes = [[-1., -1.25, -1.62186], [-0.166667, -0.666667, -2.772588]] + scale_factors = [2, 3, 4] + expected_boxes = [[14.594306, 7.884875, 20.918861, 14.209432], + [0.155051, 0.102989, 0.522474, 
0.470412]] + anchors = box_list.BoxList(tf.constant(anchors)) + coder = square_box_coder.SquareBoxCoder(scale_factors=scale_factors) + boxes = coder.decode(rel_codes, anchors) + with self.test_session() as sess: + (boxes_out,) = sess.run([boxes.get()]) + self.assertAllClose(boxes_out, expected_boxes) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/anchor_generator_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/anchor_generator_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..54cec3a1df57f06466cde5e2bd9c6b706133c174 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/anchor_generator_builder.py @@ -0,0 +1,94 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""A function to build an object detection anchor generator from config.""" + +from object_detection.anchor_generators import grid_anchor_generator +from object_detection.anchor_generators import multiple_grid_anchor_generator +from object_detection.anchor_generators import multiscale_grid_anchor_generator +from object_detection.protos import anchor_generator_pb2 + + +def build(anchor_generator_config): + """Builds an anchor generator based on the config. + + Args: + anchor_generator_config: An anchor_generator.proto object containing the + config for the desired anchor generator. + + Returns: + Anchor generator based on the config. + + Raises: + ValueError: On empty anchor generator proto. 
+ """ + if not isinstance(anchor_generator_config, + anchor_generator_pb2.AnchorGenerator): + raise ValueError('anchor_generator_config not of type ' + 'anchor_generator_pb2.AnchorGenerator') + if anchor_generator_config.WhichOneof( + 'anchor_generator_oneof') == 'grid_anchor_generator': + grid_anchor_generator_config = anchor_generator_config.grid_anchor_generator + return grid_anchor_generator.GridAnchorGenerator( + scales=[float(scale) for scale in grid_anchor_generator_config.scales], + aspect_ratios=[float(aspect_ratio) + for aspect_ratio + in grid_anchor_generator_config.aspect_ratios], + base_anchor_size=[grid_anchor_generator_config.height, + grid_anchor_generator_config.width], + anchor_stride=[grid_anchor_generator_config.height_stride, + grid_anchor_generator_config.width_stride], + anchor_offset=[grid_anchor_generator_config.height_offset, + grid_anchor_generator_config.width_offset]) + elif anchor_generator_config.WhichOneof( + 'anchor_generator_oneof') == 'ssd_anchor_generator': + ssd_anchor_generator_config = anchor_generator_config.ssd_anchor_generator + anchor_strides = None + if ssd_anchor_generator_config.height_stride: + anchor_strides = zip(ssd_anchor_generator_config.height_stride, + ssd_anchor_generator_config.width_stride) + anchor_offsets = None + if ssd_anchor_generator_config.height_offset: + anchor_offsets = zip(ssd_anchor_generator_config.height_offset, + ssd_anchor_generator_config.width_offset) + return multiple_grid_anchor_generator.create_ssd_anchors( + num_layers=ssd_anchor_generator_config.num_layers, + min_scale=ssd_anchor_generator_config.min_scale, + max_scale=ssd_anchor_generator_config.max_scale, + scales=[float(scale) for scale in ssd_anchor_generator_config.scales], + aspect_ratios=ssd_anchor_generator_config.aspect_ratios, + interpolated_scale_aspect_ratio=( + ssd_anchor_generator_config.interpolated_scale_aspect_ratio), + base_anchor_size=[ + ssd_anchor_generator_config.base_anchor_height, + ssd_anchor_generator_config.base_anchor_width + ], + anchor_strides=anchor_strides, + anchor_offsets=anchor_offsets, + reduce_boxes_in_lowest_layer=( + ssd_anchor_generator_config.reduce_boxes_in_lowest_layer)) + elif anchor_generator_config.WhichOneof( + 'anchor_generator_oneof') == 'multiscale_anchor_generator': + cfg = anchor_generator_config.multiscale_anchor_generator + return multiscale_grid_anchor_generator.MultiscaleGridAnchorGenerator( + cfg.min_level, + cfg.max_level, + cfg.anchor_scale, + [float(aspect_ratio) for aspect_ratio in cfg.aspect_ratios], + cfg.scales_per_octave, + cfg.normalize_coordinates + ) + else: + raise ValueError('Empty anchor generator.') diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/anchor_generator_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/anchor_generator_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..2a23c2d96b411634263ef7bd20ed045c6305c790 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/anchor_generator_builder_test.py @@ -0,0 +1,300 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for anchor_generator_builder.""" + +import math + +import tensorflow as tf + +from google.protobuf import text_format +from object_detection.anchor_generators import grid_anchor_generator +from object_detection.anchor_generators import multiple_grid_anchor_generator +from object_detection.anchor_generators import multiscale_grid_anchor_generator +from object_detection.builders import anchor_generator_builder +from object_detection.protos import anchor_generator_pb2 + + +class AnchorGeneratorBuilderTest(tf.test.TestCase): + + def assert_almost_list_equal(self, expected_list, actual_list, delta=None): + self.assertEqual(len(expected_list), len(actual_list)) + for expected_item, actual_item in zip(expected_list, actual_list): + self.assertAlmostEqual(expected_item, actual_item, delta=delta) + + def test_build_grid_anchor_generator_with_defaults(self): + anchor_generator_text_proto = """ + grid_anchor_generator { + } + """ + anchor_generator_proto = anchor_generator_pb2.AnchorGenerator() + text_format.Merge(anchor_generator_text_proto, anchor_generator_proto) + anchor_generator_object = anchor_generator_builder.build( + anchor_generator_proto) + self.assertTrue(isinstance(anchor_generator_object, + grid_anchor_generator.GridAnchorGenerator)) + self.assertListEqual(anchor_generator_object._scales, []) + self.assertListEqual(anchor_generator_object._aspect_ratios, []) + with self.test_session() as sess: + base_anchor_size, anchor_offset, anchor_stride = sess.run( + [anchor_generator_object._base_anchor_size, + anchor_generator_object._anchor_offset, + anchor_generator_object._anchor_stride]) + self.assertAllEqual(anchor_offset, [0, 0]) + self.assertAllEqual(anchor_stride, [16, 16]) + self.assertAllEqual(base_anchor_size, [256, 256]) + + def test_build_grid_anchor_generator_with_non_default_parameters(self): + anchor_generator_text_proto = """ + grid_anchor_generator { + height: 128 + width: 512 + height_stride: 10 + width_stride: 20 + height_offset: 30 + width_offset: 40 + scales: [0.4, 2.2] + aspect_ratios: [0.3, 4.5] + } + """ + anchor_generator_proto = anchor_generator_pb2.AnchorGenerator() + text_format.Merge(anchor_generator_text_proto, anchor_generator_proto) + anchor_generator_object = anchor_generator_builder.build( + anchor_generator_proto) + self.assertTrue(isinstance(anchor_generator_object, + grid_anchor_generator.GridAnchorGenerator)) + self.assert_almost_list_equal(anchor_generator_object._scales, + [0.4, 2.2]) + self.assert_almost_list_equal(anchor_generator_object._aspect_ratios, + [0.3, 4.5]) + with self.test_session() as sess: + base_anchor_size, anchor_offset, anchor_stride = sess.run( + [anchor_generator_object._base_anchor_size, + anchor_generator_object._anchor_offset, + anchor_generator_object._anchor_stride]) + self.assertAllEqual(anchor_offset, [30, 40]) + self.assertAllEqual(anchor_stride, [10, 20]) + self.assertAllEqual(base_anchor_size, [128, 512]) + + def test_build_ssd_anchor_generator_with_defaults(self): + anchor_generator_text_proto = """ + ssd_anchor_generator { 
+ aspect_ratios: [1.0] + } + """ + anchor_generator_proto = anchor_generator_pb2.AnchorGenerator() + text_format.Merge(anchor_generator_text_proto, anchor_generator_proto) + anchor_generator_object = anchor_generator_builder.build( + anchor_generator_proto) + self.assertTrue(isinstance(anchor_generator_object, + multiple_grid_anchor_generator. + MultipleGridAnchorGenerator)) + for actual_scales, expected_scales in zip( + list(anchor_generator_object._scales), + [(0.1, 0.2, 0.2), + (0.35, 0.418), + (0.499, 0.570), + (0.649, 0.721), + (0.799, 0.871), + (0.949, 0.974)]): + self.assert_almost_list_equal(expected_scales, actual_scales, delta=1e-2) + for actual_aspect_ratio, expected_aspect_ratio in zip( + list(anchor_generator_object._aspect_ratios), + [(1.0, 2.0, 0.5)] + 5 * [(1.0, 1.0)]): + self.assert_almost_list_equal(expected_aspect_ratio, actual_aspect_ratio) + + with self.test_session() as sess: + base_anchor_size = sess.run(anchor_generator_object._base_anchor_size) + self.assertAllClose(base_anchor_size, [1.0, 1.0]) + + def test_build_ssd_anchor_generator_with_custom_scales(self): + anchor_generator_text_proto = """ + ssd_anchor_generator { + aspect_ratios: [1.0] + scales: [0.1, 0.15, 0.2, 0.4, 0.6, 0.8] + reduce_boxes_in_lowest_layer: false + } + """ + anchor_generator_proto = anchor_generator_pb2.AnchorGenerator() + text_format.Merge(anchor_generator_text_proto, anchor_generator_proto) + anchor_generator_object = anchor_generator_builder.build( + anchor_generator_proto) + self.assertTrue(isinstance(anchor_generator_object, + multiple_grid_anchor_generator. + MultipleGridAnchorGenerator)) + for actual_scales, expected_scales in zip( + list(anchor_generator_object._scales), + [(0.1, math.sqrt(0.1 * 0.15)), + (0.15, math.sqrt(0.15 * 0.2)), + (0.2, math.sqrt(0.2 * 0.4)), + (0.4, math.sqrt(0.4 * 0.6)), + (0.6, math.sqrt(0.6 * 0.8)), + (0.8, math.sqrt(0.8 * 1.0))]): + self.assert_almost_list_equal(expected_scales, actual_scales, delta=1e-2) + + def test_build_ssd_anchor_generator_with_custom_interpolated_scale(self): + anchor_generator_text_proto = """ + ssd_anchor_generator { + aspect_ratios: [0.5] + interpolated_scale_aspect_ratio: 0.5 + reduce_boxes_in_lowest_layer: false + } + """ + anchor_generator_proto = anchor_generator_pb2.AnchorGenerator() + text_format.Merge(anchor_generator_text_proto, anchor_generator_proto) + anchor_generator_object = anchor_generator_builder.build( + anchor_generator_proto) + self.assertTrue(isinstance(anchor_generator_object, + multiple_grid_anchor_generator. + MultipleGridAnchorGenerator)) + for actual_aspect_ratio, expected_aspect_ratio in zip( + list(anchor_generator_object._aspect_ratios), + 6 * [(0.5, 0.5)]): + self.assert_almost_list_equal(expected_aspect_ratio, actual_aspect_ratio) + + def test_build_ssd_anchor_generator_without_reduced_boxes(self): + anchor_generator_text_proto = """ + ssd_anchor_generator { + aspect_ratios: [1.0] + reduce_boxes_in_lowest_layer: false + } + """ + anchor_generator_proto = anchor_generator_pb2.AnchorGenerator() + text_format.Merge(anchor_generator_text_proto, anchor_generator_proto) + anchor_generator_object = anchor_generator_builder.build( + anchor_generator_proto) + self.assertTrue(isinstance(anchor_generator_object, + multiple_grid_anchor_generator. 
+ MultipleGridAnchorGenerator)) + + for actual_scales, expected_scales in zip( + list(anchor_generator_object._scales), + [(0.2, 0.264), + (0.35, 0.418), + (0.499, 0.570), + (0.649, 0.721), + (0.799, 0.871), + (0.949, 0.974)]): + self.assert_almost_list_equal(expected_scales, actual_scales, delta=1e-2) + + for actual_aspect_ratio, expected_aspect_ratio in zip( + list(anchor_generator_object._aspect_ratios), + 6 * [(1.0, 1.0)]): + self.assert_almost_list_equal(expected_aspect_ratio, actual_aspect_ratio) + + with self.test_session() as sess: + base_anchor_size = sess.run(anchor_generator_object._base_anchor_size) + self.assertAllClose(base_anchor_size, [1.0, 1.0]) + + def test_build_ssd_anchor_generator_with_non_default_parameters(self): + anchor_generator_text_proto = """ + ssd_anchor_generator { + num_layers: 2 + min_scale: 0.3 + max_scale: 0.8 + aspect_ratios: [2.0] + height_stride: 16 + height_stride: 32 + width_stride: 20 + width_stride: 30 + height_offset: 8 + height_offset: 16 + width_offset: 0 + width_offset: 10 + } + """ + anchor_generator_proto = anchor_generator_pb2.AnchorGenerator() + text_format.Merge(anchor_generator_text_proto, anchor_generator_proto) + anchor_generator_object = anchor_generator_builder.build( + anchor_generator_proto) + self.assertTrue(isinstance(anchor_generator_object, + multiple_grid_anchor_generator. + MultipleGridAnchorGenerator)) + + for actual_scales, expected_scales in zip( + list(anchor_generator_object._scales), + [(0.1, 0.3, 0.3), (0.8, 0.894)]): + self.assert_almost_list_equal(expected_scales, actual_scales, delta=1e-2) + + for actual_aspect_ratio, expected_aspect_ratio in zip( + list(anchor_generator_object._aspect_ratios), + [(1.0, 2.0, 0.5), (2.0, 1.0)]): + self.assert_almost_list_equal(expected_aspect_ratio, actual_aspect_ratio) + + for actual_strides, expected_strides in zip( + list(anchor_generator_object._anchor_strides), [(16, 20), (32, 30)]): + self.assert_almost_list_equal(expected_strides, actual_strides) + + for actual_offsets, expected_offsets in zip( + list(anchor_generator_object._anchor_offsets), [(8, 0), (16, 10)]): + self.assert_almost_list_equal(expected_offsets, actual_offsets) + + with self.test_session() as sess: + base_anchor_size = sess.run(anchor_generator_object._base_anchor_size) + self.assertAllClose(base_anchor_size, [1.0, 1.0]) + + def test_raise_value_error_on_empty_anchor_genertor(self): + anchor_generator_text_proto = """ + """ + anchor_generator_proto = anchor_generator_pb2.AnchorGenerator() + text_format.Merge(anchor_generator_text_proto, anchor_generator_proto) + with self.assertRaises(ValueError): + anchor_generator_builder.build(anchor_generator_proto) + + def test_build_multiscale_anchor_generator_custom_aspect_ratios(self): + anchor_generator_text_proto = """ + multiscale_anchor_generator { + aspect_ratios: [1.0] + } + """ + anchor_generator_proto = anchor_generator_pb2.AnchorGenerator() + text_format.Merge(anchor_generator_text_proto, anchor_generator_proto) + anchor_generator_object = anchor_generator_builder.build( + anchor_generator_proto) + self.assertTrue(isinstance(anchor_generator_object, + multiscale_grid_anchor_generator. 
+ MultiscaleGridAnchorGenerator)) + for level, anchor_grid_info in zip( + range(3, 8), anchor_generator_object._anchor_grid_info): + self.assertEqual(set(anchor_grid_info.keys()), set(['level', 'info'])) + self.assertTrue(level, anchor_grid_info['level']) + self.assertEqual(len(anchor_grid_info['info']), 4) + self.assertAllClose(anchor_grid_info['info'][0], [2**0, 2**0.5]) + self.assertTrue(anchor_grid_info['info'][1], 1.0) + self.assertAllClose(anchor_grid_info['info'][2], + [4.0 * 2**level, 4.0 * 2**level]) + self.assertAllClose(anchor_grid_info['info'][3], [2**level, 2**level]) + self.assertTrue(anchor_generator_object._normalize_coordinates) + + def test_build_multiscale_anchor_generator_with_anchors_in_pixel_coordinates( + self): + anchor_generator_text_proto = """ + multiscale_anchor_generator { + aspect_ratios: [1.0] + normalize_coordinates: false + } + """ + anchor_generator_proto = anchor_generator_pb2.AnchorGenerator() + text_format.Merge(anchor_generator_text_proto, anchor_generator_proto) + anchor_generator_object = anchor_generator_builder.build( + anchor_generator_proto) + self.assertTrue(isinstance(anchor_generator_object, + multiscale_grid_anchor_generator. + MultiscaleGridAnchorGenerator)) + self.assertFalse(anchor_generator_object._normalize_coordinates) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/box_coder_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/box_coder_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..cc13d5a2f01c5a1f66e83abc5bb5ada542047d83 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/box_coder_builder.py @@ -0,0 +1,66 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""A function to build an object detection box coder from configuration.""" +from object_detection.box_coders import faster_rcnn_box_coder +from object_detection.box_coders import keypoint_box_coder +from object_detection.box_coders import mean_stddev_box_coder +from object_detection.box_coders import square_box_coder +from object_detection.protos import box_coder_pb2 + + +def build(box_coder_config): + """Builds a box coder object based on the box coder config. + + Args: + box_coder_config: A box_coder.proto object containing the config for the + desired box coder. + + Returns: + BoxCoder based on the config. + + Raises: + ValueError: On empty box coder proto. 
+ """ + if not isinstance(box_coder_config, box_coder_pb2.BoxCoder): + raise ValueError('box_coder_config not of type box_coder_pb2.BoxCoder.') + + if box_coder_config.WhichOneof('box_coder_oneof') == 'faster_rcnn_box_coder': + return faster_rcnn_box_coder.FasterRcnnBoxCoder(scale_factors=[ + box_coder_config.faster_rcnn_box_coder.y_scale, + box_coder_config.faster_rcnn_box_coder.x_scale, + box_coder_config.faster_rcnn_box_coder.height_scale, + box_coder_config.faster_rcnn_box_coder.width_scale + ]) + if box_coder_config.WhichOneof('box_coder_oneof') == 'keypoint_box_coder': + return keypoint_box_coder.KeypointBoxCoder( + box_coder_config.keypoint_box_coder.num_keypoints, + scale_factors=[ + box_coder_config.keypoint_box_coder.y_scale, + box_coder_config.keypoint_box_coder.x_scale, + box_coder_config.keypoint_box_coder.height_scale, + box_coder_config.keypoint_box_coder.width_scale + ]) + if (box_coder_config.WhichOneof('box_coder_oneof') == + 'mean_stddev_box_coder'): + return mean_stddev_box_coder.MeanStddevBoxCoder( + stddev=box_coder_config.mean_stddev_box_coder.stddev) + if box_coder_config.WhichOneof('box_coder_oneof') == 'square_box_coder': + return square_box_coder.SquareBoxCoder(scale_factors=[ + box_coder_config.square_box_coder.y_scale, + box_coder_config.square_box_coder.x_scale, + box_coder_config.square_box_coder.length_scale + ]) + raise ValueError('Empty box coder.') diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/box_coder_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/box_coder_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..286012e9de7661a5663e0ba2873818337f106985 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/box_coder_builder_test.py @@ -0,0 +1,136 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for box_coder_builder.""" + +import tensorflow as tf + +from google.protobuf import text_format +from object_detection.box_coders import faster_rcnn_box_coder +from object_detection.box_coders import keypoint_box_coder +from object_detection.box_coders import mean_stddev_box_coder +from object_detection.box_coders import square_box_coder +from object_detection.builders import box_coder_builder +from object_detection.protos import box_coder_pb2 + + +class BoxCoderBuilderTest(tf.test.TestCase): + + def test_build_faster_rcnn_box_coder_with_defaults(self): + box_coder_text_proto = """ + faster_rcnn_box_coder { + } + """ + box_coder_proto = box_coder_pb2.BoxCoder() + text_format.Merge(box_coder_text_proto, box_coder_proto) + box_coder_object = box_coder_builder.build(box_coder_proto) + self.assertIsInstance(box_coder_object, + faster_rcnn_box_coder.FasterRcnnBoxCoder) + self.assertEqual(box_coder_object._scale_factors, [10.0, 10.0, 5.0, 5.0]) + + def test_build_faster_rcnn_box_coder_with_non_default_parameters(self): + box_coder_text_proto = """ + faster_rcnn_box_coder { + y_scale: 6.0 + x_scale: 3.0 + height_scale: 7.0 + width_scale: 8.0 + } + """ + box_coder_proto = box_coder_pb2.BoxCoder() + text_format.Merge(box_coder_text_proto, box_coder_proto) + box_coder_object = box_coder_builder.build(box_coder_proto) + self.assertIsInstance(box_coder_object, + faster_rcnn_box_coder.FasterRcnnBoxCoder) + self.assertEqual(box_coder_object._scale_factors, [6.0, 3.0, 7.0, 8.0]) + + def test_build_keypoint_box_coder_with_defaults(self): + box_coder_text_proto = """ + keypoint_box_coder { + } + """ + box_coder_proto = box_coder_pb2.BoxCoder() + text_format.Merge(box_coder_text_proto, box_coder_proto) + box_coder_object = box_coder_builder.build(box_coder_proto) + self.assertIsInstance(box_coder_object, keypoint_box_coder.KeypointBoxCoder) + self.assertEqual(box_coder_object._scale_factors, [10.0, 10.0, 5.0, 5.0]) + + def test_build_keypoint_box_coder_with_non_default_parameters(self): + box_coder_text_proto = """ + keypoint_box_coder { + num_keypoints: 6 + y_scale: 6.0 + x_scale: 3.0 + height_scale: 7.0 + width_scale: 8.0 + } + """ + box_coder_proto = box_coder_pb2.BoxCoder() + text_format.Merge(box_coder_text_proto, box_coder_proto) + box_coder_object = box_coder_builder.build(box_coder_proto) + self.assertIsInstance(box_coder_object, keypoint_box_coder.KeypointBoxCoder) + self.assertEqual(box_coder_object._num_keypoints, 6) + self.assertEqual(box_coder_object._scale_factors, [6.0, 3.0, 7.0, 8.0]) + + def test_build_mean_stddev_box_coder(self): + box_coder_text_proto = """ + mean_stddev_box_coder { + } + """ + box_coder_proto = box_coder_pb2.BoxCoder() + text_format.Merge(box_coder_text_proto, box_coder_proto) + box_coder_object = box_coder_builder.build(box_coder_proto) + self.assertTrue( + isinstance(box_coder_object, + mean_stddev_box_coder.MeanStddevBoxCoder)) + + def test_build_square_box_coder_with_defaults(self): + box_coder_text_proto = """ + square_box_coder { + } + """ + box_coder_proto = box_coder_pb2.BoxCoder() + text_format.Merge(box_coder_text_proto, box_coder_proto) + box_coder_object = box_coder_builder.build(box_coder_proto) + self.assertTrue( + isinstance(box_coder_object, square_box_coder.SquareBoxCoder)) + self.assertEqual(box_coder_object._scale_factors, [10.0, 10.0, 5.0]) + + def test_build_square_box_coder_with_non_default_parameters(self): + box_coder_text_proto = """ + square_box_coder { 
+ y_scale: 6.0 + x_scale: 3.0 + length_scale: 7.0 + } + """ + box_coder_proto = box_coder_pb2.BoxCoder() + text_format.Merge(box_coder_text_proto, box_coder_proto) + box_coder_object = box_coder_builder.build(box_coder_proto) + self.assertTrue( + isinstance(box_coder_object, square_box_coder.SquareBoxCoder)) + self.assertEqual(box_coder_object._scale_factors, [6.0, 3.0, 7.0]) + + def test_raise_error_on_empty_box_coder(self): + box_coder_text_proto = """ + """ + box_coder_proto = box_coder_pb2.BoxCoder() + text_format.Merge(box_coder_text_proto, box_coder_proto) + with self.assertRaises(ValueError): + box_coder_builder.build(box_coder_proto) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/box_predictor_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/box_predictor_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..2f311221ce4729599501c93d2192764d5ed8207b --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/box_predictor_builder.py @@ -0,0 +1,138 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Function to build box predictor from configuration.""" + +from object_detection.core import box_predictor +from object_detection.protos import box_predictor_pb2 + + +def build(argscope_fn, box_predictor_config, is_training, num_classes): + """Builds box predictor based on the configuration. + + Builds box predictor based on the configuration. See box_predictor.proto for + configurable options. Also, see box_predictor.py for more details. + + Args: + argscope_fn: A function that takes the following inputs: + * hyperparams_pb2.Hyperparams proto + * a boolean indicating if the model is in training mode. + and returns a tf slim argscope for Conv and FC hyperparameters. + box_predictor_config: box_predictor_pb2.BoxPredictor proto containing + configuration. + is_training: Whether the models is in training mode. + num_classes: Number of classes to predict. + + Returns: + box_predictor: box_predictor.BoxPredictor object. + + Raises: + ValueError: On unknown box predictor. 
+ """ + if not isinstance(box_predictor_config, box_predictor_pb2.BoxPredictor): + raise ValueError('box_predictor_config not of type ' + 'box_predictor_pb2.BoxPredictor.') + + box_predictor_oneof = box_predictor_config.WhichOneof('box_predictor_oneof') + + if box_predictor_oneof == 'convolutional_box_predictor': + conv_box_predictor = box_predictor_config.convolutional_box_predictor + conv_hyperparams_fn = argscope_fn(conv_box_predictor.conv_hyperparams, + is_training) + box_predictor_object = box_predictor.ConvolutionalBoxPredictor( + is_training=is_training, + num_classes=num_classes, + conv_hyperparams_fn=conv_hyperparams_fn, + min_depth=conv_box_predictor.min_depth, + max_depth=conv_box_predictor.max_depth, + num_layers_before_predictor=(conv_box_predictor. + num_layers_before_predictor), + use_dropout=conv_box_predictor.use_dropout, + dropout_keep_prob=conv_box_predictor.dropout_keep_probability, + kernel_size=conv_box_predictor.kernel_size, + box_code_size=conv_box_predictor.box_code_size, + apply_sigmoid_to_scores=conv_box_predictor.apply_sigmoid_to_scores, + class_prediction_bias_init=(conv_box_predictor. + class_prediction_bias_init), + use_depthwise=conv_box_predictor.use_depthwise + ) + return box_predictor_object + + if box_predictor_oneof == 'weight_shared_convolutional_box_predictor': + conv_box_predictor = (box_predictor_config. + weight_shared_convolutional_box_predictor) + conv_hyperparams_fn = argscope_fn(conv_box_predictor.conv_hyperparams, + is_training) + box_predictor_object = box_predictor.WeightSharedConvolutionalBoxPredictor( + is_training=is_training, + num_classes=num_classes, + conv_hyperparams_fn=conv_hyperparams_fn, + depth=conv_box_predictor.depth, + num_layers_before_predictor=( + conv_box_predictor.num_layers_before_predictor), + kernel_size=conv_box_predictor.kernel_size, + box_code_size=conv_box_predictor.box_code_size, + class_prediction_bias_init=conv_box_predictor. 
+ class_prediction_bias_init, + use_dropout=conv_box_predictor.use_dropout, + dropout_keep_prob=conv_box_predictor.dropout_keep_probability) + return box_predictor_object + + if box_predictor_oneof == 'mask_rcnn_box_predictor': + mask_rcnn_box_predictor = box_predictor_config.mask_rcnn_box_predictor + fc_hyperparams_fn = argscope_fn(mask_rcnn_box_predictor.fc_hyperparams, + is_training) + conv_hyperparams_fn = None + if mask_rcnn_box_predictor.HasField('conv_hyperparams'): + conv_hyperparams_fn = argscope_fn( + mask_rcnn_box_predictor.conv_hyperparams, is_training) + box_predictor_object = box_predictor.MaskRCNNBoxPredictor( + is_training=is_training, + num_classes=num_classes, + fc_hyperparams_fn=fc_hyperparams_fn, + use_dropout=mask_rcnn_box_predictor.use_dropout, + dropout_keep_prob=mask_rcnn_box_predictor.dropout_keep_probability, + box_code_size=mask_rcnn_box_predictor.box_code_size, + conv_hyperparams_fn=conv_hyperparams_fn, + predict_instance_masks=mask_rcnn_box_predictor.predict_instance_masks, + mask_height=mask_rcnn_box_predictor.mask_height, + mask_width=mask_rcnn_box_predictor.mask_width, + mask_prediction_num_conv_layers=( + mask_rcnn_box_predictor.mask_prediction_num_conv_layers), + mask_prediction_conv_depth=( + mask_rcnn_box_predictor.mask_prediction_conv_depth), + masks_are_class_agnostic=( + mask_rcnn_box_predictor.masks_are_class_agnostic), + predict_keypoints=mask_rcnn_box_predictor.predict_keypoints, + share_box_across_classes=( + mask_rcnn_box_predictor.share_box_across_classes)) + return box_predictor_object + + if box_predictor_oneof == 'rfcn_box_predictor': + rfcn_box_predictor = box_predictor_config.rfcn_box_predictor + conv_hyperparams_fn = argscope_fn(rfcn_box_predictor.conv_hyperparams, + is_training) + box_predictor_object = box_predictor.RfcnBoxPredictor( + is_training=is_training, + num_classes=num_classes, + conv_hyperparams_fn=conv_hyperparams_fn, + crop_size=[rfcn_box_predictor.crop_height, + rfcn_box_predictor.crop_width], + num_spatial_bins=[rfcn_box_predictor.num_spatial_bins_height, + rfcn_box_predictor.num_spatial_bins_width], + depth=rfcn_box_predictor.depth, + box_code_size=rfcn_box_predictor.box_code_size) + return box_predictor_object + raise ValueError('Unknown box predictor: {}'.format(box_predictor_oneof)) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/box_predictor_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/box_predictor_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..35ad57be9975bedb93a953d3189062ef0d8d5568 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/box_predictor_builder_test.py @@ -0,0 +1,514 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
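
A rough sketch of wiring this builder together with hyperparams_builder.build as the argscope_fn, following the shape of the tests below; the hyperparameter choices are illustrative only:

from google.protobuf import text_format

from object_detection.builders import box_predictor_builder
from object_detection.builders import hyperparams_builder
from object_detection.protos import box_predictor_pb2

box_predictor_text_proto = """
  mask_rcnn_box_predictor {
    fc_hyperparams {
      op: FC
      regularizer { l1_regularizer { } }
      initializer { truncated_normal_initializer { } }
    }
    use_dropout: false
    box_code_size: 4
  }
"""
box_predictor_proto = box_predictor_pb2.BoxPredictor()
text_format.Merge(box_predictor_text_proto, box_predictor_proto)
# Builds a box-only MaskRCNNBoxPredictor (no mask branch, since
# predict_instance_masks is left at its default of false).
box_predictor = box_predictor_builder.build(
    argscope_fn=hyperparams_builder.build,
    box_predictor_config=box_predictor_proto,
    is_training=True,
    num_classes=90)
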
+# ============================================================================== + +"""Tests for box_predictor_builder.""" +import mock +import tensorflow as tf + +from google.protobuf import text_format +from object_detection.builders import box_predictor_builder +from object_detection.builders import hyperparams_builder +from object_detection.protos import box_predictor_pb2 +from object_detection.protos import hyperparams_pb2 + + +class ConvolutionalBoxPredictorBuilderTest(tf.test.TestCase): + + def test_box_predictor_calls_conv_argscope_fn(self): + conv_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + weight: 0.0003 + } + } + initializer { + truncated_normal_initializer { + mean: 0.0 + stddev: 0.3 + } + } + activation: RELU_6 + """ + hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, hyperparams_proto) + def mock_conv_argscope_builder(conv_hyperparams_arg, is_training): + return (conv_hyperparams_arg, is_training) + + box_predictor_proto = box_predictor_pb2.BoxPredictor() + box_predictor_proto.convolutional_box_predictor.conv_hyperparams.CopyFrom( + hyperparams_proto) + box_predictor = box_predictor_builder.build( + argscope_fn=mock_conv_argscope_builder, + box_predictor_config=box_predictor_proto, + is_training=False, + num_classes=10) + (conv_hyperparams_actual, is_training) = box_predictor._conv_hyperparams_fn + self.assertAlmostEqual((hyperparams_proto.regularizer. + l1_regularizer.weight), + (conv_hyperparams_actual.regularizer.l1_regularizer. + weight)) + self.assertAlmostEqual((hyperparams_proto.initializer. + truncated_normal_initializer.stddev), + (conv_hyperparams_actual.initializer. + truncated_normal_initializer.stddev)) + self.assertAlmostEqual((hyperparams_proto.initializer. + truncated_normal_initializer.mean), + (conv_hyperparams_actual.initializer. 
+ truncated_normal_initializer.mean)) + self.assertEqual(hyperparams_proto.activation, + conv_hyperparams_actual.activation) + self.assertFalse(is_training) + + def test_construct_non_default_conv_box_predictor(self): + box_predictor_text_proto = """ + convolutional_box_predictor { + min_depth: 2 + max_depth: 16 + num_layers_before_predictor: 2 + use_dropout: false + dropout_keep_probability: 0.4 + kernel_size: 3 + box_code_size: 3 + apply_sigmoid_to_scores: true + class_prediction_bias_init: 4.0 + use_depthwise: true + } + """ + conv_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + """ + hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, hyperparams_proto) + def mock_conv_argscope_builder(conv_hyperparams_arg, is_training): + return (conv_hyperparams_arg, is_training) + + box_predictor_proto = box_predictor_pb2.BoxPredictor() + text_format.Merge(box_predictor_text_proto, box_predictor_proto) + box_predictor_proto.convolutional_box_predictor.conv_hyperparams.CopyFrom( + hyperparams_proto) + box_predictor = box_predictor_builder.build( + argscope_fn=mock_conv_argscope_builder, + box_predictor_config=box_predictor_proto, + is_training=False, + num_classes=10) + self.assertEqual(box_predictor._min_depth, 2) + self.assertEqual(box_predictor._max_depth, 16) + self.assertEqual(box_predictor._num_layers_before_predictor, 2) + self.assertFalse(box_predictor._use_dropout) + self.assertAlmostEqual(box_predictor._dropout_keep_prob, 0.4) + self.assertTrue(box_predictor._apply_sigmoid_to_scores) + self.assertAlmostEqual(box_predictor._class_prediction_bias_init, 4.0) + self.assertEqual(box_predictor.num_classes, 10) + self.assertFalse(box_predictor._is_training) + self.assertTrue(box_predictor._use_depthwise) + + def test_construct_default_conv_box_predictor(self): + box_predictor_text_proto = """ + convolutional_box_predictor { + conv_hyperparams { + regularizer { + l1_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + }""" + box_predictor_proto = box_predictor_pb2.BoxPredictor() + text_format.Merge(box_predictor_text_proto, box_predictor_proto) + box_predictor = box_predictor_builder.build( + argscope_fn=hyperparams_builder.build, + box_predictor_config=box_predictor_proto, + is_training=True, + num_classes=90) + self.assertEqual(box_predictor._min_depth, 0) + self.assertEqual(box_predictor._max_depth, 0) + self.assertEqual(box_predictor._num_layers_before_predictor, 0) + self.assertTrue(box_predictor._use_dropout) + self.assertAlmostEqual(box_predictor._dropout_keep_prob, 0.8) + self.assertFalse(box_predictor._apply_sigmoid_to_scores) + self.assertEqual(box_predictor.num_classes, 90) + self.assertTrue(box_predictor._is_training) + self.assertFalse(box_predictor._use_depthwise) + + +class WeightSharedConvolutionalBoxPredictorBuilderTest(tf.test.TestCase): + + def test_box_predictor_calls_conv_argscope_fn(self): + conv_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + weight: 0.0003 + } + } + initializer { + truncated_normal_initializer { + mean: 0.0 + stddev: 0.3 + } + } + activation: RELU_6 + """ + hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, hyperparams_proto) + def mock_conv_argscope_builder(conv_hyperparams_arg, is_training): + return (conv_hyperparams_arg, is_training) + + box_predictor_proto = box_predictor_pb2.BoxPredictor() + 
(box_predictor_proto.weight_shared_convolutional_box_predictor + .conv_hyperparams.CopyFrom(hyperparams_proto)) + box_predictor = box_predictor_builder.build( + argscope_fn=mock_conv_argscope_builder, + box_predictor_config=box_predictor_proto, + is_training=False, + num_classes=10) + (conv_hyperparams_actual, is_training) = box_predictor._conv_hyperparams_fn + self.assertAlmostEqual((hyperparams_proto.regularizer. + l1_regularizer.weight), + (conv_hyperparams_actual.regularizer.l1_regularizer. + weight)) + self.assertAlmostEqual((hyperparams_proto.initializer. + truncated_normal_initializer.stddev), + (conv_hyperparams_actual.initializer. + truncated_normal_initializer.stddev)) + self.assertAlmostEqual((hyperparams_proto.initializer. + truncated_normal_initializer.mean), + (conv_hyperparams_actual.initializer. + truncated_normal_initializer.mean)) + self.assertEqual(hyperparams_proto.activation, + conv_hyperparams_actual.activation) + self.assertFalse(is_training) + + def test_construct_non_default_conv_box_predictor(self): + box_predictor_text_proto = """ + weight_shared_convolutional_box_predictor { + depth: 2 + num_layers_before_predictor: 2 + kernel_size: 7 + box_code_size: 3 + class_prediction_bias_init: 4.0 + } + """ + conv_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + """ + hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, hyperparams_proto) + def mock_conv_argscope_builder(conv_hyperparams_arg, is_training): + return (conv_hyperparams_arg, is_training) + + box_predictor_proto = box_predictor_pb2.BoxPredictor() + text_format.Merge(box_predictor_text_proto, box_predictor_proto) + (box_predictor_proto.weight_shared_convolutional_box_predictor. 
+ conv_hyperparams.CopyFrom(hyperparams_proto)) + box_predictor = box_predictor_builder.build( + argscope_fn=mock_conv_argscope_builder, + box_predictor_config=box_predictor_proto, + is_training=False, + num_classes=10) + self.assertEqual(box_predictor._depth, 2) + self.assertEqual(box_predictor._num_layers_before_predictor, 2) + self.assertAlmostEqual(box_predictor._class_prediction_bias_init, 4.0) + self.assertEqual(box_predictor.num_classes, 10) + self.assertFalse(box_predictor._is_training) + + def test_construct_default_conv_box_predictor(self): + box_predictor_text_proto = """ + weight_shared_convolutional_box_predictor { + conv_hyperparams { + regularizer { + l1_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + }""" + box_predictor_proto = box_predictor_pb2.BoxPredictor() + text_format.Merge(box_predictor_text_proto, box_predictor_proto) + box_predictor = box_predictor_builder.build( + argscope_fn=hyperparams_builder.build, + box_predictor_config=box_predictor_proto, + is_training=True, + num_classes=90) + self.assertEqual(box_predictor._depth, 0) + self.assertEqual(box_predictor._num_layers_before_predictor, 0) + self.assertEqual(box_predictor.num_classes, 90) + self.assertTrue(box_predictor._is_training) + + +class MaskRCNNBoxPredictorBuilderTest(tf.test.TestCase): + + def test_box_predictor_builder_calls_fc_argscope_fn(self): + fc_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + weight: 0.0003 + } + } + initializer { + truncated_normal_initializer { + mean: 0.0 + stddev: 0.3 + } + } + activation: RELU_6 + op: FC + """ + hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(fc_hyperparams_text_proto, hyperparams_proto) + box_predictor_proto = box_predictor_pb2.BoxPredictor() + box_predictor_proto.mask_rcnn_box_predictor.fc_hyperparams.CopyFrom( + hyperparams_proto) + mock_argscope_fn = mock.Mock(return_value='arg_scope') + box_predictor = box_predictor_builder.build( + argscope_fn=mock_argscope_fn, + box_predictor_config=box_predictor_proto, + is_training=False, + num_classes=10) + mock_argscope_fn.assert_called_with(hyperparams_proto, False) + self.assertEqual(box_predictor._fc_hyperparams_fn, 'arg_scope') + + def test_non_default_mask_rcnn_box_predictor(self): + fc_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + activation: RELU_6 + op: FC + """ + box_predictor_text_proto = """ + mask_rcnn_box_predictor { + use_dropout: true + dropout_keep_probability: 0.8 + box_code_size: 3 + share_box_across_classes: true + } + """ + hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(fc_hyperparams_text_proto, hyperparams_proto) + def mock_fc_argscope_builder(fc_hyperparams_arg, is_training): + return (fc_hyperparams_arg, is_training) + + box_predictor_proto = box_predictor_pb2.BoxPredictor() + text_format.Merge(box_predictor_text_proto, box_predictor_proto) + box_predictor_proto.mask_rcnn_box_predictor.fc_hyperparams.CopyFrom( + hyperparams_proto) + box_predictor = box_predictor_builder.build( + argscope_fn=mock_fc_argscope_builder, + box_predictor_config=box_predictor_proto, + is_training=True, + num_classes=90) + self.assertTrue(box_predictor._use_dropout) + self.assertAlmostEqual(box_predictor._dropout_keep_prob, 0.8) + self.assertEqual(box_predictor.num_classes, 90) + self.assertTrue(box_predictor._is_training) + self.assertEqual(box_predictor._box_code_size, 3) + self.assertEqual(box_predictor._share_box_across_classes, 
True) + + def test_build_default_mask_rcnn_box_predictor(self): + box_predictor_proto = box_predictor_pb2.BoxPredictor() + box_predictor_proto.mask_rcnn_box_predictor.fc_hyperparams.op = ( + hyperparams_pb2.Hyperparams.FC) + box_predictor = box_predictor_builder.build( + argscope_fn=mock.Mock(return_value='arg_scope'), + box_predictor_config=box_predictor_proto, + is_training=True, + num_classes=90) + self.assertFalse(box_predictor._use_dropout) + self.assertAlmostEqual(box_predictor._dropout_keep_prob, 0.5) + self.assertEqual(box_predictor.num_classes, 90) + self.assertTrue(box_predictor._is_training) + self.assertEqual(box_predictor._box_code_size, 4) + self.assertFalse(box_predictor._predict_instance_masks) + self.assertFalse(box_predictor._predict_keypoints) + + def test_build_box_predictor_with_mask_branch(self): + box_predictor_proto = box_predictor_pb2.BoxPredictor() + box_predictor_proto.mask_rcnn_box_predictor.fc_hyperparams.op = ( + hyperparams_pb2.Hyperparams.FC) + box_predictor_proto.mask_rcnn_box_predictor.conv_hyperparams.op = ( + hyperparams_pb2.Hyperparams.CONV) + box_predictor_proto.mask_rcnn_box_predictor.predict_instance_masks = True + box_predictor_proto.mask_rcnn_box_predictor.mask_prediction_conv_depth = 512 + box_predictor_proto.mask_rcnn_box_predictor.mask_height = 16 + box_predictor_proto.mask_rcnn_box_predictor.mask_width = 16 + mock_argscope_fn = mock.Mock(return_value='arg_scope') + box_predictor = box_predictor_builder.build( + argscope_fn=mock_argscope_fn, + box_predictor_config=box_predictor_proto, + is_training=True, + num_classes=90) + mock_argscope_fn.assert_has_calls( + [mock.call(box_predictor_proto.mask_rcnn_box_predictor.fc_hyperparams, + True), + mock.call(box_predictor_proto.mask_rcnn_box_predictor.conv_hyperparams, + True)], any_order=True) + self.assertFalse(box_predictor._use_dropout) + self.assertAlmostEqual(box_predictor._dropout_keep_prob, 0.5) + self.assertEqual(box_predictor.num_classes, 90) + self.assertTrue(box_predictor._is_training) + self.assertEqual(box_predictor._box_code_size, 4) + self.assertTrue(box_predictor._predict_instance_masks) + self.assertEqual(box_predictor._mask_prediction_conv_depth, 512) + self.assertFalse(box_predictor._predict_keypoints) + + +class RfcnBoxPredictorBuilderTest(tf.test.TestCase): + + def test_box_predictor_calls_fc_argscope_fn(self): + conv_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + weight: 0.0003 + } + } + initializer { + truncated_normal_initializer { + mean: 0.0 + stddev: 0.3 + } + } + activation: RELU_6 + """ + hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, hyperparams_proto) + def mock_conv_argscope_builder(conv_hyperparams_arg, is_training): + return (conv_hyperparams_arg, is_training) + + box_predictor_proto = box_predictor_pb2.BoxPredictor() + box_predictor_proto.rfcn_box_predictor.conv_hyperparams.CopyFrom( + hyperparams_proto) + box_predictor = box_predictor_builder.build( + argscope_fn=mock_conv_argscope_builder, + box_predictor_config=box_predictor_proto, + is_training=False, + num_classes=10) + (conv_hyperparams_actual, is_training) = box_predictor._conv_hyperparams_fn + self.assertAlmostEqual((hyperparams_proto.regularizer. + l1_regularizer.weight), + (conv_hyperparams_actual.regularizer.l1_regularizer. + weight)) + self.assertAlmostEqual((hyperparams_proto.initializer. + truncated_normal_initializer.stddev), + (conv_hyperparams_actual.initializer. 
+ truncated_normal_initializer.stddev)) + self.assertAlmostEqual((hyperparams_proto.initializer. + truncated_normal_initializer.mean), + (conv_hyperparams_actual.initializer. + truncated_normal_initializer.mean)) + self.assertEqual(hyperparams_proto.activation, + conv_hyperparams_actual.activation) + self.assertFalse(is_training) + + def test_non_default_rfcn_box_predictor(self): + conv_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + activation: RELU_6 + """ + box_predictor_text_proto = """ + rfcn_box_predictor { + num_spatial_bins_height: 4 + num_spatial_bins_width: 4 + depth: 4 + box_code_size: 3 + crop_height: 16 + crop_width: 16 + } + """ + hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, hyperparams_proto) + def mock_conv_argscope_builder(conv_hyperparams_arg, is_training): + return (conv_hyperparams_arg, is_training) + + box_predictor_proto = box_predictor_pb2.BoxPredictor() + text_format.Merge(box_predictor_text_proto, box_predictor_proto) + box_predictor_proto.rfcn_box_predictor.conv_hyperparams.CopyFrom( + hyperparams_proto) + box_predictor = box_predictor_builder.build( + argscope_fn=mock_conv_argscope_builder, + box_predictor_config=box_predictor_proto, + is_training=True, + num_classes=90) + self.assertEqual(box_predictor.num_classes, 90) + self.assertTrue(box_predictor._is_training) + self.assertEqual(box_predictor._box_code_size, 3) + self.assertEqual(box_predictor._num_spatial_bins, [4, 4]) + self.assertEqual(box_predictor._crop_size, [16, 16]) + + def test_default_rfcn_box_predictor(self): + conv_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + activation: RELU_6 + """ + hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, hyperparams_proto) + def mock_conv_argscope_builder(conv_hyperparams_arg, is_training): + return (conv_hyperparams_arg, is_training) + + box_predictor_proto = box_predictor_pb2.BoxPredictor() + box_predictor_proto.rfcn_box_predictor.conv_hyperparams.CopyFrom( + hyperparams_proto) + box_predictor = box_predictor_builder.build( + argscope_fn=mock_conv_argscope_builder, + box_predictor_config=box_predictor_proto, + is_training=True, + num_classes=90) + self.assertEqual(box_predictor.num_classes, 90) + self.assertTrue(box_predictor._is_training) + self.assertEqual(box_predictor._box_code_size, 4) + self.assertEqual(box_predictor._num_spatial_bins, [3, 3]) + self.assertEqual(box_predictor._crop_size, [12, 12]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/dataset_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/dataset_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..3628a85ea3ec33373e0642244a6a96984677358b --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/dataset_builder.py @@ -0,0 +1,196 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""tf.data.Dataset builder. + +Creates data sources for DetectionModels from an InputReader config. See +input_reader.proto for options. + +Note: If users wishes to also use their own InputReaders with the Object +Detection configuration framework, they should define their own builder function +that wraps the build function. +""" +import functools +import tensorflow as tf + +from object_detection.core import standard_fields as fields +from object_detection.data_decoders import tf_example_decoder +from object_detection.protos import input_reader_pb2 +from object_detection.utils import dataset_util + + +def _get_padding_shapes(dataset, max_num_boxes=None, num_classes=None, + spatial_image_shape=None): + """Returns shapes to pad dataset tensors to before batching. + + Args: + dataset: tf.data.Dataset object. + max_num_boxes: Max number of groundtruth boxes needed to computes shapes for + padding. + num_classes: Number of classes in the dataset needed to compute shapes for + padding. + spatial_image_shape: A list of two integers of the form [height, width] + containing expected spatial shape of the image. + + Returns: + A dictionary keyed by fields.InputDataFields containing padding shapes for + tensors in the dataset. + + Raises: + ValueError: If groundtruth classes is neither rank 1 nor rank 2. + """ + + if not spatial_image_shape or spatial_image_shape == [-1, -1]: + height, width = None, None + else: + height, width = spatial_image_shape # pylint: disable=unpacking-non-sequence + + num_additional_channels = 0 + if fields.InputDataFields.image_additional_channels in dataset.output_shapes: + num_additional_channels = dataset.output_shapes[ + fields.InputDataFields.image_additional_channels].dims[2].value + padding_shapes = { + # Additional channels are merged before batching. + fields.InputDataFields.image: [ + height, width, 3 + num_additional_channels + ], + fields.InputDataFields.image_additional_channels: [ + height, width, num_additional_channels + ], + fields.InputDataFields.source_id: [], + fields.InputDataFields.filename: [], + fields.InputDataFields.key: [], + fields.InputDataFields.groundtruth_difficult: [max_num_boxes], + fields.InputDataFields.groundtruth_boxes: [max_num_boxes, 4], + fields.InputDataFields.groundtruth_instance_masks: [ + max_num_boxes, height, width + ], + fields.InputDataFields.groundtruth_is_crowd: [max_num_boxes], + fields.InputDataFields.groundtruth_group_of: [max_num_boxes], + fields.InputDataFields.groundtruth_area: [max_num_boxes], + fields.InputDataFields.groundtruth_weights: [max_num_boxes], + fields.InputDataFields.num_groundtruth_boxes: [], + fields.InputDataFields.groundtruth_label_types: [max_num_boxes], + fields.InputDataFields.groundtruth_label_scores: [max_num_boxes], + fields.InputDataFields.true_image_shape: [3], + fields.InputDataFields.multiclass_scores: [ + max_num_boxes, num_classes + 1 if num_classes is not None else None + ], + } + # Determine whether groundtruth_classes are integers or one-hot encodings, and + # apply batching appropriately. 
+ classes_shape = dataset.output_shapes[ + fields.InputDataFields.groundtruth_classes] + if len(classes_shape) == 1: # Class integers. + padding_shapes[fields.InputDataFields.groundtruth_classes] = [max_num_boxes] + elif len(classes_shape) == 2: # One-hot or k-hot encoding. + padding_shapes[fields.InputDataFields.groundtruth_classes] = [ + max_num_boxes, num_classes] + else: + raise ValueError('Groundtruth classes must be a rank 1 tensor (classes) or ' + 'rank 2 tensor (one-hot encodings)') + + if fields.InputDataFields.original_image in dataset.output_shapes: + padding_shapes[fields.InputDataFields.original_image] = [ + None, None, 3 + num_additional_channels + ] + if fields.InputDataFields.groundtruth_keypoints in dataset.output_shapes: + tensor_shape = dataset.output_shapes[fields.InputDataFields. + groundtruth_keypoints] + padding_shape = [max_num_boxes, tensor_shape[1].value, + tensor_shape[2].value] + padding_shapes[fields.InputDataFields.groundtruth_keypoints] = padding_shape + if (fields.InputDataFields.groundtruth_keypoint_visibilities + in dataset.output_shapes): + tensor_shape = dataset.output_shapes[fields.InputDataFields. + groundtruth_keypoint_visibilities] + padding_shape = [max_num_boxes, tensor_shape[1].value] + padding_shapes[fields.InputDataFields. + groundtruth_keypoint_visibilities] = padding_shape + return {tensor_key: padding_shapes[tensor_key] + for tensor_key, _ in dataset.output_shapes.items()} + + +def build(input_reader_config, + transform_input_data_fn=None, + batch_size=None, + max_num_boxes=None, + num_classes=None, + spatial_image_shape=None, + num_additional_channels=0): + """Builds a tf.data.Dataset. + + Builds a tf.data.Dataset by applying the `transform_input_data_fn` on all + records. Applies a padded batch to the resulting dataset. + + Args: + input_reader_config: A input_reader_pb2.InputReader object. + transform_input_data_fn: Function to apply to all records, or None if + no extra decoding is required. + batch_size: Batch size. If None, batching is not performed. + max_num_boxes: Max number of groundtruth boxes needed to compute shapes for + padding. If None, will use a dynamic shape. + num_classes: Number of classes in the dataset needed to compute shapes for + padding. If None, will use a dynamic shape. + spatial_image_shape: A list of two integers of the form [height, width] + containing expected spatial shape of the image after applying + transform_input_data_fn. If None, will use dynamic shapes. + num_additional_channels: Number of additional channels to use in the input. + + Returns: + A tf.data.Dataset based on the input_reader_config. + + Raises: + ValueError: On invalid input reader proto. + ValueError: If no input paths are specified. 
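+
+  Example (a sketch; the values shown are placeholders):
+    input_reader_proto = input_reader_pb2.InputReader()
+    # Populate input_reader_proto, e.g. via text_format.Merge with a config
+    # that sets tf_record_input_reader.input_path.
+    dataset = build(input_reader_proto, batch_size=2, max_num_boxes=100,
+                    num_classes=90, spatial_image_shape=[800, 1365])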
+ """ + if not isinstance(input_reader_config, input_reader_pb2.InputReader): + raise ValueError('input_reader_config not of type ' + 'input_reader_pb2.InputReader.') + + if input_reader_config.WhichOneof('input_reader') == 'tf_record_input_reader': + config = input_reader_config.tf_record_input_reader + if not config.input_path: + raise ValueError('At least one input path must be specified in ' + '`input_reader_config`.') + + label_map_proto_file = None + if input_reader_config.HasField('label_map_path'): + label_map_proto_file = input_reader_config.label_map_path + decoder = tf_example_decoder.TfExampleDecoder( + load_instance_masks=input_reader_config.load_instance_masks, + instance_mask_type=input_reader_config.mask_type, + label_map_proto_file=label_map_proto_file, + use_display_name=input_reader_config.use_display_name, + num_additional_channels=num_additional_channels) + + def process_fn(value): + processed = decoder.decode(value) + if transform_input_data_fn is not None: + return transform_input_data_fn(processed) + return processed + + dataset = dataset_util.read_dataset( + functools.partial(tf.data.TFRecordDataset, buffer_size=8 * 1000 * 1000), + process_fn, config.input_path[:], input_reader_config) + + if batch_size: + padding_shapes = _get_padding_shapes(dataset, max_num_boxes, num_classes, + spatial_image_shape) + dataset = dataset.apply( + tf.contrib.data.padded_batch_and_drop_remainder(batch_size, + padding_shapes)) + return dataset + + raise ValueError('Unsupported input_reader_config.') diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/dataset_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/dataset_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0f1360f5e18892ebf4155407e65b46e12e69a96a --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/dataset_builder_test.py @@ -0,0 +1,260 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for dataset_builder.""" + +import os +import numpy as np +import tensorflow as tf + +from google.protobuf import text_format + +from tensorflow.core.example import example_pb2 +from tensorflow.core.example import feature_pb2 +from object_detection.builders import dataset_builder +from object_detection.core import standard_fields as fields +from object_detection.protos import input_reader_pb2 +from object_detection.utils import dataset_util + + +class DatasetBuilderTest(tf.test.TestCase): + + def create_tf_record(self, has_additional_channels=False): + path = os.path.join(self.get_temp_dir(), 'tfrecord') + writer = tf.python_io.TFRecordWriter(path) + + image_tensor = np.random.randint(255, size=(4, 5, 3)).astype(np.uint8) + additional_channels_tensor = np.random.randint( + 255, size=(4, 5, 1)).astype(np.uint8) + flat_mask = (4 * 5) * [1.0] + with self.test_session(): + encoded_jpeg = tf.image.encode_jpeg(tf.constant(image_tensor)).eval() + encoded_additional_channels_jpeg = tf.image.encode_jpeg( + tf.constant(additional_channels_tensor)).eval() + features = { + 'image/encoded': + feature_pb2.Feature( + bytes_list=feature_pb2.BytesList(value=[encoded_jpeg])), + 'image/format': + feature_pb2.Feature( + bytes_list=feature_pb2.BytesList(value=['jpeg'.encode('utf-8')]) + ), + 'image/height': + feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=[4])), + 'image/width': + feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=[5])), + 'image/object/bbox/xmin': + feature_pb2.Feature(float_list=feature_pb2.FloatList(value=[0.0])), + 'image/object/bbox/xmax': + feature_pb2.Feature(float_list=feature_pb2.FloatList(value=[1.0])), + 'image/object/bbox/ymin': + feature_pb2.Feature(float_list=feature_pb2.FloatList(value=[0.0])), + 'image/object/bbox/ymax': + feature_pb2.Feature(float_list=feature_pb2.FloatList(value=[1.0])), + 'image/object/class/label': + feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=[2])), + 'image/object/mask': + feature_pb2.Feature( + float_list=feature_pb2.FloatList(value=flat_mask)), + } + if has_additional_channels: + features['image/additional_channels/encoded'] = feature_pb2.Feature( + bytes_list=feature_pb2.BytesList( + value=[encoded_additional_channels_jpeg] * 2)) + example = example_pb2.Example( + features=feature_pb2.Features(feature=features)) + writer.write(example.SerializeToString()) + writer.close() + + return path + + def test_build_tf_record_input_reader(self): + tf_record_path = self.create_tf_record() + + input_reader_text_proto = """ + shuffle: false + num_readers: 1 + tf_record_input_reader {{ + input_path: '{0}' + }} + """.format(tf_record_path) + input_reader_proto = input_reader_pb2.InputReader() + text_format.Merge(input_reader_text_proto, input_reader_proto) + tensor_dict = dataset_util.make_initializable_iterator( + dataset_builder.build(input_reader_proto, batch_size=1)).get_next() + + sv = tf.train.Supervisor(logdir=self.get_temp_dir()) + with sv.prepare_or_wait_for_session() as sess: + sv.start_queue_runners(sess) + output_dict = sess.run(tensor_dict) + + self.assertTrue( + fields.InputDataFields.groundtruth_instance_masks not in output_dict) + self.assertEquals((1, 4, 5, 3), + output_dict[fields.InputDataFields.image].shape) + self.assertAllEqual([[2]], + output_dict[fields.InputDataFields.groundtruth_classes]) + self.assertEquals( + (1, 1, 4), output_dict[fields.InputDataFields.groundtruth_boxes].shape) + self.assertAllEqual( + [0.0, 0.0, 
1.0, 1.0], + output_dict[fields.InputDataFields.groundtruth_boxes][0][0]) + + def test_build_tf_record_input_reader_and_load_instance_masks(self): + tf_record_path = self.create_tf_record() + + input_reader_text_proto = """ + shuffle: false + num_readers: 1 + load_instance_masks: true + tf_record_input_reader {{ + input_path: '{0}' + }} + """.format(tf_record_path) + input_reader_proto = input_reader_pb2.InputReader() + text_format.Merge(input_reader_text_proto, input_reader_proto) + tensor_dict = dataset_util.make_initializable_iterator( + dataset_builder.build(input_reader_proto, batch_size=1)).get_next() + + sv = tf.train.Supervisor(logdir=self.get_temp_dir()) + with sv.prepare_or_wait_for_session() as sess: + sv.start_queue_runners(sess) + output_dict = sess.run(tensor_dict) + self.assertAllEqual( + (1, 1, 4, 5), + output_dict[fields.InputDataFields.groundtruth_instance_masks].shape) + + def test_build_tf_record_input_reader_with_batch_size_two(self): + tf_record_path = self.create_tf_record() + + input_reader_text_proto = """ + shuffle: false + num_readers: 1 + tf_record_input_reader {{ + input_path: '{0}' + }} + """.format(tf_record_path) + input_reader_proto = input_reader_pb2.InputReader() + text_format.Merge(input_reader_text_proto, input_reader_proto) + + def one_hot_class_encoding_fn(tensor_dict): + tensor_dict[fields.InputDataFields.groundtruth_classes] = tf.one_hot( + tensor_dict[fields.InputDataFields.groundtruth_classes] - 1, depth=3) + return tensor_dict + + tensor_dict = dataset_util.make_initializable_iterator( + dataset_builder.build( + input_reader_proto, + transform_input_data_fn=one_hot_class_encoding_fn, + batch_size=2, + max_num_boxes=2, + num_classes=3, + spatial_image_shape=[4, 5])).get_next() + + sv = tf.train.Supervisor(logdir=self.get_temp_dir()) + with sv.prepare_or_wait_for_session() as sess: + sv.start_queue_runners(sess) + output_dict = sess.run(tensor_dict) + + self.assertAllEqual([2, 4, 5, 3], + output_dict[fields.InputDataFields.image].shape) + self.assertAllEqual([2, 2, 3], + output_dict[fields.InputDataFields.groundtruth_classes]. + shape) + self.assertAllEqual([2, 2, 4], + output_dict[fields.InputDataFields.groundtruth_boxes]. 
+ shape) + self.assertAllEqual( + [[[0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0]], + [[0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0]]], + output_dict[fields.InputDataFields.groundtruth_boxes]) + + def test_build_tf_record_input_reader_with_batch_size_two_and_masks(self): + tf_record_path = self.create_tf_record() + + input_reader_text_proto = """ + shuffle: false + num_readers: 1 + load_instance_masks: true + tf_record_input_reader {{ + input_path: '{0}' + }} + """.format(tf_record_path) + input_reader_proto = input_reader_pb2.InputReader() + text_format.Merge(input_reader_text_proto, input_reader_proto) + + def one_hot_class_encoding_fn(tensor_dict): + tensor_dict[fields.InputDataFields.groundtruth_classes] = tf.one_hot( + tensor_dict[fields.InputDataFields.groundtruth_classes] - 1, depth=3) + return tensor_dict + + tensor_dict = dataset_util.make_initializable_iterator( + dataset_builder.build( + input_reader_proto, + transform_input_data_fn=one_hot_class_encoding_fn, + batch_size=2, + max_num_boxes=2, + num_classes=3, + spatial_image_shape=[4, 5])).get_next() + + sv = tf.train.Supervisor(logdir=self.get_temp_dir()) + with sv.prepare_or_wait_for_session() as sess: + sv.start_queue_runners(sess) + output_dict = sess.run(tensor_dict) + + self.assertAllEqual( + [2, 2, 4, 5], + output_dict[fields.InputDataFields.groundtruth_instance_masks].shape) + + def test_build_tf_record_input_reader_with_additional_channels(self): + tf_record_path = self.create_tf_record(has_additional_channels=True) + + input_reader_text_proto = """ + shuffle: false + num_readers: 1 + tf_record_input_reader {{ + input_path: '{0}' + }} + """.format(tf_record_path) + input_reader_proto = input_reader_pb2.InputReader() + text_format.Merge(input_reader_text_proto, input_reader_proto) + tensor_dict = dataset_util.make_initializable_iterator( + dataset_builder.build( + input_reader_proto, batch_size=2, + num_additional_channels=2)).get_next() + + sv = tf.train.Supervisor(logdir=self.get_temp_dir()) + with sv.prepare_or_wait_for_session() as sess: + sv.start_queue_runners(sess) + output_dict = sess.run(tensor_dict) + + self.assertEquals((2, 4, 5, 5), + output_dict[fields.InputDataFields.image].shape) + + def test_raises_error_with_no_input_paths(self): + input_reader_text_proto = """ + shuffle: false + num_readers: 1 + load_instance_masks: true + """ + input_reader_proto = input_reader_pb2.InputReader() + text_format.Merge(input_reader_text_proto, input_reader_proto) + with self.assertRaises(ValueError): + dataset_builder.build(input_reader_proto) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/graph_rewriter_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/graph_rewriter_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..77e60479bd8f6e6267acabcec9a4995ed1622959 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/graph_rewriter_builder.py @@ -0,0 +1,42 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions for quantized training and evaluation."""
+
+import tensorflow as tf
+
+
+def build(graph_rewriter_config, is_training):
+  """Returns a function that modifies the default graph based on options.
+
+  Args:
+    graph_rewriter_config: graph_rewriter_pb2.GraphRewriter proto.
+    is_training: whether in training or eval mode.
+  """
+  def graph_rewrite_fn():
+    """Function to quantize weights and activations of the default graph."""
+    if (graph_rewriter_config.quantization.weight_bits != 8 or
+        graph_rewriter_config.quantization.activation_bits != 8):
+      raise ValueError('Only 8bit quantization is supported')
+
+    # Quantize the graph by inserting quantize ops for weights and activations.
+    if is_training:
+      tf.contrib.quantize.create_training_graph(
+          input_graph=tf.get_default_graph(),
+          quant_delay=graph_rewriter_config.quantization.delay)
+    else:
+      tf.contrib.quantize.create_eval_graph(input_graph=tf.get_default_graph())
+
+    tf.contrib.layers.summarize_collection('quant_vars')
+  return graph_rewrite_fn
diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/graph_rewriter_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/graph_rewriter_builder_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f38d5a27df1e74674e74748687efdef191781f0
--- /dev/null
+++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/graph_rewriter_builder_test.py
@@ -0,0 +1,57 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +"""Tests for graph_rewriter_builder.""" +import mock +import tensorflow as tf +from object_detection.builders import graph_rewriter_builder +from object_detection.protos import graph_rewriter_pb2 + + +class QuantizationBuilderTest(tf.test.TestCase): + + def testQuantizationBuilderSetsUpCorrectTrainArguments(self): + with mock.patch.object( + tf.contrib.quantize, 'create_training_graph') as mock_quant_fn: + with mock.patch.object(tf.contrib.layers, + 'summarize_collection') as mock_summarize_col: + graph_rewriter_proto = graph_rewriter_pb2.GraphRewriter() + graph_rewriter_proto.quantization.delay = 10 + graph_rewriter_proto.quantization.weight_bits = 8 + graph_rewriter_proto.quantization.activation_bits = 8 + graph_rewrite_fn = graph_rewriter_builder.build( + graph_rewriter_proto, is_training=True) + graph_rewrite_fn() + _, kwargs = mock_quant_fn.call_args + self.assertEqual(kwargs['input_graph'], tf.get_default_graph()) + self.assertEqual(kwargs['quant_delay'], 10) + mock_summarize_col.assert_called_with('quant_vars') + + def testQuantizationBuilderSetsUpCorrectEvalArguments(self): + with mock.patch.object(tf.contrib.quantize, + 'create_eval_graph') as mock_quant_fn: + with mock.patch.object(tf.contrib.layers, + 'summarize_collection') as mock_summarize_col: + graph_rewriter_proto = graph_rewriter_pb2.GraphRewriter() + graph_rewriter_proto.quantization.delay = 10 + graph_rewrite_fn = graph_rewriter_builder.build( + graph_rewriter_proto, is_training=False) + graph_rewrite_fn() + _, kwargs = mock_quant_fn.call_args + self.assertEqual(kwargs['input_graph'], tf.get_default_graph()) + mock_summarize_col.assert_called_with('quant_vars') + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/hyperparams_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/hyperparams_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..05addddaafa5785aa0995fb58181841511a250bc --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/hyperparams_builder.py @@ -0,0 +1,182 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Builder function to construct tf-slim arg_scope for convolution, fc ops.""" +import tensorflow as tf + +from object_detection.protos import hyperparams_pb2 +from object_detection.utils import context_manager + +slim = tf.contrib.slim + + +def build(hyperparams_config, is_training): + """Builds tf-slim arg_scope for convolution ops based on the config. + + Returns an arg_scope to use for convolution ops containing weights + initializer, weights regularizer, activation function, batch norm function + and batch norm parameters based on the configuration. 
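+
+  For example (illustrative), a config with an l2_regularizer, a
+  truncated_normal_initializer and RELU_6 activation yields an arg_scope over
+  slim.conv2d, slim.separable_conv2d and slim.conv2d_transpose (or
+  slim.fully_connected when op is FC) carrying those settings.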
+ + Note that if the batch_norm parameteres are not specified in the config + (i.e. left to default) then batch norm is excluded from the arg_scope. + + The batch norm parameters are set for updates based on `is_training` argument + and conv_hyperparams_config.batch_norm.train parameter. During training, they + are updated only if batch_norm.train parameter is true. However, during eval, + no updates are made to the batch norm variables. In both cases, their current + values are used during forward pass. + + Args: + hyperparams_config: hyperparams.proto object containing + hyperparameters. + is_training: Whether the network is in training mode. + + Returns: + arg_scope_fn: A function to construct tf-slim arg_scope containing + hyperparameters for ops. + + Raises: + ValueError: if hyperparams_config is not of type hyperparams.Hyperparams. + """ + if not isinstance(hyperparams_config, + hyperparams_pb2.Hyperparams): + raise ValueError('hyperparams_config not of type ' + 'hyperparams_pb.Hyperparams.') + + batch_norm = None + batch_norm_params = None + if hyperparams_config.HasField('batch_norm'): + batch_norm = slim.batch_norm + batch_norm_params = _build_batch_norm_params( + hyperparams_config.batch_norm, is_training) + + affected_ops = [slim.conv2d, slim.separable_conv2d, slim.conv2d_transpose] + if hyperparams_config.HasField('op') and ( + hyperparams_config.op == hyperparams_pb2.Hyperparams.FC): + affected_ops = [slim.fully_connected] + def scope_fn(): + with (slim.arg_scope([slim.batch_norm], **batch_norm_params) + if batch_norm_params is not None else + context_manager.IdentityContextManager()): + with slim.arg_scope( + affected_ops, + weights_regularizer=_build_regularizer( + hyperparams_config.regularizer), + weights_initializer=_build_initializer( + hyperparams_config.initializer), + activation_fn=_build_activation_fn(hyperparams_config.activation), + normalizer_fn=batch_norm) as sc: + return sc + + return scope_fn + + +def _build_activation_fn(activation_fn): + """Builds a callable activation from config. + + Args: + activation_fn: hyperparams_pb2.Hyperparams.activation + + Returns: + Callable activation function. + + Raises: + ValueError: On unknown activation function. + """ + if activation_fn == hyperparams_pb2.Hyperparams.NONE: + return None + if activation_fn == hyperparams_pb2.Hyperparams.RELU: + return tf.nn.relu + if activation_fn == hyperparams_pb2.Hyperparams.RELU_6: + return tf.nn.relu6 + raise ValueError('Unknown activation function: {}'.format(activation_fn)) + + +def _build_regularizer(regularizer): + """Builds a tf-slim regularizer from config. + + Args: + regularizer: hyperparams_pb2.Hyperparams.regularizer proto. + + Returns: + tf-slim regularizer. + + Raises: + ValueError: On unknown regularizer. + """ + regularizer_oneof = regularizer.WhichOneof('regularizer_oneof') + if regularizer_oneof == 'l1_regularizer': + return slim.l1_regularizer(scale=float(regularizer.l1_regularizer.weight)) + if regularizer_oneof == 'l2_regularizer': + return slim.l2_regularizer(scale=float(regularizer.l2_regularizer.weight)) + raise ValueError('Unknown regularizer function: {}'.format(regularizer_oneof)) + + +def _build_initializer(initializer): + """Build a tf initializer from config. + + Args: + initializer: hyperparams_pb2.Hyperparams.regularizer proto. + + Returns: + tf initializer. + + Raises: + ValueError: On unknown initializer. 
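+
+  For example (values are illustrative), an initializer config containing a
+  truncated_normal_initializer with stddev 0.01 maps to
+  tf.truncated_normal_initializer with the configured mean and stddev.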
+ """ + initializer_oneof = initializer.WhichOneof('initializer_oneof') + if initializer_oneof == 'truncated_normal_initializer': + return tf.truncated_normal_initializer( + mean=initializer.truncated_normal_initializer.mean, + stddev=initializer.truncated_normal_initializer.stddev) + if initializer_oneof == 'random_normal_initializer': + return tf.random_normal_initializer( + mean=initializer.random_normal_initializer.mean, + stddev=initializer.random_normal_initializer.stddev) + if initializer_oneof == 'variance_scaling_initializer': + enum_descriptor = (hyperparams_pb2.VarianceScalingInitializer. + DESCRIPTOR.enum_types_by_name['Mode']) + mode = enum_descriptor.values_by_number[initializer. + variance_scaling_initializer. + mode].name + return slim.variance_scaling_initializer( + factor=initializer.variance_scaling_initializer.factor, + mode=mode, + uniform=initializer.variance_scaling_initializer.uniform) + raise ValueError('Unknown initializer function: {}'.format( + initializer_oneof)) + + +def _build_batch_norm_params(batch_norm, is_training): + """Build a dictionary of batch_norm params from config. + + Args: + batch_norm: hyperparams_pb2.ConvHyperparams.batch_norm proto. + is_training: Whether the models is in training mode. + + Returns: + A dictionary containing batch_norm parameters. + """ + batch_norm_params = { + 'decay': batch_norm.decay, + 'center': batch_norm.center, + 'scale': batch_norm.scale, + 'epsilon': batch_norm.epsilon, + # Remove is_training parameter from here and deprecate it in the proto + # once we refactor Faster RCNN models to set is_training through an outer + # arg_scope in the meta architecture. + 'is_training': is_training and batch_norm.train, + } + return batch_norm_params diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/hyperparams_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/hyperparams_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..943532fbebca2870e5035fa39becd994f6d0b1ca --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/hyperparams_builder_test.py @@ -0,0 +1,509 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests object_detection.core.hyperparams_builder.""" + +import numpy as np +import tensorflow as tf + +from google.protobuf import text_format + +from object_detection.builders import hyperparams_builder +from object_detection.protos import hyperparams_pb2 + +slim = tf.contrib.slim + + +def _get_scope_key(op): + return getattr(op, '_key_op', str(op)) + + +class HyperparamsBuilderTest(tf.test.TestCase): + + def test_default_arg_scope_has_conv2d_op(self): + conv_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + self.assertTrue(_get_scope_key(slim.conv2d) in scope) + + def test_default_arg_scope_has_separable_conv2d_op(self): + conv_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + self.assertTrue(_get_scope_key(slim.separable_conv2d) in scope) + + def test_default_arg_scope_has_conv2d_transpose_op(self): + conv_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + self.assertTrue(_get_scope_key(slim.conv2d_transpose) in scope) + + def test_explicit_fc_op_arg_scope_has_fully_connected_op(self): + conv_hyperparams_text_proto = """ + op: FC + regularizer { + l1_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + self.assertTrue(_get_scope_key(slim.fully_connected) in scope) + + def test_separable_conv2d_and_conv2d_and_transpose_have_same_parameters(self): + conv_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + kwargs_1, kwargs_2, kwargs_3 = scope.values() + self.assertDictEqual(kwargs_1, kwargs_2) + self.assertDictEqual(kwargs_1, kwargs_3) + + def test_return_l1_regularized_weights(self): + conv_hyperparams_text_proto = """ + regularizer { + l1_regularizer { + weight: 0.5 + } + } + initializer { + truncated_normal_initializer { + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + 
conv_scope_arguments = scope.values()[0] + regularizer = conv_scope_arguments['weights_regularizer'] + weights = np.array([1., -1, 4., 2.]) + with self.test_session() as sess: + result = sess.run(regularizer(tf.constant(weights))) + self.assertAllClose(np.abs(weights).sum() * 0.5, result) + + def test_return_l2_regularizer_weights(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + weight: 0.42 + } + } + initializer { + truncated_normal_initializer { + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + + regularizer = conv_scope_arguments['weights_regularizer'] + weights = np.array([1., -1, 4., 2.]) + with self.test_session() as sess: + result = sess.run(regularizer(tf.constant(weights))) + self.assertAllClose(np.power(weights, 2).sum() / 2.0 * 0.42, result) + + def test_return_non_default_batch_norm_params_with_train_during_train(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + batch_norm { + decay: 0.7 + center: false + scale: true + epsilon: 0.03 + train: true + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + self.assertEqual(conv_scope_arguments['normalizer_fn'], slim.batch_norm) + batch_norm_params = scope[_get_scope_key(slim.batch_norm)] + self.assertAlmostEqual(batch_norm_params['decay'], 0.7) + self.assertAlmostEqual(batch_norm_params['epsilon'], 0.03) + self.assertFalse(batch_norm_params['center']) + self.assertTrue(batch_norm_params['scale']) + self.assertTrue(batch_norm_params['is_training']) + + def test_return_batch_norm_params_with_notrain_during_eval(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + batch_norm { + decay: 0.7 + center: false + scale: true + epsilon: 0.03 + train: true + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=False) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + self.assertEqual(conv_scope_arguments['normalizer_fn'], slim.batch_norm) + batch_norm_params = scope[_get_scope_key(slim.batch_norm)] + self.assertAlmostEqual(batch_norm_params['decay'], 0.7) + self.assertAlmostEqual(batch_norm_params['epsilon'], 0.03) + self.assertFalse(batch_norm_params['center']) + self.assertTrue(batch_norm_params['scale']) + self.assertFalse(batch_norm_params['is_training']) + + def test_return_batch_norm_params_with_notrain_when_train_is_false(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + batch_norm { + decay: 0.7 + center: false + scale: true + epsilon: 0.03 + train: false + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = 
hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + self.assertEqual(conv_scope_arguments['normalizer_fn'], slim.batch_norm) + batch_norm_params = scope[_get_scope_key(slim.batch_norm)] + self.assertAlmostEqual(batch_norm_params['decay'], 0.7) + self.assertAlmostEqual(batch_norm_params['epsilon'], 0.03) + self.assertFalse(batch_norm_params['center']) + self.assertTrue(batch_norm_params['scale']) + self.assertFalse(batch_norm_params['is_training']) + + def test_do_not_use_batch_norm_if_default(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + self.assertEqual(conv_scope_arguments['normalizer_fn'], None) + + def test_use_none_activation(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + activation: NONE + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + self.assertEqual(conv_scope_arguments['activation_fn'], None) + + def test_use_relu_activation(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + activation: RELU + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + self.assertEqual(conv_scope_arguments['activation_fn'], tf.nn.relu) + + def test_use_relu_6_activation(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + activation: RELU_6 + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + self.assertEqual(conv_scope_arguments['activation_fn'], tf.nn.relu6) + + def _assert_variance_in_range(self, initializer, shape, variance, + tol=1e-2): + with tf.Graph().as_default() as g: + with self.test_session(graph=g) as sess: + var = tf.get_variable( + name='test', + shape=shape, + dtype=tf.float32, + initializer=initializer) + sess.run(tf.global_variables_initializer()) + values = sess.run(var) + self.assertAllClose(np.var(values), variance, tol, tol) + + def test_variance_in_range_with_variance_scaling_initializer_fan_in(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + variance_scaling_initializer { + factor: 2.0 + mode: FAN_IN + uniform: false + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + 
text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + initializer = conv_scope_arguments['weights_initializer'] + self._assert_variance_in_range(initializer, shape=[100, 40], + variance=2. / 100.) + + def test_variance_in_range_with_variance_scaling_initializer_fan_out(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + variance_scaling_initializer { + factor: 2.0 + mode: FAN_OUT + uniform: false + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + initializer = conv_scope_arguments['weights_initializer'] + self._assert_variance_in_range(initializer, shape=[100, 40], + variance=2. / 40.) + + def test_variance_in_range_with_variance_scaling_initializer_fan_avg(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + variance_scaling_initializer { + factor: 2.0 + mode: FAN_AVG + uniform: false + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + initializer = conv_scope_arguments['weights_initializer'] + self._assert_variance_in_range(initializer, shape=[100, 40], + variance=4. / (100. + 40.)) + + def test_variance_in_range_with_variance_scaling_initializer_uniform(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + variance_scaling_initializer { + factor: 2.0 + mode: FAN_IN + uniform: true + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + initializer = conv_scope_arguments['weights_initializer'] + self._assert_variance_in_range(initializer, shape=[100, 40], + variance=2. / 100.) 
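+
+  # Back-of-the-envelope check for the expected values below: a normal
+  # distribution truncated at two standard deviations keeps roughly 0.77 of
+  # its variance, so stddev 0.8 gives about 0.64 * 0.77 ~= 0.49 for the
+  # truncated-normal test, versus 0.64 for the plain random-normal test.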
+ + def test_variance_in_range_with_truncated_normal_initializer(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + mean: 0.0 + stddev: 0.8 + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + initializer = conv_scope_arguments['weights_initializer'] + self._assert_variance_in_range(initializer, shape=[100, 40], + variance=0.49, tol=1e-1) + + def test_variance_in_range_with_random_normal_initializer(self): + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + random_normal_initializer { + mean: 0.0 + stddev: 0.8 + } + } + """ + conv_hyperparams_proto = hyperparams_pb2.Hyperparams() + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams_proto) + scope_fn = hyperparams_builder.build(conv_hyperparams_proto, + is_training=True) + scope = scope_fn() + conv_scope_arguments = scope[_get_scope_key(slim.conv2d)] + initializer = conv_scope_arguments['weights_initializer'] + self._assert_variance_in_range(initializer, shape=[100, 40], + variance=0.64, tol=1e-1) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/image_resizer_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/image_resizer_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..3b3014f727e13d2bf671ea12f3ff30972cc67684 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/image_resizer_builder.py @@ -0,0 +1,115 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Builder function for image resizing operations.""" +import functools +import tensorflow as tf + +from object_detection.core import preprocessor +from object_detection.protos import image_resizer_pb2 + + +def _tf_resize_method(resize_method): + """Maps image resize method from enumeration type to TensorFlow. + + Args: + resize_method: The resize_method attribute of keep_aspect_ratio_resizer or + fixed_shape_resizer. + + Returns: + method: The corresponding TensorFlow ResizeMethod. + + Raises: + ValueError: if `resize_method` is of unknown type. 
+ """ + dict_method = { + image_resizer_pb2.BILINEAR: + tf.image.ResizeMethod.BILINEAR, + image_resizer_pb2.NEAREST_NEIGHBOR: + tf.image.ResizeMethod.NEAREST_NEIGHBOR, + image_resizer_pb2.BICUBIC: + tf.image.ResizeMethod.BICUBIC, + image_resizer_pb2.AREA: + tf.image.ResizeMethod.AREA + } + if resize_method in dict_method: + return dict_method[resize_method] + else: + raise ValueError('Unknown resize_method') + + +def build(image_resizer_config): + """Builds callable for image resizing operations. + + Args: + image_resizer_config: image_resizer.proto object containing parameters for + an image resizing operation. + + Returns: + image_resizer_fn: Callable for image resizing. This callable always takes + a rank-3 image tensor (corresponding to a single image) and returns a + rank-3 image tensor, possibly with new spatial dimensions. + + Raises: + ValueError: if `image_resizer_config` is of incorrect type. + ValueError: if `image_resizer_config.image_resizer_oneof` is of expected + type. + ValueError: if min_dimension > max_dimension when keep_aspect_ratio_resizer + is used. + """ + if not isinstance(image_resizer_config, image_resizer_pb2.ImageResizer): + raise ValueError('image_resizer_config not of type ' + 'image_resizer_pb2.ImageResizer.') + + image_resizer_oneof = image_resizer_config.WhichOneof('image_resizer_oneof') + if image_resizer_oneof == 'keep_aspect_ratio_resizer': + keep_aspect_ratio_config = image_resizer_config.keep_aspect_ratio_resizer + if not (keep_aspect_ratio_config.min_dimension <= + keep_aspect_ratio_config.max_dimension): + raise ValueError('min_dimension > max_dimension') + method = _tf_resize_method(keep_aspect_ratio_config.resize_method) + per_channel_pad_value = (0, 0, 0) + if keep_aspect_ratio_config.per_channel_pad_value: + per_channel_pad_value = tuple(keep_aspect_ratio_config. + per_channel_pad_value) + image_resizer_fn = functools.partial( + preprocessor.resize_to_range, + min_dimension=keep_aspect_ratio_config.min_dimension, + max_dimension=keep_aspect_ratio_config.max_dimension, + method=method, + pad_to_max_dimension=keep_aspect_ratio_config.pad_to_max_dimension, + per_channel_pad_value=per_channel_pad_value) + if not keep_aspect_ratio_config.convert_to_grayscale: + return image_resizer_fn + elif image_resizer_oneof == 'fixed_shape_resizer': + fixed_shape_resizer_config = image_resizer_config.fixed_shape_resizer + method = _tf_resize_method(fixed_shape_resizer_config.resize_method) + image_resizer_fn = functools.partial( + preprocessor.resize_image, + new_height=fixed_shape_resizer_config.height, + new_width=fixed_shape_resizer_config.width, + method=method) + if not fixed_shape_resizer_config.convert_to_grayscale: + return image_resizer_fn + else: + raise ValueError( + 'Invalid image resizer option: \'%s\'.' 
% image_resizer_oneof) + + def grayscale_image_resizer(image): + [resized_image, resized_image_shape] = image_resizer_fn(image) + grayscale_image = preprocessor.rgb_to_gray(resized_image) + grayscale_image_shape = tf.concat([resized_image_shape[:-1], [1]], 0) + return [grayscale_image, grayscale_image_shape] + + return functools.partial(grayscale_image_resizer) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/image_resizer_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/image_resizer_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..38f620e04050888c5f3b1c73cdab8942a99b9d57 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/image_resizer_builder_test.py @@ -0,0 +1,113 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for object_detection.builders.image_resizer_builder.""" +import numpy as np +import tensorflow as tf +from google.protobuf import text_format +from object_detection.builders import image_resizer_builder +from object_detection.protos import image_resizer_pb2 + + +class ImageResizerBuilderTest(tf.test.TestCase): + + def _shape_of_resized_random_image_given_text_proto(self, input_shape, + text_proto): + image_resizer_config = image_resizer_pb2.ImageResizer() + text_format.Merge(text_proto, image_resizer_config) + image_resizer_fn = image_resizer_builder.build(image_resizer_config) + images = tf.to_float( + tf.random_uniform(input_shape, minval=0, maxval=255, dtype=tf.int32)) + resized_images, _ = image_resizer_fn(images) + with self.test_session() as sess: + return sess.run(resized_images).shape + + def test_build_keep_aspect_ratio_resizer_returns_expected_shape(self): + image_resizer_text_proto = """ + keep_aspect_ratio_resizer { + min_dimension: 10 + max_dimension: 20 + } + """ + input_shape = (50, 25, 3) + expected_output_shape = (20, 10, 3) + output_shape = self._shape_of_resized_random_image_given_text_proto( + input_shape, image_resizer_text_proto) + self.assertEqual(output_shape, expected_output_shape) + + def test_build_keep_aspect_ratio_resizer_with_padding(self): + image_resizer_text_proto = """ + keep_aspect_ratio_resizer { + min_dimension: 10 + max_dimension: 20 + pad_to_max_dimension: true + per_channel_pad_value: 3 + per_channel_pad_value: 4 + per_channel_pad_value: 5 + } + """ + input_shape = (50, 25, 3) + expected_output_shape = (20, 20, 3) + output_shape = self._shape_of_resized_random_image_given_text_proto( + input_shape, image_resizer_text_proto) + self.assertEqual(output_shape, expected_output_shape) + + def test_built_fixed_shape_resizer_returns_expected_shape(self): + image_resizer_text_proto = """ + fixed_shape_resizer { + height: 10 + width: 20 + } + """ + input_shape = (50, 25, 3) + expected_output_shape = (10, 20, 3) + output_shape = 
self._shape_of_resized_random_image_given_text_proto( + input_shape, image_resizer_text_proto) + self.assertEqual(output_shape, expected_output_shape) + + def test_raises_error_on_invalid_input(self): + invalid_input = 'invalid_input' + with self.assertRaises(ValueError): + image_resizer_builder.build(invalid_input) + + def _resized_image_given_text_proto(self, image, text_proto): + image_resizer_config = image_resizer_pb2.ImageResizer() + text_format.Merge(text_proto, image_resizer_config) + image_resizer_fn = image_resizer_builder.build(image_resizer_config) + image_placeholder = tf.placeholder(tf.uint8, [1, None, None, 3]) + resized_image, _ = image_resizer_fn(image_placeholder) + with self.test_session() as sess: + return sess.run(resized_image, feed_dict={image_placeholder: image}) + + def test_fixed_shape_resizer_nearest_neighbor_method(self): + image_resizer_text_proto = """ + fixed_shape_resizer { + height: 1 + width: 1 + resize_method: NEAREST_NEIGHBOR + } + """ + image = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + image = np.expand_dims(image, axis=2) + image = np.tile(image, (1, 1, 3)) + image = np.expand_dims(image, axis=0) + resized_image = self._resized_image_given_text_proto( + image, image_resizer_text_proto) + vals = np.unique(resized_image).tolist() + self.assertEqual(len(vals), 1) + self.assertEqual(vals[0], 1) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/input_reader_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/input_reader_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..8cb5e2f05448f1817a7644f1a553eac1ee98ba17 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/input_reader_builder.py @@ -0,0 +1,76 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Input reader builder. + +Creates data sources for DetectionModels from an InputReader config. See +input_reader.proto for options. + +Note: If users wishes to also use their own InputReaders with the Object +Detection configuration framework, they should define their own builder function +that wraps the build function. +""" + +import tensorflow as tf + +from object_detection.data_decoders import tf_example_decoder +from object_detection.protos import input_reader_pb2 + +parallel_reader = tf.contrib.slim.parallel_reader + + +def build(input_reader_config): + """Builds a tensor dictionary based on the InputReader config. + + Args: + input_reader_config: A input_reader_pb2.InputReader object. + + Returns: + A tensor dict based on the input_reader_config. + + Raises: + ValueError: On invalid input reader proto. + ValueError: If no input paths are specified. 
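+
+  Typical usage (a sketch): populate an input_reader_pb2.InputReader, for
+  example via text_format.Merge with a config that sets
+  tf_record_input_reader.input_path, then call build and run the returned
+  tensor dict in a session to obtain decoded examples.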
+ """ + if not isinstance(input_reader_config, input_reader_pb2.InputReader): + raise ValueError('input_reader_config not of type ' + 'input_reader_pb2.InputReader.') + + if input_reader_config.WhichOneof('input_reader') == 'tf_record_input_reader': + config = input_reader_config.tf_record_input_reader + if not config.input_path: + raise ValueError('At least one input path must be specified in ' + '`input_reader_config`.') + _, string_tensor = parallel_reader.parallel_read( + config.input_path[:], # Convert `RepeatedScalarContainer` to list. + reader_class=tf.TFRecordReader, + num_epochs=(input_reader_config.num_epochs + if input_reader_config.num_epochs else None), + num_readers=input_reader_config.num_readers, + shuffle=input_reader_config.shuffle, + dtypes=[tf.string, tf.string], + capacity=input_reader_config.queue_capacity, + min_after_dequeue=input_reader_config.min_after_dequeue) + + label_map_proto_file = None + if input_reader_config.HasField('label_map_path'): + label_map_proto_file = input_reader_config.label_map_path + decoder = tf_example_decoder.TfExampleDecoder( + load_instance_masks=input_reader_config.load_instance_masks, + instance_mask_type=input_reader_config.mask_type, + label_map_proto_file=label_map_proto_file) + return decoder.decode(string_tensor) + + raise ValueError('Unsupported input_reader_config.') diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/input_reader_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/input_reader_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..f09f60e5777b133e5fa50840d63728f2de55c147 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/input_reader_builder_test.py @@ -0,0 +1,144 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for input_reader_builder.""" + +import os +import numpy as np +import tensorflow as tf + +from google.protobuf import text_format + +from tensorflow.core.example import example_pb2 +from tensorflow.core.example import feature_pb2 +from object_detection.builders import input_reader_builder +from object_detection.core import standard_fields as fields +from object_detection.protos import input_reader_pb2 + + +class InputReaderBuilderTest(tf.test.TestCase): + + def create_tf_record(self): + path = os.path.join(self.get_temp_dir(), 'tfrecord') + writer = tf.python_io.TFRecordWriter(path) + + image_tensor = np.random.randint(255, size=(4, 5, 3)).astype(np.uint8) + flat_mask = (4 * 5) * [1.0] + with self.test_session(): + encoded_jpeg = tf.image.encode_jpeg(tf.constant(image_tensor)).eval() + example = example_pb2.Example(features=feature_pb2.Features(feature={ + 'image/encoded': feature_pb2.Feature( + bytes_list=feature_pb2.BytesList(value=[encoded_jpeg])), + 'image/format': feature_pb2.Feature( + bytes_list=feature_pb2.BytesList(value=['jpeg'.encode('utf-8')])), + 'image/height': feature_pb2.Feature( + int64_list=feature_pb2.Int64List(value=[4])), + 'image/width': feature_pb2.Feature( + int64_list=feature_pb2.Int64List(value=[5])), + 'image/object/bbox/xmin': feature_pb2.Feature( + float_list=feature_pb2.FloatList(value=[0.0])), + 'image/object/bbox/xmax': feature_pb2.Feature( + float_list=feature_pb2.FloatList(value=[1.0])), + 'image/object/bbox/ymin': feature_pb2.Feature( + float_list=feature_pb2.FloatList(value=[0.0])), + 'image/object/bbox/ymax': feature_pb2.Feature( + float_list=feature_pb2.FloatList(value=[1.0])), + 'image/object/class/label': feature_pb2.Feature( + int64_list=feature_pb2.Int64List(value=[2])), + 'image/object/mask': feature_pb2.Feature( + float_list=feature_pb2.FloatList(value=flat_mask)), + })) + writer.write(example.SerializeToString()) + writer.close() + + return path + + def test_build_tf_record_input_reader(self): + tf_record_path = self.create_tf_record() + + input_reader_text_proto = """ + shuffle: false + num_readers: 1 + tf_record_input_reader {{ + input_path: '{0}' + }} + """.format(tf_record_path) + input_reader_proto = input_reader_pb2.InputReader() + text_format.Merge(input_reader_text_proto, input_reader_proto) + tensor_dict = input_reader_builder.build(input_reader_proto) + + sv = tf.train.Supervisor(logdir=self.get_temp_dir()) + with sv.prepare_or_wait_for_session() as sess: + sv.start_queue_runners(sess) + output_dict = sess.run(tensor_dict) + + self.assertTrue(fields.InputDataFields.groundtruth_instance_masks + not in output_dict) + self.assertEquals( + (4, 5, 3), output_dict[fields.InputDataFields.image].shape) + self.assertEquals( + [2], output_dict[fields.InputDataFields.groundtruth_classes]) + self.assertEquals( + (1, 4), output_dict[fields.InputDataFields.groundtruth_boxes].shape) + self.assertAllEqual( + [0.0, 0.0, 1.0, 1.0], + output_dict[fields.InputDataFields.groundtruth_boxes][0]) + + def test_build_tf_record_input_reader_and_load_instance_masks(self): + tf_record_path = self.create_tf_record() + + input_reader_text_proto = """ + shuffle: false + num_readers: 1 + load_instance_masks: true + tf_record_input_reader {{ + input_path: '{0}' + }} + """.format(tf_record_path) + input_reader_proto = input_reader_pb2.InputReader() + text_format.Merge(input_reader_text_proto, input_reader_proto) + tensor_dict = input_reader_builder.build(input_reader_proto) + + 
sv = tf.train.Supervisor(logdir=self.get_temp_dir()) + with sv.prepare_or_wait_for_session() as sess: + sv.start_queue_runners(sess) + output_dict = sess.run(tensor_dict) + + self.assertEquals( + (4, 5, 3), output_dict[fields.InputDataFields.image].shape) + self.assertEquals( + [2], output_dict[fields.InputDataFields.groundtruth_classes]) + self.assertEquals( + (1, 4), output_dict[fields.InputDataFields.groundtruth_boxes].shape) + self.assertAllEqual( + [0.0, 0.0, 1.0, 1.0], + output_dict[fields.InputDataFields.groundtruth_boxes][0]) + self.assertAllEqual( + (1, 4, 5), + output_dict[fields.InputDataFields.groundtruth_instance_masks].shape) + + def test_raises_error_with_no_input_paths(self): + input_reader_text_proto = """ + shuffle: false + num_readers: 1 + load_instance_masks: true + """ + input_reader_proto = input_reader_pb2.InputReader() + text_format.Merge(input_reader_text_proto, input_reader_proto) + with self.assertRaises(ValueError): + input_reader_builder.build(input_reader_proto) + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/losses_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/losses_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..e4f7a12400fc3ce8c90407943c4530da1cef9594 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/losses_builder.py @@ -0,0 +1,222 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""A function to build localization and classification losses from config.""" + +from object_detection.core import balanced_positive_negative_sampler as sampler +from object_detection.core import losses +from object_detection.protos import losses_pb2 + + +def build(loss_config): + """Build losses based on the config. + + Builds classification, localization losses and optionally a hard example miner + based on the config. + + Args: + loss_config: A losses_pb2.Loss object. + + Returns: + classification_loss: Classification loss object. + localization_loss: Localization loss object. + classification_weight: Classification loss weight. + localization_weight: Localization loss weight. + hard_example_miner: Hard example miner object. + random_example_sampler: BalancedPositiveNegativeSampler object. + + Raises: + ValueError: If hard_example_miner is used with sigmoid_focal_loss. + ValueError: If random_example_sampler is getting non-positive value as + desired positive example fraction. 
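+
+  Example (an illustrative sketch only; the text proto below is an assumed
+  minimal config, not one of the shipped configurations):
+
+    from google.protobuf import text_format
+    from object_detection.builders import losses_builder
+    from object_detection.protos import losses_pb2
+
+    loss_proto = losses_pb2.Loss()
+    text_format.Merge('''
+        localization_loss { weighted_l2 { } }
+        classification_loss { weighted_softmax { } }
+        ''', loss_proto)
+    # With no hard_example_miner or random_example_sampler configured, the
+    # last two returned values are None.
+    (classification_loss, localization_loss, classification_weight,
+     localization_weight, hard_example_miner,
+     random_example_sampler) = losses_builder.build(loss_proto)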
+ """ + classification_loss = _build_classification_loss( + loss_config.classification_loss) + localization_loss = _build_localization_loss( + loss_config.localization_loss) + classification_weight = loss_config.classification_weight + localization_weight = loss_config.localization_weight + hard_example_miner = None + if loss_config.HasField('hard_example_miner'): + if (loss_config.classification_loss.WhichOneof('classification_loss') == + 'weighted_sigmoid_focal'): + raise ValueError('HardExampleMiner should not be used with sigmoid focal ' + 'loss') + hard_example_miner = build_hard_example_miner( + loss_config.hard_example_miner, + classification_weight, + localization_weight) + random_example_sampler = None + if loss_config.HasField('random_example_sampler'): + if loss_config.random_example_sampler.positive_sample_fraction <= 0: + raise ValueError('RandomExampleSampler should not use non-positive' + 'value as positive sample fraction.') + random_example_sampler = sampler.BalancedPositiveNegativeSampler( + positive_fraction=loss_config.random_example_sampler. + positive_sample_fraction) + return (classification_loss, localization_loss, classification_weight, + localization_weight, hard_example_miner, random_example_sampler) + + +def build_hard_example_miner(config, + classification_weight, + localization_weight): + """Builds hard example miner based on the config. + + Args: + config: A losses_pb2.HardExampleMiner object. + classification_weight: Classification loss weight. + localization_weight: Localization loss weight. + + Returns: + Hard example miner. + + """ + loss_type = None + if config.loss_type == losses_pb2.HardExampleMiner.BOTH: + loss_type = 'both' + if config.loss_type == losses_pb2.HardExampleMiner.CLASSIFICATION: + loss_type = 'cls' + if config.loss_type == losses_pb2.HardExampleMiner.LOCALIZATION: + loss_type = 'loc' + + max_negatives_per_positive = None + num_hard_examples = None + if config.max_negatives_per_positive > 0: + max_negatives_per_positive = config.max_negatives_per_positive + if config.num_hard_examples > 0: + num_hard_examples = config.num_hard_examples + hard_example_miner = losses.HardExampleMiner( + num_hard_examples=num_hard_examples, + iou_threshold=config.iou_threshold, + loss_type=loss_type, + cls_loss_weight=classification_weight, + loc_loss_weight=localization_weight, + max_negatives_per_positive=max_negatives_per_positive, + min_negatives_per_image=config.min_negatives_per_image) + return hard_example_miner + + +def build_faster_rcnn_classification_loss(loss_config): + """Builds a classification loss for Faster RCNN based on the loss config. + + Args: + loss_config: A losses_pb2.ClassificationLoss object. + + Returns: + Loss based on the config. + + Raises: + ValueError: On invalid loss_config. + """ + if not isinstance(loss_config, losses_pb2.ClassificationLoss): + raise ValueError('loss_config not of type losses_pb2.ClassificationLoss.') + + loss_type = loss_config.WhichOneof('classification_loss') + + if loss_type == 'weighted_sigmoid': + return losses.WeightedSigmoidClassificationLoss() + if loss_type == 'weighted_softmax': + config = loss_config.weighted_softmax + return losses.WeightedSoftmaxClassificationLoss( + logit_scale=config.logit_scale) + if loss_type == 'weighted_logits_softmax': + config = loss_config.weighted_logits_softmax + return losses.WeightedSoftmaxClassificationAgainstLogitsLoss( + logit_scale=config.logit_scale) + + # By default, Faster RCNN second stage classifier uses Softmax loss + # with anchor-wise outputs. 
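+  # This branch is also the fall-through: any classification_loss type not
+  # handled above (or an unset oneof) is mapped to the softmax default.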
+ config = loss_config.weighted_softmax + return losses.WeightedSoftmaxClassificationLoss( + logit_scale=config.logit_scale) + + +def _build_localization_loss(loss_config): + """Builds a localization loss based on the loss config. + + Args: + loss_config: A losses_pb2.LocalizationLoss object. + + Returns: + Loss based on the config. + + Raises: + ValueError: On invalid loss_config. + """ + if not isinstance(loss_config, losses_pb2.LocalizationLoss): + raise ValueError('loss_config not of type losses_pb2.LocalizationLoss.') + + loss_type = loss_config.WhichOneof('localization_loss') + + if loss_type == 'weighted_l2': + return losses.WeightedL2LocalizationLoss() + + if loss_type == 'weighted_smooth_l1': + return losses.WeightedSmoothL1LocalizationLoss( + loss_config.weighted_smooth_l1.delta) + + if loss_type == 'weighted_iou': + return losses.WeightedIOULocalizationLoss() + + raise ValueError('Empty loss config.') + + +def _build_classification_loss(loss_config): + """Builds a classification loss based on the loss config. + + Args: + loss_config: A losses_pb2.ClassificationLoss object. + + Returns: + Loss based on the config. + + Raises: + ValueError: On invalid loss_config. + """ + if not isinstance(loss_config, losses_pb2.ClassificationLoss): + raise ValueError('loss_config not of type losses_pb2.ClassificationLoss.') + + loss_type = loss_config.WhichOneof('classification_loss') + + if loss_type == 'weighted_sigmoid': + return losses.WeightedSigmoidClassificationLoss() + + if loss_type == 'weighted_sigmoid_focal': + config = loss_config.weighted_sigmoid_focal + alpha = None + if config.HasField('alpha'): + alpha = config.alpha + return losses.SigmoidFocalClassificationLoss( + gamma=config.gamma, + alpha=alpha) + + if loss_type == 'weighted_softmax': + config = loss_config.weighted_softmax + return losses.WeightedSoftmaxClassificationLoss( + logit_scale=config.logit_scale) + + if loss_type == 'weighted_logits_softmax': + config = loss_config.weighted_logits_softmax + return losses.WeightedSoftmaxClassificationAgainstLogitsLoss( + logit_scale=config.logit_scale) + + if loss_type == 'bootstrapped_sigmoid': + config = loss_config.bootstrapped_sigmoid + return losses.BootstrappedSigmoidClassificationLoss( + alpha=config.alpha, + bootstrap_type=('hard' if config.hard_bootstrap else 'soft')) + + raise ValueError('Empty loss config.') diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/losses_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/losses_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..4dc4a754eca9a2180963da8dbb75afd9a520225f --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/losses_builder_test.py @@ -0,0 +1,488 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for losses_builder.""" + +import tensorflow as tf + +from google.protobuf import text_format +from object_detection.builders import losses_builder +from object_detection.core import losses +from object_detection.protos import losses_pb2 + + +class LocalizationLossBuilderTest(tf.test.TestCase): + + def test_build_weighted_l2_localization_loss(self): + losses_text_proto = """ + localization_loss { + weighted_l2 { + } + } + classification_loss { + weighted_softmax { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + _, localization_loss, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(localization_loss, + losses.WeightedL2LocalizationLoss)) + + def test_build_weighted_smooth_l1_localization_loss_default_delta(self): + losses_text_proto = """ + localization_loss { + weighted_smooth_l1 { + } + } + classification_loss { + weighted_softmax { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + _, localization_loss, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(localization_loss, + losses.WeightedSmoothL1LocalizationLoss)) + self.assertAlmostEqual(localization_loss._delta, 1.0) + + def test_build_weighted_smooth_l1_localization_loss_non_default_delta(self): + losses_text_proto = """ + localization_loss { + weighted_smooth_l1 { + delta: 0.1 + } + } + classification_loss { + weighted_softmax { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + _, localization_loss, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(localization_loss, + losses.WeightedSmoothL1LocalizationLoss)) + self.assertAlmostEqual(localization_loss._delta, 0.1) + + def test_build_weighted_iou_localization_loss(self): + losses_text_proto = """ + localization_loss { + weighted_iou { + } + } + classification_loss { + weighted_softmax { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + _, localization_loss, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(localization_loss, + losses.WeightedIOULocalizationLoss)) + + def test_anchorwise_output(self): + losses_text_proto = """ + localization_loss { + weighted_smooth_l1 { + } + } + classification_loss { + weighted_softmax { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + _, localization_loss, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(localization_loss, + losses.WeightedSmoothL1LocalizationLoss)) + predictions = tf.constant([[[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]]]) + targets = tf.constant([[[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]]]) + weights = tf.constant([[1.0, 1.0]]) + loss = localization_loss(predictions, targets, weights=weights) + self.assertEqual(loss.shape, [1, 2]) + + def test_raise_error_on_empty_localization_config(self): + losses_text_proto = """ + classification_loss { + weighted_softmax { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + with self.assertRaises(ValueError): + losses_builder._build_localization_loss(losses_proto) + + +class ClassificationLossBuilderTest(tf.test.TestCase): + + def test_build_weighted_sigmoid_classification_loss(self): + losses_text_proto = """ + classification_loss { + 
weighted_sigmoid { + } + } + localization_loss { + weighted_l2 { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + classification_loss, _, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(classification_loss, + losses.WeightedSigmoidClassificationLoss)) + + def test_build_weighted_sigmoid_focal_classification_loss(self): + losses_text_proto = """ + classification_loss { + weighted_sigmoid_focal { + } + } + localization_loss { + weighted_l2 { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + classification_loss, _, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(classification_loss, + losses.SigmoidFocalClassificationLoss)) + self.assertAlmostEqual(classification_loss._alpha, None) + self.assertAlmostEqual(classification_loss._gamma, 2.0) + + def test_build_weighted_sigmoid_focal_loss_non_default(self): + losses_text_proto = """ + classification_loss { + weighted_sigmoid_focal { + alpha: 0.25 + gamma: 3.0 + } + } + localization_loss { + weighted_l2 { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + classification_loss, _, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(classification_loss, + losses.SigmoidFocalClassificationLoss)) + self.assertAlmostEqual(classification_loss._alpha, 0.25) + self.assertAlmostEqual(classification_loss._gamma, 3.0) + + def test_build_weighted_softmax_classification_loss(self): + losses_text_proto = """ + classification_loss { + weighted_softmax { + } + } + localization_loss { + weighted_l2 { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + classification_loss, _, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(classification_loss, + losses.WeightedSoftmaxClassificationLoss)) + + def test_build_weighted_logits_softmax_classification_loss(self): + losses_text_proto = """ + classification_loss { + weighted_logits_softmax { + } + } + localization_loss { + weighted_l2 { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + classification_loss, _, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue( + isinstance(classification_loss, + losses.WeightedSoftmaxClassificationAgainstLogitsLoss)) + + def test_build_weighted_softmax_classification_loss_with_logit_scale(self): + losses_text_proto = """ + classification_loss { + weighted_softmax { + logit_scale: 2.0 + } + } + localization_loss { + weighted_l2 { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + classification_loss, _, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(classification_loss, + losses.WeightedSoftmaxClassificationLoss)) + + def test_build_bootstrapped_sigmoid_classification_loss(self): + losses_text_proto = """ + classification_loss { + bootstrapped_sigmoid { + alpha: 0.5 + } + } + localization_loss { + weighted_l2 { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + classification_loss, _, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(classification_loss, + losses.BootstrappedSigmoidClassificationLoss)) + + def test_anchorwise_output(self): + losses_text_proto = """ + classification_loss { + weighted_sigmoid { + anchorwise_output: true + } + } + 
localization_loss { + weighted_l2 { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + classification_loss, _, _, _, _, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(classification_loss, + losses.WeightedSigmoidClassificationLoss)) + predictions = tf.constant([[[0.0, 1.0, 0.0], [0.0, 0.5, 0.5]]]) + targets = tf.constant([[[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]]) + weights = tf.constant([[1.0, 1.0]]) + loss = classification_loss(predictions, targets, weights=weights) + self.assertEqual(loss.shape, [1, 2, 3]) + + def test_raise_error_on_empty_config(self): + losses_text_proto = """ + localization_loss { + weighted_l2 { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + with self.assertRaises(ValueError): + losses_builder.build(losses_proto) + + +class HardExampleMinerBuilderTest(tf.test.TestCase): + + def test_do_not_build_hard_example_miner_by_default(self): + losses_text_proto = """ + localization_loss { + weighted_l2 { + } + } + classification_loss { + weighted_softmax { + } + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + _, _, _, _, hard_example_miner, _ = losses_builder.build(losses_proto) + self.assertEqual(hard_example_miner, None) + + def test_build_hard_example_miner_for_classification_loss(self): + losses_text_proto = """ + localization_loss { + weighted_l2 { + } + } + classification_loss { + weighted_softmax { + } + } + hard_example_miner { + loss_type: CLASSIFICATION + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + _, _, _, _, hard_example_miner, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(hard_example_miner, losses.HardExampleMiner)) + self.assertEqual(hard_example_miner._loss_type, 'cls') + + def test_build_hard_example_miner_for_localization_loss(self): + losses_text_proto = """ + localization_loss { + weighted_l2 { + } + } + classification_loss { + weighted_softmax { + } + } + hard_example_miner { + loss_type: LOCALIZATION + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + _, _, _, _, hard_example_miner, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(hard_example_miner, losses.HardExampleMiner)) + self.assertEqual(hard_example_miner._loss_type, 'loc') + + def test_build_hard_example_miner_with_non_default_values(self): + losses_text_proto = """ + localization_loss { + weighted_l2 { + } + } + classification_loss { + weighted_softmax { + } + } + hard_example_miner { + num_hard_examples: 32 + iou_threshold: 0.5 + loss_type: LOCALIZATION + max_negatives_per_positive: 10 + min_negatives_per_image: 3 + } + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + _, _, _, _, hard_example_miner, _ = losses_builder.build(losses_proto) + self.assertTrue(isinstance(hard_example_miner, losses.HardExampleMiner)) + self.assertEqual(hard_example_miner._num_hard_examples, 32) + self.assertAlmostEqual(hard_example_miner._iou_threshold, 0.5) + self.assertEqual(hard_example_miner._max_negatives_per_positive, 10) + self.assertEqual(hard_example_miner._min_negatives_per_image, 3) + + +class LossBuilderTest(tf.test.TestCase): + + def test_build_all_loss_parameters(self): + losses_text_proto = """ + localization_loss { + weighted_l2 { + } + } + classification_loss { + weighted_softmax { + } + } + hard_example_miner { + } + classification_weight: 
0.8 + localization_weight: 0.2 + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + (classification_loss, localization_loss, + classification_weight, localization_weight, + hard_example_miner, _) = losses_builder.build(losses_proto) + self.assertTrue(isinstance(hard_example_miner, losses.HardExampleMiner)) + self.assertTrue(isinstance(classification_loss, + losses.WeightedSoftmaxClassificationLoss)) + self.assertTrue(isinstance(localization_loss, + losses.WeightedL2LocalizationLoss)) + self.assertAlmostEqual(classification_weight, 0.8) + self.assertAlmostEqual(localization_weight, 0.2) + + def test_raise_error_when_both_focal_loss_and_hard_example_miner(self): + losses_text_proto = """ + localization_loss { + weighted_l2 { + } + } + classification_loss { + weighted_sigmoid_focal { + } + } + hard_example_miner { + } + classification_weight: 0.8 + localization_weight: 0.2 + """ + losses_proto = losses_pb2.Loss() + text_format.Merge(losses_text_proto, losses_proto) + with self.assertRaises(ValueError): + losses_builder.build(losses_proto) + + +class FasterRcnnClassificationLossBuilderTest(tf.test.TestCase): + + def test_build_sigmoid_loss(self): + losses_text_proto = """ + weighted_sigmoid { + } + """ + losses_proto = losses_pb2.ClassificationLoss() + text_format.Merge(losses_text_proto, losses_proto) + classification_loss = losses_builder.build_faster_rcnn_classification_loss( + losses_proto) + self.assertTrue(isinstance(classification_loss, + losses.WeightedSigmoidClassificationLoss)) + + def test_build_softmax_loss(self): + losses_text_proto = """ + weighted_softmax { + } + """ + losses_proto = losses_pb2.ClassificationLoss() + text_format.Merge(losses_text_proto, losses_proto) + classification_loss = losses_builder.build_faster_rcnn_classification_loss( + losses_proto) + self.assertTrue(isinstance(classification_loss, + losses.WeightedSoftmaxClassificationLoss)) + + def test_build_logits_softmax_loss(self): + losses_text_proto = """ + weighted_logits_softmax { + } + """ + losses_proto = losses_pb2.ClassificationLoss() + text_format.Merge(losses_text_proto, losses_proto) + classification_loss = losses_builder.build_faster_rcnn_classification_loss( + losses_proto) + self.assertTrue( + isinstance(classification_loss, + losses.WeightedSoftmaxClassificationAgainstLogitsLoss)) + + def test_build_softmax_loss_by_default(self): + losses_text_proto = """ + """ + losses_proto = losses_pb2.ClassificationLoss() + text_format.Merge(losses_text_proto, losses_proto) + classification_loss = losses_builder.build_faster_rcnn_classification_loss( + losses_proto) + self.assertTrue(isinstance(classification_loss, + losses.WeightedSoftmaxClassificationLoss)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/matcher_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/matcher_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..d334f435372984eb78265d72b2bcdf63c45bde5b --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/matcher_builder.py @@ -0,0 +1,53 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""A function to build an object detection matcher from configuration.""" + +from object_detection.matchers import argmax_matcher +from object_detection.matchers import bipartite_matcher +from object_detection.protos import matcher_pb2 + + +def build(matcher_config): + """Builds a matcher object based on the matcher config. + + Args: + matcher_config: A matcher.proto object containing the config for the desired + Matcher. + + Returns: + Matcher based on the config. + + Raises: + ValueError: On empty matcher proto. + """ + if not isinstance(matcher_config, matcher_pb2.Matcher): + raise ValueError('matcher_config not of type matcher_pb2.Matcher.') + if matcher_config.WhichOneof('matcher_oneof') == 'argmax_matcher': + matcher = matcher_config.argmax_matcher + matched_threshold = unmatched_threshold = None + if not matcher.ignore_thresholds: + matched_threshold = matcher.matched_threshold + unmatched_threshold = matcher.unmatched_threshold + return argmax_matcher.ArgMaxMatcher( + matched_threshold=matched_threshold, + unmatched_threshold=unmatched_threshold, + negatives_lower_than_unmatched=matcher.negatives_lower_than_unmatched, + force_match_for_each_row=matcher.force_match_for_each_row, + use_matmul_gather=matcher.use_matmul_gather) + if matcher_config.WhichOneof('matcher_oneof') == 'bipartite_matcher': + matcher = matcher_config.bipartite_matcher + return bipartite_matcher.GreedyBipartiteMatcher(matcher.use_matmul_gather) + raise ValueError('Empty matcher.') diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/matcher_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/matcher_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..66854491192c1739855b9f2a428a2f29005ad866 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/matcher_builder_test.py @@ -0,0 +1,99 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for matcher_builder.""" + +import tensorflow as tf + +from google.protobuf import text_format +from object_detection.builders import matcher_builder +from object_detection.matchers import argmax_matcher +from object_detection.matchers import bipartite_matcher +from object_detection.protos import matcher_pb2 + + +class MatcherBuilderTest(tf.test.TestCase): + + def test_build_arg_max_matcher_with_defaults(self): + matcher_text_proto = """ + argmax_matcher { + } + """ + matcher_proto = matcher_pb2.Matcher() + text_format.Merge(matcher_text_proto, matcher_proto) + matcher_object = matcher_builder.build(matcher_proto) + self.assertTrue(isinstance(matcher_object, argmax_matcher.ArgMaxMatcher)) + self.assertAlmostEqual(matcher_object._matched_threshold, 0.5) + self.assertAlmostEqual(matcher_object._unmatched_threshold, 0.5) + self.assertTrue(matcher_object._negatives_lower_than_unmatched) + self.assertFalse(matcher_object._force_match_for_each_row) + + def test_build_arg_max_matcher_without_thresholds(self): + matcher_text_proto = """ + argmax_matcher { + ignore_thresholds: true + } + """ + matcher_proto = matcher_pb2.Matcher() + text_format.Merge(matcher_text_proto, matcher_proto) + matcher_object = matcher_builder.build(matcher_proto) + self.assertTrue(isinstance(matcher_object, argmax_matcher.ArgMaxMatcher)) + self.assertEqual(matcher_object._matched_threshold, None) + self.assertEqual(matcher_object._unmatched_threshold, None) + self.assertTrue(matcher_object._negatives_lower_than_unmatched) + self.assertFalse(matcher_object._force_match_for_each_row) + + def test_build_arg_max_matcher_with_non_default_parameters(self): + matcher_text_proto = """ + argmax_matcher { + matched_threshold: 0.7 + unmatched_threshold: 0.3 + negatives_lower_than_unmatched: false + force_match_for_each_row: true + use_matmul_gather: true + } + """ + matcher_proto = matcher_pb2.Matcher() + text_format.Merge(matcher_text_proto, matcher_proto) + matcher_object = matcher_builder.build(matcher_proto) + self.assertTrue(isinstance(matcher_object, argmax_matcher.ArgMaxMatcher)) + self.assertAlmostEqual(matcher_object._matched_threshold, 0.7) + self.assertAlmostEqual(matcher_object._unmatched_threshold, 0.3) + self.assertFalse(matcher_object._negatives_lower_than_unmatched) + self.assertTrue(matcher_object._force_match_for_each_row) + self.assertTrue(matcher_object._use_matmul_gather) + + def test_build_bipartite_matcher(self): + matcher_text_proto = """ + bipartite_matcher { + } + """ + matcher_proto = matcher_pb2.Matcher() + text_format.Merge(matcher_text_proto, matcher_proto) + matcher_object = matcher_builder.build(matcher_proto) + self.assertTrue( + isinstance(matcher_object, bipartite_matcher.GreedyBipartiteMatcher)) + + def test_raise_error_on_empty_matcher(self): + matcher_text_proto = """ + """ + matcher_proto = matcher_pb2.Matcher() + text_format.Merge(matcher_text_proto, matcher_proto) + with self.assertRaises(ValueError): + matcher_builder.build(matcher_proto) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/model_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/model_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..1ebdcb79f391726f2af9c85888d36a5f881f590e --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/model_builder.py @@ -0,0 +1,377 @@ 
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""A function to build a DetectionModel from configuration.""" +from object_detection.builders import anchor_generator_builder +from object_detection.builders import box_coder_builder +from object_detection.builders import box_predictor_builder +from object_detection.builders import hyperparams_builder +from object_detection.builders import image_resizer_builder +from object_detection.builders import losses_builder +from object_detection.builders import matcher_builder +from object_detection.builders import post_processing_builder +from object_detection.builders import region_similarity_calculator_builder as sim_calc +from object_detection.core import box_predictor +from object_detection.meta_architectures import faster_rcnn_meta_arch +from object_detection.meta_architectures import rfcn_meta_arch +from object_detection.meta_architectures import ssd_meta_arch +from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extractor as frcnn_inc_res +from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2 +from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas +from object_detection.models import faster_rcnn_pnas_feature_extractor as frcnn_pnas +from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1 +from object_detection.models import ssd_resnet_v1_fpn_feature_extractor as ssd_resnet_v1_fpn +from object_detection.models.embedded_ssd_mobilenet_v1_feature_extractor import EmbeddedSSDMobileNetV1FeatureExtractor +from object_detection.models.ssd_inception_v2_feature_extractor import SSDInceptionV2FeatureExtractor +from object_detection.models.ssd_inception_v3_feature_extractor import SSDInceptionV3FeatureExtractor +from object_detection.models.ssd_mobilenet_v1_feature_extractor import SSDMobileNetV1FeatureExtractor +from object_detection.models.ssd_mobilenet_v2_feature_extractor import SSDMobileNetV2FeatureExtractor +from object_detection.protos import model_pb2 + +# A map of names to SSD feature extractors. +SSD_FEATURE_EXTRACTOR_CLASS_MAP = { + 'ssd_inception_v2': SSDInceptionV2FeatureExtractor, + 'ssd_inception_v3': SSDInceptionV3FeatureExtractor, + 'ssd_mobilenet_v1': SSDMobileNetV1FeatureExtractor, + 'ssd_mobilenet_v2': SSDMobileNetV2FeatureExtractor, + 'ssd_resnet50_v1_fpn': ssd_resnet_v1_fpn.SSDResnet50V1FpnFeatureExtractor, + 'ssd_resnet101_v1_fpn': ssd_resnet_v1_fpn.SSDResnet101V1FpnFeatureExtractor, + 'ssd_resnet152_v1_fpn': ssd_resnet_v1_fpn.SSDResnet152V1FpnFeatureExtractor, + 'embedded_ssd_mobilenet_v1': EmbeddedSSDMobileNetV1FeatureExtractor, +} + +# A map of names to Faster R-CNN feature extractors. 
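+# Keys match the `type` string of the feature_extractor block in a
+# faster_rcnn model config; see _build_faster_rcnn_feature_extractor below.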
+FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP = { + 'faster_rcnn_nas': + frcnn_nas.FasterRCNNNASFeatureExtractor, + 'faster_rcnn_pnas': + frcnn_pnas.FasterRCNNPNASFeatureExtractor, + 'faster_rcnn_inception_resnet_v2': + frcnn_inc_res.FasterRCNNInceptionResnetV2FeatureExtractor, + 'faster_rcnn_inception_v2': + frcnn_inc_v2.FasterRCNNInceptionV2FeatureExtractor, + 'faster_rcnn_resnet50': + frcnn_resnet_v1.FasterRCNNResnet50FeatureExtractor, + 'faster_rcnn_resnet101': + frcnn_resnet_v1.FasterRCNNResnet101FeatureExtractor, + 'faster_rcnn_resnet152': + frcnn_resnet_v1.FasterRCNNResnet152FeatureExtractor, +} + + +def build(model_config, is_training, add_summaries=True, + add_background_class=True): + """Builds a DetectionModel based on the model config. + + Args: + model_config: A model.proto object containing the config for the desired + DetectionModel. + is_training: True if this model is being built for training purposes. + add_summaries: Whether to add tensorflow summaries in the model graph. + add_background_class: Whether to add an implicit background class to one-hot + encodings of groundtruth labels. Set to false if using groundtruth labels + with an explicit background class or using multiclass scores instead of + truth in the case of distillation. Ignored in the case of faster_rcnn. + Returns: + DetectionModel based on the config. + + Raises: + ValueError: On invalid meta architecture or model. + """ + if not isinstance(model_config, model_pb2.DetectionModel): + raise ValueError('model_config not of type model_pb2.DetectionModel.') + meta_architecture = model_config.WhichOneof('model') + if meta_architecture == 'ssd': + return _build_ssd_model(model_config.ssd, is_training, add_summaries, + add_background_class) + if meta_architecture == 'faster_rcnn': + return _build_faster_rcnn_model(model_config.faster_rcnn, is_training, + add_summaries) + raise ValueError('Unknown meta architecture: {}'.format(meta_architecture)) + + +def _build_ssd_feature_extractor(feature_extractor_config, is_training, + reuse_weights=None): + """Builds a ssd_meta_arch.SSDFeatureExtractor based on config. + + Args: + feature_extractor_config: A SSDFeatureExtractor proto config from ssd.proto. + is_training: True if this feature extractor is being built for training. + reuse_weights: if the feature extractor should reuse weights. + + Returns: + ssd_meta_arch.SSDFeatureExtractor based on config. + + Raises: + ValueError: On invalid feature extractor type. 
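+
+  Note: conv_hyperparams and override_base_feature_extractor_hyperparams are
+  built from the config and forwarded to the feature extractor class selected
+  from SSD_FEATURE_EXTRACTOR_CLASS_MAP.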
+ """ + feature_type = feature_extractor_config.type + depth_multiplier = feature_extractor_config.depth_multiplier + min_depth = feature_extractor_config.min_depth + pad_to_multiple = feature_extractor_config.pad_to_multiple + use_explicit_padding = feature_extractor_config.use_explicit_padding + use_depthwise = feature_extractor_config.use_depthwise + conv_hyperparams = hyperparams_builder.build( + feature_extractor_config.conv_hyperparams, is_training) + override_base_feature_extractor_hyperparams = ( + feature_extractor_config.override_base_feature_extractor_hyperparams) + + if feature_type not in SSD_FEATURE_EXTRACTOR_CLASS_MAP: + raise ValueError('Unknown ssd feature_extractor: {}'.format(feature_type)) + + feature_extractor_class = SSD_FEATURE_EXTRACTOR_CLASS_MAP[feature_type] + return feature_extractor_class( + is_training, depth_multiplier, min_depth, pad_to_multiple, + conv_hyperparams, reuse_weights, use_explicit_padding, use_depthwise, + override_base_feature_extractor_hyperparams) + + +def _build_ssd_model(ssd_config, is_training, add_summaries, + add_background_class=True): + """Builds an SSD detection model based on the model config. + + Args: + ssd_config: A ssd.proto object containing the config for the desired + SSDMetaArch. + is_training: True if this model is being built for training purposes. + add_summaries: Whether to add tf summaries in the model. + add_background_class: Whether to add an implicit background class to one-hot + encodings of groundtruth labels. Set to false if using groundtruth labels + with an explicit background class or using multiclass scores instead of + truth in the case of distillation. + Returns: + SSDMetaArch based on the config. + + Raises: + ValueError: If ssd_config.type is not recognized (i.e. not registered in + model_class_map). 
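+
+  Note: every sub-component (feature extractor, box coder, matcher, similarity
+  calculator, anchor generator, box predictor, image resizer, post-processing
+  and losses) is built from the corresponding field of `ssd_config` by its own
+  builder before the SSDMetaArch is constructed.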
+ """ + num_classes = ssd_config.num_classes + + # Feature extractor + feature_extractor = _build_ssd_feature_extractor( + feature_extractor_config=ssd_config.feature_extractor, + is_training=is_training) + + box_coder = box_coder_builder.build(ssd_config.box_coder) + matcher = matcher_builder.build(ssd_config.matcher) + region_similarity_calculator = sim_calc.build( + ssd_config.similarity_calculator) + encode_background_as_zeros = ssd_config.encode_background_as_zeros + negative_class_weight = ssd_config.negative_class_weight + ssd_box_predictor = box_predictor_builder.build(hyperparams_builder.build, + ssd_config.box_predictor, + is_training, num_classes) + anchor_generator = anchor_generator_builder.build( + ssd_config.anchor_generator) + image_resizer_fn = image_resizer_builder.build(ssd_config.image_resizer) + non_max_suppression_fn, score_conversion_fn = post_processing_builder.build( + ssd_config.post_processing) + (classification_loss, localization_loss, classification_weight, + localization_weight, hard_example_miner, + random_example_sampler) = losses_builder.build(ssd_config.loss) + normalize_loss_by_num_matches = ssd_config.normalize_loss_by_num_matches + normalize_loc_loss_by_codesize = ssd_config.normalize_loc_loss_by_codesize + + return ssd_meta_arch.SSDMetaArch( + is_training, + anchor_generator, + ssd_box_predictor, + box_coder, + feature_extractor, + matcher, + region_similarity_calculator, + encode_background_as_zeros, + negative_class_weight, + image_resizer_fn, + non_max_suppression_fn, + score_conversion_fn, + classification_loss, + localization_loss, + classification_weight, + localization_weight, + normalize_loss_by_num_matches, + hard_example_miner, + add_summaries=add_summaries, + normalize_loc_loss_by_codesize=normalize_loc_loss_by_codesize, + freeze_batchnorm=ssd_config.freeze_batchnorm, + inplace_batchnorm_update=ssd_config.inplace_batchnorm_update, + add_background_class=add_background_class, + random_example_sampler=random_example_sampler) + + +def _build_faster_rcnn_feature_extractor( + feature_extractor_config, is_training, reuse_weights=None, + inplace_batchnorm_update=False): + """Builds a faster_rcnn_meta_arch.FasterRCNNFeatureExtractor based on config. + + Args: + feature_extractor_config: A FasterRcnnFeatureExtractor proto config from + faster_rcnn.proto. + is_training: True if this feature extractor is being built for training. + reuse_weights: if the feature extractor should reuse weights. + inplace_batchnorm_update: Whether to update batch_norm inplace during + training. This is required for batch norm to work correctly on TPUs. When + this is false, user must add a control dependency on + tf.GraphKeys.UPDATE_OPS for train/loss op in order to update the batch + norm moving average parameters. + + Returns: + faster_rcnn_meta_arch.FasterRCNNFeatureExtractor based on config. + + Raises: + ValueError: On invalid feature extractor type. 
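+    ValueError: If inplace_batchnorm_update is True (inplace batch norm
+      updates are not supported by this builder).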
+ """ + if inplace_batchnorm_update: + raise ValueError('inplace batchnorm updates not supported.') + feature_type = feature_extractor_config.type + first_stage_features_stride = ( + feature_extractor_config.first_stage_features_stride) + batch_norm_trainable = feature_extractor_config.batch_norm_trainable + + if feature_type not in FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP: + raise ValueError('Unknown Faster R-CNN feature_extractor: {}'.format( + feature_type)) + feature_extractor_class = FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP[ + feature_type] + return feature_extractor_class( + is_training, first_stage_features_stride, + batch_norm_trainable, reuse_weights) + + +def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries): + """Builds a Faster R-CNN or R-FCN detection model based on the model config. + + Builds R-FCN model if the second_stage_box_predictor in the config is of type + `rfcn_box_predictor` else builds a Faster R-CNN model. + + Args: + frcnn_config: A faster_rcnn.proto object containing the config for the + desired FasterRCNNMetaArch or RFCNMetaArch. + is_training: True if this model is being built for training purposes. + add_summaries: Whether to add tf summaries in the model. + + Returns: + FasterRCNNMetaArch based on the config. + + Raises: + ValueError: If frcnn_config.type is not recognized (i.e. not registered in + model_class_map). + """ + num_classes = frcnn_config.num_classes + image_resizer_fn = image_resizer_builder.build(frcnn_config.image_resizer) + + feature_extractor = _build_faster_rcnn_feature_extractor( + frcnn_config.feature_extractor, is_training, + frcnn_config.inplace_batchnorm_update) + + number_of_stages = frcnn_config.number_of_stages + first_stage_anchor_generator = anchor_generator_builder.build( + frcnn_config.first_stage_anchor_generator) + + first_stage_atrous_rate = frcnn_config.first_stage_atrous_rate + first_stage_box_predictor_arg_scope_fn = hyperparams_builder.build( + frcnn_config.first_stage_box_predictor_conv_hyperparams, is_training) + first_stage_box_predictor_kernel_size = ( + frcnn_config.first_stage_box_predictor_kernel_size) + first_stage_box_predictor_depth = frcnn_config.first_stage_box_predictor_depth + first_stage_minibatch_size = frcnn_config.first_stage_minibatch_size + first_stage_positive_balance_fraction = ( + frcnn_config.first_stage_positive_balance_fraction) + first_stage_nms_score_threshold = frcnn_config.first_stage_nms_score_threshold + first_stage_nms_iou_threshold = frcnn_config.first_stage_nms_iou_threshold + first_stage_max_proposals = frcnn_config.first_stage_max_proposals + first_stage_loc_loss_weight = ( + frcnn_config.first_stage_localization_loss_weight) + first_stage_obj_loss_weight = frcnn_config.first_stage_objectness_loss_weight + + initial_crop_size = frcnn_config.initial_crop_size + maxpool_kernel_size = frcnn_config.maxpool_kernel_size + maxpool_stride = frcnn_config.maxpool_stride + + second_stage_box_predictor = box_predictor_builder.build( + hyperparams_builder.build, + frcnn_config.second_stage_box_predictor, + is_training=is_training, + num_classes=num_classes) + second_stage_batch_size = frcnn_config.second_stage_batch_size + second_stage_balance_fraction = frcnn_config.second_stage_balance_fraction + (second_stage_non_max_suppression_fn, second_stage_score_conversion_fn + ) = post_processing_builder.build(frcnn_config.second_stage_post_processing) + second_stage_localization_loss_weight = ( + frcnn_config.second_stage_localization_loss_weight) + second_stage_classification_loss = ( + 
losses_builder.build_faster_rcnn_classification_loss( + frcnn_config.second_stage_classification_loss)) + second_stage_classification_loss_weight = ( + frcnn_config.second_stage_classification_loss_weight) + second_stage_mask_prediction_loss_weight = ( + frcnn_config.second_stage_mask_prediction_loss_weight) + + hard_example_miner = None + if frcnn_config.HasField('hard_example_miner'): + hard_example_miner = losses_builder.build_hard_example_miner( + frcnn_config.hard_example_miner, + second_stage_classification_loss_weight, + second_stage_localization_loss_weight) + + common_kwargs = { + 'is_training': is_training, + 'num_classes': num_classes, + 'image_resizer_fn': image_resizer_fn, + 'feature_extractor': feature_extractor, + 'number_of_stages': number_of_stages, + 'first_stage_anchor_generator': first_stage_anchor_generator, + 'first_stage_atrous_rate': first_stage_atrous_rate, + 'first_stage_box_predictor_arg_scope_fn': + first_stage_box_predictor_arg_scope_fn, + 'first_stage_box_predictor_kernel_size': + first_stage_box_predictor_kernel_size, + 'first_stage_box_predictor_depth': first_stage_box_predictor_depth, + 'first_stage_minibatch_size': first_stage_minibatch_size, + 'first_stage_positive_balance_fraction': + first_stage_positive_balance_fraction, + 'first_stage_nms_score_threshold': first_stage_nms_score_threshold, + 'first_stage_nms_iou_threshold': first_stage_nms_iou_threshold, + 'first_stage_max_proposals': first_stage_max_proposals, + 'first_stage_localization_loss_weight': first_stage_loc_loss_weight, + 'first_stage_objectness_loss_weight': first_stage_obj_loss_weight, + 'second_stage_batch_size': second_stage_batch_size, + 'second_stage_balance_fraction': second_stage_balance_fraction, + 'second_stage_non_max_suppression_fn': + second_stage_non_max_suppression_fn, + 'second_stage_score_conversion_fn': second_stage_score_conversion_fn, + 'second_stage_localization_loss_weight': + second_stage_localization_loss_weight, + 'second_stage_classification_loss': + second_stage_classification_loss, + 'second_stage_classification_loss_weight': + second_stage_classification_loss_weight, + 'hard_example_miner': hard_example_miner, + 'add_summaries': add_summaries} + + if isinstance(second_stage_box_predictor, box_predictor.RfcnBoxPredictor): + return rfcn_meta_arch.RFCNMetaArch( + second_stage_rfcn_box_predictor=second_stage_box_predictor, + **common_kwargs) + else: + return faster_rcnn_meta_arch.FasterRCNNMetaArch( + initial_crop_size=initial_crop_size, + maxpool_kernel_size=maxpool_kernel_size, + maxpool_stride=maxpool_stride, + second_stage_mask_rcnn_box_predictor=second_stage_box_predictor, + second_stage_mask_prediction_loss_weight=( + second_stage_mask_prediction_loss_weight), + **common_kwargs) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/model_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/model_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..225e1d50b1c229b7d7b2017661df55973098eb99 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/model_builder_test.py @@ -0,0 +1,1056 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.models.model_builder.""" + +import tensorflow as tf + +from google.protobuf import text_format +from object_detection.builders import model_builder +from object_detection.meta_architectures import faster_rcnn_meta_arch +from object_detection.meta_architectures import rfcn_meta_arch +from object_detection.meta_architectures import ssd_meta_arch +from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extractor as frcnn_inc_res +from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2 +from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas +from object_detection.models import faster_rcnn_pnas_feature_extractor as frcnn_pnas +from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1 +from object_detection.models import ssd_resnet_v1_fpn_feature_extractor as ssd_resnet_v1_fpn +from object_detection.models.embedded_ssd_mobilenet_v1_feature_extractor import EmbeddedSSDMobileNetV1FeatureExtractor +from object_detection.models.ssd_inception_v2_feature_extractor import SSDInceptionV2FeatureExtractor +from object_detection.models.ssd_inception_v3_feature_extractor import SSDInceptionV3FeatureExtractor +from object_detection.models.ssd_mobilenet_v1_feature_extractor import SSDMobileNetV1FeatureExtractor +from object_detection.models.ssd_mobilenet_v2_feature_extractor import SSDMobileNetV2FeatureExtractor +from object_detection.protos import model_pb2 + +FRCNN_RESNET_FEAT_MAPS = { + 'faster_rcnn_resnet50': + frcnn_resnet_v1.FasterRCNNResnet50FeatureExtractor, + 'faster_rcnn_resnet101': + frcnn_resnet_v1.FasterRCNNResnet101FeatureExtractor, + 'faster_rcnn_resnet152': + frcnn_resnet_v1.FasterRCNNResnet152FeatureExtractor +} + +SSD_RESNET_V1_FPN_FEAT_MAPS = { + 'ssd_resnet50_v1_fpn': + ssd_resnet_v1_fpn.SSDResnet50V1FpnFeatureExtractor, + 'ssd_resnet101_v1_fpn': + ssd_resnet_v1_fpn.SSDResnet101V1FpnFeatureExtractor, + 'ssd_resnet152_v1_fpn': + ssd_resnet_v1_fpn.SSDResnet152V1FpnFeatureExtractor +} + + +class ModelBuilderTest(tf.test.TestCase): + + def create_model(self, model_config): + """Builds a DetectionModel based on the model config. + + Args: + model_config: A model.proto object containing the config for the desired + DetectionModel. + + Returns: + DetectionModel based on the config. 
+ """ + return model_builder.build(model_config, is_training=True) + + def test_create_ssd_inception_v2_model_from_config(self): + model_text_proto = """ + ssd { + feature_extractor { + type: 'ssd_inception_v2' + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + override_base_feature_extractor_hyperparams: true + } + box_coder { + faster_rcnn_box_coder { + } + } + matcher { + argmax_matcher { + } + } + similarity_calculator { + iou_similarity { + } + } + anchor_generator { + ssd_anchor_generator { + aspect_ratios: 1.0 + } + } + image_resizer { + fixed_shape_resizer { + height: 320 + width: 320 + } + } + box_predictor { + convolutional_box_predictor { + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + } + loss { + classification_loss { + weighted_softmax { + } + } + localization_loss { + weighted_smooth_l1 { + } + } + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + model = self.create_model(model_proto) + self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch) + self.assertIsInstance(model._feature_extractor, + SSDInceptionV2FeatureExtractor) + + def test_create_ssd_inception_v3_model_from_config(self): + model_text_proto = """ + ssd { + feature_extractor { + type: 'ssd_inception_v3' + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + override_base_feature_extractor_hyperparams: true + } + box_coder { + faster_rcnn_box_coder { + } + } + matcher { + argmax_matcher { + } + } + similarity_calculator { + iou_similarity { + } + } + anchor_generator { + ssd_anchor_generator { + aspect_ratios: 1.0 + } + } + image_resizer { + fixed_shape_resizer { + height: 320 + width: 320 + } + } + box_predictor { + convolutional_box_predictor { + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + } + loss { + classification_loss { + weighted_softmax { + } + } + localization_loss { + weighted_smooth_l1 { + } + } + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + model = self.create_model(model_proto) + self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch) + self.assertIsInstance(model._feature_extractor, + SSDInceptionV3FeatureExtractor) + + def test_create_ssd_resnet_v1_fpn_model_from_config(self): + model_text_proto = """ + ssd { + feature_extractor { + type: 'ssd_resnet50_v1_fpn' + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + box_coder { + faster_rcnn_box_coder { + } + } + matcher { + argmax_matcher { + } + } + similarity_calculator { + iou_similarity { + } + } + encode_background_as_zeros: true + anchor_generator { + multiscale_anchor_generator { + aspect_ratios: [1.0, 2.0, 0.5] + scales_per_octave: 2 + } + } + image_resizer { + fixed_shape_resizer { + height: 320 + width: 320 + } + } + box_predictor { + weight_shared_convolutional_box_predictor { + depth: 32 + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + random_normal_initializer { + } + } + } + num_layers_before_predictor: 1 + } + } + normalize_loss_by_num_matches: true + normalize_loc_loss_by_codesize: true + loss { + classification_loss { + weighted_sigmoid_focal { + alpha: 0.25 + gamma: 2.0 + } + } + localization_loss { + 
weighted_smooth_l1 { + delta: 0.1 + } + } + classification_weight: 1.0 + localization_weight: 1.0 + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + + for extractor_type, extractor_class in SSD_RESNET_V1_FPN_FEAT_MAPS.items(): + model_proto.ssd.feature_extractor.type = extractor_type + model = model_builder.build(model_proto, is_training=True) + self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch) + self.assertIsInstance(model._feature_extractor, extractor_class) + + def test_create_ssd_mobilenet_v1_model_from_config(self): + model_text_proto = """ + ssd { + freeze_batchnorm: true + inplace_batchnorm_update: true + feature_extractor { + type: 'ssd_mobilenet_v1' + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + box_coder { + faster_rcnn_box_coder { + } + } + matcher { + argmax_matcher { + } + } + similarity_calculator { + iou_similarity { + } + } + anchor_generator { + ssd_anchor_generator { + aspect_ratios: 1.0 + } + } + image_resizer { + fixed_shape_resizer { + height: 320 + width: 320 + } + } + box_predictor { + convolutional_box_predictor { + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + } + normalize_loc_loss_by_codesize: true + loss { + classification_loss { + weighted_softmax { + } + } + localization_loss { + weighted_smooth_l1 { + } + } + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + model = self.create_model(model_proto) + self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch) + self.assertIsInstance(model._feature_extractor, + SSDMobileNetV1FeatureExtractor) + self.assertTrue(model._normalize_loc_loss_by_codesize) + self.assertTrue(model._freeze_batchnorm) + self.assertTrue(model._inplace_batchnorm_update) + + def test_create_ssd_mobilenet_v2_model_from_config(self): + model_text_proto = """ + ssd { + feature_extractor { + type: 'ssd_mobilenet_v2' + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + box_coder { + faster_rcnn_box_coder { + } + } + matcher { + argmax_matcher { + } + } + similarity_calculator { + iou_similarity { + } + } + anchor_generator { + ssd_anchor_generator { + aspect_ratios: 1.0 + } + } + image_resizer { + fixed_shape_resizer { + height: 320 + width: 320 + } + } + box_predictor { + convolutional_box_predictor { + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + } + normalize_loc_loss_by_codesize: true + loss { + classification_loss { + weighted_softmax { + } + } + localization_loss { + weighted_smooth_l1 { + } + } + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + model = self.create_model(model_proto) + self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch) + self.assertIsInstance(model._feature_extractor, + SSDMobileNetV2FeatureExtractor) + self.assertTrue(model._normalize_loc_loss_by_codesize) + + def test_create_embedded_ssd_mobilenet_v1_model_from_config(self): + model_text_proto = """ + ssd { + feature_extractor { + type: 'embedded_ssd_mobilenet_v1' + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + box_coder { + faster_rcnn_box_coder { + } + } + matcher { + argmax_matcher { + } + } + 
similarity_calculator { + iou_similarity { + } + } + anchor_generator { + ssd_anchor_generator { + aspect_ratios: 1.0 + } + } + image_resizer { + fixed_shape_resizer { + height: 256 + width: 256 + } + } + box_predictor { + convolutional_box_predictor { + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + } + loss { + classification_loss { + weighted_softmax { + } + } + localization_loss { + weighted_smooth_l1 { + } + } + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + model = self.create_model(model_proto) + self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch) + self.assertIsInstance(model._feature_extractor, + EmbeddedSSDMobileNetV1FeatureExtractor) + + def test_create_faster_rcnn_resnet_v1_models_from_config(self): + model_text_proto = """ + faster_rcnn { + inplace_batchnorm_update: true + num_classes: 3 + image_resizer { + keep_aspect_ratio_resizer { + min_dimension: 600 + max_dimension: 1024 + } + } + feature_extractor { + type: 'faster_rcnn_resnet101' + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + initial_crop_size: 14 + maxpool_kernel_size: 2 + maxpool_stride: 2 + second_stage_box_predictor { + mask_rcnn_box_predictor { + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.01 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 300 + } + score_converter: SOFTMAX + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + for extractor_type, extractor_class in FRCNN_RESNET_FEAT_MAPS.items(): + model_proto.faster_rcnn.feature_extractor.type = extractor_type + model = model_builder.build(model_proto, is_training=True) + self.assertIsInstance(model, faster_rcnn_meta_arch.FasterRCNNMetaArch) + self.assertIsInstance(model._feature_extractor, extractor_class) + + def test_create_faster_rcnn_resnet101_with_mask_prediction_enabled(self): + model_text_proto = """ + faster_rcnn { + num_classes: 3 + image_resizer { + keep_aspect_ratio_resizer { + min_dimension: 600 + max_dimension: 1024 + } + } + feature_extractor { + type: 'faster_rcnn_resnet101' + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + initial_crop_size: 14 + maxpool_kernel_size: 2 + maxpool_stride: 2 + second_stage_box_predictor { + mask_rcnn_box_predictor { + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + predict_instance_masks: true + } + } + second_stage_mask_prediction_loss_weight: 3.0 + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.01 + iou_threshold: 0.6 + 
max_detections_per_class: 100 + max_total_detections: 300 + } + score_converter: SOFTMAX + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + model = model_builder.build(model_proto, is_training=True) + self.assertAlmostEqual(model._second_stage_mask_loss_weight, 3.0) + + def test_create_faster_rcnn_nas_model_from_config(self): + model_text_proto = """ + faster_rcnn { + num_classes: 3 + image_resizer { + keep_aspect_ratio_resizer { + min_dimension: 600 + max_dimension: 1024 + } + } + feature_extractor { + type: 'faster_rcnn_nas' + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + initial_crop_size: 17 + maxpool_kernel_size: 1 + maxpool_stride: 1 + second_stage_box_predictor { + mask_rcnn_box_predictor { + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.01 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 300 + } + score_converter: SOFTMAX + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + model = model_builder.build(model_proto, is_training=True) + self.assertIsInstance(model, faster_rcnn_meta_arch.FasterRCNNMetaArch) + self.assertIsInstance( + model._feature_extractor, + frcnn_nas.FasterRCNNNASFeatureExtractor) + + def test_create_faster_rcnn_pnas_model_from_config(self): + model_text_proto = """ + faster_rcnn { + num_classes: 3 + image_resizer { + keep_aspect_ratio_resizer { + min_dimension: 600 + max_dimension: 1024 + } + } + feature_extractor { + type: 'faster_rcnn_pnas' + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + initial_crop_size: 17 + maxpool_kernel_size: 1 + maxpool_stride: 1 + second_stage_box_predictor { + mask_rcnn_box_predictor { + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.01 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 300 + } + score_converter: SOFTMAX + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + model = model_builder.build(model_proto, is_training=True) + self.assertIsInstance(model, faster_rcnn_meta_arch.FasterRCNNMetaArch) + self.assertIsInstance( + model._feature_extractor, + frcnn_pnas.FasterRCNNPNASFeatureExtractor) + + def test_create_faster_rcnn_inception_resnet_v2_model_from_config(self): + model_text_proto = """ + faster_rcnn { + num_classes: 3 + image_resizer { + keep_aspect_ratio_resizer { + min_dimension: 600 + max_dimension: 1024 + } + } + feature_extractor { + type: 'faster_rcnn_inception_resnet_v2' + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + 
height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + initial_crop_size: 17 + maxpool_kernel_size: 1 + maxpool_stride: 1 + second_stage_box_predictor { + mask_rcnn_box_predictor { + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.01 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 300 + } + score_converter: SOFTMAX + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + model = model_builder.build(model_proto, is_training=True) + self.assertIsInstance(model, faster_rcnn_meta_arch.FasterRCNNMetaArch) + self.assertIsInstance( + model._feature_extractor, + frcnn_inc_res.FasterRCNNInceptionResnetV2FeatureExtractor) + + def test_create_faster_rcnn_inception_v2_model_from_config(self): + model_text_proto = """ + faster_rcnn { + num_classes: 3 + image_resizer { + keep_aspect_ratio_resizer { + min_dimension: 600 + max_dimension: 1024 + } + } + feature_extractor { + type: 'faster_rcnn_inception_v2' + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + initial_crop_size: 14 + maxpool_kernel_size: 2 + maxpool_stride: 2 + second_stage_box_predictor { + mask_rcnn_box_predictor { + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.01 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 300 + } + score_converter: SOFTMAX + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + model = model_builder.build(model_proto, is_training=True) + self.assertIsInstance(model, faster_rcnn_meta_arch.FasterRCNNMetaArch) + self.assertIsInstance(model._feature_extractor, + frcnn_inc_v2.FasterRCNNInceptionV2FeatureExtractor) + + def test_create_faster_rcnn_model_from_config_with_example_miner(self): + model_text_proto = """ + faster_rcnn { + num_classes: 3 + feature_extractor { + type: 'faster_rcnn_inception_resnet_v2' + } + image_resizer { + keep_aspect_ratio_resizer { + min_dimension: 600 + max_dimension: 1024 + } + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + second_stage_box_predictor { + mask_rcnn_box_predictor { + fc_hyperparams { + op: FC + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + } + hard_example_miner { + num_hard_examples: 10 + iou_threshold: 0.99 + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + model = model_builder.build(model_proto, is_training=True) + self.assertIsNotNone(model._hard_example_miner) + + def 
test_create_rfcn_resnet_v1_model_from_config(self): + model_text_proto = """ + faster_rcnn { + num_classes: 3 + image_resizer { + keep_aspect_ratio_resizer { + min_dimension: 600 + max_dimension: 1024 + } + } + feature_extractor { + type: 'faster_rcnn_resnet101' + } + first_stage_anchor_generator { + grid_anchor_generator { + scales: [0.25, 0.5, 1.0, 2.0] + aspect_ratios: [0.5, 1.0, 2.0] + height_stride: 16 + width_stride: 16 + } + } + first_stage_box_predictor_conv_hyperparams { + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + initial_crop_size: 14 + maxpool_kernel_size: 2 + maxpool_stride: 2 + second_stage_box_predictor { + rfcn_box_predictor { + conv_hyperparams { + op: CONV + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + } + } + } + second_stage_post_processing { + batch_non_max_suppression { + score_threshold: 0.01 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 300 + } + score_converter: SOFTMAX + } + }""" + model_proto = model_pb2.DetectionModel() + text_format.Merge(model_text_proto, model_proto) + for extractor_type, extractor_class in FRCNN_RESNET_FEAT_MAPS.items(): + model_proto.faster_rcnn.feature_extractor.type = extractor_type + model = model_builder.build(model_proto, is_training=True) + self.assertIsInstance(model, rfcn_meta_arch.RFCNMetaArch) + self.assertIsInstance(model._feature_extractor, extractor_class) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/optimizer_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/optimizer_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..e3a437f0d9a9442dfb1fff3013b250e4e854a2c2 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/optimizer_builder.py @@ -0,0 +1,124 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Functions to build DetectionModel training optimizers.""" + +import tensorflow as tf +from object_detection.utils import learning_schedules + + +def build(optimizer_config): + """Create optimizer based on config. + + Args: + optimizer_config: A Optimizer proto message. + + Returns: + An optimizer and a list of variables for summary. + + Raises: + ValueError: when using an unsupported input data type. 
+ """ + optimizer_type = optimizer_config.WhichOneof('optimizer') + optimizer = None + + summary_vars = [] + if optimizer_type == 'rms_prop_optimizer': + config = optimizer_config.rms_prop_optimizer + learning_rate = _create_learning_rate(config.learning_rate) + summary_vars.append(learning_rate) + optimizer = tf.train.RMSPropOptimizer( + learning_rate, + decay=config.decay, + momentum=config.momentum_optimizer_value, + epsilon=config.epsilon) + + if optimizer_type == 'momentum_optimizer': + config = optimizer_config.momentum_optimizer + learning_rate = _create_learning_rate(config.learning_rate) + summary_vars.append(learning_rate) + optimizer = tf.train.MomentumOptimizer( + learning_rate, + momentum=config.momentum_optimizer_value) + + if optimizer_type == 'adam_optimizer': + config = optimizer_config.adam_optimizer + learning_rate = _create_learning_rate(config.learning_rate) + summary_vars.append(learning_rate) + optimizer = tf.train.AdamOptimizer(learning_rate) + + if optimizer is None: + raise ValueError('Optimizer %s not supported.' % optimizer_type) + + if optimizer_config.use_moving_average: + optimizer = tf.contrib.opt.MovingAverageOptimizer( + optimizer, average_decay=optimizer_config.moving_average_decay) + + return optimizer, summary_vars + + +def _create_learning_rate(learning_rate_config): + """Create optimizer learning rate based on config. + + Args: + learning_rate_config: A LearningRate proto message. + + Returns: + A learning rate. + + Raises: + ValueError: when using an unsupported input data type. + """ + learning_rate = None + learning_rate_type = learning_rate_config.WhichOneof('learning_rate') + if learning_rate_type == 'constant_learning_rate': + config = learning_rate_config.constant_learning_rate + learning_rate = tf.constant(config.learning_rate, dtype=tf.float32, + name='learning_rate') + + if learning_rate_type == 'exponential_decay_learning_rate': + config = learning_rate_config.exponential_decay_learning_rate + learning_rate = tf.train.exponential_decay( + config.initial_learning_rate, + tf.train.get_or_create_global_step(), + config.decay_steps, + config.decay_factor, + staircase=config.staircase, name='learning_rate') + + if learning_rate_type == 'manual_step_learning_rate': + config = learning_rate_config.manual_step_learning_rate + if not config.schedule: + raise ValueError('Empty learning rate schedule.') + learning_rate_step_boundaries = [x.step for x in config.schedule] + learning_rate_sequence = [config.initial_learning_rate] + learning_rate_sequence += [x.learning_rate for x in config.schedule] + learning_rate = learning_schedules.manual_stepping( + tf.train.get_or_create_global_step(), learning_rate_step_boundaries, + learning_rate_sequence, config.warmup) + + if learning_rate_type == 'cosine_decay_learning_rate': + config = learning_rate_config.cosine_decay_learning_rate + learning_rate = learning_schedules.cosine_decay_with_warmup( + tf.train.get_or_create_global_step(), + config.learning_rate_base, + config.total_steps, + config.warmup_learning_rate, + config.warmup_steps, + config.hold_base_rate_steps) + + if learning_rate is None: + raise ValueError('Learning_rate %s not supported.' 
% learning_rate_type) + + return learning_rate diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/optimizer_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/optimizer_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..343a858fb90b223d7f82b1d11466a6478d73f3e5 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/optimizer_builder_test.py @@ -0,0 +1,208 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for optimizer_builder.""" + +import tensorflow as tf + +from google.protobuf import text_format + +from object_detection.builders import optimizer_builder +from object_detection.protos import optimizer_pb2 + + +class LearningRateBuilderTest(tf.test.TestCase): + + def testBuildConstantLearningRate(self): + learning_rate_text_proto = """ + constant_learning_rate { + learning_rate: 0.004 + } + """ + learning_rate_proto = optimizer_pb2.LearningRate() + text_format.Merge(learning_rate_text_proto, learning_rate_proto) + learning_rate = optimizer_builder._create_learning_rate( + learning_rate_proto) + self.assertTrue(learning_rate.op.name.endswith('learning_rate')) + with self.test_session(): + learning_rate_out = learning_rate.eval() + self.assertAlmostEqual(learning_rate_out, 0.004) + + def testBuildExponentialDecayLearningRate(self): + learning_rate_text_proto = """ + exponential_decay_learning_rate { + initial_learning_rate: 0.004 + decay_steps: 99999 + decay_factor: 0.85 + staircase: false + } + """ + learning_rate_proto = optimizer_pb2.LearningRate() + text_format.Merge(learning_rate_text_proto, learning_rate_proto) + learning_rate = optimizer_builder._create_learning_rate( + learning_rate_proto) + self.assertTrue(learning_rate.op.name.endswith('learning_rate')) + self.assertTrue(isinstance(learning_rate, tf.Tensor)) + + def testBuildManualStepLearningRate(self): + learning_rate_text_proto = """ + manual_step_learning_rate { + initial_learning_rate: 0.002 + schedule { + step: 100 + learning_rate: 0.006 + } + schedule { + step: 90000 + learning_rate: 0.00006 + } + warmup: true + } + """ + learning_rate_proto = optimizer_pb2.LearningRate() + text_format.Merge(learning_rate_text_proto, learning_rate_proto) + learning_rate = optimizer_builder._create_learning_rate( + learning_rate_proto) + self.assertTrue(isinstance(learning_rate, tf.Tensor)) + + def testBuildCosineDecayLearningRate(self): + learning_rate_text_proto = """ + cosine_decay_learning_rate { + learning_rate_base: 0.002 + total_steps: 20000 + warmup_learning_rate: 0.0001 + warmup_steps: 1000 + hold_base_rate_steps: 20000 + } + """ + learning_rate_proto = optimizer_pb2.LearningRate() + text_format.Merge(learning_rate_text_proto, learning_rate_proto) + learning_rate = optimizer_builder._create_learning_rate( + learning_rate_proto) + 
self.assertTrue(isinstance(learning_rate, tf.Tensor)) + + def testRaiseErrorOnEmptyLearningRate(self): + learning_rate_text_proto = """ + """ + learning_rate_proto = optimizer_pb2.LearningRate() + text_format.Merge(learning_rate_text_proto, learning_rate_proto) + with self.assertRaises(ValueError): + optimizer_builder._create_learning_rate(learning_rate_proto) + + +class OptimizerBuilderTest(tf.test.TestCase): + + def testBuildRMSPropOptimizer(self): + optimizer_text_proto = """ + rms_prop_optimizer: { + learning_rate: { + exponential_decay_learning_rate { + initial_learning_rate: 0.004 + decay_steps: 800720 + decay_factor: 0.95 + } + } + momentum_optimizer_value: 0.9 + decay: 0.9 + epsilon: 1.0 + } + use_moving_average: false + """ + optimizer_proto = optimizer_pb2.Optimizer() + text_format.Merge(optimizer_text_proto, optimizer_proto) + optimizer, _ = optimizer_builder.build(optimizer_proto) + self.assertTrue(isinstance(optimizer, tf.train.RMSPropOptimizer)) + + def testBuildMomentumOptimizer(self): + optimizer_text_proto = """ + momentum_optimizer: { + learning_rate: { + constant_learning_rate { + learning_rate: 0.001 + } + } + momentum_optimizer_value: 0.99 + } + use_moving_average: false + """ + optimizer_proto = optimizer_pb2.Optimizer() + text_format.Merge(optimizer_text_proto, optimizer_proto) + optimizer, _ = optimizer_builder.build(optimizer_proto) + self.assertTrue(isinstance(optimizer, tf.train.MomentumOptimizer)) + + def testBuildAdamOptimizer(self): + optimizer_text_proto = """ + adam_optimizer: { + learning_rate: { + constant_learning_rate { + learning_rate: 0.002 + } + } + } + use_moving_average: false + """ + optimizer_proto = optimizer_pb2.Optimizer() + text_format.Merge(optimizer_text_proto, optimizer_proto) + optimizer, _ = optimizer_builder.build(optimizer_proto) + self.assertTrue(isinstance(optimizer, tf.train.AdamOptimizer)) + + def testBuildMovingAverageOptimizer(self): + optimizer_text_proto = """ + adam_optimizer: { + learning_rate: { + constant_learning_rate { + learning_rate: 0.002 + } + } + } + use_moving_average: True + """ + optimizer_proto = optimizer_pb2.Optimizer() + text_format.Merge(optimizer_text_proto, optimizer_proto) + optimizer, _ = optimizer_builder.build(optimizer_proto) + self.assertTrue( + isinstance(optimizer, tf.contrib.opt.MovingAverageOptimizer)) + + def testBuildMovingAverageOptimizerWithNonDefaultDecay(self): + optimizer_text_proto = """ + adam_optimizer: { + learning_rate: { + constant_learning_rate { + learning_rate: 0.002 + } + } + } + use_moving_average: True + moving_average_decay: 0.2 + """ + optimizer_proto = optimizer_pb2.Optimizer() + text_format.Merge(optimizer_text_proto, optimizer_proto) + optimizer, _ = optimizer_builder.build(optimizer_proto) + self.assertTrue( + isinstance(optimizer, tf.contrib.opt.MovingAverageOptimizer)) + # TODO(rathodv): Find a way to not depend on the private members. 
+ self.assertAlmostEqual(optimizer._ema._decay, 0.2) + + def testBuildEmptyOptimizer(self): + optimizer_text_proto = """ + """ + optimizer_proto = optimizer_pb2.Optimizer() + text_format.Merge(optimizer_text_proto, optimizer_proto) + with self.assertRaises(ValueError): + optimizer_builder.build(optimizer_proto) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/post_processing_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/post_processing_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..fa3a772896dd1a1b8146677dc862549970a6fecd --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/post_processing_builder.py @@ -0,0 +1,123 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Builder function for post processing operations.""" +import functools + +import tensorflow as tf +from object_detection.core import post_processing +from object_detection.protos import post_processing_pb2 + + +def build(post_processing_config): + """Builds callables for post-processing operations. + + Builds callables for non-max suppression and score conversion based on the + configuration. + + Non-max suppression callable takes `boxes`, `scores`, and optionally + `clip_window`, `parallel_iterations` `masks, and `scope` as inputs. It returns + `nms_boxes`, `nms_scores`, `nms_classes` `nms_masks` and `num_detections`. See + post_processing.batch_multiclass_non_max_suppression for the type and shape + of these tensors. + + Score converter callable should be called with `input` tensor. The callable + returns the output from one of 3 tf operations based on the configuration - + tf.identity, tf.sigmoid or tf.nn.softmax. See tensorflow documentation for + argument and return value descriptions. + + Args: + post_processing_config: post_processing.proto object containing the + parameters for the post-processing operations. + + Returns: + non_max_suppressor_fn: Callable for non-max suppression. + score_converter_fn: Callable for score conversion. + + Raises: + ValueError: if the post_processing_config is of incorrect type. + """ + if not isinstance(post_processing_config, post_processing_pb2.PostProcessing): + raise ValueError('post_processing_config not of type ' + 'post_processing_pb2.Postprocessing.') + non_max_suppressor_fn = _build_non_max_suppressor( + post_processing_config.batch_non_max_suppression) + score_converter_fn = _build_score_converter( + post_processing_config.score_converter, + post_processing_config.logit_scale) + return non_max_suppressor_fn, score_converter_fn + + +def _build_non_max_suppressor(nms_config): + """Builds non-max suppresson based on the nms config. + + Args: + nms_config: post_processing_pb2.PostProcessing.BatchNonMaxSuppression proto. 
+ + Returns: + non_max_suppressor_fn: Callable non-max suppressor. + + Raises: + ValueError: On incorrect iou_threshold or on incompatible values of + max_total_detections and max_detections_per_class. + """ + if nms_config.iou_threshold < 0 or nms_config.iou_threshold > 1.0: + raise ValueError('iou_threshold not in [0, 1.0].') + if nms_config.max_detections_per_class > nms_config.max_total_detections: + raise ValueError('max_detections_per_class should be no greater than ' + 'max_total_detections.') + + non_max_suppressor_fn = functools.partial( + post_processing.batch_multiclass_non_max_suppression, + score_thresh=nms_config.score_threshold, + iou_thresh=nms_config.iou_threshold, + max_size_per_class=nms_config.max_detections_per_class, + max_total_size=nms_config.max_total_detections) + return non_max_suppressor_fn + + +def _score_converter_fn_with_logit_scale(tf_score_converter_fn, logit_scale): + """Create a function to scale logits then apply a Tensorflow function.""" + def score_converter_fn(logits): + scaled_logits = tf.divide(logits, logit_scale, name='scale_logits') + return tf_score_converter_fn(scaled_logits, name='convert_scores') + score_converter_fn.__name__ = '%s_with_logit_scale' % ( + tf_score_converter_fn.__name__) + return score_converter_fn + + +def _build_score_converter(score_converter_config, logit_scale): + """Builds score converter based on the config. + + Builds one of [tf.identity, tf.sigmoid, tf.softmax] score converters based on + the config. + + Args: + score_converter_config: post_processing_pb2.PostProcessing.score_converter. + logit_scale: temperature to use for SOFTMAX score_converter. + + Returns: + Callable score converter op. + + Raises: + ValueError: On unknown score converter. + """ + if score_converter_config == post_processing_pb2.PostProcessing.IDENTITY: + return _score_converter_fn_with_logit_scale(tf.identity, logit_scale) + if score_converter_config == post_processing_pb2.PostProcessing.SIGMOID: + return _score_converter_fn_with_logit_scale(tf.sigmoid, logit_scale) + if score_converter_config == post_processing_pb2.PostProcessing.SOFTMAX: + return _score_converter_fn_with_logit_scale(tf.nn.softmax, logit_scale) + raise ValueError('Unknown score converter.') diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/post_processing_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/post_processing_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c39fbfb417db148d756c3e8a2b51948ed13d07b3 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/post_processing_builder_test.py @@ -0,0 +1,107 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for post_processing_builder.""" + +import tensorflow as tf +from google.protobuf import text_format +from object_detection.builders import post_processing_builder +from object_detection.protos import post_processing_pb2 + + +class PostProcessingBuilderTest(tf.test.TestCase): + + def test_build_non_max_suppressor_with_correct_parameters(self): + post_processing_text_proto = """ + batch_non_max_suppression { + score_threshold: 0.7 + iou_threshold: 0.6 + max_detections_per_class: 100 + max_total_detections: 300 + } + """ + post_processing_config = post_processing_pb2.PostProcessing() + text_format.Merge(post_processing_text_proto, post_processing_config) + non_max_suppressor, _ = post_processing_builder.build( + post_processing_config) + self.assertEqual(non_max_suppressor.keywords['max_size_per_class'], 100) + self.assertEqual(non_max_suppressor.keywords['max_total_size'], 300) + self.assertAlmostEqual(non_max_suppressor.keywords['score_thresh'], 0.7) + self.assertAlmostEqual(non_max_suppressor.keywords['iou_thresh'], 0.6) + + def test_build_identity_score_converter(self): + post_processing_text_proto = """ + score_converter: IDENTITY + """ + post_processing_config = post_processing_pb2.PostProcessing() + text_format.Merge(post_processing_text_proto, post_processing_config) + _, score_converter = post_processing_builder.build(post_processing_config) + self.assertEqual(score_converter.__name__, 'identity_with_logit_scale') + + inputs = tf.constant([1, 1], tf.float32) + outputs = score_converter(inputs) + with self.test_session() as sess: + converted_scores = sess.run(outputs) + expected_converted_scores = sess.run(inputs) + self.assertAllClose(converted_scores, expected_converted_scores) + + def test_build_identity_score_converter_with_logit_scale(self): + post_processing_text_proto = """ + score_converter: IDENTITY + logit_scale: 2.0 + """ + post_processing_config = post_processing_pb2.PostProcessing() + text_format.Merge(post_processing_text_proto, post_processing_config) + _, score_converter = post_processing_builder.build(post_processing_config) + self.assertEqual(score_converter.__name__, 'identity_with_logit_scale') + + inputs = tf.constant([1, 1], tf.float32) + outputs = score_converter(inputs) + with self.test_session() as sess: + converted_scores = sess.run(outputs) + expected_converted_scores = sess.run(tf.constant([.5, .5], tf.float32)) + self.assertAllClose(converted_scores, expected_converted_scores) + + def test_build_sigmoid_score_converter(self): + post_processing_text_proto = """ + score_converter: SIGMOID + """ + post_processing_config = post_processing_pb2.PostProcessing() + text_format.Merge(post_processing_text_proto, post_processing_config) + _, score_converter = post_processing_builder.build(post_processing_config) + self.assertEqual(score_converter.__name__, 'sigmoid_with_logit_scale') + + def test_build_softmax_score_converter(self): + post_processing_text_proto = """ + score_converter: SOFTMAX + """ + post_processing_config = post_processing_pb2.PostProcessing() + text_format.Merge(post_processing_text_proto, post_processing_config) + _, score_converter = post_processing_builder.build(post_processing_config) + self.assertEqual(score_converter.__name__, 'softmax_with_logit_scale') + + def test_build_softmax_score_converter_with_temperature(self): + post_processing_text_proto = """ + score_converter: SOFTMAX + logit_scale: 2.0 + """ + post_processing_config = 
post_processing_pb2.PostProcessing() + text_format.Merge(post_processing_text_proto, post_processing_config) + _, score_converter = post_processing_builder.build(post_processing_config) + self.assertEqual(score_converter.__name__, 'softmax_with_logit_scale') + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/preprocessor_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/preprocessor_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..10b92532fc3ef5a533b7f317082436b0052eb166 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/preprocessor_builder.py @@ -0,0 +1,322 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Builder for preprocessing steps.""" + +import tensorflow as tf + +from object_detection.core import preprocessor +from object_detection.protos import preprocessor_pb2 + + +def _get_step_config_from_proto(preprocessor_step_config, step_name): + """Returns the value of a field named step_name from proto. + + Args: + preprocessor_step_config: A preprocessor_pb2.PreprocessingStep object. + step_name: Name of the field to get value from. + + Returns: + result_dict: a sub proto message from preprocessor_step_config which will be + later converted to a dictionary. + + Raises: + ValueError: If field does not exist in proto. + """ + for field, value in preprocessor_step_config.ListFields(): + if field.name == step_name: + return value + + raise ValueError('Could not get field %s from proto!', step_name) + + +def _get_dict_from_proto(config): + """Helper function to put all proto fields into a dictionary. + + For many preprocessing steps, there's an trivial 1-1 mapping from proto fields + to function arguments. This function automatically populates a dictionary with + the arguments from the proto. + + Protos that CANNOT be trivially populated include: + * nested messages. + * steps that check if an optional field is set (ie. where None != 0). + * protos that don't map 1-1 to arguments (ie. list should be reshaped). + * fields requiring additional validation (ie. repeated field has n elements). + + Args: + config: A protobuf object that does not violate the conditions above. + + Returns: + result_dict: |config| converted into a python dictionary. + """ + result_dict = {} + for field, value in config.ListFields(): + result_dict[field.name] = value + return result_dict + + +# A map from a PreprocessingStep proto config field name to the preprocessing +# function that should be used. The PreprocessingStep proto should be parsable +# with _get_dict_from_proto. 
+PREPROCESSING_FUNCTION_MAP = { + 'normalize_image': preprocessor.normalize_image, + 'random_pixel_value_scale': preprocessor.random_pixel_value_scale, + 'random_image_scale': preprocessor.random_image_scale, + 'random_rgb_to_gray': preprocessor.random_rgb_to_gray, + 'random_adjust_brightness': preprocessor.random_adjust_brightness, + 'random_adjust_contrast': preprocessor.random_adjust_contrast, + 'random_adjust_hue': preprocessor.random_adjust_hue, + 'random_adjust_saturation': preprocessor.random_adjust_saturation, + 'random_distort_color': preprocessor.random_distort_color, + 'random_jitter_boxes': preprocessor.random_jitter_boxes, + 'random_crop_to_aspect_ratio': preprocessor.random_crop_to_aspect_ratio, + 'random_black_patches': preprocessor.random_black_patches, + 'rgb_to_gray': preprocessor.rgb_to_gray, + 'scale_boxes_to_pixel_coordinates': ( + preprocessor.scale_boxes_to_pixel_coordinates), + 'subtract_channel_mean': preprocessor.subtract_channel_mean, +} + + +# A map to convert from preprocessor_pb2.ResizeImage.Method enum to +# tf.image.ResizeMethod. +RESIZE_METHOD_MAP = { + preprocessor_pb2.ResizeImage.AREA: tf.image.ResizeMethod.AREA, + preprocessor_pb2.ResizeImage.BICUBIC: tf.image.ResizeMethod.BICUBIC, + preprocessor_pb2.ResizeImage.BILINEAR: tf.image.ResizeMethod.BILINEAR, + preprocessor_pb2.ResizeImage.NEAREST_NEIGHBOR: ( + tf.image.ResizeMethod.NEAREST_NEIGHBOR), +} + + +def build(preprocessor_step_config): + """Builds preprocessing step based on the configuration. + + Args: + preprocessor_step_config: PreprocessingStep configuration proto. + + Returns: + function, argmap: A callable function and an argument map to call function + with. + + Raises: + ValueError: On invalid configuration. + """ + step_type = preprocessor_step_config.WhichOneof('preprocessing_step') + + if step_type in PREPROCESSING_FUNCTION_MAP: + preprocessing_function = PREPROCESSING_FUNCTION_MAP[step_type] + step_config = _get_step_config_from_proto(preprocessor_step_config, + step_type) + function_args = _get_dict_from_proto(step_config) + return (preprocessing_function, function_args) + + if step_type == 'random_horizontal_flip': + config = preprocessor_step_config.random_horizontal_flip + return (preprocessor.random_horizontal_flip, + { + 'keypoint_flip_permutation': tuple( + config.keypoint_flip_permutation), + }) + + if step_type == 'random_vertical_flip': + config = preprocessor_step_config.random_vertical_flip + return (preprocessor.random_vertical_flip, + { + 'keypoint_flip_permutation': tuple( + config.keypoint_flip_permutation), + }) + + if step_type == 'random_rotation90': + return (preprocessor.random_rotation90, {}) + + if step_type == 'random_crop_image': + config = preprocessor_step_config.random_crop_image + return (preprocessor.random_crop_image, + { + 'min_object_covered': config.min_object_covered, + 'aspect_ratio_range': (config.min_aspect_ratio, + config.max_aspect_ratio), + 'area_range': (config.min_area, config.max_area), + 'overlap_thresh': config.overlap_thresh, + 'random_coef': config.random_coef, + }) + + if step_type == 'random_pad_image': + config = preprocessor_step_config.random_pad_image + min_image_size = None + if (config.HasField('min_image_height') != + config.HasField('min_image_width')): + raise ValueError('min_image_height and min_image_width should be either ' + 'both set or both unset.') + if config.HasField('min_image_height'): + min_image_size = (config.min_image_height, config.min_image_width) + + max_image_size = None + if (config.HasField('max_image_height') 
!= + config.HasField('max_image_width')): + raise ValueError('max_image_height and max_image_width should be either ' + 'both set or both unset.') + if config.HasField('max_image_height'): + max_image_size = (config.max_image_height, config.max_image_width) + + pad_color = config.pad_color + if pad_color and len(pad_color) != 3: + raise ValueError('pad_color should have 3 elements (RGB) if set!') + if not pad_color: + pad_color = None + return (preprocessor.random_pad_image, + { + 'min_image_size': min_image_size, + 'max_image_size': max_image_size, + 'pad_color': pad_color, + }) + + if step_type == 'random_crop_pad_image': + config = preprocessor_step_config.random_crop_pad_image + min_padded_size_ratio = config.min_padded_size_ratio + if min_padded_size_ratio and len(min_padded_size_ratio) != 2: + raise ValueError('min_padded_size_ratio should have 2 elements if set!') + max_padded_size_ratio = config.max_padded_size_ratio + if max_padded_size_ratio and len(max_padded_size_ratio) != 2: + raise ValueError('max_padded_size_ratio should have 2 elements if set!') + pad_color = config.pad_color + if pad_color and len(pad_color) != 3: + raise ValueError('pad_color should have 3 elements if set!') + kwargs = { + 'min_object_covered': config.min_object_covered, + 'aspect_ratio_range': (config.min_aspect_ratio, + config.max_aspect_ratio), + 'area_range': (config.min_area, config.max_area), + 'overlap_thresh': config.overlap_thresh, + 'random_coef': config.random_coef, + } + if min_padded_size_ratio: + kwargs['min_padded_size_ratio'] = tuple(min_padded_size_ratio) + if max_padded_size_ratio: + kwargs['max_padded_size_ratio'] = tuple(max_padded_size_ratio) + if pad_color: + kwargs['pad_color'] = tuple(pad_color) + return (preprocessor.random_crop_pad_image, kwargs) + + if step_type == 'random_resize_method': + config = preprocessor_step_config.random_resize_method + return (preprocessor.random_resize_method, + { + 'target_size': [config.target_height, config.target_width], + }) + + if step_type == 'resize_image': + config = preprocessor_step_config.resize_image + method = RESIZE_METHOD_MAP[config.method] + return (preprocessor.resize_image, + { + 'new_height': config.new_height, + 'new_width': config.new_width, + 'method': method + }) + + if step_type == 'ssd_random_crop': + config = preprocessor_step_config.ssd_random_crop + if config.operations: + min_object_covered = [op.min_object_covered for op in config.operations] + aspect_ratio_range = [(op.min_aspect_ratio, op.max_aspect_ratio) + for op in config.operations] + area_range = [(op.min_area, op.max_area) for op in config.operations] + overlap_thresh = [op.overlap_thresh for op in config.operations] + random_coef = [op.random_coef for op in config.operations] + return (preprocessor.ssd_random_crop, + { + 'min_object_covered': min_object_covered, + 'aspect_ratio_range': aspect_ratio_range, + 'area_range': area_range, + 'overlap_thresh': overlap_thresh, + 'random_coef': random_coef, + }) + return (preprocessor.ssd_random_crop, {}) + + if step_type == 'ssd_random_crop_pad': + config = preprocessor_step_config.ssd_random_crop_pad + if config.operations: + min_object_covered = [op.min_object_covered for op in config.operations] + aspect_ratio_range = [(op.min_aspect_ratio, op.max_aspect_ratio) + for op in config.operations] + area_range = [(op.min_area, op.max_area) for op in config.operations] + overlap_thresh = [op.overlap_thresh for op in config.operations] + random_coef = [op.random_coef for op in config.operations] + min_padded_size_ratio = 
[tuple(op.min_padded_size_ratio) + for op in config.operations] + max_padded_size_ratio = [tuple(op.max_padded_size_ratio) + for op in config.operations] + pad_color = [(op.pad_color_r, op.pad_color_g, op.pad_color_b) + for op in config.operations] + return (preprocessor.ssd_random_crop_pad, + { + 'min_object_covered': min_object_covered, + 'aspect_ratio_range': aspect_ratio_range, + 'area_range': area_range, + 'overlap_thresh': overlap_thresh, + 'random_coef': random_coef, + 'min_padded_size_ratio': min_padded_size_ratio, + 'max_padded_size_ratio': max_padded_size_ratio, + 'pad_color': pad_color, + }) + return (preprocessor.ssd_random_crop_pad, {}) + + if step_type == 'ssd_random_crop_fixed_aspect_ratio': + config = preprocessor_step_config.ssd_random_crop_fixed_aspect_ratio + if config.operations: + min_object_covered = [op.min_object_covered for op in config.operations] + area_range = [(op.min_area, op.max_area) for op in config.operations] + overlap_thresh = [op.overlap_thresh for op in config.operations] + random_coef = [op.random_coef for op in config.operations] + return (preprocessor.ssd_random_crop_fixed_aspect_ratio, + { + 'min_object_covered': min_object_covered, + 'aspect_ratio': config.aspect_ratio, + 'area_range': area_range, + 'overlap_thresh': overlap_thresh, + 'random_coef': random_coef, + }) + return (preprocessor.ssd_random_crop_fixed_aspect_ratio, {}) + + if step_type == 'ssd_random_crop_pad_fixed_aspect_ratio': + config = preprocessor_step_config.ssd_random_crop_pad_fixed_aspect_ratio + kwargs = {} + aspect_ratio = config.aspect_ratio + if aspect_ratio: + kwargs['aspect_ratio'] = aspect_ratio + min_padded_size_ratio = config.min_padded_size_ratio + if min_padded_size_ratio: + if len(min_padded_size_ratio) != 2: + raise ValueError('min_padded_size_ratio should have 2 elements if set!') + kwargs['min_padded_size_ratio'] = tuple(min_padded_size_ratio) + max_padded_size_ratio = config.max_padded_size_ratio + if max_padded_size_ratio: + if len(max_padded_size_ratio) != 2: + raise ValueError('max_padded_size_ratio should have 2 elements if set!') + kwargs['max_padded_size_ratio'] = tuple(max_padded_size_ratio) + if config.operations: + kwargs['min_object_covered'] = [op.min_object_covered + for op in config.operations] + kwargs['aspect_ratio_range'] = [(op.min_aspect_ratio, op.max_aspect_ratio) + for op in config.operations] + kwargs['area_range'] = [(op.min_area, op.max_area) + for op in config.operations] + kwargs['overlap_thresh'] = [op.overlap_thresh for op in config.operations] + kwargs['random_coef'] = [op.random_coef for op in config.operations] + return (preprocessor.ssd_random_crop_pad_fixed_aspect_ratio, kwargs) + + raise ValueError('Unknown preprocessing step.') diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/preprocessor_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/preprocessor_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..9e5d8de8e9ab84836c918b40cd17345543e18d19 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/preprocessor_builder_test.py @@ -0,0 +1,566 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for preprocessor_builder.""" + +import tensorflow as tf + +from google.protobuf import text_format + +from object_detection.builders import preprocessor_builder +from object_detection.core import preprocessor +from object_detection.protos import preprocessor_pb2 + + +class PreprocessorBuilderTest(tf.test.TestCase): + + def assert_dictionary_close(self, dict1, dict2): + """Helper to check if two dicts with floatst or integers are close.""" + self.assertEqual(sorted(dict1.keys()), sorted(dict2.keys())) + for key in dict1: + value = dict1[key] + if isinstance(value, float): + self.assertAlmostEqual(value, dict2[key]) + else: + self.assertEqual(value, dict2[key]) + + def test_build_normalize_image(self): + preprocessor_text_proto = """ + normalize_image { + original_minval: 0.0 + original_maxval: 255.0 + target_minval: -1.0 + target_maxval: 1.0 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.normalize_image) + self.assertEqual(args, { + 'original_minval': 0.0, + 'original_maxval': 255.0, + 'target_minval': -1.0, + 'target_maxval': 1.0, + }) + + def test_build_random_horizontal_flip(self): + preprocessor_text_proto = """ + random_horizontal_flip { + keypoint_flip_permutation: 1 + keypoint_flip_permutation: 0 + keypoint_flip_permutation: 2 + keypoint_flip_permutation: 3 + keypoint_flip_permutation: 5 + keypoint_flip_permutation: 4 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_horizontal_flip) + self.assertEqual(args, {'keypoint_flip_permutation': (1, 0, 2, 3, 5, 4)}) + + def test_build_random_vertical_flip(self): + preprocessor_text_proto = """ + random_vertical_flip { + keypoint_flip_permutation: 1 + keypoint_flip_permutation: 0 + keypoint_flip_permutation: 2 + keypoint_flip_permutation: 3 + keypoint_flip_permutation: 5 + keypoint_flip_permutation: 4 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_vertical_flip) + self.assertEqual(args, {'keypoint_flip_permutation': (1, 0, 2, 3, 5, 4)}) + + def test_build_random_rotation90(self): + preprocessor_text_proto = """ + random_rotation90 {} + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_rotation90) + self.assertEqual(args, {}) + + def test_build_random_pixel_value_scale(self): + preprocessor_text_proto = """ + random_pixel_value_scale { + minval: 0.8 + maxval: 1.2 + 
} + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_pixel_value_scale) + self.assert_dictionary_close(args, {'minval': 0.8, 'maxval': 1.2}) + + def test_build_random_image_scale(self): + preprocessor_text_proto = """ + random_image_scale { + min_scale_ratio: 0.8 + max_scale_ratio: 2.2 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_image_scale) + self.assert_dictionary_close(args, {'min_scale_ratio': 0.8, + 'max_scale_ratio': 2.2}) + + def test_build_random_rgb_to_gray(self): + preprocessor_text_proto = """ + random_rgb_to_gray { + probability: 0.8 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_rgb_to_gray) + self.assert_dictionary_close(args, {'probability': 0.8}) + + def test_build_random_adjust_brightness(self): + preprocessor_text_proto = """ + random_adjust_brightness { + max_delta: 0.2 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_adjust_brightness) + self.assert_dictionary_close(args, {'max_delta': 0.2}) + + def test_build_random_adjust_contrast(self): + preprocessor_text_proto = """ + random_adjust_contrast { + min_delta: 0.7 + max_delta: 1.1 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_adjust_contrast) + self.assert_dictionary_close(args, {'min_delta': 0.7, 'max_delta': 1.1}) + + def test_build_random_adjust_hue(self): + preprocessor_text_proto = """ + random_adjust_hue { + max_delta: 0.01 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_adjust_hue) + self.assert_dictionary_close(args, {'max_delta': 0.01}) + + def test_build_random_adjust_saturation(self): + preprocessor_text_proto = """ + random_adjust_saturation { + min_delta: 0.75 + max_delta: 1.15 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_adjust_saturation) + self.assert_dictionary_close(args, {'min_delta': 0.75, 'max_delta': 1.15}) + + def test_build_random_distort_color(self): + preprocessor_text_proto = """ + random_distort_color { + color_ordering: 1 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_distort_color) + 
self.assertEqual(args, {'color_ordering': 1}) + + def test_build_random_jitter_boxes(self): + preprocessor_text_proto = """ + random_jitter_boxes { + ratio: 0.1 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_jitter_boxes) + self.assert_dictionary_close(args, {'ratio': 0.1}) + + def test_build_random_crop_image(self): + preprocessor_text_proto = """ + random_crop_image { + min_object_covered: 0.75 + min_aspect_ratio: 0.75 + max_aspect_ratio: 1.5 + min_area: 0.25 + max_area: 0.875 + overlap_thresh: 0.5 + random_coef: 0.125 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_crop_image) + self.assertEqual(args, { + 'min_object_covered': 0.75, + 'aspect_ratio_range': (0.75, 1.5), + 'area_range': (0.25, 0.875), + 'overlap_thresh': 0.5, + 'random_coef': 0.125, + }) + + def test_build_random_pad_image(self): + preprocessor_text_proto = """ + random_pad_image { + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_pad_image) + self.assertEqual(args, { + 'min_image_size': None, + 'max_image_size': None, + 'pad_color': None, + }) + + def test_build_random_crop_pad_image(self): + preprocessor_text_proto = """ + random_crop_pad_image { + min_object_covered: 0.75 + min_aspect_ratio: 0.75 + max_aspect_ratio: 1.5 + min_area: 0.25 + max_area: 0.875 + overlap_thresh: 0.5 + random_coef: 0.125 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_crop_pad_image) + self.assertEqual(args, { + 'min_object_covered': 0.75, + 'aspect_ratio_range': (0.75, 1.5), + 'area_range': (0.25, 0.875), + 'overlap_thresh': 0.5, + 'random_coef': 0.125, + }) + + def test_build_random_crop_pad_image_with_optional_parameters(self): + preprocessor_text_proto = """ + random_crop_pad_image { + min_object_covered: 0.75 + min_aspect_ratio: 0.75 + max_aspect_ratio: 1.5 + min_area: 0.25 + max_area: 0.875 + overlap_thresh: 0.5 + random_coef: 0.125 + min_padded_size_ratio: 0.5 + min_padded_size_ratio: 0.75 + max_padded_size_ratio: 0.5 + max_padded_size_ratio: 0.75 + pad_color: 0.5 + pad_color: 0.5 + pad_color: 1.0 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_crop_pad_image) + self.assertEqual(args, { + 'min_object_covered': 0.75, + 'aspect_ratio_range': (0.75, 1.5), + 'area_range': (0.25, 0.875), + 'overlap_thresh': 0.5, + 'random_coef': 0.125, + 'min_padded_size_ratio': (0.5, 0.75), + 'max_padded_size_ratio': (0.5, 0.75), + 'pad_color': (0.5, 0.5, 1.0) + }) + + def test_build_random_crop_to_aspect_ratio(self): + preprocessor_text_proto = """ + random_crop_to_aspect_ratio { + aspect_ratio: 0.85 + overlap_thresh: 0.35 + } + """ + preprocessor_proto = 
preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_crop_to_aspect_ratio) + self.assert_dictionary_close(args, {'aspect_ratio': 0.85, + 'overlap_thresh': 0.35}) + + def test_build_random_black_patches(self): + preprocessor_text_proto = """ + random_black_patches { + max_black_patches: 20 + probability: 0.95 + size_to_image_ratio: 0.12 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_black_patches) + self.assert_dictionary_close(args, {'max_black_patches': 20, + 'probability': 0.95, + 'size_to_image_ratio': 0.12}) + + def test_build_random_resize_method(self): + preprocessor_text_proto = """ + random_resize_method { + target_height: 75 + target_width: 100 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.random_resize_method) + self.assert_dictionary_close(args, {'target_size': [75, 100]}) + + def test_build_scale_boxes_to_pixel_coordinates(self): + preprocessor_text_proto = """ + scale_boxes_to_pixel_coordinates {} + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.scale_boxes_to_pixel_coordinates) + self.assertEqual(args, {}) + + def test_build_resize_image(self): + preprocessor_text_proto = """ + resize_image { + new_height: 75 + new_width: 100 + method: BICUBIC + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.resize_image) + self.assertEqual(args, {'new_height': 75, + 'new_width': 100, + 'method': tf.image.ResizeMethod.BICUBIC}) + + def test_build_rgb_to_gray(self): + preprocessor_text_proto = """ + rgb_to_gray {} + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.rgb_to_gray) + self.assertEqual(args, {}) + + def test_build_subtract_channel_mean(self): + preprocessor_text_proto = """ + subtract_channel_mean { + means: [1.0, 2.0, 3.0] + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.subtract_channel_mean) + self.assertEqual(args, {'means': [1.0, 2.0, 3.0]}) + + def test_build_ssd_random_crop(self): + preprocessor_text_proto = """ + ssd_random_crop { + operations { + min_object_covered: 0.0 + min_aspect_ratio: 0.875 + max_aspect_ratio: 1.125 + min_area: 0.5 + max_area: 1.0 + overlap_thresh: 0.0 + random_coef: 0.375 + } + operations { + min_object_covered: 0.25 + min_aspect_ratio: 0.75 + max_aspect_ratio: 1.5 + min_area: 0.5 + max_area: 1.0 + overlap_thresh: 0.25 + random_coef: 0.375 + } + } 
+ """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.ssd_random_crop) + self.assertEqual(args, {'min_object_covered': [0.0, 0.25], + 'aspect_ratio_range': [(0.875, 1.125), (0.75, 1.5)], + 'area_range': [(0.5, 1.0), (0.5, 1.0)], + 'overlap_thresh': [0.0, 0.25], + 'random_coef': [0.375, 0.375]}) + + def test_build_ssd_random_crop_empty_operations(self): + preprocessor_text_proto = """ + ssd_random_crop { + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.ssd_random_crop) + self.assertEqual(args, {}) + + def test_build_ssd_random_crop_pad(self): + preprocessor_text_proto = """ + ssd_random_crop_pad { + operations { + min_object_covered: 0.0 + min_aspect_ratio: 0.875 + max_aspect_ratio: 1.125 + min_area: 0.5 + max_area: 1.0 + overlap_thresh: 0.0 + random_coef: 0.375 + min_padded_size_ratio: [1.0, 1.0] + max_padded_size_ratio: [2.0, 2.0] + pad_color_r: 0.5 + pad_color_g: 0.5 + pad_color_b: 0.5 + } + operations { + min_object_covered: 0.25 + min_aspect_ratio: 0.75 + max_aspect_ratio: 1.5 + min_area: 0.5 + max_area: 1.0 + overlap_thresh: 0.25 + random_coef: 0.375 + min_padded_size_ratio: [1.0, 1.0] + max_padded_size_ratio: [2.0, 2.0] + pad_color_r: 0.5 + pad_color_g: 0.5 + pad_color_b: 0.5 + } + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.ssd_random_crop_pad) + self.assertEqual(args, {'min_object_covered': [0.0, 0.25], + 'aspect_ratio_range': [(0.875, 1.125), (0.75, 1.5)], + 'area_range': [(0.5, 1.0), (0.5, 1.0)], + 'overlap_thresh': [0.0, 0.25], + 'random_coef': [0.375, 0.375], + 'min_padded_size_ratio': [(1.0, 1.0), (1.0, 1.0)], + 'max_padded_size_ratio': [(2.0, 2.0), (2.0, 2.0)], + 'pad_color': [(0.5, 0.5, 0.5), (0.5, 0.5, 0.5)]}) + + def test_build_ssd_random_crop_fixed_aspect_ratio(self): + preprocessor_text_proto = """ + ssd_random_crop_fixed_aspect_ratio { + operations { + min_object_covered: 0.0 + min_area: 0.5 + max_area: 1.0 + overlap_thresh: 0.0 + random_coef: 0.375 + } + operations { + min_object_covered: 0.25 + min_area: 0.5 + max_area: 1.0 + overlap_thresh: 0.25 + random_coef: 0.375 + } + aspect_ratio: 0.875 + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, preprocessor.ssd_random_crop_fixed_aspect_ratio) + self.assertEqual(args, {'min_object_covered': [0.0, 0.25], + 'aspect_ratio': 0.875, + 'area_range': [(0.5, 1.0), (0.5, 1.0)], + 'overlap_thresh': [0.0, 0.25], + 'random_coef': [0.375, 0.375]}) + + def test_build_ssd_random_crop_pad_fixed_aspect_ratio(self): + preprocessor_text_proto = """ + ssd_random_crop_pad_fixed_aspect_ratio { + operations { + min_object_covered: 0.0 + min_aspect_ratio: 0.875 + max_aspect_ratio: 1.125 + min_area: 0.5 + max_area: 1.0 + overlap_thresh: 0.0 + random_coef: 0.375 + } + operations { + min_object_covered: 0.25 + min_aspect_ratio: 0.75 + max_aspect_ratio: 1.5 + min_area: 0.5 + max_area: 1.0 + overlap_thresh: 
0.25 + random_coef: 0.375 + } + aspect_ratio: 0.875 + min_padded_size_ratio: [1.0, 1.0] + max_padded_size_ratio: [2.0, 2.0] + } + """ + preprocessor_proto = preprocessor_pb2.PreprocessingStep() + text_format.Merge(preprocessor_text_proto, preprocessor_proto) + function, args = preprocessor_builder.build(preprocessor_proto) + self.assertEqual(function, + preprocessor.ssd_random_crop_pad_fixed_aspect_ratio) + self.assertEqual(args, {'min_object_covered': [0.0, 0.25], + 'aspect_ratio': 0.875, + 'aspect_ratio_range': [(0.875, 1.125), (0.75, 1.5)], + 'area_range': [(0.5, 1.0), (0.5, 1.0)], + 'overlap_thresh': [0.0, 0.25], + 'random_coef': [0.375, 0.375], + 'min_padded_size_ratio': (1.0, 1.0), + 'max_padded_size_ratio': (2.0, 2.0)}) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/region_similarity_calculator_builder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/region_similarity_calculator_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..fa1d671754df07043957ccf9e04f651c114c1cf9 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/region_similarity_calculator_builder.py @@ -0,0 +1,56 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Builder for region similarity calculators.""" + +from object_detection.core import region_similarity_calculator +from object_detection.protos import region_similarity_calculator_pb2 + + +def build(region_similarity_calculator_config): + """Builds region similarity calculator based on the configuration. + + Builds one of [IouSimilarity, IoaSimilarity, NegSqDistSimilarity] objects. See + core/region_similarity_calculator.proto for details. + + Args: + region_similarity_calculator_config: RegionSimilarityCalculator + configuration proto. + + Returns: + region_similarity_calculator: RegionSimilarityCalculator object. + + Raises: + ValueError: On unknown region similarity calculator. 
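+
+  For illustration, a calculator might be built from a text proto roughly as
+  follows (parsing with google.protobuf.text_format, as the accompanying
+  builder test does):
+
+    similarity_calc_proto = (
+        region_similarity_calculator_pb2.RegionSimilarityCalculator())
+    text_format.Merge('iou_similarity { }', similarity_calc_proto)
+    similarity_calc = build(similarity_calc_proto)
+    # similarity_calc is a region_similarity_calculator.IouSimilarity instance.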
+ """ + + if not isinstance( + region_similarity_calculator_config, + region_similarity_calculator_pb2.RegionSimilarityCalculator): + raise ValueError( + 'region_similarity_calculator_config not of type ' + 'region_similarity_calculator_pb2.RegionsSimilarityCalculator') + + similarity_calculator = region_similarity_calculator_config.WhichOneof( + 'region_similarity') + if similarity_calculator == 'iou_similarity': + return region_similarity_calculator.IouSimilarity() + if similarity_calculator == 'ioa_similarity': + return region_similarity_calculator.IoaSimilarity() + if similarity_calculator == 'neg_sq_dist_similarity': + return region_similarity_calculator.NegSqDistSimilarity() + + raise ValueError('Unknown region similarity calculator.') + diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/region_similarity_calculator_builder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/region_similarity_calculator_builder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..ca3a5512e374fc03f39de1f3f77cf22bc6f6556e --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/builders/region_similarity_calculator_builder_test.py @@ -0,0 +1,67 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for region_similarity_calculator_builder.""" + +import tensorflow as tf + +from google.protobuf import text_format +from object_detection.builders import region_similarity_calculator_builder +from object_detection.core import region_similarity_calculator +from object_detection.protos import region_similarity_calculator_pb2 as sim_calc_pb2 + + +class RegionSimilarityCalculatorBuilderTest(tf.test.TestCase): + + def testBuildIoaSimilarityCalculator(self): + similarity_calc_text_proto = """ + ioa_similarity { + } + """ + similarity_calc_proto = sim_calc_pb2.RegionSimilarityCalculator() + text_format.Merge(similarity_calc_text_proto, similarity_calc_proto) + similarity_calc = region_similarity_calculator_builder.build( + similarity_calc_proto) + self.assertTrue(isinstance(similarity_calc, + region_similarity_calculator.IoaSimilarity)) + + def testBuildIouSimilarityCalculator(self): + similarity_calc_text_proto = """ + iou_similarity { + } + """ + similarity_calc_proto = sim_calc_pb2.RegionSimilarityCalculator() + text_format.Merge(similarity_calc_text_proto, similarity_calc_proto) + similarity_calc = region_similarity_calculator_builder.build( + similarity_calc_proto) + self.assertTrue(isinstance(similarity_calc, + region_similarity_calculator.IouSimilarity)) + + def testBuildNegSqDistSimilarityCalculator(self): + similarity_calc_text_proto = """ + neg_sq_dist_similarity { + } + """ + similarity_calc_proto = sim_calc_pb2.RegionSimilarityCalculator() + text_format.Merge(similarity_calc_text_proto, similarity_calc_proto) + similarity_calc = region_similarity_calculator_builder.build( + similarity_calc_proto) + self.assertTrue(isinstance(similarity_calc, + region_similarity_calculator. + NegSqDistSimilarity)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/__init__.py @@ -0,0 +1 @@ + diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/anchor_generator.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/anchor_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..f2797ef77d3e83597e18db10e5ba87f24364d8aa --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/anchor_generator.py @@ -0,0 +1,150 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Base anchor generator. + +The job of the anchor generator is to create (or load) a collection +of bounding boxes to be used as anchors. 
+ +Generated anchors are assumed to match some convolutional grid or list of grid +shapes. For example, we might want to generate anchors matching an 8x8 +feature map and a 4x4 feature map. If we place 3 anchors per grid location +on the first feature map and 6 anchors per grid location on the second feature +map, then 3*8*8 + 6*4*4 = 288 anchors are generated in total. + +To support fully convolutional settings, feature map shapes are passed +dynamically at generation time. The number of anchors to place at each location +is static --- implementations of AnchorGenerator must always be able return +the number of anchors that it uses per location for each feature map. +""" +from abc import ABCMeta +from abc import abstractmethod + +import tensorflow as tf + + +class AnchorGenerator(object): + """Abstract base class for anchor generators.""" + __metaclass__ = ABCMeta + + @abstractmethod + def name_scope(self): + """Name scope. + + Must be defined by implementations. + + Returns: + a string representing the name scope of the anchor generation operation. + """ + pass + + @property + def check_num_anchors(self): + """Whether to dynamically check the number of anchors generated. + + Can be overridden by implementations that would like to disable this + behavior. + + Returns: + a boolean controlling whether the Generate function should dynamically + check the number of anchors generated against the mathematically + expected number of anchors. + """ + return True + + @abstractmethod + def num_anchors_per_location(self): + """Returns the number of anchors per spatial location. + + Returns: + a list of integers, one for each expected feature map to be passed to + the `generate` function. + """ + pass + + def generate(self, feature_map_shape_list, **params): + """Generates a collection of bounding boxes to be used as anchors. + + TODO(rathodv): remove **params from argument list and make stride and + offsets (for multiple_grid_anchor_generator) constructor arguments. + + Args: + feature_map_shape_list: list of (height, width) pairs in the format + [(height_0, width_0), (height_1, width_1), ...] that the generated + anchors must align with. Pairs can be provided as 1-dimensional + integer tensors of length 2 or simply as tuples of integers. + **params: parameters for anchor generation op + + Returns: + boxes_list: a list of BoxLists each holding anchor boxes corresponding to + the input feature map shapes. + + Raises: + ValueError: if the number of feature map shapes does not match the length + of NumAnchorsPerLocation. + """ + if self.check_num_anchors and ( + len(feature_map_shape_list) != len(self.num_anchors_per_location())): + raise ValueError('Number of feature maps is expected to equal the length ' + 'of `num_anchors_per_location`.') + with tf.name_scope(self.name_scope()): + anchors_list = self._generate(feature_map_shape_list, **params) + if self.check_num_anchors: + with tf.control_dependencies([ + self._assert_correct_number_of_anchors( + anchors_list, feature_map_shape_list)]): + for item in anchors_list: + item.set(tf.identity(item.get())) + return anchors_list + + @abstractmethod + def _generate(self, feature_map_shape_list, **params): + """To be overridden by implementations. + + Args: + feature_map_shape_list: list of (height, width) pairs in the format + [(height_0, width_0), (height_1, width_1), ...] that the generated + anchors must align with. + **params: parameters for anchor generation op + + Returns: + boxes_list: a list of BoxList, each holding a collection of N anchor + boxes. 
+ """ + pass + + def _assert_correct_number_of_anchors(self, anchors_list, + feature_map_shape_list): + """Assert that correct number of anchors was generated. + + Args: + anchors_list: A list of box_list.BoxList object holding anchors generated. + feature_map_shape_list: list of (height, width) pairs in the format + [(height_0, width_0), (height_1, width_1), ...] that the generated + anchors must align with. + Returns: + Op that raises InvalidArgumentError if the number of anchors does not + match the number of expected anchors. + """ + expected_num_anchors = 0 + actual_num_anchors = 0 + for num_anchors_per_location, feature_map_shape, anchors in zip( + self.num_anchors_per_location(), feature_map_shape_list, anchors_list): + expected_num_anchors += (num_anchors_per_location + * feature_map_shape[0] + * feature_map_shape[1]) + actual_num_anchors += anchors.num_boxes() + return tf.assert_equal(expected_num_anchors, actual_num_anchors) + diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/balanced_positive_negative_sampler.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/balanced_positive_negative_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..7042c40fffbef3126fc90a81114693ac4c0c8bf6 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/balanced_positive_negative_sampler.py @@ -0,0 +1,105 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Class to subsample minibatches by balancing positives and negatives. + +Subsamples minibatches based on a pre-specified positive fraction in range +[0,1]. The class presumes there are many more negatives than positive examples: +if the desired batch_size cannot be achieved with the pre-specified positive +fraction, it fills the rest with negative examples. If this is not sufficient +for obtaining the desired batch_size, it returns fewer examples. + +The main function to call is Subsample(self, indicator, labels). For convenience +one can also call SubsampleWeights(self, weights, labels) which is defined in +the minibatch_sampler base class. +""" + +import tensorflow as tf + +from object_detection.core import minibatch_sampler + + +class BalancedPositiveNegativeSampler(minibatch_sampler.MinibatchSampler): + """Subsamples minibatches to a desired balance of positives and negatives.""" + + def __init__(self, positive_fraction=0.5): + """Constructs a minibatch sampler. + + Args: + positive_fraction: desired fraction of positive examples (scalar in [0,1]) + in the batch. + + Raises: + ValueError: if positive_fraction < 0, or positive_fraction > 1 + """ + if positive_fraction < 0 or positive_fraction > 1: + raise ValueError('positive_fraction should be in range [0,1]. ' + 'Received: %s.' 
% positive_fraction) + self._positive_fraction = positive_fraction + + def subsample(self, indicator, batch_size, labels): + """Returns subsampled minibatch. + + Args: + indicator: boolean tensor of shape [N] whose True entries can be sampled. + batch_size: desired batch size. If None, keeps all positive samples and + randomly selects negative samples so that the positive sample fraction + matches self._positive_fraction. + labels: boolean tensor of shape [N] denoting positive(=True) and negative + (=False) examples. + + Returns: + is_sampled: boolean tensor of shape [N], True for entries which are + sampled. + + Raises: + ValueError: if labels and indicator are not 1D boolean tensors. + """ + if len(indicator.get_shape().as_list()) != 1: + raise ValueError('indicator must be 1 dimensional, got a tensor of ' + 'shape %s' % indicator.get_shape()) + if len(labels.get_shape().as_list()) != 1: + raise ValueError('labels must be 1 dimensional, got a tensor of ' + 'shape %s' % labels.get_shape()) + if labels.dtype != tf.bool: + raise ValueError('labels should be of type bool. Received: %s' % + labels.dtype) + if indicator.dtype != tf.bool: + raise ValueError('indicator should be of type bool. Received: %s' % + indicator.dtype) + + # Only sample from indicated samples + negative_idx = tf.logical_not(labels) + positive_idx = tf.logical_and(labels, indicator) + negative_idx = tf.logical_and(negative_idx, indicator) + + # Sample positive and negative samples separately + if batch_size is None: + max_num_pos = tf.reduce_sum(tf.to_int32(positive_idx)) + else: + max_num_pos = int(self._positive_fraction * batch_size) + sampled_pos_idx = self.subsample_indicator(positive_idx, max_num_pos) + num_sampled_pos = tf.reduce_sum(tf.cast(sampled_pos_idx, tf.int32)) + if batch_size is None: + negative_positive_ratio = ( + 1 - self._positive_fraction) / self._positive_fraction + max_num_neg = tf.to_int32( + negative_positive_ratio * tf.to_float(num_sampled_pos)) + else: + max_num_neg = batch_size - num_sampled_pos + sampled_neg_idx = self.subsample_indicator(negative_idx, max_num_neg) + + sampled_idx = tf.logical_or(sampled_pos_idx, sampled_neg_idx) + return sampled_idx diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/balanced_positive_negative_sampler_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/balanced_positive_negative_sampler_test.py new file mode 100644 index 0000000000000000000000000000000000000000..e39de5342c4f01afa38725a56ee543c6eec27d13 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/balanced_positive_negative_sampler_test.py @@ -0,0 +1,106 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for object_detection.core.balanced_positive_negative_sampler.""" + +import numpy as np +import tensorflow as tf + +from object_detection.core import balanced_positive_negative_sampler +from object_detection.utils import test_case + + +class BalancedPositiveNegativeSamplerTest(test_case.TestCase): + + def test_subsample_all_examples(self): + numpy_labels = np.random.permutation(300) + indicator = tf.constant(np.ones(300) == 1) + numpy_labels = (numpy_labels - 200) > 0 + + labels = tf.constant(numpy_labels) + + sampler = (balanced_positive_negative_sampler. + BalancedPositiveNegativeSampler()) + is_sampled = sampler.subsample(indicator, 64, labels) + with self.test_session() as sess: + is_sampled = sess.run(is_sampled) + self.assertTrue(sum(is_sampled) == 64) + self.assertTrue(sum(np.logical_and(numpy_labels, is_sampled)) == 32) + self.assertTrue(sum(np.logical_and( + np.logical_not(numpy_labels), is_sampled)) == 32) + + def test_subsample_selection(self): + # Test random sampling when only some examples can be sampled: + # 100 samples, 20 positives, 10 positives cannot be sampled + numpy_labels = np.arange(100) + numpy_indicator = numpy_labels < 90 + indicator = tf.constant(numpy_indicator) + numpy_labels = (numpy_labels - 80) >= 0 + + labels = tf.constant(numpy_labels) + + sampler = (balanced_positive_negative_sampler. + BalancedPositiveNegativeSampler()) + is_sampled = sampler.subsample(indicator, 64, labels) + with self.test_session() as sess: + is_sampled = sess.run(is_sampled) + self.assertTrue(sum(is_sampled) == 64) + self.assertTrue(sum(np.logical_and(numpy_labels, is_sampled)) == 10) + self.assertTrue(sum(np.logical_and( + np.logical_not(numpy_labels), is_sampled)) == 54) + self.assertAllEqual(is_sampled, np.logical_and(is_sampled, + numpy_indicator)) + + def test_subsample_selection_no_batch_size(self): + # Test random sampling when only some examples can be sampled: + # 1000 samples, 6 positives (5 can be sampled). + numpy_labels = np.arange(1000) + numpy_indicator = numpy_labels < 999 + indicator = tf.constant(numpy_indicator) + numpy_labels = (numpy_labels - 994) >= 0 + + labels = tf.constant(numpy_labels) + + sampler = (balanced_positive_negative_sampler. + BalancedPositiveNegativeSampler(0.01)) + is_sampled = sampler.subsample(indicator, None, labels) + with self.test_session() as sess: + is_sampled = sess.run(is_sampled) + self.assertTrue(sum(is_sampled) == 500) + self.assertTrue(sum(np.logical_and(numpy_labels, is_sampled)) == 5) + self.assertTrue(sum(np.logical_and( + np.logical_not(numpy_labels), is_sampled)) == 495) + self.assertAllEqual(is_sampled, np.logical_and(is_sampled, + numpy_indicator)) + + def test_raises_error_with_incorrect_label_shape(self): + labels = tf.constant([[True, False, False]]) + indicator = tf.constant([True, False, True]) + sampler = (balanced_positive_negative_sampler. + BalancedPositiveNegativeSampler()) + with self.assertRaises(ValueError): + sampler.subsample(indicator, 64, labels) + + def test_raises_error_with_incorrect_indicator_shape(self): + labels = tf.constant([True, False, False]) + indicator = tf.constant([[True, False, True]]) + sampler = (balanced_positive_negative_sampler. 
+ BalancedPositiveNegativeSampler()) + with self.assertRaises(ValueError): + sampler.subsample(indicator, 64, labels) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/batcher.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/batcher.py new file mode 100644 index 0000000000000000000000000000000000000000..c5dfb712108d0f9ec797ef04c9a4a3620b189fea --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/batcher.py @@ -0,0 +1,136 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Provides functions to batch a dictionary of input tensors.""" +import collections + +import tensorflow as tf + +from object_detection.core import prefetcher + +rt_shape_str = '_runtime_shapes' + + +class BatchQueue(object): + """BatchQueue class. + + This class creates a batch queue to asynchronously enqueue tensors_dict. + It also adds a FIFO prefetcher so that the batches are readily available + for the consumers. Dequeue ops for a BatchQueue object can be created via + the Dequeue method which evaluates to a batch of tensor_dict. + + Example input pipeline with batching: + ------------------------------------ + key, string_tensor = slim.parallel_reader.parallel_read(...) + tensor_dict = decoder.decode(string_tensor) + tensor_dict = preprocessor.preprocess(tensor_dict, ...) + batch_queue = batcher.BatchQueue(tensor_dict, + batch_size=32, + batch_queue_capacity=2000, + num_batch_queue_threads=8, + prefetch_queue_capacity=20) + tensor_dict = batch_queue.dequeue() + outputs = Model(tensor_dict) + ... + ----------------------------------- + + Notes: + ----- + This class batches tensors of unequal sizes by zero padding and unpadding + them after generating a batch. This can be computationally expensive when + batching tensors (such as images) that are of vastly different sizes. So it is + recommended that the shapes of such tensors be fully defined in tensor_dict + while other lightweight tensors such as bounding box corners and class labels + can be of varying sizes. Use either crop or resize operations to fully define + the shape of an image in tensor_dict. + + It is also recommended to perform any preprocessing operations on tensors + before passing to BatchQueue and subsequently calling the Dequeue method. + + Another caveat is that this class does not read the last batch if it is not + full. The current implementation makes it hard to support that use case. So, + for evaluation, when it is critical to run all the examples through your + network use the input pipeline example mentioned in core/prefetcher.py. + """ + + def __init__(self, tensor_dict, batch_size, batch_queue_capacity, + num_batch_queue_threads, prefetch_queue_capacity): + """Constructs a batch queue holding tensor_dict. + + Args: + tensor_dict: dictionary of tensors to batch. 
+ batch_size: batch size. + batch_queue_capacity: max capacity of the queue from which the tensors are + batched. + num_batch_queue_threads: number of threads to use for batching. + prefetch_queue_capacity: max capacity of the queue used to prefetch + assembled batches. + """ + # Remember static shapes to set shapes of batched tensors. + static_shapes = collections.OrderedDict( + {key: tensor.get_shape() for key, tensor in tensor_dict.items()}) + # Remember runtime shapes to unpad tensors after batching. + runtime_shapes = collections.OrderedDict( + {(key + rt_shape_str): tf.shape(tensor) + for key, tensor in tensor_dict.items()}) + + all_tensors = tensor_dict + all_tensors.update(runtime_shapes) + batched_tensors = tf.train.batch( + all_tensors, + capacity=batch_queue_capacity, + batch_size=batch_size, + dynamic_pad=True, + num_threads=num_batch_queue_threads) + + self._queue = prefetcher.prefetch(batched_tensors, + prefetch_queue_capacity) + self._static_shapes = static_shapes + self._batch_size = batch_size + + def dequeue(self): + """Dequeues a batch of tensor_dict from the BatchQueue. + + TODO: use allow_smaller_final_batch to allow running over the whole eval set + + Returns: + A list of tensor_dicts of the requested batch_size. + """ + batched_tensors = self._queue.dequeue() + # Separate input tensors from tensors containing their runtime shapes. + tensors = {} + shapes = {} + for key, batched_tensor in batched_tensors.items(): + unbatched_tensor_list = tf.unstack(batched_tensor) + for i, unbatched_tensor in enumerate(unbatched_tensor_list): + if rt_shape_str in key: + shapes[(key[:-len(rt_shape_str)], i)] = unbatched_tensor + else: + tensors[(key, i)] = unbatched_tensor + + # Undo that padding using shapes and create a list of size `batch_size` that + # contains tensor dictionaries. + tensor_dict_list = [] + batch_size = self._batch_size + for batch_id in range(batch_size): + tensor_dict = {} + for key in self._static_shapes: + tensor_dict[key] = tf.slice(tensors[(key, batch_id)], + tf.zeros_like(shapes[(key, batch_id)]), + shapes[(key, batch_id)]) + tensor_dict[key].set_shape(self._static_shapes[key]) + tensor_dict_list.append(tensor_dict) + + return tensor_dict_list diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/batcher_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/batcher_test.py new file mode 100644 index 0000000000000000000000000000000000000000..61b4390b4cdcff146b721872ee98f9a48c6f67f0 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/batcher_test.py @@ -0,0 +1,158 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for object_detection.core.batcher.""" + +import numpy as np +import tensorflow as tf + +from object_detection.core import batcher + +slim = tf.contrib.slim + + +class BatcherTest(tf.test.TestCase): + + def test_batch_and_unpad_2d_tensors_of_different_sizes_in_1st_dimension(self): + with self.test_session() as sess: + batch_size = 3 + num_batches = 2 + examples = tf.Variable(tf.constant(2, dtype=tf.int32)) + counter = examples.count_up_to(num_batches * batch_size + 2) + boxes = tf.tile( + tf.reshape(tf.range(4), [1, 4]), tf.stack([counter, tf.constant(1)])) + batch_queue = batcher.BatchQueue( + tensor_dict={'boxes': boxes}, + batch_size=batch_size, + batch_queue_capacity=100, + num_batch_queue_threads=1, + prefetch_queue_capacity=100) + batch = batch_queue.dequeue() + + for tensor_dict in batch: + for tensor in tensor_dict.values(): + self.assertAllEqual([None, 4], tensor.get_shape().as_list()) + + tf.initialize_all_variables().run() + with slim.queues.QueueRunners(sess): + i = 2 + for _ in range(num_batches): + batch_np = sess.run(batch) + for tensor_dict in batch_np: + for tensor in tensor_dict.values(): + self.assertAllEqual(tensor, np.tile(np.arange(4), (i, 1))) + i += 1 + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batch) + + def test_batch_and_unpad_2d_tensors_of_different_sizes_in_all_dimensions( + self): + with self.test_session() as sess: + batch_size = 3 + num_batches = 2 + examples = tf.Variable(tf.constant(2, dtype=tf.int32)) + counter = examples.count_up_to(num_batches * batch_size + 2) + image = tf.reshape( + tf.range(counter * counter), tf.stack([counter, counter])) + batch_queue = batcher.BatchQueue( + tensor_dict={'image': image}, + batch_size=batch_size, + batch_queue_capacity=100, + num_batch_queue_threads=1, + prefetch_queue_capacity=100) + batch = batch_queue.dequeue() + + for tensor_dict in batch: + for tensor in tensor_dict.values(): + self.assertAllEqual([None, None], tensor.get_shape().as_list()) + + tf.initialize_all_variables().run() + with slim.queues.QueueRunners(sess): + i = 2 + for _ in range(num_batches): + batch_np = sess.run(batch) + for tensor_dict in batch_np: + for tensor in tensor_dict.values(): + self.assertAllEqual(tensor, np.arange(i * i).reshape((i, i))) + i += 1 + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batch) + + def test_batch_and_unpad_2d_tensors_of_same_size_in_all_dimensions(self): + with self.test_session() as sess: + batch_size = 3 + num_batches = 2 + examples = tf.Variable(tf.constant(1, dtype=tf.int32)) + counter = examples.count_up_to(num_batches * batch_size + 1) + image = tf.reshape(tf.range(1, 13), [4, 3]) * counter + batch_queue = batcher.BatchQueue( + tensor_dict={'image': image}, + batch_size=batch_size, + batch_queue_capacity=100, + num_batch_queue_threads=1, + prefetch_queue_capacity=100) + batch = batch_queue.dequeue() + + for tensor_dict in batch: + for tensor in tensor_dict.values(): + self.assertAllEqual([4, 3], tensor.get_shape().as_list()) + + tf.initialize_all_variables().run() + with slim.queues.QueueRunners(sess): + i = 1 + for _ in range(num_batches): + batch_np = sess.run(batch) + for tensor_dict in batch_np: + for tensor in tensor_dict.values(): + self.assertAllEqual(tensor, np.arange(1, 13).reshape((4, 3)) * i) + i += 1 + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batch) + + def test_batcher_when_batch_size_is_one(self): + with self.test_session() as sess: + 
batch_size = 1 + num_batches = 2 + examples = tf.Variable(tf.constant(2, dtype=tf.int32)) + counter = examples.count_up_to(num_batches * batch_size + 2) + image = tf.reshape( + tf.range(counter * counter), tf.stack([counter, counter])) + batch_queue = batcher.BatchQueue( + tensor_dict={'image': image}, + batch_size=batch_size, + batch_queue_capacity=100, + num_batch_queue_threads=1, + prefetch_queue_capacity=100) + batch = batch_queue.dequeue() + + for tensor_dict in batch: + for tensor in tensor_dict.values(): + self.assertAllEqual([None, None], tensor.get_shape().as_list()) + + tf.initialize_all_variables().run() + with slim.queues.QueueRunners(sess): + i = 2 + for _ in range(num_batches): + batch_np = sess.run(batch) + for tensor_dict in batch_np: + for tensor in tensor_dict.values(): + self.assertAllEqual(tensor, np.arange(i * i).reshape((i, i))) + i += 1 + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batch) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_coder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..f20ac956dfbce1fa69d1b9e6f5b023b704e1ec8a --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_coder.py @@ -0,0 +1,151 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Base box coder. + +Box coders convert between coordinate frames, namely image-centric +(with (0,0) on the top left of image) and anchor-centric (with (0,0) being +defined by a specific anchor). + +Users of a BoxCoder can call two methods: + encode: which encodes a box with respect to a given anchor + (or rather, a tensor of boxes wrt a corresponding tensor of anchors) and + decode: which inverts this encoding with a decode operation. +In both cases, the arguments are assumed to be in 1-1 correspondence already; +it is not the job of a BoxCoder to perform matching. +""" +from abc import ABCMeta +from abc import abstractmethod +from abc import abstractproperty + +import tensorflow as tf + + +# Box coder types. +FASTER_RCNN = 'faster_rcnn' +KEYPOINT = 'keypoint' +MEAN_STDDEV = 'mean_stddev' +SQUARE = 'square' + + +class BoxCoder(object): + """Abstract base class for box coder.""" + __metaclass__ = ABCMeta + + @abstractproperty + def code_size(self): + """Return the size of each code. + + This number is a constant and should agree with the output of the `encode` + op (e.g. if rel_codes is the output of self.encode(...), then it should have + shape [N, code_size()]). This abstractproperty should be overridden by + implementations. + + Returns: + an integer constant + """ + pass + + def encode(self, boxes, anchors): + """Encode a box list relative to an anchor collection. 
+ + Args: + boxes: BoxList holding N boxes to be encoded + anchors: BoxList of N anchors + + Returns: + a tensor representing N relative-encoded boxes + """ + with tf.name_scope('Encode'): + return self._encode(boxes, anchors) + + def decode(self, rel_codes, anchors): + """Decode boxes that are encoded relative to an anchor collection. + + Args: + rel_codes: a tensor representing N relative-encoded boxes + anchors: BoxList of anchors + + Returns: + boxlist: BoxList holding N boxes encoded in the ordinary way (i.e., + with corners y_min, x_min, y_max, x_max) + """ + with tf.name_scope('Decode'): + return self._decode(rel_codes, anchors) + + @abstractmethod + def _encode(self, boxes, anchors): + """Method to be overriden by implementations. + + Args: + boxes: BoxList holding N boxes to be encoded + anchors: BoxList of N anchors + + Returns: + a tensor representing N relative-encoded boxes + """ + pass + + @abstractmethod + def _decode(self, rel_codes, anchors): + """Method to be overriden by implementations. + + Args: + rel_codes: a tensor representing N relative-encoded boxes + anchors: BoxList of anchors + + Returns: + boxlist: BoxList holding N boxes encoded in the ordinary way (i.e., + with corners y_min, x_min, y_max, x_max) + """ + pass + + +def batch_decode(encoded_boxes, box_coder, anchors): + """Decode a batch of encoded boxes. + + This op takes a batch of encoded bounding boxes and transforms + them to a batch of bounding boxes specified by their corners in + the order of [y_min, x_min, y_max, x_max]. + + Args: + encoded_boxes: a float32 tensor of shape [batch_size, num_anchors, + code_size] representing the location of the objects. + box_coder: a BoxCoder object. + anchors: a BoxList of anchors used to encode `encoded_boxes`. + + Returns: + decoded_boxes: a float32 tensor of shape [batch_size, num_anchors, + coder_size] representing the corners of the objects in the order + of [y_min, x_min, y_max, x_max]. + + Raises: + ValueError: if batch sizes of the inputs are inconsistent, or if + the number of anchors inferred from encoded_boxes and anchors are + inconsistent. + """ + encoded_boxes.get_shape().assert_has_rank(3) + if encoded_boxes.get_shape()[1].value != anchors.num_boxes_static(): + raise ValueError('The number of anchors inferred from encoded_boxes' + ' and anchors are inconsistent: shape[1] of encoded_boxes' + ' %s should be equal to the number of anchors: %s.' % + (encoded_boxes.get_shape()[1].value, + anchors.num_boxes_static())) + + decoded_boxes = tf.stack([ + box_coder.decode(boxes, anchors).get() + for boxes in tf.unstack(encoded_boxes) + ]) + return decoded_boxes diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_coder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_coder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c087a325275f84604a114d064e050147001d32d0 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_coder_test.py @@ -0,0 +1,61 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.core.box_coder.""" + +import tensorflow as tf + +from object_detection.core import box_coder +from object_detection.core import box_list + + +class MockBoxCoder(box_coder.BoxCoder): + """Test BoxCoder that encodes/decodes using the multiply-by-two function.""" + + def code_size(self): + return 4 + + def _encode(self, boxes, anchors): + return 2.0 * boxes.get() + + def _decode(self, rel_codes, anchors): + return box_list.BoxList(rel_codes / 2.0) + + +class BoxCoderTest(tf.test.TestCase): + + def test_batch_decode(self): + mock_anchor_corners = tf.constant( + [[0, 0.1, 0.2, 0.3], [0.2, 0.4, 0.4, 0.6]], tf.float32) + mock_anchors = box_list.BoxList(mock_anchor_corners) + mock_box_coder = MockBoxCoder() + + expected_boxes = [[[0.0, 0.1, 0.5, 0.6], [0.5, 0.6, 0.7, 0.8]], + [[0.1, 0.2, 0.3, 0.4], [0.7, 0.8, 0.9, 1.0]]] + + encoded_boxes_list = [mock_box_coder.encode( + box_list.BoxList(tf.constant(boxes)), mock_anchors) + for boxes in expected_boxes] + encoded_boxes = tf.stack(encoded_boxes_list) + decoded_boxes = box_coder.batch_decode( + encoded_boxes, mock_box_coder, mock_anchors) + + with self.test_session() as sess: + decoded_boxes_result = sess.run(decoded_boxes) + self.assertAllClose(expected_boxes, decoded_boxes_result) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_list.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_list.py new file mode 100644 index 0000000000000000000000000000000000000000..c0196f053030b103a6021ac159f6203f77ba1eed --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_list.py @@ -0,0 +1,207 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Bounding Box List definition. + +BoxList represents a list of bounding boxes as tensorflow +tensors, where each bounding box is represented as a row of 4 numbers, +[y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes +within a given list correspond to a single image. See also +box_list_ops.py for common box related operations (such as area, iou, etc). + +Optionally, users can add additional related fields (such as weights). 
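+For example, a small BoxList with a per-box 'scores' field might be built as
+follows (a sketch; the tensor values here are purely illustrative):
+
+  boxes = tf.constant([[0.1, 0.1, 0.4, 0.5],
+                       [0.2, 0.3, 0.7, 0.9]])  # [N, 4], float32 corners
+  boxlist = BoxList(boxes)
+  boxlist.add_field('scores', tf.constant([0.9, 0.75]))  # aligned with boxes
+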
+We assume the following things to be true about fields: +* they correspond to boxes in the box_list along the 0th dimension +* they have inferrable rank at graph construction time +* all dimensions except for possibly the 0th can be inferred + (i.e., not None) at graph construction time. + +Some other notes: + * Following tensorflow conventions, we use height, width ordering, + and correspondingly, y,x (or ymin, xmin, ymax, xmax) ordering + * Tensors are always provided as (flat) [N, 4] tensors. +""" + +import tensorflow as tf + + +class BoxList(object): + """Box collection.""" + + def __init__(self, boxes): + """Constructs box collection. + + Args: + boxes: a tensor of shape [N, 4] representing box corners + + Raises: + ValueError: if invalid dimensions for bbox data or if bbox data is not in + float32 format. + """ + if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4: + raise ValueError('Invalid dimensions for box data.') + if boxes.dtype != tf.float32: + raise ValueError('Invalid tensor type: should be tf.float32') + self.data = {'boxes': boxes} + + def num_boxes(self): + """Returns number of boxes held in collection. + + Returns: + a tensor representing the number of boxes held in the collection. + """ + return tf.shape(self.data['boxes'])[0] + + def num_boxes_static(self): + """Returns number of boxes held in collection. + + This number is inferred at graph construction time rather than run-time. + + Returns: + Number of boxes held in collection (integer) or None if this is not + inferrable at graph construction time. + """ + return self.data['boxes'].get_shape()[0].value + + def get_all_fields(self): + """Returns all fields.""" + return self.data.keys() + + def get_extra_fields(self): + """Returns all non-box fields (i.e., everything not named 'boxes').""" + return [k for k in self.data.keys() if k != 'boxes'] + + def add_field(self, field, field_data): + """Add field to box list. + + This method can be used to add related box data such as + weights/labels, etc. + + Args: + field: a string key to access the data via `get` + field_data: a tensor containing the data to store in the BoxList + """ + self.data[field] = field_data + + def has_field(self, field): + return field in self.data + + def get(self): + """Convenience function for accessing box coordinates. + + Returns: + a tensor with shape [N, 4] representing box coordinates. + """ + return self.get_field('boxes') + + def set(self, boxes): + """Convenience function for setting box coordinates. + + Args: + boxes: a tensor of shape [N, 4] representing box corners + + Raises: + ValueError: if invalid dimensions for bbox data + """ + if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4: + raise ValueError('Invalid dimensions for box data.') + self.data['boxes'] = boxes + + def get_field(self, field): + """Accesses a box collection and associated fields. + + This function returns specified field with object; if no field is specified, + it returns the box coordinates. + + Args: + field: this optional string parameter can be used to specify + a related field to be accessed. + + Returns: + a tensor representing the box collection or an associated field. + + Raises: + ValueError: if invalid field + """ + if not self.has_field(field): + raise ValueError('field ' + str(field) + ' does not exist') + return self.data[field] + + def set_field(self, field, value): + """Sets the value of a field. + + Updates the field of a box_list with a given value. + + Args: + field: (string) name of the field to set value. 
+ value: the value to assign to the field. + + Raises: + ValueError: if the box_list does not have specified field. + """ + if not self.has_field(field): + raise ValueError('field %s does not exist' % field) + self.data[field] = value + + def get_center_coordinates_and_sizes(self, scope=None): + """Computes the center coordinates, height and width of the boxes. + + Args: + scope: name scope of the function. + + Returns: + a list of 4 1-D tensors [ycenter, xcenter, height, width]. + """ + with tf.name_scope(scope, 'get_center_coordinates_and_sizes'): + box_corners = self.get() + ymin, xmin, ymax, xmax = tf.unstack(tf.transpose(box_corners)) + width = xmax - xmin + height = ymax - ymin + ycenter = ymin + height / 2. + xcenter = xmin + width / 2. + return [ycenter, xcenter, height, width] + + def transpose_coordinates(self, scope=None): + """Transpose the coordinate representation in a boxlist. + + Args: + scope: name scope of the function. + """ + with tf.name_scope(scope, 'transpose_coordinates'): + y_min, x_min, y_max, x_max = tf.split( + value=self.get(), num_or_size_splits=4, axis=1) + self.set(tf.concat([x_min, y_min, x_max, y_max], 1)) + + def as_tensor_dict(self, fields=None): + """Retrieves specified fields as a dictionary of tensors. + + Args: + fields: (optional) list of fields to return in the dictionary. + If None (default), all fields are returned. + + Returns: + tensor_dict: A dictionary of tensors specified by fields. + + Raises: + ValueError: if specified field is not contained in boxlist. + """ + tensor_dict = {} + if fields is None: + fields = self.get_all_fields() + for field in fields: + if not self.has_field(field): + raise ValueError('boxlist must contain all specified fields') + tensor_dict[field] = self.get_field(field) + return tensor_dict diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_list_ops.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_list_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..a755ef68ec81d6802f0ea6e8d1e3f613aff8fdef --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_list_ops.py @@ -0,0 +1,1061 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Bounding Box List operations. + +Example box operations that are supported: + * areas: compute bounding box areas + * iou: pairwise intersection-over-union scores + * sq_dist: pairwise distances between bounding boxes + +Whenever box_list_ops functions output a BoxList, the fields of the incoming +BoxList are retained unless documented otherwise. +""" +import tensorflow as tf + +from object_detection.core import box_list +from object_detection.utils import shape_utils + + +class SortOrder(object): + """Enum class for sort order. + + Attributes: + ascend: ascend order. + descend: descend order. 
+ """ + ascend = 1 + descend = 2 + + +def area(boxlist, scope=None): + """Computes area of boxes. + + Args: + boxlist: BoxList holding N boxes + scope: name scope. + + Returns: + a tensor with shape [N] representing box areas. + """ + with tf.name_scope(scope, 'Area'): + y_min, x_min, y_max, x_max = tf.split( + value=boxlist.get(), num_or_size_splits=4, axis=1) + return tf.squeeze((y_max - y_min) * (x_max - x_min), [1]) + + +def height_width(boxlist, scope=None): + """Computes height and width of boxes in boxlist. + + Args: + boxlist: BoxList holding N boxes + scope: name scope. + + Returns: + Height: A tensor with shape [N] representing box heights. + Width: A tensor with shape [N] representing box widths. + """ + with tf.name_scope(scope, 'HeightWidth'): + y_min, x_min, y_max, x_max = tf.split( + value=boxlist.get(), num_or_size_splits=4, axis=1) + return tf.squeeze(y_max - y_min, [1]), tf.squeeze(x_max - x_min, [1]) + + +def scale(boxlist, y_scale, x_scale, scope=None): + """scale box coordinates in x and y dimensions. + + Args: + boxlist: BoxList holding N boxes + y_scale: (float) scalar tensor + x_scale: (float) scalar tensor + scope: name scope. + + Returns: + boxlist: BoxList holding N boxes + """ + with tf.name_scope(scope, 'Scale'): + y_scale = tf.cast(y_scale, tf.float32) + x_scale = tf.cast(x_scale, tf.float32) + y_min, x_min, y_max, x_max = tf.split( + value=boxlist.get(), num_or_size_splits=4, axis=1) + y_min = y_scale * y_min + y_max = y_scale * y_max + x_min = x_scale * x_min + x_max = x_scale * x_max + scaled_boxlist = box_list.BoxList( + tf.concat([y_min, x_min, y_max, x_max], 1)) + return _copy_extra_fields(scaled_boxlist, boxlist) + + +def clip_to_window(boxlist, window, filter_nonoverlapping=True, scope=None): + """Clip bounding boxes to a window. + + This op clips any input bounding boxes (represented by bounding box + corners) to a window, optionally filtering out boxes that do not + overlap at all with the window. + + Args: + boxlist: BoxList holding M_in boxes + window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max] + window to which the op should clip boxes. + filter_nonoverlapping: whether to filter out boxes that do not overlap at + all with the window. + scope: name scope. + + Returns: + a BoxList holding M_out boxes where M_out <= M_in + """ + with tf.name_scope(scope, 'ClipToWindow'): + y_min, x_min, y_max, x_max = tf.split( + value=boxlist.get(), num_or_size_splits=4, axis=1) + win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window) + y_min_clipped = tf.maximum(tf.minimum(y_min, win_y_max), win_y_min) + y_max_clipped = tf.maximum(tf.minimum(y_max, win_y_max), win_y_min) + x_min_clipped = tf.maximum(tf.minimum(x_min, win_x_max), win_x_min) + x_max_clipped = tf.maximum(tf.minimum(x_max, win_x_max), win_x_min) + clipped = box_list.BoxList( + tf.concat([y_min_clipped, x_min_clipped, y_max_clipped, x_max_clipped], + 1)) + clipped = _copy_extra_fields(clipped, boxlist) + if filter_nonoverlapping: + areas = area(clipped) + nonzero_area_indices = tf.cast( + tf.reshape(tf.where(tf.greater(areas, 0.0)), [-1]), tf.int32) + clipped = gather(clipped, nonzero_area_indices) + return clipped + + +def prune_outside_window(boxlist, window, scope=None): + """Prunes bounding boxes that fall outside a given window. + + This function prunes bounding boxes that even partially fall outside the given + window. 
See also clip_to_window which only prunes bounding boxes that fall + completely outside the window, and clips any bounding boxes that partially + overflow. + + Args: + boxlist: a BoxList holding M_in boxes. + window: a float tensor of shape [4] representing [ymin, xmin, ymax, xmax] + of the window + scope: name scope. + + Returns: + pruned_corners: a tensor with shape [M_out, 4] where M_out <= M_in + valid_indices: a tensor with shape [M_out] indexing the valid bounding boxes + in the input tensor. + """ + with tf.name_scope(scope, 'PruneOutsideWindow'): + y_min, x_min, y_max, x_max = tf.split( + value=boxlist.get(), num_or_size_splits=4, axis=1) + win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window) + coordinate_violations = tf.concat([ + tf.less(y_min, win_y_min), tf.less(x_min, win_x_min), + tf.greater(y_max, win_y_max), tf.greater(x_max, win_x_max) + ], 1) + valid_indices = tf.reshape( + tf.where(tf.logical_not(tf.reduce_any(coordinate_violations, 1))), [-1]) + return gather(boxlist, valid_indices), valid_indices + + +def prune_completely_outside_window(boxlist, window, scope=None): + """Prunes bounding boxes that fall completely outside of the given window. + + The function clip_to_window prunes bounding boxes that fall + completely outside the window, but also clips any bounding boxes that + partially overflow. This function does not clip partially overflowing boxes. + + Args: + boxlist: a BoxList holding M_in boxes. + window: a float tensor of shape [4] representing [ymin, xmin, ymax, xmax] + of the window + scope: name scope. + + Returns: + pruned_boxlist: a new BoxList with all bounding boxes partially or fully in + the window. + valid_indices: a tensor with shape [M_out] indexing the valid bounding boxes + in the input tensor. + """ + with tf.name_scope(scope, 'PruneCompleteleyOutsideWindow'): + y_min, x_min, y_max, x_max = tf.split( + value=boxlist.get(), num_or_size_splits=4, axis=1) + win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window) + coordinate_violations = tf.concat([ + tf.greater_equal(y_min, win_y_max), tf.greater_equal(x_min, win_x_max), + tf.less_equal(y_max, win_y_min), tf.less_equal(x_max, win_x_min) + ], 1) + valid_indices = tf.reshape( + tf.where(tf.logical_not(tf.reduce_any(coordinate_violations, 1))), [-1]) + return gather(boxlist, valid_indices), valid_indices + + +def intersection(boxlist1, boxlist2, scope=None): + """Compute pairwise intersection areas between boxes. + + Args: + boxlist1: BoxList holding N boxes + boxlist2: BoxList holding M boxes + scope: name scope. + + Returns: + a tensor with shape [N, M] representing pairwise intersections + """ + with tf.name_scope(scope, 'Intersection'): + y_min1, x_min1, y_max1, x_max1 = tf.split( + value=boxlist1.get(), num_or_size_splits=4, axis=1) + y_min2, x_min2, y_max2, x_max2 = tf.split( + value=boxlist2.get(), num_or_size_splits=4, axis=1) + all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2)) + all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2)) + intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin) + all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2)) + all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2)) + intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin) + return intersect_heights * intersect_widths + + +def matched_intersection(boxlist1, boxlist2, scope=None): + """Compute intersection areas between corresponding boxes in two boxlists. 
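+
+  Unlike `intersection`, which scores every pair of boxes across the two
+  boxlists, this op assumes the boxlists are aligned element-wise: both must
+  hold N boxes, and box i of boxlist1 is only compared with box i of
+  boxlist2.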
+ + Args: + boxlist1: BoxList holding N boxes + boxlist2: BoxList holding N boxes + scope: name scope. + + Returns: + a tensor with shape [N] representing pairwise intersections + """ + with tf.name_scope(scope, 'MatchedIntersection'): + y_min1, x_min1, y_max1, x_max1 = tf.split( + value=boxlist1.get(), num_or_size_splits=4, axis=1) + y_min2, x_min2, y_max2, x_max2 = tf.split( + value=boxlist2.get(), num_or_size_splits=4, axis=1) + min_ymax = tf.minimum(y_max1, y_max2) + max_ymin = tf.maximum(y_min1, y_min2) + intersect_heights = tf.maximum(0.0, min_ymax - max_ymin) + min_xmax = tf.minimum(x_max1, x_max2) + max_xmin = tf.maximum(x_min1, x_min2) + intersect_widths = tf.maximum(0.0, min_xmax - max_xmin) + return tf.reshape(intersect_heights * intersect_widths, [-1]) + + +def iou(boxlist1, boxlist2, scope=None): + """Computes pairwise intersection-over-union between box collections. + + Args: + boxlist1: BoxList holding N boxes + boxlist2: BoxList holding M boxes + scope: name scope. + + Returns: + a tensor with shape [N, M] representing pairwise iou scores. + """ + with tf.name_scope(scope, 'IOU'): + intersections = intersection(boxlist1, boxlist2) + areas1 = area(boxlist1) + areas2 = area(boxlist2) + unions = ( + tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections) + return tf.where( + tf.equal(intersections, 0.0), + tf.zeros_like(intersections), tf.truediv(intersections, unions)) + + +def matched_iou(boxlist1, boxlist2, scope=None): + """Compute intersection-over-union between corresponding boxes in boxlists. + + Args: + boxlist1: BoxList holding N boxes + boxlist2: BoxList holding N boxes + scope: name scope. + + Returns: + a tensor with shape [N] representing pairwise iou scores. + """ + with tf.name_scope(scope, 'MatchedIOU'): + intersections = matched_intersection(boxlist1, boxlist2) + areas1 = area(boxlist1) + areas2 = area(boxlist2) + unions = areas1 + areas2 - intersections + return tf.where( + tf.equal(intersections, 0.0), + tf.zeros_like(intersections), tf.truediv(intersections, unions)) + + +def ioa(boxlist1, boxlist2, scope=None): + """Computes pairwise intersection-over-area between box collections. + + intersection-over-area (IOA) between two boxes box1 and box2 is defined as + their intersection area over box2's area. Note that ioa is not symmetric, + that is, ioa(box1, box2) != ioa(box2, box1). + + Args: + boxlist1: BoxList holding N boxes + boxlist2: BoxList holding M boxes + scope: name scope. + + Returns: + a tensor with shape [N, M] representing pairwise ioa scores. + """ + with tf.name_scope(scope, 'IOA'): + intersections = intersection(boxlist1, boxlist2) + areas = tf.expand_dims(area(boxlist2), 0) + return tf.truediv(intersections, areas) + + +def prune_non_overlapping_boxes( + boxlist1, boxlist2, min_overlap=0.0, scope=None): + """Prunes the boxes in boxlist1 that overlap less than thresh with boxlist2. + + For each box in boxlist1, we want its IOA to be more than minoverlap with + at least one of the boxes in boxlist2. If it does not, we remove it. + + Args: + boxlist1: BoxList holding N boxes. + boxlist2: BoxList holding M boxes. + min_overlap: Minimum required overlap between boxes, to count them as + overlapping. + scope: name scope. + + Returns: + new_boxlist1: A pruned boxlist with size [N', 4]. + keep_inds: A tensor with shape [N'] indexing kept bounding boxes in the + first input BoxList `boxlist1`. 
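+
+  A minimal usage sketch (the boxes, window and threshold below are
+  illustrative only, not part of the original code):
+
+    boxes = box_list.BoxList(tf.constant([[0., 0., 1., 1.],
+                                          [0., 0., .1, .1]]))
+    windows = box_list.BoxList(tf.constant([[0., 0., .5, .5]]))
+    pruned, kept = prune_non_overlapping_boxes(boxes, windows,
+                                               min_overlap=0.5)
+    # `pruned` keeps the boxes from `boxes` whose IOA with at least one box
+    # in `windows` is >= 0.5; `kept` holds their indices into `boxes`.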
+ """ + with tf.name_scope(scope, 'PruneNonOverlappingBoxes'): + ioa_ = ioa(boxlist2, boxlist1) # [M, N] tensor + ioa_ = tf.reduce_max(ioa_, reduction_indices=[0]) # [N] tensor + keep_bool = tf.greater_equal(ioa_, tf.constant(min_overlap)) + keep_inds = tf.squeeze(tf.where(keep_bool), squeeze_dims=[1]) + new_boxlist1 = gather(boxlist1, keep_inds) + return new_boxlist1, keep_inds + + +def prune_small_boxes(boxlist, min_side, scope=None): + """Prunes small boxes in the boxlist which have a side smaller than min_side. + + Args: + boxlist: BoxList holding N boxes. + min_side: Minimum width AND height of box to survive pruning. + scope: name scope. + + Returns: + A pruned boxlist. + """ + with tf.name_scope(scope, 'PruneSmallBoxes'): + height, width = height_width(boxlist) + is_valid = tf.logical_and(tf.greater_equal(width, min_side), + tf.greater_equal(height, min_side)) + return gather(boxlist, tf.reshape(tf.where(is_valid), [-1])) + + +def change_coordinate_frame(boxlist, window, scope=None): + """Change coordinate frame of the boxlist to be relative to window's frame. + + Given a window of the form [ymin, xmin, ymax, xmax], + changes bounding box coordinates from boxlist to be relative to this window + (e.g., the min corner maps to (0,0) and the max corner maps to (1,1)). + + An example use case is data augmentation: where we are given groundtruth + boxes (boxlist) and would like to randomly crop the image to some + window (window). In this case we need to change the coordinate frame of + each groundtruth box to be relative to this new window. + + Args: + boxlist: A BoxList object holding N boxes. + window: A rank 1 tensor [4]. + scope: name scope. + + Returns: + Returns a BoxList object with N boxes. + """ + with tf.name_scope(scope, 'ChangeCoordinateFrame'): + win_height = window[2] - window[0] + win_width = window[3] - window[1] + boxlist_new = scale(box_list.BoxList( + boxlist.get() - [window[0], window[1], window[0], window[1]]), + 1.0 / win_height, 1.0 / win_width) + boxlist_new = _copy_extra_fields(boxlist_new, boxlist) + return boxlist_new + + +def sq_dist(boxlist1, boxlist2, scope=None): + """Computes the pairwise squared distances between box corners. + + This op treats each box as if it were a point in a 4d Euclidean space and + computes pairwise squared distances. + + Mathematically, we are given two matrices of box coordinates X and Y, + where X(i,:) is the i'th row of X, containing the 4 numbers defining the + corners of the i'th box in boxlist1. Similarly Y(j,:) corresponds to + boxlist2. We compute + Z(i,j) = ||X(i,:) - Y(j,:)||^2 + = ||X(i,:)||^2 + ||Y(j,:)||^2 - 2 X(i,:)' * Y(j,:), + + Args: + boxlist1: BoxList holding N boxes + boxlist2: BoxList holding M boxes + scope: name scope. + + Returns: + a tensor with shape [N, M] representing pairwise distances + """ + with tf.name_scope(scope, 'SqDist'): + sqnorm1 = tf.reduce_sum(tf.square(boxlist1.get()), 1, keep_dims=True) + sqnorm2 = tf.reduce_sum(tf.square(boxlist2.get()), 1, keep_dims=True) + innerprod = tf.matmul(boxlist1.get(), boxlist2.get(), + transpose_a=False, transpose_b=True) + return sqnorm1 + tf.transpose(sqnorm2) - 2.0 * innerprod + + +def boolean_mask(boxlist, indicator, fields=None, scope=None): + """Select boxes from BoxList according to indicator and return new BoxList. + + `boolean_mask` returns the subset of boxes that are marked as "True" by the + indicator tensor. 
By default, `boolean_mask` returns boxes corresponding to + the input index list, as well as all additional fields stored in the boxlist + (indexing into the first dimension). However one can optionally only draw + from a subset of fields. + + Args: + boxlist: BoxList holding N boxes + indicator: a rank-1 boolean tensor + fields: (optional) list of fields to also gather from. If None (default), + all fields are gathered from. Pass an empty fields list to only gather + the box coordinates. + scope: name scope. + + Returns: + subboxlist: a BoxList corresponding to the subset of the input BoxList + specified by indicator + Raises: + ValueError: if `indicator` is not a rank-1 boolean tensor. + """ + with tf.name_scope(scope, 'BooleanMask'): + if indicator.shape.ndims != 1: + raise ValueError('indicator should have rank 1') + if indicator.dtype != tf.bool: + raise ValueError('indicator should be a boolean tensor') + subboxlist = box_list.BoxList(tf.boolean_mask(boxlist.get(), indicator)) + if fields is None: + fields = boxlist.get_extra_fields() + for field in fields: + if not boxlist.has_field(field): + raise ValueError('boxlist must contain all specified fields') + subfieldlist = tf.boolean_mask(boxlist.get_field(field), indicator) + subboxlist.add_field(field, subfieldlist) + return subboxlist + + +def gather(boxlist, indices, fields=None, scope=None): + """Gather boxes from BoxList according to indices and return new BoxList. + + By default, `gather` returns boxes corresponding to the input index list, as + well as all additional fields stored in the boxlist (indexing into the + first dimension). However one can optionally only gather from a + subset of fields. + + Args: + boxlist: BoxList holding N boxes + indices: a rank-1 tensor of type int32 / int64 + fields: (optional) list of fields to also gather from. If None (default), + all fields are gathered from. Pass an empty fields list to only gather + the box coordinates. + scope: name scope. + + Returns: + subboxlist: a BoxList corresponding to the subset of the input BoxList + specified by indices + Raises: + ValueError: if specified field is not contained in boxlist or if the + indices are not of type int32 + """ + with tf.name_scope(scope, 'Gather'): + if len(indices.shape.as_list()) != 1: + raise ValueError('indices should have rank 1') + if indices.dtype != tf.int32 and indices.dtype != tf.int64: + raise ValueError('indices should be an int32 / int64 tensor') + subboxlist = box_list.BoxList(tf.gather(boxlist.get(), indices)) + if fields is None: + fields = boxlist.get_extra_fields() + for field in fields: + if not boxlist.has_field(field): + raise ValueError('boxlist must contain all specified fields') + subfieldlist = tf.gather(boxlist.get_field(field), indices) + subboxlist.add_field(field, subfieldlist) + return subboxlist + + +def concatenate(boxlists, fields=None, scope=None): + """Concatenate list of BoxLists. + + This op concatenates a list of input BoxLists into a larger BoxList. It also + handles concatenation of BoxList fields as long as the field tensor shapes + are equal except for the first dimension. + + Args: + boxlists: list of BoxList objects + fields: optional list of fields to also concatenate. By default, all + fields from the first BoxList in the list are included in the + concatenation. + scope: name scope. 
+ + Returns: + a BoxList with number of boxes equal to + sum([boxlist.num_boxes() for boxlist in BoxList]) + Raises: + ValueError: if boxlists is invalid (i.e., is not a list, is empty, or + contains non BoxList objects), or if requested fields are not contained in + all boxlists + """ + with tf.name_scope(scope, 'Concatenate'): + if not isinstance(boxlists, list): + raise ValueError('boxlists should be a list') + if not boxlists: + raise ValueError('boxlists should have nonzero length') + for boxlist in boxlists: + if not isinstance(boxlist, box_list.BoxList): + raise ValueError('all elements of boxlists should be BoxList objects') + concatenated = box_list.BoxList( + tf.concat([boxlist.get() for boxlist in boxlists], 0)) + if fields is None: + fields = boxlists[0].get_extra_fields() + for field in fields: + first_field_shape = boxlists[0].get_field(field).get_shape().as_list() + first_field_shape[0] = -1 + if None in first_field_shape: + raise ValueError('field %s must have fully defined shape except for the' + ' 0th dimension.' % field) + for boxlist in boxlists: + if not boxlist.has_field(field): + raise ValueError('boxlist must contain all requested fields') + field_shape = boxlist.get_field(field).get_shape().as_list() + field_shape[0] = -1 + if field_shape != first_field_shape: + raise ValueError('field %s must have same shape for all boxlists ' + 'except for the 0th dimension.' % field) + concatenated_field = tf.concat( + [boxlist.get_field(field) for boxlist in boxlists], 0) + concatenated.add_field(field, concatenated_field) + return concatenated + + +def sort_by_field(boxlist, field, order=SortOrder.descend, scope=None): + """Sort boxes and associated fields according to a scalar field. + + A common use case is reordering the boxes according to descending scores. + + Args: + boxlist: BoxList holding N boxes. + field: A BoxList field for sorting and reordering the BoxList. + order: (Optional) descend or ascend. Default is descend. + scope: name scope. + + Returns: + sorted_boxlist: A sorted BoxList with the field in the specified order. + + Raises: + ValueError: if specified field does not exist + ValueError: if the order is not either descend or ascend + """ + with tf.name_scope(scope, 'SortByField'): + if order != SortOrder.descend and order != SortOrder.ascend: + raise ValueError('Invalid sort order') + + field_to_sort = boxlist.get_field(field) + if len(field_to_sort.shape.as_list()) != 1: + raise ValueError('Field should have rank 1') + + num_boxes = boxlist.num_boxes() + num_entries = tf.size(field_to_sort) + length_assert = tf.Assert( + tf.equal(num_boxes, num_entries), + ['Incorrect field size: actual vs expected.', num_entries, num_boxes]) + + with tf.control_dependencies([length_assert]): + # TODO(derekjchow): Remove with tf.device when top_k operation runs + # correctly on GPU. + with tf.device('/cpu:0'): + _, sorted_indices = tf.nn.top_k(field_to_sort, num_boxes, sorted=True) + + if order == SortOrder.ascend: + sorted_indices = tf.reverse_v2(sorted_indices, [0]) + + return gather(boxlist, sorted_indices) + + +def visualize_boxes_in_image(image, boxlist, normalized=False, scope=None): + """Overlay bounding box list on image. + + Currently this visualization plots a 1 pixel thick red bounding box on top + of the image. Note that tf.image.draw_bounding_boxes essentially is + 1 indexed. 
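+
+  When `normalized` is False, the box corners are first rescaled by the image
+  height and width, since tf.image.draw_bounding_boxes expects normalized
+  coordinates.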
+ + Args: + image: an image tensor with shape [height, width, 3] + boxlist: a BoxList + normalized: (boolean) specify whether corners are to be interpreted + as absolute coordinates in image space or normalized with respect to the + image size. + scope: name scope. + + Returns: + image_and_boxes: an image tensor with shape [height, width, 3] + """ + with tf.name_scope(scope, 'VisualizeBoxesInImage'): + if not normalized: + height, width, _ = tf.unstack(tf.shape(image)) + boxlist = scale(boxlist, + 1.0 / tf.cast(height, tf.float32), + 1.0 / tf.cast(width, tf.float32)) + corners = tf.expand_dims(boxlist.get(), 0) + image = tf.expand_dims(image, 0) + return tf.squeeze(tf.image.draw_bounding_boxes(image, corners), [0]) + + +def filter_field_value_equals(boxlist, field, value, scope=None): + """Filter to keep only boxes with field entries equal to the given value. + + Args: + boxlist: BoxList holding N boxes. + field: field name for filtering. + value: scalar value. + scope: name scope. + + Returns: + a BoxList holding M boxes where M <= N + + Raises: + ValueError: if boxlist not a BoxList object or if it does not have + the specified field. + """ + with tf.name_scope(scope, 'FilterFieldValueEquals'): + if not isinstance(boxlist, box_list.BoxList): + raise ValueError('boxlist must be a BoxList') + if not boxlist.has_field(field): + raise ValueError('boxlist must contain the specified field') + filter_field = boxlist.get_field(field) + gather_index = tf.reshape(tf.where(tf.equal(filter_field, value)), [-1]) + return gather(boxlist, gather_index) + + +def filter_greater_than(boxlist, thresh, scope=None): + """Filter to keep only boxes with score exceeding a given threshold. + + This op keeps the collection of boxes whose corresponding scores are + greater than the input threshold. + + TODO(jonathanhuang): Change function name to filter_scores_greater_than + + Args: + boxlist: BoxList holding N boxes. Must contain a 'scores' field + representing detection scores. + thresh: scalar threshold + scope: name scope. + + Returns: + a BoxList holding M boxes where M <= N + + Raises: + ValueError: if boxlist not a BoxList object or if it does not + have a scores field + """ + with tf.name_scope(scope, 'FilterGreaterThan'): + if not isinstance(boxlist, box_list.BoxList): + raise ValueError('boxlist must be a BoxList') + if not boxlist.has_field('scores'): + raise ValueError('input boxlist must have \'scores\' field') + scores = boxlist.get_field('scores') + if len(scores.shape.as_list()) > 2: + raise ValueError('Scores should have rank 1 or 2') + if len(scores.shape.as_list()) == 2 and scores.shape.as_list()[1] != 1: + raise ValueError('Scores should have rank 1 or have shape ' + 'consistent with [None, 1]') + high_score_indices = tf.cast(tf.reshape( + tf.where(tf.greater(scores, thresh)), + [-1]), tf.int32) + return gather(boxlist, high_score_indices) + + +def non_max_suppression(boxlist, thresh, max_output_size, scope=None): + """Non maximum suppression. + + This op greedily selects a subset of detection bounding boxes, pruning + away boxes that have high IOU (intersection over union) overlap (> thresh) + with already selected boxes. Note that this only works for a single class --- + to apply NMS to multi-class predictions, use MultiClassNonMaxSuppression. + + Args: + boxlist: BoxList holding N boxes. Must contain a 'scores' field + representing detection scores. + thresh: scalar threshold + max_output_size: maximum number of retained boxes + scope: name scope. 
+ + Returns: + a BoxList holding M boxes where M <= max_output_size + Raises: + ValueError: if thresh is not in [0, 1] + """ + with tf.name_scope(scope, 'NonMaxSuppression'): + if not 0 <= thresh <= 1.0: + raise ValueError('thresh must be between 0 and 1') + if not isinstance(boxlist, box_list.BoxList): + raise ValueError('boxlist must be a BoxList') + if not boxlist.has_field('scores'): + raise ValueError('input boxlist must have \'scores\' field') + selected_indices = tf.image.non_max_suppression( + boxlist.get(), boxlist.get_field('scores'), + max_output_size, iou_threshold=thresh) + return gather(boxlist, selected_indices) + + +def _copy_extra_fields(boxlist_to_copy_to, boxlist_to_copy_from): + """Copies the extra fields of boxlist_to_copy_from to boxlist_to_copy_to. + + Args: + boxlist_to_copy_to: BoxList to which extra fields are copied. + boxlist_to_copy_from: BoxList from which fields are copied. + + Returns: + boxlist_to_copy_to with extra fields. + """ + for field in boxlist_to_copy_from.get_extra_fields(): + boxlist_to_copy_to.add_field(field, boxlist_to_copy_from.get_field(field)) + return boxlist_to_copy_to + + +def to_normalized_coordinates(boxlist, height, width, + check_range=True, scope=None): + """Converts absolute box coordinates to normalized coordinates in [0, 1]. + + Usually one uses the dynamic shape of the image or conv-layer tensor: + boxlist = box_list_ops.to_normalized_coordinates(boxlist, + tf.shape(images)[1], + tf.shape(images)[2]), + + This function raises an assertion failed error at graph execution time when + the maximum coordinate is smaller than 1.01 (which means that coordinates are + already normalized). The value 1.01 is to deal with small rounding errors. + + Args: + boxlist: BoxList with coordinates in terms of pixel-locations. + height: Maximum value for height of absolute box coordinates. + width: Maximum value for width of absolute box coordinates. + check_range: If True, checks if the coordinates are normalized or not. + scope: name scope. + + Returns: + boxlist with normalized coordinates in [0, 1]. + """ + with tf.name_scope(scope, 'ToNormalizedCoordinates'): + height = tf.cast(height, tf.float32) + width = tf.cast(width, tf.float32) + + if check_range: + max_val = tf.reduce_max(boxlist.get()) + max_assert = tf.Assert(tf.greater(max_val, 1.01), + ['max value is lower than 1.01: ', max_val]) + with tf.control_dependencies([max_assert]): + width = tf.identity(width) + + return scale(boxlist, 1 / height, 1 / width) + + +def to_absolute_coordinates(boxlist, + height, + width, + check_range=True, + maximum_normalized_coordinate=1.1, + scope=None): + """Converts normalized box coordinates to absolute pixel coordinates. + + This function raises an assertion failed error when the maximum box coordinate + value is larger than maximum_normalized_coordinate (in which case coordinates + are already absolute). + + Args: + boxlist: BoxList with coordinates in range [0, 1]. + height: Maximum value for height of absolute box coordinates. + width: Maximum value for width of absolute box coordinates. + check_range: If True, checks if the coordinates are normalized or not. + maximum_normalized_coordinate: Maximum coordinate value to be considered + as normalized, default to 1.1. + scope: name scope. + + Returns: + boxlist with absolute coordinates in terms of the image size. + + """ + with tf.name_scope(scope, 'ToAbsoluteCoordinates'): + height = tf.cast(height, tf.float32) + width = tf.cast(width, tf.float32) + + # Ensure range of input boxes is correct. 
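+    # If the largest coordinate already exceeds
+    # maximum_normalized_coordinate, the boxes are assumed to be absolute
+    # pixel coordinates and the assert below fails at graph execution time,
+    # rather than silently scaling them a second time.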
+ if check_range: + box_maximum = tf.reduce_max(boxlist.get()) + max_assert = tf.Assert( + tf.greater_equal(maximum_normalized_coordinate, box_maximum), + ['maximum box coordinate value is larger ' + 'than %f: ' % maximum_normalized_coordinate, box_maximum]) + with tf.control_dependencies([max_assert]): + width = tf.identity(width) + + return scale(boxlist, height, width) + + +def refine_boxes_multi_class(pool_boxes, + num_classes, + nms_iou_thresh, + nms_max_detections, + voting_iou_thresh=0.5): + """Refines a pool of boxes using non max suppression and box voting. + + Box refinement is done independently for each class. + + Args: + pool_boxes: (BoxList) A collection of boxes to be refined. pool_boxes must + have a rank 1 'scores' field and a rank 1 'classes' field. + num_classes: (int scalar) Number of classes. + nms_iou_thresh: (float scalar) iou threshold for non max suppression (NMS). + nms_max_detections: (int scalar) maximum output size for NMS. + voting_iou_thresh: (float scalar) iou threshold for box voting. + + Returns: + BoxList of refined boxes. + + Raises: + ValueError: if + a) nms_iou_thresh or voting_iou_thresh is not in [0, 1]. + b) pool_boxes is not a BoxList. + c) pool_boxes does not have a scores and classes field. + """ + if not 0.0 <= nms_iou_thresh <= 1.0: + raise ValueError('nms_iou_thresh must be between 0 and 1') + if not 0.0 <= voting_iou_thresh <= 1.0: + raise ValueError('voting_iou_thresh must be between 0 and 1') + if not isinstance(pool_boxes, box_list.BoxList): + raise ValueError('pool_boxes must be a BoxList') + if not pool_boxes.has_field('scores'): + raise ValueError('pool_boxes must have a \'scores\' field') + if not pool_boxes.has_field('classes'): + raise ValueError('pool_boxes must have a \'classes\' field') + + refined_boxes = [] + for i in range(num_classes): + boxes_class = filter_field_value_equals(pool_boxes, 'classes', i) + refined_boxes_class = refine_boxes(boxes_class, nms_iou_thresh, + nms_max_detections, voting_iou_thresh) + refined_boxes.append(refined_boxes_class) + return sort_by_field(concatenate(refined_boxes), 'scores') + + +def refine_boxes(pool_boxes, + nms_iou_thresh, + nms_max_detections, + voting_iou_thresh=0.5): + """Refines a pool of boxes using non max suppression and box voting. + + Args: + pool_boxes: (BoxList) A collection of boxes to be refined. pool_boxes must + have a rank 1 'scores' field. + nms_iou_thresh: (float scalar) iou threshold for non max suppression (NMS). + nms_max_detections: (int scalar) maximum output size for NMS. + voting_iou_thresh: (float scalar) iou threshold for box voting. + + Returns: + BoxList of refined boxes. + + Raises: + ValueError: if + a) nms_iou_thresh or voting_iou_thresh is not in [0, 1]. + b) pool_boxes is not a BoxList. + c) pool_boxes does not have a scores field. + """ + if not 0.0 <= nms_iou_thresh <= 1.0: + raise ValueError('nms_iou_thresh must be between 0 and 1') + if not 0.0 <= voting_iou_thresh <= 1.0: + raise ValueError('voting_iou_thresh must be between 0 and 1') + if not isinstance(pool_boxes, box_list.BoxList): + raise ValueError('pool_boxes must be a BoxList') + if not pool_boxes.has_field('scores'): + raise ValueError('pool_boxes must have a \'scores\' field') + + nms_boxes = non_max_suppression( + pool_boxes, nms_iou_thresh, nms_max_detections) + return box_voting(nms_boxes, pool_boxes, voting_iou_thresh) + + +def box_voting(selected_boxes, pool_boxes, iou_thresh=0.5): + """Performs box voting as described in S. Gidaris and N. Komodakis, ICCV 2015. 
+ + Performs box voting as described in 'Object detection via a multi-region & + semantic segmentation-aware CNN model', Gidaris and Komodakis, ICCV 2015. For + each box 'B' in selected_boxes, we find the set 'S' of boxes in pool_boxes + with iou overlap >= iou_thresh. The location of B is set to the weighted + average location of boxes in S (scores are used for weighting). And the score + of B is set to the average score of boxes in S. + + Args: + selected_boxes: BoxList containing a subset of boxes in pool_boxes. These + boxes are usually selected from pool_boxes using non max suppression. + pool_boxes: BoxList containing a set of (possibly redundant) boxes. + iou_thresh: (float scalar) iou threshold for matching boxes in + selected_boxes and pool_boxes. + + Returns: + BoxList containing averaged locations and scores for each box in + selected_boxes. + + Raises: + ValueError: if + a) selected_boxes or pool_boxes is not a BoxList. + b) if iou_thresh is not in [0, 1]. + c) pool_boxes does not have a scores field. + """ + if not 0.0 <= iou_thresh <= 1.0: + raise ValueError('iou_thresh must be between 0 and 1') + if not isinstance(selected_boxes, box_list.BoxList): + raise ValueError('selected_boxes must be a BoxList') + if not isinstance(pool_boxes, box_list.BoxList): + raise ValueError('pool_boxes must be a BoxList') + if not pool_boxes.has_field('scores'): + raise ValueError('pool_boxes must have a \'scores\' field') + + iou_ = iou(selected_boxes, pool_boxes) + match_indicator = tf.to_float(tf.greater(iou_, iou_thresh)) + num_matches = tf.reduce_sum(match_indicator, 1) + # TODO(kbanoop): Handle the case where some boxes in selected_boxes do not + # match to any boxes in pool_boxes. For such boxes without any matches, we + # should return the original boxes without voting. + match_assert = tf.Assert( + tf.reduce_all(tf.greater(num_matches, 0)), + ['Each box in selected_boxes must match with at least one box ' + 'in pool_boxes.']) + + scores = tf.expand_dims(pool_boxes.get_field('scores'), 1) + scores_assert = tf.Assert( + tf.reduce_all(tf.greater_equal(scores, 0)), + ['Scores must be non negative.']) + + with tf.control_dependencies([scores_assert, match_assert]): + sum_scores = tf.matmul(match_indicator, scores) + averaged_scores = tf.reshape(sum_scores, [-1]) / num_matches + + box_locations = tf.matmul(match_indicator, + pool_boxes.get() * scores) / sum_scores + averaged_boxes = box_list.BoxList(box_locations) + _copy_extra_fields(averaged_boxes, selected_boxes) + averaged_boxes.add_field('scores', averaged_scores) + return averaged_boxes + + +def pad_or_clip_box_list(boxlist, num_boxes, scope=None): + """Pads or clips all fields of a BoxList. + + Args: + boxlist: A BoxList with arbitrary of number of boxes. + num_boxes: First num_boxes in boxlist are kept. + The fields are zero-padded if num_boxes is bigger than the + actual number of boxes. + scope: name scope. + + Returns: + BoxList with all fields padded or clipped. + """ + with tf.name_scope(scope, 'PadOrClipBoxList'): + subboxlist = box_list.BoxList(shape_utils.pad_or_clip_tensor( + boxlist.get(), num_boxes)) + for field in boxlist.get_extra_fields(): + subfield = shape_utils.pad_or_clip_tensor( + boxlist.get_field(field), num_boxes) + subboxlist.add_field(field, subfield) + return subboxlist + + +def select_random_box(boxlist, + default_box=None, + seed=None, + scope=None): + """Selects a random bounding box from a `BoxList`. + + Args: + boxlist: A BoxList. + default_box: A [1, 4] float32 tensor. 
If no boxes are present in `boxlist`, + this default box will be returned. If None, will use a default box of + [[-1., -1., -1., -1.]]. + seed: Random seed. + scope: Name scope. + + Returns: + bbox: A [1, 4] tensor with a random bounding box. + valid: A bool tensor indicating whether a valid bounding box is returned + (True) or whether the default box is returned (False). + """ + with tf.name_scope(scope, 'SelectRandomBox'): + bboxes = boxlist.get() + combined_shape = shape_utils.combined_static_and_dynamic_shape(bboxes) + number_of_boxes = combined_shape[0] + default_box = default_box or tf.constant([[-1., -1., -1., -1.]]) + + def select_box(): + random_index = tf.random_uniform([], + maxval=number_of_boxes, + dtype=tf.int32, + seed=seed) + return tf.expand_dims(bboxes[random_index], axis=0), tf.constant(True) + + return tf.cond( + tf.greater_equal(number_of_boxes, 1), + true_fn=select_box, + false_fn=lambda: (default_box, tf.constant(False))) + + +def get_minimal_coverage_box(boxlist, + default_box=None, + scope=None): + """Creates a single bounding box which covers all boxes in the boxlist. + + Args: + boxlist: A Boxlist. + default_box: A [1, 4] float32 tensor. If no boxes are present in `boxlist`, + this default box will be returned. If None, will use a default box of + [[0., 0., 1., 1.]]. + scope: Name scope. + + Returns: + A [1, 4] float32 tensor with a bounding box that tightly covers all the + boxes in the box list. If the boxlist does not contain any boxes, the + default box is returned. + """ + with tf.name_scope(scope, 'CreateCoverageBox'): + num_boxes = boxlist.num_boxes() + + def coverage_box(bboxes): + y_min, x_min, y_max, x_max = tf.split( + value=bboxes, num_or_size_splits=4, axis=1) + y_min_coverage = tf.reduce_min(y_min, axis=0) + x_min_coverage = tf.reduce_min(x_min, axis=0) + y_max_coverage = tf.reduce_max(y_max, axis=0) + x_max_coverage = tf.reduce_max(x_max, axis=0) + return tf.stack( + [y_min_coverage, x_min_coverage, y_max_coverage, x_max_coverage], + axis=1) + + default_box = default_box or tf.constant([[0., 0., 1., 1.]]) + return tf.cond( + tf.greater_equal(num_boxes, 1), + true_fn=lambda: coverage_box(boxlist.get()), + false_fn=lambda: default_box) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_list_ops_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_list_ops_test.py new file mode 100644 index 0000000000000000000000000000000000000000..bb76cfd35af1a077debdf6945c13b04aaac37eca --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_list_ops_test.py @@ -0,0 +1,1036 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for object_detection.core.box_list_ops.""" +import numpy as np +import tensorflow as tf +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops + +from object_detection.core import box_list +from object_detection.core import box_list_ops + + +class BoxListOpsTest(tf.test.TestCase): + """Tests for common bounding box operations.""" + + def test_area(self): + corners = tf.constant([[0.0, 0.0, 10.0, 20.0], [1.0, 2.0, 3.0, 4.0]]) + exp_output = [200.0, 4.0] + boxes = box_list.BoxList(corners) + areas = box_list_ops.area(boxes) + with self.test_session() as sess: + areas_output = sess.run(areas) + self.assertAllClose(areas_output, exp_output) + + def test_height_width(self): + corners = tf.constant([[0.0, 0.0, 10.0, 20.0], [1.0, 2.0, 3.0, 4.0]]) + exp_output_heights = [10., 2.] + exp_output_widths = [20., 2.] + boxes = box_list.BoxList(corners) + heights, widths = box_list_ops.height_width(boxes) + with self.test_session() as sess: + output_heights, output_widths = sess.run([heights, widths]) + self.assertAllClose(output_heights, exp_output_heights) + self.assertAllClose(output_widths, exp_output_widths) + + def test_scale(self): + corners = tf.constant([[0, 0, 100, 200], [50, 120, 100, 140]], + dtype=tf.float32) + boxes = box_list.BoxList(corners) + boxes.add_field('extra_data', tf.constant([[1], [2]])) + + y_scale = tf.constant(1.0/100) + x_scale = tf.constant(1.0/200) + scaled_boxes = box_list_ops.scale(boxes, y_scale, x_scale) + exp_output = [[0, 0, 1, 1], [0.5, 0.6, 1.0, 0.7]] + with self.test_session() as sess: + scaled_corners_out = sess.run(scaled_boxes.get()) + self.assertAllClose(scaled_corners_out, exp_output) + extra_data_out = sess.run(scaled_boxes.get_field('extra_data')) + self.assertAllEqual(extra_data_out, [[1], [2]]) + + def test_clip_to_window_filter_boxes_which_fall_outside_the_window( + self): + window = tf.constant([0, 0, 9, 14], tf.float32) + corners = tf.constant([[5.0, 5.0, 6.0, 6.0], + [-1.0, -2.0, 4.0, 5.0], + [2.0, 3.0, 5.0, 9.0], + [0.0, 0.0, 9.0, 14.0], + [-100.0, -100.0, 300.0, 600.0], + [-10.0, -10.0, -9.0, -9.0]]) + boxes = box_list.BoxList(corners) + boxes.add_field('extra_data', tf.constant([[1], [2], [3], [4], [5], [6]])) + exp_output = [[5.0, 5.0, 6.0, 6.0], [0.0, 0.0, 4.0, 5.0], + [2.0, 3.0, 5.0, 9.0], [0.0, 0.0, 9.0, 14.0], + [0.0, 0.0, 9.0, 14.0]] + pruned = box_list_ops.clip_to_window( + boxes, window, filter_nonoverlapping=True) + with self.test_session() as sess: + pruned_output = sess.run(pruned.get()) + self.assertAllClose(pruned_output, exp_output) + extra_data_out = sess.run(pruned.get_field('extra_data')) + self.assertAllEqual(extra_data_out, [[1], [2], [3], [4], [5]]) + + def test_clip_to_window_without_filtering_boxes_which_fall_outside_the_window( + self): + window = tf.constant([0, 0, 9, 14], tf.float32) + corners = tf.constant([[5.0, 5.0, 6.0, 6.0], + [-1.0, -2.0, 4.0, 5.0], + [2.0, 3.0, 5.0, 9.0], + [0.0, 0.0, 9.0, 14.0], + [-100.0, -100.0, 300.0, 600.0], + [-10.0, -10.0, -9.0, -9.0]]) + boxes = box_list.BoxList(corners) + boxes.add_field('extra_data', tf.constant([[1], [2], [3], [4], [5], [6]])) + exp_output = [[5.0, 5.0, 6.0, 6.0], [0.0, 0.0, 4.0, 5.0], + [2.0, 3.0, 5.0, 9.0], [0.0, 0.0, 9.0, 14.0], + [0.0, 0.0, 9.0, 14.0], [0.0, 0.0, 0.0, 0.0]] + pruned = box_list_ops.clip_to_window( + boxes, window, filter_nonoverlapping=False) + with self.test_session() as sess: + pruned_output = sess.run(pruned.get()) + 
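+      # With filter_nonoverlapping=False the box lying entirely outside the
+      # window is kept, but clipping collapses it to the degenerate box
+      # [0, 0, 0, 0], which is why exp_output still has six rows.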
self.assertAllClose(pruned_output, exp_output) + extra_data_out = sess.run(pruned.get_field('extra_data')) + self.assertAllEqual(extra_data_out, [[1], [2], [3], [4], [5], [6]]) + + def test_prune_outside_window_filters_boxes_which_fall_outside_the_window( + self): + window = tf.constant([0, 0, 9, 14], tf.float32) + corners = tf.constant([[5.0, 5.0, 6.0, 6.0], + [-1.0, -2.0, 4.0, 5.0], + [2.0, 3.0, 5.0, 9.0], + [0.0, 0.0, 9.0, 14.0], + [-10.0, -10.0, -9.0, -9.0], + [-100.0, -100.0, 300.0, 600.0]]) + boxes = box_list.BoxList(corners) + boxes.add_field('extra_data', tf.constant([[1], [2], [3], [4], [5], [6]])) + exp_output = [[5.0, 5.0, 6.0, 6.0], + [2.0, 3.0, 5.0, 9.0], + [0.0, 0.0, 9.0, 14.0]] + pruned, keep_indices = box_list_ops.prune_outside_window(boxes, window) + with self.test_session() as sess: + pruned_output = sess.run(pruned.get()) + self.assertAllClose(pruned_output, exp_output) + keep_indices_out = sess.run(keep_indices) + self.assertAllEqual(keep_indices_out, [0, 2, 3]) + extra_data_out = sess.run(pruned.get_field('extra_data')) + self.assertAllEqual(extra_data_out, [[1], [3], [4]]) + + def test_prune_completely_outside_window(self): + window = tf.constant([0, 0, 9, 14], tf.float32) + corners = tf.constant([[5.0, 5.0, 6.0, 6.0], + [-1.0, -2.0, 4.0, 5.0], + [2.0, 3.0, 5.0, 9.0], + [0.0, 0.0, 9.0, 14.0], + [-10.0, -10.0, -9.0, -9.0], + [-100.0, -100.0, 300.0, 600.0]]) + boxes = box_list.BoxList(corners) + boxes.add_field('extra_data', tf.constant([[1], [2], [3], [4], [5], [6]])) + exp_output = [[5.0, 5.0, 6.0, 6.0], + [-1.0, -2.0, 4.0, 5.0], + [2.0, 3.0, 5.0, 9.0], + [0.0, 0.0, 9.0, 14.0], + [-100.0, -100.0, 300.0, 600.0]] + pruned, keep_indices = box_list_ops.prune_completely_outside_window(boxes, + window) + with self.test_session() as sess: + pruned_output = sess.run(pruned.get()) + self.assertAllClose(pruned_output, exp_output) + keep_indices_out = sess.run(keep_indices) + self.assertAllEqual(keep_indices_out, [0, 1, 2, 3, 5]) + extra_data_out = sess.run(pruned.get_field('extra_data')) + self.assertAllEqual(extra_data_out, [[1], [2], [3], [4], [6]]) + + def test_prune_completely_outside_window_with_empty_boxlist(self): + window = tf.constant([0, 0, 9, 14], tf.float32) + corners = tf.zeros(shape=[0, 4], dtype=tf.float32) + boxes = box_list.BoxList(corners) + boxes.add_field('extra_data', tf.zeros(shape=[0], dtype=tf.int32)) + pruned, keep_indices = box_list_ops.prune_completely_outside_window(boxes, + window) + pruned_boxes = pruned.get() + extra = pruned.get_field('extra_data') + + exp_pruned_boxes = np.zeros(shape=[0, 4], dtype=np.float32) + exp_extra = np.zeros(shape=[0], dtype=np.int32) + with self.test_session() as sess: + pruned_boxes_out, keep_indices_out, extra_out = sess.run( + [pruned_boxes, keep_indices, extra]) + self.assertAllClose(exp_pruned_boxes, pruned_boxes_out) + self.assertAllEqual([], keep_indices_out) + self.assertAllEqual(exp_extra, extra_out) + + def test_intersection(self): + corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]) + corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0], + [0.0, 0.0, 20.0, 20.0]]) + exp_output = [[2.0, 0.0, 6.0], [1.0, 0.0, 5.0]] + boxes1 = box_list.BoxList(corners1) + boxes2 = box_list.BoxList(corners2) + intersect = box_list_ops.intersection(boxes1, boxes2) + with self.test_session() as sess: + intersect_output = sess.run(intersect) + self.assertAllClose(intersect_output, exp_output) + + def test_matched_intersection(self): + corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 
7.0]]) + corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0]]) + exp_output = [2.0, 0.0] + boxes1 = box_list.BoxList(corners1) + boxes2 = box_list.BoxList(corners2) + intersect = box_list_ops.matched_intersection(boxes1, boxes2) + with self.test_session() as sess: + intersect_output = sess.run(intersect) + self.assertAllClose(intersect_output, exp_output) + + def test_iou(self): + corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]) + corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0], + [0.0, 0.0, 20.0, 20.0]]) + exp_output = [[2.0 / 16.0, 0, 6.0 / 400.0], [1.0 / 16.0, 0.0, 5.0 / 400.0]] + boxes1 = box_list.BoxList(corners1) + boxes2 = box_list.BoxList(corners2) + iou = box_list_ops.iou(boxes1, boxes2) + with self.test_session() as sess: + iou_output = sess.run(iou) + self.assertAllClose(iou_output, exp_output) + + def test_matched_iou(self): + corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]) + corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0]]) + exp_output = [2.0 / 16.0, 0] + boxes1 = box_list.BoxList(corners1) + boxes2 = box_list.BoxList(corners2) + iou = box_list_ops.matched_iou(boxes1, boxes2) + with self.test_session() as sess: + iou_output = sess.run(iou) + self.assertAllClose(iou_output, exp_output) + + def test_iouworks_on_empty_inputs(self): + corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]) + corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0], + [0.0, 0.0, 20.0, 20.0]]) + boxes1 = box_list.BoxList(corners1) + boxes2 = box_list.BoxList(corners2) + boxes_empty = box_list.BoxList(tf.zeros((0, 4))) + iou_empty_1 = box_list_ops.iou(boxes1, boxes_empty) + iou_empty_2 = box_list_ops.iou(boxes_empty, boxes2) + iou_empty_3 = box_list_ops.iou(boxes_empty, boxes_empty) + with self.test_session() as sess: + iou_output_1, iou_output_2, iou_output_3 = sess.run( + [iou_empty_1, iou_empty_2, iou_empty_3]) + self.assertAllEqual(iou_output_1.shape, (2, 0)) + self.assertAllEqual(iou_output_2.shape, (0, 3)) + self.assertAllEqual(iou_output_3.shape, (0, 0)) + + def test_ioa(self): + corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]) + corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0], + [0.0, 0.0, 20.0, 20.0]]) + exp_output_1 = [[2.0 / 12.0, 0, 6.0 / 400.0], + [1.0 / 12.0, 0.0, 5.0 / 400.0]] + exp_output_2 = [[2.0 / 6.0, 1.0 / 5.0], + [0, 0], + [6.0 / 6.0, 5.0 / 5.0]] + boxes1 = box_list.BoxList(corners1) + boxes2 = box_list.BoxList(corners2) + ioa_1 = box_list_ops.ioa(boxes1, boxes2) + ioa_2 = box_list_ops.ioa(boxes2, boxes1) + with self.test_session() as sess: + ioa_output_1, ioa_output_2 = sess.run([ioa_1, ioa_2]) + self.assertAllClose(ioa_output_1, exp_output_1) + self.assertAllClose(ioa_output_2, exp_output_2) + + def test_prune_non_overlapping_boxes(self): + corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]) + corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0], + [0.0, 0.0, 20.0, 20.0]]) + boxes1 = box_list.BoxList(corners1) + boxes2 = box_list.BoxList(corners2) + minoverlap = 0.5 + + exp_output_1 = boxes1 + exp_output_2 = box_list.BoxList(tf.constant(0.0, shape=[0, 4])) + output_1, keep_indices_1 = box_list_ops.prune_non_overlapping_boxes( + boxes1, boxes2, min_overlap=minoverlap) + output_2, keep_indices_2 = box_list_ops.prune_non_overlapping_boxes( + boxes2, boxes1, min_overlap=minoverlap) + with self.test_session() as sess: + (output_1_, keep_indices_1_, output_2_, 
keep_indices_2_, exp_output_1_, + exp_output_2_) = sess.run( + [output_1.get(), keep_indices_1, + output_2.get(), keep_indices_2, + exp_output_1.get(), exp_output_2.get()]) + self.assertAllClose(output_1_, exp_output_1_) + self.assertAllClose(output_2_, exp_output_2_) + self.assertAllEqual(keep_indices_1_, [0, 1]) + self.assertAllEqual(keep_indices_2_, []) + + def test_prune_small_boxes(self): + boxes = tf.constant([[4.0, 3.0, 7.0, 5.0], + [5.0, 6.0, 10.0, 7.0], + [3.0, 4.0, 6.0, 8.0], + [14.0, 14.0, 15.0, 15.0], + [0.0, 0.0, 20.0, 20.0]]) + exp_boxes = [[3.0, 4.0, 6.0, 8.0], + [0.0, 0.0, 20.0, 20.0]] + boxes = box_list.BoxList(boxes) + pruned_boxes = box_list_ops.prune_small_boxes(boxes, 3) + with self.test_session() as sess: + pruned_boxes = sess.run(pruned_boxes.get()) + self.assertAllEqual(pruned_boxes, exp_boxes) + + def test_prune_small_boxes_prunes_boxes_with_negative_side(self): + boxes = tf.constant([[4.0, 3.0, 7.0, 5.0], + [5.0, 6.0, 10.0, 7.0], + [3.0, 4.0, 6.0, 8.0], + [14.0, 14.0, 15.0, 15.0], + [0.0, 0.0, 20.0, 20.0], + [2.0, 3.0, 1.5, 7.0], # negative height + [2.0, 3.0, 5.0, 1.7]]) # negative width + exp_boxes = [[3.0, 4.0, 6.0, 8.0], + [0.0, 0.0, 20.0, 20.0]] + boxes = box_list.BoxList(boxes) + pruned_boxes = box_list_ops.prune_small_boxes(boxes, 3) + with self.test_session() as sess: + pruned_boxes = sess.run(pruned_boxes.get()) + self.assertAllEqual(pruned_boxes, exp_boxes) + + def test_change_coordinate_frame(self): + corners = tf.constant([[0.25, 0.5, 0.75, 0.75], [0.5, 0.0, 1.0, 1.0]]) + window = tf.constant([0.25, 0.25, 0.75, 0.75]) + boxes = box_list.BoxList(corners) + + expected_corners = tf.constant([[0, 0.5, 1.0, 1.0], [0.5, -0.5, 1.5, 1.5]]) + expected_boxes = box_list.BoxList(expected_corners) + output = box_list_ops.change_coordinate_frame(boxes, window) + + with self.test_session() as sess: + output_, expected_boxes_ = sess.run([output.get(), expected_boxes.get()]) + self.assertAllClose(output_, expected_boxes_) + + def test_ioaworks_on_empty_inputs(self): + corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]) + corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0], + [0.0, 0.0, 20.0, 20.0]]) + boxes1 = box_list.BoxList(corners1) + boxes2 = box_list.BoxList(corners2) + boxes_empty = box_list.BoxList(tf.zeros((0, 4))) + ioa_empty_1 = box_list_ops.ioa(boxes1, boxes_empty) + ioa_empty_2 = box_list_ops.ioa(boxes_empty, boxes2) + ioa_empty_3 = box_list_ops.ioa(boxes_empty, boxes_empty) + with self.test_session() as sess: + ioa_output_1, ioa_output_2, ioa_output_3 = sess.run( + [ioa_empty_1, ioa_empty_2, ioa_empty_3]) + self.assertAllEqual(ioa_output_1.shape, (2, 0)) + self.assertAllEqual(ioa_output_2.shape, (0, 3)) + self.assertAllEqual(ioa_output_3.shape, (0, 0)) + + def test_pairwise_distances(self): + corners1 = tf.constant([[0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 0.0, 2.0]]) + corners2 = tf.constant([[3.0, 4.0, 1.0, 0.0], + [-4.0, 0.0, 0.0, 3.0], + [0.0, 0.0, 0.0, 0.0]]) + exp_output = [[26, 25, 0], [18, 27, 6]] + boxes1 = box_list.BoxList(corners1) + boxes2 = box_list.BoxList(corners2) + dist_matrix = box_list_ops.sq_dist(boxes1, boxes2) + with self.test_session() as sess: + dist_output = sess.run(dist_matrix) + self.assertAllClose(dist_output, exp_output) + + def test_boolean_mask(self): + corners = tf.constant( + [4 * [0.0], 4 * [1.0], 4 * [2.0], 4 * [3.0], 4 * [4.0]]) + indicator = tf.constant([True, False, True, False, True], tf.bool) + expected_subset = [4 * [0.0], 4 * [2.0], 4 * [4.0]] + boxes = box_list.BoxList(corners) + 
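+    # boolean_mask keeps only the rows whose indicator entry is True; since
+    # no `fields` argument is passed, any extra fields would be masked the
+    # same way (this boxlist has none).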
subset = box_list_ops.boolean_mask(boxes, indicator) + with self.test_session() as sess: + subset_output = sess.run(subset.get()) + self.assertAllClose(subset_output, expected_subset) + + def test_boolean_mask_with_field(self): + corners = tf.constant( + [4 * [0.0], 4 * [1.0], 4 * [2.0], 4 * [3.0], 4 * [4.0]]) + indicator = tf.constant([True, False, True, False, True], tf.bool) + weights = tf.constant([[.1], [.3], [.5], [.7], [.9]], tf.float32) + expected_subset = [4 * [0.0], 4 * [2.0], 4 * [4.0]] + expected_weights = [[.1], [.5], [.9]] + + boxes = box_list.BoxList(corners) + boxes.add_field('weights', weights) + subset = box_list_ops.boolean_mask(boxes, indicator, ['weights']) + with self.test_session() as sess: + subset_output, weights_output = sess.run( + [subset.get(), subset.get_field('weights')]) + self.assertAllClose(subset_output, expected_subset) + self.assertAllClose(weights_output, expected_weights) + + def test_gather(self): + corners = tf.constant( + [4 * [0.0], 4 * [1.0], 4 * [2.0], 4 * [3.0], 4 * [4.0]]) + indices = tf.constant([0, 2, 4], tf.int32) + expected_subset = [4 * [0.0], 4 * [2.0], 4 * [4.0]] + boxes = box_list.BoxList(corners) + subset = box_list_ops.gather(boxes, indices) + with self.test_session() as sess: + subset_output = sess.run(subset.get()) + self.assertAllClose(subset_output, expected_subset) + + def test_gather_with_field(self): + corners = tf.constant([4*[0.0], 4*[1.0], 4*[2.0], 4*[3.0], 4*[4.0]]) + indices = tf.constant([0, 2, 4], tf.int32) + weights = tf.constant([[.1], [.3], [.5], [.7], [.9]], tf.float32) + expected_subset = [4 * [0.0], 4 * [2.0], 4 * [4.0]] + expected_weights = [[.1], [.5], [.9]] + + boxes = box_list.BoxList(corners) + boxes.add_field('weights', weights) + subset = box_list_ops.gather(boxes, indices, ['weights']) + with self.test_session() as sess: + subset_output, weights_output = sess.run( + [subset.get(), subset.get_field('weights')]) + self.assertAllClose(subset_output, expected_subset) + self.assertAllClose(weights_output, expected_weights) + + def test_gather_with_invalid_field(self): + corners = tf.constant([4 * [0.0], 4 * [1.0]]) + indices = tf.constant([0, 1], tf.int32) + weights = tf.constant([[.1], [.3]], tf.float32) + + boxes = box_list.BoxList(corners) + boxes.add_field('weights', weights) + with self.assertRaises(ValueError): + box_list_ops.gather(boxes, indices, ['foo', 'bar']) + + def test_gather_with_invalid_inputs(self): + corners = tf.constant( + [4 * [0.0], 4 * [1.0], 4 * [2.0], 4 * [3.0], 4 * [4.0]]) + indices_float32 = tf.constant([0, 2, 4], tf.float32) + boxes = box_list.BoxList(corners) + with self.assertRaises(ValueError): + _ = box_list_ops.gather(boxes, indices_float32) + indices_2d = tf.constant([[0, 2, 4]], tf.int32) + boxes = box_list.BoxList(corners) + with self.assertRaises(ValueError): + _ = box_list_ops.gather(boxes, indices_2d) + + def test_gather_with_dynamic_indexing(self): + corners = tf.constant([4 * [0.0], 4 * [1.0], 4 * [2.0], 4 * [3.0], 4 * [4.0] + ]) + weights = tf.constant([.5, .3, .7, .1, .9], tf.float32) + indices = tf.reshape(tf.where(tf.greater(weights, 0.4)), [-1]) + expected_subset = [4 * [0.0], 4 * [2.0], 4 * [4.0]] + expected_weights = [.5, .7, .9] + + boxes = box_list.BoxList(corners) + boxes.add_field('weights', weights) + subset = box_list_ops.gather(boxes, indices, ['weights']) + with self.test_session() as sess: + subset_output, weights_output = sess.run([subset.get(), subset.get_field( + 'weights')]) + self.assertAllClose(subset_output, expected_subset) + 
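+      # The 'weights' field is gathered with the same dynamically computed
+      # indices as the box coordinates.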
self.assertAllClose(weights_output, expected_weights) + + def test_sort_by_field_ascending_order(self): + exp_corners = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9], + [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]] + exp_scores = [.95, .9, .75, .6, .5, .3] + exp_weights = [.2, .45, .6, .75, .8, .92] + shuffle = [2, 4, 0, 5, 1, 3] + corners = tf.constant([exp_corners[i] for i in shuffle], tf.float32) + boxes = box_list.BoxList(corners) + boxes.add_field('scores', tf.constant( + [exp_scores[i] for i in shuffle], tf.float32)) + boxes.add_field('weights', tf.constant( + [exp_weights[i] for i in shuffle], tf.float32)) + sort_by_weight = box_list_ops.sort_by_field( + boxes, + 'weights', + order=box_list_ops.SortOrder.ascend) + with self.test_session() as sess: + corners_out, scores_out, weights_out = sess.run([ + sort_by_weight.get(), + sort_by_weight.get_field('scores'), + sort_by_weight.get_field('weights')]) + self.assertAllClose(corners_out, exp_corners) + self.assertAllClose(scores_out, exp_scores) + self.assertAllClose(weights_out, exp_weights) + + def test_sort_by_field_descending_order(self): + exp_corners = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9], + [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]] + exp_scores = [.95, .9, .75, .6, .5, .3] + exp_weights = [.2, .45, .6, .75, .8, .92] + shuffle = [2, 4, 0, 5, 1, 3] + + corners = tf.constant([exp_corners[i] for i in shuffle], tf.float32) + boxes = box_list.BoxList(corners) + boxes.add_field('scores', tf.constant( + [exp_scores[i] for i in shuffle], tf.float32)) + boxes.add_field('weights', tf.constant( + [exp_weights[i] for i in shuffle], tf.float32)) + + sort_by_score = box_list_ops.sort_by_field(boxes, 'scores') + with self.test_session() as sess: + corners_out, scores_out, weights_out = sess.run([sort_by_score.get( + ), sort_by_score.get_field('scores'), sort_by_score.get_field('weights')]) + self.assertAllClose(corners_out, exp_corners) + self.assertAllClose(scores_out, exp_scores) + self.assertAllClose(weights_out, exp_weights) + + def test_sort_by_field_invalid_inputs(self): + corners = tf.constant([4 * [0.0], 4 * [0.5], 4 * [1.0], 4 * [2.0], 4 * + [3.0], 4 * [4.0]]) + misc = tf.constant([[.95, .9], [.5, .3]], tf.float32) + weights = tf.constant([.1, .2], tf.float32) + boxes = box_list.BoxList(corners) + boxes.add_field('misc', misc) + boxes.add_field('weights', weights) + + with self.test_session() as sess: + with self.assertRaises(ValueError): + box_list_ops.sort_by_field(boxes, 'area') + + with self.assertRaises(ValueError): + box_list_ops.sort_by_field(boxes, 'misc') + + if ops._USE_C_API: + with self.assertRaises(ValueError): + box_list_ops.sort_by_field(boxes, 'weights') + else: + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + 'Incorrect field size'): + sess.run(box_list_ops.sort_by_field(boxes, 'weights').get()) + + def test_visualize_boxes_in_image(self): + image = tf.zeros((6, 4, 3)) + corners = tf.constant([[0, 0, 5, 3], + [0, 0, 3, 2]], tf.float32) + boxes = box_list.BoxList(corners) + image_and_boxes = box_list_ops.visualize_boxes_in_image(image, boxes) + image_and_boxes_bw = tf.to_float( + tf.greater(tf.reduce_sum(image_and_boxes, 2), 0.0)) + exp_result = [[1, 1, 1, 0], + [1, 1, 1, 0], + [1, 1, 1, 0], + [1, 0, 1, 0], + [1, 1, 1, 0], + [0, 0, 0, 0]] + with self.test_session() as sess: + output = sess.run(image_and_boxes_bw) + self.assertAllEqual(output.astype(int), exp_result) + + def test_filter_field_value_equals(self): + corners = tf.constant([[0, 0, 1, 1], + [0, 
0.1, 1, 1.1], + [0, -0.1, 1, 0.9], + [0, 10, 1, 11], + [0, 10.1, 1, 11.1], + [0, 100, 1, 101]], tf.float32) + boxes = box_list.BoxList(corners) + boxes.add_field('classes', tf.constant([1, 2, 1, 2, 2, 1])) + exp_output1 = [[0, 0, 1, 1], [0, -0.1, 1, 0.9], [0, 100, 1, 101]] + exp_output2 = [[0, 0.1, 1, 1.1], [0, 10, 1, 11], [0, 10.1, 1, 11.1]] + + filtered_boxes1 = box_list_ops.filter_field_value_equals( + boxes, 'classes', 1) + filtered_boxes2 = box_list_ops.filter_field_value_equals( + boxes, 'classes', 2) + with self.test_session() as sess: + filtered_output1, filtered_output2 = sess.run([filtered_boxes1.get(), + filtered_boxes2.get()]) + self.assertAllClose(filtered_output1, exp_output1) + self.assertAllClose(filtered_output2, exp_output2) + + def test_filter_greater_than(self): + corners = tf.constant([[0, 0, 1, 1], + [0, 0.1, 1, 1.1], + [0, -0.1, 1, 0.9], + [0, 10, 1, 11], + [0, 10.1, 1, 11.1], + [0, 100, 1, 101]], tf.float32) + boxes = box_list.BoxList(corners) + boxes.add_field('scores', tf.constant([.1, .75, .9, .5, .5, .8])) + thresh = .6 + exp_output = [[0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9], [0, 100, 1, 101]] + + filtered_boxes = box_list_ops.filter_greater_than(boxes, thresh) + with self.test_session() as sess: + filtered_output = sess.run(filtered_boxes.get()) + self.assertAllClose(filtered_output, exp_output) + + def test_clip_box_list(self): + boxlist = box_list.BoxList( + tf.constant([[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5], + [0.6, 0.6, 0.8, 0.8], [0.2, 0.2, 0.3, 0.3]], tf.float32)) + boxlist.add_field('classes', tf.constant([0, 0, 1, 1])) + boxlist.add_field('scores', tf.constant([0.75, 0.65, 0.3, 0.2])) + num_boxes = 2 + clipped_boxlist = box_list_ops.pad_or_clip_box_list(boxlist, num_boxes) + + expected_boxes = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5]] + expected_classes = [0, 0] + expected_scores = [0.75, 0.65] + with self.test_session() as sess: + boxes_out, classes_out, scores_out = sess.run( + [clipped_boxlist.get(), clipped_boxlist.get_field('classes'), + clipped_boxlist.get_field('scores')]) + + self.assertAllClose(expected_boxes, boxes_out) + self.assertAllEqual(expected_classes, classes_out) + self.assertAllClose(expected_scores, scores_out) + + def test_pad_box_list(self): + boxlist = box_list.BoxList( + tf.constant([[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5]], tf.float32)) + boxlist.add_field('classes', tf.constant([0, 1])) + boxlist.add_field('scores', tf.constant([0.75, 0.2])) + num_boxes = 4 + padded_boxlist = box_list_ops.pad_or_clip_box_list(boxlist, num_boxes) + + expected_boxes = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5], + [0, 0, 0, 0], [0, 0, 0, 0]] + expected_classes = [0, 1, 0, 0] + expected_scores = [0.75, 0.2, 0, 0] + with self.test_session() as sess: + boxes_out, classes_out, scores_out = sess.run( + [padded_boxlist.get(), padded_boxlist.get_field('classes'), + padded_boxlist.get_field('scores')]) + + self.assertAllClose(expected_boxes, boxes_out) + self.assertAllEqual(expected_classes, classes_out) + self.assertAllClose(expected_scores, scores_out) + + def test_select_random_box(self): + boxes = [[0., 0., 1., 1.], + [0., 1., 2., 3.], + [0., 2., 3., 4.]] + + corners = tf.constant(boxes, dtype=tf.float32) + boxlist = box_list.BoxList(corners) + random_bbox, valid = box_list_ops.select_random_box(boxlist) + with self.test_session() as sess: + random_bbox_out, valid_out = sess.run([random_bbox, valid]) + + norm_small = any( + [np.linalg.norm(random_bbox_out - box) < 1e-6 for box in boxes]) + + self.assertTrue(norm_small) + 
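+    # `valid` is True here because the boxlist is non-empty, so a real box
+    # rather than the default box was returned.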
self.assertTrue(valid_out) + + def test_select_random_box_with_empty_boxlist(self): + corners = tf.constant([], shape=[0, 4], dtype=tf.float32) + boxlist = box_list.BoxList(corners) + random_bbox, valid = box_list_ops.select_random_box(boxlist) + with self.test_session() as sess: + random_bbox_out, valid_out = sess.run([random_bbox, valid]) + + expected_bbox_out = np.array([[-1., -1., -1., -1.]], dtype=np.float32) + self.assertAllEqual(expected_bbox_out, random_bbox_out) + self.assertFalse(valid_out) + + def test_get_minimal_coverage_box(self): + boxes = [[0., 0., 1., 1.], + [-1., 1., 2., 3.], + [0., 2., 3., 4.]] + + expected_coverage_box = [[-1., 0., 3., 4.]] + + corners = tf.constant(boxes, dtype=tf.float32) + boxlist = box_list.BoxList(corners) + coverage_box = box_list_ops.get_minimal_coverage_box(boxlist) + with self.test_session() as sess: + coverage_box_out = sess.run(coverage_box) + + self.assertAllClose(expected_coverage_box, coverage_box_out) + + def test_get_minimal_coverage_box_with_empty_boxlist(self): + corners = tf.constant([], shape=[0, 4], dtype=tf.float32) + boxlist = box_list.BoxList(corners) + coverage_box = box_list_ops.get_minimal_coverage_box(boxlist) + with self.test_session() as sess: + coverage_box_out = sess.run(coverage_box) + + self.assertAllClose([[0.0, 0.0, 1.0, 1.0]], coverage_box_out) + + +class ConcatenateTest(tf.test.TestCase): + + def test_invalid_input_box_list_list(self): + with self.assertRaises(ValueError): + box_list_ops.concatenate(None) + with self.assertRaises(ValueError): + box_list_ops.concatenate([]) + with self.assertRaises(ValueError): + corners = tf.constant([[0, 0, 0, 0]], tf.float32) + boxlist = box_list.BoxList(corners) + box_list_ops.concatenate([boxlist, 2]) + + def test_concatenate_with_missing_fields(self): + corners1 = tf.constant([[0, 0, 0, 0], [1, 2, 3, 4]], tf.float32) + scores1 = tf.constant([1.0, 2.1]) + corners2 = tf.constant([[0, 3, 1, 6], [2, 4, 3, 8]], tf.float32) + boxlist1 = box_list.BoxList(corners1) + boxlist1.add_field('scores', scores1) + boxlist2 = box_list.BoxList(corners2) + with self.assertRaises(ValueError): + box_list_ops.concatenate([boxlist1, boxlist2]) + + def test_concatenate_with_incompatible_field_shapes(self): + corners1 = tf.constant([[0, 0, 0, 0], [1, 2, 3, 4]], tf.float32) + scores1 = tf.constant([1.0, 2.1]) + corners2 = tf.constant([[0, 3, 1, 6], [2, 4, 3, 8]], tf.float32) + scores2 = tf.constant([[1.0, 1.0], [2.1, 3.2]]) + boxlist1 = box_list.BoxList(corners1) + boxlist1.add_field('scores', scores1) + boxlist2 = box_list.BoxList(corners2) + boxlist2.add_field('scores', scores2) + with self.assertRaises(ValueError): + box_list_ops.concatenate([boxlist1, boxlist2]) + + def test_concatenate_is_correct(self): + corners1 = tf.constant([[0, 0, 0, 0], [1, 2, 3, 4]], tf.float32) + scores1 = tf.constant([1.0, 2.1]) + corners2 = tf.constant([[0, 3, 1, 6], [2, 4, 3, 8], [1, 0, 5, 10]], + tf.float32) + scores2 = tf.constant([1.0, 2.1, 5.6]) + + exp_corners = [[0, 0, 0, 0], + [1, 2, 3, 4], + [0, 3, 1, 6], + [2, 4, 3, 8], + [1, 0, 5, 10]] + exp_scores = [1.0, 2.1, 1.0, 2.1, 5.6] + + boxlist1 = box_list.BoxList(corners1) + boxlist1.add_field('scores', scores1) + boxlist2 = box_list.BoxList(corners2) + boxlist2.add_field('scores', scores2) + result = box_list_ops.concatenate([boxlist1, boxlist2]) + with self.test_session() as sess: + corners_output, scores_output = sess.run( + [result.get(), result.get_field('scores')]) + self.assertAllClose(corners_output, exp_corners) + self.assertAllClose(scores_output, 
exp_scores) + + +class NonMaxSuppressionTest(tf.test.TestCase): + + def test_select_from_three_clusters(self): + corners = tf.constant([[0, 0, 1, 1], + [0, 0.1, 1, 1.1], + [0, -0.1, 1, 0.9], + [0, 10, 1, 11], + [0, 10.1, 1, 11.1], + [0, 100, 1, 101]], tf.float32) + boxes = box_list.BoxList(corners) + boxes.add_field('scores', tf.constant([.9, .75, .6, .95, .5, .3])) + iou_thresh = .5 + max_output_size = 3 + + exp_nms = [[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 100, 1, 101]] + nms = box_list_ops.non_max_suppression( + boxes, iou_thresh, max_output_size) + with self.test_session() as sess: + nms_output = sess.run(nms.get()) + self.assertAllClose(nms_output, exp_nms) + + def test_select_at_most_two_boxes_from_three_clusters(self): + corners = tf.constant([[0, 0, 1, 1], + [0, 0.1, 1, 1.1], + [0, -0.1, 1, 0.9], + [0, 10, 1, 11], + [0, 10.1, 1, 11.1], + [0, 100, 1, 101]], tf.float32) + boxes = box_list.BoxList(corners) + boxes.add_field('scores', tf.constant([.9, .75, .6, .95, .5, .3])) + iou_thresh = .5 + max_output_size = 2 + + exp_nms = [[0, 10, 1, 11], + [0, 0, 1, 1]] + nms = box_list_ops.non_max_suppression( + boxes, iou_thresh, max_output_size) + with self.test_session() as sess: + nms_output = sess.run(nms.get()) + self.assertAllClose(nms_output, exp_nms) + + def test_select_at_most_thirty_boxes_from_three_clusters(self): + corners = tf.constant([[0, 0, 1, 1], + [0, 0.1, 1, 1.1], + [0, -0.1, 1, 0.9], + [0, 10, 1, 11], + [0, 10.1, 1, 11.1], + [0, 100, 1, 101]], tf.float32) + boxes = box_list.BoxList(corners) + boxes.add_field('scores', tf.constant([.9, .75, .6, .95, .5, .3])) + iou_thresh = .5 + max_output_size = 30 + + exp_nms = [[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 100, 1, 101]] + nms = box_list_ops.non_max_suppression( + boxes, iou_thresh, max_output_size) + with self.test_session() as sess: + nms_output = sess.run(nms.get()) + self.assertAllClose(nms_output, exp_nms) + + def test_select_single_box(self): + corners = tf.constant([[0, 0, 1, 1]], tf.float32) + boxes = box_list.BoxList(corners) + boxes.add_field('scores', tf.constant([.9])) + iou_thresh = .5 + max_output_size = 3 + + exp_nms = [[0, 0, 1, 1]] + nms = box_list_ops.non_max_suppression( + boxes, iou_thresh, max_output_size) + with self.test_session() as sess: + nms_output = sess.run(nms.get()) + self.assertAllClose(nms_output, exp_nms) + + def test_select_from_ten_identical_boxes(self): + corners = tf.constant(10 * [[0, 0, 1, 1]], tf.float32) + boxes = box_list.BoxList(corners) + boxes.add_field('scores', tf.constant(10 * [.9])) + iou_thresh = .5 + max_output_size = 3 + + exp_nms = [[0, 0, 1, 1]] + nms = box_list_ops.non_max_suppression( + boxes, iou_thresh, max_output_size) + with self.test_session() as sess: + nms_output = sess.run(nms.get()) + self.assertAllClose(nms_output, exp_nms) + + def test_copy_extra_fields(self): + corners = tf.constant([[0, 0, 1, 1], + [0, 0.1, 1, 1.1]], tf.float32) + boxes = box_list.BoxList(corners) + tensor1 = np.array([[1], [4]]) + tensor2 = np.array([[1, 1], [2, 2]]) + boxes.add_field('tensor1', tf.constant(tensor1)) + boxes.add_field('tensor2', tf.constant(tensor2)) + new_boxes = box_list.BoxList(tf.constant([[0, 0, 10, 10], + [1, 3, 5, 5]], tf.float32)) + new_boxes = box_list_ops._copy_extra_fields(new_boxes, boxes) + with self.test_session() as sess: + self.assertAllClose(tensor1, sess.run(new_boxes.get_field('tensor1'))) + self.assertAllClose(tensor2, sess.run(new_boxes.get_field('tensor2'))) + + +class CoordinatesConversionTest(tf.test.TestCase): + + def 
test_to_normalized_coordinates(self): + coordinates = tf.constant([[0, 0, 100, 100], + [25, 25, 75, 75]], tf.float32) + img = tf.ones((128, 100, 100, 3)) + boxlist = box_list.BoxList(coordinates) + normalized_boxlist = box_list_ops.to_normalized_coordinates( + boxlist, tf.shape(img)[1], tf.shape(img)[2]) + expected_boxes = [[0, 0, 1, 1], + [0.25, 0.25, 0.75, 0.75]] + + with self.test_session() as sess: + normalized_boxes = sess.run(normalized_boxlist.get()) + self.assertAllClose(normalized_boxes, expected_boxes) + + def test_to_normalized_coordinates_already_normalized(self): + coordinates = tf.constant([[0, 0, 1, 1], + [0.25, 0.25, 0.75, 0.75]], tf.float32) + img = tf.ones((128, 100, 100, 3)) + boxlist = box_list.BoxList(coordinates) + normalized_boxlist = box_list_ops.to_normalized_coordinates( + boxlist, tf.shape(img)[1], tf.shape(img)[2]) + + with self.test_session() as sess: + with self.assertRaisesOpError('assertion failed'): + sess.run(normalized_boxlist.get()) + + def test_to_absolute_coordinates(self): + coordinates = tf.constant([[0, 0, 1, 1], + [0.25, 0.25, 0.75, 0.75]], tf.float32) + img = tf.ones((128, 100, 100, 3)) + boxlist = box_list.BoxList(coordinates) + absolute_boxlist = box_list_ops.to_absolute_coordinates(boxlist, + tf.shape(img)[1], + tf.shape(img)[2]) + expected_boxes = [[0, 0, 100, 100], + [25, 25, 75, 75]] + + with self.test_session() as sess: + absolute_boxes = sess.run(absolute_boxlist.get()) + self.assertAllClose(absolute_boxes, expected_boxes) + + def test_to_absolute_coordinates_already_abolute(self): + coordinates = tf.constant([[0, 0, 100, 100], + [25, 25, 75, 75]], tf.float32) + img = tf.ones((128, 100, 100, 3)) + boxlist = box_list.BoxList(coordinates) + absolute_boxlist = box_list_ops.to_absolute_coordinates(boxlist, + tf.shape(img)[1], + tf.shape(img)[2]) + + with self.test_session() as sess: + with self.assertRaisesOpError('assertion failed'): + sess.run(absolute_boxlist.get()) + + def test_convert_to_normalized_and_back(self): + coordinates = np.random.uniform(size=(100, 4)) + coordinates = np.round(np.sort(coordinates) * 200) + coordinates[:, 2:4] += 1 + coordinates[99, :] = [0, 0, 201, 201] + img = tf.ones((128, 202, 202, 3)) + + boxlist = box_list.BoxList(tf.constant(coordinates, tf.float32)) + boxlist = box_list_ops.to_normalized_coordinates(boxlist, + tf.shape(img)[1], + tf.shape(img)[2]) + boxlist = box_list_ops.to_absolute_coordinates(boxlist, + tf.shape(img)[1], + tf.shape(img)[2]) + + with self.test_session() as sess: + out = sess.run(boxlist.get()) + self.assertAllClose(out, coordinates) + + def test_convert_to_absolute_and_back(self): + coordinates = np.random.uniform(size=(100, 4)) + coordinates = np.sort(coordinates) + coordinates[99, :] = [0, 0, 1, 1] + img = tf.ones((128, 202, 202, 3)) + + boxlist = box_list.BoxList(tf.constant(coordinates, tf.float32)) + boxlist = box_list_ops.to_absolute_coordinates(boxlist, + tf.shape(img)[1], + tf.shape(img)[2]) + boxlist = box_list_ops.to_normalized_coordinates(boxlist, + tf.shape(img)[1], + tf.shape(img)[2]) + + with self.test_session() as sess: + out = sess.run(boxlist.get()) + self.assertAllClose(out, coordinates) + + def test_to_absolute_coordinates_maximum_coordinate_check(self): + coordinates = tf.constant([[0, 0, 1.2, 1.2], + [0.25, 0.25, 0.75, 0.75]], tf.float32) + img = tf.ones((128, 100, 100, 3)) + boxlist = box_list.BoxList(coordinates) + absolute_boxlist = box_list_ops.to_absolute_coordinates( + boxlist, + tf.shape(img)[1], + tf.shape(img)[2], + maximum_normalized_coordinate=1.1) + + 
with self.test_session() as sess: + with self.assertRaisesOpError('assertion failed'): + sess.run(absolute_boxlist.get()) + + +class BoxRefinementTest(tf.test.TestCase): + + def test_box_voting(self): + candidates = box_list.BoxList( + tf.constant([[0.1, 0.1, 0.4, 0.4], [0.6, 0.6, 0.8, 0.8]], tf.float32)) + candidates.add_field('ExtraField', tf.constant([1, 2])) + pool = box_list.BoxList( + tf.constant([[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5], + [0.6, 0.6, 0.8, 0.8]], tf.float32)) + pool.add_field('scores', tf.constant([0.75, 0.25, 0.3])) + averaged_boxes = box_list_ops.box_voting(candidates, pool) + expected_boxes = [[0.1, 0.1, 0.425, 0.425], [0.6, 0.6, 0.8, 0.8]] + expected_scores = [0.5, 0.3] + with self.test_session() as sess: + boxes_out, scores_out, extra_field_out = sess.run( + [averaged_boxes.get(), averaged_boxes.get_field('scores'), + averaged_boxes.get_field('ExtraField')]) + + self.assertAllClose(expected_boxes, boxes_out) + self.assertAllClose(expected_scores, scores_out) + self.assertAllEqual(extra_field_out, [1, 2]) + + def test_box_voting_fails_with_negative_scores(self): + candidates = box_list.BoxList( + tf.constant([[0.1, 0.1, 0.4, 0.4]], tf.float32)) + pool = box_list.BoxList(tf.constant([[0.1, 0.1, 0.4, 0.4]], tf.float32)) + pool.add_field('scores', tf.constant([-0.2])) + averaged_boxes = box_list_ops.box_voting(candidates, pool) + + with self.test_session() as sess: + with self.assertRaisesOpError('Scores must be non negative'): + sess.run([averaged_boxes.get()]) + + def test_box_voting_fails_when_unmatched(self): + candidates = box_list.BoxList( + tf.constant([[0.1, 0.1, 0.4, 0.4]], tf.float32)) + pool = box_list.BoxList(tf.constant([[0.6, 0.6, 0.8, 0.8]], tf.float32)) + pool.add_field('scores', tf.constant([0.2])) + averaged_boxes = box_list_ops.box_voting(candidates, pool) + + with self.test_session() as sess: + with self.assertRaisesOpError('Each box in selected_boxes must match ' + 'with at least one box in pool_boxes.'): + sess.run([averaged_boxes.get()]) + + def test_refine_boxes(self): + pool = box_list.BoxList( + tf.constant([[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5], + [0.6, 0.6, 0.8, 0.8]], tf.float32)) + pool.add_field('ExtraField', tf.constant([1, 2, 3])) + pool.add_field('scores', tf.constant([0.75, 0.25, 0.3])) + refined_boxes = box_list_ops.refine_boxes(pool, 0.5, 10) + + expected_boxes = [[0.1, 0.1, 0.425, 0.425], [0.6, 0.6, 0.8, 0.8]] + expected_scores = [0.5, 0.3] + with self.test_session() as sess: + boxes_out, scores_out, extra_field_out = sess.run( + [refined_boxes.get(), refined_boxes.get_field('scores'), + refined_boxes.get_field('ExtraField')]) + + self.assertAllClose(expected_boxes, boxes_out) + self.assertAllClose(expected_scores, scores_out) + self.assertAllEqual(extra_field_out, [1, 3]) + + def test_refine_boxes_multi_class(self): + pool = box_list.BoxList( + tf.constant([[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5], + [0.6, 0.6, 0.8, 0.8], [0.2, 0.2, 0.3, 0.3]], tf.float32)) + pool.add_field('classes', tf.constant([0, 0, 1, 1])) + pool.add_field('scores', tf.constant([0.75, 0.25, 0.3, 0.2])) + refined_boxes = box_list_ops.refine_boxes_multi_class(pool, 3, 0.5, 10) + + expected_boxes = [[0.1, 0.1, 0.425, 0.425], [0.6, 0.6, 0.8, 0.8], + [0.2, 0.2, 0.3, 0.3]] + expected_scores = [0.5, 0.3, 0.2] + with self.test_session() as sess: + boxes_out, scores_out, extra_field_out = sess.run( + [refined_boxes.get(), refined_boxes.get_field('scores'), + refined_boxes.get_field('classes')]) + + self.assertAllClose(expected_boxes, boxes_out) + 
self.assertAllClose(expected_scores, scores_out) + self.assertAllEqual(extra_field_out, [0, 1, 1]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_list_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_list_test.py new file mode 100644 index 0000000000000000000000000000000000000000..edc00ebbc40227713739e2583fe9fc067e9449e2 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_list_test.py @@ -0,0 +1,134 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.core.box_list.""" + +import tensorflow as tf + +from object_detection.core import box_list + + +class BoxListTest(tf.test.TestCase): + """Tests for BoxList class.""" + + def test_num_boxes(self): + data = tf.constant([[0, 0, 1, 1], [1, 1, 2, 3], [3, 4, 5, 5]], tf.float32) + expected_num_boxes = 3 + + boxes = box_list.BoxList(data) + with self.test_session() as sess: + num_boxes_output = sess.run(boxes.num_boxes()) + self.assertEquals(num_boxes_output, expected_num_boxes) + + def test_get_correct_center_coordinates_and_sizes(self): + boxes = [[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]] + boxes = box_list.BoxList(tf.constant(boxes)) + centers_sizes = boxes.get_center_coordinates_and_sizes() + expected_centers_sizes = [[15, 0.35], [12.5, 0.25], [10, 0.3], [5, 0.3]] + with self.test_session() as sess: + centers_sizes_out = sess.run(centers_sizes) + self.assertAllClose(centers_sizes_out, expected_centers_sizes) + + def test_create_box_list_with_dynamic_shape(self): + data = tf.constant([[0, 0, 1, 1], [1, 1, 2, 3], [3, 4, 5, 5]], tf.float32) + indices = tf.reshape(tf.where(tf.greater([1, 0, 1], 0)), [-1]) + data = tf.gather(data, indices) + assert data.get_shape().as_list() == [None, 4] + expected_num_boxes = 2 + + boxes = box_list.BoxList(data) + with self.test_session() as sess: + num_boxes_output = sess.run(boxes.num_boxes()) + self.assertEquals(num_boxes_output, expected_num_boxes) + + def test_transpose_coordinates(self): + boxes = [[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]] + boxes = box_list.BoxList(tf.constant(boxes)) + boxes.transpose_coordinates() + expected_corners = [[10.0, 10.0, 15.0, 20.0], [0.1, 0.2, 0.4, 0.5]] + with self.test_session() as sess: + corners_out = sess.run(boxes.get()) + self.assertAllClose(corners_out, expected_corners) + + def test_box_list_invalid_inputs(self): + data0 = tf.constant([[[0, 0, 1, 1], [3, 4, 5, 5]]], tf.float32) + data1 = tf.constant([[0, 0, 1], [1, 1, 2], [3, 4, 5]], tf.float32) + data2 = tf.constant([[0, 0, 1], [1, 1, 2], [3, 4, 5]], tf.int32) + + with self.assertRaises(ValueError): + _ = box_list.BoxList(data0) + with self.assertRaises(ValueError): + _ = box_list.BoxList(data1) + with self.assertRaises(ValueError): + _ = box_list.BoxList(data2) + + def 
test_num_boxes_static(self): + box_corners = [[10.0, 10.0, 20.0, 15.0], [0.2, 0.1, 0.5, 0.4]] + boxes = box_list.BoxList(tf.constant(box_corners)) + self.assertEquals(boxes.num_boxes_static(), 2) + self.assertEquals(type(boxes.num_boxes_static()), int) + + def test_num_boxes_static_for_uninferrable_shape(self): + placeholder = tf.placeholder(tf.float32, shape=[None, 4]) + boxes = box_list.BoxList(placeholder) + self.assertEquals(boxes.num_boxes_static(), None) + + def test_as_tensor_dict(self): + boxlist = box_list.BoxList( + tf.constant([[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5]], tf.float32)) + boxlist.add_field('classes', tf.constant([0, 1])) + boxlist.add_field('scores', tf.constant([0.75, 0.2])) + tensor_dict = boxlist.as_tensor_dict() + + expected_boxes = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5]] + expected_classes = [0, 1] + expected_scores = [0.75, 0.2] + + with self.test_session() as sess: + tensor_dict_out = sess.run(tensor_dict) + self.assertAllEqual(3, len(tensor_dict_out)) + self.assertAllClose(expected_boxes, tensor_dict_out['boxes']) + self.assertAllEqual(expected_classes, tensor_dict_out['classes']) + self.assertAllClose(expected_scores, tensor_dict_out['scores']) + + def test_as_tensor_dict_with_features(self): + boxlist = box_list.BoxList( + tf.constant([[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5]], tf.float32)) + boxlist.add_field('classes', tf.constant([0, 1])) + boxlist.add_field('scores', tf.constant([0.75, 0.2])) + tensor_dict = boxlist.as_tensor_dict(['boxes', 'classes', 'scores']) + + expected_boxes = [[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5]] + expected_classes = [0, 1] + expected_scores = [0.75, 0.2] + + with self.test_session() as sess: + tensor_dict_out = sess.run(tensor_dict) + self.assertAllEqual(3, len(tensor_dict_out)) + self.assertAllClose(expected_boxes, tensor_dict_out['boxes']) + self.assertAllEqual(expected_classes, tensor_dict_out['classes']) + self.assertAllClose(expected_scores, tensor_dict_out['scores']) + + def test_as_tensor_dict_missing_field(self): + boxlist = box_list.BoxList( + tf.constant([[0.1, 0.1, 0.4, 0.4], [0.1, 0.1, 0.5, 0.5]], tf.float32)) + boxlist.add_field('classes', tf.constant([0, 1])) + boxlist.add_field('scores', tf.constant([0.75, 0.2])) + with self.assertRaises(ValueError): + boxlist.as_tensor_dict(['foo', 'bar']) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_predictor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..78d8242372549fecbdd5442fda2a520850308972 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_predictor.py @@ -0,0 +1,963 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Box predictor for object detectors. 
+ +Box predictors are classes that take a high level +image feature map as input and produce two predictions, +(1) a tensor encoding box locations, and +(2) a tensor encoding classes for each box. + +These components are passed directly to loss functions +in our detection models. + +These modules are separated from the main model since the same +few box predictor architectures are shared across many models. +""" +from abc import abstractmethod +import math +import tensorflow as tf +from object_detection.utils import ops +from object_detection.utils import shape_utils +from object_detection.utils import static_shape + +slim = tf.contrib.slim + +BOX_ENCODINGS = 'box_encodings' +CLASS_PREDICTIONS_WITH_BACKGROUND = 'class_predictions_with_background' +MASK_PREDICTIONS = 'mask_predictions' + + +class BoxPredictor(object): + """BoxPredictor.""" + + def __init__(self, is_training, num_classes): + """Constructor. + + Args: + is_training: Indicates whether the BoxPredictor is in training mode. + num_classes: number of classes. Note that num_classes *does not* + include the background category, so if groundtruth labels take values + in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the + assigned classification targets can range from {0,... K}). + """ + self._is_training = is_training + self._num_classes = num_classes + + @property + def num_classes(self): + return self._num_classes + + def predict(self, image_features, num_predictions_per_location, + scope=None, **params): + """Computes encoded object locations and corresponding confidences. + + Takes a list of high level image feature maps as input and produces a list + of box encodings and a list of class scores where each element in the output + lists correspond to the feature maps in the input list. + + Args: + image_features: A list of float tensors of shape [batch_size, height_i, + width_i, channels_i] containing features for a batch of images. + num_predictions_per_location: A list of integers representing the number + of box predictions to be made per spatial location for each feature map. + scope: Variable and Op scope name. + **params: Additional keyword arguments for specific implementations of + BoxPredictor. + + Returns: + A dictionary containing at least the following tensors. + box_encodings: A list of float tensors. Each entry in the list + corresponds to a feature map in the input `image_features` list. All + tensors in the list have one of the two following shapes: + a. [batch_size, num_anchors_i, q, code_size] representing the location + of the objects, where q is 1 or the number of classes. + b. [batch_size, num_anchors_i, code_size]. + class_predictions_with_background: A list of float tensors of shape + [batch_size, num_anchors_i, num_classes + 1] representing the class + predictions for the proposals. Each entry in the list corresponds to a + feature map in the input `image_features` list. + + Raises: + ValueError: If length of `image_features` is not equal to length of + `num_predictions_per_location`. + """ + if len(image_features) != len(num_predictions_per_location): + raise ValueError('image_feature and num_predictions_per_location must ' + 'be of same length, found: {} vs {}'. 
+ format(len(image_features), + len(num_predictions_per_location))) + if scope is not None: + with tf.variable_scope(scope): + return self._predict(image_features, num_predictions_per_location, + **params) + return self._predict(image_features, num_predictions_per_location, + **params) + + # TODO(rathodv): num_predictions_per_location could be moved to constructor. + # This is currently only used by ConvolutionalBoxPredictor. + @abstractmethod + def _predict(self, image_features, num_predictions_per_location, **params): + """Implementations must override this method. + + Args: + image_features: A list of float tensors of shape [batch_size, height_i, + width_i, channels_i] containing features for a batch of images. + num_predictions_per_location: A list of integers representing the number + of box predictions to be made per spatial location for each feature map. + **params: Additional keyword arguments for specific implementations of + BoxPredictor. + + Returns: + A dictionary containing at least the following tensors. + box_encodings: A list of float tensors. Each entry in the list + corresponds to a feature map in the input `image_features` list. All + tensors in the list have one of the two following shapes: + a. [batch_size, num_anchors_i, q, code_size] representing the location + of the objects, where q is 1 or the number of classes. + b. [batch_size, num_anchors_i, code_size]. + class_predictions_with_background: A list of float tensors of shape + [batch_size, num_anchors_i, num_classes + 1] representing the class + predictions for the proposals. Each entry in the list corresponds to a + feature map in the input `image_features` list. + """ + pass + + +class RfcnBoxPredictor(BoxPredictor): + """RFCN Box Predictor. + + Applies a position sensitive ROI pooling on position sensitive feature maps to + predict classes and refined locations. See https://arxiv.org/abs/1605.06409 + for details. + + This is used for the second stage of the RFCN meta architecture. Notice that + locations are *not* shared across classes, thus for each anchor, a separate + prediction is made for each class. + """ + + def __init__(self, + is_training, + num_classes, + conv_hyperparams_fn, + num_spatial_bins, + depth, + crop_size, + box_code_size): + """Constructor. + + Args: + is_training: Indicates whether the BoxPredictor is in training mode. + num_classes: number of classes. Note that num_classes *does not* + include the background category, so if groundtruth labels take values + in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the + assigned classification targets can range from {0,... K}). + conv_hyperparams_fn: A function to construct tf-slim arg_scope with + hyperparameters for convolutional layers. + num_spatial_bins: A list of two integers `[spatial_bins_y, + spatial_bins_x]`. + depth: Target depth to reduce the input feature maps to. + crop_size: A list of two integers `[crop_height, crop_width]`. + box_code_size: Size of encoding for each box. + """ + super(RfcnBoxPredictor, self).__init__(is_training, num_classes) + self._conv_hyperparams_fn = conv_hyperparams_fn + self._num_spatial_bins = num_spatial_bins + self._depth = depth + self._crop_size = crop_size + self._box_code_size = box_code_size + + @property + def num_classes(self): + return self._num_classes + + def _predict(self, image_features, num_predictions_per_location, + proposal_boxes): + """Computes encoded object locations and corresponding confidences. 
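+
+    The incoming feature map is first reduced to `depth` channels with a 1x1
+    convolution; position-sensitive score maps for box locations and class
+    scores are then predicted and pooled per proposal box.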
+ + Args: + image_features: A list of float tensors of shape [batch_size, height_i, + width_i, channels_i] containing features for a batch of images. + num_predictions_per_location: A list of integers representing the number + of box predictions to be made per spatial location for each feature map. + Currently, this must be set to [1], or an error will be raised. + proposal_boxes: A float tensor of shape [batch_size, num_proposals, + box_code_size]. + + Returns: + box_encodings: A list of float tensors of shape + [batch_size, num_anchors_i, q, code_size] representing the location of + the objects, where q is 1 or the number of classes. Each entry in the + list corresponds to a feature map in the input `image_features` list. + class_predictions_with_background: A list of float tensors of shape + [batch_size, num_anchors_i, num_classes + 1] representing the class + predictions for the proposals. Each entry in the list corresponds to a + feature map in the input `image_features` list. + + Raises: + ValueError: if num_predictions_per_location is not 1 or if + len(image_features) is not 1. + """ + if (len(num_predictions_per_location) != 1 or + num_predictions_per_location[0] != 1): + raise ValueError('Currently RfcnBoxPredictor only supports ' + 'predicting a single box per class per location.') + if len(image_features) != 1: + raise ValueError('length of `image_features` must be 1. Found {}'. + format(len(image_features))) + image_feature = image_features[0] + num_predictions_per_location = num_predictions_per_location[0] + batch_size = tf.shape(proposal_boxes)[0] + num_boxes = tf.shape(proposal_boxes)[1] + def get_box_indices(proposals): + proposals_shape = proposals.get_shape().as_list() + if any(dim is None for dim in proposals_shape): + proposals_shape = tf.shape(proposals) + ones_mat = tf.ones(proposals_shape[:2], dtype=tf.int32) + multiplier = tf.expand_dims( + tf.range(start=0, limit=proposals_shape[0]), 1) + return tf.reshape(ones_mat * multiplier, [-1]) + + net = image_feature + with slim.arg_scope(self._conv_hyperparams_fn()): + net = slim.conv2d(net, self._depth, [1, 1], scope='reduce_depth') + # Location predictions. + location_feature_map_depth = (self._num_spatial_bins[0] * + self._num_spatial_bins[1] * + self.num_classes * + self._box_code_size) + location_feature_map = slim.conv2d(net, location_feature_map_depth, + [1, 1], activation_fn=None, + scope='refined_locations') + box_encodings = ops.position_sensitive_crop_regions( + location_feature_map, + boxes=tf.reshape(proposal_boxes, [-1, self._box_code_size]), + box_ind=get_box_indices(proposal_boxes), + crop_size=self._crop_size, + num_spatial_bins=self._num_spatial_bins, + global_pool=True) + box_encodings = tf.squeeze(box_encodings, squeeze_dims=[1, 2]) + box_encodings = tf.reshape(box_encodings, + [batch_size * num_boxes, 1, self.num_classes, + self._box_code_size]) + + # Class predictions. + total_classes = self.num_classes + 1 # Account for background class. 
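+      # Position-sensitive class score maps use one channel per (spatial bin,
+      # class incl. background) pair; e.g. 3x3 bins with 90 classes give
+      # 3 * 3 * 91 = 819 channels, which are then pooled per proposal below.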
+ class_feature_map_depth = (self._num_spatial_bins[0] * + self._num_spatial_bins[1] * + total_classes) + class_feature_map = slim.conv2d(net, class_feature_map_depth, [1, 1], + activation_fn=None, + scope='class_predictions') + class_predictions_with_background = ops.position_sensitive_crop_regions( + class_feature_map, + boxes=tf.reshape(proposal_boxes, [-1, self._box_code_size]), + box_ind=get_box_indices(proposal_boxes), + crop_size=self._crop_size, + num_spatial_bins=self._num_spatial_bins, + global_pool=True) + class_predictions_with_background = tf.squeeze( + class_predictions_with_background, squeeze_dims=[1, 2]) + class_predictions_with_background = tf.reshape( + class_predictions_with_background, + [batch_size * num_boxes, 1, total_classes]) + + return {BOX_ENCODINGS: [box_encodings], + CLASS_PREDICTIONS_WITH_BACKGROUND: + [class_predictions_with_background]} + + +# TODO(rathodv): Change the implementation to return lists of predictions. +class MaskRCNNBoxPredictor(BoxPredictor): + """Mask R-CNN Box Predictor. + + See Mask R-CNN: He, K., Gkioxari, G., Dollar, P., & Girshick, R. (2017). + Mask R-CNN. arXiv preprint arXiv:1703.06870. + + This is used for the second stage of the Mask R-CNN detector where proposals + cropped from an image are arranged along the batch dimension of the input + image_features tensor. Notice that locations are *not* shared across classes, + thus for each anchor, a separate prediction is made for each class. + + In addition to predicting boxes and classes, optionally this class allows + predicting masks and/or keypoints inside detection boxes. + + Currently this box predictor makes per-class predictions; that is, each + anchor makes a separate box prediction for each class. + """ + + def __init__(self, + is_training, + num_classes, + fc_hyperparams_fn, + use_dropout, + dropout_keep_prob, + box_code_size, + conv_hyperparams_fn=None, + predict_instance_masks=False, + mask_height=14, + mask_width=14, + mask_prediction_num_conv_layers=2, + mask_prediction_conv_depth=256, + masks_are_class_agnostic=False, + predict_keypoints=False, + share_box_across_classes=False): + """Constructor. + + Args: + is_training: Indicates whether the BoxPredictor is in training mode. + num_classes: number of classes. Note that num_classes *does not* + include the background category, so if groundtruth labels take values + in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the + assigned classification targets can range from {0,... K}). + fc_hyperparams_fn: A function to generate tf-slim arg_scope with + hyperparameters for fully connected ops. + use_dropout: Option to use dropout or not. Note that a single dropout + op is applied here prior to both box and class predictions, which stands + in contrast to the ConvolutionalBoxPredictor below. + dropout_keep_prob: Keep probability for dropout. + This is only used if use_dropout is True. + box_code_size: Size of encoding for each box. + conv_hyperparams_fn: A function to generate tf-slim arg_scope with + hyperparameters for convolution ops. + predict_instance_masks: Whether to predict object masks inside detection + boxes. + mask_height: Desired output mask height. The default value is 14. + mask_width: Desired output mask width. The default value is 14. + mask_prediction_num_conv_layers: Number of convolution layers applied to + the image_features in mask prediction branch. + mask_prediction_conv_depth: The depth for the first conv2d_transpose op + applied to the image_features in the mask prediction branch. 
If set + to 0, the depth of the convolution layers will be automatically chosen + based on the number of object classes and the number of channels in the + image features. + masks_are_class_agnostic: Boolean determining if the mask-head is + class-agnostic or not. + predict_keypoints: Whether to predict keypoints insde detection boxes. + share_box_across_classes: Whether to share boxes across classes rather + than use a different box for each class. + + Raises: + ValueError: If predict_instance_masks is true but conv_hyperparams is not + set. + ValueError: If predict_keypoints is true since it is not implemented yet. + ValueError: If mask_prediction_num_conv_layers is smaller than two. + """ + super(MaskRCNNBoxPredictor, self).__init__(is_training, num_classes) + self._fc_hyperparams_fn = fc_hyperparams_fn + self._use_dropout = use_dropout + self._box_code_size = box_code_size + self._dropout_keep_prob = dropout_keep_prob + self._conv_hyperparams_fn = conv_hyperparams_fn + self._predict_instance_masks = predict_instance_masks + self._mask_height = mask_height + self._mask_width = mask_width + self._mask_prediction_num_conv_layers = mask_prediction_num_conv_layers + self._mask_prediction_conv_depth = mask_prediction_conv_depth + self._masks_are_class_agnostic = masks_are_class_agnostic + self._predict_keypoints = predict_keypoints + self._share_box_across_classes = share_box_across_classes + if self._predict_keypoints: + raise ValueError('Keypoint prediction is unimplemented.') + if ((self._predict_instance_masks or self._predict_keypoints) and + self._conv_hyperparams_fn is None): + raise ValueError('`conv_hyperparams` must be provided when predicting ' + 'masks.') + if self._mask_prediction_num_conv_layers < 2: + raise ValueError( + 'Mask prediction should consist of at least 2 conv layers') + + @property + def num_classes(self): + return self._num_classes + + @property + def predicts_instance_masks(self): + return self._predict_instance_masks + + def _predict_boxes_and_classes(self, image_features): + """Predicts boxes and class scores. + + Args: + image_features: A float tensor of shape [batch_size, height, width, + channels] containing features for a batch of images. + + Returns: + box_encodings: A float tensor of shape + [batch_size, 1, num_classes, code_size] representing the location of the + objects. + class_predictions_with_background: A float tensor of shape + [batch_size, 1, num_classes + 1] representing the class predictions for + the proposals. 
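+
+    For example, with num_classes=5, box_code_size=4 and
+    share_box_across_classes=False, box_encodings has shape
+    [batch_size, 1, 5, 4] and class_predictions_with_background has shape
+    [batch_size, 1, 6]; sharing boxes across classes collapses the third
+    dimension of box_encodings to 1.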
+ """ + spatial_averaged_image_features = tf.reduce_mean(image_features, [1, 2], + keep_dims=True, + name='AvgPool') + flattened_image_features = slim.flatten(spatial_averaged_image_features) + if self._use_dropout: + flattened_image_features = slim.dropout(flattened_image_features, + keep_prob=self._dropout_keep_prob, + is_training=self._is_training) + number_of_boxes = 1 + if not self._share_box_across_classes: + number_of_boxes = self._num_classes + + with slim.arg_scope(self._fc_hyperparams_fn()): + box_encodings = slim.fully_connected( + flattened_image_features, + number_of_boxes * self._box_code_size, + activation_fn=None, + scope='BoxEncodingPredictor') + class_predictions_with_background = slim.fully_connected( + flattened_image_features, + self._num_classes + 1, + activation_fn=None, + scope='ClassPredictor') + box_encodings = tf.reshape( + box_encodings, [-1, 1, number_of_boxes, self._box_code_size]) + class_predictions_with_background = tf.reshape( + class_predictions_with_background, [-1, 1, self._num_classes + 1]) + return box_encodings, class_predictions_with_background + + def _get_mask_predictor_conv_depth(self, num_feature_channels, num_classes, + class_weight=3.0, feature_weight=2.0): + """Computes the depth of the mask predictor convolutions. + + Computes the depth of the mask predictor convolutions given feature channels + and number of classes by performing a weighted average of the two in + log space to compute the number of convolution channels. The weights that + are used for computing the weighted average do not need to sum to 1. + + Args: + num_feature_channels: An integer containing the number of feature + channels. + num_classes: An integer containing the number of classes. + class_weight: Class weight used in computing the weighted average. + feature_weight: Feature weight used in computing the weighted average. + + Returns: + An integer containing the number of convolution channels used by mask + predictor. + """ + num_feature_channels_log = math.log(float(num_feature_channels), 2.0) + num_classes_log = math.log(float(num_classes), 2.0) + weighted_num_feature_channels_log = ( + num_feature_channels_log * feature_weight) + weighted_num_classes_log = num_classes_log * class_weight + total_weight = feature_weight + class_weight + num_conv_channels_log = round( + (weighted_num_feature_channels_log + weighted_num_classes_log) / + total_weight) + return int(math.pow(2.0, num_conv_channels_log)) + + def _predict_masks(self, image_features): + """Performs mask prediction. + + Args: + image_features: A float tensor of shape [batch_size, height, width, + channels] containing features for a batch of images. + + Returns: + instance_masks: A float tensor of shape + [batch_size, 1, num_classes, image_height, image_width]. 
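+
+    For example, with mask_height=mask_width=14 and num_classes=5 this is
+    [batch_size, 1, 5, 14, 14]; when masks_are_class_agnostic is True the
+    class dimension collapses to 1.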
+ """ + num_conv_channels = self._mask_prediction_conv_depth + if num_conv_channels == 0: + num_feature_channels = image_features.get_shape().as_list()[3] + num_conv_channels = self._get_mask_predictor_conv_depth( + num_feature_channels, self.num_classes) + with slim.arg_scope(self._conv_hyperparams_fn()): + upsampled_features = tf.image.resize_bilinear( + image_features, + [self._mask_height, self._mask_width], + align_corners=True) + for _ in range(self._mask_prediction_num_conv_layers - 1): + upsampled_features = slim.conv2d( + upsampled_features, + num_outputs=num_conv_channels, + kernel_size=[3, 3]) + num_masks = 1 if self._masks_are_class_agnostic else self.num_classes + mask_predictions = slim.conv2d(upsampled_features, + num_outputs=num_masks, + activation_fn=None, + kernel_size=[3, 3]) + return tf.expand_dims( + tf.transpose(mask_predictions, perm=[0, 3, 1, 2]), + axis=1, + name='MaskPredictor') + + def _predict(self, image_features, num_predictions_per_location, + predict_boxes_and_classes=True, predict_auxiliary_outputs=False): + """Optionally computes encoded object locations, confidences, and masks. + + Flattens image_features and applies fully connected ops (with no + non-linearity) to predict box encodings and class predictions. In this + setting, anchors are not spatially arranged in any way and are assumed to + have been folded into the batch dimension. Thus we output 1 for the + anchors dimension. + + Also optionally predicts instance masks. + The mask prediction head is based on the Mask RCNN paper with the following + modifications: We replace the deconvolution layer with a bilinear resize + and a convolution. + + Args: + image_features: A list of float tensors of shape [batch_size, height_i, + width_i, channels_i] containing features for a batch of images. + num_predictions_per_location: A list of integers representing the number + of box predictions to be made per spatial location for each feature map. + Currently, this must be set to [1], or an error will be raised. + predict_boxes_and_classes: If true, the function will perform box + refinement and classification. + predict_auxiliary_outputs: If true, the function will perform other + predictions such as mask, keypoint, boundaries, etc. if any. + + Returns: + A dictionary containing the following tensors. + box_encodings: A float tensor of shape + [batch_size, 1, num_classes, code_size] representing the + location of the objects. + class_predictions_with_background: A float tensor of shape + [batch_size, 1, num_classes + 1] representing the class + predictions for the proposals. + If predict_masks is True the dictionary also contains: + instance_masks: A float tensor of shape + [batch_size, 1, num_classes, image_height, image_width] + If predict_keypoints is True the dictionary also contains: + keypoints: [batch_size, 1, num_keypoints, 2] + + Raises: + ValueError: If num_predictions_per_location is not 1 or if both + predict_boxes_and_classes and predict_auxiliary_outputs are false or if + len(image_features) is not 1. + """ + if (len(num_predictions_per_location) != 1 or + num_predictions_per_location[0] != 1): + raise ValueError('Currently FullyConnectedBoxPredictor only supports ' + 'predicting a single box per class per location.') + if not predict_boxes_and_classes and not predict_auxiliary_outputs: + raise ValueError('Should perform at least one prediction.') + if len(image_features) != 1: + raise ValueError('length of `image_features` must be 1. Found {}'. 
+ format(len(image_features))) + image_feature = image_features[0] + num_predictions_per_location = num_predictions_per_location[0] + predictions_dict = {} + + if predict_boxes_and_classes: + (box_encodings, class_predictions_with_background + ) = self._predict_boxes_and_classes(image_feature) + predictions_dict[BOX_ENCODINGS] = box_encodings + predictions_dict[ + CLASS_PREDICTIONS_WITH_BACKGROUND] = class_predictions_with_background + + if self._predict_instance_masks and predict_auxiliary_outputs: + predictions_dict[MASK_PREDICTIONS] = self._predict_masks(image_feature) + + return predictions_dict + + +class _NoopVariableScope(object): + """A dummy class that does not push any scope.""" + + def __enter__(self): + return None + + def __exit__(self, exc_type, exc_value, traceback): + return False + + +class ConvolutionalBoxPredictor(BoxPredictor): + """Convolutional Box Predictor. + + Optionally add an intermediate 1x1 convolutional layer after features and + predict in parallel branches box_encodings and + class_predictions_with_background. + + Currently this box predictor assumes that predictions are "shared" across + classes --- that is each anchor makes box predictions which do not depend + on class. + """ + + def __init__(self, + is_training, + num_classes, + conv_hyperparams_fn, + min_depth, + max_depth, + num_layers_before_predictor, + use_dropout, + dropout_keep_prob, + kernel_size, + box_code_size, + apply_sigmoid_to_scores=False, + class_prediction_bias_init=0.0, + use_depthwise=False): + """Constructor. + + Args: + is_training: Indicates whether the BoxPredictor is in training mode. + num_classes: number of classes. Note that num_classes *does not* + include the background category, so if groundtruth labels take values + in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the + assigned classification targets can range from {0,... K}). + conv_hyperparams_fn: A function to generate tf-slim arg_scope with + hyperparameters for convolution ops. + min_depth: Minimum feature depth prior to predicting box encodings + and class predictions. + max_depth: Maximum feature depth prior to predicting box encodings + and class predictions. If max_depth is set to 0, no additional + feature map will be inserted before location and class predictions. + num_layers_before_predictor: Number of the additional conv layers before + the predictor. + use_dropout: Option to use dropout for class prediction or not. + dropout_keep_prob: Keep probability for dropout. + This is only used if use_dropout is True. + kernel_size: Size of final convolution kernel. If the + spatial resolution of the feature map is smaller than the kernel size, + then the kernel size is automatically set to be + min(feature_width, feature_height). + box_code_size: Size of encoding for each box. + apply_sigmoid_to_scores: if True, apply the sigmoid on the output + class_predictions. + class_prediction_bias_init: constant value to initialize bias of the last + conv2d layer before class prediction. + use_depthwise: Whether to use depthwise convolutions for prediction + steps. Default is False. + + Raises: + ValueError: if min_depth > max_depth. 
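+
+    Note: the depth of the optional intermediate conv layers is the incoming
+    feature depth clamped to [min_depth, max_depth]; for example, a
+    256-channel feature map with min_depth=16 and max_depth=64 yields
+    64-channel layers (added only when num_layers_before_predictor > 0).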
+ """ + super(ConvolutionalBoxPredictor, self).__init__(is_training, num_classes) + if min_depth > max_depth: + raise ValueError('min_depth should be less than or equal to max_depth') + self._conv_hyperparams_fn = conv_hyperparams_fn + self._min_depth = min_depth + self._max_depth = max_depth + self._num_layers_before_predictor = num_layers_before_predictor + self._use_dropout = use_dropout + self._kernel_size = kernel_size + self._box_code_size = box_code_size + self._dropout_keep_prob = dropout_keep_prob + self._apply_sigmoid_to_scores = apply_sigmoid_to_scores + self._class_prediction_bias_init = class_prediction_bias_init + self._use_depthwise = use_depthwise + + def _predict(self, image_features, num_predictions_per_location_list): + """Computes encoded object locations and corresponding confidences. + + Args: + image_features: A list of float tensors of shape [batch_size, height_i, + width_i, channels_i] containing features for a batch of images. + num_predictions_per_location_list: A list of integers representing the + number of box predictions to be made per spatial location for each + feature map. + + Returns: + box_encodings: A list of float tensors of shape + [batch_size, num_anchors_i, q, code_size] representing the location of + the objects, where q is 1 or the number of classes. Each entry in the + list corresponds to a feature map in the input `image_features` list. + class_predictions_with_background: A list of float tensors of shape + [batch_size, num_anchors_i, num_classes + 1] representing the class + predictions for the proposals. Each entry in the list corresponds to a + feature map in the input `image_features` list. + """ + box_encodings_list = [] + class_predictions_list = [] + # TODO(rathodv): Come up with a better way to generate scope names + # in box predictor once we have time to retrain all models in the zoo. + # The following lines create scope names to be backwards compatible with the + # existing checkpoints. + box_predictor_scopes = [_NoopVariableScope()] + if len(image_features) > 1: + box_predictor_scopes = [ + tf.variable_scope('BoxPredictor_{}'.format(i)) + for i in range(len(image_features)) + ] + + for (image_feature, + num_predictions_per_location, box_predictor_scope) in zip( + image_features, num_predictions_per_location_list, + box_predictor_scopes): + with box_predictor_scope: + # Add a slot for the background class. + num_class_slots = self.num_classes + 1 + net = image_feature + with slim.arg_scope(self._conv_hyperparams_fn()), \ + slim.arg_scope([slim.dropout], is_training=self._is_training): + # Add additional conv layers before the class predictor. + features_depth = static_shape.get_depth(image_feature.get_shape()) + depth = max(min(features_depth, self._max_depth), self._min_depth) + tf.logging.info('depth of additional conv before box predictor: {}'. 
+ format(depth)) + if depth > 0 and self._num_layers_before_predictor > 0: + for i in range(self._num_layers_before_predictor): + net = slim.conv2d( + net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) + with slim.arg_scope([slim.conv2d], activation_fn=None, + normalizer_fn=None, normalizer_params=None): + if self._use_depthwise: + box_encodings = slim.separable_conv2d( + net, None, [self._kernel_size, self._kernel_size], + padding='SAME', depth_multiplier=1, stride=1, + rate=1, scope='BoxEncodingPredictor_depthwise') + box_encodings = slim.conv2d( + box_encodings, + num_predictions_per_location * self._box_code_size, [1, 1], + scope='BoxEncodingPredictor') + else: + box_encodings = slim.conv2d( + net, num_predictions_per_location * self._box_code_size, + [self._kernel_size, self._kernel_size], + scope='BoxEncodingPredictor') + if self._use_dropout: + net = slim.dropout(net, keep_prob=self._dropout_keep_prob) + if self._use_depthwise: + class_predictions_with_background = slim.separable_conv2d( + net, None, [self._kernel_size, self._kernel_size], + padding='SAME', depth_multiplier=1, stride=1, + rate=1, scope='ClassPredictor_depthwise') + class_predictions_with_background = slim.conv2d( + class_predictions_with_background, + num_predictions_per_location * num_class_slots, + [1, 1], scope='ClassPredictor') + else: + class_predictions_with_background = slim.conv2d( + net, num_predictions_per_location * num_class_slots, + [self._kernel_size, self._kernel_size], + scope='ClassPredictor', + biases_initializer=tf.constant_initializer( + self._class_prediction_bias_init)) + if self._apply_sigmoid_to_scores: + class_predictions_with_background = tf.sigmoid( + class_predictions_with_background) + + combined_feature_map_shape = (shape_utils. + combined_static_and_dynamic_shape( + image_feature)) + box_encodings = tf.reshape( + box_encodings, tf.stack([combined_feature_map_shape[0], + combined_feature_map_shape[1] * + combined_feature_map_shape[2] * + num_predictions_per_location, + 1, self._box_code_size])) + box_encodings_list.append(box_encodings) + class_predictions_with_background = tf.reshape( + class_predictions_with_background, + tf.stack([combined_feature_map_shape[0], + combined_feature_map_shape[1] * + combined_feature_map_shape[2] * + num_predictions_per_location, + num_class_slots])) + class_predictions_list.append(class_predictions_with_background) + return { + BOX_ENCODINGS: box_encodings_list, + CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_list + } + + +# TODO(rathodv): Replace with slim.arg_scope_func_key once its available +# externally. +def _arg_scope_func_key(op): + """Returns a key that can be used to index arg_scope dictionary.""" + return getattr(op, '_key_op', str(op)) + + +# TODO(rathodv): Merge the implementation with ConvolutionalBoxPredictor above +# since they are very similar. +class WeightSharedConvolutionalBoxPredictor(BoxPredictor): + """Convolutional Box Predictor with weight sharing. + + Defines the box predictor as defined in + https://arxiv.org/abs/1708.02002. This class differs from + ConvolutionalBoxPredictor in that it shares weights and biases while + predicting from different feature maps. However, batch_norm parameters are not + shared because the statistics of the activations vary among the different + feature maps. + + Also note that separate multi-layer towers are constructed for the box + encoding and class predictors respectively. 
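+
+  Weight sharing is achieved by building every feature map's prediction tower
+  inside a single reused variable scope, while each BatchNorm op is given a
+  per-feature-map scope so its statistics remain separate.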
+ """ + + def __init__(self, + is_training, + num_classes, + conv_hyperparams_fn, + depth, + num_layers_before_predictor, + box_code_size, + kernel_size=3, + class_prediction_bias_init=0.0, + use_dropout=False, + dropout_keep_prob=0.8): + """Constructor. + + Args: + is_training: Indicates whether the BoxPredictor is in training mode. + num_classes: number of classes. Note that num_classes *does not* + include the background category, so if groundtruth labels take values + in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the + assigned classification targets can range from {0,... K}). + conv_hyperparams_fn: A function to generate tf-slim arg_scope with + hyperparameters for convolution ops. + depth: depth of conv layers. + num_layers_before_predictor: Number of the additional conv layers before + the predictor. + box_code_size: Size of encoding for each box. + kernel_size: Size of final convolution kernel. + class_prediction_bias_init: constant value to initialize bias of the last + conv2d layer before class prediction. + use_dropout: Whether to apply dropout to class prediction head. + dropout_keep_prob: Probability of keeping activiations. + """ + super(WeightSharedConvolutionalBoxPredictor, self).__init__(is_training, + num_classes) + self._conv_hyperparams_fn = conv_hyperparams_fn + self._depth = depth + self._num_layers_before_predictor = num_layers_before_predictor + self._box_code_size = box_code_size + self._kernel_size = kernel_size + self._class_prediction_bias_init = class_prediction_bias_init + self._use_dropout = use_dropout + self._dropout_keep_prob = dropout_keep_prob + + def _predict(self, image_features, num_predictions_per_location_list): + """Computes encoded object locations and corresponding confidences. + + Args: + image_features: A list of float tensors of shape [batch_size, height_i, + width_i, channels] containing features for a batch of images. Note that + all tensors in the list must have the same number of channels. + num_predictions_per_location_list: A list of integers representing the + number of box predictions to be made per spatial location for each + feature map. Note that all values must be the same since the weights are + shared. + + Returns: + box_encodings: A list of float tensors of shape + [batch_size, num_anchors_i, code_size] representing the location of + the objects. Each entry in the list corresponds to a feature map in the + input `image_features` list. + class_predictions_with_background: A list of float tensors of shape + [batch_size, num_anchors_i, num_classes + 1] representing the class + predictions for the proposals. Each entry in the list corresponds to a + feature map in the input `image_features` list. + + + Raises: + ValueError: If the image feature maps do not have the same number of + channels or if the num predictions per locations is differs between the + feature maps. 
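+
+    For example, two feature maps of spatial size 32x32 and 16x16 with 6
+    anchors per location and num_classes=90 produce box_encodings of shapes
+    [batch_size, 6144, code_size] and [batch_size, 1536, code_size], and
+    class predictions of shapes [batch_size, 6144, 91] and
+    [batch_size, 1536, 91].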
+ """ + if len(set(num_predictions_per_location_list)) > 1: + raise ValueError('num predictions per location must be same for all' + 'feature maps, found: {}'.format( + num_predictions_per_location_list)) + feature_channels = [ + image_feature.shape[3].value for image_feature in image_features + ] + if len(set(feature_channels)) > 1: + raise ValueError('all feature maps must have the same number of ' + 'channels, found: {}'.format(feature_channels)) + box_encodings_list = [] + class_predictions_list = [] + for feature_index, (image_feature, + num_predictions_per_location) in enumerate( + zip(image_features, + num_predictions_per_location_list)): + # Add a slot for the background class. + with tf.variable_scope('WeightSharedConvolutionalBoxPredictor', + reuse=tf.AUTO_REUSE): + num_class_slots = self.num_classes + 1 + box_encodings_net = image_feature + class_predictions_net = image_feature + with slim.arg_scope(self._conv_hyperparams_fn()) as sc: + apply_batch_norm = _arg_scope_func_key(slim.batch_norm) in sc + for i in range(self._num_layers_before_predictor): + box_encodings_net = slim.conv2d( + box_encodings_net, + self._depth, + [self._kernel_size, self._kernel_size], + stride=1, + padding='SAME', + activation_fn=None, + normalizer_fn=(tf.identity if apply_batch_norm else None), + scope='BoxPredictionTower/conv2d_{}'.format(i)) + if apply_batch_norm: + box_encodings_net = slim.batch_norm( + box_encodings_net, + scope='BoxPredictionTower/conv2d_{}/BatchNorm/feature_{}'. + format(i, feature_index)) + box_encodings_net = tf.nn.relu6(box_encodings_net) + box_encodings = slim.conv2d( + box_encodings_net, + num_predictions_per_location * self._box_code_size, + [self._kernel_size, self._kernel_size], + activation_fn=None, stride=1, padding='SAME', + normalizer_fn=None, + scope='BoxPredictor') + + for i in range(self._num_layers_before_predictor): + class_predictions_net = slim.conv2d( + class_predictions_net, + self._depth, + [self._kernel_size, self._kernel_size], + stride=1, + padding='SAME', + activation_fn=None, + normalizer_fn=(tf.identity if apply_batch_norm else None), + scope='ClassPredictionTower/conv2d_{}'.format(i)) + if apply_batch_norm: + class_predictions_net = slim.batch_norm( + class_predictions_net, + scope='ClassPredictionTower/conv2d_{}/BatchNorm/feature_{}' + .format(i, feature_index)) + class_predictions_net = tf.nn.relu6(class_predictions_net) + if self._use_dropout: + class_predictions_net = slim.dropout( + class_predictions_net, keep_prob=self._dropout_keep_prob) + class_predictions_with_background = slim.conv2d( + class_predictions_net, + num_predictions_per_location * num_class_slots, + [self._kernel_size, self._kernel_size], + activation_fn=None, stride=1, padding='SAME', + normalizer_fn=None, + biases_initializer=tf.constant_initializer( + self._class_prediction_bias_init), + scope='ClassPredictor') + + combined_feature_map_shape = (shape_utils. 
+ combined_static_and_dynamic_shape( + image_feature)) + box_encodings = tf.reshape( + box_encodings, tf.stack([combined_feature_map_shape[0], + combined_feature_map_shape[1] * + combined_feature_map_shape[2] * + num_predictions_per_location, + self._box_code_size])) + box_encodings_list.append(box_encodings) + class_predictions_with_background = tf.reshape( + class_predictions_with_background, + tf.stack([combined_feature_map_shape[0], + combined_feature_map_shape[1] * + combined_feature_map_shape[2] * + num_predictions_per_location, + num_class_slots])) + class_predictions_list.append(class_predictions_with_background) + return { + BOX_ENCODINGS: box_encodings_list, + CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_list + } diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_predictor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_predictor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..49680596f8d607ba52c0791a49df67a5d5ebe293 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/box_predictor_test.py @@ -0,0 +1,724 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for object_detection.core.box_predictor.""" +import numpy as np +import tensorflow as tf + +from google.protobuf import text_format +from object_detection.builders import hyperparams_builder +from object_detection.core import box_predictor +from object_detection.protos import hyperparams_pb2 +from object_detection.utils import test_case + + +class MaskRCNNBoxPredictorTest(tf.test.TestCase): + + def _build_arg_scope_with_hyperparams(self, + op_type=hyperparams_pb2.Hyperparams.FC): + hyperparams = hyperparams_pb2.Hyperparams() + hyperparams_text_proto = """ + activation: NONE + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + """ + text_format.Merge(hyperparams_text_proto, hyperparams) + hyperparams.op = op_type + return hyperparams_builder.build(hyperparams, is_training=True) + + def test_get_boxes_with_five_classes(self): + image_features = tf.random_uniform([2, 7, 7, 3], dtype=tf.float32) + mask_box_predictor = box_predictor.MaskRCNNBoxPredictor( + is_training=False, + num_classes=5, + fc_hyperparams_fn=self._build_arg_scope_with_hyperparams(), + use_dropout=False, + dropout_keep_prob=0.5, + box_code_size=4, + ) + box_predictions = mask_box_predictor.predict( + [image_features], num_predictions_per_location=[1], + scope='BoxPredictor') + box_encodings = box_predictions[box_predictor.BOX_ENCODINGS] + class_predictions_with_background = box_predictions[ + box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND] + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + (box_encodings_shape, + class_predictions_with_background_shape) = sess.run( + [tf.shape(box_encodings), + tf.shape(class_predictions_with_background)]) + self.assertAllEqual(box_encodings_shape, [2, 1, 5, 4]) + self.assertAllEqual(class_predictions_with_background_shape, [2, 1, 6]) + + def test_get_boxes_with_five_classes_share_box_across_classes(self): + image_features = tf.random_uniform([2, 7, 7, 3], dtype=tf.float32) + mask_box_predictor = box_predictor.MaskRCNNBoxPredictor( + is_training=False, + num_classes=5, + fc_hyperparams_fn=self._build_arg_scope_with_hyperparams(), + use_dropout=False, + dropout_keep_prob=0.5, + box_code_size=4, + share_box_across_classes=True + ) + box_predictions = mask_box_predictor.predict( + [image_features], num_predictions_per_location=[1], + scope='BoxPredictor') + box_encodings = box_predictions[box_predictor.BOX_ENCODINGS] + class_predictions_with_background = box_predictions[ + box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND] + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + (box_encodings_shape, + class_predictions_with_background_shape) = sess.run( + [tf.shape(box_encodings), + tf.shape(class_predictions_with_background)]) + self.assertAllEqual(box_encodings_shape, [2, 1, 1, 4]) + self.assertAllEqual(class_predictions_with_background_shape, [2, 1, 6]) + + def test_value_error_on_predict_instance_masks_with_no_conv_hyperparms(self): + with self.assertRaises(ValueError): + box_predictor.MaskRCNNBoxPredictor( + is_training=False, + num_classes=5, + fc_hyperparams_fn=self._build_arg_scope_with_hyperparams(), + use_dropout=False, + dropout_keep_prob=0.5, + box_code_size=4, + predict_instance_masks=True) + + def test_get_instance_masks(self): + image_features = tf.random_uniform([2, 7, 7, 3], dtype=tf.float32) + mask_box_predictor = 
box_predictor.MaskRCNNBoxPredictor( + is_training=False, + num_classes=5, + fc_hyperparams_fn=self._build_arg_scope_with_hyperparams(), + use_dropout=False, + dropout_keep_prob=0.5, + box_code_size=4, + conv_hyperparams_fn=self._build_arg_scope_with_hyperparams( + op_type=hyperparams_pb2.Hyperparams.CONV), + predict_instance_masks=True) + box_predictions = mask_box_predictor.predict( + [image_features], + num_predictions_per_location=[1], + scope='BoxPredictor', + predict_boxes_and_classes=True, + predict_auxiliary_outputs=True) + mask_predictions = box_predictions[box_predictor.MASK_PREDICTIONS] + self.assertListEqual([2, 1, 5, 14, 14], + mask_predictions.get_shape().as_list()) + + def test_do_not_return_instance_masks_without_request(self): + image_features = tf.random_uniform([2, 7, 7, 3], dtype=tf.float32) + mask_box_predictor = box_predictor.MaskRCNNBoxPredictor( + is_training=False, + num_classes=5, + fc_hyperparams_fn=self._build_arg_scope_with_hyperparams(), + use_dropout=False, + dropout_keep_prob=0.5, + box_code_size=4) + box_predictions = mask_box_predictor.predict( + [image_features], num_predictions_per_location=[1], + scope='BoxPredictor') + self.assertEqual(len(box_predictions), 2) + self.assertTrue(box_predictor.BOX_ENCODINGS in box_predictions) + self.assertTrue(box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND + in box_predictions) + + def test_value_error_on_predict_keypoints(self): + with self.assertRaises(ValueError): + box_predictor.MaskRCNNBoxPredictor( + is_training=False, + num_classes=5, + fc_hyperparams_fn=self._build_arg_scope_with_hyperparams(), + use_dropout=False, + dropout_keep_prob=0.5, + box_code_size=4, + predict_keypoints=True) + + +class RfcnBoxPredictorTest(tf.test.TestCase): + + def _build_arg_scope_with_conv_hyperparams(self): + conv_hyperparams = hyperparams_pb2.Hyperparams() + conv_hyperparams_text_proto = """ + regularizer { + l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + """ + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams) + return hyperparams_builder.build(conv_hyperparams, is_training=True) + + def test_get_correct_box_encoding_and_class_prediction_shapes(self): + image_features = tf.random_uniform([4, 8, 8, 64], dtype=tf.float32) + proposal_boxes = tf.random_normal([4, 2, 4], dtype=tf.float32) + rfcn_box_predictor = box_predictor.RfcnBoxPredictor( + is_training=False, + num_classes=2, + conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(), + num_spatial_bins=[3, 3], + depth=4, + crop_size=[12, 12], + box_code_size=4 + ) + box_predictions = rfcn_box_predictor.predict( + [image_features], num_predictions_per_location=[1], + scope='BoxPredictor', + proposal_boxes=proposal_boxes) + box_encodings = tf.concat( + box_predictions[box_predictor.BOX_ENCODINGS], axis=1) + class_predictions_with_background = tf.concat( + box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], + axis=1) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + (box_encodings_shape, + class_predictions_shape) = sess.run( + [tf.shape(box_encodings), + tf.shape(class_predictions_with_background)]) + self.assertAllEqual(box_encodings_shape, [8, 1, 2, 4]) + self.assertAllEqual(class_predictions_shape, [8, 1, 3]) + + +class ConvolutionalBoxPredictorTest(test_case.TestCase): + + def _build_arg_scope_with_conv_hyperparams(self): + conv_hyperparams = hyperparams_pb2.Hyperparams() + conv_hyperparams_text_proto = """ + activation: RELU_6 + regularizer { + 
l2_regularizer { + } + } + initializer { + truncated_normal_initializer { + } + } + """ + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams) + return hyperparams_builder.build(conv_hyperparams, is_training=True) + + def test_get_boxes_for_five_aspect_ratios_per_location(self): + def graph_fn(image_features): + conv_box_predictor = box_predictor.ConvolutionalBoxPredictor( + is_training=False, + num_classes=0, + conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(), + min_depth=0, + max_depth=32, + num_layers_before_predictor=1, + use_dropout=True, + dropout_keep_prob=0.8, + kernel_size=1, + box_code_size=4 + ) + box_predictions = conv_box_predictor.predict( + [image_features], num_predictions_per_location=[5], + scope='BoxPredictor') + box_encodings = tf.concat( + box_predictions[box_predictor.BOX_ENCODINGS], axis=1) + objectness_predictions = tf.concat( + box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], + axis=1) + return (box_encodings, objectness_predictions) + image_features = np.random.rand(4, 8, 8, 64).astype(np.float32) + (box_encodings, objectness_predictions) = self.execute(graph_fn, + [image_features]) + self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4]) + self.assertAllEqual(objectness_predictions.shape, [4, 320, 1]) + + def test_get_boxes_for_one_aspect_ratio_per_location(self): + def graph_fn(image_features): + conv_box_predictor = box_predictor.ConvolutionalBoxPredictor( + is_training=False, + num_classes=0, + conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(), + min_depth=0, + max_depth=32, + num_layers_before_predictor=1, + use_dropout=True, + dropout_keep_prob=0.8, + kernel_size=1, + box_code_size=4 + ) + box_predictions = conv_box_predictor.predict( + [image_features], num_predictions_per_location=[1], + scope='BoxPredictor') + box_encodings = tf.concat( + box_predictions[box_predictor.BOX_ENCODINGS], axis=1) + objectness_predictions = tf.concat(box_predictions[ + box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1) + return (box_encodings, objectness_predictions) + image_features = np.random.rand(4, 8, 8, 64).astype(np.float32) + (box_encodings, objectness_predictions) = self.execute(graph_fn, + [image_features]) + self.assertAllEqual(box_encodings.shape, [4, 64, 1, 4]) + self.assertAllEqual(objectness_predictions.shape, [4, 64, 1]) + + def test_get_multi_class_predictions_for_five_aspect_ratios_per_location( + self): + num_classes_without_background = 6 + image_features = np.random.rand(4, 8, 8, 64).astype(np.float32) + def graph_fn(image_features): + conv_box_predictor = box_predictor.ConvolutionalBoxPredictor( + is_training=False, + num_classes=num_classes_without_background, + conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(), + min_depth=0, + max_depth=32, + num_layers_before_predictor=1, + use_dropout=True, + dropout_keep_prob=0.8, + kernel_size=1, + box_code_size=4 + ) + box_predictions = conv_box_predictor.predict( + [image_features], + num_predictions_per_location=[5], + scope='BoxPredictor') + box_encodings = tf.concat( + box_predictions[box_predictor.BOX_ENCODINGS], axis=1) + class_predictions_with_background = tf.concat( + box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], + axis=1) + return (box_encodings, class_predictions_with_background) + (box_encodings, + class_predictions_with_background) = self.execute(graph_fn, + [image_features]) + self.assertAllEqual(box_encodings.shape, [4, 320, 1, 4]) + self.assertAllEqual(class_predictions_with_background.shape, + 
[4, 320, num_classes_without_background+1]) + + def test_get_predictions_with_feature_maps_of_dynamic_shape( + self): + image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64]) + conv_box_predictor = box_predictor.ConvolutionalBoxPredictor( + is_training=False, + num_classes=0, + conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(), + min_depth=0, + max_depth=32, + num_layers_before_predictor=1, + use_dropout=True, + dropout_keep_prob=0.8, + kernel_size=1, + box_code_size=4 + ) + box_predictions = conv_box_predictor.predict( + [image_features], num_predictions_per_location=[5], + scope='BoxPredictor') + box_encodings = tf.concat( + box_predictions[box_predictor.BOX_ENCODINGS], axis=1) + objectness_predictions = tf.concat( + box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], + axis=1) + init_op = tf.global_variables_initializer() + + resolution = 32 + expected_num_anchors = resolution*resolution*5 + with self.test_session() as sess: + sess.run(init_op) + (box_encodings_shape, + objectness_predictions_shape) = sess.run( + [tf.shape(box_encodings), tf.shape(objectness_predictions)], + feed_dict={image_features: + np.random.rand(4, resolution, resolution, 64)}) + actual_variable_set = set( + [var.op.name for var in tf.trainable_variables()]) + self.assertAllEqual(box_encodings_shape, [4, expected_num_anchors, 1, 4]) + self.assertAllEqual(objectness_predictions_shape, + [4, expected_num_anchors, 1]) + expected_variable_set = set([ + 'BoxPredictor/Conv2d_0_1x1_32/biases', + 'BoxPredictor/Conv2d_0_1x1_32/weights', + 'BoxPredictor/BoxEncodingPredictor/biases', + 'BoxPredictor/BoxEncodingPredictor/weights', + 'BoxPredictor/ClassPredictor/biases', + 'BoxPredictor/ClassPredictor/weights']) + self.assertEqual(expected_variable_set, actual_variable_set) + + def test_use_depthwise_convolution(self): + image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64]) + conv_box_predictor = box_predictor.ConvolutionalBoxPredictor( + is_training=False, + num_classes=0, + conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(), + min_depth=0, + max_depth=32, + num_layers_before_predictor=1, + dropout_keep_prob=0.8, + kernel_size=1, + box_code_size=4, + use_dropout=True, + use_depthwise=True + ) + box_predictions = conv_box_predictor.predict( + [image_features], num_predictions_per_location=[5], + scope='BoxPredictor') + box_encodings = tf.concat( + box_predictions[box_predictor.BOX_ENCODINGS], axis=1) + objectness_predictions = tf.concat( + box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], + axis=1) + init_op = tf.global_variables_initializer() + + resolution = 32 + expected_num_anchors = resolution*resolution*5 + with self.test_session() as sess: + sess.run(init_op) + (box_encodings_shape, + objectness_predictions_shape) = sess.run( + [tf.shape(box_encodings), tf.shape(objectness_predictions)], + feed_dict={image_features: + np.random.rand(4, resolution, resolution, 64)}) + actual_variable_set = set( + [var.op.name for var in tf.trainable_variables()]) + self.assertAllEqual(box_encodings_shape, [4, expected_num_anchors, 1, 4]) + self.assertAllEqual(objectness_predictions_shape, + [4, expected_num_anchors, 1]) + expected_variable_set = set([ + 'BoxPredictor/Conv2d_0_1x1_32/biases', + 'BoxPredictor/Conv2d_0_1x1_32/weights', + 'BoxPredictor/BoxEncodingPredictor_depthwise/biases', + 'BoxPredictor/BoxEncodingPredictor_depthwise/depthwise_weights', + 'BoxPredictor/BoxEncodingPredictor/biases', + 
'BoxPredictor/BoxEncodingPredictor/weights', + 'BoxPredictor/ClassPredictor_depthwise/biases', + 'BoxPredictor/ClassPredictor_depthwise/depthwise_weights', + 'BoxPredictor/ClassPredictor/biases', + 'BoxPredictor/ClassPredictor/weights']) + self.assertEqual(expected_variable_set, actual_variable_set) + + +class WeightSharedConvolutionalBoxPredictorTest(test_case.TestCase): + + def _build_arg_scope_with_conv_hyperparams(self): + conv_hyperparams = hyperparams_pb2.Hyperparams() + conv_hyperparams_text_proto = """ + activation: RELU_6 + regularizer { + l2_regularizer { + } + } + initializer { + random_normal_initializer { + stddev: 0.01 + mean: 0.0 + } + } + batch_norm { + train: true, + } + """ + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams) + return hyperparams_builder.build(conv_hyperparams, is_training=True) + + def _build_conv_arg_scope_no_batch_norm(self): + conv_hyperparams = hyperparams_pb2.Hyperparams() + conv_hyperparams_text_proto = """ + activation: RELU_6 + regularizer { + l2_regularizer { + } + } + initializer { + random_normal_initializer { + stddev: 0.01 + mean: 0.0 + } + } + """ + text_format.Merge(conv_hyperparams_text_proto, conv_hyperparams) + return hyperparams_builder.build(conv_hyperparams, is_training=True) + + def test_get_boxes_for_five_aspect_ratios_per_location(self): + + def graph_fn(image_features): + conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor( + is_training=False, + num_classes=0, + conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(), + depth=32, + num_layers_before_predictor=1, + box_code_size=4) + box_predictions = conv_box_predictor.predict( + [image_features], num_predictions_per_location=[5], + scope='BoxPredictor') + box_encodings = tf.concat( + box_predictions[box_predictor.BOX_ENCODINGS], axis=1) + objectness_predictions = tf.concat(box_predictions[ + box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1) + return (box_encodings, objectness_predictions) + image_features = np.random.rand(4, 8, 8, 64).astype(np.float32) + (box_encodings, objectness_predictions) = self.execute( + graph_fn, [image_features]) + self.assertAllEqual(box_encodings.shape, [4, 320, 4]) + self.assertAllEqual(objectness_predictions.shape, [4, 320, 1]) + + def test_bias_predictions_to_background_with_sigmoid_score_conversion(self): + + def graph_fn(image_features): + conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor( + is_training=True, + num_classes=2, + conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(), + depth=32, + num_layers_before_predictor=1, + class_prediction_bias_init=-4.6, + box_code_size=4) + box_predictions = conv_box_predictor.predict( + [image_features], num_predictions_per_location=[5], + scope='BoxPredictor') + class_predictions = tf.concat(box_predictions[ + box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1) + return (tf.nn.sigmoid(class_predictions),) + image_features = np.random.rand(4, 8, 8, 64).astype(np.float32) + class_predictions = self.execute(graph_fn, [image_features]) + self.assertAlmostEqual(np.mean(class_predictions), 0.01, places=3) + + def test_get_multi_class_predictions_for_five_aspect_ratios_per_location( + self): + + num_classes_without_background = 6 + def graph_fn(image_features): + conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor( + is_training=False, + num_classes=num_classes_without_background, + conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(), + depth=32, + num_layers_before_predictor=1, + 
box_code_size=4) + box_predictions = conv_box_predictor.predict( + [image_features], + num_predictions_per_location=[5], + scope='BoxPredictor') + box_encodings = tf.concat( + box_predictions[box_predictor.BOX_ENCODINGS], axis=1) + class_predictions_with_background = tf.concat(box_predictions[ + box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1) + return (box_encodings, class_predictions_with_background) + + image_features = np.random.rand(4, 8, 8, 64).astype(np.float32) + (box_encodings, class_predictions_with_background) = self.execute( + graph_fn, [image_features]) + self.assertAllEqual(box_encodings.shape, [4, 320, 4]) + self.assertAllEqual(class_predictions_with_background.shape, + [4, 320, num_classes_without_background+1]) + + def test_get_multi_class_predictions_from_two_feature_maps( + self): + + num_classes_without_background = 6 + def graph_fn(image_features1, image_features2): + conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor( + is_training=False, + num_classes=num_classes_without_background, + conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(), + depth=32, + num_layers_before_predictor=1, + box_code_size=4) + box_predictions = conv_box_predictor.predict( + [image_features1, image_features2], + num_predictions_per_location=[5, 5], + scope='BoxPredictor') + box_encodings = tf.concat( + box_predictions[box_predictor.BOX_ENCODINGS], axis=1) + class_predictions_with_background = tf.concat( + box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], + axis=1) + return (box_encodings, class_predictions_with_background) + + image_features1 = np.random.rand(4, 8, 8, 64).astype(np.float32) + image_features2 = np.random.rand(4, 8, 8, 64).astype(np.float32) + (box_encodings, class_predictions_with_background) = self.execute( + graph_fn, [image_features1, image_features2]) + self.assertAllEqual(box_encodings.shape, [4, 640, 4]) + self.assertAllEqual(class_predictions_with_background.shape, + [4, 640, num_classes_without_background+1]) + + def test_predictions_from_multiple_feature_maps_share_weights_not_batchnorm( + self): + num_classes_without_background = 6 + def graph_fn(image_features1, image_features2): + conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor( + is_training=False, + num_classes=num_classes_without_background, + conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(), + depth=32, + num_layers_before_predictor=2, + box_code_size=4) + box_predictions = conv_box_predictor.predict( + [image_features1, image_features2], + num_predictions_per_location=[5, 5], + scope='BoxPredictor') + box_encodings = tf.concat( + box_predictions[box_predictor.BOX_ENCODINGS], axis=1) + class_predictions_with_background = tf.concat( + box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], + axis=1) + return (box_encodings, class_predictions_with_background) + + with self.test_session(graph=tf.Graph()): + graph_fn(tf.random_uniform([4, 32, 32, 3], dtype=tf.float32), + tf.random_uniform([4, 16, 16, 3], dtype=tf.float32)) + actual_variable_set = set( + [var.op.name for var in tf.trainable_variables()]) + expected_variable_set = set([ + # Box prediction tower + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictionTower/conv2d_0/weights'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictionTower/conv2d_0/BatchNorm/feature_0/beta'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictionTower/conv2d_0/BatchNorm/feature_1/beta'), + 
('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictionTower/conv2d_1/weights'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictionTower/conv2d_1/BatchNorm/feature_0/beta'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictionTower/conv2d_1/BatchNorm/feature_1/beta'), + # Box prediction head + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictor/weights'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictor/biases'), + # Class prediction tower + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictionTower/conv2d_0/weights'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictionTower/conv2d_0/BatchNorm/feature_0/beta'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictionTower/conv2d_0/BatchNorm/feature_1/beta'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictionTower/conv2d_1/weights'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictionTower/conv2d_1/BatchNorm/feature_0/beta'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictionTower/conv2d_1/BatchNorm/feature_1/beta'), + # Class prediction head + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictor/weights'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictor/biases')]) + self.assertEqual(expected_variable_set, actual_variable_set) + + def test_no_batchnorm_params_when_batchnorm_is_not_configured(self): + num_classes_without_background = 6 + def graph_fn(image_features1, image_features2): + conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor( + is_training=False, + num_classes=num_classes_without_background, + conv_hyperparams_fn=self._build_conv_arg_scope_no_batch_norm(), + depth=32, + num_layers_before_predictor=2, + box_code_size=4) + box_predictions = conv_box_predictor.predict( + [image_features1, image_features2], + num_predictions_per_location=[5, 5], + scope='BoxPredictor') + box_encodings = tf.concat( + box_predictions[box_predictor.BOX_ENCODINGS], axis=1) + class_predictions_with_background = tf.concat( + box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], + axis=1) + return (box_encodings, class_predictions_with_background) + + with self.test_session(graph=tf.Graph()): + graph_fn(tf.random_uniform([4, 32, 32, 3], dtype=tf.float32), + tf.random_uniform([4, 16, 16, 3], dtype=tf.float32)) + actual_variable_set = set( + [var.op.name for var in tf.trainable_variables()]) + expected_variable_set = set([ + # Box prediction tower + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictionTower/conv2d_0/weights'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictionTower/conv2d_0/biases'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictionTower/conv2d_1/weights'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictionTower/conv2d_1/biases'), + # Box prediction head + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictor/weights'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'BoxPredictor/biases'), + # Class prediction tower + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictionTower/conv2d_0/weights'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictionTower/conv2d_0/biases'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictionTower/conv2d_1/weights'), 
+ ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictionTower/conv2d_1/biases'), + # Class prediction head + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictor/weights'), + ('BoxPredictor/WeightSharedConvolutionalBoxPredictor/' + 'ClassPredictor/biases')]) + self.assertEqual(expected_variable_set, actual_variable_set) + + def test_get_predictions_with_feature_maps_of_dynamic_shape( + self): + image_features = tf.placeholder(dtype=tf.float32, shape=[4, None, None, 64]) + conv_box_predictor = box_predictor.WeightSharedConvolutionalBoxPredictor( + is_training=False, + num_classes=0, + conv_hyperparams_fn=self._build_arg_scope_with_conv_hyperparams(), + depth=32, + num_layers_before_predictor=1, + box_code_size=4) + box_predictions = conv_box_predictor.predict( + [image_features], num_predictions_per_location=[5], + scope='BoxPredictor') + box_encodings = tf.concat(box_predictions[box_predictor.BOX_ENCODINGS], + axis=1) + objectness_predictions = tf.concat(box_predictions[ + box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], axis=1) + init_op = tf.global_variables_initializer() + + resolution = 32 + expected_num_anchors = resolution*resolution*5 + with self.test_session() as sess: + sess.run(init_op) + (box_encodings_shape, + objectness_predictions_shape) = sess.run( + [tf.shape(box_encodings), tf.shape(objectness_predictions)], + feed_dict={image_features: + np.random.rand(4, resolution, resolution, 64)}) + self.assertAllEqual(box_encodings_shape, [4, expected_num_anchors, 4]) + self.assertAllEqual(objectness_predictions_shape, + [4, expected_num_anchors, 1]) + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/data_decoder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/data_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..9ae18c1f957ea69432b08740451abb2af2548910 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/data_decoder.py @@ -0,0 +1,41 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Interface for data decoders. + +Data decoders decode the input data and return a dictionary of tensors keyed by +the entries in core.reader.Fields. +""" +from abc import ABCMeta +from abc import abstractmethod + + +class DataDecoder(object): + """Interface for data decoders.""" + __metaclass__ = ABCMeta + + @abstractmethod + def decode(self, data): + """Return a single image and associated labels. + + Args: + data: a string tensor holding a serialized protocol buffer corresponding + to data for a single image. + + Returns: + tensor_dict: a dictionary containing tensors. Possible keys are defined in + reader.Fields. 
+ """ + pass diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/data_parser.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/data_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..3dac4de28ec52da5697e0b2fee81a56ebb72e35c --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/data_parser.py @@ -0,0 +1,41 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Interface for data parsers. + +Data parser parses input data and returns a dictionary of numpy arrays +keyed by the entries in standard_fields.py. Since the parser parses records +to numpy arrays (materialized tensors) directly, it is used to read data for +evaluation/visualization; to parse the data during training, DataDecoder should +be used. +""" +from abc import ABCMeta +from abc import abstractmethod + + +class DataToNumpyParser(object): + __metaclass__ = ABCMeta + + @abstractmethod + def parse(self, input_data): + """Parses input and returns a numpy array or a dictionary of numpy arrays. + + Args: + input_data: an input data + + Returns: + A numpy array or a dictionary of numpy arrays or None, if input + cannot be parsed. + """ + pass diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/keypoint_ops.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/keypoint_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..e520845f92f10faf39c419c321c696e871f4558c --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/keypoint_ops.py @@ -0,0 +1,282 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Keypoint operations. + +Keypoints are represented as tensors of shape [num_instances, num_keypoints, 2], +where the last dimension holds rank 2 tensors of the form [y, x] representing +the coordinates of the keypoint. +""" +import numpy as np +import tensorflow as tf + + +def scale(keypoints, y_scale, x_scale, scope=None): + """Scales keypoint coordinates in x and y dimensions. + + Args: + keypoints: a tensor of shape [num_instances, num_keypoints, 2] + y_scale: (float) scalar tensor + x_scale: (float) scalar tensor + scope: name scope. 
+ + Returns: + new_keypoints: a tensor of shape [num_instances, num_keypoints, 2] + """ + with tf.name_scope(scope, 'Scale'): + y_scale = tf.cast(y_scale, tf.float32) + x_scale = tf.cast(x_scale, tf.float32) + new_keypoints = keypoints * [[[y_scale, x_scale]]] + return new_keypoints + + +def clip_to_window(keypoints, window, scope=None): + """Clips keypoints to a window. + + This op clips any input keypoints to a window. + + Args: + keypoints: a tensor of shape [num_instances, num_keypoints, 2] + window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max] + window to which the op should clip the keypoints. + scope: name scope. + + Returns: + new_keypoints: a tensor of shape [num_instances, num_keypoints, 2] + """ + with tf.name_scope(scope, 'ClipToWindow'): + y, x = tf.split(value=keypoints, num_or_size_splits=2, axis=2) + win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window) + y = tf.maximum(tf.minimum(y, win_y_max), win_y_min) + x = tf.maximum(tf.minimum(x, win_x_max), win_x_min) + new_keypoints = tf.concat([y, x], 2) + return new_keypoints + + +def prune_outside_window(keypoints, window, scope=None): + """Prunes keypoints that fall outside a given window. + + This function replaces keypoints that fall outside the given window with nan. + See also clip_to_window which clips any keypoints that fall outside the given + window. + + Args: + keypoints: a tensor of shape [num_instances, num_keypoints, 2] + window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max] + window outside of which the op should prune the keypoints. + scope: name scope. + + Returns: + new_keypoints: a tensor of shape [num_instances, num_keypoints, 2] + """ + with tf.name_scope(scope, 'PruneOutsideWindow'): + y, x = tf.split(value=keypoints, num_or_size_splits=2, axis=2) + win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window) + + valid_indices = tf.logical_and( + tf.logical_and(y >= win_y_min, y <= win_y_max), + tf.logical_and(x >= win_x_min, x <= win_x_max)) + + new_y = tf.where(valid_indices, y, np.nan * tf.ones_like(y)) + new_x = tf.where(valid_indices, x, np.nan * tf.ones_like(x)) + new_keypoints = tf.concat([new_y, new_x], 2) + + return new_keypoints + + +def change_coordinate_frame(keypoints, window, scope=None): + """Changes coordinate frame of the keypoints to be relative to window's frame. + + Given a window of the form [y_min, x_min, y_max, x_max], changes keypoint + coordinates from keypoints of shape [num_instances, num_keypoints, 2] + to be relative to this window. + + An example use case is data augmentation: where we are given groundtruth + keypoints and would like to randomly crop the image to some window. In this + case we need to change the coordinate frame of each groundtruth keypoint to be + relative to this new window. + + Args: + keypoints: a tensor of shape [num_instances, num_keypoints, 2] + window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max] + window we should change the coordinate frame to. + scope: name scope. + + Returns: + new_keypoints: a tensor of shape [num_instances, num_keypoints, 2] + """ + with tf.name_scope(scope, 'ChangeCoordinateFrame'): + win_height = window[2] - window[0] + win_width = window[3] - window[1] + new_keypoints = scale(keypoints - [window[0], window[1]], 1.0 / win_height, + 1.0 / win_width) + return new_keypoints + + +def to_normalized_coordinates(keypoints, height, width, + check_range=True, scope=None): + """Converts absolute keypoint coordinates to normalized coordinates in [0, 1]. 
+ + Usually one uses the dynamic shape of the image or conv-layer tensor: + keypoints = keypoint_ops.to_normalized_coordinates(keypoints, + tf.shape(images)[1], + tf.shape(images)[2]), + + This function raises an assertion failed error at graph execution time when + the maximum coordinate is smaller than 1.01 (which means that coordinates are + already normalized). The value 1.01 is to deal with small rounding errors. + + Args: + keypoints: A tensor of shape [num_instances, num_keypoints, 2]. + height: Maximum value for y coordinate of absolute keypoint coordinates. + width: Maximum value for x coordinate of absolute keypoint coordinates. + check_range: If True, checks if the coordinates are normalized. + scope: name scope. + + Returns: + tensor of shape [num_instances, num_keypoints, 2] with normalized + coordinates in [0, 1]. + """ + with tf.name_scope(scope, 'ToNormalizedCoordinates'): + height = tf.cast(height, tf.float32) + width = tf.cast(width, tf.float32) + + if check_range: + max_val = tf.reduce_max(keypoints) + max_assert = tf.Assert(tf.greater(max_val, 1.01), + ['max value is lower than 1.01: ', max_val]) + with tf.control_dependencies([max_assert]): + width = tf.identity(width) + + return scale(keypoints, 1.0 / height, 1.0 / width) + + +def to_absolute_coordinates(keypoints, height, width, + check_range=True, scope=None): + """Converts normalized keypoint coordinates to absolute pixel coordinates. + + This function raises an assertion failed error when the maximum keypoint + coordinate value is larger than 1.01 (in which case coordinates are already + absolute). + + Args: + keypoints: A tensor of shape [num_instances, num_keypoints, 2] + height: Maximum value for y coordinate of absolute keypoint coordinates. + width: Maximum value for x coordinate of absolute keypoint coordinates. + check_range: If True, checks if the coordinates are normalized or not. + scope: name scope. + + Returns: + tensor of shape [num_instances, num_keypoints, 2] with absolute coordinates + in terms of the image size. + + """ + with tf.name_scope(scope, 'ToAbsoluteCoordinates'): + height = tf.cast(height, tf.float32) + width = tf.cast(width, tf.float32) + + # Ensure range of input keypoints is correct. + if check_range: + max_val = tf.reduce_max(keypoints) + max_assert = tf.Assert(tf.greater_equal(1.01, max_val), + ['maximum keypoint coordinate value is larger ' + 'than 1.01: ', max_val]) + with tf.control_dependencies([max_assert]): + width = tf.identity(width) + + return scale(keypoints, height, width) + + +def flip_horizontal(keypoints, flip_point, flip_permutation, scope=None): + """Flips the keypoints horizontally around the flip_point. + + This operation flips the x coordinate for each keypoint around the flip_point + and also permutes the keypoints in a manner specified by flip_permutation. + + Args: + keypoints: a tensor of shape [num_instances, num_keypoints, 2] + flip_point: (float) scalar tensor representing the x coordinate to flip the + keypoints around. + flip_permutation: rank 1 int32 tensor containing the keypoint flip + permutation. This specifies the mapping from original keypoint indices + to the flipped keypoint indices. This is used primarily for keypoints + that are not reflection invariant. E.g. Suppose there are 3 keypoints + representing ['head', 'right_eye', 'left_eye'], then a logical choice for + flip_permutation might be [0, 2, 1] since we want to swap the 'left_eye' + and 'right_eye' after a horizontal flip. + scope: name scope. 
+ + Returns: + new_keypoints: a tensor of shape [num_instances, num_keypoints, 2] + """ + with tf.name_scope(scope, 'FlipHorizontal'): + keypoints = tf.transpose(keypoints, [1, 0, 2]) + keypoints = tf.gather(keypoints, flip_permutation) + v, u = tf.split(value=keypoints, num_or_size_splits=2, axis=2) + u = flip_point * 2.0 - u + new_keypoints = tf.concat([v, u], 2) + new_keypoints = tf.transpose(new_keypoints, [1, 0, 2]) + return new_keypoints + + +def flip_vertical(keypoints, flip_point, flip_permutation, scope=None): + """Flips the keypoints vertically around the flip_point. + + This operation flips the y coordinate for each keypoint around the flip_point + and also permutes the keypoints in a manner specified by flip_permutation. + + Args: + keypoints: a tensor of shape [num_instances, num_keypoints, 2] + flip_point: (float) scalar tensor representing the y coordinate to flip the + keypoints around. + flip_permutation: rank 1 int32 tensor containing the keypoint flip + permutation. This specifies the mapping from original keypoint indices + to the flipped keypoint indices. This is used primarily for keypoints + that are not reflection invariant. E.g. Suppose there are 3 keypoints + representing ['head', 'right_eye', 'left_eye'], then a logical choice for + flip_permutation might be [0, 2, 1] since we want to swap the 'left_eye' + and 'right_eye' after a horizontal flip. + scope: name scope. + + Returns: + new_keypoints: a tensor of shape [num_instances, num_keypoints, 2] + """ + with tf.name_scope(scope, 'FlipVertical'): + keypoints = tf.transpose(keypoints, [1, 0, 2]) + keypoints = tf.gather(keypoints, flip_permutation) + v, u = tf.split(value=keypoints, num_or_size_splits=2, axis=2) + v = flip_point * 2.0 - v + new_keypoints = tf.concat([v, u], 2) + new_keypoints = tf.transpose(new_keypoints, [1, 0, 2]) + return new_keypoints + + +def rot90(keypoints, scope=None): + """Rotates the keypoints counter-clockwise by 90 degrees. + + Args: + keypoints: a tensor of shape [num_instances, num_keypoints, 2] + scope: name scope. + + Returns: + new_keypoints: a tensor of shape [num_instances, num_keypoints, 2] + """ + with tf.name_scope(scope, 'Rot90'): + keypoints = tf.transpose(keypoints, [1, 0, 2]) + v, u = tf.split(value=keypoints[:, :, ::-1], num_or_size_splits=2, axis=2) + v = 1.0 - v + new_keypoints = tf.concat([v, u], 2) + new_keypoints = tf.transpose(new_keypoints, [1, 0, 2]) + return new_keypoints diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/keypoint_ops_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/keypoint_ops_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1c09c55aa2c834e566dd8d6cd57b9a254bf26efe --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/keypoint_ops_test.py @@ -0,0 +1,200 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
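+
+# An illustrative sketch of the keypoint_ops API exercised by the tests below
+# (editor's example, not part of the original file; the values are taken from
+# test_to_normalized_coordinates and test_flip_horizontal):
+#
+#   import tensorflow as tf
+#   from object_detection.core import keypoint_ops
+#
+#   keypoints = tf.constant([[[10., 30.], [30., 45.]],
+#                            [[20., 0.], [40., 60.]]])  # [instances, kpts, 2]
+#   normalized = keypoint_ops.to_normalized_coordinates(keypoints, 40, 60)
+#   # -> [[[0.25, 0.5], [0.75, 0.75]], [[0.5, 0.0], [1.0, 1.0]]]
+#   flipped = keypoint_ops.flip_horizontal(normalized, 0.5,
+#                                          flip_permutation=[0, 1])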
+# ============================================================================== + +"""Tests for object_detection.core.keypoint_ops.""" +import numpy as np +import tensorflow as tf + +from object_detection.core import keypoint_ops + + +class KeypointOpsTest(tf.test.TestCase): + """Tests for common keypoint operations.""" + + def test_scale(self): + keypoints = tf.constant([ + [[0.0, 0.0], [100.0, 200.0]], + [[50.0, 120.0], [100.0, 140.0]] + ]) + y_scale = tf.constant(1.0 / 100) + x_scale = tf.constant(1.0 / 200) + + expected_keypoints = tf.constant([ + [[0., 0.], [1.0, 1.0]], + [[0.5, 0.6], [1.0, 0.7]] + ]) + output = keypoint_ops.scale(keypoints, y_scale, x_scale) + + with self.test_session() as sess: + output_, expected_keypoints_ = sess.run([output, expected_keypoints]) + self.assertAllClose(output_, expected_keypoints_) + + def test_clip_to_window(self): + keypoints = tf.constant([ + [[0.25, 0.5], [0.75, 0.75]], + [[0.5, 0.0], [1.0, 1.0]] + ]) + window = tf.constant([0.25, 0.25, 0.75, 0.75]) + + expected_keypoints = tf.constant([ + [[0.25, 0.5], [0.75, 0.75]], + [[0.5, 0.25], [0.75, 0.75]] + ]) + output = keypoint_ops.clip_to_window(keypoints, window) + + with self.test_session() as sess: + output_, expected_keypoints_ = sess.run([output, expected_keypoints]) + self.assertAllClose(output_, expected_keypoints_) + + def test_prune_outside_window(self): + keypoints = tf.constant([ + [[0.25, 0.5], [0.75, 0.75]], + [[0.5, 0.0], [1.0, 1.0]] + ]) + window = tf.constant([0.25, 0.25, 0.75, 0.75]) + + expected_keypoints = tf.constant([[[0.25, 0.5], [0.75, 0.75]], + [[np.nan, np.nan], [np.nan, np.nan]]]) + output = keypoint_ops.prune_outside_window(keypoints, window) + + with self.test_session() as sess: + output_, expected_keypoints_ = sess.run([output, expected_keypoints]) + self.assertAllClose(output_, expected_keypoints_) + + def test_change_coordinate_frame(self): + keypoints = tf.constant([ + [[0.25, 0.5], [0.75, 0.75]], + [[0.5, 0.0], [1.0, 1.0]] + ]) + window = tf.constant([0.25, 0.25, 0.75, 0.75]) + + expected_keypoints = tf.constant([ + [[0, 0.5], [1.0, 1.0]], + [[0.5, -0.5], [1.5, 1.5]] + ]) + output = keypoint_ops.change_coordinate_frame(keypoints, window) + + with self.test_session() as sess: + output_, expected_keypoints_ = sess.run([output, expected_keypoints]) + self.assertAllClose(output_, expected_keypoints_) + + def test_to_normalized_coordinates(self): + keypoints = tf.constant([ + [[10., 30.], [30., 45.]], + [[20., 0.], [40., 60.]] + ]) + output = keypoint_ops.to_normalized_coordinates( + keypoints, 40, 60) + expected_keypoints = tf.constant([ + [[0.25, 0.5], [0.75, 0.75]], + [[0.5, 0.0], [1.0, 1.0]] + ]) + + with self.test_session() as sess: + output_, expected_keypoints_ = sess.run([output, expected_keypoints]) + self.assertAllClose(output_, expected_keypoints_) + + def test_to_normalized_coordinates_already_normalized(self): + keypoints = tf.constant([ + [[0.25, 0.5], [0.75, 0.75]], + [[0.5, 0.0], [1.0, 1.0]] + ]) + output = keypoint_ops.to_normalized_coordinates( + keypoints, 40, 60) + + with self.test_session() as sess: + with self.assertRaisesOpError('assertion failed'): + sess.run(output) + + def test_to_absolute_coordinates(self): + keypoints = tf.constant([ + [[0.25, 0.5], [0.75, 0.75]], + [[0.5, 0.0], [1.0, 1.0]] + ]) + output = keypoint_ops.to_absolute_coordinates( + keypoints, 40, 60) + expected_keypoints = tf.constant([ + [[10., 30.], [30., 45.]], + [[20., 0.], [40., 60.]] + ]) + + with self.test_session() as sess: + output_, expected_keypoints_ = 
sess.run([output, expected_keypoints]) + self.assertAllClose(output_, expected_keypoints_) + + def test_to_absolute_coordinates_already_absolute(self): + keypoints = tf.constant([ + [[10., 30.], [30., 45.]], + [[20., 0.], [40., 60.]] + ]) + output = keypoint_ops.to_absolute_coordinates( + keypoints, 40, 60) + + with self.test_session() as sess: + with self.assertRaisesOpError('assertion failed'): + sess.run(output) + + def test_flip_horizontal(self): + keypoints = tf.constant([ + [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]], + [[0.4, 0.4], [0.5, 0.5], [0.6, 0.6]] + ]) + flip_permutation = [0, 2, 1] + + expected_keypoints = tf.constant([ + [[0.1, 0.9], [0.3, 0.7], [0.2, 0.8]], + [[0.4, 0.6], [0.6, 0.4], [0.5, 0.5]], + ]) + output = keypoint_ops.flip_horizontal(keypoints, 0.5, flip_permutation) + + with self.test_session() as sess: + output_, expected_keypoints_ = sess.run([output, expected_keypoints]) + self.assertAllClose(output_, expected_keypoints_) + + def test_flip_vertical(self): + keypoints = tf.constant([ + [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]], + [[0.4, 0.4], [0.5, 0.5], [0.6, 0.6]] + ]) + flip_permutation = [0, 2, 1] + + expected_keypoints = tf.constant([ + [[0.9, 0.1], [0.7, 0.3], [0.8, 0.2]], + [[0.6, 0.4], [0.4, 0.6], [0.5, 0.5]], + ]) + output = keypoint_ops.flip_vertical(keypoints, 0.5, flip_permutation) + + with self.test_session() as sess: + output_, expected_keypoints_ = sess.run([output, expected_keypoints]) + self.assertAllClose(output_, expected_keypoints_) + + def test_rot90(self): + keypoints = tf.constant([ + [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]], + [[0.4, 0.6], [0.5, 0.6], [0.6, 0.7]] + ]) + expected_keypoints = tf.constant([ + [[0.9, 0.1], [0.8, 0.2], [0.7, 0.3]], + [[0.4, 0.4], [0.4, 0.5], [0.3, 0.6]], + ]) + output = keypoint_ops.rot90(keypoints) + + with self.test_session() as sess: + output_, expected_keypoints_ = sess.run([output, expected_keypoints]) + self.assertAllClose(output_, expected_keypoints_) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/losses.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..5471c955fdcef7530c04557dba8b8cbb54936cef --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/losses.py @@ -0,0 +1,641 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Classification and regression loss functions for object detection. 
+ +Localization losses: + * WeightedL2LocalizationLoss + * WeightedSmoothL1LocalizationLoss + * WeightedIOULocalizationLoss + +Classification losses: + * WeightedSigmoidClassificationLoss + * WeightedSoftmaxClassificationLoss + * WeightedSoftmaxClassificationAgainstLogitsLoss + * BootstrappedSigmoidClassificationLoss +""" +from abc import ABCMeta +from abc import abstractmethod + +import tensorflow as tf + +from object_detection.core import box_list +from object_detection.core import box_list_ops +from object_detection.utils import ops + +slim = tf.contrib.slim + + +class Loss(object): + """Abstract base class for loss functions.""" + __metaclass__ = ABCMeta + + def __call__(self, + prediction_tensor, + target_tensor, + ignore_nan_targets=False, + scope=None, + **params): + """Call the loss function. + + Args: + prediction_tensor: an N-d tensor of shape [batch, anchors, ...] + representing predicted quantities. + target_tensor: an N-d tensor of shape [batch, anchors, ...] representing + regression or classification targets. + ignore_nan_targets: whether to ignore nan targets in the loss computation. + E.g. can be used if the target tensor is missing groundtruth data that + shouldn't be factored into the loss. + scope: Op scope name. Defaults to 'Loss' if None. + **params: Additional keyword arguments for specific implementations of + the Loss. + + Returns: + loss: a tensor representing the value of the loss function. + """ + with tf.name_scope(scope, 'Loss', + [prediction_tensor, target_tensor, params]) as scope: + if ignore_nan_targets: + target_tensor = tf.where(tf.is_nan(target_tensor), + prediction_tensor, + target_tensor) + return self._compute_loss(prediction_tensor, target_tensor, **params) + + @abstractmethod + def _compute_loss(self, prediction_tensor, target_tensor, **params): + """Method to be overridden by implementations. + + Args: + prediction_tensor: a tensor representing predicted quantities + target_tensor: a tensor representing regression or classification targets + **params: Additional keyword arguments for specific implementations of + the Loss. + + Returns: + loss: an N-d tensor of shape [batch, anchors, ...] containing the loss per + anchor + """ + pass + + +class WeightedL2LocalizationLoss(Loss): + """L2 localization loss function with anchorwise output support. + + Loss[b,a] = .5 * ||weights[b,a] * (prediction[b,a,:] - target[b,a,:])||^2 + """ + + def _compute_loss(self, prediction_tensor, target_tensor, weights): + """Compute loss function. + + Args: + prediction_tensor: A float tensor of shape [batch_size, num_anchors, + code_size] representing the (encoded) predicted locations of objects. + target_tensor: A float tensor of shape [batch_size, num_anchors, + code_size] representing the regression targets + weights: a float tensor of shape [batch_size, num_anchors] + + Returns: + loss: a float tensor of shape [batch_size, num_anchors] tensor + representing the value of the loss function. + """ + weighted_diff = (prediction_tensor - target_tensor) * tf.expand_dims( + weights, 2) + square_diff = 0.5 * tf.square(weighted_diff) + return tf.reduce_sum(square_diff, 2) + + +class WeightedSmoothL1LocalizationLoss(Loss): + """Smooth L1 localization loss function aka Huber Loss.. + + The smooth L1_loss is defined elementwise as .5 x^2 if |x| <= delta and + 0.5 x^2 + delta * (|x|-delta) otherwise, where x is the difference between + predictions and target. 
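+
+  (For reference, tf.losses.huber_loss, to which _compute_loss delegates
+  below, computes 0.5 * x^2 when |x| <= delta and
+  0.5 * delta^2 + delta * (|x| - delta) otherwise.)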
+ + See also Equation (3) in the Fast R-CNN paper by Ross Girshick (ICCV 2015) + """ + + def __init__(self, delta=1.0): + """Constructor. + + Args: + delta: delta for smooth L1 loss. + """ + self._delta = delta + + def _compute_loss(self, prediction_tensor, target_tensor, weights): + """Compute loss function. + + Args: + prediction_tensor: A float tensor of shape [batch_size, num_anchors, + code_size] representing the (encoded) predicted locations of objects. + target_tensor: A float tensor of shape [batch_size, num_anchors, + code_size] representing the regression targets + weights: a float tensor of shape [batch_size, num_anchors] + + Returns: + loss: a float tensor of shape [batch_size, num_anchors] tensor + representing the value of the loss function. + """ + return tf.reduce_sum(tf.losses.huber_loss( + target_tensor, + prediction_tensor, + delta=self._delta, + weights=tf.expand_dims(weights, axis=2), + loss_collection=None, + reduction=tf.losses.Reduction.NONE + ), axis=2) + + +class WeightedIOULocalizationLoss(Loss): + """IOU localization loss function. + + Sums the IOU for corresponding pairs of predicted/groundtruth boxes + and for each pair assign a loss of 1 - IOU. We then compute a weighted + sum over all pairs which is returned as the total loss. + """ + + def _compute_loss(self, prediction_tensor, target_tensor, weights): + """Compute loss function. + + Args: + prediction_tensor: A float tensor of shape [batch_size, num_anchors, 4] + representing the decoded predicted boxes + target_tensor: A float tensor of shape [batch_size, num_anchors, 4] + representing the decoded target boxes + weights: a float tensor of shape [batch_size, num_anchors] + + Returns: + loss: a float tensor of shape [batch_size, num_anchors] tensor + representing the value of the loss function. + """ + predicted_boxes = box_list.BoxList(tf.reshape(prediction_tensor, [-1, 4])) + target_boxes = box_list.BoxList(tf.reshape(target_tensor, [-1, 4])) + per_anchor_iou_loss = 1.0 - box_list_ops.matched_iou(predicted_boxes, + target_boxes) + return tf.reshape(weights, [-1]) * per_anchor_iou_loss + + +class WeightedSigmoidClassificationLoss(Loss): + """Sigmoid cross entropy classification loss function.""" + + def _compute_loss(self, + prediction_tensor, + target_tensor, + weights, + class_indices=None): + """Compute loss function. + + Args: + prediction_tensor: A float tensor of shape [batch_size, num_anchors, + num_classes] representing the predicted logits for each class + target_tensor: A float tensor of shape [batch_size, num_anchors, + num_classes] representing one-hot encoded classification targets + weights: a float tensor of shape [batch_size, num_anchors] + class_indices: (Optional) A 1-D integer tensor of class indices. + If provided, computes loss only for the specified class indices. + + Returns: + loss: a float tensor of shape [batch_size, num_anchors, num_classes] + representing the value of the loss function. + """ + weights = tf.expand_dims(weights, 2) + if class_indices is not None: + weights *= tf.reshape( + ops.indices_to_dense_vector(class_indices, + tf.shape(prediction_tensor)[2]), + [1, 1, -1]) + per_entry_cross_ent = (tf.nn.sigmoid_cross_entropy_with_logits( + labels=target_tensor, logits=prediction_tensor)) + return per_entry_cross_ent * weights + + +class SigmoidFocalClassificationLoss(Loss): + """Sigmoid focal cross entropy loss. + + Focal loss down-weights well classified examples and focusses on the hard + examples. See https://arxiv.org/pdf/1708.02002.pdf for the loss definition. 
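+
+  In the notation of that paper (and of _compute_loss below), the per-entry
+  loss is alpha_t * (1 - p_t)^gamma * CE(p_t), where p_t is the probability
+  assigned to the groundtruth label, CE is the sigmoid cross entropy, gamma
+  is the focusing exponent, and alpha_t balances positives against negatives.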
+ """ + + def __init__(self, gamma=2.0, alpha=0.25): + """Constructor. + + Args: + gamma: exponent of the modulating factor (1 - p_t) ^ gamma. + alpha: optional alpha weighting factor to balance positives vs negatives. + """ + self._alpha = alpha + self._gamma = gamma + + def _compute_loss(self, + prediction_tensor, + target_tensor, + weights, + class_indices=None): + """Compute loss function. + + Args: + prediction_tensor: A float tensor of shape [batch_size, num_anchors, + num_classes] representing the predicted logits for each class + target_tensor: A float tensor of shape [batch_size, num_anchors, + num_classes] representing one-hot encoded classification targets + weights: a float tensor of shape [batch_size, num_anchors] + class_indices: (Optional) A 1-D integer tensor of class indices. + If provided, computes loss only for the specified class indices. + + Returns: + loss: a float tensor of shape [batch_size, num_anchors, num_classes] + representing the value of the loss function. + """ + weights = tf.expand_dims(weights, 2) + if class_indices is not None: + weights *= tf.reshape( + ops.indices_to_dense_vector(class_indices, + tf.shape(prediction_tensor)[2]), + [1, 1, -1]) + per_entry_cross_ent = (tf.nn.sigmoid_cross_entropy_with_logits( + labels=target_tensor, logits=prediction_tensor)) + prediction_probabilities = tf.sigmoid(prediction_tensor) + p_t = ((target_tensor * prediction_probabilities) + + ((1 - target_tensor) * (1 - prediction_probabilities))) + modulating_factor = 1.0 + if self._gamma: + modulating_factor = tf.pow(1.0 - p_t, self._gamma) + alpha_weight_factor = 1.0 + if self._alpha is not None: + alpha_weight_factor = (target_tensor * self._alpha + + (1 - target_tensor) * (1 - self._alpha)) + focal_cross_entropy_loss = (modulating_factor * alpha_weight_factor * + per_entry_cross_ent) + return focal_cross_entropy_loss * weights + + +class WeightedSoftmaxClassificationLoss(Loss): + """Softmax loss function.""" + + def __init__(self, logit_scale=1.0): + """Constructor. + + Args: + logit_scale: When this value is high, the prediction is "diffused" and + when this value is low, the prediction is made peakier. + (default 1.0) + + """ + self._logit_scale = logit_scale + + def _compute_loss(self, prediction_tensor, target_tensor, weights): + """Compute loss function. + + Args: + prediction_tensor: A float tensor of shape [batch_size, num_anchors, + num_classes] representing the predicted logits for each class + target_tensor: A float tensor of shape [batch_size, num_anchors, + num_classes] representing one-hot encoded classification targets + weights: a float tensor of shape [batch_size, num_anchors] + + Returns: + loss: a float tensor of shape [batch_size, num_anchors] + representing the value of the loss function. + """ + num_classes = prediction_tensor.get_shape().as_list()[-1] + prediction_tensor = tf.divide( + prediction_tensor, self._logit_scale, name='scale_logit') + per_row_cross_ent = (tf.nn.softmax_cross_entropy_with_logits( + labels=tf.reshape(target_tensor, [-1, num_classes]), + logits=tf.reshape(prediction_tensor, [-1, num_classes]))) + return tf.reshape(per_row_cross_ent, tf.shape(weights)) * weights + + +class WeightedSoftmaxClassificationAgainstLogitsLoss(Loss): + """Softmax loss function against logits. + + Targets are expected to be provided in logits space instead of "one hot" or + "probability distribution" space. + """ + + def __init__(self, logit_scale=1.0): + """Constructor. 
+ + Args: + logit_scale: When this value is high, the target is "diffused" and + when this value is low, the target is made peakier. + (default 1.0) + + """ + self._logit_scale = logit_scale + + def _scale_and_softmax_logits(self, logits): + """Scale logits then apply softmax.""" + scaled_logits = tf.divide(logits, self._logit_scale, name='scale_logits') + return tf.nn.softmax(scaled_logits, name='convert_scores') + + def _compute_loss(self, prediction_tensor, target_tensor, weights): + """Compute loss function. + + Args: + prediction_tensor: A float tensor of shape [batch_size, num_anchors, + num_classes] representing the predicted logits for each class + target_tensor: A float tensor of shape [batch_size, num_anchors, + num_classes] representing logit classification targets + weights: a float tensor of shape [batch_size, num_anchors] + + Returns: + loss: a float tensor of shape [batch_size, num_anchors] + representing the value of the loss function. + """ + num_classes = prediction_tensor.get_shape().as_list()[-1] + target_tensor = self._scale_and_softmax_logits(target_tensor) + prediction_tensor = tf.divide(prediction_tensor, self._logit_scale, + name='scale_logits') + + per_row_cross_ent = (tf.nn.softmax_cross_entropy_with_logits( + labels=tf.reshape(target_tensor, [-1, num_classes]), + logits=tf.reshape(prediction_tensor, [-1, num_classes]))) + return tf.reshape(per_row_cross_ent, tf.shape(weights)) * weights + + +class BootstrappedSigmoidClassificationLoss(Loss): + """Bootstrapped sigmoid cross entropy classification loss function. + + This loss uses a convex combination of training labels and the current model's + predictions as training targets in the classification loss. The idea is that + as the model improves over time, its predictions can be trusted more and we + can use these predictions to mitigate the damage of noisy/incorrect labels, + because incorrect labels are likely to be eventually highly inconsistent with + other stimuli predicted to have the same label by the model. + + In "soft" bootstrapping, we use all predicted class probabilities, whereas in + "hard" bootstrapping, we use the single class favored by the model. + + See also Training Deep Neural Networks On Noisy Labels with Bootstrapping by + Reed et al. (ICLR 2015). + """ + + def __init__(self, alpha, bootstrap_type='soft'): + """Constructor. + + Args: + alpha: a float32 scalar tensor between 0 and 1 representing interpolation + weight + bootstrap_type: set to either 'hard' or 'soft' (default) + + Raises: + ValueError: if bootstrap_type is not either 'hard' or 'soft' + """ + if bootstrap_type != 'hard' and bootstrap_type != 'soft': + raise ValueError('Unrecognized bootstrap_type: must be one of ' + '\'hard\' or \'soft.\'') + self._alpha = alpha + self._bootstrap_type = bootstrap_type + + def _compute_loss(self, prediction_tensor, target_tensor, weights): + """Compute loss function. + + Args: + prediction_tensor: A float tensor of shape [batch_size, num_anchors, + num_classes] representing the predicted logits for each class + target_tensor: A float tensor of shape [batch_size, num_anchors, + num_classes] representing one-hot encoded classification targets + weights: a float tensor of shape [batch_size, num_anchors] + + Returns: + loss: a float tensor of shape [batch_size, num_anchors, num_classes] + representing the value of the loss function. 
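+
+      Concretely, the cross entropy below is computed against the bootstrapped
+      target
+        'soft':  alpha * target + (1 - alpha) * sigmoid(logits)
+        'hard':  alpha * target + (1 - alpha) * 1[sigmoid(logits) > 0.5]
+      (1[.] denoting an indicator cast to float) rather than against
+      target_tensor itself.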
+ """ + if self._bootstrap_type == 'soft': + bootstrap_target_tensor = self._alpha * target_tensor + ( + 1.0 - self._alpha) * tf.sigmoid(prediction_tensor) + else: + bootstrap_target_tensor = self._alpha * target_tensor + ( + 1.0 - self._alpha) * tf.cast( + tf.sigmoid(prediction_tensor) > 0.5, tf.float32) + per_entry_cross_ent = (tf.nn.sigmoid_cross_entropy_with_logits( + labels=bootstrap_target_tensor, logits=prediction_tensor)) + return per_entry_cross_ent * tf.expand_dims(weights, 2) + + +class HardExampleMiner(object): + """Hard example mining for regions in a list of images. + + Implements hard example mining to select a subset of regions to be + back-propagated. For each image, selects the regions with highest losses, + subject to the condition that a newly selected region cannot have + an IOU > iou_threshold with any of the previously selected regions. + This can be achieved by re-using a greedy non-maximum suppression algorithm. + A constraint on the number of negatives mined per positive region can also be + enforced. + + Reference papers: "Training Region-based Object Detectors with Online + Hard Example Mining" (CVPR 2016) by Srivastava et al., and + "SSD: Single Shot MultiBox Detector" (ECCV 2016) by Liu et al. + """ + + def __init__(self, + num_hard_examples=64, + iou_threshold=0.7, + loss_type='both', + cls_loss_weight=0.05, + loc_loss_weight=0.06, + max_negatives_per_positive=None, + min_negatives_per_image=0): + """Constructor. + + The hard example mining implemented by this class can replicate the behavior + in the two aforementioned papers (Srivastava et al., and Liu et al). + To replicate the A2 paper (Srivastava et al), num_hard_examples is set + to a fixed parameter (64 by default) and iou_threshold is set to .7 for + running non-max-suppression the predicted boxes prior to hard mining. + In order to replicate the SSD paper (Liu et al), num_hard_examples should + be set to None, max_negatives_per_positive should be 3 and iou_threshold + should be 1.0 (in order to effectively turn off NMS). + + Args: + num_hard_examples: maximum number of hard examples to be + selected per image (prior to enforcing max negative to positive ratio + constraint). If set to None, all examples obtained after NMS are + considered. + iou_threshold: minimum intersection over union for an example + to be discarded during NMS. + loss_type: use only classification losses ('cls', default), + localization losses ('loc') or both losses ('both'). + In the last case, cls_loss_weight and loc_loss_weight are used to + compute weighted sum of the two losses. + cls_loss_weight: weight for classification loss. + loc_loss_weight: weight for location loss. + max_negatives_per_positive: maximum number of negatives to retain for + each positive anchor. By default, num_negatives_per_positive is None, + which means that we do not enforce a prespecified negative:positive + ratio. Note also that num_negatives_per_positives can be a float + (and will be converted to be a float even if it is passed in otherwise). + min_negatives_per_image: minimum number of negative anchors to sample for + a given image. Setting this to a positive number allows sampling + negatives in an image without any positive anchors and thus not biased + towards at least one detection per image. 
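+
+    Example constructions for the two settings described above (a sketch; the
+    remaining arguments keep their defaults):
+
+      # OHEM-style mining (Srivastava et al.):
+      ohem_miner = HardExampleMiner(num_hard_examples=64, iou_threshold=0.7)
+      # SSD-style mining (Liu et al.):
+      ssd_miner = HardExampleMiner(num_hard_examples=None, iou_threshold=1.0,
+                                   max_negatives_per_positive=3)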
+ """ + self._num_hard_examples = num_hard_examples + self._iou_threshold = iou_threshold + self._loss_type = loss_type + self._cls_loss_weight = cls_loss_weight + self._loc_loss_weight = loc_loss_weight + self._max_negatives_per_positive = max_negatives_per_positive + self._min_negatives_per_image = min_negatives_per_image + if self._max_negatives_per_positive is not None: + self._max_negatives_per_positive = float(self._max_negatives_per_positive) + self._num_positives_list = None + self._num_negatives_list = None + + def __call__(self, + location_losses, + cls_losses, + decoded_boxlist_list, + match_list=None): + """Computes localization and classification losses after hard mining. + + Args: + location_losses: a float tensor of shape [num_images, num_anchors] + representing anchorwise localization losses. + cls_losses: a float tensor of shape [num_images, num_anchors] + representing anchorwise classification losses. + decoded_boxlist_list: a list of decoded BoxList representing location + predictions for each image. + match_list: an optional list of matcher.Match objects encoding the match + between anchors and groundtruth boxes for each image of the batch, + with rows of the Match objects corresponding to groundtruth boxes + and columns corresponding to anchors. Match objects in match_list are + used to reference which anchors are positive, negative or ignored. If + self._max_negatives_per_positive exists, these are then used to enforce + a prespecified negative to positive ratio. + + Returns: + mined_location_loss: a float scalar with sum of localization losses from + selected hard examples. + mined_cls_loss: a float scalar with sum of classification losses from + selected hard examples. + Raises: + ValueError: if location_losses, cls_losses and decoded_boxlist_list do + not have compatible shapes (i.e., they must correspond to the same + number of images). + ValueError: if match_list is specified but its length does not match + len(decoded_boxlist_list). 
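+
+    A call sketch (the anchorwise loss tensors, BoxLists and Match objects are
+    assumed to have been built elsewhere; names are illustrative):
+
+      miner = HardExampleMiner(num_hard_examples=64, iou_threshold=0.7)
+      loc_loss, cls_loss = miner(location_losses, cls_losses,
+                                 decoded_boxlist_list, match_list)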
+ """ + mined_location_losses = [] + mined_cls_losses = [] + location_losses = tf.unstack(location_losses) + cls_losses = tf.unstack(cls_losses) + num_images = len(decoded_boxlist_list) + if not match_list: + match_list = num_images * [None] + if not len(location_losses) == len(decoded_boxlist_list) == len(cls_losses): + raise ValueError('location_losses, cls_losses and decoded_boxlist_list ' + 'do not have compatible shapes.') + if not isinstance(match_list, list): + raise ValueError('match_list must be a list.') + if len(match_list) != len(decoded_boxlist_list): + raise ValueError('match_list must either be None or have ' + 'length=len(decoded_boxlist_list).') + num_positives_list = [] + num_negatives_list = [] + for ind, detection_boxlist in enumerate(decoded_boxlist_list): + box_locations = detection_boxlist.get() + match = match_list[ind] + image_losses = cls_losses[ind] + if self._loss_type == 'loc': + image_losses = location_losses[ind] + elif self._loss_type == 'both': + image_losses *= self._cls_loss_weight + image_losses += location_losses[ind] * self._loc_loss_weight + if self._num_hard_examples is not None: + num_hard_examples = self._num_hard_examples + else: + num_hard_examples = detection_boxlist.num_boxes() + selected_indices = tf.image.non_max_suppression( + box_locations, image_losses, num_hard_examples, self._iou_threshold) + if self._max_negatives_per_positive is not None and match: + (selected_indices, num_positives, + num_negatives) = self._subsample_selection_to_desired_neg_pos_ratio( + selected_indices, match, self._max_negatives_per_positive, + self._min_negatives_per_image) + num_positives_list.append(num_positives) + num_negatives_list.append(num_negatives) + mined_location_losses.append( + tf.reduce_sum(tf.gather(location_losses[ind], selected_indices))) + mined_cls_losses.append( + tf.reduce_sum(tf.gather(cls_losses[ind], selected_indices))) + location_loss = tf.reduce_sum(tf.stack(mined_location_losses)) + cls_loss = tf.reduce_sum(tf.stack(mined_cls_losses)) + if match and self._max_negatives_per_positive: + self._num_positives_list = num_positives_list + self._num_negatives_list = num_negatives_list + return (location_loss, cls_loss) + + def summarize(self): + """Summarize the number of positives and negatives after mining.""" + if self._num_positives_list and self._num_negatives_list: + avg_num_positives = tf.reduce_mean(tf.to_float(self._num_positives_list)) + avg_num_negatives = tf.reduce_mean(tf.to_float(self._num_negatives_list)) + tf.summary.scalar('HardExampleMiner/NumPositives', avg_num_positives) + tf.summary.scalar('HardExampleMiner/NumNegatives', avg_num_negatives) + + def _subsample_selection_to_desired_neg_pos_ratio(self, + indices, + match, + max_negatives_per_positive, + min_negatives_per_image=0): + """Subsample a collection of selected indices to a desired neg:pos ratio. + + This function takes a subset of M indices (indexing into a large anchor + collection of N anchors where M=0, + meaning that column i is matched with row match_results[i]. + (2) match_results[i]=-1, meaning that column i is not matched. + (3) match_results[i]=-2, meaning that column i is ignored. + use_matmul_gather: Use matrix multiplication based gather instead of + standard tf.gather. (Default: False). 
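+
+    For example, match_results = [3, 1, -1, 0, -1, 5, -2] describes seven
+    columns: columns 0, 1, 3 and 5 are matched to rows 3, 1, 0 and 5
+    respectively, columns 2 and 4 are unmatched, and column 6 is ignored.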
+ + Raises: + ValueError: if match_results does not have rank 1 or is not an + integer int32 scalar tensor + """ + if match_results.shape.ndims != 1: + raise ValueError('match_results should have rank 1') + if match_results.dtype != tf.int32: + raise ValueError('match_results should be an int32 or int64 scalar ' + 'tensor') + self._match_results = match_results + self._gather_op = tf.gather + if use_matmul_gather: + self._gather_op = ops.matmul_gather_on_zeroth_axis + + @property + def match_results(self): + """The accessor for match results. + + Returns: + the tensor which encodes the match results. + """ + return self._match_results + + def matched_column_indices(self): + """Returns column indices that match to some row. + + The indices returned by this op are always sorted in increasing order. + + Returns: + column_indices: int32 tensor of shape [K] with column indices. + """ + return self._reshape_and_cast(tf.where(tf.greater(self._match_results, -1))) + + def matched_column_indicator(self): + """Returns column indices that are matched. + + Returns: + column_indices: int32 tensor of shape [K] with column indices. + """ + return tf.greater_equal(self._match_results, 0) + + def num_matched_columns(self): + """Returns number (int32 scalar tensor) of matched columns.""" + return tf.size(self.matched_column_indices()) + + def unmatched_column_indices(self): + """Returns column indices that do not match any row. + + The indices returned by this op are always sorted in increasing order. + + Returns: + column_indices: int32 tensor of shape [K] with column indices. + """ + return self._reshape_and_cast(tf.where(tf.equal(self._match_results, -1))) + + def unmatched_column_indicator(self): + """Returns column indices that are unmatched. + + Returns: + column_indices: int32 tensor of shape [K] with column indices. + """ + return tf.equal(self._match_results, -1) + + def num_unmatched_columns(self): + """Returns number (int32 scalar tensor) of unmatched columns.""" + return tf.size(self.unmatched_column_indices()) + + def ignored_column_indices(self): + """Returns column indices that are ignored (neither Matched nor Unmatched). + + The indices returned by this op are always sorted in increasing order. + + Returns: + column_indices: int32 tensor of shape [K] with column indices. + """ + return self._reshape_and_cast(tf.where(self.ignored_column_indicator())) + + def ignored_column_indicator(self): + """Returns boolean column indicator where True means the colum is ignored. + + Returns: + column_indicator: boolean vector which is True for all ignored column + indices. + """ + return tf.equal(self._match_results, -2) + + def num_ignored_columns(self): + """Returns number (int32 scalar tensor) of matched columns.""" + return tf.size(self.ignored_column_indices()) + + def unmatched_or_ignored_column_indices(self): + """Returns column indices that are unmatched or ignored. + + The indices returned by this op are always sorted in increasing order. + + Returns: + column_indices: int32 tensor of shape [K] with column indices. + """ + return self._reshape_and_cast(tf.where(tf.greater(0, self._match_results))) + + def matched_row_indices(self): + """Returns row indices that match some column. + + The indices returned by this op are ordered so as to be in correspondence + with the output of matched_column_indicator(). For example if + self.matched_column_indicator() is [0,2], and self.matched_row_indices() is + [7, 3], then we know that column 0 was matched to row 7 and column 2 was + matched to row 3. 
+ + Returns: + row_indices: int32 tensor of shape [K] with row indices. + """ + return self._reshape_and_cast( + self._gather_op(self._match_results, self.matched_column_indices())) + + def _reshape_and_cast(self, t): + return tf.cast(tf.reshape(t, [-1]), tf.int32) + + def gather_based_on_match(self, input_tensor, unmatched_value, + ignored_value): + """Gathers elements from `input_tensor` based on match results. + + For columns that are matched to a row, gathered_tensor[col] is set to + input_tensor[match_results[col]]. For columns that are unmatched, + gathered_tensor[col] is set to unmatched_value. Finally, for columns that + are ignored gathered_tensor[col] is set to ignored_value. + + Note that the input_tensor.shape[1:] must match with unmatched_value.shape + and ignored_value.shape + + Args: + input_tensor: Tensor to gather values from. + unmatched_value: Constant tensor value for unmatched columns. + ignored_value: Constant tensor value for ignored columns. + + Returns: + gathered_tensor: A tensor containing values gathered from input_tensor. + The shape of the gathered tensor is [match_results.shape[0]] + + input_tensor.shape[1:]. + """ + input_tensor = tf.concat([tf.stack([ignored_value, unmatched_value]), + input_tensor], axis=0) + gather_indices = tf.maximum(self.match_results + 2, 0) + gathered_tensor = self._gather_op(input_tensor, gather_indices) + return gathered_tensor + + +class Matcher(object): + """Abstract base class for matcher. + """ + __metaclass__ = ABCMeta + + def __init__(self, use_matmul_gather=False): + """Constructs a Matcher. + + Args: + use_matmul_gather: Force constructed match objects to use matrix + multiplication based gather instead of standard tf.gather. + (Default: False). + """ + self._use_matmul_gather = use_matmul_gather + + def match(self, similarity_matrix, scope=None, **params): + """Computes matches among row and column indices and returns the result. + + Computes matches among the row and column indices based on the similarity + matrix and optional arguments. + + Args: + similarity_matrix: Float tensor of shape [N, M] with pairwise similarity + where higher value means more similar. + scope: Op scope name. Defaults to 'Match' if None. + **params: Additional keyword arguments for specific implementations of + the Matcher. + + Returns: + A Match object with the results of matching. + """ + with tf.name_scope(scope, 'Match', [similarity_matrix, params]) as scope: + return Match(self._match(similarity_matrix, **params), + self._use_matmul_gather) + + @abstractmethod + def _match(self, similarity_matrix, **params): + """Method to be overridden by implementations. + + Args: + similarity_matrix: Float tensor of shape [N, M] with pairwise similarity + where higher value means more similar. + **params: Additional keyword arguments for specific implementations of + the Matcher. + + Returns: + match_results: Integer tensor of shape [M]: match_results[i]>=0 means + that column i is matched to row match_results[i], match_results[i]=-1 + means that the column is not matched. match_results[i]=-2 means that + the column is ignored (usually this happens when there is a very weak + match which one neither wants as positive nor negative example). 
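+
+      A deliberately minimal subclass sketch (greedy argmax over rows, never
+      marking columns unmatched or ignored; the library's concrete matchers
+      are more involved):
+
+        class GreedyArgMaxMatcher(Matcher):
+
+          def _match(self, similarity_matrix, **params):
+            # Best row for every column; Match requires an int32 tensor.
+            return tf.cast(tf.argmax(similarity_matrix, axis=0), tf.int32)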
+ """ + pass diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/matcher_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/matcher_test.py new file mode 100644 index 0000000000000000000000000000000000000000..05607834a1dd116e2e0beeb79a508d6196fad235 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/matcher_test.py @@ -0,0 +1,192 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.core.matcher.""" +import numpy as np +import tensorflow as tf + +from object_detection.core import matcher + + +class MatchTest(tf.test.TestCase): + + def test_get_correct_matched_columnIndices(self): + match_results = tf.constant([3, 1, -1, 0, -1, 5, -2]) + match = matcher.Match(match_results) + expected_column_indices = [0, 1, 3, 5] + matched_column_indices = match.matched_column_indices() + self.assertEquals(matched_column_indices.dtype, tf.int32) + with self.test_session() as sess: + matched_column_indices = sess.run(matched_column_indices) + self.assertAllEqual(matched_column_indices, expected_column_indices) + + def test_get_correct_counts(self): + match_results = tf.constant([3, 1, -1, 0, -1, 5, -2]) + match = matcher.Match(match_results) + exp_num_matched_columns = 4 + exp_num_unmatched_columns = 2 + exp_num_ignored_columns = 1 + num_matched_columns = match.num_matched_columns() + num_unmatched_columns = match.num_unmatched_columns() + num_ignored_columns = match.num_ignored_columns() + self.assertEquals(num_matched_columns.dtype, tf.int32) + self.assertEquals(num_unmatched_columns.dtype, tf.int32) + self.assertEquals(num_ignored_columns.dtype, tf.int32) + with self.test_session() as sess: + (num_matched_columns_out, num_unmatched_columns_out, + num_ignored_columns_out) = sess.run( + [num_matched_columns, num_unmatched_columns, num_ignored_columns]) + self.assertAllEqual(num_matched_columns_out, exp_num_matched_columns) + self.assertAllEqual(num_unmatched_columns_out, exp_num_unmatched_columns) + self.assertAllEqual(num_ignored_columns_out, exp_num_ignored_columns) + + def testGetCorrectUnmatchedColumnIndices(self): + match_results = tf.constant([3, 1, -1, 0, -1, 5, -2]) + match = matcher.Match(match_results) + expected_column_indices = [2, 4] + unmatched_column_indices = match.unmatched_column_indices() + self.assertEquals(unmatched_column_indices.dtype, tf.int32) + with self.test_session() as sess: + unmatched_column_indices = sess.run(unmatched_column_indices) + self.assertAllEqual(unmatched_column_indices, expected_column_indices) + + def testGetCorrectMatchedRowIndices(self): + match_results = tf.constant([3, 1, -1, 0, -1, 5, -2]) + match = matcher.Match(match_results) + expected_row_indices = [3, 1, 0, 5] + matched_row_indices = match.matched_row_indices() + self.assertEquals(matched_row_indices.dtype, tf.int32) + with self.test_session() as sess: 
+ matched_row_inds = sess.run(matched_row_indices) + self.assertAllEqual(matched_row_inds, expected_row_indices) + + def test_get_correct_ignored_column_indices(self): + match_results = tf.constant([3, 1, -1, 0, -1, 5, -2]) + match = matcher.Match(match_results) + expected_column_indices = [6] + ignored_column_indices = match.ignored_column_indices() + self.assertEquals(ignored_column_indices.dtype, tf.int32) + with self.test_session() as sess: + ignored_column_indices = sess.run(ignored_column_indices) + self.assertAllEqual(ignored_column_indices, expected_column_indices) + + def test_get_correct_matched_column_indicator(self): + match_results = tf.constant([3, 1, -1, 0, -1, 5, -2]) + match = matcher.Match(match_results) + expected_column_indicator = [True, True, False, True, False, True, False] + matched_column_indicator = match.matched_column_indicator() + self.assertEquals(matched_column_indicator.dtype, tf.bool) + with self.test_session() as sess: + matched_column_indicator = sess.run(matched_column_indicator) + self.assertAllEqual(matched_column_indicator, expected_column_indicator) + + def test_get_correct_unmatched_column_indicator(self): + match_results = tf.constant([3, 1, -1, 0, -1, 5, -2]) + match = matcher.Match(match_results) + expected_column_indicator = [False, False, True, False, True, False, False] + unmatched_column_indicator = match.unmatched_column_indicator() + self.assertEquals(unmatched_column_indicator.dtype, tf.bool) + with self.test_session() as sess: + unmatched_column_indicator = sess.run(unmatched_column_indicator) + self.assertAllEqual(unmatched_column_indicator, expected_column_indicator) + + def test_get_correct_ignored_column_indicator(self): + match_results = tf.constant([3, 1, -1, 0, -1, 5, -2]) + match = matcher.Match(match_results) + expected_column_indicator = [False, False, False, False, False, False, True] + ignored_column_indicator = match.ignored_column_indicator() + self.assertEquals(ignored_column_indicator.dtype, tf.bool) + with self.test_session() as sess: + ignored_column_indicator = sess.run(ignored_column_indicator) + self.assertAllEqual(ignored_column_indicator, expected_column_indicator) + + def test_get_correct_unmatched_ignored_column_indices(self): + match_results = tf.constant([3, 1, -1, 0, -1, 5, -2]) + match = matcher.Match(match_results) + expected_column_indices = [2, 4, 6] + unmatched_ignored_column_indices = (match. 
+ unmatched_or_ignored_column_indices()) + self.assertEquals(unmatched_ignored_column_indices.dtype, tf.int32) + with self.test_session() as sess: + unmatched_ignored_column_indices = sess.run( + unmatched_ignored_column_indices) + self.assertAllEqual(unmatched_ignored_column_indices, + expected_column_indices) + + def test_all_columns_accounted_for(self): + # Note: deliberately setting to small number so not always + # all possibilities appear (matched, unmatched, ignored) + num_matches = 10 + match_results = tf.random_uniform( + [num_matches], minval=-2, maxval=5, dtype=tf.int32) + match = matcher.Match(match_results) + matched_column_indices = match.matched_column_indices() + unmatched_column_indices = match.unmatched_column_indices() + ignored_column_indices = match.ignored_column_indices() + with self.test_session() as sess: + matched, unmatched, ignored = sess.run([ + matched_column_indices, unmatched_column_indices, + ignored_column_indices + ]) + all_indices = np.hstack((matched, unmatched, ignored)) + all_indices_sorted = np.sort(all_indices) + self.assertAllEqual(all_indices_sorted, + np.arange(num_matches, dtype=np.int32)) + + def test_scalar_gather_based_on_match(self): + match_results = tf.constant([3, 1, -1, 0, -1, 5, -2]) + input_tensor = tf.constant([0, 1, 2, 3, 4, 5, 6, 7], dtype=tf.float32) + expected_gathered_tensor = [3, 1, 100, 0, 100, 5, 200] + match = matcher.Match(match_results) + gathered_tensor = match.gather_based_on_match(input_tensor, + unmatched_value=100., + ignored_value=200.) + self.assertEquals(gathered_tensor.dtype, tf.float32) + with self.test_session(): + gathered_tensor_out = gathered_tensor.eval() + self.assertAllEqual(expected_gathered_tensor, gathered_tensor_out) + + def test_multidimensional_gather_based_on_match(self): + match_results = tf.constant([1, -1, -2]) + input_tensor = tf.constant([[0, 0.5, 0, 0.5], [0, 0, 0.5, 0.5]], + dtype=tf.float32) + expected_gathered_tensor = [[0, 0, 0.5, 0.5], [0, 0, 0, 0], [0, 0, 0, 0]] + match = matcher.Match(match_results) + gathered_tensor = match.gather_based_on_match(input_tensor, + unmatched_value=tf.zeros(4), + ignored_value=tf.zeros(4)) + self.assertEquals(gathered_tensor.dtype, tf.float32) + with self.test_session(): + gathered_tensor_out = gathered_tensor.eval() + self.assertAllEqual(expected_gathered_tensor, gathered_tensor_out) + + def test_multidimensional_gather_based_on_match_with_matmul_gather_op(self): + match_results = tf.constant([1, -1, -2]) + input_tensor = tf.constant([[0, 0.5, 0, 0.5], [0, 0, 0.5, 0.5]], + dtype=tf.float32) + expected_gathered_tensor = [[0, 0, 0.5, 0.5], [0, 0, 0, 0], [0, 0, 0, 0]] + match = matcher.Match(match_results, use_matmul_gather=True) + gathered_tensor = match.gather_based_on_match(input_tensor, + unmatched_value=tf.zeros(4), + ignored_value=tf.zeros(4)) + self.assertEquals(gathered_tensor.dtype, tf.float32) + with self.test_session() as sess: + self.assertTrue( + all([op.name is not 'Gather' for op in sess.graph.get_operations()])) + gathered_tensor_out = gathered_tensor.eval() + self.assertAllEqual(expected_gathered_tensor, gathered_tensor_out) + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/minibatch_sampler.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/minibatch_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..dc622221ae526360d0a5f85f914bc2c53365911c --- /dev/null +++ 
b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/minibatch_sampler.py @@ -0,0 +1,90 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Base minibatch sampler module. + +The job of the minibatch_sampler is to subsample a minibatch based on some +criterion. + +The main function call is: + subsample(indicator, batch_size, **params). +Indicator is a 1d boolean tensor where True denotes which examples can be +sampled. It returns a boolean indicator where True denotes an example has been +sampled.. + +Subclasses should implement the Subsample function and can make use of the +@staticmethod SubsampleIndicator. +""" + +from abc import ABCMeta +from abc import abstractmethod + +import tensorflow as tf + +from object_detection.utils import ops + + +class MinibatchSampler(object): + """Abstract base class for subsampling minibatches.""" + __metaclass__ = ABCMeta + + def __init__(self): + """Constructs a minibatch sampler.""" + pass + + @abstractmethod + def subsample(self, indicator, batch_size, **params): + """Returns subsample of entries in indicator. + + Args: + indicator: boolean tensor of shape [N] whose True entries can be sampled. + batch_size: desired batch size. + **params: additional keyword arguments for specific implementations of + the MinibatchSampler. + + Returns: + sample_indicator: boolean tensor of shape [N] whose True entries have been + sampled. If sum(indicator) >= batch_size, sum(is_sampled) = batch_size + """ + pass + + @staticmethod + def subsample_indicator(indicator, num_samples): + """Subsample indicator vector. + + Given a boolean indicator vector with M elements set to `True`, the function + assigns all but `num_samples` of these previously `True` elements to + `False`. If `num_samples` is greater than M, the original indicator vector + is returned. + + Args: + indicator: a 1-dimensional boolean tensor indicating which elements + are allowed to be sampled and which are not. 
+ num_samples: int32 scalar tensor + + Returns: + a boolean tensor with the same shape as input (indicator) tensor + """ + indices = tf.where(indicator) + indices = tf.random_shuffle(indices) + indices = tf.reshape(indices, [-1]) + + num_samples = tf.minimum(tf.size(indices), num_samples) + selected_indices = tf.slice(indices, [0], tf.reshape(num_samples, [1])) + + selected_indicator = ops.indices_to_dense_vector(selected_indices, + tf.shape(indicator)[0]) + + return tf.equal(selected_indicator, 1) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/minibatch_sampler_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/minibatch_sampler_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7420ae5d03ca5318d2fd5df4dd4a5cee400189b1 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/minibatch_sampler_test.py @@ -0,0 +1,82 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for google3.research.vale.object_detection.minibatch_sampler.""" + +import numpy as np +import tensorflow as tf + +from object_detection.core import minibatch_sampler + + +class MinibatchSamplerTest(tf.test.TestCase): + + def test_subsample_indicator_when_more_true_elements_than_num_samples(self): + np_indicator = [True, False, True, False, True, True, False] + indicator = tf.constant(np_indicator) + samples = minibatch_sampler.MinibatchSampler.subsample_indicator( + indicator, 3) + with self.test_session() as sess: + samples_out = sess.run(samples) + self.assertTrue(np.sum(samples_out), 3) + self.assertAllEqual(samples_out, + np.logical_and(samples_out, np_indicator)) + + def test_subsample_when_more_true_elements_than_num_samples_no_shape(self): + np_indicator = [True, False, True, False, True, True, False] + indicator = tf.placeholder(tf.bool) + feed_dict = {indicator: np_indicator} + + samples = minibatch_sampler.MinibatchSampler.subsample_indicator( + indicator, 3) + with self.test_session() as sess: + samples_out = sess.run(samples, feed_dict=feed_dict) + self.assertTrue(np.sum(samples_out), 3) + self.assertAllEqual(samples_out, + np.logical_and(samples_out, np_indicator)) + + def test_subsample_indicator_when_less_true_elements_than_num_samples(self): + np_indicator = [True, False, True, False, True, True, False] + indicator = tf.constant(np_indicator) + samples = minibatch_sampler.MinibatchSampler.subsample_indicator( + indicator, 5) + with self.test_session() as sess: + samples_out = sess.run(samples) + self.assertTrue(np.sum(samples_out), 4) + self.assertAllEqual(samples_out, + np.logical_and(samples_out, np_indicator)) + + def test_subsample_indicator_when_num_samples_is_zero(self): + np_indicator = [True, False, True, False, True, True, False] + indicator = tf.constant(np_indicator) + samples_none = minibatch_sampler.MinibatchSampler.subsample_indicator( + 
indicator, 0) + with self.test_session() as sess: + samples_none_out = sess.run(samples_none) + self.assertAllEqual( + np.zeros_like(samples_none_out, dtype=bool), + samples_none_out) + + def test_subsample_indicator_when_indicator_all_false(self): + indicator_empty = tf.zeros([0], dtype=tf.bool) + samples_empty = minibatch_sampler.MinibatchSampler.subsample_indicator( + indicator_empty, 4) + with self.test_session() as sess: + samples_empty_out = sess.run(samples_empty) + self.assertEqual(0, samples_empty_out.size) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/model.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/model.py new file mode 100644 index 0000000000000000000000000000000000000000..081136f9c6a64ca8b56b2a98b9113a81bdc791f8 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/model.py @@ -0,0 +1,305 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Abstract detection model. + +This file defines a generic base class for detection models. Programs that are +designed to work with arbitrary detection models should only depend on this +class. We intend for the functions in this class to follow tensor-in/tensor-out +design, thus all functions have tensors or lists/dictionaries holding tensors as +inputs and outputs. + +Abstractly, detection models predict output tensors given input images +which can be passed to a loss function at training time or passed to a +postprocessing function at eval time. The computation graphs at a high level +consequently look as follows: + +Training time: +inputs (images tensor) -> preprocess -> predict -> loss -> outputs (loss tensor) + +Evaluation time: +inputs (images tensor) -> preprocess -> predict -> postprocess + -> outputs (boxes tensor, scores tensor, classes tensor, num_detections tensor) + +DetectionModels must thus implement four functions (1) preprocess, (2) predict, +(3) postprocess and (4) loss. DetectionModels should make no assumptions about +the input size or aspect ratio --- they are responsible for doing any +resize/reshaping necessary (see docstring for the preprocess function). +Output classes are always integers in the range [0, num_classes). Any mapping +of these integers to semantic labels is to be handled outside of this class. + +Images are resized in the `preprocess` method. All of `preprocess`, `predict`, +and `postprocess` should be reentrant. + +The `preprocess` method runs `image_resizer_fn` that returns resized_images and +`true_image_shapes`. Since `image_resizer_fn` can pad the images with zeros, +true_image_shapes indicate the slices that contain the image without padding. +This is useful for padding images to be a fixed size for batching. + +The `postprocess` method uses the true image shapes to clip predictions that lie +outside of images. 
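+
+A training-time call sequence therefore looks roughly as follows (only the
+method names come from the API defined below; the surrounding variable names
+are illustrative):
+
+  images, true_image_shapes = model.preprocess(raw_inputs)
+  model.provide_groundtruth(groundtruth_boxes_list, groundtruth_classes_list)
+  prediction_dict = model.predict(images, true_image_shapes)
+  loss_dict = model.loss(prediction_dict, true_image_shapes)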
+ +By default, DetectionModels produce bounding box detections; However, we support +a handful of auxiliary annotations associated with each bounding box, namely, +instance masks and keypoints. +""" +from abc import ABCMeta +from abc import abstractmethod + +from object_detection.core import standard_fields as fields + + +class DetectionModel(object): + """Abstract base class for detection models.""" + __metaclass__ = ABCMeta + + def __init__(self, num_classes): + """Constructor. + + Args: + num_classes: number of classes. Note that num_classes *does not* include + background categories that might be implicitly predicted in various + implementations. + """ + self._num_classes = num_classes + self._groundtruth_lists = {} + + @property + def num_classes(self): + return self._num_classes + + def groundtruth_lists(self, field): + """Access list of groundtruth tensors. + + Args: + field: a string key, options are + fields.BoxListFields.{boxes,classes,masks,keypoints} + + Returns: + a list of tensors holding groundtruth information (see also + provide_groundtruth function below), with one entry for each image in the + batch. + Raises: + RuntimeError: if the field has not been provided via provide_groundtruth. + """ + if field not in self._groundtruth_lists: + raise RuntimeError('Groundtruth tensor %s has not been provided', field) + return self._groundtruth_lists[field] + + def groundtruth_has_field(self, field): + """Determines whether the groundtruth includes the given field. + + Args: + field: a string key, options are + fields.BoxListFields.{boxes,classes,masks,keypoints} + + Returns: + True if the groundtruth includes the given field, False otherwise. + """ + return field in self._groundtruth_lists + + @abstractmethod + def preprocess(self, inputs): + """Input preprocessing. + + To be overridden by implementations. + + This function is responsible for any scaling/shifting of input values that + is necessary prior to running the detector on an input image. + It is also responsible for any resizing, padding that might be necessary + as images are assumed to arrive in arbitrary sizes. While this function + could conceivably be part of the predict method (below), it is often + convenient to keep these separate --- for example, we may want to preprocess + on one device, place onto a queue, and let another device (e.g., the GPU) + handle prediction. + + A few important notes about the preprocess function: + + We assume that this operation does not have any trainable variables nor + does it affect the groundtruth annotations in any way (thus data + augmentation operations such as random cropping should be performed + externally). + + There is no assumption that the batchsize in this function is the same as + the batch size in the predict function. In fact, we recommend calling the + preprocess function prior to calling any batching operations (which should + happen outside of the model) and thus assuming that batch sizes are equal + to 1 in the preprocess function. + + There is also no explicit assumption that the output resolutions + must be fixed across inputs --- this is to support "fully convolutional" + settings in which input images can have different shapes/resolutions. + + Args: + inputs: a [batch, height_in, width_in, channels] float32 tensor + representing a batch of images with values between 0 and 255.0. + + Returns: + preprocessed_inputs: a [batch, height_out, width_out, channels] float32 + tensor representing a batch of images. 
+ true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + """ + pass + + @abstractmethod + def predict(self, preprocessed_inputs, true_image_shapes): + """Predict prediction tensors from inputs tensor. + + Outputs of this function can be passed to loss or postprocess functions. + + Args: + preprocessed_inputs: a [batch, height, width, channels] float32 tensor + representing a batch of images. + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + + Returns: + prediction_dict: a dictionary holding prediction tensors to be + passed to the Loss or Postprocess functions. + """ + pass + + @abstractmethod + def postprocess(self, prediction_dict, true_image_shapes, **params): + """Convert predicted output tensors to final detections. + + Outputs adhere to the following conventions: + * Classes are integers in [0, num_classes); background classes are removed + and the first non-background class is mapped to 0. If the model produces + class-agnostic detections, then no output is produced for classes. + * Boxes are to be interpreted as being in [y_min, x_min, y_max, x_max] + format and normalized relative to the image window. + * `num_detections` is provided for settings where detections are padded to a + fixed number of boxes. + * We do not specifically assume any kind of probabilistic interpretation + of the scores --- the only important thing is their relative ordering. + Thus implementations of the postprocess function are free to output + logits, probabilities, calibrated probabilities, or anything else. + + Args: + prediction_dict: a dictionary holding prediction tensors. + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + **params: Additional keyword arguments for specific implementations of + DetectionModel. + + Returns: + detections: a dictionary containing the following fields + detection_boxes: [batch, max_detections, 4] + detection_scores: [batch, max_detections] + detection_classes: [batch, max_detections] + (If a model is producing class-agnostic detections, this field may be + missing) + instance_masks: [batch, max_detections, image_height, image_width] + (optional) + keypoints: [batch, max_detections, num_keypoints, 2] (optional) + num_detections: [batch] + """ + pass + + @abstractmethod + def loss(self, prediction_dict, true_image_shapes): + """Compute scalar loss tensors with respect to provided groundtruth. + + Calling this function requires that groundtruth tensors have been + provided via the provide_groundtruth function. + + Args: + prediction_dict: a dictionary holding predicted tensors + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + + Returns: + a dictionary mapping strings (loss names) to scalar tensors representing + loss values. 
+ """ + pass + + def provide_groundtruth(self, + groundtruth_boxes_list, + groundtruth_classes_list, + groundtruth_masks_list=None, + groundtruth_keypoints_list=None, + groundtruth_weights_list=None, + groundtruth_is_crowd_list=None): + """Provide groundtruth tensors. + + Args: + groundtruth_boxes_list: a list of 2-D tf.float32 tensors of shape + [num_boxes, 4] containing coordinates of the groundtruth boxes. + Groundtruth boxes are provided in [y_min, x_min, y_max, x_max] + format and assumed to be normalized and clipped + relative to the image window with y_min <= y_max and x_min <= x_max. + groundtruth_classes_list: a list of 2-D tf.float32 one-hot (or k-hot) + tensors of shape [num_boxes, num_classes] containing the class targets + with the 0th index assumed to map to the first non-background class. + groundtruth_masks_list: a list of 3-D tf.float32 tensors of + shape [num_boxes, height_in, width_in] containing instance + masks with values in {0, 1}. If None, no masks are provided. + Mask resolution `height_in`x`width_in` must agree with the resolution + of the input image tensor provided to the `preprocess` function. + groundtruth_keypoints_list: a list of 3-D tf.float32 tensors of + shape [num_boxes, num_keypoints, 2] containing keypoints. + Keypoints are assumed to be provided in normalized coordinates and + missing keypoints should be encoded as NaN. + groundtruth_weights_list: A list of 1-D tf.float32 tensors of shape + [num_boxes] containing weights for groundtruth boxes. + groundtruth_is_crowd_list: A list of 1-D tf.bool tensors of shape + [num_boxes] containing is_crowd annotations + """ + self._groundtruth_lists[fields.BoxListFields.boxes] = groundtruth_boxes_list + self._groundtruth_lists[ + fields.BoxListFields.classes] = groundtruth_classes_list + if groundtruth_weights_list: + self._groundtruth_lists[fields.BoxListFields. + weights] = groundtruth_weights_list + if groundtruth_masks_list: + self._groundtruth_lists[ + fields.BoxListFields.masks] = groundtruth_masks_list + if groundtruth_keypoints_list: + self._groundtruth_lists[ + fields.BoxListFields.keypoints] = groundtruth_keypoints_list + if groundtruth_is_crowd_list: + self._groundtruth_lists[ + fields.BoxListFields.is_crowd] = groundtruth_is_crowd_list + + @abstractmethod + def restore_map(self, fine_tune_checkpoint_type='detection'): + """Returns a map of variables to load from a foreign checkpoint. + + Returns a map of variable names to load from a checkpoint to variables in + the model graph. This enables the model to initialize based on weights from + another task. For example, the feature extractor variables from a + classification model can be used to bootstrap training of an object + detector. When loading from an object detection model, the checkpoint model + should have the same parameters as this detection model with exception of + the num_classes parameter. + + Args: + fine_tune_checkpoint_type: whether to restore from a full detection + checkpoint (with compatible variable names) or to restore from a + classification checkpoint for initialization prior to training. + Valid values: `detection`, `classification`. Default 'detection'. + + Returns: + A dict mapping variable names (to load from a checkpoint) to variables in + the model graph. 
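+
+      A typical restore sketch using the returned map (the checkpoint path and
+      session are illustrative):
+
+        var_map = model.restore_map(fine_tune_checkpoint_type='classification')
+        saver = tf.train.Saver(var_map)
+        saver.restore(sess, '/path/to/classification_checkpoint.ckpt')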
+ """ + pass diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/post_processing.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/post_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..bbc61f66fe7e61b2a3d243fa3285a204374af0c1 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/post_processing.py @@ -0,0 +1,425 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Post-processing operations on detected boxes.""" + +import tensorflow as tf + +from object_detection.core import box_list +from object_detection.core import box_list_ops +from object_detection.core import standard_fields as fields +from object_detection.utils import shape_utils + + +def multiclass_non_max_suppression(boxes, + scores, + score_thresh, + iou_thresh, + max_size_per_class, + max_total_size=0, + clip_window=None, + change_coordinate_frame=False, + masks=None, + boundaries=None, + additional_fields=None, + scope=None): + """Multi-class version of non maximum suppression. + + This op greedily selects a subset of detection bounding boxes, pruning + away boxes that have high IOU (intersection over union) overlap (> thresh) + with already selected boxes. It operates independently for each class for + which scores are provided (via the scores field of the input box_list), + pruning boxes with score less than a provided threshold prior to + applying NMS. + + Please note that this operation is performed on *all* classes, therefore any + background classes should be removed prior to calling this function. + + Args: + boxes: A [k, q, 4] float32 tensor containing k detections. `q` can be either + number of classes or 1 depending on whether a separate box is predicted + per class. + scores: A [k, num_classes] float32 tensor containing the scores for each of + the k detections. + score_thresh: scalar threshold for score (low scoring boxes are removed). + iou_thresh: scalar threshold for IOU (new boxes that have high IOU overlap + with previously selected boxes are removed). + max_size_per_class: maximum number of retained boxes per class. + max_total_size: maximum number of boxes retained over all classes. By + default returns all boxes retained after capping boxes per class. + clip_window: A float32 tensor of the form [y_min, x_min, y_max, x_max] + representing the window to clip and normalize boxes to before performing + non-max suppression. + change_coordinate_frame: Whether to normalize coordinates after clipping + relative to clip_window (this can only be set to True if a clip_window + is provided) + masks: (optional) a [k, q, mask_height, mask_width] float32 tensor + containing box masks. `q` can be either number of classes or 1 depending + on whether a separate mask is predicted per class. 
+ boundaries: (optional) a [k, q, boundary_height, boundary_width] float32 + tensor containing box boundaries. `q` can be either number of classes or 1 + depending on whether a separate boundary is predicted per class. + additional_fields: (optional) If not None, a dictionary that maps keys to + tensors whose first dimensions are all of size `k`. After non-maximum + suppression, all tensors corresponding to the selected boxes will be + added to resulting BoxList. + scope: name scope. + + Returns: + a BoxList holding M boxes with a rank-1 scores field representing + corresponding scores for each box with scores sorted in decreasing order + and a rank-1 classes field representing a class label for each box. + + Raises: + ValueError: if iou_thresh is not in [0, 1] or if input boxlist does not have + a valid scores field. + """ + if not 0 <= iou_thresh <= 1.0: + raise ValueError('iou_thresh must be between 0 and 1') + if scores.shape.ndims != 2: + raise ValueError('scores field must be of rank 2') + if scores.shape[1].value is None: + raise ValueError('scores must have statically defined second ' + 'dimension') + if boxes.shape.ndims != 3: + raise ValueError('boxes must be of rank 3.') + if not (boxes.shape[1].value == scores.shape[1].value or + boxes.shape[1].value == 1): + raise ValueError('second dimension of boxes must be either 1 or equal ' + 'to the second dimension of scores') + if boxes.shape[2].value != 4: + raise ValueError('last dimension of boxes must be of size 4.') + if change_coordinate_frame and clip_window is None: + raise ValueError('if change_coordinate_frame is True, then a clip_window' + 'must be specified.') + + with tf.name_scope(scope, 'MultiClassNonMaxSuppression'): + num_boxes = tf.shape(boxes)[0] + num_scores = tf.shape(scores)[0] + num_classes = scores.get_shape()[1] + + length_assert = tf.Assert( + tf.equal(num_boxes, num_scores), + ['Incorrect scores field length: actual vs expected.', + num_scores, num_boxes]) + + selected_boxes_list = [] + per_class_boxes_list = tf.unstack(boxes, axis=1) + if masks is not None: + per_class_masks_list = tf.unstack(masks, axis=1) + if boundaries is not None: + per_class_boundaries_list = tf.unstack(boundaries, axis=1) + boxes_ids = (range(num_classes) if len(per_class_boxes_list) > 1 + else [0] * num_classes.value) + for class_idx, boxes_idx in zip(range(num_classes), boxes_ids): + per_class_boxes = per_class_boxes_list[boxes_idx] + boxlist_and_class_scores = box_list.BoxList(per_class_boxes) + with tf.control_dependencies([length_assert]): + class_scores = tf.reshape( + tf.slice(scores, [0, class_idx], tf.stack([num_scores, 1])), [-1]) + boxlist_and_class_scores.add_field(fields.BoxListFields.scores, + class_scores) + if masks is not None: + per_class_masks = per_class_masks_list[boxes_idx] + boxlist_and_class_scores.add_field(fields.BoxListFields.masks, + per_class_masks) + if boundaries is not None: + per_class_boundaries = per_class_boundaries_list[boxes_idx] + boxlist_and_class_scores.add_field(fields.BoxListFields.boundaries, + per_class_boundaries) + if additional_fields is not None: + for key, tensor in additional_fields.items(): + boxlist_and_class_scores.add_field(key, tensor) + boxlist_filtered = box_list_ops.filter_greater_than( + boxlist_and_class_scores, score_thresh) + if clip_window is not None: + boxlist_filtered = box_list_ops.clip_to_window( + boxlist_filtered, clip_window) + if change_coordinate_frame: + boxlist_filtered = box_list_ops.change_coordinate_frame( + boxlist_filtered, clip_window) + 
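+      # Greedily select at most max_size_per_class boxes for this class by
+      # running standard NMS on the score-filtered (and optionally clipped)
+      # boxes.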
max_selection_size = tf.minimum(max_size_per_class, + boxlist_filtered.num_boxes()) + selected_indices = tf.image.non_max_suppression( + boxlist_filtered.get(), + boxlist_filtered.get_field(fields.BoxListFields.scores), + max_selection_size, + iou_threshold=iou_thresh) + nms_result = box_list_ops.gather(boxlist_filtered, selected_indices) + nms_result.add_field( + fields.BoxListFields.classes, (tf.zeros_like( + nms_result.get_field(fields.BoxListFields.scores)) + class_idx)) + selected_boxes_list.append(nms_result) + selected_boxes = box_list_ops.concatenate(selected_boxes_list) + sorted_boxes = box_list_ops.sort_by_field(selected_boxes, + fields.BoxListFields.scores) + if max_total_size: + max_total_size = tf.minimum(max_total_size, + sorted_boxes.num_boxes()) + sorted_boxes = box_list_ops.gather(sorted_boxes, + tf.range(max_total_size)) + return sorted_boxes + + +def batch_multiclass_non_max_suppression(boxes, + scores, + score_thresh, + iou_thresh, + max_size_per_class, + max_total_size=0, + clip_window=None, + change_coordinate_frame=False, + num_valid_boxes=None, + masks=None, + additional_fields=None, + scope=None, + parallel_iterations=32): + """Multi-class version of non maximum suppression that operates on a batch. + + This op is similar to `multiclass_non_max_suppression` but operates on a batch + of boxes and scores. See documentation for `multiclass_non_max_suppression` + for details. + + Args: + boxes: A [batch_size, num_anchors, q, 4] float32 tensor containing + detections. If `q` is 1 then same boxes are used for all classes + otherwise, if `q` is equal to number of classes, class-specific boxes + are used. + scores: A [batch_size, num_anchors, num_classes] float32 tensor containing + the scores for each of the `num_anchors` detections. + score_thresh: scalar threshold for score (low scoring boxes are removed). + iou_thresh: scalar threshold for IOU (new boxes that have high IOU overlap + with previously selected boxes are removed). + max_size_per_class: maximum number of retained boxes per class. + max_total_size: maximum number of boxes retained over all classes. By + default returns all boxes retained after capping boxes per class. + clip_window: A float32 tensor of shape [batch_size, 4] where each entry is + of the form [y_min, x_min, y_max, x_max] representing the window to clip + boxes to before performing non-max suppression. This argument can also be + a tensor of shape [4] in which case, the same clip window is applied to + all images in the batch. If clip_widow is None, all boxes are used to + perform non-max suppression. + change_coordinate_frame: Whether to normalize coordinates after clipping + relative to clip_window (this can only be set to True if a clip_window + is provided) + num_valid_boxes: (optional) a Tensor of type `int32`. A 1-D tensor of shape + [batch_size] representing the number of valid boxes to be considered + for each image in the batch. This parameter allows for ignoring zero + paddings. + masks: (optional) a [batch_size, num_anchors, q, mask_height, mask_width] + float32 tensor containing box masks. `q` can be either number of classes + or 1 depending on whether a separate mask is predicted per class. + additional_fields: (optional) If not None, a dictionary that maps keys to + tensors whose dimensions are [batch_size, num_anchors, ...]. + scope: tf scope name. + parallel_iterations: (optional) number of batch items to process in + parallel. 
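(Editorial aside, not part of the patch.) The single-image `multiclass_non_max_suppression` op is complete at this point. Below is a minimal, hedged usage sketch assuming TF 1.x graph mode; the shapes, thresholds, and field accessors mirror the shared-box cases exercised in `post_processing_test.py` later in this patch and are illustrative only.

```python
import tensorflow as tf

from object_detection.core import post_processing
from object_detection.core import standard_fields as fields

# [k, q, 4] boxes with q = 1 (one shared box per detection) and
# [k, num_classes] scores; background scores are assumed already removed.
boxes = tf.constant([[[0.0, 0.0, 1.0, 1.0]],
                     [[0.0, 0.1, 1.0, 1.1]],
                     [[0.0, 10.0, 1.0, 11.0]]], tf.float32)
scores = tf.constant([[0.9, 0.01],
                      [0.75, 0.05],
                      [0.1, 0.8]])

nms = post_processing.multiclass_non_max_suppression(
    boxes, scores, score_thresh=0.1, iou_thresh=0.5,
    max_size_per_class=10, max_total_size=20)

# The result is a BoxList sorted by score, with class labels stored as a field.
with tf.Session() as sess:
  kept_boxes, kept_scores, kept_classes = sess.run(
      [nms.get(),
       nms.get_field(fields.BoxListFields.scores),
       nms.get_field(fields.BoxListFields.classes)])
```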
+ + Returns: + 'nmsed_boxes': A [batch_size, max_detections, 4] float32 tensor + containing the non-max suppressed boxes. + 'nmsed_scores': A [batch_size, max_detections] float32 tensor containing + the scores for the boxes. + 'nmsed_classes': A [batch_size, max_detections] float32 tensor + containing the class for boxes. + 'nmsed_masks': (optional) a + [batch_size, max_detections, mask_height, mask_width] float32 tensor + containing masks for each selected box. This is set to None if input + `masks` is None. + 'nmsed_additional_fields': (optional) a dictionary of + [batch_size, max_detections, ...] float32 tensors corresponding to the + tensors specified in the input `additional_fields`. This is not returned + if input `additional_fields` is None. + 'num_detections': A [batch_size] int32 tensor indicating the number of + valid detections per batch item. Only the top num_detections[i] entries in + nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the + entries are zero paddings. + + Raises: + ValueError: if `q` in boxes.shape is not 1 or not equal to number of + classes as inferred from scores.shape. + """ + q = boxes.shape[2].value + num_classes = scores.shape[2].value + if q != 1 and q != num_classes: + raise ValueError('third dimension of boxes must be either 1 or equal ' + 'to the third dimension of scores') + if change_coordinate_frame and clip_window is None: + raise ValueError('if change_coordinate_frame is True, then a clip_window' + 'must be specified.') + original_masks = masks + original_additional_fields = additional_fields + with tf.name_scope(scope, 'BatchMultiClassNonMaxSuppression'): + boxes_shape = boxes.shape + batch_size = boxes_shape[0].value + num_anchors = boxes_shape[1].value + + if batch_size is None: + batch_size = tf.shape(boxes)[0] + if num_anchors is None: + num_anchors = tf.shape(boxes)[1] + + # If num valid boxes aren't provided, create one and mark all boxes as + # valid. + if num_valid_boxes is None: + num_valid_boxes = tf.ones([batch_size], dtype=tf.int32) * num_anchors + + # If masks aren't provided, create dummy masks so we can only have one copy + # of _single_image_nms_fn and discard the dummy masks after map_fn. + if masks is None: + masks_shape = tf.stack([batch_size, num_anchors, 1, 0, 0]) + masks = tf.zeros(masks_shape) + + if clip_window is None: + clip_window = tf.stack([ + tf.reduce_min(boxes[:, :, :, 0]), + tf.reduce_min(boxes[:, :, :, 1]), + tf.reduce_max(boxes[:, :, :, 2]), + tf.reduce_max(boxes[:, :, :, 3]) + ]) + if clip_window.shape.ndims == 1: + clip_window = tf.tile(tf.expand_dims(clip_window, 0), [batch_size, 1]) + + if additional_fields is None: + additional_fields = {} + + def _single_image_nms_fn(args): + """Runs NMS on a single image and returns padded output. + + Args: + args: A list of tensors consisting of the following: + per_image_boxes - A [num_anchors, q, 4] float32 tensor containing + detections. If `q` is 1 then same boxes are used for all classes + otherwise, if `q` is equal to number of classes, class-specific + boxes are used. + per_image_scores - A [num_anchors, num_classes] float32 tensor + containing the scores for each of the `num_anchors` detections. + per_image_masks - A [num_anchors, q, mask_height, mask_width] float32 + tensor containing box masks. `q` can be either number of classes + or 1 depending on whether a separate mask is predicted per class. + per_image_clip_window - A 1D float32 tensor of the form + [ymin, xmin, ymax, xmax] representing the window to clip the boxes + to. 
+ per_image_additional_fields - (optional) A variable number of float32 + tensors each with size [num_anchors, ...]. + per_image_num_valid_boxes - A tensor of type `int32`. A 1-D tensor of + shape [batch_size] representing the number of valid boxes to be + considered for each image in the batch. This parameter allows for + ignoring zero paddings. + + Returns: + 'nmsed_boxes': A [max_detections, 4] float32 tensor containing the + non-max suppressed boxes. + 'nmsed_scores': A [max_detections] float32 tensor containing the scores + for the boxes. + 'nmsed_classes': A [max_detections] float32 tensor containing the class + for boxes. + 'nmsed_masks': (optional) a [max_detections, mask_height, mask_width] + float32 tensor containing masks for each selected box. This is set to + None if input `masks` is None. + 'nmsed_additional_fields': (optional) A variable number of float32 + tensors each with size [max_detections, ...] corresponding to the + input `per_image_additional_fields`. + 'num_detections': A [batch_size] int32 tensor indicating the number of + valid detections per batch item. Only the top num_detections[i] + entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The + rest of the entries are zero paddings. + """ + per_image_boxes = args[0] + per_image_scores = args[1] + per_image_masks = args[2] + per_image_clip_window = args[3] + per_image_additional_fields = { + key: value + for key, value in zip(additional_fields, args[4:-1]) + } + per_image_num_valid_boxes = args[-1] + per_image_boxes = tf.reshape( + tf.slice(per_image_boxes, 3 * [0], + tf.stack([per_image_num_valid_boxes, -1, -1])), [-1, q, 4]) + per_image_scores = tf.reshape( + tf.slice(per_image_scores, [0, 0], + tf.stack([per_image_num_valid_boxes, -1])), + [-1, num_classes]) + per_image_masks = tf.reshape( + tf.slice(per_image_masks, 4 * [0], + tf.stack([per_image_num_valid_boxes, -1, -1, -1])), + [-1, q, per_image_masks.shape[2].value, + per_image_masks.shape[3].value]) + if per_image_additional_fields is not None: + for key, tensor in per_image_additional_fields.items(): + additional_field_shape = tensor.get_shape() + additional_field_dim = len(additional_field_shape) + per_image_additional_fields[key] = tf.reshape( + tf.slice(per_image_additional_fields[key], + additional_field_dim * [0], + tf.stack([per_image_num_valid_boxes] + + (additional_field_dim - 1) * [-1])), + [-1] + [dim.value for dim in additional_field_shape[1:]]) + nmsed_boxlist = multiclass_non_max_suppression( + per_image_boxes, + per_image_scores, + score_thresh, + iou_thresh, + max_size_per_class, + max_total_size, + clip_window=per_image_clip_window, + change_coordinate_frame=change_coordinate_frame, + masks=per_image_masks, + additional_fields=per_image_additional_fields) + padded_boxlist = box_list_ops.pad_or_clip_box_list(nmsed_boxlist, + max_total_size) + num_detections = nmsed_boxlist.num_boxes() + nmsed_boxes = padded_boxlist.get() + nmsed_scores = padded_boxlist.get_field(fields.BoxListFields.scores) + nmsed_classes = padded_boxlist.get_field(fields.BoxListFields.classes) + nmsed_masks = padded_boxlist.get_field(fields.BoxListFields.masks) + nmsed_additional_fields = [ + padded_boxlist.get_field(key) for key in per_image_additional_fields + ] + return ([nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks] + + nmsed_additional_fields + [num_detections]) + + num_additional_fields = 0 + if additional_fields is not None: + num_additional_fields = len(additional_fields) + num_nmsed_outputs = 4 + num_additional_fields + + batch_outputs = 
shape_utils.static_or_dynamic_map_fn( + _single_image_nms_fn, + elems=([boxes, scores, masks, clip_window] + + list(additional_fields.values()) + [num_valid_boxes]), + dtype=(num_nmsed_outputs * [tf.float32] + [tf.int32]), + parallel_iterations=parallel_iterations) + + batch_nmsed_boxes = batch_outputs[0] + batch_nmsed_scores = batch_outputs[1] + batch_nmsed_classes = batch_outputs[2] + batch_nmsed_masks = batch_outputs[3] + batch_nmsed_additional_fields = { + key: value + for key, value in zip(additional_fields, batch_outputs[4:-1]) + } + batch_num_detections = batch_outputs[-1] + + if original_masks is None: + batch_nmsed_masks = None + + if original_additional_fields is None: + batch_nmsed_additional_fields = None + + return (batch_nmsed_boxes, batch_nmsed_scores, batch_nmsed_classes, + batch_nmsed_masks, batch_nmsed_additional_fields, + batch_num_detections) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/post_processing_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/post_processing_test.py new file mode 100644 index 0000000000000000000000000000000000000000..9674139967f933192026c2245a82bf0026a732fe --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/post_processing_test.py @@ -0,0 +1,1078 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
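(Editorial aside, not part of the patch.) `post_processing.py` ends here and the next file is its unit test. As a bridge, this is a hedged sketch of the batched entry point, mirroring the `batch_size=1` test that follows; the boxes, scores, and thresholds are illustrative, and TF 1.x graph mode is assumed.

```python
import tensorflow as tf

from object_detection.core import post_processing

# [batch_size, num_anchors, q, 4] boxes (q = 1, shared across classes) and
# [batch_size, num_anchors, num_classes] scores for a single-image batch.
boxes = tf.constant([[[[0.0, 0.0, 1.0, 1.0]],
                      [[0.0, 0.1, 1.0, 1.1]],
                      [[0.0, 10.0, 1.0, 11.0]]]], tf.float32)
scores = tf.constant([[[0.9, 0.01],
                       [0.75, 0.05],
                       [0.1, 0.8]]])

(nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks,
 nmsed_additional_fields, num_detections) = (
     post_processing.batch_multiclass_non_max_suppression(
         boxes, scores, score_thresh=0.1, iou_thresh=0.5,
         max_size_per_class=4, max_total_size=4))

# nmsed_masks and nmsed_additional_fields come back as None because neither
# masks nor additional_fields were supplied; only the first num_detections[i]
# rows of each per-image output are valid, the rest are zero padding.
with tf.Session() as sess:
  outputs = sess.run(
      [nmsed_boxes, nmsed_scores, nmsed_classes, num_detections])
```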
+# ============================================================================== + +"""Tests for tensorflow_models.object_detection.core.post_processing.""" +import numpy as np +import tensorflow as tf +from object_detection.core import post_processing +from object_detection.core import standard_fields as fields + + +class MulticlassNonMaxSuppressionTest(tf.test.TestCase): + + def test_with_invalid_scores_size(self): + boxes = tf.constant([[[0, 0, 1, 1]], + [[0, 0.1, 1, 1.1]], + [[0, -0.1, 1, 0.9]], + [[0, 10, 1, 11]], + [[0, 10.1, 1, 11.1]], + [[0, 100, 1, 101]]], tf.float32) + scores = tf.constant([[.9], [.75], [.6], [.95], [.5]]) + iou_thresh = .5 + score_thresh = 0.6 + max_output_size = 3 + nms = post_processing.multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, max_output_size) + with self.test_session() as sess: + with self.assertRaisesWithPredicateMatch( + tf.errors.InvalidArgumentError, 'Incorrect scores field length'): + sess.run(nms.get()) + + def test_multiclass_nms_select_with_shared_boxes(self): + boxes = tf.constant([[[0, 0, 1, 1]], + [[0, 0.1, 1, 1.1]], + [[0, -0.1, 1, 0.9]], + [[0, 10, 1, 11]], + [[0, 10.1, 1, 11.1]], + [[0, 100, 1, 101]], + [[0, 1000, 1, 1002]], + [[0, 1000, 1, 1002.1]]], tf.float32) + scores = tf.constant([[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0], + [.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = [[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 1000, 1, 1002], + [0, 100, 1, 101]] + exp_nms_scores = [.95, .9, .85, .3] + exp_nms_classes = [0, 0, 1, 0] + + nms = post_processing.multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, max_output_size) + with self.test_session() as sess: + nms_corners_output, nms_scores_output, nms_classes_output = sess.run( + [nms.get(), nms.get_field(fields.BoxListFields.scores), + nms.get_field(fields.BoxListFields.classes)]) + self.assertAllClose(nms_corners_output, exp_nms_corners) + self.assertAllClose(nms_scores_output, exp_nms_scores) + self.assertAllClose(nms_classes_output, exp_nms_classes) + + def test_multiclass_nms_select_with_shared_boxes_given_keypoints(self): + boxes = tf.constant([[[0, 0, 1, 1]], + [[0, 0.1, 1, 1.1]], + [[0, -0.1, 1, 0.9]], + [[0, 10, 1, 11]], + [[0, 10.1, 1, 11.1]], + [[0, 100, 1, 101]], + [[0, 1000, 1, 1002]], + [[0, 1000, 1, 1002.1]]], tf.float32) + scores = tf.constant([[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0], + [.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]) + num_keypoints = 6 + keypoints = tf.tile( + tf.reshape(tf.range(8), [8, 1, 1]), + [1, num_keypoints, 2]) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = [[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 1000, 1, 1002], + [0, 100, 1, 101]] + exp_nms_scores = [.95, .9, .85, .3] + exp_nms_classes = [0, 0, 1, 0] + exp_nms_keypoints_tensor = tf.tile( + tf.reshape(tf.constant([3, 0, 6, 5], dtype=tf.float32), [4, 1, 1]), + [1, num_keypoints, 2]) + + nms = post_processing.multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, max_output_size, + additional_fields={ + fields.BoxListFields.keypoints: keypoints}) + + with self.test_session() as sess: + (nms_corners_output, + nms_scores_output, + nms_classes_output, + nms_keypoints, + exp_nms_keypoints) = sess.run([ + nms.get(), + nms.get_field(fields.BoxListFields.scores), + nms.get_field(fields.BoxListFields.classes), + nms.get_field(fields.BoxListFields.keypoints), + exp_nms_keypoints_tensor + ]) + 
self.assertAllClose(nms_corners_output, exp_nms_corners) + self.assertAllClose(nms_scores_output, exp_nms_scores) + self.assertAllClose(nms_classes_output, exp_nms_classes) + self.assertAllEqual(nms_keypoints, exp_nms_keypoints) + + def test_multiclass_nms_with_shared_boxes_given_keypoint_heatmaps(self): + boxes = tf.constant([[[0, 0, 1, 1]], + [[0, 0.1, 1, 1.1]], + [[0, -0.1, 1, 0.9]], + [[0, 10, 1, 11]], + [[0, 10.1, 1, 11.1]], + [[0, 100, 1, 101]], + [[0, 1000, 1, 1002]], + [[0, 1000, 1, 1002.1]]], tf.float32) + + scores = tf.constant([[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0], + [.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]) + + num_boxes = tf.shape(boxes)[0] + heatmap_height = 5 + heatmap_width = 5 + num_keypoints = 17 + keypoint_heatmaps = tf.ones( + [num_boxes, heatmap_height, heatmap_width, num_keypoints], + dtype=tf.float32) + + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + exp_nms_corners = [[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 1000, 1, 1002], + [0, 100, 1, 101]] + + exp_nms_scores = [.95, .9, .85, .3] + exp_nms_classes = [0, 0, 1, 0] + exp_nms_keypoint_heatmaps = np.ones( + (4, heatmap_height, heatmap_width, num_keypoints), dtype=np.float32) + + nms = post_processing.multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, max_output_size, + additional_fields={ + fields.BoxListFields.keypoint_heatmaps: keypoint_heatmaps}) + + with self.test_session() as sess: + (nms_corners_output, + nms_scores_output, + nms_classes_output, + nms_keypoint_heatmaps) = sess.run( + [nms.get(), + nms.get_field(fields.BoxListFields.scores), + nms.get_field(fields.BoxListFields.classes), + nms.get_field(fields.BoxListFields.keypoint_heatmaps)]) + + self.assertAllClose(nms_corners_output, exp_nms_corners) + self.assertAllClose(nms_scores_output, exp_nms_scores) + self.assertAllClose(nms_classes_output, exp_nms_classes) + self.assertAllEqual(nms_keypoint_heatmaps, exp_nms_keypoint_heatmaps) + + def test_multiclass_nms_with_additional_fields(self): + boxes = tf.constant([[[0, 0, 1, 1]], + [[0, 0.1, 1, 1.1]], + [[0, -0.1, 1, 0.9]], + [[0, 10, 1, 11]], + [[0, 10.1, 1, 11.1]], + [[0, 100, 1, 101]], + [[0, 1000, 1, 1002]], + [[0, 1000, 1, 1002.1]]], tf.float32) + + scores = tf.constant([[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0], + [.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]) + + coarse_boxes_key = 'coarse_boxes' + coarse_boxes = tf.constant([[0.1, 0.1, 1.1, 1.1], + [0.1, 0.2, 1.1, 1.2], + [0.1, -0.2, 1.1, 1.0], + [0.1, 10.1, 1.1, 11.1], + [0.1, 10.2, 1.1, 11.2], + [0.1, 100.1, 1.1, 101.1], + [0.1, 1000.1, 1.1, 1002.1], + [0.1, 1000.1, 1.1, 1002.2]], tf.float32) + + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = np.array([[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 1000, 1, 1002], + [0, 100, 1, 101]], dtype=np.float32) + + exp_nms_coarse_corners = np.array([[0.1, 10.1, 1.1, 11.1], + [0.1, 0.1, 1.1, 1.1], + [0.1, 1000.1, 1.1, 1002.1], + [0.1, 100.1, 1.1, 101.1]], + dtype=np.float32) + + exp_nms_scores = [.95, .9, .85, .3] + exp_nms_classes = [0, 0, 1, 0] + + nms = post_processing.multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, max_output_size, + additional_fields={coarse_boxes_key: coarse_boxes}) + + with self.test_session() as sess: + (nms_corners_output, + nms_scores_output, + nms_classes_output, + nms_coarse_corners) = sess.run( + [nms.get(), + nms.get_field(fields.BoxListFields.scores), + nms.get_field(fields.BoxListFields.classes), + nms.get_field(coarse_boxes_key)]) + + 
self.assertAllClose(nms_corners_output, exp_nms_corners) + self.assertAllClose(nms_scores_output, exp_nms_scores) + self.assertAllClose(nms_classes_output, exp_nms_classes) + self.assertAllEqual(nms_coarse_corners, exp_nms_coarse_corners) + + def test_multiclass_nms_select_with_shared_boxes_given_masks(self): + boxes = tf.constant([[[0, 0, 1, 1]], + [[0, 0.1, 1, 1.1]], + [[0, -0.1, 1, 0.9]], + [[0, 10, 1, 11]], + [[0, 10.1, 1, 11.1]], + [[0, 100, 1, 101]], + [[0, 1000, 1, 1002]], + [[0, 1000, 1, 1002.1]]], tf.float32) + scores = tf.constant([[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0], + [.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]) + num_classes = 2 + mask_height = 3 + mask_width = 3 + masks = tf.tile( + tf.reshape(tf.range(8), [8, 1, 1, 1]), + [1, num_classes, mask_height, mask_width]) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = [[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 1000, 1, 1002], + [0, 100, 1, 101]] + exp_nms_scores = [.95, .9, .85, .3] + exp_nms_classes = [0, 0, 1, 0] + exp_nms_masks_tensor = tf.tile( + tf.reshape(tf.constant([3, 0, 6, 5], dtype=tf.float32), [4, 1, 1]), + [1, mask_height, mask_width]) + + nms = post_processing.multiclass_non_max_suppression(boxes, scores, + score_thresh, + iou_thresh, + max_output_size, + masks=masks) + with self.test_session() as sess: + (nms_corners_output, + nms_scores_output, + nms_classes_output, + nms_masks, + exp_nms_masks) = sess.run([nms.get(), + nms.get_field(fields.BoxListFields.scores), + nms.get_field(fields.BoxListFields.classes), + nms.get_field(fields.BoxListFields.masks), + exp_nms_masks_tensor]) + self.assertAllClose(nms_corners_output, exp_nms_corners) + self.assertAllClose(nms_scores_output, exp_nms_scores) + self.assertAllClose(nms_classes_output, exp_nms_classes) + self.assertAllEqual(nms_masks, exp_nms_masks) + + def test_multiclass_nms_select_with_clip_window(self): + boxes = tf.constant([[[0, 0, 10, 10]], + [[1, 1, 11, 11]]], tf.float32) + scores = tf.constant([[.9], [.75]]) + clip_window = tf.constant([5, 4, 8, 7], tf.float32) + score_thresh = 0.0 + iou_thresh = 0.5 + max_output_size = 100 + + exp_nms_corners = [[5, 4, 8, 7]] + exp_nms_scores = [.9] + exp_nms_classes = [0] + + nms = post_processing.multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, max_output_size, + clip_window=clip_window) + with self.test_session() as sess: + nms_corners_output, nms_scores_output, nms_classes_output = sess.run( + [nms.get(), nms.get_field(fields.BoxListFields.scores), + nms.get_field(fields.BoxListFields.classes)]) + self.assertAllClose(nms_corners_output, exp_nms_corners) + self.assertAllClose(nms_scores_output, exp_nms_scores) + self.assertAllClose(nms_classes_output, exp_nms_classes) + + def test_multiclass_nms_select_with_clip_window_change_coordinate_frame(self): + boxes = tf.constant([[[0, 0, 10, 10]], + [[1, 1, 11, 11]]], tf.float32) + scores = tf.constant([[.9], [.75]]) + clip_window = tf.constant([5, 4, 8, 7], tf.float32) + score_thresh = 0.0 + iou_thresh = 0.5 + max_output_size = 100 + + exp_nms_corners = [[0, 0, 1, 1]] + exp_nms_scores = [.9] + exp_nms_classes = [0] + + nms = post_processing.multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, max_output_size, + clip_window=clip_window, change_coordinate_frame=True) + with self.test_session() as sess: + nms_corners_output, nms_scores_output, nms_classes_output = sess.run( + [nms.get(), nms.get_field(fields.BoxListFields.scores), + nms.get_field(fields.BoxListFields.classes)]) + 
self.assertAllClose(nms_corners_output, exp_nms_corners) + self.assertAllClose(nms_scores_output, exp_nms_scores) + self.assertAllClose(nms_classes_output, exp_nms_classes) + + def test_multiclass_nms_select_with_per_class_cap(self): + boxes = tf.constant([[[0, 0, 1, 1]], + [[0, 0.1, 1, 1.1]], + [[0, -0.1, 1, 0.9]], + [[0, 10, 1, 11]], + [[0, 10.1, 1, 11.1]], + [[0, 100, 1, 101]], + [[0, 1000, 1, 1002]], + [[0, 1000, 1, 1002.1]]], tf.float32) + scores = tf.constant([[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0], + [.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]) + score_thresh = 0.1 + iou_thresh = .5 + max_size_per_class = 2 + + exp_nms_corners = [[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 1000, 1, 1002]] + exp_nms_scores = [.95, .9, .85] + exp_nms_classes = [0, 0, 1] + + nms = post_processing.multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, max_size_per_class) + with self.test_session() as sess: + nms_corners_output, nms_scores_output, nms_classes_output = sess.run( + [nms.get(), nms.get_field(fields.BoxListFields.scores), + nms.get_field(fields.BoxListFields.classes)]) + self.assertAllClose(nms_corners_output, exp_nms_corners) + self.assertAllClose(nms_scores_output, exp_nms_scores) + self.assertAllClose(nms_classes_output, exp_nms_classes) + + def test_multiclass_nms_select_with_total_cap(self): + boxes = tf.constant([[[0, 0, 1, 1]], + [[0, 0.1, 1, 1.1]], + [[0, -0.1, 1, 0.9]], + [[0, 10, 1, 11]], + [[0, 10.1, 1, 11.1]], + [[0, 100, 1, 101]], + [[0, 1000, 1, 1002]], + [[0, 1000, 1, 1002.1]]], tf.float32) + scores = tf.constant([[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0], + [.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]) + score_thresh = 0.1 + iou_thresh = .5 + max_size_per_class = 4 + max_total_size = 2 + + exp_nms_corners = [[0, 10, 1, 11], + [0, 0, 1, 1]] + exp_nms_scores = [.95, .9] + exp_nms_classes = [0, 0] + + nms = post_processing.multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, max_size_per_class, + max_total_size) + with self.test_session() as sess: + nms_corners_output, nms_scores_output, nms_classes_output = sess.run( + [nms.get(), nms.get_field(fields.BoxListFields.scores), + nms.get_field(fields.BoxListFields.classes)]) + self.assertAllClose(nms_corners_output, exp_nms_corners) + self.assertAllClose(nms_scores_output, exp_nms_scores) + self.assertAllClose(nms_classes_output, exp_nms_classes) + + def test_multiclass_nms_threshold_then_select_with_shared_boxes(self): + boxes = tf.constant([[[0, 0, 1, 1]], + [[0, 0.1, 1, 1.1]], + [[0, -0.1, 1, 0.9]], + [[0, 10, 1, 11]], + [[0, 10.1, 1, 11.1]], + [[0, 100, 1, 101]], + [[0, 1000, 1, 1002]], + [[0, 1000, 1, 1002.1]]], tf.float32) + scores = tf.constant([[.9], [.75], [.6], [.95], [.5], [.3], [.01], [.01]]) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 3 + + exp_nms = [[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 100, 1, 101]] + nms = post_processing.multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, max_output_size) + with self.test_session() as sess: + nms_output = sess.run(nms.get()) + self.assertAllClose(nms_output, exp_nms) + + def test_multiclass_nms_select_with_separate_boxes(self): + boxes = tf.constant([[[0, 0, 1, 1], [0, 0, 4, 5]], + [[0, 0.1, 1, 1.1], [0, 0.1, 2, 1.1]], + [[0, -0.1, 1, 0.9], [0, -0.1, 1, 0.9]], + [[0, 10, 1, 11], [0, 10, 1, 11]], + [[0, 10.1, 1, 11.1], [0, 10.1, 1, 11.1]], + [[0, 100, 1, 101], [0, 100, 1, 101]], + [[0, 1000, 1, 1002], [0, 999, 2, 1004]], + [[0, 1000, 1, 1002.1], [0, 999, 2, 1002.7]]], + tf.float32) + 
scores = tf.constant([[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0], + [.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = [[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 999, 2, 1004], + [0, 100, 1, 101]] + exp_nms_scores = [.95, .9, .85, .3] + exp_nms_classes = [0, 0, 1, 0] + + nms = post_processing.multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, max_output_size) + with self.test_session() as sess: + nms_corners_output, nms_scores_output, nms_classes_output = sess.run( + [nms.get(), nms.get_field(fields.BoxListFields.scores), + nms.get_field(fields.BoxListFields.classes)]) + self.assertAllClose(nms_corners_output, exp_nms_corners) + self.assertAllClose(nms_scores_output, exp_nms_scores) + self.assertAllClose(nms_classes_output, exp_nms_classes) + + def test_batch_multiclass_nms_with_batch_size_1(self): + boxes = tf.constant([[[[0, 0, 1, 1], [0, 0, 4, 5]], + [[0, 0.1, 1, 1.1], [0, 0.1, 2, 1.1]], + [[0, -0.1, 1, 0.9], [0, -0.1, 1, 0.9]], + [[0, 10, 1, 11], [0, 10, 1, 11]], + [[0, 10.1, 1, 11.1], [0, 10.1, 1, 11.1]], + [[0, 100, 1, 101], [0, 100, 1, 101]], + [[0, 1000, 1, 1002], [0, 999, 2, 1004]], + [[0, 1000, 1, 1002.1], [0, 999, 2, 1002.7]]]], + tf.float32) + scores = tf.constant([[[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0], + [.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]]) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = [[[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 999, 2, 1004], + [0, 100, 1, 101]]] + exp_nms_scores = [[.95, .9, .85, .3]] + exp_nms_classes = [[0, 0, 1, 0]] + + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, + nmsed_additional_fields, num_detections + ) = post_processing.batch_multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, + max_size_per_class=max_output_size, max_total_size=max_output_size) + + self.assertIsNone(nmsed_masks) + self.assertIsNone(nmsed_additional_fields) + + with self.test_session() as sess: + (nmsed_boxes, nmsed_scores, nmsed_classes, + num_detections) = sess.run([nmsed_boxes, nmsed_scores, nmsed_classes, + num_detections]) + self.assertAllClose(nmsed_boxes, exp_nms_corners) + self.assertAllClose(nmsed_scores, exp_nms_scores) + self.assertAllClose(nmsed_classes, exp_nms_classes) + self.assertEqual(num_detections, [4]) + + def test_batch_multiclass_nms_with_batch_size_2(self): + boxes = tf.constant([[[[0, 0, 1, 1], [0, 0, 4, 5]], + [[0, 0.1, 1, 1.1], [0, 0.1, 2, 1.1]], + [[0, -0.1, 1, 0.9], [0, -0.1, 1, 0.9]], + [[0, 10, 1, 11], [0, 10, 1, 11]]], + [[[0, 10.1, 1, 11.1], [0, 10.1, 1, 11.1]], + [[0, 100, 1, 101], [0, 100, 1, 101]], + [[0, 1000, 1, 1002], [0, 999, 2, 1004]], + [[0, 1000, 1, 1002.1], [0, 999, 2, 1002.7]]]], + tf.float32) + scores = tf.constant([[[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0]], + [[.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]]) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = np.array([[[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 0, 0, 0], + [0, 0, 0, 0]], + [[0, 999, 2, 1004], + [0, 10.1, 1, 11.1], + [0, 100, 1, 101], + [0, 0, 0, 0]]]) + exp_nms_scores = np.array([[.95, .9, 0, 0], + [.85, .5, .3, 0]]) + exp_nms_classes = np.array([[0, 0, 0, 0], + [1, 0, 0, 0]]) + + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, + nmsed_additional_fields, num_detections + ) = post_processing.batch_multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, + max_size_per_class=max_output_size, 
max_total_size=max_output_size) + + self.assertIsNone(nmsed_masks) + self.assertIsNone(nmsed_additional_fields) + # Check static shapes + self.assertAllEqual(nmsed_boxes.shape.as_list(), + exp_nms_corners.shape) + self.assertAllEqual(nmsed_scores.shape.as_list(), + exp_nms_scores.shape) + self.assertAllEqual(nmsed_classes.shape.as_list(), + exp_nms_classes.shape) + self.assertEqual(num_detections.shape.as_list(), [2]) + + with self.test_session() as sess: + (nmsed_boxes, nmsed_scores, nmsed_classes, + num_detections) = sess.run([nmsed_boxes, nmsed_scores, nmsed_classes, + num_detections]) + self.assertAllClose(nmsed_boxes, exp_nms_corners) + self.assertAllClose(nmsed_scores, exp_nms_scores) + self.assertAllClose(nmsed_classes, exp_nms_classes) + self.assertAllClose(num_detections, [2, 3]) + + def test_batch_multiclass_nms_with_per_batch_clip_window(self): + boxes = tf.constant([[[[0, 0, 1, 1], [0, 0, 4, 5]], + [[0, 0.1, 1, 1.1], [0, 0.1, 2, 1.1]], + [[0, -0.1, 1, 0.9], [0, -0.1, 1, 0.9]], + [[0, 10, 1, 11], [0, 10, 1, 11]]], + [[[0, 10.1, 1, 11.1], [0, 10.1, 1, 11.1]], + [[0, 100, 1, 101], [0, 100, 1, 101]], + [[0, 1000, 1, 1002], [0, 999, 2, 1004]], + [[0, 1000, 1, 1002.1], [0, 999, 2, 1002.7]]]], + tf.float32) + scores = tf.constant([[[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0]], + [[.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]]) + clip_window = tf.constant([0., 0., 200., 200.]) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = np.array([[[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 0, 0, 0], + [0, 0, 0, 0]], + [[0, 10.1, 1, 11.1], + [0, 100, 1, 101], + [0, 0, 0, 0], + [0, 0, 0, 0]]]) + exp_nms_scores = np.array([[.95, .9, 0, 0], + [.5, .3, 0, 0]]) + exp_nms_classes = np.array([[0, 0, 0, 0], + [0, 0, 0, 0]]) + + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, + nmsed_additional_fields, num_detections + ) = post_processing.batch_multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, + max_size_per_class=max_output_size, max_total_size=max_output_size, + clip_window=clip_window) + + self.assertIsNone(nmsed_masks) + self.assertIsNone(nmsed_additional_fields) + # Check static shapes + self.assertAllEqual(nmsed_boxes.shape.as_list(), + exp_nms_corners.shape) + self.assertAllEqual(nmsed_scores.shape.as_list(), + exp_nms_scores.shape) + self.assertAllEqual(nmsed_classes.shape.as_list(), + exp_nms_classes.shape) + self.assertEqual(num_detections.shape.as_list(), [2]) + + with self.test_session() as sess: + (nmsed_boxes, nmsed_scores, nmsed_classes, + num_detections) = sess.run([nmsed_boxes, nmsed_scores, nmsed_classes, + num_detections]) + self.assertAllClose(nmsed_boxes, exp_nms_corners) + self.assertAllClose(nmsed_scores, exp_nms_scores) + self.assertAllClose(nmsed_classes, exp_nms_classes) + self.assertAllClose(num_detections, [2, 2]) + + def test_batch_multiclass_nms_with_per_image_clip_window(self): + boxes = tf.constant([[[[0, 0, 1, 1], [0, 0, 4, 5]], + [[0, 0.1, 1, 1.1], [0, 0.1, 2, 1.1]], + [[0, -0.1, 1, 0.9], [0, -0.1, 1, 0.9]], + [[0, 10, 1, 11], [0, 10, 1, 11]]], + [[[0, 10.1, 1, 11.1], [0, 10.1, 1, 11.1]], + [[0, 100, 1, 101], [0, 100, 1, 101]], + [[0, 1000, 1, 1002], [0, 999, 2, 1004]], + [[0, 1000, 1, 1002.1], [0, 999, 2, 1002.7]]]], + tf.float32) + scores = tf.constant([[[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0]], + [[.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]]) + clip_window = tf.constant([[0., 0., 5., 5.], + [0., 0., 200., 200.]]) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + 
exp_nms_corners = np.array([[[0, 0, 1, 1], + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0]], + [[0, 10.1, 1, 11.1], + [0, 100, 1, 101], + [0, 0, 0, 0], + [0, 0, 0, 0]]]) + exp_nms_scores = np.array([[.9, 0., 0., 0.], + [.5, .3, 0, 0]]) + exp_nms_classes = np.array([[0, 0, 0, 0], + [0, 0, 0, 0]]) + + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, + nmsed_additional_fields, num_detections + ) = post_processing.batch_multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, + max_size_per_class=max_output_size, max_total_size=max_output_size, + clip_window=clip_window) + + self.assertIsNone(nmsed_masks) + self.assertIsNone(nmsed_additional_fields) + # Check static shapes + self.assertAllEqual(nmsed_boxes.shape.as_list(), + exp_nms_corners.shape) + self.assertAllEqual(nmsed_scores.shape.as_list(), + exp_nms_scores.shape) + self.assertAllEqual(nmsed_classes.shape.as_list(), + exp_nms_classes.shape) + self.assertEqual(num_detections.shape.as_list(), [2]) + + with self.test_session() as sess: + (nmsed_boxes, nmsed_scores, nmsed_classes, + num_detections) = sess.run([nmsed_boxes, nmsed_scores, nmsed_classes, + num_detections]) + self.assertAllClose(nmsed_boxes, exp_nms_corners) + self.assertAllClose(nmsed_scores, exp_nms_scores) + self.assertAllClose(nmsed_classes, exp_nms_classes) + self.assertAllClose(num_detections, [1, 2]) + + def test_batch_multiclass_nms_with_masks(self): + boxes = tf.constant([[[[0, 0, 1, 1], [0, 0, 4, 5]], + [[0, 0.1, 1, 1.1], [0, 0.1, 2, 1.1]], + [[0, -0.1, 1, 0.9], [0, -0.1, 1, 0.9]], + [[0, 10, 1, 11], [0, 10, 1, 11]]], + [[[0, 10.1, 1, 11.1], [0, 10.1, 1, 11.1]], + [[0, 100, 1, 101], [0, 100, 1, 101]], + [[0, 1000, 1, 1002], [0, 999, 2, 1004]], + [[0, 1000, 1, 1002.1], [0, 999, 2, 1002.7]]]], + tf.float32) + scores = tf.constant([[[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0]], + [[.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]]) + masks = tf.constant([[[[[0, 1], [2, 3]], [[1, 2], [3, 4]]], + [[[2, 3], [4, 5]], [[3, 4], [5, 6]]], + [[[4, 5], [6, 7]], [[5, 6], [7, 8]]], + [[[6, 7], [8, 9]], [[7, 8], [9, 10]]]], + [[[[8, 9], [10, 11]], [[9, 10], [11, 12]]], + [[[10, 11], [12, 13]], [[11, 12], [13, 14]]], + [[[12, 13], [14, 15]], [[13, 14], [15, 16]]], + [[[14, 15], [16, 17]], [[15, 16], [17, 18]]]]], + tf.float32) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = np.array([[[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 0, 0, 0], + [0, 0, 0, 0]], + [[0, 999, 2, 1004], + [0, 10.1, 1, 11.1], + [0, 100, 1, 101], + [0, 0, 0, 0]]]) + exp_nms_scores = np.array([[.95, .9, 0, 0], + [.85, .5, .3, 0]]) + exp_nms_classes = np.array([[0, 0, 0, 0], + [1, 0, 0, 0]]) + exp_nms_masks = np.array([[[[6, 7], [8, 9]], + [[0, 1], [2, 3]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]]], + [[[13, 14], [15, 16]], + [[8, 9], [10, 11]], + [[10, 11], [12, 13]], + [[0, 0], [0, 0]]]]) + + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, + nmsed_additional_fields, num_detections + ) = post_processing.batch_multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, + max_size_per_class=max_output_size, max_total_size=max_output_size, + masks=masks) + + self.assertIsNone(nmsed_additional_fields) + # Check static shapes + self.assertAllEqual(nmsed_boxes.shape.as_list(), exp_nms_corners.shape) + self.assertAllEqual(nmsed_scores.shape.as_list(), exp_nms_scores.shape) + self.assertAllEqual(nmsed_classes.shape.as_list(), exp_nms_classes.shape) + self.assertAllEqual(nmsed_masks.shape.as_list(), exp_nms_masks.shape) + 
self.assertEqual(num_detections.shape.as_list(), [2]) + + with self.test_session() as sess: + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, + num_detections) = sess.run([nmsed_boxes, nmsed_scores, nmsed_classes, + nmsed_masks, num_detections]) + + self.assertAllClose(nmsed_boxes, exp_nms_corners) + self.assertAllClose(nmsed_scores, exp_nms_scores) + self.assertAllClose(nmsed_classes, exp_nms_classes) + self.assertAllClose(num_detections, [2, 3]) + self.assertAllClose(nmsed_masks, exp_nms_masks) + + def test_batch_multiclass_nms_with_additional_fields(self): + boxes = tf.constant([[[[0, 0, 1, 1], [0, 0, 4, 5]], + [[0, 0.1, 1, 1.1], [0, 0.1, 2, 1.1]], + [[0, -0.1, 1, 0.9], [0, -0.1, 1, 0.9]], + [[0, 10, 1, 11], [0, 10, 1, 11]]], + [[[0, 10.1, 1, 11.1], [0, 10.1, 1, 11.1]], + [[0, 100, 1, 101], [0, 100, 1, 101]], + [[0, 1000, 1, 1002], [0, 999, 2, 1004]], + [[0, 1000, 1, 1002.1], [0, 999, 2, 1002.7]]]], + tf.float32) + scores = tf.constant([[[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0]], + [[.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]]) + additional_fields = { + 'keypoints': tf.constant( + [[[[6, 7], [8, 9]], + [[0, 1], [2, 3]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]]], + [[[13, 14], [15, 16]], + [[8, 9], [10, 11]], + [[10, 11], [12, 13]], + [[0, 0], [0, 0]]]], + tf.float32) + } + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = np.array([[[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 0, 0, 0], + [0, 0, 0, 0]], + [[0, 999, 2, 1004], + [0, 10.1, 1, 11.1], + [0, 100, 1, 101], + [0, 0, 0, 0]]]) + exp_nms_scores = np.array([[.95, .9, 0, 0], + [.85, .5, .3, 0]]) + exp_nms_classes = np.array([[0, 0, 0, 0], + [1, 0, 0, 0]]) + exp_nms_additional_fields = { + 'keypoints': np.array([[[[0, 0], [0, 0]], + [[6, 7], [8, 9]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]]], + [[[10, 11], [12, 13]], + [[13, 14], [15, 16]], + [[8, 9], [10, 11]], + [[0, 0], [0, 0]]]]) + } + + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, + nmsed_additional_fields, num_detections + ) = post_processing.batch_multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, + max_size_per_class=max_output_size, max_total_size=max_output_size, + additional_fields=additional_fields) + + self.assertIsNone(nmsed_masks) + # Check static shapes + self.assertAllEqual(nmsed_boxes.shape.as_list(), exp_nms_corners.shape) + self.assertAllEqual(nmsed_scores.shape.as_list(), exp_nms_scores.shape) + self.assertAllEqual(nmsed_classes.shape.as_list(), exp_nms_classes.shape) + self.assertEqual(len(nmsed_additional_fields), + len(exp_nms_additional_fields)) + for key in exp_nms_additional_fields: + self.assertAllEqual(nmsed_additional_fields[key].shape.as_list(), + exp_nms_additional_fields[key].shape) + self.assertEqual(num_detections.shape.as_list(), [2]) + + with self.test_session() as sess: + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_additional_fields, + num_detections) = sess.run([nmsed_boxes, nmsed_scores, nmsed_classes, + nmsed_additional_fields, num_detections]) + + self.assertAllClose(nmsed_boxes, exp_nms_corners) + self.assertAllClose(nmsed_scores, exp_nms_scores) + self.assertAllClose(nmsed_classes, exp_nms_classes) + for key in exp_nms_additional_fields: + self.assertAllClose(nmsed_additional_fields[key], + exp_nms_additional_fields[key]) + self.assertAllClose(num_detections, [2, 3]) + + def test_batch_multiclass_nms_with_dynamic_batch_size(self): + boxes_placeholder = tf.placeholder(tf.float32, shape=(None, None, 2, 4)) + scores_placeholder = tf.placeholder(tf.float32, 
shape=(None, None, 2)) + masks_placeholder = tf.placeholder(tf.float32, shape=(None, None, 2, 2, 2)) + + boxes = np.array([[[[0, 0, 1, 1], [0, 0, 4, 5]], + [[0, 0.1, 1, 1.1], [0, 0.1, 2, 1.1]], + [[0, -0.1, 1, 0.9], [0, -0.1, 1, 0.9]], + [[0, 10, 1, 11], [0, 10, 1, 11]]], + [[[0, 10.1, 1, 11.1], [0, 10.1, 1, 11.1]], + [[0, 100, 1, 101], [0, 100, 1, 101]], + [[0, 1000, 1, 1002], [0, 999, 2, 1004]], + [[0, 1000, 1, 1002.1], [0, 999, 2, 1002.7]]]]) + scores = np.array([[[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0]], + [[.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]]) + masks = np.array([[[[[0, 1], [2, 3]], [[1, 2], [3, 4]]], + [[[2, 3], [4, 5]], [[3, 4], [5, 6]]], + [[[4, 5], [6, 7]], [[5, 6], [7, 8]]], + [[[6, 7], [8, 9]], [[7, 8], [9, 10]]]], + [[[[8, 9], [10, 11]], [[9, 10], [11, 12]]], + [[[10, 11], [12, 13]], [[11, 12], [13, 14]]], + [[[12, 13], [14, 15]], [[13, 14], [15, 16]]], + [[[14, 15], [16, 17]], [[15, 16], [17, 18]]]]]) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = np.array([[[0, 10, 1, 11], + [0, 0, 1, 1], + [0, 0, 0, 0], + [0, 0, 0, 0]], + [[0, 999, 2, 1004], + [0, 10.1, 1, 11.1], + [0, 100, 1, 101], + [0, 0, 0, 0]]]) + exp_nms_scores = np.array([[.95, .9, 0, 0], + [.85, .5, .3, 0]]) + exp_nms_classes = np.array([[0, 0, 0, 0], + [1, 0, 0, 0]]) + exp_nms_masks = np.array([[[[6, 7], [8, 9]], + [[0, 1], [2, 3]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]]], + [[[13, 14], [15, 16]], + [[8, 9], [10, 11]], + [[10, 11], [12, 13]], + [[0, 0], [0, 0]]]]) + + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, + nmsed_additional_fields, num_detections + ) = post_processing.batch_multiclass_non_max_suppression( + boxes_placeholder, scores_placeholder, score_thresh, iou_thresh, + max_size_per_class=max_output_size, max_total_size=max_output_size, + masks=masks_placeholder) + + self.assertIsNone(nmsed_additional_fields) + # Check static shapes + self.assertAllEqual(nmsed_boxes.shape.as_list(), [None, 4, 4]) + self.assertAllEqual(nmsed_scores.shape.as_list(), [None, 4]) + self.assertAllEqual(nmsed_classes.shape.as_list(), [None, 4]) + self.assertAllEqual(nmsed_masks.shape.as_list(), [None, 4, 2, 2]) + self.assertEqual(num_detections.shape.as_list(), [None]) + + with self.test_session() as sess: + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, + num_detections) = sess.run([nmsed_boxes, nmsed_scores, nmsed_classes, + nmsed_masks, num_detections], + feed_dict={boxes_placeholder: boxes, + scores_placeholder: scores, + masks_placeholder: masks}) + self.assertAllClose(nmsed_boxes, exp_nms_corners) + self.assertAllClose(nmsed_scores, exp_nms_scores) + self.assertAllClose(nmsed_classes, exp_nms_classes) + self.assertAllClose(num_detections, [2, 3]) + self.assertAllClose(nmsed_masks, exp_nms_masks) + + def test_batch_multiclass_nms_with_masks_and_num_valid_boxes(self): + boxes = tf.constant([[[[0, 0, 1, 1], [0, 0, 4, 5]], + [[0, 0.1, 1, 1.1], [0, 0.1, 2, 1.1]], + [[0, -0.1, 1, 0.9], [0, -0.1, 1, 0.9]], + [[0, 10, 1, 11], [0, 10, 1, 11]]], + [[[0, 10.1, 1, 11.1], [0, 10.1, 1, 11.1]], + [[0, 100, 1, 101], [0, 100, 1, 101]], + [[0, 1000, 1, 1002], [0, 999, 2, 1004]], + [[0, 1000, 1, 1002.1], [0, 999, 2, 1002.7]]]], + tf.float32) + scores = tf.constant([[[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0]], + [[.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]]) + masks = tf.constant([[[[[0, 1], [2, 3]], [[1, 2], [3, 4]]], + [[[2, 3], [4, 5]], [[3, 4], [5, 6]]], + [[[4, 5], [6, 7]], [[5, 6], [7, 8]]], + [[[6, 7], [8, 9]], [[7, 8], [9, 10]]]], + [[[[8, 9], [10, 
11]], [[9, 10], [11, 12]]], + [[[10, 11], [12, 13]], [[11, 12], [13, 14]]], + [[[12, 13], [14, 15]], [[13, 14], [15, 16]]], + [[[14, 15], [16, 17]], [[15, 16], [17, 18]]]]], + tf.float32) + num_valid_boxes = tf.constant([1, 1], tf.int32) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = [[[0, 0, 1, 1], + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0]], + [[0, 10.1, 1, 11.1], + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0]]] + exp_nms_scores = [[.9, 0, 0, 0], + [.5, 0, 0, 0]] + exp_nms_classes = [[0, 0, 0, 0], + [0, 0, 0, 0]] + exp_nms_masks = [[[[0, 1], [2, 3]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]]], + [[[8, 9], [10, 11]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]]]] + + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, + nmsed_additional_fields, num_detections + ) = post_processing.batch_multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, + max_size_per_class=max_output_size, max_total_size=max_output_size, + num_valid_boxes=num_valid_boxes, masks=masks) + + self.assertIsNone(nmsed_additional_fields) + + with self.test_session() as sess: + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, + num_detections) = sess.run([nmsed_boxes, nmsed_scores, nmsed_classes, + nmsed_masks, num_detections]) + self.assertAllClose(nmsed_boxes, exp_nms_corners) + self.assertAllClose(nmsed_scores, exp_nms_scores) + self.assertAllClose(nmsed_classes, exp_nms_classes) + self.assertAllClose(num_detections, [1, 1]) + self.assertAllClose(nmsed_masks, exp_nms_masks) + + def test_batch_multiclass_nms_with_additional_fields_and_num_valid_boxes( + self): + boxes = tf.constant([[[[0, 0, 1, 1], [0, 0, 4, 5]], + [[0, 0.1, 1, 1.1], [0, 0.1, 2, 1.1]], + [[0, -0.1, 1, 0.9], [0, -0.1, 1, 0.9]], + [[0, 10, 1, 11], [0, 10, 1, 11]]], + [[[0, 10.1, 1, 11.1], [0, 10.1, 1, 11.1]], + [[0, 100, 1, 101], [0, 100, 1, 101]], + [[0, 1000, 1, 1002], [0, 999, 2, 1004]], + [[0, 1000, 1, 1002.1], [0, 999, 2, 1002.7]]]], + tf.float32) + scores = tf.constant([[[.9, 0.01], [.75, 0.05], + [.6, 0.01], [.95, 0]], + [[.5, 0.01], [.3, 0.01], + [.01, .85], [.01, .5]]]) + additional_fields = { + 'keypoints': tf.constant( + [[[[6, 7], [8, 9]], + [[0, 1], [2, 3]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]]], + [[[13, 14], [15, 16]], + [[8, 9], [10, 11]], + [[10, 11], [12, 13]], + [[0, 0], [0, 0]]]], + tf.float32) + } + num_valid_boxes = tf.constant([1, 1], tf.int32) + score_thresh = 0.1 + iou_thresh = .5 + max_output_size = 4 + + exp_nms_corners = [[[0, 0, 1, 1], + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0]], + [[0, 10.1, 1, 11.1], + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0]]] + exp_nms_scores = [[.9, 0, 0, 0], + [.5, 0, 0, 0]] + exp_nms_classes = [[0, 0, 0, 0], + [0, 0, 0, 0]] + exp_nms_additional_fields = { + 'keypoints': np.array([[[[6, 7], [8, 9]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]]], + [[[13, 14], [15, 16]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]], + [[0, 0], [0, 0]]]]) + } + + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, + nmsed_additional_fields, num_detections + ) = post_processing.batch_multiclass_non_max_suppression( + boxes, scores, score_thresh, iou_thresh, + max_size_per_class=max_output_size, max_total_size=max_output_size, + num_valid_boxes=num_valid_boxes, + additional_fields=additional_fields) + + self.assertIsNone(nmsed_masks) + + with self.test_session() as sess: + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_additional_fields, + num_detections) = sess.run([nmsed_boxes, nmsed_scores, 
nmsed_classes, + nmsed_additional_fields, num_detections]) + + self.assertAllClose(nmsed_boxes, exp_nms_corners) + self.assertAllClose(nmsed_scores, exp_nms_scores) + self.assertAllClose(nmsed_classes, exp_nms_classes) + for key in exp_nms_additional_fields: + self.assertAllClose(nmsed_additional_fields[key], + exp_nms_additional_fields[key]) + self.assertAllClose(num_detections, [1, 1]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/prefetcher.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/prefetcher.py new file mode 100644 index 0000000000000000000000000000000000000000..e690c599fa74e024d9b7ec857628cdbfb0e3ee81 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/prefetcher.py @@ -0,0 +1,61 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Provides functions to prefetch tensors to feed into models.""" +import tensorflow as tf + + +def prefetch(tensor_dict, capacity): + """Creates a prefetch queue for tensors. + + Creates a FIFO queue to asynchronously enqueue tensor_dicts and returns a + dequeue op that evaluates to a tensor_dict. This function is useful in + prefetching preprocessed tensors so that the data is readily available for + consumers. + + Example input pipeline when you don't need batching: + ---------------------------------------------------- + key, string_tensor = slim.parallel_reader.parallel_read(...) + tensor_dict = decoder.decode(string_tensor) + tensor_dict = preprocessor.preprocess(tensor_dict, ...) + prefetch_queue = prefetcher.prefetch(tensor_dict, capacity=20) + tensor_dict = prefetch_queue.dequeue() + outputs = Model(tensor_dict) + ... + ---------------------------------------------------- + + For input pipelines with batching, refer to core/batcher.py + + Args: + tensor_dict: a dictionary of tensors to prefetch. + capacity: the size of the prefetch queue. + + Returns: + a FIFO prefetcher queue + """ + names = list(tensor_dict.keys()) + dtypes = [t.dtype for t in tensor_dict.values()] + shapes = [t.get_shape() for t in tensor_dict.values()] + prefetch_queue = tf.PaddingFIFOQueue(capacity, dtypes=dtypes, + shapes=shapes, + names=names, + name='prefetch_queue') + enqueue_op = prefetch_queue.enqueue(tensor_dict) + tf.train.queue_runner.add_queue_runner(tf.train.queue_runner.QueueRunner( + prefetch_queue, [enqueue_op])) + tf.summary.scalar('queue/%s/fraction_of_%d_full' % (prefetch_queue.name, + capacity), + tf.to_float(prefetch_queue.size()) * (1. 
/ capacity)) + return prefetch_queue diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/prefetcher_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/prefetcher_test.py new file mode 100644 index 0000000000000000000000000000000000000000..63f557e3318c25d02434bc1dd0763f1df35b18ac --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/prefetcher_test.py @@ -0,0 +1,101 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.core.prefetcher.""" +import tensorflow as tf + +from object_detection.core import prefetcher + +slim = tf.contrib.slim + + +class PrefetcherTest(tf.test.TestCase): + + def test_prefetch_tensors_with_fully_defined_shapes(self): + with self.test_session() as sess: + batch_size = 10 + image_size = 32 + num_batches = 5 + examples = tf.Variable(tf.constant(0, dtype=tf.int64)) + counter = examples.count_up_to(num_batches) + image = tf.random_normal([batch_size, image_size, + image_size, 3], + dtype=tf.float32, + name='images') + label = tf.random_uniform([batch_size, 1], 0, 10, + dtype=tf.int32, name='labels') + + prefetch_queue = prefetcher.prefetch(tensor_dict={'counter': counter, + 'image': image, + 'label': label}, + capacity=100) + tensor_dict = prefetch_queue.dequeue() + + self.assertAllEqual(tensor_dict['image'].get_shape().as_list(), + [batch_size, image_size, image_size, 3]) + self.assertAllEqual(tensor_dict['label'].get_shape().as_list(), + [batch_size, 1]) + + tf.initialize_all_variables().run() + with slim.queues.QueueRunners(sess): + for _ in range(num_batches): + results = sess.run(tensor_dict) + self.assertEquals(results['image'].shape, + (batch_size, image_size, image_size, 3)) + self.assertEquals(results['label'].shape, (batch_size, 1)) + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(tensor_dict) + + def test_prefetch_tensors_with_partially_defined_shapes(self): + with self.test_session() as sess: + batch_size = 10 + image_size = 32 + num_batches = 5 + examples = tf.Variable(tf.constant(0, dtype=tf.int64)) + counter = examples.count_up_to(num_batches) + image = tf.random_normal([batch_size, + tf.Variable(image_size), + tf.Variable(image_size), 3], + dtype=tf.float32, + name='image') + image.set_shape([batch_size, None, None, 3]) + label = tf.random_uniform([batch_size, tf.Variable(1)], 0, + 10, dtype=tf.int32, name='label') + label.set_shape([batch_size, None]) + + prefetch_queue = prefetcher.prefetch(tensor_dict={'counter': counter, + 'image': image, + 'label': label}, + capacity=100) + tensor_dict = prefetch_queue.dequeue() + + self.assertAllEqual(tensor_dict['image'].get_shape().as_list(), + [batch_size, None, None, 3]) + self.assertAllEqual(tensor_dict['label'].get_shape().as_list(), + [batch_size, None]) + + tf.initialize_all_variables().run() + with slim.queues.QueueRunners(sess): + for _ in 
range(num_batches): + results = sess.run(tensor_dict) + self.assertEquals(results['image'].shape, + (batch_size, image_size, image_size, 3)) + self.assertEquals(results['label'].shape, (batch_size, 1)) + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(tensor_dict) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/preprocessor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..0fcdfcc69c273c634a9c7183e159f912e099c6c1 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/preprocessor.py @@ -0,0 +1,3176 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Preprocess images and bounding boxes for detection. + +We perform two sets of operations in preprocessing stage: +(a) operations that are applied to both training and testing data, +(b) operations that are applied only to training data for the purpose of + data augmentation. + +A preprocessing function receives a set of inputs, +e.g. an image and bounding boxes, +performs an operation on them, and returns them. +Some examples are: randomly cropping the image, randomly mirroring the image, + randomly changing the brightness, contrast, hue and + randomly jittering the bounding boxes. + +The preprocess function receives a tensor_dict which is a dictionary that maps +different field names to their tensors. For example, +tensor_dict[fields.InputDataFields.image] holds the image tensor. +The image is a rank 4 tensor: [1, height, width, channels] with +dtype=tf.float32. The groundtruth_boxes is a rank 2 tensor: [N, 4] where +in each row there is a box with [ymin xmin ymax xmax]. +Boxes are in normalized coordinates meaning +their coordinate values range in [0, 1] + +To preprocess multiple images with the same operations in cases where +nondeterministic operations are used, a preprocessor_cache.PreprocessorCache +object can be passed into the preprocess function or individual operations. +All nondeterministic operations except random_jitter_boxes support caching. +E.g. +Let tensor_dict{1,2,3,4,5} be copies of the same inputs. +Let preprocess_options contain nondeterministic operation(s) excluding +random_jitter_boxes. 
+ +cache1 = preprocessor_cache.PreprocessorCache() +cache2 = preprocessor_cache.PreprocessorCache() +a = preprocess(tensor_dict1, preprocess_options, preprocess_vars_cache=cache1) +b = preprocess(tensor_dict2, preprocess_options, preprocess_vars_cache=cache1) +c = preprocess(tensor_dict3, preprocess_options, preprocess_vars_cache=cache2) +d = preprocess(tensor_dict4, preprocess_options, preprocess_vars_cache=cache2) +e = preprocess(tensor_dict5, preprocess_options) + +Then correspondings tensors of object pairs (a,b) and (c,d) +are guaranteed to be equal element-wise, but the equality of any other object +pair cannot be determined. + +Important Note: In tensor_dict, images is a rank 4 tensor, but preprocessing +functions receive a rank 3 tensor for processing the image. Thus, inside the +preprocess function we squeeze the image to become a rank 3 tensor and then +we pass it to the functions. At the end of the preprocess we expand the image +back to rank 4. +""" + +import functools +import inspect +import sys +import tensorflow as tf + +from tensorflow.python.ops import control_flow_ops + +from object_detection.core import box_list +from object_detection.core import box_list_ops +from object_detection.core import keypoint_ops +from object_detection.core import preprocessor_cache +from object_detection.core import standard_fields as fields +from object_detection.utils import shape_utils + + +def _apply_with_random_selector(x, + func, + num_cases, + preprocess_vars_cache=None, + key=''): + """Computes func(x, sel), with sel sampled from [0...num_cases-1]. + + If both preprocess_vars_cache AND key are the same between two calls, sel will + be the same value in both calls. + + Args: + x: input Tensor. + func: Python function to apply. + num_cases: Python int32, number of cases to sample sel from. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + key: variable identifier for preprocess_vars_cache. + + Returns: + The result of func(x, sel), where func receives the value of the + selector as a python integer, but sel is sampled dynamically. + """ + generator_func = functools.partial( + tf.random_uniform, [], maxval=num_cases, dtype=tf.int32) + rand_sel = _get_or_create_preprocess_rand_vars( + generator_func, preprocessor_cache.PreprocessorCache.SELECTOR, + preprocess_vars_cache, key) + + # Pass the real x only to one of the func calls. + return control_flow_ops.merge([func( + control_flow_ops.switch(x, tf.equal(rand_sel, case))[1], case) + for case in range(num_cases)])[0] + + +def _apply_with_random_selector_tuples(x, + func, + num_cases, + preprocess_vars_cache=None, + key=''): + """Computes func(x, sel), with sel sampled from [0...num_cases-1]. + + If both preprocess_vars_cache AND key are the same between two calls, sel will + be the same value in both calls. + + Args: + x: A tuple of input tensors. + func: Python function to apply. + num_cases: Python int32, number of cases to sample sel from. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + key: variable identifier for preprocess_vars_cache. + + Returns: + The result of func(x, sel), where func receives the value of the + selector as a python integer, but sel is sampled dynamically. 
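+
+  Example (illustrative sketch; `image` and `boxes` stand for arbitrary input
+  tensors, and the identity lambdas stand in for real per-case transforms):
+
+    image, boxes = _apply_with_random_selector_tuples(
+        (image, boxes),
+        lambda x, sel: (tf.identity(x[0]), tf.identity(x[1])),
+        num_cases=2)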
+ """ + num_inputs = len(x) + generator_func = functools.partial( + tf.random_uniform, [], maxval=num_cases, dtype=tf.int32) + rand_sel = _get_or_create_preprocess_rand_vars( + generator_func, preprocessor_cache.PreprocessorCache.SELECTOR_TUPLES, + preprocess_vars_cache, key) + + # Pass the real x only to one of the func calls. + tuples = [list() for t in x] + for case in range(num_cases): + new_x = [control_flow_ops.switch(t, tf.equal(rand_sel, case))[1] for t in x] + output = func(tuple(new_x), case) + for j in range(num_inputs): + tuples[j].append(output[j]) + + for i in range(num_inputs): + tuples[i] = control_flow_ops.merge(tuples[i])[0] + return tuple(tuples) + + +def _get_or_create_preprocess_rand_vars(generator_func, + function_id, + preprocess_vars_cache, + key=''): + """Returns a tensor stored in preprocess_vars_cache or using generator_func. + + If the tensor was previously generated and appears in the PreprocessorCache, + the previously generated tensor will be returned. Otherwise, a new tensor + is generated using generator_func and stored in the cache. + + Args: + generator_func: A 0-argument function that generates a tensor. + function_id: identifier for the preprocessing function used. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + key: identifier for the variable stored. + Returns: + The generated tensor. + """ + if preprocess_vars_cache is not None: + var = preprocess_vars_cache.get(function_id, key) + if var is None: + var = generator_func() + preprocess_vars_cache.update(function_id, key, var) + else: + var = generator_func() + return var + + +def _random_integer(minval, maxval, seed): + """Returns a random 0-D tensor between minval and maxval. + + Args: + minval: minimum value of the random tensor. + maxval: maximum value of the random tensor. + seed: random seed. + + Returns: + A random 0-D tensor between minval and maxval. + """ + return tf.random_uniform( + [], minval=minval, maxval=maxval, dtype=tf.int32, seed=seed) + + +# TODO(mttang): This method is needed because the current +# tf.image.rgb_to_grayscale method does not support quantization. Replace with +# tf.image.rgb_to_grayscale after quantization support is added. +def _rgb_to_grayscale(images, name=None): + """Converts one or more images from RGB to Grayscale. + + Outputs a tensor of the same `DType` and rank as `images`. The size of the + last dimension of the output is 1, containing the Grayscale value of the + pixels. + + Args: + images: The RGB tensor to convert. Last dimension must have size 3 and + should contain RGB values. + name: A name for the operation (optional). + + Returns: + The converted grayscale image(s). + """ + with tf.name_scope(name, 'rgb_to_grayscale', [images]) as name: + images = tf.convert_to_tensor(images, name='images') + # Remember original dtype to so we can convert back if needed + orig_dtype = images.dtype + flt_image = tf.image.convert_image_dtype(images, tf.float32) + + # Reference for converting between RGB and grayscale. 
+ # https://en.wikipedia.org/wiki/Luma_%28video%29 + rgb_weights = [0.2989, 0.5870, 0.1140] + rank_1 = tf.expand_dims(tf.rank(images) - 1, 0) + gray_float = tf.reduce_sum( + flt_image * rgb_weights, rank_1, keep_dims=True) + gray_float.set_shape(images.get_shape()[:-1].concatenate([1])) + return tf.image.convert_image_dtype(gray_float, orig_dtype, name=name) + + +def normalize_image(image, original_minval, original_maxval, target_minval, + target_maxval): + """Normalizes pixel values in the image. + + Moves the pixel values from the current [original_minval, original_maxval] + range to a the [target_minval, target_maxval] range. + + Args: + image: rank 3 float32 tensor containing 1 + image -> [height, width, channels]. + original_minval: current image minimum value. + original_maxval: current image maximum value. + target_minval: target image minimum value. + target_maxval: target image maximum value. + + Returns: + image: image which is the same shape as input image. + """ + with tf.name_scope('NormalizeImage', values=[image]): + original_minval = float(original_minval) + original_maxval = float(original_maxval) + target_minval = float(target_minval) + target_maxval = float(target_maxval) + image = tf.to_float(image) + image = tf.subtract(image, original_minval) + image = tf.multiply(image, (target_maxval - target_minval) / + (original_maxval - original_minval)) + image = tf.add(image, target_minval) + return image + + +def retain_boxes_above_threshold(boxes, + labels, + label_scores, + multiclass_scores=None, + masks=None, + keypoints=None, + threshold=0.0): + """Retains boxes whose label score is above a given threshold. + + If the label score for a box is missing (represented by NaN), the box is + retained. The boxes that don't pass the threshold will not appear in the + returned tensor. + + Args: + boxes: float32 tensor of shape [num_instance, 4] representing boxes + location in normalized coordinates. + labels: rank 1 int32 tensor of shape [num_instance] containing the object + classes. + label_scores: float32 tensor of shape [num_instance] representing the + score for each box. + multiclass_scores: (optional) float32 tensor of shape + [num_instances, num_classes] representing the score for each box for each + class. + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. The masks are of + the same height, width as the input `image`. + keypoints: (optional) rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2]. The keypoints are in y-x normalized + coordinates. + threshold: scalar python float. 
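+
+  Example (illustrative sketch; assumes `boxes`, `labels` and `scores` are
+  tensors of shapes [N, 4], [N] and [N], respectively):
+
+    boxes, labels, scores = retain_boxes_above_threshold(
+        boxes, labels, scores, threshold=0.5)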
+ + Returns: + retained_boxes: [num_retained_instance, 4] + retianed_labels: [num_retained_instance] + retained_label_scores: [num_retained_instance] + + If multiclass_scores, masks, or keypoints are not None, the function also + returns: + + retained_multiclass_scores: [num_retained_instance, num_classes] + retained_masks: [num_retained_instance, height, width] + retained_keypoints: [num_retained_instance, num_keypoints, 2] + """ + with tf.name_scope('RetainBoxesAboveThreshold', + values=[boxes, labels, label_scores]): + indices = tf.where( + tf.logical_or(label_scores > threshold, tf.is_nan(label_scores))) + indices = tf.squeeze(indices, axis=1) + retained_boxes = tf.gather(boxes, indices) + retained_labels = tf.gather(labels, indices) + retained_label_scores = tf.gather(label_scores, indices) + result = [retained_boxes, retained_labels, retained_label_scores] + + if multiclass_scores is not None: + retained_multiclass_scores = tf.gather(multiclass_scores, indices) + result.append(retained_multiclass_scores) + + if masks is not None: + retained_masks = tf.gather(masks, indices) + result.append(retained_masks) + + if keypoints is not None: + retained_keypoints = tf.gather(keypoints, indices) + result.append(retained_keypoints) + + return result + + +def _flip_boxes_left_right(boxes): + """Left-right flip the boxes. + + Args: + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + + Returns: + Flipped boxes. + """ + ymin, xmin, ymax, xmax = tf.split(value=boxes, num_or_size_splits=4, axis=1) + flipped_xmin = tf.subtract(1.0, xmax) + flipped_xmax = tf.subtract(1.0, xmin) + flipped_boxes = tf.concat([ymin, flipped_xmin, ymax, flipped_xmax], 1) + return flipped_boxes + + +def _flip_boxes_up_down(boxes): + """Up-down flip the boxes. + + Args: + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + + Returns: + Flipped boxes. + """ + ymin, xmin, ymax, xmax = tf.split(value=boxes, num_or_size_splits=4, axis=1) + flipped_ymin = tf.subtract(1.0, ymax) + flipped_ymax = tf.subtract(1.0, ymin) + flipped_boxes = tf.concat([flipped_ymin, xmin, flipped_ymax, xmax], 1) + return flipped_boxes + + +def _rot90_boxes(boxes): + """Rotate boxes counter-clockwise by 90 degrees. + + Args: + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + + Returns: + Rotated boxes. + """ + ymin, xmin, ymax, xmax = tf.split(value=boxes, num_or_size_splits=4, axis=1) + rotated_ymin = tf.subtract(1.0, xmax) + rotated_ymax = tf.subtract(1.0, xmin) + rotated_xmin = ymin + rotated_xmax = ymax + rotated_boxes = tf.concat( + [rotated_ymin, rotated_xmin, rotated_ymax, rotated_xmax], 1) + return rotated_boxes + + +def _flip_masks_left_right(masks): + """Left-right flip masks. + + Args: + masks: rank 3 float32 tensor with shape + [num_instances, height, width] representing instance masks. + + Returns: + flipped masks: rank 3 float32 tensor with shape + [num_instances, height, width] representing instance masks. + """ + return masks[:, :, ::-1] + + +def _flip_masks_up_down(masks): + """Up-down flip masks. 
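+
+  This is equivalent to reversing the height axis of each mask, i.e.
+  masks[:, ::-1, :].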
+ + Args: + masks: rank 3 float32 tensor with shape + [num_instances, height, width] representing instance masks. + + Returns: + flipped masks: rank 3 float32 tensor with shape + [num_instances, height, width] representing instance masks. + """ + return masks[:, ::-1, :] + + +def _rot90_masks(masks): + """Rotate masks counter-clockwise by 90 degrees. + + Args: + masks: rank 3 float32 tensor with shape + [num_instances, height, width] representing instance masks. + + Returns: + rotated masks: rank 3 float32 tensor with shape + [num_instances, height, width] representing instance masks. + """ + masks = tf.transpose(masks, [0, 2, 1]) + return masks[:, ::-1, :] + + +def random_horizontal_flip(image, + boxes=None, + masks=None, + keypoints=None, + keypoint_flip_permutation=None, + seed=None, + preprocess_vars_cache=None): + """Randomly flips the image and detections horizontally. + + The probability of flipping the image is 50%. + + Args: + image: rank 3 float32 tensor with shape [height, width, channels]. + boxes: (optional) rank 2 float32 tensor with shape [N, 4] + containing the bounding boxes. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. The masks + are of the same height, width as the input `image`. + keypoints: (optional) rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2]. The keypoints are in y-x + normalized coordinates. + keypoint_flip_permutation: rank 1 int32 tensor containing the keypoint flip + permutation. + seed: random seed + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same shape as input image. + + If boxes, masks, keypoints, and keypoint_flip_permutation are not None, + the function also returns the following tensors. + + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + masks: rank 3 float32 tensor with shape [num_instances, height, width] + containing instance masks. + keypoints: rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2] + + Raises: + ValueError: if keypoints are provided but keypoint_flip_permutation is not. 
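+
+  Example (illustrative sketch; assumes `image` is a [height, width, 3]
+  float32 tensor and `boxes` a [N, 4] tensor of normalized box coordinates):
+
+    image, boxes = random_horizontal_flip(image, boxes=boxes)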
+ """ + + def _flip_image(image): + # flip image + image_flipped = tf.image.flip_left_right(image) + return image_flipped + + if keypoints is not None and keypoint_flip_permutation is None: + raise ValueError( + 'keypoints are provided but keypoints_flip_permutation is not provided') + + with tf.name_scope('RandomHorizontalFlip', values=[image, boxes]): + result = [] + # random variable defining whether to do flip or not + generator_func = functools.partial(tf.random_uniform, [], seed=seed) + do_a_flip_random = _get_or_create_preprocess_rand_vars( + generator_func, + preprocessor_cache.PreprocessorCache.HORIZONTAL_FLIP, + preprocess_vars_cache) + do_a_flip_random = tf.greater(do_a_flip_random, 0.5) + + # flip image + image = tf.cond(do_a_flip_random, lambda: _flip_image(image), lambda: image) + result.append(image) + + # flip boxes + if boxes is not None: + boxes = tf.cond(do_a_flip_random, lambda: _flip_boxes_left_right(boxes), + lambda: boxes) + result.append(boxes) + + # flip masks + if masks is not None: + masks = tf.cond(do_a_flip_random, lambda: _flip_masks_left_right(masks), + lambda: masks) + result.append(masks) + + # flip keypoints + if keypoints is not None and keypoint_flip_permutation is not None: + permutation = keypoint_flip_permutation + keypoints = tf.cond( + do_a_flip_random, + lambda: keypoint_ops.flip_horizontal(keypoints, 0.5, permutation), + lambda: keypoints) + result.append(keypoints) + + return tuple(result) + + +def random_vertical_flip(image, + boxes=None, + masks=None, + keypoints=None, + keypoint_flip_permutation=None, + seed=None, + preprocess_vars_cache=None): + """Randomly flips the image and detections vertically. + + The probability of flipping the image is 50%. + + Args: + image: rank 3 float32 tensor with shape [height, width, channels]. + boxes: (optional) rank 2 float32 tensor with shape [N, 4] + containing the bounding boxes. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. The masks + are of the same height, width as the input `image`. + keypoints: (optional) rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2]. The keypoints are in y-x + normalized coordinates. + keypoint_flip_permutation: rank 1 int32 tensor containing the keypoint flip + permutation. + seed: random seed + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same shape as input image. + + If boxes, masks, keypoints, and keypoint_flip_permutation are not None, + the function also returns the following tensors. + + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + masks: rank 3 float32 tensor with shape [num_instances, height, width] + containing instance masks. + keypoints: rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2] + + Raises: + ValueError: if keypoints are provided but keypoint_flip_permutation is not. 
+ """ + + def _flip_image(image): + # flip image + image_flipped = tf.image.flip_up_down(image) + return image_flipped + + if keypoints is not None and keypoint_flip_permutation is None: + raise ValueError( + 'keypoints are provided but keypoints_flip_permutation is not provided') + + with tf.name_scope('RandomVerticalFlip', values=[image, boxes]): + result = [] + # random variable defining whether to do flip or not + generator_func = functools.partial(tf.random_uniform, [], seed=seed) + do_a_flip_random = _get_or_create_preprocess_rand_vars( + generator_func, preprocessor_cache.PreprocessorCache.VERTICAL_FLIP, + preprocess_vars_cache) + do_a_flip_random = tf.greater(do_a_flip_random, 0.5) + + # flip image + image = tf.cond(do_a_flip_random, lambda: _flip_image(image), lambda: image) + result.append(image) + + # flip boxes + if boxes is not None: + boxes = tf.cond(do_a_flip_random, lambda: _flip_boxes_up_down(boxes), + lambda: boxes) + result.append(boxes) + + # flip masks + if masks is not None: + masks = tf.cond(do_a_flip_random, lambda: _flip_masks_up_down(masks), + lambda: masks) + result.append(masks) + + # flip keypoints + if keypoints is not None and keypoint_flip_permutation is not None: + permutation = keypoint_flip_permutation + keypoints = tf.cond( + do_a_flip_random, + lambda: keypoint_ops.flip_vertical(keypoints, 0.5, permutation), + lambda: keypoints) + result.append(keypoints) + + return tuple(result) + + +def random_rotation90(image, + boxes=None, + masks=None, + keypoints=None, + seed=None, + preprocess_vars_cache=None): + """Randomly rotates the image and detections 90 degrees counter-clockwise. + + The probability of rotating the image is 50%. This can be combined with + random_horizontal_flip and random_vertical_flip to produce an output with a + uniform distribution of the eight possible 90 degree rotation / reflection + combinations. + + Args: + image: rank 3 float32 tensor with shape [height, width, channels]. + boxes: (optional) rank 2 float32 tensor with shape [N, 4] + containing the bounding boxes. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. The masks + are of the same height, width as the input `image`. + keypoints: (optional) rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2]. The keypoints are in y-x + normalized coordinates. + seed: random seed + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same shape as input image. + + If boxes, masks, and keypoints, are not None, + the function also returns the following tensors. + + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + masks: rank 3 float32 tensor with shape [num_instances, height, width] + containing instance masks. 
+ keypoints: rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2] + """ + + def _rot90_image(image): + # flip image + image_rotated = tf.image.rot90(image) + return image_rotated + + with tf.name_scope('RandomRotation90', values=[image, boxes]): + result = [] + + # random variable defining whether to rotate by 90 degrees or not + generator_func = functools.partial(tf.random_uniform, [], seed=seed) + do_a_rot90_random = _get_or_create_preprocess_rand_vars( + generator_func, preprocessor_cache.PreprocessorCache.ROTATION90, + preprocess_vars_cache) + do_a_rot90_random = tf.greater(do_a_rot90_random, 0.5) + + # flip image + image = tf.cond(do_a_rot90_random, lambda: _rot90_image(image), + lambda: image) + result.append(image) + + # flip boxes + if boxes is not None: + boxes = tf.cond(do_a_rot90_random, lambda: _rot90_boxes(boxes), + lambda: boxes) + result.append(boxes) + + # flip masks + if masks is not None: + masks = tf.cond(do_a_rot90_random, lambda: _rot90_masks(masks), + lambda: masks) + result.append(masks) + + # flip keypoints + if keypoints is not None: + keypoints = tf.cond( + do_a_rot90_random, + lambda: keypoint_ops.rot90(keypoints), + lambda: keypoints) + result.append(keypoints) + + return tuple(result) + + +def random_pixel_value_scale(image, + minval=0.9, + maxval=1.1, + seed=None, + preprocess_vars_cache=None): + """Scales each value in the pixels of the image. + + This function scales each pixel independent of the other ones. + For each value in image tensor, draws a random number between + minval and maxval and multiples the values with them. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 255]. + minval: lower ratio of scaling pixel values. + maxval: upper ratio of scaling pixel values. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same shape as input image. + """ + with tf.name_scope('RandomPixelValueScale', values=[image]): + generator_func = functools.partial( + tf.random_uniform, tf.shape(image), + minval=minval, maxval=maxval, + dtype=tf.float32, seed=seed) + color_coef = _get_or_create_preprocess_rand_vars( + generator_func, + preprocessor_cache.PreprocessorCache.PIXEL_VALUE_SCALE, + preprocess_vars_cache) + + image = tf.multiply(image, color_coef) + image = tf.clip_by_value(image, 0.0, 255.0) + + return image + + +def random_image_scale(image, + masks=None, + min_scale_ratio=0.5, + max_scale_ratio=2.0, + seed=None, + preprocess_vars_cache=None): + """Scales the image size. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels]. + masks: (optional) rank 3 float32 tensor containing masks with + size [height, width, num_masks]. The value is set to None if there are no + masks. + min_scale_ratio: minimum scaling ratio. + max_scale_ratio: maximum scaling ratio. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same rank as input image. + masks: If masks is not none, resized masks which are the same rank as input + masks will be returned. 
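+
+  Example (illustrative sketch with no masks; note the result is returned as
+  a tuple even when it only contains the rescaled image):
+
+    (image,) = random_image_scale(image,
+                                  min_scale_ratio=0.8,
+                                  max_scale_ratio=1.2)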
+ """ + with tf.name_scope('RandomImageScale', values=[image]): + result = [] + image_shape = tf.shape(image) + image_height = image_shape[0] + image_width = image_shape[1] + generator_func = functools.partial( + tf.random_uniform, [], + minval=min_scale_ratio, maxval=max_scale_ratio, + dtype=tf.float32, seed=seed) + size_coef = _get_or_create_preprocess_rand_vars( + generator_func, preprocessor_cache.PreprocessorCache.IMAGE_SCALE, + preprocess_vars_cache) + + image_newysize = tf.to_int32( + tf.multiply(tf.to_float(image_height), size_coef)) + image_newxsize = tf.to_int32( + tf.multiply(tf.to_float(image_width), size_coef)) + image = tf.image.resize_images( + image, [image_newysize, image_newxsize], align_corners=True) + result.append(image) + if masks: + masks = tf.image.resize_nearest_neighbor( + masks, [image_newysize, image_newxsize], align_corners=True) + result.append(masks) + return tuple(result) + + +def random_rgb_to_gray(image, + probability=0.1, + seed=None, + preprocess_vars_cache=None): + """Changes the image from RGB to Grayscale with the given probability. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 255]. + probability: the probability of returning a grayscale image. + The probability should be a number between [0, 1]. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same shape as input image. + """ + def _image_to_gray(image): + image_gray1 = _rgb_to_grayscale(image) + image_gray3 = tf.image.grayscale_to_rgb(image_gray1) + return image_gray3 + + with tf.name_scope('RandomRGBtoGray', values=[image]): + # random variable defining whether to change to grayscale or not + generator_func = functools.partial(tf.random_uniform, [], seed=seed) + do_gray_random = _get_or_create_preprocess_rand_vars( + generator_func, preprocessor_cache.PreprocessorCache.RGB_TO_GRAY, + preprocess_vars_cache) + + image = tf.cond( + tf.greater(do_gray_random, probability), lambda: image, + lambda: _image_to_gray(image)) + + return image + + +def random_adjust_brightness(image, + max_delta=0.2, + seed=None, + preprocess_vars_cache=None): + """Randomly adjusts brightness. + + Makes sure the output image is still between 0 and 255. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 255]. + max_delta: how much to change the brightness. A value between [0, 1). + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same shape as input image. + boxes: boxes which is the same shape as input boxes. 
+ """ + with tf.name_scope('RandomAdjustBrightness', values=[image]): + generator_func = functools.partial(tf.random_uniform, [], + -max_delta, max_delta, seed=seed) + delta = _get_or_create_preprocess_rand_vars( + generator_func, + preprocessor_cache.PreprocessorCache.ADJUST_BRIGHTNESS, + preprocess_vars_cache) + + image = tf.image.adjust_brightness(image / 255, delta) * 255 + image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=255.0) + return image + + +def random_adjust_contrast(image, + min_delta=0.8, + max_delta=1.25, + seed=None, + preprocess_vars_cache=None): + """Randomly adjusts contrast. + + Makes sure the output image is still between 0 and 255. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 255]. + min_delta: see max_delta. + max_delta: how much to change the contrast. Contrast will change with a + value between min_delta and max_delta. This value will be + multiplied to the current contrast of the image. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same shape as input image. + """ + with tf.name_scope('RandomAdjustContrast', values=[image]): + generator_func = functools.partial(tf.random_uniform, [], + min_delta, max_delta, seed=seed) + contrast_factor = _get_or_create_preprocess_rand_vars( + generator_func, + preprocessor_cache.PreprocessorCache.ADJUST_CONTRAST, + preprocess_vars_cache) + image = tf.image.adjust_contrast(image / 255, contrast_factor) * 255 + image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=255.0) + return image + + +def random_adjust_hue(image, + max_delta=0.02, + seed=None, + preprocess_vars_cache=None): + """Randomly adjusts hue. + + Makes sure the output image is still between 0 and 255. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 255]. + max_delta: change hue randomly with a value between 0 and max_delta. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same shape as input image. + """ + with tf.name_scope('RandomAdjustHue', values=[image]): + generator_func = functools.partial(tf.random_uniform, [], + -max_delta, max_delta, seed=seed) + delta = _get_or_create_preprocess_rand_vars( + generator_func, preprocessor_cache.PreprocessorCache.ADJUST_HUE, + preprocess_vars_cache) + image = tf.image.adjust_hue(image / 255, delta) * 255 + image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=255.0) + return image + + +def random_adjust_saturation(image, + min_delta=0.8, + max_delta=1.25, + seed=None, + preprocess_vars_cache=None): + """Randomly adjusts saturation. + + Makes sure the output image is still between 0 and 255. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 255]. + min_delta: see max_delta. + max_delta: how much to change the saturation. Saturation will change with a + value between min_delta and max_delta. This value will be + multiplied to the current saturation of the image. 
+ seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same shape as input image. + """ + with tf.name_scope('RandomAdjustSaturation', values=[image]): + generator_func = functools.partial(tf.random_uniform, [], + min_delta, max_delta, seed=seed) + saturation_factor = _get_or_create_preprocess_rand_vars( + generator_func, + preprocessor_cache.PreprocessorCache.ADJUST_SATURATION, + preprocess_vars_cache) + image = tf.image.adjust_saturation(image / 255, saturation_factor) * 255 + image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=255.0) + return image + + +def random_distort_color(image, color_ordering=0, preprocess_vars_cache=None): + """Randomly distorts color. + + Randomly distorts color using a combination of brightness, hue, contrast and + saturation changes. Makes sure the output image is still between 0 and 255. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 255]. + color_ordering: Python int, a type of distortion (valid values: 0, 1). + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same shape as input image. + + Raises: + ValueError: if color_ordering is not in {0, 1}. + """ + with tf.name_scope('RandomDistortColor', values=[image]): + if color_ordering == 0: + image = random_adjust_brightness( + image, max_delta=32. / 255., + preprocess_vars_cache=preprocess_vars_cache) + image = random_adjust_saturation( + image, min_delta=0.5, max_delta=1.5, + preprocess_vars_cache=preprocess_vars_cache) + image = random_adjust_hue( + image, max_delta=0.2, + preprocess_vars_cache=preprocess_vars_cache) + image = random_adjust_contrast( + image, min_delta=0.5, max_delta=1.5, + preprocess_vars_cache=preprocess_vars_cache) + + elif color_ordering == 1: + image = random_adjust_brightness( + image, max_delta=32. / 255., + preprocess_vars_cache=preprocess_vars_cache) + image = random_adjust_contrast( + image, min_delta=0.5, max_delta=1.5, + preprocess_vars_cache=preprocess_vars_cache) + image = random_adjust_saturation( + image, min_delta=0.5, max_delta=1.5, + preprocess_vars_cache=preprocess_vars_cache) + image = random_adjust_hue( + image, max_delta=0.2, + preprocess_vars_cache=preprocess_vars_cache) + else: + raise ValueError('color_ordering must be in {0, 1}') + return image + + +def random_jitter_boxes(boxes, ratio=0.05, seed=None): + """Randomly jitter boxes in image. + + Args: + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + ratio: The ratio of the box width and height that the corners can jitter. + For example if the width is 100 pixels and ratio is 0.05, + the corners can jitter up to 5 pixels in the x direction. + seed: random seed. + + Returns: + boxes: boxes which is the same shape as input boxes. + """ + def random_jitter_box(box, ratio, seed): + """Randomly jitter box. + + Args: + box: bounding box [1, 1, 4]. 
+ ratio: max ratio between jittered box and original box, + a number between [0, 0.5]. + seed: random seed. + + Returns: + jittered_box: jittered box. + """ + rand_numbers = tf.random_uniform( + [1, 1, 4], minval=-ratio, maxval=ratio, dtype=tf.float32, seed=seed) + box_width = tf.subtract(box[0, 0, 3], box[0, 0, 1]) + box_height = tf.subtract(box[0, 0, 2], box[0, 0, 0]) + hw_coefs = tf.stack([box_height, box_width, box_height, box_width]) + hw_rand_coefs = tf.multiply(hw_coefs, rand_numbers) + jittered_box = tf.add(box, hw_rand_coefs) + jittered_box = tf.clip_by_value(jittered_box, 0.0, 1.0) + return jittered_box + + with tf.name_scope('RandomJitterBoxes', values=[boxes]): + # boxes are [N, 4]. Lets first make them [N, 1, 1, 4] + boxes_shape = tf.shape(boxes) + boxes = tf.expand_dims(boxes, 1) + boxes = tf.expand_dims(boxes, 2) + + distorted_boxes = tf.map_fn( + lambda x: random_jitter_box(x, ratio, seed), boxes, dtype=tf.float32) + + distorted_boxes = tf.reshape(distorted_boxes, boxes_shape) + + return distorted_boxes + + +def _strict_random_crop_image(image, + boxes, + labels, + label_scores=None, + multiclass_scores=None, + masks=None, + keypoints=None, + min_object_covered=1.0, + aspect_ratio_range=(0.75, 1.33), + area_range=(0.1, 1.0), + overlap_thresh=0.3, + preprocess_vars_cache=None): + """Performs random crop. + + Note: boxes will be clipped to the crop. Keypoint coordinates that are + outside the crop will be set to NaN, which is consistent with the original + keypoint encoding for non-existing keypoints. This function always crops + the image and is supposed to be used by `random_crop_image` function which + sometimes returns image unchanged. + + Args: + image: rank 3 float32 tensor containing 1 image -> [height, width, channels] + with pixel values varying between [0, 1]. + boxes: rank 2 float32 tensor containing the bounding boxes with shape + [num_instances, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + labels: rank 1 int32 tensor containing the object classes. + label_scores: (optional) float32 tensor of shape [num_instances] + representing the score for each box. + multiclass_scores: (optional) float32 tensor of shape + [num_instances, num_classes] representing the score for each box for each + class. + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. The masks + are of the same height, width as the input `image`. + keypoints: (optional) rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2]. The keypoints are in y-x + normalized coordinates. + min_object_covered: the cropped image must cover at least this fraction of + at least one of the input bounding boxes. + aspect_ratio_range: allowed range for aspect ratio of cropped image. + area_range: allowed range for area ratio between cropped image and the + original image. + overlap_thresh: minimum overlap thresh with new cropped + image to keep the box. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same rank as input image. + boxes: boxes which is the same rank as input boxes. + Boxes are in normalized form. + labels: new labels. 
+ + If label_scores, multiclass_scores, masks, or keypoints is not None, the + function also returns: + label_scores: rank 1 float32 tensor with shape [num_instances]. + multiclass_scores: rank 2 float32 tensor with shape + [num_instances, num_classes] + masks: rank 3 float32 tensor with shape [num_instances, height, width] + containing instance masks. + keypoints: rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2] + """ + with tf.name_scope('RandomCropImage', values=[image, boxes]): + image_shape = tf.shape(image) + + # boxes are [N, 4]. Lets first make them [N, 1, 4]. + boxes_expanded = tf.expand_dims( + tf.clip_by_value( + boxes, clip_value_min=0.0, clip_value_max=1.0), 1) + + generator_func = functools.partial( + tf.image.sample_distorted_bounding_box, + image_shape, + bounding_boxes=boxes_expanded, + min_object_covered=min_object_covered, + aspect_ratio_range=aspect_ratio_range, + area_range=area_range, + max_attempts=100, + use_image_if_no_bounding_boxes=True) + + # for ssd cropping, each value of min_object_covered has its own + # cached random variable + sample_distorted_bounding_box = _get_or_create_preprocess_rand_vars( + generator_func, + preprocessor_cache.PreprocessorCache.STRICT_CROP_IMAGE, + preprocess_vars_cache, key=min_object_covered) + + im_box_begin, im_box_size, im_box = sample_distorted_bounding_box + + new_image = tf.slice(image, im_box_begin, im_box_size) + new_image.set_shape([None, None, image.get_shape()[2]]) + + # [1, 4] + im_box_rank2 = tf.squeeze(im_box, squeeze_dims=[0]) + # [4] + im_box_rank1 = tf.squeeze(im_box) + + boxlist = box_list.BoxList(boxes) + boxlist.add_field('labels', labels) + + if label_scores is not None: + boxlist.add_field('label_scores', label_scores) + + if multiclass_scores is not None: + boxlist.add_field('multiclass_scores', multiclass_scores) + + im_boxlist = box_list.BoxList(im_box_rank2) + + # remove boxes that are outside cropped image + boxlist, inside_window_ids = box_list_ops.prune_completely_outside_window( + boxlist, im_box_rank1) + + # remove boxes that are outside image + overlapping_boxlist, keep_ids = box_list_ops.prune_non_overlapping_boxes( + boxlist, im_boxlist, overlap_thresh) + + # change the coordinate of the remaining boxes + new_labels = overlapping_boxlist.get_field('labels') + new_boxlist = box_list_ops.change_coordinate_frame(overlapping_boxlist, + im_box_rank1) + new_boxes = new_boxlist.get() + new_boxes = tf.clip_by_value( + new_boxes, clip_value_min=0.0, clip_value_max=1.0) + + result = [new_image, new_boxes, new_labels] + + if label_scores is not None: + new_label_scores = overlapping_boxlist.get_field('label_scores') + result.append(new_label_scores) + + if multiclass_scores is not None: + new_multiclass_scores = overlapping_boxlist.get_field('multiclass_scores') + result.append(new_multiclass_scores) + + if masks is not None: + masks_of_boxes_inside_window = tf.gather(masks, inside_window_ids) + masks_of_boxes_completely_inside_window = tf.gather( + masks_of_boxes_inside_window, keep_ids) + masks_box_begin = [0, im_box_begin[0], im_box_begin[1]] + masks_box_size = [-1, im_box_size[0], im_box_size[1]] + new_masks = tf.slice( + masks_of_boxes_completely_inside_window, + masks_box_begin, masks_box_size) + result.append(new_masks) + + if keypoints is not None: + keypoints_of_boxes_inside_window = tf.gather(keypoints, inside_window_ids) + keypoints_of_boxes_completely_inside_window = tf.gather( + keypoints_of_boxes_inside_window, keep_ids) + new_keypoints = keypoint_ops.change_coordinate_frame( 
+ keypoints_of_boxes_completely_inside_window, im_box_rank1) + new_keypoints = keypoint_ops.prune_outside_window(new_keypoints, + [0.0, 0.0, 1.0, 1.0]) + result.append(new_keypoints) + + return tuple(result) + + +def random_crop_image(image, + boxes, + labels, + label_scores=None, + multiclass_scores=None, + masks=None, + keypoints=None, + min_object_covered=1.0, + aspect_ratio_range=(0.75, 1.33), + area_range=(0.1, 1.0), + overlap_thresh=0.3, + random_coef=0.0, + seed=None, + preprocess_vars_cache=None): + """Randomly crops the image. + + Given the input image and its bounding boxes, this op randomly + crops a subimage. Given a user-provided set of input constraints, + the crop window is resampled until it satisfies these constraints. + If within 100 trials it is unable to find a valid crop, the original + image is returned. See the Args section for a description of the input + constraints. Both input boxes and returned Boxes are in normalized + form (e.g., lie in the unit square [0, 1]). + This function will return the original image with probability random_coef. + + Note: boxes will be clipped to the crop. Keypoint coordinates that are + outside the crop will be set to NaN, which is consistent with the original + keypoint encoding for non-existing keypoints. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 1]. + boxes: rank 2 float32 tensor containing the bounding boxes with shape + [num_instances, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + labels: rank 1 int32 tensor containing the object classes. + label_scores: (optional) float32 tensor of shape [num_instances]. + representing the score for each box. + multiclass_scores: (optional) float32 tensor of shape + [num_instances, num_classes] representing the score for each box for each + class. + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. The masks + are of the same height, width as the input `image`. + keypoints: (optional) rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2]. The keypoints are in y-x + normalized coordinates. + min_object_covered: the cropped image must cover at least this fraction of + at least one of the input bounding boxes. + aspect_ratio_range: allowed range for aspect ratio of cropped image. + area_range: allowed range for area ratio between cropped image and the + original image. + overlap_thresh: minimum overlap thresh with new cropped + image to keep the box. + random_coef: a random coefficient that defines the chance of getting the + original image. If random_coef is 0, we will always get the + cropped image, and if it is 1.0, we will always get the + original image. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: Image shape will be [new_height, new_width, channels]. + boxes: boxes which is the same rank as input boxes. Boxes are in normalized + form. + labels: new labels. + + If label_scores, multiclass_scores, masks, or keypoints is not None, the + function also returns: + label_scores: rank 1 float32 tensor with shape [num_instances]. 
+ multiclass_scores: rank 2 float32 tensor with shape + [num_instances, num_classes] + masks: rank 3 float32 tensor with shape [num_instances, height, width] + containing instance masks. + keypoints: rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2] + """ + + def strict_random_crop_image_fn(): + return _strict_random_crop_image( + image, + boxes, + labels, + label_scores=label_scores, + multiclass_scores=multiclass_scores, + masks=masks, + keypoints=keypoints, + min_object_covered=min_object_covered, + aspect_ratio_range=aspect_ratio_range, + area_range=area_range, + overlap_thresh=overlap_thresh, + preprocess_vars_cache=preprocess_vars_cache) + + # avoids tf.cond to make faster RCNN training on borg. See b/140057645. + if random_coef < sys.float_info.min: + result = strict_random_crop_image_fn() + else: + generator_func = functools.partial(tf.random_uniform, [], seed=seed) + do_a_crop_random = _get_or_create_preprocess_rand_vars( + generator_func, preprocessor_cache.PreprocessorCache.CROP_IMAGE, + preprocess_vars_cache) + do_a_crop_random = tf.greater(do_a_crop_random, random_coef) + + outputs = [image, boxes, labels] + + if label_scores is not None: + outputs.append(label_scores) + if multiclass_scores is not None: + outputs.append(multiclass_scores) + if masks is not None: + outputs.append(masks) + if keypoints is not None: + outputs.append(keypoints) + + result = tf.cond(do_a_crop_random, strict_random_crop_image_fn, + lambda: tuple(outputs)) + return result + + +def random_pad_image(image, + boxes, + min_image_size=None, + max_image_size=None, + pad_color=None, + seed=None, + preprocess_vars_cache=None): + """Randomly pads the image. + + This function randomly pads the image with zeros. The final size of the + padded image will be between min_image_size and max_image_size. + if min_image_size is smaller than the input image size, min_image_size will + be set to the input image size. The same for max_image_size. The input image + will be located at a uniformly random location inside the padded image. + The relative location of the boxes to the original image will remain the same. + + Args: + image: rank 3 float32 tensor containing 1 image -> [height, width, channels] + with pixel values varying between [0, 1]. + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + min_image_size: a tensor of size [min_height, min_width], type tf.int32. + If passed as None, will be set to image size + [height, width]. + max_image_size: a tensor of size [max_height, max_width], type tf.int32. + If passed as None, will be set to twice the + image [height * 2, width * 2]. + pad_color: padding color. A rank 1 tensor of [3] with dtype=tf.float32. + if set as None, it will be set to average color of the input + image. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: Image shape will be [new_height, new_width, channels]. + boxes: boxes which is the same rank as input boxes. Boxes are in normalized + form. 
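+
+  Example (illustrative sketch; assumes `image` is a [height, width, 3]
+  float32 tensor and `boxes` a [N, 4] tensor of normalized box coordinates;
+  with the default None sizes, the padded size is drawn between the input
+  image size and twice the input image size):
+
+    image, boxes = random_pad_image(image, boxes)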
+ """ + if pad_color is None: + pad_color = tf.reduce_mean(image, axis=[0, 1]) + + image_shape = tf.shape(image) + image_height = image_shape[0] + image_width = image_shape[1] + + if max_image_size is None: + max_image_size = tf.stack([image_height * 2, image_width * 2]) + max_image_size = tf.maximum(max_image_size, + tf.stack([image_height, image_width])) + + if min_image_size is None: + min_image_size = tf.stack([image_height, image_width]) + min_image_size = tf.maximum(min_image_size, + tf.stack([image_height, image_width])) + + target_height = tf.cond( + max_image_size[0] > min_image_size[0], + lambda: _random_integer(min_image_size[0], max_image_size[0], seed), + lambda: max_image_size[0]) + + target_width = tf.cond( + max_image_size[1] > min_image_size[1], + lambda: _random_integer(min_image_size[1], max_image_size[1], seed), + lambda: max_image_size[1]) + + offset_height = tf.cond( + target_height > image_height, + lambda: _random_integer(0, target_height - image_height, seed), + lambda: tf.constant(0, dtype=tf.int32)) + + offset_width = tf.cond( + target_width > image_width, + lambda: _random_integer(0, target_width - image_width, seed), + lambda: tf.constant(0, dtype=tf.int32)) + + gen_func = lambda: (target_height, target_width, offset_height, offset_width) + params = _get_or_create_preprocess_rand_vars( + gen_func, preprocessor_cache.PreprocessorCache.PAD_IMAGE, + preprocess_vars_cache) + target_height, target_width, offset_height, offset_width = params + + new_image = tf.image.pad_to_bounding_box( + image, + offset_height=offset_height, + offset_width=offset_width, + target_height=target_height, + target_width=target_width) + + # Setting color of the padded pixels + image_ones = tf.ones_like(image) + image_ones_padded = tf.image.pad_to_bounding_box( + image_ones, + offset_height=offset_height, + offset_width=offset_width, + target_height=target_height, + target_width=target_width) + image_color_padded = (1.0 - image_ones_padded) * pad_color + new_image += image_color_padded + + # setting boxes + new_window = tf.to_float( + tf.stack([ + -offset_height, -offset_width, target_height - offset_height, + target_width - offset_width + ])) + new_window /= tf.to_float( + tf.stack([image_height, image_width, image_height, image_width])) + boxlist = box_list.BoxList(boxes) + new_boxlist = box_list_ops.change_coordinate_frame(boxlist, new_window) + new_boxes = new_boxlist.get() + + return new_image, new_boxes + + +def random_crop_pad_image(image, + boxes, + labels, + label_scores=None, + multiclass_scores=None, + min_object_covered=1.0, + aspect_ratio_range=(0.75, 1.33), + area_range=(0.1, 1.0), + overlap_thresh=0.3, + random_coef=0.0, + min_padded_size_ratio=(1.0, 1.0), + max_padded_size_ratio=(2.0, 2.0), + pad_color=None, + seed=None, + preprocess_vars_cache=None): + """Randomly crops and pads the image. + + Given an input image and its bounding boxes, this op first randomly crops + the image and then randomly pads the image with background values. Parameters + min_padded_size_ratio and max_padded_size_ratio, determine the range of the + final output image size. Specifically, the final image size will have a size + in the range of min_padded_size_ratio * tf.shape(image) and + max_padded_size_ratio * tf.shape(image). Note that these ratios are with + respect to the size of the original image, so we can't capture the same + effect easily by independently applying RandomCropImage + followed by RandomPadImage. 
+ + Args: + image: rank 3 float32 tensor containing 1 image -> [height, width, channels] + with pixel values varying between [0, 1]. + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + labels: rank 1 int32 tensor containing the object classes. + label_scores: rank 1 float32 containing the label scores. + multiclass_scores: (optional) float32 tensor of shape + [num_instances, num_classes] representing the score for each box for each + class. + min_object_covered: the cropped image must cover at least this fraction of + at least one of the input bounding boxes. + aspect_ratio_range: allowed range for aspect ratio of cropped image. + area_range: allowed range for area ratio between cropped image and the + original image. + overlap_thresh: minimum overlap thresh with new cropped + image to keep the box. + random_coef: a random coefficient that defines the chance of getting the + original image. If random_coef is 0, we will always get the + cropped image, and if it is 1.0, we will always get the + original image. + min_padded_size_ratio: min ratio of padded image height and width to the + input image's height and width. + max_padded_size_ratio: max ratio of padded image height and width to the + input image's height and width. + pad_color: padding color. A rank 1 tensor of [3] with dtype=tf.float32. + if set as None, it will be set to average color of the randomly + cropped image. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + padded_image: padded image. + padded_boxes: boxes which is the same rank as input boxes. Boxes are in + normalized form. + cropped_labels: cropped labels. + if label_scores is not None also returns: + cropped_label_scores: cropped label scores. + if multiclass_scores is not None also returns: + cropped_multiclass_scores: cropped_multiclass_scores. 
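+
+  Example (illustrative sketch; assumes `image`, `boxes` and `labels` are
+  tensors of shapes [height, width, 3], [N, 4] and [N], and that no optional
+  scores are passed):
+
+    image, boxes, labels = random_crop_pad_image(
+        image, boxes, labels,
+        min_padded_size_ratio=(1.0, 1.0),
+        max_padded_size_ratio=(2.0, 2.0))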
+ + """ + image_size = tf.shape(image) + image_height = image_size[0] + image_width = image_size[1] + result = random_crop_image( + image=image, + boxes=boxes, + labels=labels, + label_scores=label_scores, + multiclass_scores=multiclass_scores, + min_object_covered=min_object_covered, + aspect_ratio_range=aspect_ratio_range, + area_range=area_range, + overlap_thresh=overlap_thresh, + random_coef=random_coef, + seed=seed, + preprocess_vars_cache=preprocess_vars_cache) + + cropped_image, cropped_boxes, cropped_labels = result[:3] + + min_image_size = tf.to_int32( + tf.to_float(tf.stack([image_height, image_width])) * + min_padded_size_ratio) + max_image_size = tf.to_int32( + tf.to_float(tf.stack([image_height, image_width])) * + max_padded_size_ratio) + + padded_image, padded_boxes = random_pad_image( + cropped_image, + cropped_boxes, + min_image_size=min_image_size, + max_image_size=max_image_size, + pad_color=pad_color, + seed=seed, + preprocess_vars_cache=preprocess_vars_cache) + + cropped_padded_output = (padded_image, padded_boxes, cropped_labels) + + index = 3 + if label_scores is not None: + cropped_label_scores = result[index] + cropped_padded_output += (cropped_label_scores,) + index += 1 + + if multiclass_scores is not None: + cropped_multiclass_scores = result[index] + cropped_padded_output += (cropped_multiclass_scores,) + + return cropped_padded_output + + +def random_crop_to_aspect_ratio(image, + boxes, + labels, + label_scores=None, + multiclass_scores=None, + masks=None, + keypoints=None, + aspect_ratio=1.0, + overlap_thresh=0.3, + seed=None, + preprocess_vars_cache=None): + """Randomly crops an image to the specified aspect ratio. + + Randomly crops the a portion of the image such that the crop is of the + specified aspect ratio, and the crop is as large as possible. If the specified + aspect ratio is larger than the aspect ratio of the image, this op will + randomly remove rows from the top and bottom of the image. If the specified + aspect ratio is less than the aspect ratio of the image, this op will randomly + remove cols from the left and right of the image. If the specified aspect + ratio is the same as the aspect ratio of the image, this op will return the + image. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 1]. + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + labels: rank 1 int32 tensor containing the object classes. + label_scores: (optional) float32 tensor of shape [num_instances] + representing the score for each box. + multiclass_scores: (optional) float32 tensor of shape + [num_instances, num_classes] representing the score for each box for each + class. + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. The masks + are of the same height, width as the input `image`. + keypoints: (optional) rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2]. The keypoints are in y-x + normalized coordinates. + aspect_ratio: the aspect ratio of cropped image. + overlap_thresh: minimum overlap thresh with new cropped + image to keep the box. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. 
If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same rank as input image. + boxes: boxes which is the same rank as input boxes. + Boxes are in normalized form. + labels: new labels. + + If label_scores, masks, keypoints, or multiclass_scores is not None, the + function also returns: + label_scores: rank 1 float32 tensor with shape [num_instances]. + masks: rank 3 float32 tensor with shape [num_instances, height, width] + containing instance masks. + keypoints: rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2] + multiclass_scores: rank 2 float32 tensor with shape + [num_instances, num_classes] + + Raises: + ValueError: If image is not a 3D tensor. + """ + if len(image.get_shape()) != 3: + raise ValueError('Image should be 3D tensor') + + with tf.name_scope('RandomCropToAspectRatio', values=[image]): + image_shape = tf.shape(image) + orig_height = image_shape[0] + orig_width = image_shape[1] + orig_aspect_ratio = tf.to_float(orig_width) / tf.to_float(orig_height) + new_aspect_ratio = tf.constant(aspect_ratio, dtype=tf.float32) + def target_height_fn(): + return tf.to_int32(tf.round(tf.to_float(orig_width) / new_aspect_ratio)) + + target_height = tf.cond(orig_aspect_ratio >= new_aspect_ratio, + lambda: orig_height, target_height_fn) + + def target_width_fn(): + return tf.to_int32(tf.round(tf.to_float(orig_height) * new_aspect_ratio)) + + target_width = tf.cond(orig_aspect_ratio <= new_aspect_ratio, + lambda: orig_width, target_width_fn) + + # either offset_height = 0 and offset_width is randomly chosen from + # [0, offset_width - target_width), or else offset_width = 0 and + # offset_height is randomly chosen from [0, offset_height - target_height) + offset_height = _random_integer(0, orig_height - target_height + 1, seed) + offset_width = _random_integer(0, orig_width - target_width + 1, seed) + + generator_func = lambda: (offset_height, offset_width) + offset_height, offset_width = _get_or_create_preprocess_rand_vars( + generator_func, + preprocessor_cache.PreprocessorCache.CROP_TO_ASPECT_RATIO, + preprocess_vars_cache) + + new_image = tf.image.crop_to_bounding_box( + image, offset_height, offset_width, target_height, target_width) + + im_box = tf.stack([ + tf.to_float(offset_height) / tf.to_float(orig_height), + tf.to_float(offset_width) / tf.to_float(orig_width), + tf.to_float(offset_height + target_height) / tf.to_float(orig_height), + tf.to_float(offset_width + target_width) / tf.to_float(orig_width) + ]) + + boxlist = box_list.BoxList(boxes) + boxlist.add_field('labels', labels) + + if label_scores is not None: + boxlist.add_field('label_scores', label_scores) + + if multiclass_scores is not None: + boxlist.add_field('multiclass_scores', multiclass_scores) + + im_boxlist = box_list.BoxList(tf.expand_dims(im_box, 0)) + + # remove boxes whose overlap with the image is less than overlap_thresh + overlapping_boxlist, keep_ids = box_list_ops.prune_non_overlapping_boxes( + boxlist, im_boxlist, overlap_thresh) + + # change the coordinate of the remaining boxes + new_labels = overlapping_boxlist.get_field('labels') + new_boxlist = box_list_ops.change_coordinate_frame(overlapping_boxlist, + im_box) + new_boxlist = box_list_ops.clip_to_window(new_boxlist, + tf.constant([0.0, 0.0, 1.0, 1.0], + tf.float32)) + new_boxes = new_boxlist.get() + + result = [new_image, new_boxes, new_labels] + + if label_scores is not None: + new_label_scores = 
overlapping_boxlist.get_field('label_scores') + result.append(new_label_scores) + + if multiclass_scores is not None: + new_multiclass_scores = overlapping_boxlist.get_field('multiclass_scores') + result.append(new_multiclass_scores) + + if masks is not None: + masks_inside_window = tf.gather(masks, keep_ids) + masks_box_begin = tf.stack([0, offset_height, offset_width]) + masks_box_size = tf.stack([-1, target_height, target_width]) + new_masks = tf.slice(masks_inside_window, masks_box_begin, masks_box_size) + result.append(new_masks) + + if keypoints is not None: + keypoints_inside_window = tf.gather(keypoints, keep_ids) + new_keypoints = keypoint_ops.change_coordinate_frame( + keypoints_inside_window, im_box) + new_keypoints = keypoint_ops.prune_outside_window(new_keypoints, + [0.0, 0.0, 1.0, 1.0]) + result.append(new_keypoints) + + return tuple(result) + + +def random_pad_to_aspect_ratio(image, + boxes, + masks=None, + keypoints=None, + aspect_ratio=1.0, + min_padded_size_ratio=(1.0, 1.0), + max_padded_size_ratio=(2.0, 2.0), + seed=None, + preprocess_vars_cache=None): + """Randomly zero pads an image to the specified aspect ratio. + + Pads the image so that the resulting image will have the specified aspect + ratio without scaling less than the min_padded_size_ratio or more than the + max_padded_size_ratio. If the min_padded_size_ratio or max_padded_size_ratio + is lower than what is possible to maintain the aspect ratio, then this method + will use the least padding to achieve the specified aspect ratio. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 1]. + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. The masks + are of the same height, width as the input `image`. + keypoints: (optional) rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2]. The keypoints are in y-x + normalized coordinates. + aspect_ratio: aspect ratio of the final image. + min_padded_size_ratio: min ratio of padded image height and width to the + input image's height and width. + max_padded_size_ratio: max ratio of padded image height and width to the + input image's height and width. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same rank as input image. + boxes: boxes which is the same rank as input boxes. + Boxes are in normalized form. + labels: new labels. + + If masks, or keypoints is not None, the function also returns: + masks: rank 3 float32 tensor with shape [num_instances, height, width] + containing instance masks. + keypoints: rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2] + + Raises: + ValueError: If image is not a 3D tensor. 
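+ 
+ Example (illustrative): a 100x200 image with aspect_ratio=1.0 and the 
+ default min/max_padded_size_ratio is zero padded at the bottom to 200x200, 
+ which is the least padding that achieves the requested aspect ratio. 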
+ """ + if len(image.get_shape()) != 3: + raise ValueError('Image should be 3D tensor') + + with tf.name_scope('RandomPadToAspectRatio', values=[image]): + image_shape = tf.shape(image) + image_height = tf.to_float(image_shape[0]) + image_width = tf.to_float(image_shape[1]) + image_aspect_ratio = image_width / image_height + new_aspect_ratio = tf.constant(aspect_ratio, dtype=tf.float32) + target_height = tf.cond( + image_aspect_ratio <= new_aspect_ratio, + lambda: image_height, + lambda: image_width / new_aspect_ratio) + target_width = tf.cond( + image_aspect_ratio >= new_aspect_ratio, + lambda: image_width, + lambda: image_height * new_aspect_ratio) + + min_height = tf.maximum( + min_padded_size_ratio[0] * image_height, target_height) + min_width = tf.maximum( + min_padded_size_ratio[1] * image_width, target_width) + max_height = tf.maximum( + max_padded_size_ratio[0] * image_height, target_height) + max_width = tf.maximum( + max_padded_size_ratio[1] * image_width, target_width) + + max_scale = tf.minimum(max_height / target_height, max_width / target_width) + min_scale = tf.minimum( + max_scale, + tf.maximum(min_height / target_height, min_width / target_width)) + + generator_func = functools.partial(tf.random_uniform, [], + min_scale, max_scale, seed=seed) + scale = _get_or_create_preprocess_rand_vars( + generator_func, + preprocessor_cache.PreprocessorCache.PAD_TO_ASPECT_RATIO, + preprocess_vars_cache) + + target_height = tf.round(scale * target_height) + target_width = tf.round(scale * target_width) + + new_image = tf.image.pad_to_bounding_box( + image, 0, 0, tf.to_int32(target_height), tf.to_int32(target_width)) + + im_box = tf.stack([ + 0.0, + 0.0, + target_height / image_height, + target_width / image_width + ]) + boxlist = box_list.BoxList(boxes) + new_boxlist = box_list_ops.change_coordinate_frame(boxlist, im_box) + new_boxes = new_boxlist.get() + + result = [new_image, new_boxes] + + if masks is not None: + new_masks = tf.expand_dims(masks, -1) + new_masks = tf.image.pad_to_bounding_box(new_masks, 0, 0, + tf.to_int32(target_height), + tf.to_int32(target_width)) + new_masks = tf.squeeze(new_masks, [-1]) + result.append(new_masks) + + if keypoints is not None: + new_keypoints = keypoint_ops.change_coordinate_frame(keypoints, im_box) + result.append(new_keypoints) + + return tuple(result) + + +def random_black_patches(image, + max_black_patches=10, + probability=0.5, + size_to_image_ratio=0.1, + random_seed=None, + preprocess_vars_cache=None): + """Randomly adds some black patches to the image. + + This op adds up to max_black_patches square black patches of a fixed size + to the image where size is specified via the size_to_image_ratio parameter. + + Args: + image: rank 3 float32 tensor containing 1 image -> [height, width, channels] + with pixel values varying between [0, 1]. + max_black_patches: number of times that the function tries to add a + black box to the image. + probability: at each try, what is the chance of adding a box. + size_to_image_ratio: Determines the ratio of the size of the black patches + to the size of the image. + box_size = size_to_image_ratio * + min(image_width, image_height) + random_seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image + """ + def add_black_patch_to_image(image, idx): + """Function for adding one patch to the image. 
+ + Args: + image: image + idx: counter for number of patches that could have been added + + Returns: + image with a randomly added black box + """ + image_shape = tf.shape(image) + image_height = image_shape[0] + image_width = image_shape[1] + box_size = tf.to_int32( + tf.multiply( + tf.minimum(tf.to_float(image_height), tf.to_float(image_width)), + size_to_image_ratio)) + + generator_func = functools.partial(tf.random_uniform, [], minval=0.0, + maxval=(1.0 - size_to_image_ratio), + seed=random_seed) + normalized_y_min = _get_or_create_preprocess_rand_vars( + generator_func, + preprocessor_cache.PreprocessorCache.ADD_BLACK_PATCH, + preprocess_vars_cache, key=str(idx) + 'y') + normalized_x_min = _get_or_create_preprocess_rand_vars( + generator_func, + preprocessor_cache.PreprocessorCache.ADD_BLACK_PATCH, + preprocess_vars_cache, key=str(idx) + 'x') + + y_min = tf.to_int32(normalized_y_min * tf.to_float(image_height)) + x_min = tf.to_int32(normalized_x_min * tf.to_float(image_width)) + black_box = tf.ones([box_size, box_size, 3], dtype=tf.float32) + mask = 1.0 - tf.image.pad_to_bounding_box(black_box, y_min, x_min, + image_height, image_width) + image = tf.multiply(image, mask) + return image + + with tf.name_scope('RandomBlackPatchInImage', values=[image]): + for idx in range(max_black_patches): + generator_func = functools.partial(tf.random_uniform, [], + minval=0.0, maxval=1.0, + dtype=tf.float32, seed=random_seed) + random_prob = _get_or_create_preprocess_rand_vars( + generator_func, + preprocessor_cache.PreprocessorCache.BLACK_PATCHES, + preprocess_vars_cache, key=idx) + image = tf.cond( + tf.greater(random_prob, probability), lambda: image, + functools.partial(add_black_patch_to_image, image=image, idx=idx)) + return image + + +def image_to_float(image): + """Used in Faster R-CNN. Casts image pixel values to float. + + Args: + image: input image which might be in tf.uint8 or sth else format + + Returns: + image: image in tf.float32 format. + """ + with tf.name_scope('ImageToFloat', values=[image]): + image = tf.to_float(image) + return image + + +def random_resize_method(image, target_size, preprocess_vars_cache=None): + """Uses a random resize method to resize the image to target size. + + Args: + image: a rank 3 tensor. + target_size: a list of [target_height, target_width] + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + resized image. + """ + + resized_image = _apply_with_random_selector( + image, + lambda x, method: tf.image.resize_images(x, target_size, method), + num_cases=4, + preprocess_vars_cache=preprocess_vars_cache, + key=preprocessor_cache.PreprocessorCache.RESIZE_METHOD) + + return resized_image + + +def _compute_new_static_size(image, min_dimension, max_dimension): + """Compute new static shape for resize_to_range method.""" + image_shape = image.get_shape().as_list() + orig_height = image_shape[0] + orig_width = image_shape[1] + num_channels = image_shape[2] + orig_min_dim = min(orig_height, orig_width) + # Calculates the larger of the possible sizes + large_scale_factor = min_dimension / float(orig_min_dim) + # Scaling orig_(height|width) by large_scale_factor will make the smaller + # dimension equal to min_dimension, save for floating point rounding errors. + # For reasonably-sized images, taking the nearest integer will reliably + # eliminate this error. 
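+ # Illustrative example: a 100x200 image with min_dimension=600 gives 
+ # large_scale_factor=6.0 and large_size=[600, 1200]. 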
+ large_height = int(round(orig_height * large_scale_factor)) + large_width = int(round(orig_width * large_scale_factor)) + large_size = [large_height, large_width] + if max_dimension: + # Calculates the smaller of the possible sizes, use that if the larger + # is too big. + orig_max_dim = max(orig_height, orig_width) + small_scale_factor = max_dimension / float(orig_max_dim) + # Scaling orig_(height|width) by small_scale_factor will make the larger + # dimension equal to max_dimension, save for floating point rounding + # errors. For reasonably-sized images, taking the nearest integer will + # reliably eliminate this error. + small_height = int(round(orig_height * small_scale_factor)) + small_width = int(round(orig_width * small_scale_factor)) + small_size = [small_height, small_width] + new_size = large_size + if max(large_size) > max_dimension: + new_size = small_size + else: + new_size = large_size + return tf.constant(new_size + [num_channels]) + + +def _compute_new_dynamic_size(image, min_dimension, max_dimension): + """Compute new dynamic shape for resize_to_range method.""" + image_shape = tf.shape(image) + orig_height = tf.to_float(image_shape[0]) + orig_width = tf.to_float(image_shape[1]) + num_channels = image_shape[2] + orig_min_dim = tf.minimum(orig_height, orig_width) + # Calculates the larger of the possible sizes + min_dimension = tf.constant(min_dimension, dtype=tf.float32) + large_scale_factor = min_dimension / orig_min_dim + # Scaling orig_(height|width) by large_scale_factor will make the smaller + # dimension equal to min_dimension, save for floating point rounding errors. + # For reasonably-sized images, taking the nearest integer will reliably + # eliminate this error. + large_height = tf.to_int32(tf.round(orig_height * large_scale_factor)) + large_width = tf.to_int32(tf.round(orig_width * large_scale_factor)) + large_size = tf.stack([large_height, large_width]) + if max_dimension: + # Calculates the smaller of the possible sizes, use that if the larger + # is too big. + orig_max_dim = tf.maximum(orig_height, orig_width) + max_dimension = tf.constant(max_dimension, dtype=tf.float32) + small_scale_factor = max_dimension / orig_max_dim + # Scaling orig_(height|width) by small_scale_factor will make the larger + # dimension equal to max_dimension, save for floating point rounding + # errors. For reasonably-sized images, taking the nearest integer will + # reliably eliminate this error. + small_height = tf.to_int32(tf.round(orig_height * small_scale_factor)) + small_width = tf.to_int32(tf.round(orig_width * small_scale_factor)) + small_size = tf.stack([small_height, small_width]) + new_size = tf.cond( + tf.to_float(tf.reduce_max(large_size)) > max_dimension, + lambda: small_size, lambda: large_size) + else: + new_size = large_size + return tf.stack(tf.unstack(new_size) + [num_channels]) + + +def resize_to_range(image, + masks=None, + min_dimension=None, + max_dimension=None, + method=tf.image.ResizeMethod.BILINEAR, + align_corners=False, + pad_to_max_dimension=False, + per_channel_pad_value=(0, 0, 0)): + """Resizes an image so its dimensions are within the provided value. + + The output size can be described by two cases: + 1. If the image can be rescaled so its minimum dimension is equal to the + provided value without the other dimension exceeding max_dimension, + then do so. + 2. Otherwise, resize so the largest dimension is equal to max_dimension. 
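+ 
+ For example (illustrative), with min_dimension=600 and max_dimension=1024, 
+ a 400x500 image is resized to 600x750 (case 1), while a 100x200 image is 
+ resized to 512x1024, because scaling its smaller side to 600 would push the 
+ larger side to 1200 > max_dimension (case 2). 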
+ + Args: + image: A 3D tensor of shape [height, width, channels] + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. + min_dimension: (optional) (scalar) desired size of the smaller image + dimension. + max_dimension: (optional) (scalar) maximum allowed size + of the larger image dimension. + method: (optional) interpolation method used in resizing. Defaults to + BILINEAR. + align_corners: bool. If true, exactly align all 4 corners of the input + and output. Defaults to False. + pad_to_max_dimension: Whether to resize the image and pad it with zeros + so the resulting image is of the spatial size + [max_dimension, max_dimension]. If masks are included they are padded + similarly. + per_channel_pad_value: A tuple of per-channel scalar value to use for + padding. By default pads zeros. + + Returns: + Note that the position of the resized_image_shape changes based on whether + masks are present. + resized_image: A 3D tensor of shape [new_height, new_width, channels], + where the image has been resized (with bilinear interpolation) so that + min(new_height, new_width) == min_dimension or + max(new_height, new_width) == max_dimension. + resized_masks: If masks is not None, also outputs masks. A 3D tensor of + shape [num_instances, new_height, new_width]. + resized_image_shape: A 1D tensor of shape [3] containing shape of the + resized image. + + Raises: + ValueError: if the image is not a 3D tensor. + """ + if len(image.get_shape()) != 3: + raise ValueError('Image should be 3D tensor') + + with tf.name_scope('ResizeToRange', values=[image, min_dimension]): + if image.get_shape().is_fully_defined(): + new_size = _compute_new_static_size(image, min_dimension, max_dimension) + else: + new_size = _compute_new_dynamic_size(image, min_dimension, max_dimension) + new_image = tf.image.resize_images( + image, new_size[:-1], method=method, align_corners=align_corners) + + if pad_to_max_dimension: + channels = tf.unstack(new_image, axis=2) + if len(channels) != len(per_channel_pad_value): + raise ValueError('Number of channels must be equal to the length of ' + 'per-channel pad value.') + new_image = tf.stack( + [ + tf.pad( + channels[i], [[0, max_dimension - new_size[0]], + [0, max_dimension - new_size[1]]], + constant_values=per_channel_pad_value[i]) + for i in range(len(channels)) + ], + axis=2) + new_image.set_shape([max_dimension, max_dimension, 3]) + + result = [new_image] + if masks is not None: + new_masks = tf.expand_dims(masks, 3) + new_masks = tf.image.resize_images( + new_masks, + new_size[:-1], + method=tf.image.ResizeMethod.NEAREST_NEIGHBOR, + align_corners=align_corners) + new_masks = tf.squeeze(new_masks, 3) + if pad_to_max_dimension: + new_masks = tf.image.pad_to_bounding_box( + new_masks, 0, 0, max_dimension, max_dimension) + result.append(new_masks) + + result.append(new_size) + return result + + +# TODO(alirezafathi): Make sure the static shapes are preserved. +def resize_to_min_dimension(image, masks=None, min_dimension=600): + """Resizes image and masks given the min size maintaining the aspect ratio. + + If one of the image dimensions is smaller that min_dimension, it will scale + the image such that its smallest dimension is equal to min_dimension. + Otherwise, will keep the image size as is. + + Args: + image: a tensor of size [height, width, channels]. + masks: (optional) a tensors of size [num_instances, height, width]. + min_dimension: minimum image dimension. 
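+ 
+ For example (illustrative), with min_dimension=600 a 300x500 image is 
+ resized to 600x1000, while an 800x1200 image is returned unchanged. 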
+ + Returns: + Note that the position of the resized_image_shape changes based on whether + masks are present. + resized_image: A tensor of size [new_height, new_width, channels]. + resized_masks: If masks is not None, also outputs masks. A 3D tensor of + shape [num_instances, new_height, new_width] + resized_image_shape: A 1D tensor of shape [3] containing the shape of the + resized image. + + Raises: + ValueError: if the image is not a 3D tensor. + """ + if len(image.get_shape()) != 3: + raise ValueError('Image should be 3D tensor') + + with tf.name_scope('ResizeGivenMinDimension', values=[image, min_dimension]): + image_height = tf.shape(image)[0] + image_width = tf.shape(image)[1] + num_channels = tf.shape(image)[2] + min_image_dimension = tf.minimum(image_height, image_width) + min_target_dimension = tf.maximum(min_image_dimension, min_dimension) + target_ratio = tf.to_float(min_target_dimension) / tf.to_float( + min_image_dimension) + target_height = tf.to_int32(tf.to_float(image_height) * target_ratio) + target_width = tf.to_int32(tf.to_float(image_width) * target_ratio) + image = tf.image.resize_bilinear( + tf.expand_dims(image, axis=0), + size=[target_height, target_width], + align_corners=True) + result = [tf.squeeze(image, axis=0)] + + if masks is not None: + masks = tf.image.resize_nearest_neighbor( + tf.expand_dims(masks, axis=3), + size=[target_height, target_width], + align_corners=True) + result.append(tf.squeeze(masks, axis=3)) + + result.append(tf.stack([target_height, target_width, num_channels])) + return result + + +def scale_boxes_to_pixel_coordinates(image, boxes, keypoints=None): + """Scales boxes from normalized to pixel coordinates. + + Args: + image: A 3D float32 tensor of shape [height, width, channels]. + boxes: A 2D float32 tensor of shape [num_boxes, 4] containing the bounding + boxes in normalized coordinates. Each row is of the form + [ymin, xmin, ymax, xmax]. + keypoints: (optional) rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2]. The keypoints are in y-x normalized + coordinates. + + Returns: + image: unchanged input image. + scaled_boxes: a 2D float32 tensor of shape [num_boxes, 4] containing the + bounding boxes in pixel coordinates. + scaled_keypoints: a 3D float32 tensor with shape + [num_instances, num_keypoints, 2] containing the keypoints in pixel + coordinates. + """ + boxlist = box_list.BoxList(boxes) + image_height = tf.shape(image)[0] + image_width = tf.shape(image)[1] + scaled_boxes = box_list_ops.scale(boxlist, image_height, image_width).get() + result = [image, scaled_boxes] + if keypoints is not None: + scaled_keypoints = keypoint_ops.scale(keypoints, image_height, image_width) + result.append(scaled_keypoints) + return tuple(result) + + +# TODO(alirezafathi): Investigate if instead the function should return None if +# masks is None. +# pylint: disable=g-doc-return-or-yield +def resize_image(image, + masks=None, + new_height=600, + new_width=1024, + method=tf.image.ResizeMethod.BILINEAR, + align_corners=False): + """Resizes images to the given height and width. + + Args: + image: A 3D tensor of shape [height, width, channels] + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. + new_height: (optional) (scalar) desired height of the image. + new_width: (optional) (scalar) desired width of the image. + method: (optional) interpolation method used in resizing. Defaults to + BILINEAR. + align_corners: bool. 
If true, exactly align all 4 corners of the input + and output. Defaults to False. + + Returns: + Note that the position of the resized_image_shape changes based on whether + masks are present. + resized_image: A tensor of size [new_height, new_width, channels]. + resized_masks: If masks is not None, also outputs masks. A 3D tensor of + shape [num_instances, new_height, new_width] + resized_image_shape: A 1D tensor of shape [3] containing the shape of the + resized image. + """ + with tf.name_scope( + 'ResizeImage', + values=[image, new_height, new_width, method, align_corners]): + new_image = tf.image.resize_images( + image, tf.stack([new_height, new_width]), + method=method, + align_corners=align_corners) + image_shape = shape_utils.combined_static_and_dynamic_shape(image) + result = [new_image] + if masks is not None: + num_instances = tf.shape(masks)[0] + new_size = tf.stack([new_height, new_width]) + def resize_masks_branch(): + new_masks = tf.expand_dims(masks, 3) + new_masks = tf.image.resize_nearest_neighbor( + new_masks, new_size, align_corners=align_corners) + new_masks = tf.squeeze(new_masks, axis=3) + return new_masks + + def reshape_masks_branch(): + # The shape function will be computed for both branches of the + # condition, regardless of which branch is actually taken. Make sure + # that we don't trigger an assertion in the shape function when trying + # to reshape a non empty tensor into an empty one. + new_masks = tf.reshape(masks, [-1, new_size[0], new_size[1]]) + return new_masks + + masks = tf.cond(num_instances > 0, resize_masks_branch, + reshape_masks_branch) + result.append(masks) + + result.append(tf.stack([new_height, new_width, image_shape[2]])) + return result + + +def subtract_channel_mean(image, means=None): + """Normalizes an image by subtracting a mean from each channel. + + Args: + image: A 3D tensor of shape [height, width, channels] + means: float list containing a mean for each channel + Returns: + normalized_images: a tensor of shape [height, width, channels] + Raises: + ValueError: if images is not a 4D tensor or if the number of means is not + equal to the number of channels. + """ + with tf.name_scope('SubtractChannelMean', values=[image, means]): + if len(image.get_shape()) != 3: + raise ValueError('Input must be of size [height, width, channels]') + if len(means) != image.get_shape()[-1]: + raise ValueError('len(means) must match the number of channels') + return image - [[means]] + + +def one_hot_encoding(labels, num_classes=None): + """One-hot encodes the multiclass labels. + + Example usage: + labels = tf.constant([1, 4], dtype=tf.int32) + one_hot = OneHotEncoding(labels, num_classes=5) + one_hot.eval() # evaluates to [0, 1, 0, 0, 1] + + Args: + labels: A tensor of shape [None] corresponding to the labels. + num_classes: Number of classes in the dataset. + Returns: + onehot_labels: a tensor of shape [num_classes] corresponding to the one hot + encoding of the labels. + Raises: + ValueError: if num_classes is not specified. + """ + with tf.name_scope('OneHotEncoding', values=[labels]): + if num_classes is None: + raise ValueError('num_classes must be specified') + + labels = tf.one_hot(labels, num_classes, 1, 0) + return tf.reduce_max(labels, 0) + + +def rgb_to_gray(image): + """Converts a 3 channel RGB image to a 1 channel grayscale image. + + Args: + image: Rank 3 float32 tensor containing 1 image -> [height, width, 3] + with pixel values varying between [0, 1]. + + Returns: + image: A single channel grayscale image -> [image, height, 1]. 
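+ For example, a [480, 640, 3] RGB input produces a [480, 640, 1] grayscale 
+ output. 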
+ """ + return _rgb_to_grayscale(image) + + +def ssd_random_crop(image, + boxes, + labels, + label_scores=None, + multiclass_scores=None, + masks=None, + keypoints=None, + min_object_covered=(0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0), + aspect_ratio_range=((0.5, 2.0),) * 7, + area_range=((0.1, 1.0),) * 7, + overlap_thresh=(0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0), + random_coef=(0.15,) * 7, + seed=None, + preprocess_vars_cache=None): + """Random crop preprocessing with default parameters as in SSD paper. + + Liu et al., SSD: Single shot multibox detector. + For further information on random crop preprocessing refer to RandomCrop + function above. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 1]. + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + labels: rank 1 int32 tensor containing the object classes. + label_scores: rank 1 float32 tensor containing the scores. + multiclass_scores: (optional) float32 tensor of shape + [num_instances, num_classes] representing the score for each box for each + class. + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. The masks + are of the same height, width as the input `image`. + keypoints: (optional) rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2]. The keypoints are in y-x + normalized coordinates. + min_object_covered: the cropped image must cover at least this fraction of + at least one of the input bounding boxes. + aspect_ratio_range: allowed range for aspect ratio of cropped image. + area_range: allowed range for area ratio between cropped image and the + original image. + overlap_thresh: minimum overlap thresh with new cropped + image to keep the box. + random_coef: a random coefficient that defines the chance of getting the + original image. If random_coef is 0, we will always get the + cropped image, and if it is 1.0, we will always get the + original image. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same rank as input image. + boxes: boxes which is the same rank as input boxes. + Boxes are in normalized form. + labels: new labels. + + If label_scores, multiclass_scores, masks, or keypoints is not None, the + function also returns: + label_scores: rank 1 float32 tensor with shape [num_instances]. + multiclass_scores: rank 2 float32 tensor with shape + [num_instances, num_classes] + masks: rank 3 float32 tensor with shape [num_instances, height, width] + containing instance masks. + keypoints: rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2] + """ + + def random_crop_selector(selected_result, index): + """Applies random_crop_image to selected result. + + Args: + selected_result: A tuple containing image, boxes, labels, keypoints (if + not None), and masks (if not None). + index: The index that was randomly selected. + + Returns: A tuple containing image, boxes, labels, keypoints (if not None), + and masks (if not None). 
+ """ + + i = 3 + image, boxes, labels = selected_result[:i] + selected_label_scores = None + selected_multiclass_scores = None + selected_masks = None + selected_keypoints = None + if label_scores is not None: + selected_label_scores = selected_result[i] + i += 1 + if multiclass_scores is not None: + selected_multiclass_scores = selected_result[i] + i += 1 + if masks is not None: + selected_masks = selected_result[i] + i += 1 + if keypoints is not None: + selected_keypoints = selected_result[i] + + return random_crop_image( + image=image, + boxes=boxes, + labels=labels, + label_scores=selected_label_scores, + multiclass_scores=selected_multiclass_scores, + masks=selected_masks, + keypoints=selected_keypoints, + min_object_covered=min_object_covered[index], + aspect_ratio_range=aspect_ratio_range[index], + area_range=area_range[index], + overlap_thresh=overlap_thresh[index], + random_coef=random_coef[index], + seed=seed, + preprocess_vars_cache=preprocess_vars_cache) + + result = _apply_with_random_selector_tuples( + tuple( + t for t in (image, boxes, labels, label_scores, multiclass_scores, + masks, keypoints) if t is not None), + random_crop_selector, + num_cases=len(min_object_covered), + preprocess_vars_cache=preprocess_vars_cache, + key=preprocessor_cache.PreprocessorCache.SSD_CROP_SELECTOR_ID) + return result + + +def ssd_random_crop_pad(image, + boxes, + labels, + label_scores=None, + multiclass_scores=None, + min_object_covered=(0.1, 0.3, 0.5, 0.7, 0.9, 1.0), + aspect_ratio_range=((0.5, 2.0),) * 6, + area_range=((0.1, 1.0),) * 6, + overlap_thresh=(0.1, 0.3, 0.5, 0.7, 0.9, 1.0), + random_coef=(0.15,) * 6, + min_padded_size_ratio=((1.0, 1.0),) * 6, + max_padded_size_ratio=((2.0, 2.0),) * 6, + pad_color=(None,) * 6, + seed=None, + preprocess_vars_cache=None): + """Random crop preprocessing with default parameters as in SSD paper. + + Liu et al., SSD: Single shot multibox detector. + For further information on random crop preprocessing refer to RandomCrop + function above. + + Args: + image: rank 3 float32 tensor containing 1 image -> [height, width, channels] + with pixel values varying between [0, 1]. + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + labels: rank 1 int32 tensor containing the object classes. + label_scores: float32 tensor of shape [num_instances] representing the + score for each box. + multiclass_scores: (optional) float32 tensor of shape + [num_instances, num_classes] representing the score for each box for each + class. + min_object_covered: the cropped image must cover at least this fraction of + at least one of the input bounding boxes. + aspect_ratio_range: allowed range for aspect ratio of cropped image. + area_range: allowed range for area ratio between cropped image and the + original image. + overlap_thresh: minimum overlap thresh with new cropped + image to keep the box. + random_coef: a random coefficient that defines the chance of getting the + original image. If random_coef is 0, we will always get the + cropped image, and if it is 1.0, we will always get the + original image. + min_padded_size_ratio: min ratio of padded image height and width to the + input image's height and width. + max_padded_size_ratio: max ratio of padded image height and width to the + input image's height and width. + pad_color: padding color. A rank 1 tensor of [3] with dtype=tf.float32. 
+ if set as None, it will be set to average color of the randomly + cropped image. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: Image shape will be [new_height, new_width, channels]. + boxes: boxes which is the same rank as input boxes. Boxes are in normalized + form. + new_labels: new labels. + new_label_scores: new label scores. + """ + + def random_crop_pad_selector(image_boxes_labels, index): + """Random crop preprocessing helper.""" + i = 3 + image, boxes, labels = image_boxes_labels[:i] + selected_label_scores = None + selected_multiclass_scores = None + if label_scores is not None: + selected_label_scores = image_boxes_labels[i] + i += 1 + if multiclass_scores is not None: + selected_multiclass_scores = image_boxes_labels[i] + + return random_crop_pad_image( + image, + boxes, + labels, + label_scores=selected_label_scores, + multiclass_scores=selected_multiclass_scores, + min_object_covered=min_object_covered[index], + aspect_ratio_range=aspect_ratio_range[index], + area_range=area_range[index], + overlap_thresh=overlap_thresh[index], + random_coef=random_coef[index], + min_padded_size_ratio=min_padded_size_ratio[index], + max_padded_size_ratio=max_padded_size_ratio[index], + pad_color=pad_color[index], + seed=seed, + preprocess_vars_cache=preprocess_vars_cache) + + return _apply_with_random_selector_tuples( + tuple(t for t in (image, boxes, labels, label_scores, multiclass_scores) + if t is not None), + random_crop_pad_selector, + num_cases=len(min_object_covered), + preprocess_vars_cache=preprocess_vars_cache, + key=preprocessor_cache.PreprocessorCache.SSD_CROP_PAD_SELECTOR_ID) + + +def ssd_random_crop_fixed_aspect_ratio( + image, + boxes, + labels, + label_scores=None, + multiclass_scores=None, + masks=None, + keypoints=None, + min_object_covered=(0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0), + aspect_ratio=1.0, + area_range=((0.1, 1.0),) * 7, + overlap_thresh=(0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0), + random_coef=(0.15,) * 7, + seed=None, + preprocess_vars_cache=None): + """Random crop preprocessing with default parameters as in SSD paper. + + Liu et al., SSD: Single shot multibox detector. + For further information on random crop preprocessing refer to RandomCrop + function above. + + The only difference is that the aspect ratio of the crops are fixed. + + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 1]. + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + labels: rank 1 int32 tensor containing the object classes. + label_scores: (optional) float32 tensor of shape [num_instances] + representing the score for each box. + multiclass_scores: (optional) float32 tensor of shape + [num_instances, num_classes] representing the score for each box for each + class. + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. The masks + are of the same height, width as the input `image`. + keypoints: (optional) rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2]. The keypoints are in y-x + normalized coordinates. 
+ min_object_covered: the cropped image must cover at least this fraction of + at least one of the input bounding boxes. + aspect_ratio: aspect ratio of the cropped image. + area_range: allowed range for area ratio between cropped image and the + original image. + overlap_thresh: minimum overlap thresh with new cropped + image to keep the box. + random_coef: a random coefficient that defines the chance of getting the + original image. If random_coef is 0, we will always get the + cropped image, and if it is 1.0, we will always get the + original image. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same rank as input image. + boxes: boxes which is the same rank as input boxes. + Boxes are in normalized form. + labels: new labels. + + If mulitclass_scores, masks, or keypoints is not None, the function also + returns: + + multiclass_scores: rank 2 float32 tensor with shape + [num_instances, num_classes] + masks: rank 3 float32 tensor with shape [num_instances, height, width] + containing instance masks. + keypoints: rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2] + """ + aspect_ratio_range = ((aspect_ratio, aspect_ratio),) * len(area_range) + + crop_result = ssd_random_crop( + image, + boxes, + labels, + label_scores=label_scores, + multiclass_scores=multiclass_scores, + masks=masks, + keypoints=keypoints, + min_object_covered=min_object_covered, + aspect_ratio_range=aspect_ratio_range, + area_range=area_range, + overlap_thresh=overlap_thresh, + random_coef=random_coef, + seed=seed, + preprocess_vars_cache=preprocess_vars_cache) + i = 3 + new_image, new_boxes, new_labels = crop_result[:i] + new_label_scores = None + new_multiclass_scores = None + new_masks = None + new_keypoints = None + if label_scores is not None: + new_label_scores = crop_result[i] + i += 1 + if multiclass_scores is not None: + new_multiclass_scores = crop_result[i] + i += 1 + if masks is not None: + new_masks = crop_result[i] + i += 1 + if keypoints is not None: + new_keypoints = crop_result[i] + + result = random_crop_to_aspect_ratio( + new_image, + new_boxes, + new_labels, + label_scores=new_label_scores, + multiclass_scores=new_multiclass_scores, + masks=new_masks, + keypoints=new_keypoints, + aspect_ratio=aspect_ratio, + seed=seed, + preprocess_vars_cache=preprocess_vars_cache) + + return result + + +def ssd_random_crop_pad_fixed_aspect_ratio( + image, + boxes, + labels, + label_scores=None, + multiclass_scores=None, + masks=None, + keypoints=None, + min_object_covered=(0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0), + aspect_ratio=1.0, + aspect_ratio_range=((0.5, 2.0),) * 7, + area_range=((0.1, 1.0),) * 7, + overlap_thresh=(0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0), + random_coef=(0.15,) * 7, + min_padded_size_ratio=(1.0, 1.0), + max_padded_size_ratio=(2.0, 2.0), + seed=None, + preprocess_vars_cache=None): + """Random crop and pad preprocessing with default parameters as in SSD paper. + + Liu et al., SSD: Single shot multibox detector. + For further information on random crop preprocessing refer to RandomCrop + function above. + + The only difference is that after the initial crop, images are zero-padded + to a fixed aspect ratio instead of being resized to that aspect ratio. 
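+ 
+ For example (illustrative), with aspect_ratio=1.0 and the default padded 
+ size ratios, a 100x200 crop is zero padded to 200x200 by 
+ random_pad_to_aspect_ratio rather than being cropped or resized to a square. 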
+ + Args: + image: rank 3 float32 tensor contains 1 image -> [height, width, channels] + with pixel values varying between [0, 1]. + boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning their coordinates vary + between [0, 1]. + Each row is in the form of [ymin, xmin, ymax, xmax]. + labels: rank 1 int32 tensor containing the object classes. + label_scores: (optional) float32 tensor of shape [num_instances] + representing the score for each box. + multiclass_scores: (optional) float32 tensor of shape + [num_instances, num_classes] representing the score for each box for each + class. + masks: (optional) rank 3 float32 tensor with shape + [num_instances, height, width] containing instance masks. The masks + are of the same height, width as the input `image`. + keypoints: (optional) rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2]. The keypoints are in y-x + normalized coordinates. + min_object_covered: the cropped image must cover at least this fraction of + at least one of the input bounding boxes. + aspect_ratio: the final aspect ratio to pad to. + aspect_ratio_range: allowed range for aspect ratio of cropped image. + area_range: allowed range for area ratio between cropped image and the + original image. + overlap_thresh: minimum overlap thresh with new cropped + image to keep the box. + random_coef: a random coefficient that defines the chance of getting the + original image. If random_coef is 0, we will always get the + cropped image, and if it is 1.0, we will always get the + original image. + min_padded_size_ratio: min ratio of padded image height and width to the + input image's height and width. + max_padded_size_ratio: max ratio of padded image height and width to the + input image's height and width. + seed: random seed. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + image: image which is the same rank as input image. + boxes: boxes which is the same rank as input boxes. + Boxes are in normalized form. + labels: new labels. + + If multiclass_scores, masks, or keypoints is not None, the function also + returns: + + multiclass_scores: rank 2 with shape [num_instances, num_classes] + masks: rank 3 float32 tensor with shape [num_instances, height, width] + containing instance masks. 
+ keypoints: rank 3 float32 tensor with shape + [num_instances, num_keypoints, 2] + """ + crop_result = ssd_random_crop( + image, + boxes, + labels, + label_scores=label_scores, + multiclass_scores=multiclass_scores, + masks=masks, + keypoints=keypoints, + min_object_covered=min_object_covered, + aspect_ratio_range=aspect_ratio_range, + area_range=area_range, + overlap_thresh=overlap_thresh, + random_coef=random_coef, + seed=seed, + preprocess_vars_cache=preprocess_vars_cache) + i = 3 + new_image, new_boxes, new_labels = crop_result[:i] + new_label_scores = None + new_multiclass_scores = None + new_masks = None + new_keypoints = None + if label_scores is not None: + new_label_scores = crop_result[i] + i += 1 + if multiclass_scores is not None: + new_multiclass_scores = crop_result[i] + i += 1 + if masks is not None: + new_masks = crop_result[i] + i += 1 + if keypoints is not None: + new_keypoints = crop_result[i] + + result = random_pad_to_aspect_ratio( + new_image, + new_boxes, + masks=new_masks, + keypoints=new_keypoints, + aspect_ratio=aspect_ratio, + min_padded_size_ratio=min_padded_size_ratio, + max_padded_size_ratio=max_padded_size_ratio, + seed=seed, + preprocess_vars_cache=preprocess_vars_cache) + + result = list(result) + i = 3 + result.insert(2, new_labels) + if new_label_scores is not None: + result.insert(i, new_label_scores) + i += 1 + if multiclass_scores is not None: + result.insert(i, new_multiclass_scores) + result = tuple(result) + + return result + + +def get_default_func_arg_map(include_label_scores=False, + include_multiclass_scores=False, + include_instance_masks=False, + include_keypoints=False): + """Returns the default mapping from a preprocessor function to its args. + + Args: + include_label_scores: If True, preprocessing functions will modify the + label scores, too. + include_multiclass_scores: If True, preprocessing functions will modify the + multiclass scores, too. + include_instance_masks: If True, preprocessing functions will modify the + instance masks, too. + include_keypoints: If True, preprocessing functions will modify the + keypoints, too. + + Returns: + A map from preprocessing functions to the arguments they receive. 
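+ 
+ Example usage (illustrative sketch, assuming tensor_dict holds images, 
+ boxes and instance masks): 
+ func_arg_map = get_default_func_arg_map(include_instance_masks=True) 
+ tensor_dict = preprocess(tensor_dict, [(random_horizontal_flip, {})], 
+ func_arg_map=func_arg_map) 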
+ """ + groundtruth_label_scores = None + if include_label_scores: + groundtruth_label_scores = (fields.InputDataFields.groundtruth_label_scores) + + multiclass_scores = None + if include_multiclass_scores: + multiclass_scores = (fields.InputDataFields.multiclass_scores) + + groundtruth_instance_masks = None + if include_instance_masks: + groundtruth_instance_masks = ( + fields.InputDataFields.groundtruth_instance_masks) + + groundtruth_keypoints = None + if include_keypoints: + groundtruth_keypoints = fields.InputDataFields.groundtruth_keypoints + + prep_func_arg_map = { + normalize_image: (fields.InputDataFields.image,), + random_horizontal_flip: ( + fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes, + groundtruth_instance_masks, + groundtruth_keypoints, + ), + random_vertical_flip: ( + fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes, + groundtruth_instance_masks, + groundtruth_keypoints, + ), + random_rotation90: ( + fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes, + groundtruth_instance_masks, + groundtruth_keypoints, + ), + random_pixel_value_scale: (fields.InputDataFields.image,), + random_image_scale: ( + fields.InputDataFields.image, + groundtruth_instance_masks, + ), + random_rgb_to_gray: (fields.InputDataFields.image,), + random_adjust_brightness: (fields.InputDataFields.image,), + random_adjust_contrast: (fields.InputDataFields.image,), + random_adjust_hue: (fields.InputDataFields.image,), + random_adjust_saturation: (fields.InputDataFields.image,), + random_distort_color: (fields.InputDataFields.image,), + random_jitter_boxes: (fields.InputDataFields.groundtruth_boxes,), + random_crop_image: (fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes, + fields.InputDataFields.groundtruth_classes, + groundtruth_label_scores, multiclass_scores, + groundtruth_instance_masks, groundtruth_keypoints), + random_pad_image: (fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes), + random_crop_pad_image: (fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes, + fields.InputDataFields.groundtruth_classes, + groundtruth_label_scores, + multiclass_scores), + random_crop_to_aspect_ratio: ( + fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes, + fields.InputDataFields.groundtruth_classes, + groundtruth_label_scores, + multiclass_scores, + groundtruth_instance_masks, + groundtruth_keypoints, + ), + random_pad_to_aspect_ratio: ( + fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes, + groundtruth_instance_masks, + groundtruth_keypoints, + ), + random_black_patches: (fields.InputDataFields.image,), + retain_boxes_above_threshold: ( + fields.InputDataFields.groundtruth_boxes, + fields.InputDataFields.groundtruth_classes, + groundtruth_label_scores, + multiclass_scores, + groundtruth_instance_masks, + groundtruth_keypoints, + ), + image_to_float: (fields.InputDataFields.image,), + random_resize_method: (fields.InputDataFields.image,), + resize_to_range: ( + fields.InputDataFields.image, + groundtruth_instance_masks, + ), + resize_to_min_dimension: ( + fields.InputDataFields.image, + groundtruth_instance_masks, + ), + scale_boxes_to_pixel_coordinates: ( + fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes, + groundtruth_keypoints, + ), + resize_image: ( + fields.InputDataFields.image, + groundtruth_instance_masks, + ), + subtract_channel_mean: (fields.InputDataFields.image,), + one_hot_encoding: 
(fields.InputDataFields.groundtruth_image_classes,), + rgb_to_gray: (fields.InputDataFields.image,), + ssd_random_crop: ( + fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes, + fields.InputDataFields.groundtruth_classes, + groundtruth_label_scores, + multiclass_scores, + groundtruth_instance_masks, + groundtruth_keypoints + ), + ssd_random_crop_pad: (fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes, + fields.InputDataFields.groundtruth_classes, + groundtruth_label_scores, + multiclass_scores), + ssd_random_crop_fixed_aspect_ratio: ( + fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes, + fields.InputDataFields.groundtruth_classes, groundtruth_label_scores, + multiclass_scores, groundtruth_instance_masks, groundtruth_keypoints), + ssd_random_crop_pad_fixed_aspect_ratio: ( + fields.InputDataFields.image, + fields.InputDataFields.groundtruth_boxes, + fields.InputDataFields.groundtruth_classes, + groundtruth_label_scores, + multiclass_scores, + groundtruth_instance_masks, + groundtruth_keypoints, + ), + } + + return prep_func_arg_map + + +def preprocess(tensor_dict, + preprocess_options, + func_arg_map=None, + preprocess_vars_cache=None): + """Preprocess images and bounding boxes. + + Various types of preprocessing (to be implemented) based on the + preprocess_options dictionary e.g. "crop image" (affects image and possibly + boxes), "white balance image" (affects only image), etc. If self._options + is None, no preprocessing is done. + + Args: + tensor_dict: dictionary that contains images, boxes, and can contain other + things as well. + images-> rank 4 float32 tensor contains + 1 image -> [1, height, width, 3]. + with pixel values varying between [0, 1] + boxes-> rank 2 float32 tensor containing + the bounding boxes -> [N, 4]. + Boxes are in normalized form meaning + their coordinates vary between [0, 1]. + Each row is in the form + of [ymin, xmin, ymax, xmax]. + preprocess_options: It is a list of tuples, where each tuple contains a + function and a dictionary that contains arguments and + their values. + func_arg_map: mapping from preprocessing functions to arguments that they + expect to receive and return. + preprocess_vars_cache: PreprocessorCache object that records previously + performed augmentations. Updated in-place. If this + function is called multiple times with the same + non-null cache, it will perform deterministically. + + Returns: + tensor_dict: which contains the preprocessed images, bounding boxes, etc. + + Raises: + ValueError: (a) If the functions passed to Preprocess + are not in func_arg_map. + (b) If the arguments that a function needs + do not exist in tensor_dict. 
+ (c) If image in tensor_dict is not rank 4 + """ + if func_arg_map is None: + func_arg_map = get_default_func_arg_map() + + # changes the images to image (rank 4 to rank 3) since the functions + # receive rank 3 tensor for image + if fields.InputDataFields.image in tensor_dict: + images = tensor_dict[fields.InputDataFields.image] + if len(images.get_shape()) != 4: + raise ValueError('images in tensor_dict should be rank 4') + image = tf.squeeze(images, squeeze_dims=[0]) + tensor_dict[fields.InputDataFields.image] = image + + # Preprocess inputs based on preprocess_options + for option in preprocess_options: + func, params = option + if func not in func_arg_map: + raise ValueError('The function %s does not exist in func_arg_map' % + (func.__name__)) + arg_names = func_arg_map[func] + for a in arg_names: + if a is not None and a not in tensor_dict: + raise ValueError('The function %s requires argument %s' % + (func.__name__, a)) + + def get_arg(key): + return tensor_dict[key] if key is not None else None + + args = [get_arg(a) for a in arg_names] + if (preprocess_vars_cache is not None and + 'preprocess_vars_cache' in inspect.getargspec(func).args): + params['preprocess_vars_cache'] = preprocess_vars_cache + results = func(*args, **params) + if not isinstance(results, (list, tuple)): + results = (results,) + # Removes None args since the return values will not contain those. + arg_names = [arg_name for arg_name in arg_names if arg_name is not None] + for res, arg_name in zip(results, arg_names): + tensor_dict[arg_name] = res + + # changes the image to images (rank 3 to rank 4) to be compatible to what + # we received in the first place + if fields.InputDataFields.image in tensor_dict: + image = tensor_dict[fields.InputDataFields.image] + images = tf.expand_dims(image, 0) + tensor_dict[fields.InputDataFields.image] = images + + return tensor_dict diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/preprocessor_cache.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/preprocessor_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..2822a2bab209f37738b0c807765624114973de4d --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/preprocessor_cache.py @@ -0,0 +1,102 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Records previous preprocessing operations and allows them to be repeated. + +Used with object_detection.core.preprocessor. Passing a PreprocessorCache +into individual data augmentation functions or the general preprocess() function +will store all randomly generated variables in the PreprocessorCache. When +a preprocessor function is called multiple times with the same +PreprocessorCache object, that function will perform the same augmentation +on all calls. 
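+
+Example usage (illustrative):
+  cache = preprocessor_cache.PreprocessorCache()
+  # Both calls below make the same random flip decision because they share
+  # the cache.
+  image_a, boxes_a = preprocessor.random_horizontal_flip(
+      image_a, boxes_a, preprocess_vars_cache=cache)
+  image_b, boxes_b = preprocessor.random_horizontal_flip(
+      image_b, boxes_b, preprocess_vars_cache=cache)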
+""" + +from collections import defaultdict + + +class PreprocessorCache(object): + """Dictionary wrapper storing random variables generated during preprocessing. + """ + + # Constant keys representing different preprocessing functions + ROTATION90 = 'rotation90' + HORIZONTAL_FLIP = 'horizontal_flip' + VERTICAL_FLIP = 'vertical_flip' + PIXEL_VALUE_SCALE = 'pixel_value_scale' + IMAGE_SCALE = 'image_scale' + RGB_TO_GRAY = 'rgb_to_gray' + ADJUST_BRIGHTNESS = 'adjust_brightness' + ADJUST_CONTRAST = 'adjust_contrast' + ADJUST_HUE = 'adjust_hue' + ADJUST_SATURATION = 'adjust_saturation' + DISTORT_COLOR = 'distort_color' + STRICT_CROP_IMAGE = 'strict_crop_image' + CROP_IMAGE = 'crop_image' + PAD_IMAGE = 'pad_image' + CROP_TO_ASPECT_RATIO = 'crop_to_aspect_ratio' + RESIZE_METHOD = 'resize_method' + PAD_TO_ASPECT_RATIO = 'pad_to_aspect_ratio' + BLACK_PATCHES = 'black_patches' + ADD_BLACK_PATCH = 'add_black_patch' + SELECTOR = 'selector' + SELECTOR_TUPLES = 'selector_tuples' + SSD_CROP_SELECTOR_ID = 'ssd_crop_selector_id' + SSD_CROP_PAD_SELECTOR_ID = 'ssd_crop_pad_selector_id' + + # 23 permitted function ids + _VALID_FNS = [ROTATION90, HORIZONTAL_FLIP, VERTICAL_FLIP, PIXEL_VALUE_SCALE, + IMAGE_SCALE, RGB_TO_GRAY, ADJUST_BRIGHTNESS, ADJUST_CONTRAST, + ADJUST_HUE, ADJUST_SATURATION, DISTORT_COLOR, STRICT_CROP_IMAGE, + CROP_IMAGE, PAD_IMAGE, CROP_TO_ASPECT_RATIO, RESIZE_METHOD, + PAD_TO_ASPECT_RATIO, BLACK_PATCHES, ADD_BLACK_PATCH, SELECTOR, + SELECTOR_TUPLES, SSD_CROP_SELECTOR_ID, SSD_CROP_PAD_SELECTOR_ID] + + def __init__(self): + self._history = defaultdict(dict) + + def clear(self): + """Resets cache.""" + self._history = {} + + def get(self, function_id, key): + """Gets stored value given a function id and key. + + Args: + function_id: identifier for the preprocessing function used. + key: identifier for the variable stored. + Returns: + value: the corresponding value, expected to be a tensor or + nested structure of tensors. + Raises: + ValueError: if function_id is not one of the 23 valid function ids. + """ + if function_id not in self._VALID_FNS: + raise ValueError('Function id not recognized: %s.' % str(function_id)) + return self._history[function_id].get(key) + + def update(self, function_id, key, value): + """Adds a value to the dictionary. + + Args: + function_id: identifier for the preprocessing function used. + key: identifier for the variable stored. + value: the value to store, expected to be a tensor or nested structure + of tensors. + Raises: + ValueError: if function_id is not one of the 23 valid function ids. + """ + if function_id not in self._VALID_FNS: + raise ValueError('Function id not recognized: %s.' % str(function_id)) + self._history[function_id][key] = value + diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/preprocessor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/preprocessor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..588a3f90cb1ec5aa104bd8519ddd5fb5b30dd3be --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/preprocessor_test.py @@ -0,0 +1,2814 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.core.preprocessor.""" + +import numpy as np +import six + +import tensorflow as tf + +from object_detection.core import preprocessor +from object_detection.core import preprocessor_cache +from object_detection.core import standard_fields as fields + +if six.PY2: + import mock # pylint: disable=g-import-not-at-top +else: + from unittest import mock # pylint: disable=g-import-not-at-top + + +class PreprocessorTest(tf.test.TestCase): + + def createColorfulTestImage(self): + ch255 = tf.fill([1, 100, 200, 1], tf.constant(255, dtype=tf.uint8)) + ch128 = tf.fill([1, 100, 200, 1], tf.constant(128, dtype=tf.uint8)) + ch0 = tf.fill([1, 100, 200, 1], tf.constant(0, dtype=tf.uint8)) + imr = tf.concat([ch255, ch0, ch0], 3) + img = tf.concat([ch255, ch255, ch0], 3) + imb = tf.concat([ch255, ch0, ch255], 3) + imw = tf.concat([ch128, ch128, ch128], 3) + imu = tf.concat([imr, img], 2) + imd = tf.concat([imb, imw], 2) + im = tf.concat([imu, imd], 1) + return im + + def createTestImages(self): + images_r = tf.constant([[[128, 128, 128, 128], [0, 0, 128, 128], + [0, 128, 128, 128], [192, 192, 128, 128]]], + dtype=tf.uint8) + images_r = tf.expand_dims(images_r, 3) + images_g = tf.constant([[[0, 0, 128, 128], [0, 0, 128, 128], + [0, 128, 192, 192], [192, 192, 128, 192]]], + dtype=tf.uint8) + images_g = tf.expand_dims(images_g, 3) + images_b = tf.constant([[[128, 128, 192, 0], [0, 0, 128, 192], + [0, 128, 128, 0], [192, 192, 192, 128]]], + dtype=tf.uint8) + images_b = tf.expand_dims(images_b, 3) + images = tf.concat([images_r, images_g, images_b], 3) + return images + + def createEmptyTestBoxes(self): + boxes = tf.constant([[]], dtype=tf.float32) + return boxes + + def createTestBoxes(self): + boxes = tf.constant( + [[0.0, 0.25, 0.75, 1.0], [0.25, 0.5, 0.75, 1.0]], dtype=tf.float32) + return boxes + + def createTestLabelScores(self): + return tf.constant([1.0, 0.5], dtype=tf.float32) + + def createTestLabelScoresWithMissingScore(self): + return tf.constant([0.5, np.nan], dtype=tf.float32) + + def createTestMasks(self): + mask = np.array([ + [[255.0, 0.0, 0.0], + [255.0, 0.0, 0.0], + [255.0, 0.0, 0.0]], + [[255.0, 255.0, 0.0], + [255.0, 255.0, 0.0], + [255.0, 255.0, 0.0]]]) + return tf.constant(mask, dtype=tf.float32) + + def createTestKeypoints(self): + keypoints = np.array([ + [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]], + [[0.4, 0.4], [0.5, 0.5], [0.6, 0.6]], + ]) + return tf.constant(keypoints, dtype=tf.float32) + + def createTestKeypointsInsideCrop(self): + keypoints = np.array([ + [[0.4, 0.4], [0.5, 0.5], [0.6, 0.6]], + [[0.4, 0.4], [0.5, 0.5], [0.6, 0.6]], + ]) + return tf.constant(keypoints, dtype=tf.float32) + + def createTestKeypointsOutsideCrop(self): + keypoints = np.array([ + [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]], + [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]], + ]) + return tf.constant(keypoints, dtype=tf.float32) + + def createKeypointFlipPermutation(self): + return np.array([0, 2, 1], dtype=np.int32) + + def createTestLabels(self): + labels = tf.constant([1, 2], dtype=tf.int32) + return 
labels + + def createTestBoxesOutOfImage(self): + boxes = tf.constant( + [[-0.1, 0.25, 0.75, 1], [0.25, 0.5, 0.75, 1.1]], dtype=tf.float32) + return boxes + + def createTestMultiClassScores(self): + return tf.constant([[1.0, 0.0], [0.5, 0.5]], dtype=tf.float32) + + def expectedImagesAfterNormalization(self): + images_r = tf.constant([[[0, 0, 0, 0], [-1, -1, 0, 0], + [-1, 0, 0, 0], [0.5, 0.5, 0, 0]]], + dtype=tf.float32) + images_r = tf.expand_dims(images_r, 3) + images_g = tf.constant([[[-1, -1, 0, 0], [-1, -1, 0, 0], + [-1, 0, 0.5, 0.5], [0.5, 0.5, 0, 0.5]]], + dtype=tf.float32) + images_g = tf.expand_dims(images_g, 3) + images_b = tf.constant([[[0, 0, 0.5, -1], [-1, -1, 0, 0.5], + [-1, 0, 0, -1], [0.5, 0.5, 0.5, 0]]], + dtype=tf.float32) + images_b = tf.expand_dims(images_b, 3) + images = tf.concat([images_r, images_g, images_b], 3) + return images + + def expectedMaxImageAfterColorScale(self): + images_r = tf.constant([[[0.1, 0.1, 0.1, 0.1], [-0.9, -0.9, 0.1, 0.1], + [-0.9, 0.1, 0.1, 0.1], [0.6, 0.6, 0.1, 0.1]]], + dtype=tf.float32) + images_r = tf.expand_dims(images_r, 3) + images_g = tf.constant([[[-0.9, -0.9, 0.1, 0.1], [-0.9, -0.9, 0.1, 0.1], + [-0.9, 0.1, 0.6, 0.6], [0.6, 0.6, 0.1, 0.6]]], + dtype=tf.float32) + images_g = tf.expand_dims(images_g, 3) + images_b = tf.constant([[[0.1, 0.1, 0.6, -0.9], [-0.9, -0.9, 0.1, 0.6], + [-0.9, 0.1, 0.1, -0.9], [0.6, 0.6, 0.6, 0.1]]], + dtype=tf.float32) + images_b = tf.expand_dims(images_b, 3) + images = tf.concat([images_r, images_g, images_b], 3) + return images + + def expectedMinImageAfterColorScale(self): + images_r = tf.constant([[[-0.1, -0.1, -0.1, -0.1], [-1, -1, -0.1, -0.1], + [-1, -0.1, -0.1, -0.1], [0.4, 0.4, -0.1, -0.1]]], + dtype=tf.float32) + images_r = tf.expand_dims(images_r, 3) + images_g = tf.constant([[[-1, -1, -0.1, -0.1], [-1, -1, -0.1, -0.1], + [-1, -0.1, 0.4, 0.4], [0.4, 0.4, -0.1, 0.4]]], + dtype=tf.float32) + images_g = tf.expand_dims(images_g, 3) + images_b = tf.constant([[[-0.1, -0.1, 0.4, -1], [-1, -1, -0.1, 0.4], + [-1, -0.1, -0.1, -1], [0.4, 0.4, 0.4, -0.1]]], + dtype=tf.float32) + images_b = tf.expand_dims(images_b, 3) + images = tf.concat([images_r, images_g, images_b], 3) + return images + + def expectedImagesAfterLeftRightFlip(self): + images_r = tf.constant([[[0, 0, 0, 0], [0, 0, -1, -1], + [0, 0, 0, -1], [0, 0, 0.5, 0.5]]], + dtype=tf.float32) + images_r = tf.expand_dims(images_r, 3) + images_g = tf.constant([[[0, 0, -1, -1], [0, 0, -1, -1], + [0.5, 0.5, 0, -1], [0.5, 0, 0.5, 0.5]]], + dtype=tf.float32) + images_g = tf.expand_dims(images_g, 3) + images_b = tf.constant([[[-1, 0.5, 0, 0], [0.5, 0, -1, -1], + [-1, 0, 0, -1], [0, 0.5, 0.5, 0.5]]], + dtype=tf.float32) + images_b = tf.expand_dims(images_b, 3) + images = tf.concat([images_r, images_g, images_b], 3) + return images + + def expectedImagesAfterUpDownFlip(self): + images_r = tf.constant([[[0.5, 0.5, 0, 0], [-1, 0, 0, 0], + [-1, -1, 0, 0], [0, 0, 0, 0]]], + dtype=tf.float32) + images_r = tf.expand_dims(images_r, 3) + images_g = tf.constant([[[0.5, 0.5, 0, 0.5], [-1, 0, 0.5, 0.5], + [-1, -1, 0, 0], [-1, -1, 0, 0]]], + dtype=tf.float32) + images_g = tf.expand_dims(images_g, 3) + images_b = tf.constant([[[0.5, 0.5, 0.5, 0], [-1, 0, 0, -1], + [-1, -1, 0, 0.5], [0, 0, 0.5, -1]]], + dtype=tf.float32) + images_b = tf.expand_dims(images_b, 3) + images = tf.concat([images_r, images_g, images_b], 3) + return images + + def expectedImagesAfterRot90(self): + images_r = tf.constant([[[0, 0, 0, 0], [0, 0, 0, 0], + [0, -1, 0, 0.5], [0, -1, -1, 0.5]]], + 
dtype=tf.float32) + images_r = tf.expand_dims(images_r, 3) + images_g = tf.constant([[[0, 0, 0.5, 0.5], [0, 0, 0.5, 0], + [-1, -1, 0, 0.5], [-1, -1, -1, 0.5]]], + dtype=tf.float32) + images_g = tf.expand_dims(images_g, 3) + images_b = tf.constant([[[-1, 0.5, -1, 0], [0.5, 0, 0, 0.5], + [0, -1, 0, 0.5], [0, -1, -1, 0.5]]], + dtype=tf.float32) + images_b = tf.expand_dims(images_b, 3) + images = tf.concat([images_r, images_g, images_b], 3) + return images + + def expectedBoxesAfterLeftRightFlip(self): + boxes = tf.constant([[0.0, 0.0, 0.75, 0.75], [0.25, 0.0, 0.75, 0.5]], + dtype=tf.float32) + return boxes + + def expectedBoxesAfterUpDownFlip(self): + boxes = tf.constant([[0.25, 0.25, 1.0, 1.0], [0.25, 0.5, 0.75, 1.0]], + dtype=tf.float32) + return boxes + + def expectedBoxesAfterRot90(self): + boxes = tf.constant( + [[0.0, 0.0, 0.75, 0.75], [0.0, 0.25, 0.5, 0.75]], dtype=tf.float32) + return boxes + + def expectedMasksAfterLeftRightFlip(self): + mask = np.array([ + [[0.0, 0.0, 255.0], + [0.0, 0.0, 255.0], + [0.0, 0.0, 255.0]], + [[0.0, 255.0, 255.0], + [0.0, 255.0, 255.0], + [0.0, 255.0, 255.0]]]) + return tf.constant(mask, dtype=tf.float32) + + def expectedMasksAfterUpDownFlip(self): + mask = np.array([ + [[255.0, 0.0, 0.0], + [255.0, 0.0, 0.0], + [255.0, 0.0, 0.0]], + [[255.0, 255.0, 0.0], + [255.0, 255.0, 0.0], + [255.0, 255.0, 0.0]]]) + return tf.constant(mask, dtype=tf.float32) + + def expectedMasksAfterRot90(self): + mask = np.array([ + [[0.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + [255.0, 255.0, 255.0]], + [[0.0, 0.0, 0.0], + [255.0, 255.0, 255.0], + [255.0, 255.0, 255.0]]]) + return tf.constant(mask, dtype=tf.float32) + + def expectedLabelScoresAfterThresholding(self): + return tf.constant([1.0], dtype=tf.float32) + + def expectedBoxesAfterThresholding(self): + return tf.constant([[0.0, 0.25, 0.75, 1.0]], dtype=tf.float32) + + def expectedLabelsAfterThresholding(self): + return tf.constant([1], dtype=tf.float32) + + def expectedMultiClassScoresAfterThresholding(self): + return tf.constant([[1.0, 0.0]], dtype=tf.float32) + + def expectedMasksAfterThresholding(self): + mask = np.array([ + [[255.0, 0.0, 0.0], + [255.0, 0.0, 0.0], + [255.0, 0.0, 0.0]]]) + return tf.constant(mask, dtype=tf.float32) + + def expectedKeypointsAfterThresholding(self): + keypoints = np.array([ + [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]] + ]) + return tf.constant(keypoints, dtype=tf.float32) + + def expectedLabelScoresAfterThresholdingWithMissingScore(self): + return tf.constant([np.nan], dtype=tf.float32) + + def expectedBoxesAfterThresholdingWithMissingScore(self): + return tf.constant([[0.25, 0.5, 0.75, 1]], dtype=tf.float32) + + def expectedLabelsAfterThresholdingWithMissingScore(self): + return tf.constant([2], dtype=tf.float32) + + def testRgbToGrayscale(self): + images = self.createTestImages() + grayscale_images = preprocessor._rgb_to_grayscale(images) + expected_images = tf.image.rgb_to_grayscale(images) + with self.test_session() as sess: + (grayscale_images, expected_images) = sess.run( + [grayscale_images, expected_images]) + self.assertAllEqual(expected_images, grayscale_images) + + def testNormalizeImage(self): + preprocess_options = [(preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 256, + 'target_minval': -1, + 'target_maxval': 1 + })] + images = self.createTestImages() + tensor_dict = {fields.InputDataFields.image: images} + tensor_dict = preprocessor.preprocess(tensor_dict, preprocess_options) + images = tensor_dict[fields.InputDataFields.image] + images_expected = 
self.expectedImagesAfterNormalization() + + with self.test_session() as sess: + (images_, images_expected_) = sess.run( + [images, images_expected]) + images_shape_ = images_.shape + images_expected_shape_ = images_expected_.shape + expected_shape = [1, 4, 4, 3] + self.assertAllEqual(images_expected_shape_, images_shape_) + self.assertAllEqual(images_shape_, expected_shape) + self.assertAllClose(images_, images_expected_) + + def testRetainBoxesAboveThreshold(self): + boxes = self.createTestBoxes() + labels = self.createTestLabels() + label_scores = self.createTestLabelScores() + (retained_boxes, retained_labels, + retained_label_scores) = preprocessor.retain_boxes_above_threshold( + boxes, labels, label_scores, threshold=0.6) + with self.test_session() as sess: + (retained_boxes_, retained_labels_, retained_label_scores_, + expected_retained_boxes_, expected_retained_labels_, + expected_retained_label_scores_) = sess.run([ + retained_boxes, retained_labels, retained_label_scores, + self.expectedBoxesAfterThresholding(), + self.expectedLabelsAfterThresholding(), + self.expectedLabelScoresAfterThresholding()]) + self.assertAllClose( + retained_boxes_, expected_retained_boxes_) + self.assertAllClose( + retained_labels_, expected_retained_labels_) + self.assertAllClose( + retained_label_scores_, expected_retained_label_scores_) + + def testRetainBoxesAboveThresholdWithMultiClassScores(self): + boxes = self.createTestBoxes() + labels = self.createTestLabels() + label_scores = self.createTestLabelScores() + multiclass_scores = self.createTestMultiClassScores() + (_, _, _, + retained_multiclass_scores) = preprocessor.retain_boxes_above_threshold( + boxes, + labels, + label_scores, + multiclass_scores=multiclass_scores, + threshold=0.6) + with self.test_session() as sess: + (retained_multiclass_scores_, + expected_retained_multiclass_scores_) = sess.run([ + retained_multiclass_scores, + self.expectedMultiClassScoresAfterThresholding() + ]) + + self.assertAllClose(retained_multiclass_scores_, + expected_retained_multiclass_scores_) + + def testRetainBoxesAboveThresholdWithMasks(self): + boxes = self.createTestBoxes() + labels = self.createTestLabels() + label_scores = self.createTestLabelScores() + masks = self.createTestMasks() + _, _, _, retained_masks = preprocessor.retain_boxes_above_threshold( + boxes, labels, label_scores, masks, threshold=0.6) + with self.test_session() as sess: + retained_masks_, expected_retained_masks_ = sess.run([ + retained_masks, self.expectedMasksAfterThresholding()]) + + self.assertAllClose( + retained_masks_, expected_retained_masks_) + + def testRetainBoxesAboveThresholdWithKeypoints(self): + boxes = self.createTestBoxes() + labels = self.createTestLabels() + label_scores = self.createTestLabelScores() + keypoints = self.createTestKeypoints() + (_, _, _, retained_keypoints) = preprocessor.retain_boxes_above_threshold( + boxes, labels, label_scores, keypoints=keypoints, threshold=0.6) + with self.test_session() as sess: + (retained_keypoints_, + expected_retained_keypoints_) = sess.run([ + retained_keypoints, + self.expectedKeypointsAfterThresholding()]) + + self.assertAllClose( + retained_keypoints_, expected_retained_keypoints_) + + def testRetainBoxesAboveThresholdWithMissingScore(self): + boxes = self.createTestBoxes() + labels = self.createTestLabels() + label_scores = self.createTestLabelScoresWithMissingScore() + (retained_boxes, retained_labels, + retained_label_scores) = preprocessor.retain_boxes_above_threshold( + boxes, labels, label_scores, 
threshold=0.6) + with self.test_session() as sess: + (retained_boxes_, retained_labels_, retained_label_scores_, + expected_retained_boxes_, expected_retained_labels_, + expected_retained_label_scores_) = sess.run([ + retained_boxes, retained_labels, retained_label_scores, + self.expectedBoxesAfterThresholdingWithMissingScore(), + self.expectedLabelsAfterThresholdingWithMissingScore(), + self.expectedLabelScoresAfterThresholdingWithMissingScore()]) + self.assertAllClose( + retained_boxes_, expected_retained_boxes_) + self.assertAllClose( + retained_labels_, expected_retained_labels_) + self.assertAllClose( + retained_label_scores_, expected_retained_label_scores_) + + def testFlipBoxesLeftRight(self): + boxes = self.createTestBoxes() + flipped_boxes = preprocessor._flip_boxes_left_right(boxes) + expected_boxes = self.expectedBoxesAfterLeftRightFlip() + with self.test_session() as sess: + flipped_boxes, expected_boxes = sess.run([flipped_boxes, expected_boxes]) + self.assertAllEqual(flipped_boxes.flatten(), expected_boxes.flatten()) + + def testFlipBoxesUpDown(self): + boxes = self.createTestBoxes() + flipped_boxes = preprocessor._flip_boxes_up_down(boxes) + expected_boxes = self.expectedBoxesAfterUpDownFlip() + with self.test_session() as sess: + flipped_boxes, expected_boxes = sess.run([flipped_boxes, expected_boxes]) + self.assertAllEqual(flipped_boxes.flatten(), expected_boxes.flatten()) + + def testRot90Boxes(self): + boxes = self.createTestBoxes() + rotated_boxes = preprocessor._rot90_boxes(boxes) + expected_boxes = self.expectedBoxesAfterRot90() + with self.test_session() as sess: + rotated_boxes, expected_boxes = sess.run([rotated_boxes, expected_boxes]) + self.assertAllEqual(rotated_boxes.flatten(), expected_boxes.flatten()) + + def testFlipMasksLeftRight(self): + test_mask = self.createTestMasks() + flipped_mask = preprocessor._flip_masks_left_right(test_mask) + expected_mask = self.expectedMasksAfterLeftRightFlip() + with self.test_session() as sess: + flipped_mask, expected_mask = sess.run([flipped_mask, expected_mask]) + self.assertAllEqual(flipped_mask.flatten(), expected_mask.flatten()) + + def testFlipMasksUpDown(self): + test_mask = self.createTestMasks() + flipped_mask = preprocessor._flip_masks_up_down(test_mask) + expected_mask = self.expectedMasksAfterUpDownFlip() + with self.test_session() as sess: + flipped_mask, expected_mask = sess.run([flipped_mask, expected_mask]) + self.assertAllEqual(flipped_mask.flatten(), expected_mask.flatten()) + + def testRot90Masks(self): + test_mask = self.createTestMasks() + rotated_mask = preprocessor._rot90_masks(test_mask) + expected_mask = self.expectedMasksAfterRot90() + with self.test_session() as sess: + rotated_mask, expected_mask = sess.run([rotated_mask, expected_mask]) + self.assertAllEqual(rotated_mask.flatten(), expected_mask.flatten()) + + def _testPreprocessorCache(self, + preprocess_options, + test_boxes=False, + test_masks=False, + test_keypoints=False, + num_runs=4): + cache = preprocessor_cache.PreprocessorCache() + images = self.createTestImages() + boxes = self.createTestBoxes() + classes = self.createTestLabels() + masks = self.createTestMasks() + keypoints = self.createTestKeypoints() + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_instance_masks=test_masks, include_keypoints=test_keypoints) + out = [] + for i in range(num_runs): + tensor_dict = { + fields.InputDataFields.image: images, + } + num_outputs = 1 + if test_boxes: + tensor_dict[fields.InputDataFields.groundtruth_boxes] = boxes + 
tensor_dict[fields.InputDataFields.groundtruth_classes] = classes + num_outputs += 1 + if test_masks: + tensor_dict[fields.InputDataFields.groundtruth_instance_masks] = masks + num_outputs += 1 + if test_keypoints: + tensor_dict[fields.InputDataFields.groundtruth_keypoints] = keypoints + num_outputs += 1 + out.append(preprocessor.preprocess( + tensor_dict, preprocess_options, preprocessor_arg_map, cache)) + + with self.test_session() as sess: + to_run = [] + for i in range(num_runs): + to_run.append(out[i][fields.InputDataFields.image]) + if test_boxes: + to_run.append(out[i][fields.InputDataFields.groundtruth_boxes]) + if test_masks: + to_run.append( + out[i][fields.InputDataFields.groundtruth_instance_masks]) + if test_keypoints: + to_run.append(out[i][fields.InputDataFields.groundtruth_keypoints]) + + out_array = sess.run(to_run) + for i in range(num_outputs, len(out_array)): + self.assertAllClose(out_array[i], out_array[i - num_outputs]) + + def testRandomHorizontalFlip(self): + preprocess_options = [(preprocessor.random_horizontal_flip, {})] + images = self.expectedImagesAfterNormalization() + boxes = self.createTestBoxes() + tensor_dict = {fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes} + images_expected1 = self.expectedImagesAfterLeftRightFlip() + boxes_expected1 = self.expectedBoxesAfterLeftRightFlip() + images_expected2 = images + boxes_expected2 = boxes + tensor_dict = preprocessor.preprocess(tensor_dict, preprocess_options) + images = tensor_dict[fields.InputDataFields.image] + boxes = tensor_dict[fields.InputDataFields.groundtruth_boxes] + + boxes_diff1 = tf.squared_difference(boxes, boxes_expected1) + boxes_diff2 = tf.squared_difference(boxes, boxes_expected2) + boxes_diff = tf.multiply(boxes_diff1, boxes_diff2) + boxes_diff_expected = tf.zeros_like(boxes_diff) + + images_diff1 = tf.squared_difference(images, images_expected1) + images_diff2 = tf.squared_difference(images, images_expected2) + images_diff = tf.multiply(images_diff1, images_diff2) + images_diff_expected = tf.zeros_like(images_diff) + + with self.test_session() as sess: + (images_diff_, images_diff_expected_, boxes_diff_, + boxes_diff_expected_) = sess.run([images_diff, images_diff_expected, + boxes_diff, boxes_diff_expected]) + self.assertAllClose(boxes_diff_, boxes_diff_expected_) + self.assertAllClose(images_diff_, images_diff_expected_) + + def testRandomHorizontalFlipWithEmptyBoxes(self): + preprocess_options = [(preprocessor.random_horizontal_flip, {})] + images = self.expectedImagesAfterNormalization() + boxes = self.createEmptyTestBoxes() + tensor_dict = {fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes} + images_expected1 = self.expectedImagesAfterLeftRightFlip() + boxes_expected = self.createEmptyTestBoxes() + images_expected2 = images + tensor_dict = preprocessor.preprocess(tensor_dict, preprocess_options) + images = tensor_dict[fields.InputDataFields.image] + boxes = tensor_dict[fields.InputDataFields.groundtruth_boxes] + + images_diff1 = tf.squared_difference(images, images_expected1) + images_diff2 = tf.squared_difference(images, images_expected2) + images_diff = tf.multiply(images_diff1, images_diff2) + images_diff_expected = tf.zeros_like(images_diff) + + with self.test_session() as sess: + (images_diff_, images_diff_expected_, boxes_, + boxes_expected_) = sess.run([images_diff, images_diff_expected, boxes, + boxes_expected]) + self.assertAllClose(boxes_, boxes_expected_) + self.assertAllClose(images_diff_, 
images_diff_expected_) + + def testRandomHorizontalFlipWithCache(self): + keypoint_flip_permutation = self.createKeypointFlipPermutation() + preprocess_options = [ + (preprocessor.random_horizontal_flip, + {'keypoint_flip_permutation': keypoint_flip_permutation})] + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=True, + test_keypoints=True) + + def testRunRandomHorizontalFlipWithMaskAndKeypoints(self): + preprocess_options = [(preprocessor.random_horizontal_flip, {})] + image_height = 3 + image_width = 3 + images = tf.random_uniform([1, image_height, image_width, 3]) + boxes = self.createTestBoxes() + masks = self.createTestMasks() + keypoints = self.createTestKeypoints() + keypoint_flip_permutation = self.createKeypointFlipPermutation() + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_instance_masks: masks, + fields.InputDataFields.groundtruth_keypoints: keypoints + } + preprocess_options = [ + (preprocessor.random_horizontal_flip, + {'keypoint_flip_permutation': keypoint_flip_permutation})] + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_instance_masks=True, include_keypoints=True) + tensor_dict = preprocessor.preprocess( + tensor_dict, preprocess_options, func_arg_map=preprocessor_arg_map) + boxes = tensor_dict[fields.InputDataFields.groundtruth_boxes] + masks = tensor_dict[fields.InputDataFields.groundtruth_instance_masks] + keypoints = tensor_dict[fields.InputDataFields.groundtruth_keypoints] + with self.test_session() as sess: + boxes, masks, keypoints = sess.run([boxes, masks, keypoints]) + self.assertTrue(boxes is not None) + self.assertTrue(masks is not None) + self.assertTrue(keypoints is not None) + + def testRandomVerticalFlip(self): + preprocess_options = [(preprocessor.random_vertical_flip, {})] + images = self.expectedImagesAfterNormalization() + boxes = self.createTestBoxes() + tensor_dict = {fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes} + images_expected1 = self.expectedImagesAfterUpDownFlip() + boxes_expected1 = self.expectedBoxesAfterUpDownFlip() + images_expected2 = images + boxes_expected2 = boxes + tensor_dict = preprocessor.preprocess(tensor_dict, preprocess_options) + images = tensor_dict[fields.InputDataFields.image] + boxes = tensor_dict[fields.InputDataFields.groundtruth_boxes] + + boxes_diff1 = tf.squared_difference(boxes, boxes_expected1) + boxes_diff2 = tf.squared_difference(boxes, boxes_expected2) + boxes_diff = tf.multiply(boxes_diff1, boxes_diff2) + boxes_diff_expected = tf.zeros_like(boxes_diff) + + images_diff1 = tf.squared_difference(images, images_expected1) + images_diff2 = tf.squared_difference(images, images_expected2) + images_diff = tf.multiply(images_diff1, images_diff2) + images_diff_expected = tf.zeros_like(images_diff) + + with self.test_session() as sess: + (images_diff_, images_diff_expected_, boxes_diff_, + boxes_diff_expected_) = sess.run([images_diff, images_diff_expected, + boxes_diff, boxes_diff_expected]) + self.assertAllClose(boxes_diff_, boxes_diff_expected_) + self.assertAllClose(images_diff_, images_diff_expected_) + + def testRandomVerticalFlipWithEmptyBoxes(self): + preprocess_options = [(preprocessor.random_vertical_flip, {})] + images = self.expectedImagesAfterNormalization() + boxes = self.createEmptyTestBoxes() + tensor_dict = {fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes} + 
images_expected1 = self.expectedImagesAfterUpDownFlip() + boxes_expected = self.createEmptyTestBoxes() + images_expected2 = images + tensor_dict = preprocessor.preprocess(tensor_dict, preprocess_options) + images = tensor_dict[fields.InputDataFields.image] + boxes = tensor_dict[fields.InputDataFields.groundtruth_boxes] + + images_diff1 = tf.squared_difference(images, images_expected1) + images_diff2 = tf.squared_difference(images, images_expected2) + images_diff = tf.multiply(images_diff1, images_diff2) + images_diff_expected = tf.zeros_like(images_diff) + + with self.test_session() as sess: + (images_diff_, images_diff_expected_, boxes_, + boxes_expected_) = sess.run([images_diff, images_diff_expected, boxes, + boxes_expected]) + self.assertAllClose(boxes_, boxes_expected_) + self.assertAllClose(images_diff_, images_diff_expected_) + + def testRandomVerticalFlipWithCache(self): + keypoint_flip_permutation = self.createKeypointFlipPermutation() + preprocess_options = [ + (preprocessor.random_vertical_flip, + {'keypoint_flip_permutation': keypoint_flip_permutation})] + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=True, + test_keypoints=True) + + def testRunRandomVerticalFlipWithMaskAndKeypoints(self): + preprocess_options = [(preprocessor.random_vertical_flip, {})] + image_height = 3 + image_width = 3 + images = tf.random_uniform([1, image_height, image_width, 3]) + boxes = self.createTestBoxes() + masks = self.createTestMasks() + keypoints = self.createTestKeypoints() + keypoint_flip_permutation = self.createKeypointFlipPermutation() + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_instance_masks: masks, + fields.InputDataFields.groundtruth_keypoints: keypoints + } + preprocess_options = [ + (preprocessor.random_vertical_flip, + {'keypoint_flip_permutation': keypoint_flip_permutation})] + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_instance_masks=True, include_keypoints=True) + tensor_dict = preprocessor.preprocess( + tensor_dict, preprocess_options, func_arg_map=preprocessor_arg_map) + boxes = tensor_dict[fields.InputDataFields.groundtruth_boxes] + masks = tensor_dict[fields.InputDataFields.groundtruth_instance_masks] + keypoints = tensor_dict[fields.InputDataFields.groundtruth_keypoints] + with self.test_session() as sess: + boxes, masks, keypoints = sess.run([boxes, masks, keypoints]) + self.assertTrue(boxes is not None) + self.assertTrue(masks is not None) + self.assertTrue(keypoints is not None) + + def testRandomRotation90(self): + preprocess_options = [(preprocessor.random_rotation90, {})] + images = self.expectedImagesAfterNormalization() + boxes = self.createTestBoxes() + tensor_dict = {fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes} + images_expected1 = self.expectedImagesAfterRot90() + boxes_expected1 = self.expectedBoxesAfterRot90() + images_expected2 = images + boxes_expected2 = boxes + tensor_dict = preprocessor.preprocess(tensor_dict, preprocess_options) + images = tensor_dict[fields.InputDataFields.image] + boxes = tensor_dict[fields.InputDataFields.groundtruth_boxes] + + boxes_diff1 = tf.squared_difference(boxes, boxes_expected1) + boxes_diff2 = tf.squared_difference(boxes, boxes_expected2) + boxes_diff = tf.multiply(boxes_diff1, boxes_diff2) + boxes_diff_expected = tf.zeros_like(boxes_diff) + + images_diff1 = tf.squared_difference(images, images_expected1) + images_diff2 = 
tf.squared_difference(images, images_expected2) + images_diff = tf.multiply(images_diff1, images_diff2) + images_diff_expected = tf.zeros_like(images_diff) + + with self.test_session() as sess: + (images_diff_, images_diff_expected_, boxes_diff_, + boxes_diff_expected_) = sess.run([images_diff, images_diff_expected, + boxes_diff, boxes_diff_expected]) + self.assertAllClose(boxes_diff_, boxes_diff_expected_) + self.assertAllClose(images_diff_, images_diff_expected_) + + def testRandomRotation90WithEmptyBoxes(self): + preprocess_options = [(preprocessor.random_rotation90, {})] + images = self.expectedImagesAfterNormalization() + boxes = self.createEmptyTestBoxes() + tensor_dict = {fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes} + images_expected1 = self.expectedImagesAfterRot90() + boxes_expected = self.createEmptyTestBoxes() + images_expected2 = images + tensor_dict = preprocessor.preprocess(tensor_dict, preprocess_options) + images = tensor_dict[fields.InputDataFields.image] + boxes = tensor_dict[fields.InputDataFields.groundtruth_boxes] + + images_diff1 = tf.squared_difference(images, images_expected1) + images_diff2 = tf.squared_difference(images, images_expected2) + images_diff = tf.multiply(images_diff1, images_diff2) + images_diff_expected = tf.zeros_like(images_diff) + + with self.test_session() as sess: + (images_diff_, images_diff_expected_, boxes_, + boxes_expected_) = sess.run([images_diff, images_diff_expected, boxes, + boxes_expected]) + self.assertAllClose(boxes_, boxes_expected_) + self.assertAllClose(images_diff_, images_diff_expected_) + + def testRandomRotation90WithCache(self): + preprocess_options = [(preprocessor.random_rotation90, {})] + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=True, + test_keypoints=True) + + def testRunRandomRotation90WithMaskAndKeypoints(self): + preprocess_options = [(preprocessor.random_rotation90, {})] + image_height = 3 + image_width = 3 + images = tf.random_uniform([1, image_height, image_width, 3]) + boxes = self.createTestBoxes() + masks = self.createTestMasks() + keypoints = self.createTestKeypoints() + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_instance_masks: masks, + fields.InputDataFields.groundtruth_keypoints: keypoints + } + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_instance_masks=True, include_keypoints=True) + tensor_dict = preprocessor.preprocess( + tensor_dict, preprocess_options, func_arg_map=preprocessor_arg_map) + boxes = tensor_dict[fields.InputDataFields.groundtruth_boxes] + masks = tensor_dict[fields.InputDataFields.groundtruth_instance_masks] + keypoints = tensor_dict[fields.InputDataFields.groundtruth_keypoints] + with self.test_session() as sess: + boxes, masks, keypoints = sess.run([boxes, masks, keypoints]) + self.assertTrue(boxes is not None) + self.assertTrue(masks is not None) + self.assertTrue(keypoints is not None) + + def testRandomPixelValueScale(self): + preprocessing_options = [] + preprocessing_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocessing_options.append((preprocessor.random_pixel_value_scale, {})) + images = self.createTestImages() + tensor_dict = {fields.InputDataFields.image: images} + tensor_dict = preprocessor.preprocess(tensor_dict, preprocessing_options) + images_min = 
tf.to_float(images) * 0.9 / 255.0 + images_max = tf.to_float(images) * 1.1 / 255.0 + images = tensor_dict[fields.InputDataFields.image] + values_greater = tf.greater_equal(images, images_min) + values_less = tf.less_equal(images, images_max) + values_true = tf.fill([1, 4, 4, 3], True) + with self.test_session() as sess: + (values_greater_, values_less_, values_true_) = sess.run( + [values_greater, values_less, values_true]) + self.assertAllClose(values_greater_, values_true_) + self.assertAllClose(values_less_, values_true_) + + def testRandomPixelValueScaleWithCache(self): + preprocess_options = [] + preprocess_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocess_options.append((preprocessor.random_pixel_value_scale, {})) + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=False, + test_keypoints=False) + + def testRandomImageScale(self): + preprocess_options = [(preprocessor.random_image_scale, {})] + images_original = self.createTestImages() + tensor_dict = {fields.InputDataFields.image: images_original} + tensor_dict = preprocessor.preprocess(tensor_dict, preprocess_options) + images_scaled = tensor_dict[fields.InputDataFields.image] + images_original_shape = tf.shape(images_original) + images_scaled_shape = tf.shape(images_scaled) + with self.test_session() as sess: + (images_original_shape_, images_scaled_shape_) = sess.run( + [images_original_shape, images_scaled_shape]) + self.assertTrue( + images_original_shape_[1] * 0.5 <= images_scaled_shape_[1]) + self.assertTrue( + images_original_shape_[1] * 2.0 >= images_scaled_shape_[1]) + self.assertTrue( + images_original_shape_[2] * 0.5 <= images_scaled_shape_[2]) + self.assertTrue( + images_original_shape_[2] * 2.0 >= images_scaled_shape_[2]) + + def testRandomImageScaleWithCache(self): + preprocess_options = [(preprocessor.random_image_scale, {})] + self._testPreprocessorCache(preprocess_options, + test_boxes=False, + test_masks=False, + test_keypoints=False) + + def testRandomRGBtoGray(self): + preprocess_options = [(preprocessor.random_rgb_to_gray, {})] + images_original = self.createTestImages() + tensor_dict = {fields.InputDataFields.image: images_original} + tensor_dict = preprocessor.preprocess(tensor_dict, preprocess_options) + images_gray = tensor_dict[fields.InputDataFields.image] + images_gray_r, images_gray_g, images_gray_b = tf.split( + value=images_gray, num_or_size_splits=3, axis=3) + images_r, images_g, images_b = tf.split( + value=images_original, num_or_size_splits=3, axis=3) + images_r_diff1 = tf.squared_difference(tf.to_float(images_r), + tf.to_float(images_gray_r)) + images_r_diff2 = tf.squared_difference(tf.to_float(images_gray_r), + tf.to_float(images_gray_g)) + images_r_diff = tf.multiply(images_r_diff1, images_r_diff2) + images_g_diff1 = tf.squared_difference(tf.to_float(images_g), + tf.to_float(images_gray_g)) + images_g_diff2 = tf.squared_difference(tf.to_float(images_gray_g), + tf.to_float(images_gray_b)) + images_g_diff = tf.multiply(images_g_diff1, images_g_diff2) + images_b_diff1 = tf.squared_difference(tf.to_float(images_b), + tf.to_float(images_gray_b)) + images_b_diff2 = tf.squared_difference(tf.to_float(images_gray_b), + tf.to_float(images_gray_r)) + images_b_diff = tf.multiply(images_b_diff1, images_b_diff2) + image_zero1 = tf.constant(0, dtype=tf.float32, shape=[1, 4, 4, 1]) + with self.test_session() as sess: + (images_r_diff_, images_g_diff_, images_b_diff_, 
image_zero1_) = sess.run( + [images_r_diff, images_g_diff, images_b_diff, image_zero1]) + self.assertAllClose(images_r_diff_, image_zero1_) + self.assertAllClose(images_g_diff_, image_zero1_) + self.assertAllClose(images_b_diff_, image_zero1_) + + def testRandomRGBtoGrayWithCache(self): + preprocess_options = [( + preprocessor.random_rgb_to_gray, {'probability': 0.5})] + self._testPreprocessorCache(preprocess_options, + test_boxes=False, + test_masks=False, + test_keypoints=False) + + def testRandomAdjustBrightness(self): + preprocessing_options = [] + preprocessing_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocessing_options.append((preprocessor.random_adjust_brightness, {})) + images_original = self.createTestImages() + tensor_dict = {fields.InputDataFields.image: images_original} + tensor_dict = preprocessor.preprocess(tensor_dict, preprocessing_options) + images_bright = tensor_dict[fields.InputDataFields.image] + image_original_shape = tf.shape(images_original) + image_bright_shape = tf.shape(images_bright) + with self.test_session() as sess: + (image_original_shape_, image_bright_shape_) = sess.run( + [image_original_shape, image_bright_shape]) + self.assertAllEqual(image_original_shape_, image_bright_shape_) + + def testRandomAdjustBrightnessWithCache(self): + preprocess_options = [] + preprocess_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocess_options.append((preprocessor.random_adjust_brightness, {})) + self._testPreprocessorCache(preprocess_options, + test_boxes=False, + test_masks=False, + test_keypoints=False) + + def testRandomAdjustContrast(self): + preprocessing_options = [] + preprocessing_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocessing_options.append((preprocessor.random_adjust_contrast, {})) + images_original = self.createTestImages() + tensor_dict = {fields.InputDataFields.image: images_original} + tensor_dict = preprocessor.preprocess(tensor_dict, preprocessing_options) + images_contrast = tensor_dict[fields.InputDataFields.image] + image_original_shape = tf.shape(images_original) + image_contrast_shape = tf.shape(images_contrast) + with self.test_session() as sess: + (image_original_shape_, image_contrast_shape_) = sess.run( + [image_original_shape, image_contrast_shape]) + self.assertAllEqual(image_original_shape_, image_contrast_shape_) + + def testRandomAdjustContrastWithCache(self): + preprocess_options = [] + preprocess_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocess_options.append((preprocessor.random_adjust_contrast, {})) + self._testPreprocessorCache(preprocess_options, + test_boxes=False, + test_masks=False, + test_keypoints=False) + + def testRandomAdjustHue(self): + preprocessing_options = [] + preprocessing_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocessing_options.append((preprocessor.random_adjust_hue, {})) + images_original = self.createTestImages() + tensor_dict = {fields.InputDataFields.image: images_original} + tensor_dict = preprocessor.preprocess(tensor_dict, preprocessing_options) + images_hue = 
tensor_dict[fields.InputDataFields.image] + image_original_shape = tf.shape(images_original) + image_hue_shape = tf.shape(images_hue) + with self.test_session() as sess: + (image_original_shape_, image_hue_shape_) = sess.run( + [image_original_shape, image_hue_shape]) + self.assertAllEqual(image_original_shape_, image_hue_shape_) + + def testRandomAdjustHueWithCache(self): + preprocess_options = [] + preprocess_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocess_options.append((preprocessor.random_adjust_hue, {})) + self._testPreprocessorCache(preprocess_options, + test_boxes=False, + test_masks=False, + test_keypoints=False) + + def testRandomDistortColor(self): + preprocessing_options = [] + preprocessing_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocessing_options.append((preprocessor.random_distort_color, {})) + images_original = self.createTestImages() + images_original_shape = tf.shape(images_original) + tensor_dict = {fields.InputDataFields.image: images_original} + tensor_dict = preprocessor.preprocess(tensor_dict, preprocessing_options) + images_distorted_color = tensor_dict[fields.InputDataFields.image] + images_distorted_color_shape = tf.shape(images_distorted_color) + with self.test_session() as sess: + (images_original_shape_, images_distorted_color_shape_) = sess.run( + [images_original_shape, images_distorted_color_shape]) + self.assertAllEqual(images_original_shape_, images_distorted_color_shape_) + + def testRandomDistortColorWithCache(self): + preprocess_options = [] + preprocess_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocess_options.append((preprocessor.random_distort_color, {})) + self._testPreprocessorCache(preprocess_options, + test_boxes=False, + test_masks=False, + test_keypoints=False) + + def testRandomJitterBoxes(self): + preprocessing_options = [] + preprocessing_options.append((preprocessor.random_jitter_boxes, {})) + boxes = self.createTestBoxes() + boxes_shape = tf.shape(boxes) + tensor_dict = {fields.InputDataFields.groundtruth_boxes: boxes} + tensor_dict = preprocessor.preprocess(tensor_dict, preprocessing_options) + distorted_boxes = tensor_dict[fields.InputDataFields.groundtruth_boxes] + distorted_boxes_shape = tf.shape(distorted_boxes) + + with self.test_session() as sess: + (boxes_shape_, distorted_boxes_shape_) = sess.run( + [boxes_shape, distorted_boxes_shape]) + self.assertAllEqual(boxes_shape_, distorted_boxes_shape_) + + def testRandomCropImage(self): + preprocessing_options = [] + preprocessing_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocessing_options.append((preprocessor.random_crop_image, {})) + images = self.createTestImages() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + } + distorted_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + distorted_images = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + 
boxes_rank = tf.rank(boxes) + distorted_boxes_rank = tf.rank(distorted_boxes) + images_rank = tf.rank(images) + distorted_images_rank = tf.rank(distorted_images) + self.assertEqual(3, distorted_images.get_shape()[3]) + + with self.test_session() as sess: + (boxes_rank_, distorted_boxes_rank_, images_rank_, + distorted_images_rank_) = sess.run([ + boxes_rank, distorted_boxes_rank, images_rank, distorted_images_rank + ]) + self.assertAllEqual(boxes_rank_, distorted_boxes_rank_) + self.assertAllEqual(images_rank_, distorted_images_rank_) + + def testRandomCropImageWithCache(self): + preprocess_options = [(preprocessor.random_rgb_to_gray, + {'probability': 0.5}), + (preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1, + }), + (preprocessor.random_crop_image, {})] + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=False, + test_keypoints=False) + + def testRandomCropImageGrayscale(self): + preprocessing_options = [(preprocessor.rgb_to_gray, {}), + (preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1, + }), + (preprocessor.random_crop_image, {})] + images = self.createTestImages() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + } + distorted_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options) + distorted_images = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + boxes_rank = tf.rank(boxes) + distorted_boxes_rank = tf.rank(distorted_boxes) + images_rank = tf.rank(images) + distorted_images_rank = tf.rank(distorted_images) + self.assertEqual(1, distorted_images.get_shape()[3]) + + with self.test_session() as sess: + session_results = sess.run([ + boxes_rank, distorted_boxes_rank, images_rank, distorted_images_rank + ]) + (boxes_rank_, distorted_boxes_rank_, images_rank_, + distorted_images_rank_) = session_results + self.assertAllEqual(boxes_rank_, distorted_boxes_rank_) + self.assertAllEqual(images_rank_, distorted_images_rank_) + + def testRandomCropImageWithBoxOutOfImage(self): + preprocessing_options = [] + preprocessing_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocessing_options.append((preprocessor.random_crop_image, {})) + images = self.createTestImages() + boxes = self.createTestBoxesOutOfImage() + labels = self.createTestLabels() + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + } + distorted_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + distorted_images = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + boxes_rank = tf.rank(boxes) + distorted_boxes_rank = tf.rank(distorted_boxes) + images_rank = tf.rank(images) + distorted_images_rank = tf.rank(distorted_images) + + with self.test_session() as sess: + (boxes_rank_, distorted_boxes_rank_, images_rank_, + distorted_images_rank_) = sess.run( + [boxes_rank, distorted_boxes_rank, images_rank, + distorted_images_rank]) + 
self.assertAllEqual(boxes_rank_, distorted_boxes_rank_) + self.assertAllEqual(images_rank_, distorted_images_rank_) + + def testRandomCropImageWithRandomCoefOne(self): + preprocessing_options = [(preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })] + + images = self.createTestImages() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + label_scores = self.createTestLabelScores() + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.groundtruth_label_scores: label_scores + } + tensor_dict = preprocessor.preprocess(tensor_dict, preprocessing_options) + images = tensor_dict[fields.InputDataFields.image] + + preprocessing_options = [(preprocessor.random_crop_image, { + 'random_coef': 1.0 + })] + distorted_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + + distorted_images = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + distorted_labels = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_classes] + distorted_label_scores = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_label_scores] + boxes_shape = tf.shape(boxes) + distorted_boxes_shape = tf.shape(distorted_boxes) + images_shape = tf.shape(images) + distorted_images_shape = tf.shape(distorted_images) + + with self.test_session() as sess: + (boxes_shape_, distorted_boxes_shape_, images_shape_, + distorted_images_shape_, images_, distorted_images_, + boxes_, distorted_boxes_, labels_, distorted_labels_, + label_scores_, distorted_label_scores_) = sess.run( + [boxes_shape, distorted_boxes_shape, images_shape, + distorted_images_shape, images, distorted_images, + boxes, distorted_boxes, labels, distorted_labels, + label_scores, distorted_label_scores]) + self.assertAllEqual(boxes_shape_, distorted_boxes_shape_) + self.assertAllEqual(images_shape_, distorted_images_shape_) + self.assertAllClose(images_, distorted_images_) + self.assertAllClose(boxes_, distorted_boxes_) + self.assertAllEqual(labels_, distorted_labels_) + self.assertAllEqual(label_scores_, distorted_label_scores_) + + def testRandomCropWithMockSampleDistortedBoundingBox(self): + preprocessing_options = [(preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })] + + images = self.createColorfulTestImage() + boxes = tf.constant([[0.1, 0.1, 0.8, 0.3], + [0.2, 0.4, 0.75, 0.75], + [0.3, 0.1, 0.4, 0.7]], dtype=tf.float32) + labels = tf.constant([1, 7, 11], dtype=tf.int32) + + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + } + tensor_dict = preprocessor.preprocess(tensor_dict, preprocessing_options) + images = tensor_dict[fields.InputDataFields.image] + + preprocessing_options = [(preprocessor.random_crop_image, {})] + with mock.patch.object( + tf.image, + 'sample_distorted_bounding_box') as mock_sample_distorted_bounding_box: + mock_sample_distorted_bounding_box.return_value = (tf.constant( + [6, 143, 0], dtype=tf.int32), tf.constant( + [190, 237, -1], dtype=tf.int32), tf.constant( + [[[0.03, 0.3575, 0.98, 0.95]]], dtype=tf.float32)) + + distorted_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + + 
distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + distorted_labels = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_classes] + expected_boxes = tf.constant([[0.178947, 0.07173, 0.75789469, 0.66244733], + [0.28421, 0.0, 0.38947365, 0.57805908]], + dtype=tf.float32) + expected_labels = tf.constant([7, 11], dtype=tf.int32) + + with self.test_session() as sess: + (distorted_boxes_, distorted_labels_, + expected_boxes_, expected_labels_) = sess.run( + [distorted_boxes, distorted_labels, + expected_boxes, expected_labels]) + self.assertAllClose(distorted_boxes_, expected_boxes_) + self.assertAllEqual(distorted_labels_, expected_labels_) + + def testRandomCropImageWithMultiClassScores(self): + preprocessing_options = [] + preprocessing_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocessing_options.append((preprocessor.random_crop_image, {})) + images = self.createTestImages() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + multiclass_scores = self.createTestMultiClassScores() + + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.multiclass_scores: multiclass_scores + } + distorted_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + distorted_images = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + distorted_multiclass_scores = distorted_tensor_dict[ + fields.InputDataFields.multiclass_scores] + boxes_rank = tf.rank(boxes) + distorted_boxes_rank = tf.rank(distorted_boxes) + images_rank = tf.rank(images) + distorted_images_rank = tf.rank(distorted_images) + multiclass_scores_rank = tf.rank(multiclass_scores) + distorted_multiclass_scores_rank = tf.rank(distorted_multiclass_scores) + + with self.test_session() as sess: + (boxes_rank_, distorted_boxes_, distorted_boxes_rank_, images_rank_, + distorted_images_rank_, multiclass_scores_rank_, + distorted_multiclass_scores_rank_, + distorted_multiclass_scores_) = sess.run([ + boxes_rank, distorted_boxes, distorted_boxes_rank, images_rank, + distorted_images_rank, multiclass_scores_rank, + distorted_multiclass_scores_rank, distorted_multiclass_scores + ]) + self.assertAllEqual(boxes_rank_, distorted_boxes_rank_) + self.assertAllEqual(images_rank_, distorted_images_rank_) + self.assertAllEqual(multiclass_scores_rank_, + distorted_multiclass_scores_rank_) + self.assertAllEqual(distorted_boxes_.shape[0], + distorted_multiclass_scores_.shape[0]) + + def testStrictRandomCropImageWithLabelScores(self): + image = self.createColorfulTestImage()[0] + boxes = self.createTestBoxes() + labels = self.createTestLabels() + label_scores = self.createTestLabelScores() + with mock.patch.object( + tf.image, + 'sample_distorted_bounding_box' + ) as mock_sample_distorted_bounding_box: + mock_sample_distorted_bounding_box.return_value = ( + tf.constant([6, 143, 0], dtype=tf.int32), + tf.constant([190, 237, -1], dtype=tf.int32), + tf.constant([[[0.03, 0.3575, 0.98, 0.95]]], dtype=tf.float32)) + new_image, new_boxes, new_labels, new_label_scores = ( + preprocessor._strict_random_crop_image( + image, boxes, labels, label_scores)) + with self.test_session() as sess: + new_image, new_boxes, new_labels, new_label_scores = ( + sess.run( + [new_image, 
new_boxes, new_labels, new_label_scores]) + ) + + expected_boxes = np.array( + [[0.0, 0.0, 0.75789469, 1.0], + [0.23157893, 0.24050637, 0.75789469, 1.0]], dtype=np.float32) + self.assertAllEqual(new_image.shape, [190, 237, 3]) + self.assertAllEqual(new_label_scores, [1.0, 0.5]) + self.assertAllClose( + new_boxes.flatten(), expected_boxes.flatten()) + + def testStrictRandomCropImageWithMasks(self): + image = self.createColorfulTestImage()[0] + boxes = self.createTestBoxes() + labels = self.createTestLabels() + masks = tf.random_uniform([2, 200, 400], dtype=tf.float32) + with mock.patch.object( + tf.image, + 'sample_distorted_bounding_box' + ) as mock_sample_distorted_bounding_box: + mock_sample_distorted_bounding_box.return_value = ( + tf.constant([6, 143, 0], dtype=tf.int32), + tf.constant([190, 237, -1], dtype=tf.int32), + tf.constant([[[0.03, 0.3575, 0.98, 0.95]]], dtype=tf.float32)) + new_image, new_boxes, new_labels, new_masks = ( + preprocessor._strict_random_crop_image( + image, boxes, labels, masks=masks)) + with self.test_session() as sess: + new_image, new_boxes, new_labels, new_masks = sess.run( + [new_image, new_boxes, new_labels, new_masks]) + expected_boxes = np.array( + [[0.0, 0.0, 0.75789469, 1.0], + [0.23157893, 0.24050637, 0.75789469, 1.0]], dtype=np.float32) + self.assertAllEqual(new_image.shape, [190, 237, 3]) + self.assertAllEqual(new_masks.shape, [2, 190, 237]) + self.assertAllClose( + new_boxes.flatten(), expected_boxes.flatten()) + + def testStrictRandomCropImageWithKeypoints(self): + image = self.createColorfulTestImage()[0] + boxes = self.createTestBoxes() + labels = self.createTestLabels() + keypoints = self.createTestKeypoints() + with mock.patch.object( + tf.image, + 'sample_distorted_bounding_box' + ) as mock_sample_distorted_bounding_box: + mock_sample_distorted_bounding_box.return_value = ( + tf.constant([6, 143, 0], dtype=tf.int32), + tf.constant([190, 237, -1], dtype=tf.int32), + tf.constant([[[0.03, 0.3575, 0.98, 0.95]]], dtype=tf.float32)) + new_image, new_boxes, new_labels, new_keypoints = ( + preprocessor._strict_random_crop_image( + image, boxes, labels, keypoints=keypoints)) + with self.test_session() as sess: + new_image, new_boxes, new_labels, new_keypoints = sess.run( + [new_image, new_boxes, new_labels, new_keypoints]) + + expected_boxes = np.array([ + [0.0, 0.0, 0.75789469, 1.0], + [0.23157893, 0.24050637, 0.75789469, 1.0],], dtype=np.float32) + expected_keypoints = np.array([ + [[np.nan, np.nan], + [np.nan, np.nan], + [np.nan, np.nan]], + [[0.38947368, 0.07173], + [0.49473682, 0.24050637], + [0.60000002, 0.40928277]] + ], dtype=np.float32) + self.assertAllEqual(new_image.shape, [190, 237, 3]) + self.assertAllClose( + new_boxes.flatten(), expected_boxes.flatten()) + self.assertAllClose( + new_keypoints.flatten(), expected_keypoints.flatten()) + + def testRunRandomCropImageWithMasks(self): + image = self.createColorfulTestImage() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + masks = tf.random_uniform([2, 200, 400], dtype=tf.float32) + + tensor_dict = { + fields.InputDataFields.image: image, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.groundtruth_instance_masks: masks, + } + + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_instance_masks=True) + + preprocessing_options = [(preprocessor.random_crop_image, {})] + + with mock.patch.object( + tf.image, + 'sample_distorted_bounding_box' + ) as 
mock_sample_distorted_bounding_box: + mock_sample_distorted_bounding_box.return_value = ( + tf.constant([6, 143, 0], dtype=tf.int32), + tf.constant([190, 237, -1], dtype=tf.int32), + tf.constant([[[0.03, 0.3575, 0.98, 0.95]]], dtype=tf.float32)) + distorted_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, func_arg_map=preprocessor_arg_map) + distorted_image = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + distorted_labels = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_classes] + distorted_masks = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_instance_masks] + with self.test_session() as sess: + (distorted_image_, distorted_boxes_, distorted_labels_, + distorted_masks_) = sess.run( + [distorted_image, distorted_boxes, distorted_labels, + distorted_masks]) + + expected_boxes = np.array([ + [0.0, 0.0, 0.75789469, 1.0], + [0.23157893, 0.24050637, 0.75789469, 1.0], + ], dtype=np.float32) + self.assertAllEqual(distorted_image_.shape, [1, 190, 237, 3]) + self.assertAllEqual(distorted_masks_.shape, [2, 190, 237]) + self.assertAllEqual(distorted_labels_, [1, 2]) + self.assertAllClose( + distorted_boxes_.flatten(), expected_boxes.flatten()) + + def testRunRandomCropImageWithKeypointsInsideCrop(self): + image = self.createColorfulTestImage() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + keypoints = self.createTestKeypointsInsideCrop() + + tensor_dict = { + fields.InputDataFields.image: image, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.groundtruth_keypoints: keypoints + } + + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_keypoints=True) + + preprocessing_options = [(preprocessor.random_crop_image, {})] + + with mock.patch.object( + tf.image, + 'sample_distorted_bounding_box' + ) as mock_sample_distorted_bounding_box: + mock_sample_distorted_bounding_box.return_value = ( + tf.constant([6, 143, 0], dtype=tf.int32), + tf.constant([190, 237, -1], dtype=tf.int32), + tf.constant([[[0.03, 0.3575, 0.98, 0.95]]], dtype=tf.float32)) + distorted_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, func_arg_map=preprocessor_arg_map) + distorted_image = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + distorted_labels = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_classes] + distorted_keypoints = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_keypoints] + with self.test_session() as sess: + (distorted_image_, distorted_boxes_, distorted_labels_, + distorted_keypoints_) = sess.run( + [distorted_image, distorted_boxes, distorted_labels, + distorted_keypoints]) + + expected_boxes = np.array([ + [0.0, 0.0, 0.75789469, 1.0], + [0.23157893, 0.24050637, 0.75789469, 1.0], + ], dtype=np.float32) + expected_keypoints = np.array([ + [[0.38947368, 0.07173], + [0.49473682, 0.24050637], + [0.60000002, 0.40928277]], + [[0.38947368, 0.07173], + [0.49473682, 0.24050637], + [0.60000002, 0.40928277]] + ]) + self.assertAllEqual(distorted_image_.shape, [1, 190, 237, 3]) + self.assertAllEqual(distorted_labels_, [1, 2]) + self.assertAllClose( + distorted_boxes_.flatten(), expected_boxes.flatten()) + self.assertAllClose( + distorted_keypoints_.flatten(), expected_keypoints.flatten()) + + def 
testRunRandomCropImageWithKeypointsOutsideCrop(self): + image = self.createColorfulTestImage() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + keypoints = self.createTestKeypointsOutsideCrop() + + tensor_dict = { + fields.InputDataFields.image: image, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.groundtruth_keypoints: keypoints + } + + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_keypoints=True) + + preprocessing_options = [(preprocessor.random_crop_image, {})] + + with mock.patch.object( + tf.image, + 'sample_distorted_bounding_box' + ) as mock_sample_distorted_bounding_box: + mock_sample_distorted_bounding_box.return_value = ( + tf.constant([6, 143, 0], dtype=tf.int32), + tf.constant([190, 237, -1], dtype=tf.int32), + tf.constant([[[0.03, 0.3575, 0.98, 0.95]]], dtype=tf.float32)) + distorted_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, func_arg_map=preprocessor_arg_map) + distorted_image = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + distorted_labels = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_classes] + distorted_keypoints = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_keypoints] + with self.test_session() as sess: + (distorted_image_, distorted_boxes_, distorted_labels_, + distorted_keypoints_) = sess.run( + [distorted_image, distorted_boxes, distorted_labels, + distorted_keypoints]) + + expected_boxes = np.array([ + [0.0, 0.0, 0.75789469, 1.0], + [0.23157893, 0.24050637, 0.75789469, 1.0], + ], dtype=np.float32) + expected_keypoints = np.array([ + [[np.nan, np.nan], + [np.nan, np.nan], + [np.nan, np.nan]], + [[np.nan, np.nan], + [np.nan, np.nan], + [np.nan, np.nan]], + ]) + self.assertAllEqual(distorted_image_.shape, [1, 190, 237, 3]) + self.assertAllEqual(distorted_labels_, [1, 2]) + self.assertAllClose( + distorted_boxes_.flatten(), expected_boxes.flatten()) + self.assertAllClose( + distorted_keypoints_.flatten(), expected_keypoints.flatten()) + + def testRunRetainBoxesAboveThreshold(self): + boxes = self.createTestBoxes() + labels = self.createTestLabels() + label_scores = self.createTestLabelScores() + + tensor_dict = { + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.groundtruth_label_scores: label_scores + } + + preprocessing_options = [ + (preprocessor.retain_boxes_above_threshold, {'threshold': 0.6}) + ] + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_label_scores=True) + retained_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, func_arg_map=preprocessor_arg_map) + retained_boxes = retained_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + retained_labels = retained_tensor_dict[ + fields.InputDataFields.groundtruth_classes] + retained_label_scores = retained_tensor_dict[ + fields.InputDataFields.groundtruth_label_scores] + + with self.test_session() as sess: + (retained_boxes_, retained_labels_, + retained_label_scores_, expected_retained_boxes_, + expected_retained_labels_, expected_retained_label_scores_) = sess.run( + [retained_boxes, retained_labels, retained_label_scores, + self.expectedBoxesAfterThresholding(), + self.expectedLabelsAfterThresholding(), + self.expectedLabelScoresAfterThresholding()]) + + self.assertAllClose(retained_boxes_, 
expected_retained_boxes_) + self.assertAllClose(retained_labels_, expected_retained_labels_) + self.assertAllClose( + retained_label_scores_, expected_retained_label_scores_) + + def testRunRetainBoxesAboveThresholdWithMasks(self): + boxes = self.createTestBoxes() + labels = self.createTestLabels() + label_scores = self.createTestLabelScores() + masks = self.createTestMasks() + + tensor_dict = { + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.groundtruth_label_scores: label_scores, + fields.InputDataFields.groundtruth_instance_masks: masks + } + + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_label_scores=True, + include_instance_masks=True) + + preprocessing_options = [ + (preprocessor.retain_boxes_above_threshold, {'threshold': 0.6}) + ] + + retained_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, func_arg_map=preprocessor_arg_map) + retained_masks = retained_tensor_dict[ + fields.InputDataFields.groundtruth_instance_masks] + + with self.test_session() as sess: + (retained_masks_, expected_masks_) = sess.run( + [retained_masks, + self.expectedMasksAfterThresholding()]) + self.assertAllClose(retained_masks_, expected_masks_) + + def testRunRetainBoxesAboveThresholdWithKeypoints(self): + boxes = self.createTestBoxes() + labels = self.createTestLabels() + label_scores = self.createTestLabelScores() + keypoints = self.createTestKeypoints() + + tensor_dict = { + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.groundtruth_label_scores: label_scores, + fields.InputDataFields.groundtruth_keypoints: keypoints + } + + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_label_scores=True, + include_keypoints=True) + + preprocessing_options = [ + (preprocessor.retain_boxes_above_threshold, {'threshold': 0.6}) + ] + + retained_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, func_arg_map=preprocessor_arg_map) + retained_keypoints = retained_tensor_dict[ + fields.InputDataFields.groundtruth_keypoints] + + with self.test_session() as sess: + (retained_keypoints_, expected_keypoints_) = sess.run( + [retained_keypoints, + self.expectedKeypointsAfterThresholding()]) + self.assertAllClose(retained_keypoints_, expected_keypoints_) + + def testRandomCropToAspectRatioWithCache(self): + preprocess_options = [(preprocessor.random_crop_to_aspect_ratio, {})] + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=False, + test_keypoints=False) + + def testRunRandomCropToAspectRatioWithMasks(self): + image = self.createColorfulTestImage() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + masks = tf.random_uniform([2, 200, 400], dtype=tf.float32) + + tensor_dict = { + fields.InputDataFields.image: image, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.groundtruth_instance_masks: masks + } + + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_instance_masks=True) + + preprocessing_options = [(preprocessor.random_crop_to_aspect_ratio, {})] + + with mock.patch.object(preprocessor, + '_random_integer') as mock_random_integer: + mock_random_integer.return_value = tf.constant(0, dtype=tf.int32) + distorted_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, 
func_arg_map=preprocessor_arg_map) + distorted_image = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + distorted_labels = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_classes] + distorted_masks = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_instance_masks] + with self.test_session() as sess: + (distorted_image_, distorted_boxes_, distorted_labels_, + distorted_masks_) = sess.run([ + distorted_image, distorted_boxes, distorted_labels, distorted_masks + ]) + + expected_boxes = np.array([0.0, 0.5, 0.75, 1.0], dtype=np.float32) + self.assertAllEqual(distorted_image_.shape, [1, 200, 200, 3]) + self.assertAllEqual(distorted_labels_, [1]) + self.assertAllClose(distorted_boxes_.flatten(), + expected_boxes.flatten()) + self.assertAllEqual(distorted_masks_.shape, [1, 200, 200]) + + def testRunRandomCropToAspectRatioWithKeypoints(self): + image = self.createColorfulTestImage() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + keypoints = self.createTestKeypoints() + + tensor_dict = { + fields.InputDataFields.image: image, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.groundtruth_keypoints: keypoints + } + + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_keypoints=True) + + preprocessing_options = [(preprocessor.random_crop_to_aspect_ratio, {})] + + with mock.patch.object(preprocessor, + '_random_integer') as mock_random_integer: + mock_random_integer.return_value = tf.constant(0, dtype=tf.int32) + distorted_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, func_arg_map=preprocessor_arg_map) + distorted_image = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + distorted_labels = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_classes] + distorted_keypoints = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_keypoints] + with self.test_session() as sess: + (distorted_image_, distorted_boxes_, distorted_labels_, + distorted_keypoints_) = sess.run([ + distorted_image, distorted_boxes, distorted_labels, + distorted_keypoints + ]) + + expected_boxes = np.array([0.0, 0.5, 0.75, 1.0], dtype=np.float32) + expected_keypoints = np.array( + [[0.1, 0.2], [0.2, 0.4], [0.3, 0.6]], dtype=np.float32) + self.assertAllEqual(distorted_image_.shape, [1, 200, 200, 3]) + self.assertAllEqual(distorted_labels_, [1]) + self.assertAllClose(distorted_boxes_.flatten(), + expected_boxes.flatten()) + self.assertAllClose(distorted_keypoints_.flatten(), + expected_keypoints.flatten()) + + def testRandomPadToAspectRatioWithCache(self): + preprocess_options = [(preprocessor.random_pad_to_aspect_ratio, {})] + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=True, + test_keypoints=True) + + def testRunRandomPadToAspectRatioWithMinMaxPaddedSizeRatios(self): + image = self.createColorfulTestImage() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + + tensor_dict = { + fields.InputDataFields.image: image, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels + } + + preprocessor_arg_map = preprocessor.get_default_func_arg_map() + preprocessing_options = [(preprocessor.random_pad_to_aspect_ratio, + {'min_padded_size_ratio': (4.0, 4.0), + 
'max_padded_size_ratio': (4.0, 4.0)})] + + distorted_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, func_arg_map=preprocessor_arg_map) + distorted_image = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + distorted_labels = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_classes] + with self.test_session() as sess: + distorted_image_, distorted_boxes_, distorted_labels_ = sess.run([ + distorted_image, distorted_boxes, distorted_labels]) + + expected_boxes = np.array( + [[0.0, 0.125, 0.1875, 0.5], [0.0625, 0.25, 0.1875, 0.5]], + dtype=np.float32) + self.assertAllEqual(distorted_image_.shape, [1, 800, 800, 3]) + self.assertAllEqual(distorted_labels_, [1, 2]) + self.assertAllClose(distorted_boxes_.flatten(), + expected_boxes.flatten()) + + def testRunRandomPadToAspectRatioWithMasks(self): + image = self.createColorfulTestImage() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + masks = tf.random_uniform([2, 200, 400], dtype=tf.float32) + + tensor_dict = { + fields.InputDataFields.image: image, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.groundtruth_instance_masks: masks + } + + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_instance_masks=True) + + preprocessing_options = [(preprocessor.random_pad_to_aspect_ratio, {})] + + distorted_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, func_arg_map=preprocessor_arg_map) + distorted_image = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + distorted_labels = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_classes] + distorted_masks = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_instance_masks] + with self.test_session() as sess: + (distorted_image_, distorted_boxes_, distorted_labels_, + distorted_masks_) = sess.run([ + distorted_image, distorted_boxes, distorted_labels, distorted_masks + ]) + + expected_boxes = np.array( + [[0.0, 0.25, 0.375, 1.0], [0.125, 0.5, 0.375, 1.0]], dtype=np.float32) + self.assertAllEqual(distorted_image_.shape, [1, 400, 400, 3]) + self.assertAllEqual(distorted_labels_, [1, 2]) + self.assertAllClose(distorted_boxes_.flatten(), + expected_boxes.flatten()) + self.assertAllEqual(distorted_masks_.shape, [2, 400, 400]) + + def testRunRandomPadToAspectRatioWithKeypoints(self): + image = self.createColorfulTestImage() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + keypoints = self.createTestKeypoints() + + tensor_dict = { + fields.InputDataFields.image: image, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.groundtruth_keypoints: keypoints + } + + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_keypoints=True) + + preprocessing_options = [(preprocessor.random_pad_to_aspect_ratio, {})] + + distorted_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, func_arg_map=preprocessor_arg_map) + distorted_image = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + distorted_labels = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_classes] + distorted_keypoints = distorted_tensor_dict[ + 
fields.InputDataFields.groundtruth_keypoints] + with self.test_session() as sess: + (distorted_image_, distorted_boxes_, distorted_labels_, + distorted_keypoints_) = sess.run([ + distorted_image, distorted_boxes, distorted_labels, + distorted_keypoints + ]) + + expected_boxes = np.array( + [[0.0, 0.25, 0.375, 1.0], [0.125, 0.5, 0.375, 1.0]], dtype=np.float32) + expected_keypoints = np.array([ + [[0.05, 0.1], [0.1, 0.2], [0.15, 0.3]], + [[0.2, 0.4], [0.25, 0.5], [0.3, 0.6]], + ], dtype=np.float32) + self.assertAllEqual(distorted_image_.shape, [1, 400, 400, 3]) + self.assertAllEqual(distorted_labels_, [1, 2]) + self.assertAllClose(distorted_boxes_.flatten(), + expected_boxes.flatten()) + self.assertAllClose(distorted_keypoints_.flatten(), + expected_keypoints.flatten()) + + def testRandomPadImageWithCache(self): + preprocess_options = [(preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1,}), (preprocessor.random_pad_image, {})] + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=True, + test_keypoints=True) + + def testRandomPadImage(self): + preprocessing_options = [(preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })] + + images = self.createTestImages() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + } + tensor_dict = preprocessor.preprocess(tensor_dict, preprocessing_options) + images = tensor_dict[fields.InputDataFields.image] + + preprocessing_options = [(preprocessor.random_pad_image, {})] + padded_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + + padded_images = padded_tensor_dict[fields.InputDataFields.image] + padded_boxes = padded_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + boxes_shape = tf.shape(boxes) + padded_boxes_shape = tf.shape(padded_boxes) + images_shape = tf.shape(images) + padded_images_shape = tf.shape(padded_images) + + with self.test_session() as sess: + (boxes_shape_, padded_boxes_shape_, images_shape_, + padded_images_shape_, boxes_, padded_boxes_) = sess.run( + [boxes_shape, padded_boxes_shape, images_shape, + padded_images_shape, boxes, padded_boxes]) + self.assertAllEqual(boxes_shape_, padded_boxes_shape_) + self.assertTrue((images_shape_[1] >= padded_images_shape_[1] * 0.5).all) + self.assertTrue((images_shape_[2] >= padded_images_shape_[2] * 0.5).all) + self.assertTrue((images_shape_[1] <= padded_images_shape_[1]).all) + self.assertTrue((images_shape_[2] <= padded_images_shape_[2]).all) + self.assertTrue(np.all((boxes_[:, 2] - boxes_[:, 0]) >= ( + padded_boxes_[:, 2] - padded_boxes_[:, 0]))) + self.assertTrue(np.all((boxes_[:, 3] - boxes_[:, 1]) >= ( + padded_boxes_[:, 3] - padded_boxes_[:, 1]))) + + def testRandomCropPadImageWithCache(self): + preprocess_options = [(preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1,}), (preprocessor.random_crop_pad_image, {})] + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=True, + test_keypoints=True) + + def testRandomCropPadImageWithRandomCoefOne(self): + preprocessing_options = [(preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })] + + 
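    # --- Editor's note (added for clarity, not part of the original diff):
    # further down in this test, random_crop_pad_image is configured with
    # random_coef=1.0. random_coef is the probability of keeping the original,
    # uncropped image, so a value of 1.0 means the crop step is always skipped
    # and only random padding is applied. That is why the assertions in this
    # test expect the padded image to be at least as large as (and at most
    # twice the size of) the input, and the normalized box extents to shrink
    # or stay the same.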
images = self.createTestImages() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + } + tensor_dict = preprocessor.preprocess(tensor_dict, preprocessing_options) + images = tensor_dict[fields.InputDataFields.image] + + preprocessing_options = [(preprocessor.random_crop_pad_image, { + 'random_coef': 1.0 + })] + padded_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + + padded_images = padded_tensor_dict[fields.InputDataFields.image] + padded_boxes = padded_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + boxes_shape = tf.shape(boxes) + padded_boxes_shape = tf.shape(padded_boxes) + images_shape = tf.shape(images) + padded_images_shape = tf.shape(padded_images) + + with self.test_session() as sess: + (boxes_shape_, padded_boxes_shape_, images_shape_, + padded_images_shape_, boxes_, padded_boxes_) = sess.run( + [boxes_shape, padded_boxes_shape, images_shape, + padded_images_shape, boxes, padded_boxes]) + self.assertAllEqual(boxes_shape_, padded_boxes_shape_) + self.assertTrue((images_shape_[1] >= padded_images_shape_[1] * 0.5).all) + self.assertTrue((images_shape_[2] >= padded_images_shape_[2] * 0.5).all) + self.assertTrue((images_shape_[1] <= padded_images_shape_[1]).all) + self.assertTrue((images_shape_[2] <= padded_images_shape_[2]).all) + self.assertTrue(np.all((boxes_[:, 2] - boxes_[:, 0]) >= ( + padded_boxes_[:, 2] - padded_boxes_[:, 0]))) + self.assertTrue(np.all((boxes_[:, 3] - boxes_[:, 1]) >= ( + padded_boxes_[:, 3] - padded_boxes_[:, 1]))) + + def testRandomCropToAspectRatio(self): + images = self.createTestImages() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + } + tensor_dict = preprocessor.preprocess(tensor_dict, []) + images = tensor_dict[fields.InputDataFields.image] + + preprocessing_options = [(preprocessor.random_crop_to_aspect_ratio, { + 'aspect_ratio': 2.0 + })] + cropped_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + + cropped_images = cropped_tensor_dict[fields.InputDataFields.image] + cropped_boxes = cropped_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + boxes_shape = tf.shape(boxes) + cropped_boxes_shape = tf.shape(cropped_boxes) + images_shape = tf.shape(images) + cropped_images_shape = tf.shape(cropped_images) + + with self.test_session() as sess: + (boxes_shape_, cropped_boxes_shape_, images_shape_, + cropped_images_shape_) = sess.run([ + boxes_shape, cropped_boxes_shape, images_shape, cropped_images_shape + ]) + self.assertAllEqual(boxes_shape_, cropped_boxes_shape_) + self.assertEqual(images_shape_[1], cropped_images_shape_[1] * 2) + self.assertEqual(images_shape_[2], cropped_images_shape_[2]) + + def testRandomPadToAspectRatio(self): + images = self.createTestImages() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + } + tensor_dict = preprocessor.preprocess(tensor_dict, []) + images = tensor_dict[fields.InputDataFields.image] + + preprocessing_options = [(preprocessor.random_pad_to_aspect_ratio, { + 'aspect_ratio': 2.0 + })] + 
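    # --- Editor's note (added for clarity, not part of the original diff):
    # createTestImages() produces square test images, so padding them to
    # aspect_ratio=2.0 should leave the height unchanged and double the width,
    # whereas random_crop_to_aspect_ratio with the same ratio (previous test)
    # keeps the width and halves the height. The shape assertions below encode
    # exactly that relationship.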
padded_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + + padded_images = padded_tensor_dict[fields.InputDataFields.image] + padded_boxes = padded_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + boxes_shape = tf.shape(boxes) + padded_boxes_shape = tf.shape(padded_boxes) + images_shape = tf.shape(images) + padded_images_shape = tf.shape(padded_images) + + with self.test_session() as sess: + (boxes_shape_, padded_boxes_shape_, images_shape_, + padded_images_shape_) = sess.run([ + boxes_shape, padded_boxes_shape, images_shape, padded_images_shape + ]) + self.assertAllEqual(boxes_shape_, padded_boxes_shape_) + self.assertEqual(images_shape_[1], padded_images_shape_[1]) + self.assertEqual(2 * images_shape_[2], padded_images_shape_[2]) + + def testRandomBlackPatchesWithCache(self): + preprocess_options = [] + preprocess_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocess_options.append((preprocessor.random_black_patches, { + 'size_to_image_ratio': 0.5 + })) + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=True, + test_keypoints=True) + + def testRandomBlackPatches(self): + preprocessing_options = [] + preprocessing_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocessing_options.append((preprocessor.random_black_patches, { + 'size_to_image_ratio': 0.5 + })) + images = self.createTestImages() + tensor_dict = {fields.InputDataFields.image: images} + blacked_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + blacked_images = blacked_tensor_dict[fields.InputDataFields.image] + images_shape = tf.shape(images) + blacked_images_shape = tf.shape(blacked_images) + + with self.test_session() as sess: + (images_shape_, blacked_images_shape_) = sess.run( + [images_shape, blacked_images_shape]) + self.assertAllEqual(images_shape_, blacked_images_shape_) + + def testRandomResizeMethodWithCache(self): + preprocess_options = [] + preprocess_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocess_options.append((preprocessor.random_resize_method, { + 'target_size': (75, 150) + })) + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=True, + test_keypoints=True) + + def testRandomResizeMethod(self): + preprocessing_options = [] + preprocessing_options.append((preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + })) + preprocessing_options.append((preprocessor.random_resize_method, { + 'target_size': (75, 150) + })) + images = self.createTestImages() + tensor_dict = {fields.InputDataFields.image: images} + resized_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + resized_images = resized_tensor_dict[fields.InputDataFields.image] + resized_images_shape = tf.shape(resized_images) + expected_images_shape = tf.constant([1, 75, 150, 3], dtype=tf.int32) + + with self.test_session() as sess: + (expected_images_shape_, resized_images_shape_) = sess.run( + [expected_images_shape, resized_images_shape]) + self.assertAllEqual(expected_images_shape_, + resized_images_shape_) + + def testResizeImageWithMasks(self): + """Tests image resizing, checking output sizes.""" + in_image_shape_list 
= [[60, 40, 3], [15, 30, 3]] + in_masks_shape_list = [[15, 60, 40], [10, 15, 30]] + height = 50 + width = 100 + expected_image_shape_list = [[50, 100, 3], [50, 100, 3]] + expected_masks_shape_list = [[15, 50, 100], [10, 50, 100]] + + for (in_image_shape, expected_image_shape, in_masks_shape, + expected_mask_shape) in zip(in_image_shape_list, + expected_image_shape_list, + in_masks_shape_list, + expected_masks_shape_list): + in_image = tf.random_uniform(in_image_shape) + in_masks = tf.random_uniform(in_masks_shape) + out_image, out_masks, _ = preprocessor.resize_image( + in_image, in_masks, new_height=height, new_width=width) + out_image_shape = tf.shape(out_image) + out_masks_shape = tf.shape(out_masks) + + with self.test_session() as sess: + out_image_shape, out_masks_shape = sess.run( + [out_image_shape, out_masks_shape]) + self.assertAllEqual(out_image_shape, expected_image_shape) + self.assertAllEqual(out_masks_shape, expected_mask_shape) + + def testResizeImageWithMasksTensorInputHeightAndWidth(self): + """Tests image resizing, checking output sizes.""" + in_image_shape_list = [[60, 40, 3], [15, 30, 3]] + in_masks_shape_list = [[15, 60, 40], [10, 15, 30]] + height = tf.constant(50, dtype=tf.int32) + width = tf.constant(100, dtype=tf.int32) + expected_image_shape_list = [[50, 100, 3], [50, 100, 3]] + expected_masks_shape_list = [[15, 50, 100], [10, 50, 100]] + + for (in_image_shape, expected_image_shape, in_masks_shape, + expected_mask_shape) in zip(in_image_shape_list, + expected_image_shape_list, + in_masks_shape_list, + expected_masks_shape_list): + in_image = tf.random_uniform(in_image_shape) + in_masks = tf.random_uniform(in_masks_shape) + out_image, out_masks, _ = preprocessor.resize_image( + in_image, in_masks, new_height=height, new_width=width) + out_image_shape = tf.shape(out_image) + out_masks_shape = tf.shape(out_masks) + + with self.test_session() as sess: + out_image_shape, out_masks_shape = sess.run( + [out_image_shape, out_masks_shape]) + self.assertAllEqual(out_image_shape, expected_image_shape) + self.assertAllEqual(out_masks_shape, expected_mask_shape) + + def testResizeImageWithNoInstanceMask(self): + """Tests image resizing, checking output sizes.""" + in_image_shape_list = [[60, 40, 3], [15, 30, 3]] + in_masks_shape_list = [[0, 60, 40], [0, 15, 30]] + height = 50 + width = 100 + expected_image_shape_list = [[50, 100, 3], [50, 100, 3]] + expected_masks_shape_list = [[0, 50, 100], [0, 50, 100]] + + for (in_image_shape, expected_image_shape, in_masks_shape, + expected_mask_shape) in zip(in_image_shape_list, + expected_image_shape_list, + in_masks_shape_list, + expected_masks_shape_list): + in_image = tf.random_uniform(in_image_shape) + in_masks = tf.random_uniform(in_masks_shape) + out_image, out_masks, _ = preprocessor.resize_image( + in_image, in_masks, new_height=height, new_width=width) + out_image_shape = tf.shape(out_image) + out_masks_shape = tf.shape(out_masks) + + with self.test_session() as sess: + out_image_shape, out_masks_shape = sess.run( + [out_image_shape, out_masks_shape]) + self.assertAllEqual(out_image_shape, expected_image_shape) + self.assertAllEqual(out_masks_shape, expected_mask_shape) + + def testResizeToRangePreservesStaticSpatialShape(self): + """Tests image resizing, checking output sizes.""" + in_shape_list = [[60, 40, 3], [15, 30, 3], [15, 50, 3]] + min_dim = 50 + max_dim = 100 + expected_shape_list = [[75, 50, 3], [50, 100, 3], [30, 100, 3]] + + for in_shape, expected_shape in zip(in_shape_list, expected_shape_list): + in_image = 
tf.random_uniform(in_shape) + out_image, _ = preprocessor.resize_to_range( + in_image, min_dimension=min_dim, max_dimension=max_dim) + self.assertAllEqual(out_image.get_shape().as_list(), expected_shape) + + def testResizeToRangeWithDynamicSpatialShape(self): + """Tests image resizing, checking output sizes.""" + in_shape_list = [[60, 40, 3], [15, 30, 3], [15, 50, 3]] + min_dim = 50 + max_dim = 100 + expected_shape_list = [[75, 50, 3], [50, 100, 3], [30, 100, 3]] + + for in_shape, expected_shape in zip(in_shape_list, expected_shape_list): + in_image = tf.placeholder(tf.float32, shape=(None, None, 3)) + out_image, _ = preprocessor.resize_to_range( + in_image, min_dimension=min_dim, max_dimension=max_dim) + out_image_shape = tf.shape(out_image) + with self.test_session() as sess: + out_image_shape = sess.run(out_image_shape, + feed_dict={in_image: + np.random.randn(*in_shape)}) + self.assertAllEqual(out_image_shape, expected_shape) + + def testResizeToRangeWithPadToMaxDimensionReturnsCorrectShapes(self): + in_shape_list = [[60, 40, 3], [15, 30, 3], [15, 50, 3]] + min_dim = 50 + max_dim = 100 + expected_shape_list = [[100, 100, 3], [100, 100, 3], [100, 100, 3]] + + for in_shape, expected_shape in zip(in_shape_list, expected_shape_list): + in_image = tf.placeholder(tf.float32, shape=(None, None, 3)) + out_image, _ = preprocessor.resize_to_range( + in_image, + min_dimension=min_dim, + max_dimension=max_dim, + pad_to_max_dimension=True) + self.assertAllEqual(out_image.shape.as_list(), expected_shape) + out_image_shape = tf.shape(out_image) + with self.test_session() as sess: + out_image_shape = sess.run( + out_image_shape, feed_dict={in_image: np.random.randn(*in_shape)}) + self.assertAllEqual(out_image_shape, expected_shape) + + def testResizeToRangeWithPadToMaxDimensionReturnsCorrectTensor(self): + in_image_np = np.array([[[0, 1, 2]]], np.float32) + ex_image_np = np.array( + [[[0, 1, 2], [123.68, 116.779, 103.939]], + [[123.68, 116.779, 103.939], [123.68, 116.779, 103.939]]], np.float32) + min_dim = 1 + max_dim = 2 + + in_image = tf.placeholder(tf.float32, shape=(None, None, 3)) + out_image, _ = preprocessor.resize_to_range( + in_image, + min_dimension=min_dim, + max_dimension=max_dim, + pad_to_max_dimension=True, + per_channel_pad_value=(123.68, 116.779, 103.939)) + + with self.test_session() as sess: + out_image_np = sess.run(out_image, feed_dict={in_image: in_image_np}) + self.assertAllClose(ex_image_np, out_image_np) + + def testResizeToRangeWithMasksPreservesStaticSpatialShape(self): + """Tests image resizing, checking output sizes.""" + in_image_shape_list = [[60, 40, 3], [15, 30, 3]] + in_masks_shape_list = [[15, 60, 40], [10, 15, 30]] + min_dim = 50 + max_dim = 100 + expected_image_shape_list = [[75, 50, 3], [50, 100, 3]] + expected_masks_shape_list = [[15, 75, 50], [10, 50, 100]] + + for (in_image_shape, expected_image_shape, in_masks_shape, + expected_mask_shape) in zip(in_image_shape_list, + expected_image_shape_list, + in_masks_shape_list, + expected_masks_shape_list): + in_image = tf.random_uniform(in_image_shape) + in_masks = tf.random_uniform(in_masks_shape) + out_image, out_masks, _ = preprocessor.resize_to_range( + in_image, in_masks, min_dimension=min_dim, max_dimension=max_dim) + self.assertAllEqual(out_masks.get_shape().as_list(), expected_mask_shape) + self.assertAllEqual(out_image.get_shape().as_list(), expected_image_shape) + + def testResizeToRangeWithMasksAndDynamicSpatialShape(self): + """Tests image resizing, checking output sizes.""" + in_image_shape_list = [[60, 
40, 3], [15, 30, 3]] + in_masks_shape_list = [[15, 60, 40], [10, 15, 30]] + min_dim = 50 + max_dim = 100 + expected_image_shape_list = [[75, 50, 3], [50, 100, 3]] + expected_masks_shape_list = [[15, 75, 50], [10, 50, 100]] + + for (in_image_shape, expected_image_shape, in_masks_shape, + expected_mask_shape) in zip(in_image_shape_list, + expected_image_shape_list, + in_masks_shape_list, + expected_masks_shape_list): + in_image = tf.placeholder(tf.float32, shape=(None, None, 3)) + in_masks = tf.placeholder(tf.float32, shape=(None, None, None)) + in_masks = tf.random_uniform(in_masks_shape) + out_image, out_masks, _ = preprocessor.resize_to_range( + in_image, in_masks, min_dimension=min_dim, max_dimension=max_dim) + out_image_shape = tf.shape(out_image) + out_masks_shape = tf.shape(out_masks) + + with self.test_session() as sess: + out_image_shape, out_masks_shape = sess.run( + [out_image_shape, out_masks_shape], + feed_dict={ + in_image: np.random.randn(*in_image_shape), + in_masks: np.random.randn(*in_masks_shape) + }) + self.assertAllEqual(out_image_shape, expected_image_shape) + self.assertAllEqual(out_masks_shape, expected_mask_shape) + + def testResizeToRangeWithInstanceMasksTensorOfSizeZero(self): + """Tests image resizing, checking output sizes.""" + in_image_shape_list = [[60, 40, 3], [15, 30, 3]] + in_masks_shape_list = [[0, 60, 40], [0, 15, 30]] + min_dim = 50 + max_dim = 100 + expected_image_shape_list = [[75, 50, 3], [50, 100, 3]] + expected_masks_shape_list = [[0, 75, 50], [0, 50, 100]] + + for (in_image_shape, expected_image_shape, in_masks_shape, + expected_mask_shape) in zip(in_image_shape_list, + expected_image_shape_list, + in_masks_shape_list, + expected_masks_shape_list): + in_image = tf.random_uniform(in_image_shape) + in_masks = tf.random_uniform(in_masks_shape) + out_image, out_masks, _ = preprocessor.resize_to_range( + in_image, in_masks, min_dimension=min_dim, max_dimension=max_dim) + out_image_shape = tf.shape(out_image) + out_masks_shape = tf.shape(out_masks) + + with self.test_session() as sess: + out_image_shape, out_masks_shape = sess.run( + [out_image_shape, out_masks_shape]) + self.assertAllEqual(out_image_shape, expected_image_shape) + self.assertAllEqual(out_masks_shape, expected_mask_shape) + + def testResizeToRange4DImageTensor(self): + image = tf.random_uniform([1, 200, 300, 3]) + with self.assertRaises(ValueError): + preprocessor.resize_to_range(image, 500, 600) + + def testResizeToRangeSameMinMax(self): + """Tests image resizing, checking output sizes.""" + in_shape_list = [[312, 312, 3], [299, 299, 3]] + min_dim = 320 + max_dim = 320 + expected_shape_list = [[320, 320, 3], [320, 320, 3]] + + for in_shape, expected_shape in zip(in_shape_list, expected_shape_list): + in_image = tf.random_uniform(in_shape) + out_image, _ = preprocessor.resize_to_range( + in_image, min_dimension=min_dim, max_dimension=max_dim) + out_image_shape = tf.shape(out_image) + + with self.test_session() as sess: + out_image_shape = sess.run(out_image_shape) + self.assertAllEqual(out_image_shape, expected_shape) + + def testResizeToMinDimensionTensorShapes(self): + in_image_shape_list = [[60, 55, 3], [15, 30, 3]] + in_masks_shape_list = [[15, 60, 55], [10, 15, 30]] + min_dim = 50 + expected_image_shape_list = [[60, 55, 3], [50, 100, 3]] + expected_masks_shape_list = [[15, 60, 55], [10, 50, 100]] + + for (in_image_shape, expected_image_shape, in_masks_shape, + expected_mask_shape) in zip(in_image_shape_list, + expected_image_shape_list, + in_masks_shape_list, + 
expected_masks_shape_list): + in_image = tf.placeholder(tf.float32, shape=(None, None, 3)) + in_masks = tf.placeholder(tf.float32, shape=(None, None, None)) + in_masks = tf.random_uniform(in_masks_shape) + out_image, out_masks, _ = preprocessor.resize_to_min_dimension( + in_image, in_masks, min_dimension=min_dim) + out_image_shape = tf.shape(out_image) + out_masks_shape = tf.shape(out_masks) + + with self.test_session() as sess: + out_image_shape, out_masks_shape = sess.run( + [out_image_shape, out_masks_shape], + feed_dict={ + in_image: np.random.randn(*in_image_shape), + in_masks: np.random.randn(*in_masks_shape) + }) + self.assertAllEqual(out_image_shape, expected_image_shape) + self.assertAllEqual(out_masks_shape, expected_mask_shape) + + def testResizeToMinDimensionWithInstanceMasksTensorOfSizeZero(self): + """Tests image resizing, checking output sizes.""" + in_image_shape_list = [[60, 40, 3], [15, 30, 3]] + in_masks_shape_list = [[0, 60, 40], [0, 15, 30]] + min_dim = 50 + expected_image_shape_list = [[75, 50, 3], [50, 100, 3]] + expected_masks_shape_list = [[0, 75, 50], [0, 50, 100]] + + for (in_image_shape, expected_image_shape, in_masks_shape, + expected_mask_shape) in zip(in_image_shape_list, + expected_image_shape_list, + in_masks_shape_list, + expected_masks_shape_list): + in_image = tf.random_uniform(in_image_shape) + in_masks = tf.random_uniform(in_masks_shape) + out_image, out_masks, _ = preprocessor.resize_to_min_dimension( + in_image, in_masks, min_dimension=min_dim) + out_image_shape = tf.shape(out_image) + out_masks_shape = tf.shape(out_masks) + + with self.test_session() as sess: + out_image_shape, out_masks_shape = sess.run( + [out_image_shape, out_masks_shape]) + self.assertAllEqual(out_image_shape, expected_image_shape) + self.assertAllEqual(out_masks_shape, expected_mask_shape) + + def testResizeToMinDimensionRaisesErrorOn4DImage(self): + image = tf.random_uniform([1, 200, 300, 3]) + with self.assertRaises(ValueError): + preprocessor.resize_to_min_dimension(image, 500) + + def testScaleBoxesToPixelCoordinates(self): + """Tests box scaling, checking scaled values.""" + in_shape = [60, 40, 3] + in_boxes = [[0.1, 0.2, 0.4, 0.6], + [0.5, 0.3, 0.9, 0.7]] + + expected_boxes = [[6., 8., 24., 24.], + [30., 12., 54., 28.]] + + in_image = tf.random_uniform(in_shape) + in_boxes = tf.constant(in_boxes) + _, out_boxes = preprocessor.scale_boxes_to_pixel_coordinates( + in_image, boxes=in_boxes) + with self.test_session() as sess: + out_boxes = sess.run(out_boxes) + self.assertAllClose(out_boxes, expected_boxes) + + def testScaleBoxesToPixelCoordinatesWithKeypoints(self): + """Tests box and keypoint scaling, checking scaled values.""" + in_shape = [60, 40, 3] + in_boxes = self.createTestBoxes() + in_keypoints = self.createTestKeypoints() + + expected_boxes = [[0., 10., 45., 40.], + [15., 20., 45., 40.]] + expected_keypoints = [ + [[6., 4.], [12., 8.], [18., 12.]], + [[24., 16.], [30., 20.], [36., 24.]], + ] + + in_image = tf.random_uniform(in_shape) + _, out_boxes, out_keypoints = preprocessor.scale_boxes_to_pixel_coordinates( + in_image, boxes=in_boxes, keypoints=in_keypoints) + with self.test_session() as sess: + out_boxes_, out_keypoints_ = sess.run([out_boxes, out_keypoints]) + self.assertAllClose(out_boxes_, expected_boxes) + self.assertAllClose(out_keypoints_, expected_keypoints) + + def testSubtractChannelMean(self): + """Tests whether channel means have been subtracted.""" + with self.test_session(): + image = tf.zeros((240, 320, 3)) + means = [1, 2, 3] + actual = 
preprocessor.subtract_channel_mean(image, means=means) + actual = actual.eval() + + self.assertTrue((actual[:, :, 0] == -1).all()) + self.assertTrue((actual[:, :, 1] == -2).all()) + self.assertTrue((actual[:, :, 2] == -3).all()) + + def testOneHotEncoding(self): + """Tests one hot encoding of multiclass labels.""" + with self.test_session(): + labels = tf.constant([1, 4, 2], dtype=tf.int32) + one_hot = preprocessor.one_hot_encoding(labels, num_classes=5) + one_hot = one_hot.eval() + + self.assertAllEqual([0, 1, 1, 0, 1], one_hot) + + def testSSDRandomCropWithCache(self): + preprocess_options = [ + (preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + }), + (preprocessor.ssd_random_crop, {})] + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=False, + test_keypoints=False) + + def testSSDRandomCrop(self): + preprocessing_options = [ + (preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + }), + (preprocessor.ssd_random_crop, {})] + images = self.createTestImages() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + } + distorted_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + distorted_images = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + + images_rank = tf.rank(images) + distorted_images_rank = tf.rank(distorted_images) + boxes_rank = tf.rank(boxes) + distorted_boxes_rank = tf.rank(distorted_boxes) + + with self.test_session() as sess: + (boxes_rank_, distorted_boxes_rank_, images_rank_, + distorted_images_rank_) = sess.run( + [boxes_rank, distorted_boxes_rank, images_rank, + distorted_images_rank]) + self.assertAllEqual(boxes_rank_, distorted_boxes_rank_) + self.assertAllEqual(images_rank_, distorted_images_rank_) + + def testSSDRandomCropWithMultiClassScores(self): + preprocessing_options = [(preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + }), (preprocessor.ssd_random_crop, {})] + images = self.createTestImages() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + multiclass_scores = self.createTestMultiClassScores() + + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + fields.InputDataFields.multiclass_scores: multiclass_scores, + } + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_multiclass_scores=True) + distorted_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, func_arg_map=preprocessor_arg_map) + distorted_images = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + distorted_multiclass_scores = distorted_tensor_dict[ + fields.InputDataFields.multiclass_scores] + + images_rank = tf.rank(images) + distorted_images_rank = tf.rank(distorted_images) + boxes_rank = tf.rank(boxes) + distorted_boxes_rank = tf.rank(distorted_boxes) + multiclass_scores_rank = tf.rank(multiclass_scores) + distorted_multiclass_scores_rank = tf.rank(distorted_multiclass_scores) + + with 
self.test_session() as sess: + (boxes_rank_, distorted_boxes_, distorted_boxes_rank_, images_rank_, + distorted_images_rank_, multiclass_scores_rank_, + distorted_multiclass_scores_, + distorted_multiclass_scores_rank_) = sess.run([ + boxes_rank, distorted_boxes, distorted_boxes_rank, images_rank, + distorted_images_rank, multiclass_scores_rank, + distorted_multiclass_scores, distorted_multiclass_scores_rank + ]) + self.assertAllEqual(boxes_rank_, distorted_boxes_rank_) + self.assertAllEqual(images_rank_, distorted_images_rank_) + self.assertAllEqual(multiclass_scores_rank_, + distorted_multiclass_scores_rank_) + self.assertAllEqual(distorted_boxes_.shape[0], + distorted_multiclass_scores_.shape[0]) + + def testSSDRandomCropPad(self): + images = self.createTestImages() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + preprocessing_options = [ + (preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + }), + (preprocessor.ssd_random_crop_pad, {})] + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + } + distorted_tensor_dict = preprocessor.preprocess(tensor_dict, + preprocessing_options) + distorted_images = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + + images_rank = tf.rank(images) + distorted_images_rank = tf.rank(distorted_images) + boxes_rank = tf.rank(boxes) + distorted_boxes_rank = tf.rank(distorted_boxes) + + with self.test_session() as sess: + (boxes_rank_, distorted_boxes_rank_, images_rank_, + distorted_images_rank_) = sess.run([ + boxes_rank, distorted_boxes_rank, images_rank, distorted_images_rank + ]) + self.assertAllEqual(boxes_rank_, distorted_boxes_rank_) + self.assertAllEqual(images_rank_, distorted_images_rank_) + + def testSSDRandomCropFixedAspectRatioWithCache(self): + preprocess_options = [ + (preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + }), + (preprocessor.ssd_random_crop_fixed_aspect_ratio, {})] + self._testPreprocessorCache(preprocess_options, + test_boxes=True, + test_masks=False, + test_keypoints=False) + + def _testSSDRandomCropFixedAspectRatio(self, + include_label_scores, + include_multiclass_scores, + include_instance_masks, + include_keypoints): + images = self.createTestImages() + boxes = self.createTestBoxes() + labels = self.createTestLabels() + preprocessing_options = [(preprocessor.normalize_image, { + 'original_minval': 0, + 'original_maxval': 255, + 'target_minval': 0, + 'target_maxval': 1 + }), (preprocessor.ssd_random_crop_fixed_aspect_ratio, {})] + tensor_dict = { + fields.InputDataFields.image: images, + fields.InputDataFields.groundtruth_boxes: boxes, + fields.InputDataFields.groundtruth_classes: labels, + } + if include_label_scores: + label_scores = self.createTestLabelScores() + tensor_dict[fields.InputDataFields.groundtruth_label_scores] = ( + label_scores) + if include_multiclass_scores: + multiclass_scores = self.createTestMultiClassScores() + tensor_dict[fields.InputDataFields.multiclass_scores] = ( + multiclass_scores) + if include_instance_masks: + masks = self.createTestMasks() + tensor_dict[fields.InputDataFields.groundtruth_instance_masks] = masks + if include_keypoints: + keypoints = self.createTestKeypoints() + 
tensor_dict[fields.InputDataFields.groundtruth_keypoints] = keypoints + + preprocessor_arg_map = preprocessor.get_default_func_arg_map( + include_label_scores=include_label_scores, + include_multiclass_scores=include_multiclass_scores, + include_instance_masks=include_instance_masks, + include_keypoints=include_keypoints) + distorted_tensor_dict = preprocessor.preprocess( + tensor_dict, preprocessing_options, func_arg_map=preprocessor_arg_map) + distorted_images = distorted_tensor_dict[fields.InputDataFields.image] + distorted_boxes = distorted_tensor_dict[ + fields.InputDataFields.groundtruth_boxes] + images_rank = tf.rank(images) + distorted_images_rank = tf.rank(distorted_images) + boxes_rank = tf.rank(boxes) + distorted_boxes_rank = tf.rank(distorted_boxes) + + with self.test_session() as sess: + (boxes_rank_, distorted_boxes_rank_, images_rank_, + distorted_images_rank_) = sess.run( + [boxes_rank, distorted_boxes_rank, images_rank, + distorted_images_rank]) + self.assertAllEqual(boxes_rank_, distorted_boxes_rank_) + self.assertAllEqual(images_rank_, distorted_images_rank_) + + def testSSDRandomCropFixedAspectRatio(self): + self._testSSDRandomCropFixedAspectRatio(include_label_scores=False, + include_multiclass_scores=False, + include_instance_masks=False, + include_keypoints=False) + + def testSSDRandomCropFixedAspectRatioWithMultiClassScores(self): + self._testSSDRandomCropFixedAspectRatio(include_label_scores=False, + include_multiclass_scores=True, + include_instance_masks=False, + include_keypoints=False) + + def testSSDRandomCropFixedAspectRatioWithMasksAndKeypoints(self): + self._testSSDRandomCropFixedAspectRatio(include_label_scores=False, + include_multiclass_scores=False, + include_instance_masks=True, + include_keypoints=True) + + def testSSDRandomCropFixedAspectRatioWithLabelScoresMasksAndKeypoints(self): + self._testSSDRandomCropFixedAspectRatio(include_label_scores=True, + include_multiclass_scores=False, + include_instance_masks=True, + include_keypoints=True) + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/region_similarity_calculator.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/region_similarity_calculator.py new file mode 100644 index 0000000000000000000000000000000000000000..f344006a3c56c95021dae47fcf5195a1b9743d85 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/region_similarity_calculator.py @@ -0,0 +1,114 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Region Similarity Calculators for BoxLists. + +Region Similarity Calculators compare a pairwise measure of similarity +between the boxes in two BoxLists. 
+""" +from abc import ABCMeta +from abc import abstractmethod + +import tensorflow as tf + +from object_detection.core import box_list_ops + + +class RegionSimilarityCalculator(object): + """Abstract base class for region similarity calculator.""" + __metaclass__ = ABCMeta + + def compare(self, boxlist1, boxlist2, scope=None): + """Computes matrix of pairwise similarity between BoxLists. + + This op (to be overriden) computes a measure of pairwise similarity between + the boxes in the given BoxLists. Higher values indicate more similarity. + + Note that this method simply measures similarity and does not explicitly + perform a matching. + + Args: + boxlist1: BoxList holding N boxes. + boxlist2: BoxList holding M boxes. + scope: Op scope name. Defaults to 'Compare' if None. + + Returns: + a (float32) tensor of shape [N, M] with pairwise similarity score. + """ + with tf.name_scope(scope, 'Compare', [boxlist1, boxlist2]) as scope: + return self._compare(boxlist1, boxlist2) + + @abstractmethod + def _compare(self, boxlist1, boxlist2): + pass + + +class IouSimilarity(RegionSimilarityCalculator): + """Class to compute similarity based on Intersection over Union (IOU) metric. + + This class computes pairwise similarity between two BoxLists based on IOU. + """ + + def _compare(self, boxlist1, boxlist2): + """Compute pairwise IOU similarity between the two BoxLists. + + Args: + boxlist1: BoxList holding N boxes. + boxlist2: BoxList holding M boxes. + + Returns: + A tensor with shape [N, M] representing pairwise iou scores. + """ + return box_list_ops.iou(boxlist1, boxlist2) + + +class NegSqDistSimilarity(RegionSimilarityCalculator): + """Class to compute similarity based on the squared distance metric. + + This class computes pairwise similarity between two BoxLists based on the + negative squared distance metric. + """ + + def _compare(self, boxlist1, boxlist2): + """Compute matrix of (negated) sq distances. + + Args: + boxlist1: BoxList holding N boxes. + boxlist2: BoxList holding M boxes. + + Returns: + A tensor with shape [N, M] representing negated pairwise squared distance. + """ + return -1 * box_list_ops.sq_dist(boxlist1, boxlist2) + + +class IoaSimilarity(RegionSimilarityCalculator): + """Class to compute similarity based on Intersection over Area (IOA) metric. + + This class computes pairwise similarity between two BoxLists based on their + pairwise intersections divided by the areas of second BoxLists. + """ + + def _compare(self, boxlist1, boxlist2): + """Compute pairwise IOA similarity between the two BoxLists. + + Args: + boxlist1: BoxList holding N boxes. + boxlist2: BoxList holding M boxes. + + Returns: + A tensor with shape [N, M] representing pairwise IOA scores. + """ + return box_list_ops.ioa(boxlist1, boxlist2) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/region_similarity_calculator_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/region_similarity_calculator_test.py new file mode 100644 index 0000000000000000000000000000000000000000..162151a3b53468a7724133ca681efc0df5293563 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/region_similarity_calculator_test.py @@ -0,0 +1,75 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for region_similarity_calculator.""" +import tensorflow as tf + +from object_detection.core import box_list +from object_detection.core import region_similarity_calculator + + +class RegionSimilarityCalculatorTest(tf.test.TestCase): + + def test_get_correct_pairwise_similarity_based_on_iou(self): + corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]) + corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0], + [0.0, 0.0, 20.0, 20.0]]) + exp_output = [[2.0 / 16.0, 0, 6.0 / 400.0], [1.0 / 16.0, 0.0, 5.0 / 400.0]] + boxes1 = box_list.BoxList(corners1) + boxes2 = box_list.BoxList(corners2) + iou_similarity_calculator = region_similarity_calculator.IouSimilarity() + iou_similarity = iou_similarity_calculator.compare(boxes1, boxes2) + with self.test_session() as sess: + iou_output = sess.run(iou_similarity) + self.assertAllClose(iou_output, exp_output) + + def test_get_correct_pairwise_similarity_based_on_squared_distances(self): + corners1 = tf.constant([[0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 0.0, 2.0]]) + corners2 = tf.constant([[3.0, 4.0, 1.0, 0.0], + [-4.0, 0.0, 0.0, 3.0], + [0.0, 0.0, 0.0, 0.0]]) + exp_output = [[-26, -25, 0], [-18, -27, -6]] + boxes1 = box_list.BoxList(corners1) + boxes2 = box_list.BoxList(corners2) + dist_similarity_calc = region_similarity_calculator.NegSqDistSimilarity() + dist_similarity = dist_similarity_calc.compare(boxes1, boxes2) + with self.test_session() as sess: + dist_output = sess.run(dist_similarity) + self.assertAllClose(dist_output, exp_output) + + def test_get_correct_pairwise_similarity_based_on_ioa(self): + corners1 = tf.constant([[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]) + corners2 = tf.constant([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0], + [0.0, 0.0, 20.0, 20.0]]) + exp_output_1 = [[2.0 / 12.0, 0, 6.0 / 400.0], + [1.0 / 12.0, 0.0, 5.0 / 400.0]] + exp_output_2 = [[2.0 / 6.0, 1.0 / 5.0], + [0, 0], + [6.0 / 6.0, 5.0 / 5.0]] + boxes1 = box_list.BoxList(corners1) + boxes2 = box_list.BoxList(corners2) + ioa_similarity_calculator = region_similarity_calculator.IoaSimilarity() + ioa_similarity_1 = ioa_similarity_calculator.compare(boxes1, boxes2) + ioa_similarity_2 = ioa_similarity_calculator.compare(boxes2, boxes1) + with self.test_session() as sess: + iou_output_1, iou_output_2 = sess.run( + [ioa_similarity_1, ioa_similarity_2]) + self.assertAllClose(iou_output_1, exp_output_1) + self.assertAllClose(iou_output_2, exp_output_2) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/standard_fields.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/standard_fields.py new file mode 100644 index 0000000000000000000000000000000000000000..11282da6deca075935d25e3558bfe1a25588fb20 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/standard_fields.py @@ -0,0 +1,227 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
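# --- Editor's note (worked check of the IOA test expectations above): boxes
# are [ymin, xmin, ymax, xmax], so corners1[0] = [4, 3, 7, 5] and
# corners2[0] = [3, 4, 6, 8] intersect over y in [4, 6] and x in [4, 5], an
# intersection area of 2.0. IOA divides by the area of the second argument's
# box: area(corners2[0]) = (6 - 3) * (8 - 4) = 12.0, giving
# exp_output_1[0][0] = 2.0 / 12.0; swapping the arguments divides by
# area(corners1[0]) = (7 - 4) * (5 - 3) = 6.0, giving
# exp_output_2[0][0] = 2.0 / 6.0.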
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Contains classes specifying naming conventions used for object detection. + + +Specifies: + InputDataFields: standard fields used by reader/preprocessor/batcher. + DetectionResultFields: standard fields returned by object detector. + BoxListFields: standard field used by BoxList + TfExampleFields: standard fields for tf-example data format (go/tf-example). +""" + + +class InputDataFields(object): + """Names for the input tensors. + + Holds the standard data field names to use for identifying input tensors. This + should be used by the decoder to identify keys for the returned tensor_dict + containing input tensors. And it should be used by the model to identify the + tensors it needs. + + Attributes: + image: image. + image_additional_channels: additional channels. + original_image: image in the original input size. + key: unique key corresponding to image. + source_id: source of the original image. + filename: original filename of the dataset (without common path). + groundtruth_image_classes: image-level class labels. + groundtruth_boxes: coordinates of the ground truth boxes in the image. + groundtruth_classes: box-level class labels. + groundtruth_label_types: box-level label types (e.g. explicit negative). + groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead] + is the groundtruth a single object or a crowd. + groundtruth_area: area of a groundtruth segment. + groundtruth_difficult: is a `difficult` object + groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of the + same class, forming a connected group, where instances are heavily + occluding each other. + proposal_boxes: coordinates of object proposal boxes. + proposal_objectness: objectness score of each proposal. + groundtruth_instance_masks: ground truth instance masks. + groundtruth_instance_boundaries: ground truth instance boundaries. + groundtruth_instance_classes: instance mask-level class labels. + groundtruth_keypoints: ground truth keypoints. + groundtruth_keypoint_visibilities: ground truth keypoint visibilities. + groundtruth_label_scores: groundtruth label scores. + groundtruth_weights: groundtruth weight factor for bounding boxes. + num_groundtruth_boxes: number of groundtruth boxes. + true_image_shapes: true shapes of images in the resized images, as resized + images can be padded with zeros. + verified_labels: list of human-verified image-level labels (note, that a + label can be verified both as positive and negative). + multiclass_scores: the label score per class for each box. 
+ """ + image = 'image' + image_additional_channels = 'image_additional_channels' + original_image = 'original_image' + key = 'key' + source_id = 'source_id' + filename = 'filename' + groundtruth_image_classes = 'groundtruth_image_classes' + groundtruth_boxes = 'groundtruth_boxes' + groundtruth_classes = 'groundtruth_classes' + groundtruth_label_types = 'groundtruth_label_types' + groundtruth_is_crowd = 'groundtruth_is_crowd' + groundtruth_area = 'groundtruth_area' + groundtruth_difficult = 'groundtruth_difficult' + groundtruth_group_of = 'groundtruth_group_of' + proposal_boxes = 'proposal_boxes' + proposal_objectness = 'proposal_objectness' + groundtruth_instance_masks = 'groundtruth_instance_masks' + groundtruth_instance_boundaries = 'groundtruth_instance_boundaries' + groundtruth_instance_classes = 'groundtruth_instance_classes' + groundtruth_keypoints = 'groundtruth_keypoints' + groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities' + groundtruth_label_scores = 'groundtruth_label_scores' + groundtruth_weights = 'groundtruth_weights' + num_groundtruth_boxes = 'num_groundtruth_boxes' + true_image_shape = 'true_image_shape' + verified_labels = 'verified_labels' + multiclass_scores = 'multiclass_scores' + + +class DetectionResultFields(object): + """Naming conventions for storing the output of the detector. + + Attributes: + source_id: source of the original image. + key: unique key corresponding to image. + detection_boxes: coordinates of the detection boxes in the image. + detection_scores: detection scores for the detection boxes in the image. + detection_classes: detection-level class labels. + detection_masks: contains a segmentation mask for each detection box. + detection_boundaries: contains an object boundary for each detection box. + detection_keypoints: contains detection keypoints for each detection box. + num_detections: number of detections in the batch. + """ + + source_id = 'source_id' + key = 'key' + detection_boxes = 'detection_boxes' + detection_scores = 'detection_scores' + detection_classes = 'detection_classes' + detection_masks = 'detection_masks' + detection_boundaries = 'detection_boundaries' + detection_keypoints = 'detection_keypoints' + num_detections = 'num_detections' + + +class BoxListFields(object): + """Naming conventions for BoxLists. + + Attributes: + boxes: bounding box coordinates. + classes: classes per bounding box. + scores: scores per bounding box. + weights: sample weights per bounding box. + objectness: objectness score per bounding box. + masks: masks per bounding box. + boundaries: boundaries per bounding box. + keypoints: keypoints per bounding box. + keypoint_heatmaps: keypoint heatmaps per bounding box. + is_crowd: is_crowd annotation per bounding box. + """ + boxes = 'boxes' + classes = 'classes' + scores = 'scores' + weights = 'weights' + objectness = 'objectness' + masks = 'masks' + boundaries = 'boundaries' + keypoints = 'keypoints' + keypoint_heatmaps = 'keypoint_heatmaps' + is_crowd = 'is_crowd' + + +class TfExampleFields(object): + """TF-example proto feature names for object detection. + + Holds the standard feature names to load from an Example proto for object + detection. + + Attributes: + image_encoded: JPEG encoded string + image_format: image format, e.g. "JPEG" + filename: filename + channels: number of channels of image + colorspace: colorspace, e.g. "RGB" + height: height of image in pixels, e.g. 462 + width: width of image in pixels, e.g. 
581 + source_id: original source of the image + image_class_text: image-level label in text format + image_class_label: image-level label in numerical format + object_class_text: labels in text format, e.g. ["person", "cat"] + object_class_label: labels in numbers, e.g. [16, 8] + object_bbox_xmin: xmin coordinates of groundtruth box, e.g. 10, 30 + object_bbox_xmax: xmax coordinates of groundtruth box, e.g. 50, 40 + object_bbox_ymin: ymin coordinates of groundtruth box, e.g. 40, 50 + object_bbox_ymax: ymax coordinates of groundtruth box, e.g. 80, 70 + object_view: viewpoint of object, e.g. ["frontal", "left"] + object_truncated: is object truncated, e.g. [true, false] + object_occluded: is object occluded, e.g. [true, false] + object_difficult: is object difficult, e.g. [true, false] + object_group_of: is object a single object or a group of objects + object_depiction: is object a depiction + object_is_crowd: [DEPRECATED, use object_group_of instead] + is the object a single object or a crowd + object_segment_area: the area of the segment. + object_weight: a weight factor for the object's bounding box. + instance_masks: instance segmentation masks. + instance_boundaries: instance boundaries. + instance_classes: Classes for each instance segmentation mask. + detection_class_label: class label in numbers. + detection_bbox_ymin: ymin coordinates of a detection box. + detection_bbox_xmin: xmin coordinates of a detection box. + detection_bbox_ymax: ymax coordinates of a detection box. + detection_bbox_xmax: xmax coordinates of a detection box. + detection_score: detection score for the class label and box. + """ + image_encoded = 'image/encoded' + image_format = 'image/format' # format is reserved keyword + filename = 'image/filename' + channels = 'image/channels' + colorspace = 'image/colorspace' + height = 'image/height' + width = 'image/width' + source_id = 'image/source_id' + image_class_text = 'image/class/text' + image_class_label = 'image/class/label' + object_class_text = 'image/object/class/text' + object_class_label = 'image/object/class/label' + object_bbox_ymin = 'image/object/bbox/ymin' + object_bbox_xmin = 'image/object/bbox/xmin' + object_bbox_ymax = 'image/object/bbox/ymax' + object_bbox_xmax = 'image/object/bbox/xmax' + object_view = 'image/object/view' + object_truncated = 'image/object/truncated' + object_occluded = 'image/object/occluded' + object_difficult = 'image/object/difficult' + object_group_of = 'image/object/group_of' + object_depiction = 'image/object/depiction' + object_is_crowd = 'image/object/is_crowd' + object_segment_area = 'image/object/segment/area' + object_weight = 'image/object/weight' + instance_masks = 'image/segmentation/object' + instance_boundaries = 'image/boundaries/object' + instance_classes = 'image/segmentation/object/class' + detection_class_label = 'image/detection/label' + detection_bbox_ymin = 'image/detection/bbox/ymin' + detection_bbox_xmin = 'image/detection/bbox/xmin' + detection_bbox_ymax = 'image/detection/bbox/ymax' + detection_bbox_xmax = 'image/detection/bbox/xmax' + detection_score = 'image/detection/score' diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/target_assigner.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/target_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..14e66def1fe0a873c96900288290491718d3d5ab --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/target_assigner.py @@ -0,0 +1,458 @@ +# Copyright 2017 
The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Base target assigner module. + +The job of a TargetAssigner is, for a given set of anchors (bounding boxes) and +groundtruth detections (bounding boxes), to assign classification and regression +targets to each anchor as well as weights to each anchor (specifying, e.g., +which anchors should not contribute to training loss). + +It assigns classification/regression targets by performing the following steps: +1) Computing pairwise similarity between anchors and groundtruth boxes using a + provided RegionSimilarity Calculator +2) Computing a matching based on the similarity matrix using a provided Matcher +3) Assigning regression targets based on the matching and a provided BoxCoder +4) Assigning classification targets based on the matching and groundtruth labels + +Note that TargetAssigners only operate on detections from a single +image at a time, so any logic for applying a TargetAssigner to multiple +images must be handled externally. +""" +import tensorflow as tf + +from object_detection.box_coders import faster_rcnn_box_coder +from object_detection.box_coders import mean_stddev_box_coder +from object_detection.core import box_coder as bcoder +from object_detection.core import box_list +from object_detection.core import matcher as mat +from object_detection.core import region_similarity_calculator as sim_calc +from object_detection.core import standard_fields as fields +from object_detection.matchers import argmax_matcher +from object_detection.matchers import bipartite_matcher +from object_detection.utils import shape_utils + + +class TargetAssigner(object): + """Target assigner to compute classification and regression targets.""" + + def __init__(self, similarity_calc, matcher, box_coder, + negative_class_weight=1.0, unmatched_cls_target=None): + """Construct Object Detection Target Assigner. + + Args: + similarity_calc: a RegionSimilarityCalculator + matcher: an object_detection.core.Matcher used to match groundtruth to + anchors. + box_coder: an object_detection.core.BoxCoder used to encode matching + groundtruth boxes with respect to anchors. + negative_class_weight: classification weight to be associated to negative + anchors (default: 1.0). The weight must be in [0., 1.]. + unmatched_cls_target: a float32 tensor with shape [d_1, d_2, ..., d_k] + which is consistent with the classification target for each + anchor (and can be empty for scalar targets). This shape must thus be + compatible with the groundtruth labels that are passed to the "assign" + function (which have shape [num_gt_boxes, d_1, d_2, ..., d_k]). + If set to None, unmatched_cls_target is set to be [0] for each anchor. 
+ + Raises: + ValueError: if similarity_calc is not a RegionSimilarityCalculator or + if matcher is not a Matcher or if box_coder is not a BoxCoder + """ + if not isinstance(similarity_calc, sim_calc.RegionSimilarityCalculator): + raise ValueError('similarity_calc must be a RegionSimilarityCalculator') + if not isinstance(matcher, mat.Matcher): + raise ValueError('matcher must be a Matcher') + if not isinstance(box_coder, bcoder.BoxCoder): + raise ValueError('box_coder must be a BoxCoder') + self._similarity_calc = similarity_calc + self._matcher = matcher + self._box_coder = box_coder + self._negative_class_weight = negative_class_weight + if unmatched_cls_target is None: + self._unmatched_cls_target = tf.constant([0], tf.float32) + else: + self._unmatched_cls_target = unmatched_cls_target + + @property + def box_coder(self): + return self._box_coder + + def assign(self, anchors, groundtruth_boxes, groundtruth_labels=None, + groundtruth_weights=None, **params): + """Assign classification and regression targets to each anchor. + + For a given set of anchors and groundtruth detections, match anchors + to groundtruth_boxes and assign classification and regression targets to + each anchor as well as weights based on the resulting match (specifying, + e.g., which anchors should not contribute to training loss). + + Anchors that are not matched to anything are given a classification target + of self._unmatched_cls_target which can be specified via the constructor. + + Args: + anchors: a BoxList representing N anchors + groundtruth_boxes: a BoxList representing M groundtruth boxes + groundtruth_labels: a tensor of shape [M, d_1, ... d_k] + with labels for each of the ground_truth boxes. The subshape + [d_1, ... d_k] can be empty (corresponding to scalar inputs). When set + to None, groundtruth_labels assumes a binary problem where all + ground_truth boxes get a positive label (of 1). + groundtruth_weights: a float tensor of shape [M] indicating the weight to + assign to all anchors match to a particular groundtruth box. The weights + must be in [0., 1.]. If None, all weights are set to 1. + **params: Additional keyword arguments for specific implementations of + the Matcher. + + Returns: + cls_targets: a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k], + where the subshape [d_1, ..., d_k] is compatible with groundtruth_labels + which has shape [num_gt_boxes, d_1, d_2, ... d_k]. + cls_weights: a float32 tensor with shape [num_anchors] + reg_targets: a float32 tensor with shape [num_anchors, box_code_dimension] + reg_weights: a float32 tensor with shape [num_anchors] + match: a matcher.Match object encoding the match between anchors and + groundtruth boxes, with rows corresponding to groundtruth boxes + and columns corresponding to anchors. 
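+
+    Example (an illustrative sketch only, not a prescribed configuration; the
+    similarity calculator, matcher and box coder shown are components of the
+    same types that create_target_assigner builds for the 'FasterRCNN'
+    proposal stage, with some constructor options omitted for brevity):
+      assigner = TargetAssigner(
+          sim_calc.IouSimilarity(),
+          argmax_matcher.ArgMaxMatcher(matched_threshold=0.7,
+                                       unmatched_threshold=0.3),
+          faster_rcnn_box_coder.FasterRcnnBoxCoder())
+      (cls_targets, cls_weights, reg_targets,
+       reg_weights, match) = assigner.assign(anchors, groundtruth_boxes,
+                                             groundtruth_labels)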
+
+    Raises:
+      ValueError: if anchors or groundtruth_boxes are not of type
+        box_list.BoxList
+    """
+    if not isinstance(anchors, box_list.BoxList):
+      raise ValueError('anchors must be a BoxList')
+    if not isinstance(groundtruth_boxes, box_list.BoxList):
+      raise ValueError('groundtruth_boxes must be a BoxList')
+
+    if groundtruth_labels is None:
+      groundtruth_labels = tf.ones(tf.expand_dims(groundtruth_boxes.num_boxes(),
+                                                  0))
+      groundtruth_labels = tf.expand_dims(groundtruth_labels, -1)
+    unmatched_shape_assert = shape_utils.assert_shape_equal(
+        shape_utils.combined_static_and_dynamic_shape(groundtruth_labels)[1:],
+        shape_utils.combined_static_and_dynamic_shape(
+            self._unmatched_cls_target))
+    labels_and_box_shapes_assert = shape_utils.assert_shape_equal(
+        shape_utils.combined_static_and_dynamic_shape(
+            groundtruth_labels)[:1],
+        shape_utils.combined_static_and_dynamic_shape(
+            groundtruth_boxes.get())[:1])
+
+    if groundtruth_weights is None:
+      num_gt_boxes = groundtruth_boxes.num_boxes_static()
+      if not num_gt_boxes:
+        num_gt_boxes = groundtruth_boxes.num_boxes()
+      groundtruth_weights = tf.ones([num_gt_boxes], dtype=tf.float32)
+    with tf.control_dependencies(
+        [unmatched_shape_assert, labels_and_box_shapes_assert]):
+      match_quality_matrix = self._similarity_calc.compare(groundtruth_boxes,
+                                                           anchors)
+      match = self._matcher.match(match_quality_matrix, **params)
+      reg_targets = self._create_regression_targets(anchors,
+                                                    groundtruth_boxes,
+                                                    match)
+      cls_targets = self._create_classification_targets(groundtruth_labels,
+                                                         match)
+      reg_weights = self._create_regression_weights(match, groundtruth_weights)
+      cls_weights = self._create_classification_weights(match,
+                                                        groundtruth_weights)
+
+    num_anchors = anchors.num_boxes_static()
+    if num_anchors is not None:
+      reg_targets = self._reset_target_shape(reg_targets, num_anchors)
+      cls_targets = self._reset_target_shape(cls_targets, num_anchors)
+      reg_weights = self._reset_target_shape(reg_weights, num_anchors)
+      cls_weights = self._reset_target_shape(cls_weights, num_anchors)
+
+    return cls_targets, cls_weights, reg_targets, reg_weights, match
+
+  def _reset_target_shape(self, target, num_anchors):
+    """Sets the static shape of the target.
+
+    Args:
+      target: the target tensor. Its first dimension will be overwritten.
+      num_anchors: the number of anchors, which is used to override the target's
+        first dimension.
+
+    Returns:
+      A tensor with the shape info filled in.
+    """
+    target_shape = target.get_shape().as_list()
+    target_shape[0] = num_anchors
+    target.set_shape(target_shape)
+    return target
+
+  def _create_regression_targets(self, anchors, groundtruth_boxes, match):
+    """Returns a regression target for each anchor.
+
+    Args:
+      anchors: a BoxList representing N anchors
+      groundtruth_boxes: a BoxList representing M groundtruth_boxes
+      match: a matcher.Match object
+
+    Returns:
+      reg_targets: a float32 tensor with shape [N, box_code_dimension]
+    """
+    matched_gt_boxes = match.gather_based_on_match(
+        groundtruth_boxes.get(),
+        unmatched_value=tf.zeros(4),
+        ignored_value=tf.zeros(4))
+    matched_gt_boxlist = box_list.BoxList(matched_gt_boxes)
+    if groundtruth_boxes.has_field(fields.BoxListFields.keypoints):
+      groundtruth_keypoints = groundtruth_boxes.get_field(
+          fields.BoxListFields.keypoints)
+      matched_keypoints = match.gather_based_on_match(
+          groundtruth_keypoints,
+          unmatched_value=tf.zeros(groundtruth_keypoints.get_shape()[1:]),
+          ignored_value=tf.zeros(groundtruth_keypoints.get_shape()[1:]))
+      matched_gt_boxlist.add_field(fields.BoxListFields.keypoints,
+                                   matched_keypoints)
+    matched_reg_targets = self._box_coder.encode(matched_gt_boxlist, anchors)
+    match_results_shape = shape_utils.combined_static_and_dynamic_shape(
+        match.match_results)
+
+    # Zero out the unmatched and ignored regression targets.
+    unmatched_ignored_reg_targets = tf.tile(
+        self._default_regression_target(), [match_results_shape[0], 1])
+    matched_anchors_mask = match.matched_column_indicator()
+    reg_targets = tf.where(matched_anchors_mask,
+                           matched_reg_targets,
+                           unmatched_ignored_reg_targets)
+    return reg_targets
+
+  def _default_regression_target(self):
+    """Returns the default target for anchors to regress to.
+
+    Default regression targets are set to zero (though in
+    this implementation what these targets are set to should
+    not matter as the regression weight of any box set to
+    regress to the default target is zero).
+
+    Returns:
+      default_target: a float32 tensor with shape [1, box_code_dimension]
+    """
+    return tf.constant([self._box_coder.code_size*[0]], tf.float32)
+
+  def _create_classification_targets(self, groundtruth_labels, match):
+    """Creates classification targets for each anchor.
+
+    Assigns a classification target for each anchor to the matching
+    groundtruth label that is provided by match. Anchors that are not matched
+    to anything are given the target self._unmatched_cls_target.
+
+    Args:
+      groundtruth_labels: a tensor of shape [num_gt_boxes, d_1, ... d_k]
+        with labels for each of the ground_truth boxes. The subshape
+        [d_1, ... d_k] can be empty (corresponding to scalar labels).
+      match: a matcher.Match object that provides a matching between anchors
+        and groundtruth boxes.
+
+    Returns:
+      a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k], where the
+      subshape [d_1, ..., d_k] is compatible with groundtruth_labels which has
+      shape [num_gt_boxes, d_1, d_2, ... d_k].
+    """
+    return match.gather_based_on_match(
+        groundtruth_labels,
+        unmatched_value=self._unmatched_cls_target,
+        ignored_value=self._unmatched_cls_target)
+
+  def _create_regression_weights(self, match, groundtruth_weights):
+    """Sets the regression weight for each anchor.
+
+    Only positive anchors are set to contribute to the regression loss, so this
+    method returns a weight of 1 for every positive anchor and 0 for every
+    negative anchor.
+
+    Args:
+      match: a matcher.Match object that provides a matching between anchors
+        and groundtruth boxes.
+      groundtruth_weights: a float tensor of shape [M] indicating the weight to
+        assign to all anchors matched to a particular groundtruth box.
+
+    Returns:
+      a float32 tensor with shape [num_anchors] representing regression weights.
+ """ + return match.gather_based_on_match( + groundtruth_weights, ignored_value=0., unmatched_value=0.) + + def _create_classification_weights(self, + match, + groundtruth_weights): + """Create classification weights for each anchor. + + Positive (matched) anchors are associated with a weight of + positive_class_weight and negative (unmatched) anchors are associated with + a weight of negative_class_weight. When anchors are ignored, weights are set + to zero. By default, both positive/negative weights are set to 1.0, + but they can be adjusted to handle class imbalance (which is almost always + the case in object detection). + + Args: + match: a matcher.Match object that provides a matching between anchors + and groundtruth boxes. + groundtruth_weights: a float tensor of shape [M] indicating the weight to + assign to all anchors match to a particular groundtruth box. + + Returns: + a float32 tensor with shape [num_anchors] representing classification + weights. + """ + return match.gather_based_on_match( + groundtruth_weights, + ignored_value=0., + unmatched_value=self._negative_class_weight) + + def get_box_coder(self): + """Get BoxCoder of this TargetAssigner. + + Returns: + BoxCoder object. + """ + return self._box_coder + + +# TODO(rathodv): This method pulls in all the implementation dependencies into +# core. Therefore its best to have this factory method outside of core. +def create_target_assigner(reference, stage=None, + negative_class_weight=1.0, + unmatched_cls_target=None): + """Factory function for creating standard target assigners. + + Args: + reference: string referencing the type of TargetAssigner. + stage: string denoting stage: {proposal, detection}. + negative_class_weight: classification weight to be associated to negative + anchors (default: 1.0) + unmatched_cls_target: a float32 tensor with shape [d_1, d_2, ..., d_k] + which is consistent with the classification target for each + anchor (and can be empty for scalar targets). This shape must thus be + compatible with the groundtruth labels that are passed to the Assign + function (which have shape [num_gt_boxes, d_1, d_2, ..., d_k]). + If set to None, unmatched_cls_target is set to be 0 for each anchor. + + Returns: + TargetAssigner: desired target assigner. + + Raises: + ValueError: if combination reference+stage is invalid. + """ + if reference == 'Multibox' and stage == 'proposal': + similarity_calc = sim_calc.NegSqDistSimilarity() + matcher = bipartite_matcher.GreedyBipartiteMatcher() + box_coder = mean_stddev_box_coder.MeanStddevBoxCoder() + + elif reference == 'FasterRCNN' and stage == 'proposal': + similarity_calc = sim_calc.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.7, + unmatched_threshold=0.3, + force_match_for_each_row=True) + box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder( + scale_factors=[10.0, 10.0, 5.0, 5.0]) + + elif reference == 'FasterRCNN' and stage == 'detection': + similarity_calc = sim_calc.IouSimilarity() + # Uses all proposals with IOU < 0.5 as candidate negatives. 
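+    # Note: when unmatched_threshold is left unset, ArgMaxMatcher defaults it
+    # to matched_threshold, so there is no "ignore" band in this branch:
+    # proposals with IOU >= 0.5 become positives and everything below 0.5
+    # becomes a negative.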
+ matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + negatives_lower_than_unmatched=True) + box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder( + scale_factors=[10.0, 10.0, 5.0, 5.0]) + + elif reference == 'FastRCNN': + similarity_calc = sim_calc.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + unmatched_threshold=0.1, + force_match_for_each_row=False, + negatives_lower_than_unmatched=False) + box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder() + + else: + raise ValueError('No valid combination of reference and stage.') + + return TargetAssigner(similarity_calc, matcher, box_coder, + negative_class_weight=negative_class_weight, + unmatched_cls_target=unmatched_cls_target) + + +def batch_assign_targets(target_assigner, + anchors_batch, + gt_box_batch, + gt_class_targets_batch, + gt_weights_batch=None): + """Batched assignment of classification and regression targets. + + Args: + target_assigner: a target assigner. + anchors_batch: BoxList representing N box anchors or list of BoxList objects + with length batch_size representing anchor sets. + gt_box_batch: a list of BoxList objects with length batch_size + representing groundtruth boxes for each image in the batch + gt_class_targets_batch: a list of tensors with length batch_size, where + each tensor has shape [num_gt_boxes_i, classification_target_size] and + num_gt_boxes_i is the number of boxes in the ith boxlist of + gt_box_batch. + gt_weights_batch: A list of 1-D tf.float32 tensors of shape + [num_boxes] containing weights for groundtruth boxes. + + Returns: + batch_cls_targets: a tensor with shape [batch_size, num_anchors, + num_classes], + batch_cls_weights: a tensor with shape [batch_size, num_anchors], + batch_reg_targets: a tensor with shape [batch_size, num_anchors, + box_code_dimension] + batch_reg_weights: a tensor with shape [batch_size, num_anchors], + match_list: a list of matcher.Match objects encoding the match between + anchors and groundtruth boxes for each image of the batch, + with rows of the Match objects corresponding to groundtruth boxes + and columns corresponding to anchors. + Raises: + ValueError: if input list lengths are inconsistent, i.e., + batch_size == len(gt_box_batch) == len(gt_class_targets_batch) + and batch_size == len(anchors_batch) unless anchors_batch is a single + BoxList. 
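+
+  Example (a sketch under assumed names: `assigner` is any TargetAssigner,
+  e.g. one returned by create_target_assigner('FasterRCNN', 'detection');
+  the anchor BoxList and the per-image groundtruth BoxLists / label tensors
+  are placeholders for real inputs):
+    (batch_cls_targets, batch_cls_weights, batch_reg_targets,
+     batch_reg_weights, match_list) = batch_assign_targets(
+         assigner, anchors_boxlist, [gt_boxlist_1, gt_boxlist_2],
+         [gt_classes_1, gt_classes_2])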
+ """ + if not isinstance(anchors_batch, list): + anchors_batch = len(gt_box_batch) * [anchors_batch] + if not all( + isinstance(anchors, box_list.BoxList) for anchors in anchors_batch): + raise ValueError('anchors_batch must be a BoxList or list of BoxLists.') + if not (len(anchors_batch) + == len(gt_box_batch) + == len(gt_class_targets_batch)): + raise ValueError('batch size incompatible with lengths of anchors_batch, ' + 'gt_box_batch and gt_class_targets_batch.') + cls_targets_list = [] + cls_weights_list = [] + reg_targets_list = [] + reg_weights_list = [] + match_list = [] + if gt_weights_batch is None: + gt_weights_batch = [None] * len(gt_class_targets_batch) + for anchors, gt_boxes, gt_class_targets, gt_weights in zip( + anchors_batch, gt_box_batch, gt_class_targets_batch, gt_weights_batch): + (cls_targets, cls_weights, reg_targets, + reg_weights, match) = target_assigner.assign( + anchors, gt_boxes, gt_class_targets, gt_weights) + cls_targets_list.append(cls_targets) + cls_weights_list.append(cls_weights) + reg_targets_list.append(reg_targets) + reg_weights_list.append(reg_weights) + match_list.append(match) + batch_cls_targets = tf.stack(cls_targets_list) + batch_cls_weights = tf.stack(cls_weights_list) + batch_reg_targets = tf.stack(reg_targets_list) + batch_reg_weights = tf.stack(reg_weights_list) + return (batch_cls_targets, batch_cls_weights, batch_reg_targets, + batch_reg_weights, match_list) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/core/target_assigner_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/target_assigner_test.py new file mode 100644 index 0000000000000000000000000000000000000000..34a35b6435bcd364faf36ba4f130f1310f6d8b22 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/core/target_assigner_test.py @@ -0,0 +1,827 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for object_detection.core.target_assigner.""" +import numpy as np +import tensorflow as tf + +from object_detection.box_coders import keypoint_box_coder +from object_detection.box_coders import mean_stddev_box_coder +from object_detection.core import box_list +from object_detection.core import region_similarity_calculator +from object_detection.core import standard_fields as fields +from object_detection.core import target_assigner as targetassigner +from object_detection.matchers import argmax_matcher +from object_detection.matchers import bipartite_matcher +from object_detection.utils import test_case + + +class TargetAssignerTest(test_case.TestCase): + + def test_assign_agnostic(self): + def graph_fn(anchor_means, groundtruth_box_corners): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + unmatched_threshold=0.5) + box_coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=0.1) + target_assigner = targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, unmatched_cls_target=None) + anchors_boxlist = box_list.BoxList(anchor_means) + groundtruth_boxlist = box_list.BoxList(groundtruth_box_corners) + result = target_assigner.assign(anchors_boxlist, groundtruth_boxlist) + (cls_targets, cls_weights, reg_targets, reg_weights, _) = result + return (cls_targets, cls_weights, reg_targets, reg_weights) + + anchor_means = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 1.0, 0.8], + [0, 0.5, .5, 1.0]], dtype=np.float32) + groundtruth_box_corners = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.9, 0.9]], + dtype=np.float32) + exp_cls_targets = [[1], [1], [0]] + exp_cls_weights = [1, 1, 1] + exp_reg_targets = [[0, 0, 0, 0], + [0, 0, -1, 1], + [0, 0, 0, 0]] + exp_reg_weights = [1, 1, 0] + + (cls_targets_out, + cls_weights_out, reg_targets_out, reg_weights_out) = self.execute( + graph_fn, [anchor_means, groundtruth_box_corners]) + self.assertAllClose(cls_targets_out, exp_cls_targets) + self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_targets_out, exp_reg_targets) + self.assertAllClose(reg_weights_out, exp_reg_weights) + self.assertEquals(cls_targets_out.dtype, np.float32) + self.assertEquals(cls_weights_out.dtype, np.float32) + self.assertEquals(reg_targets_out.dtype, np.float32) + self.assertEquals(reg_weights_out.dtype, np.float32) + + def test_assign_class_agnostic_with_ignored_matches(self): + # Note: test is very similar to above. The third box matched with an IOU + # of 0.35, which is between the matched and unmatched threshold. This means + # That like above the expected classification targets are [1, 1, 0]. + # Unlike above, the third target is ignored and therefore expected + # classification weights are [1, 1, 0]. 
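+    # With matched_threshold=0.5 and unmatched_threshold=0.3 below, anchors
+    # whose best IOU falls in [0.3, 0.5) are ignored by the matcher, which is
+    # why the third anchor ends up with a classification weight of 0.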
+ def graph_fn(anchor_means, groundtruth_box_corners): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + unmatched_threshold=0.3) + box_coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=0.1) + target_assigner = targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, unmatched_cls_target=None) + anchors_boxlist = box_list.BoxList(anchor_means) + groundtruth_boxlist = box_list.BoxList(groundtruth_box_corners) + result = target_assigner.assign(anchors_boxlist, groundtruth_boxlist) + (cls_targets, cls_weights, reg_targets, reg_weights, _) = result + return (cls_targets, cls_weights, reg_targets, reg_weights) + + anchor_means = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 1.0, 0.8], + [0.0, 0.5, .9, 1.0]], dtype=np.float32) + groundtruth_box_corners = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.9, 0.9]], dtype=np.float32) + exp_cls_targets = [[1], [1], [0]] + exp_cls_weights = [1, 1, 0] + exp_reg_targets = [[0, 0, 0, 0], + [0, 0, -1, 1], + [0, 0, 0, 0]] + exp_reg_weights = [1, 1, 0] + (cls_targets_out, + cls_weights_out, reg_targets_out, reg_weights_out) = self.execute( + graph_fn, [anchor_means, groundtruth_box_corners]) + self.assertAllClose(cls_targets_out, exp_cls_targets) + self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_targets_out, exp_reg_targets) + self.assertAllClose(reg_weights_out, exp_reg_weights) + self.assertEquals(cls_targets_out.dtype, np.float32) + self.assertEquals(cls_weights_out.dtype, np.float32) + self.assertEquals(reg_targets_out.dtype, np.float32) + self.assertEquals(reg_weights_out.dtype, np.float32) + + def test_assign_agnostic_with_keypoints(self): + def graph_fn(anchor_means, groundtruth_box_corners, + groundtruth_keypoints): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + unmatched_threshold=0.5) + box_coder = keypoint_box_coder.KeypointBoxCoder( + num_keypoints=6, scale_factors=[10.0, 10.0, 5.0, 5.0]) + target_assigner = targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, unmatched_cls_target=None) + anchors_boxlist = box_list.BoxList(anchor_means) + groundtruth_boxlist = box_list.BoxList(groundtruth_box_corners) + groundtruth_boxlist.add_field(fields.BoxListFields.keypoints, + groundtruth_keypoints) + result = target_assigner.assign(anchors_boxlist, groundtruth_boxlist) + (cls_targets, cls_weights, reg_targets, reg_weights, _) = result + return (cls_targets, cls_weights, reg_targets, reg_weights) + + anchor_means = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 1.0, 1.0], + [0.0, 0.5, .9, 1.0]], dtype=np.float32) + groundtruth_box_corners = np.array([[0.0, 0.0, 0.5, 0.5], + [0.45, 0.45, 0.95, 0.95]], + dtype=np.float32) + groundtruth_keypoints = np.array( + [[[0.1, 0.2], [0.1, 0.3], [0.2, 0.2], [0.2, 0.2], [0.1, 0.1], [0.9, 0]], + [[0, 0.3], [0.2, 0.4], [0.5, 0.6], [0, 0.6], [0.8, 0.2], [0.2, 0.4]]], + dtype=np.float32) + exp_cls_targets = [[1], [1], [0]] + exp_cls_weights = [1, 1, 1] + exp_reg_targets = [[0, 0, 0, 0, -3, -1, -3, 1, -1, -1, -1, -1, -3, -3, 13, + -5], + [-1, -1, 0, 0, -15, -9, -11, -7, -5, -3, -15, -3, 1, -11, + -11, -7], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] + exp_reg_weights = [1, 1, 0] + (cls_targets_out, cls_weights_out, reg_targets_out, + reg_weights_out) = self.execute(graph_fn, [anchor_means, + groundtruth_box_corners, + groundtruth_keypoints]) + self.assertAllClose(cls_targets_out, 
exp_cls_targets) + self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_targets_out, exp_reg_targets) + self.assertAllClose(reg_weights_out, exp_reg_weights) + self.assertEquals(cls_targets_out.dtype, np.float32) + self.assertEquals(cls_weights_out.dtype, np.float32) + self.assertEquals(reg_targets_out.dtype, np.float32) + self.assertEquals(reg_weights_out.dtype, np.float32) + + def test_assign_class_agnostic_with_keypoints_and_ignored_matches(self): + # Note: test is very similar to above. The third box matched with an IOU + # of 0.35, which is between the matched and unmatched threshold. This means + # That like above the expected classification targets are [1, 1, 0]. + # Unlike above, the third target is ignored and therefore expected + # classification weights are [1, 1, 0]. + def graph_fn(anchor_means, groundtruth_box_corners, + groundtruth_keypoints): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + unmatched_threshold=0.5) + box_coder = keypoint_box_coder.KeypointBoxCoder( + num_keypoints=6, scale_factors=[10.0, 10.0, 5.0, 5.0]) + target_assigner = targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, unmatched_cls_target=None) + anchors_boxlist = box_list.BoxList(anchor_means) + groundtruth_boxlist = box_list.BoxList(groundtruth_box_corners) + groundtruth_boxlist.add_field(fields.BoxListFields.keypoints, + groundtruth_keypoints) + result = target_assigner.assign(anchors_boxlist, groundtruth_boxlist) + (cls_targets, cls_weights, reg_targets, reg_weights, _) = result + return (cls_targets, cls_weights, reg_targets, reg_weights) + + anchor_means = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 1.0, 1.0], + [0.0, 0.5, .9, 1.0]], dtype=np.float32) + groundtruth_box_corners = np.array([[0.0, 0.0, 0.5, 0.5], + [0.45, 0.45, 0.95, 0.95]], + dtype=np.float32) + groundtruth_keypoints = np.array( + [[[0.1, 0.2], [0.1, 0.3], [0.2, 0.2], [0.2, 0.2], [0.1, 0.1], [0.9, 0]], + [[0, 0.3], [0.2, 0.4], [0.5, 0.6], [0, 0.6], [0.8, 0.2], [0.2, 0.4]]], + dtype=np.float32) + exp_cls_targets = [[1], [1], [0]] + exp_cls_weights = [1, 1, 1] + exp_reg_targets = [[0, 0, 0, 0, -3, -1, -3, 1, -1, -1, -1, -1, -3, -3, 13, + -5], + [-1, -1, 0, 0, -15, -9, -11, -7, -5, -3, -15, -3, 1, -11, + -11, -7], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] + exp_reg_weights = [1, 1, 0] + (cls_targets_out, cls_weights_out, reg_targets_out, + reg_weights_out) = self.execute(graph_fn, [anchor_means, + groundtruth_box_corners, + groundtruth_keypoints]) + self.assertAllClose(cls_targets_out, exp_cls_targets) + self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_targets_out, exp_reg_targets) + self.assertAllClose(reg_weights_out, exp_reg_weights) + self.assertEquals(cls_targets_out.dtype, np.float32) + self.assertEquals(cls_weights_out.dtype, np.float32) + self.assertEquals(reg_targets_out.dtype, np.float32) + self.assertEquals(reg_weights_out.dtype, np.float32) + + def test_assign_multiclass(self): + + def graph_fn(anchor_means, groundtruth_box_corners, groundtruth_labels): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + unmatched_threshold=0.5) + box_coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=0.1) + unmatched_cls_target = tf.constant([1, 0, 0, 0, 0, 0, 0], tf.float32) + target_assigner = targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, + 
unmatched_cls_target=unmatched_cls_target) + + anchors_boxlist = box_list.BoxList(anchor_means) + groundtruth_boxlist = box_list.BoxList(groundtruth_box_corners) + result = target_assigner.assign(anchors_boxlist, groundtruth_boxlist, + groundtruth_labels) + (cls_targets, cls_weights, reg_targets, reg_weights, _) = result + return (cls_targets, cls_weights, reg_targets, reg_weights) + + anchor_means = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 1.0, 0.8], + [0, 0.5, .5, 1.0], + [.75, 0, 1.0, .25]], dtype=np.float32) + groundtruth_box_corners = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.9, 0.9], + [.75, 0, .95, .27]], dtype=np.float32) + groundtruth_labels = np.array([[0, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0], + [0, 0, 0, 1, 0, 0, 0]], dtype=np.float32) + + exp_cls_targets = [[0, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0], + [1, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 1, 0, 0, 0]] + exp_cls_weights = [1, 1, 1, 1] + exp_reg_targets = [[0, 0, 0, 0], + [0, 0, -1, 1], + [0, 0, 0, 0], + [0, 0, -.5, .2]] + exp_reg_weights = [1, 1, 0, 1] + + (cls_targets_out, + cls_weights_out, reg_targets_out, reg_weights_out) = self.execute( + graph_fn, [anchor_means, groundtruth_box_corners, groundtruth_labels]) + self.assertAllClose(cls_targets_out, exp_cls_targets) + self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_targets_out, exp_reg_targets) + self.assertAllClose(reg_weights_out, exp_reg_weights) + self.assertEquals(cls_targets_out.dtype, np.float32) + self.assertEquals(cls_weights_out.dtype, np.float32) + self.assertEquals(reg_targets_out.dtype, np.float32) + self.assertEquals(reg_weights_out.dtype, np.float32) + + def test_assign_multiclass_with_groundtruth_weights(self): + + def graph_fn(anchor_means, groundtruth_box_corners, groundtruth_labels, + groundtruth_weights): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + unmatched_threshold=0.5) + box_coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=0.1) + unmatched_cls_target = tf.constant([1, 0, 0, 0, 0, 0, 0], tf.float32) + target_assigner = targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, + unmatched_cls_target=unmatched_cls_target) + + anchors_boxlist = box_list.BoxList(anchor_means) + groundtruth_boxlist = box_list.BoxList(groundtruth_box_corners) + result = target_assigner.assign(anchors_boxlist, groundtruth_boxlist, + groundtruth_labels, + groundtruth_weights) + (_, cls_weights, _, reg_weights, _) = result + return (cls_weights, reg_weights) + + anchor_means = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 1.0, 0.8], + [0, 0.5, .5, 1.0], + [.75, 0, 1.0, .25]], dtype=np.float32) + groundtruth_box_corners = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.9, 0.9], + [.75, 0, .95, .27]], dtype=np.float32) + groundtruth_labels = np.array([[0, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0], + [0, 0, 0, 1, 0, 0, 0]], dtype=np.float32) + groundtruth_weights = np.array([0.3, 0., 0.5], dtype=np.float32) + + exp_cls_weights = [0.3, 0., 1, 0.5] # background class gets weight of 1. + exp_reg_weights = [0.3, 0., 0., 0.5] # background class gets weight of 0. 
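+    # The unmatched (background) anchor keeps a classification weight of 1
+    # because _create_classification_weights fills unmatched anchors with
+    # negative_class_weight (1.0 by default), while _create_regression_weights
+    # assigns unmatched anchors a weight of 0, so background anchors never
+    # contribute to the localization loss.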
+ + (cls_weights_out, reg_weights_out) = self.execute(graph_fn, [ + anchor_means, groundtruth_box_corners, groundtruth_labels, + groundtruth_weights + ]) + self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_weights_out, exp_reg_weights) + + def test_assign_multidimensional_class_targets(self): + + def graph_fn(anchor_means, groundtruth_box_corners, groundtruth_labels): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + unmatched_threshold=0.5) + box_coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=0.1) + + unmatched_cls_target = tf.constant([[0, 0], [0, 0]], tf.float32) + target_assigner = targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, + unmatched_cls_target=unmatched_cls_target) + + anchors_boxlist = box_list.BoxList(anchor_means) + groundtruth_boxlist = box_list.BoxList(groundtruth_box_corners) + result = target_assigner.assign(anchors_boxlist, groundtruth_boxlist, + groundtruth_labels) + (cls_targets, cls_weights, reg_targets, reg_weights, _) = result + return (cls_targets, cls_weights, reg_targets, reg_weights) + + anchor_means = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 1.0, 0.8], + [0, 0.5, .5, 1.0], + [.75, 0, 1.0, .25]], dtype=np.float32) + groundtruth_box_corners = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.9, 0.9], + [.75, 0, .95, .27]], dtype=np.float32) + + groundtruth_labels = np.array([[[0, 1], [1, 0]], + [[1, 0], [0, 1]], + [[0, 1], [1, .5]]], np.float32) + + exp_cls_targets = [[[0, 1], [1, 0]], + [[1, 0], [0, 1]], + [[0, 0], [0, 0]], + [[0, 1], [1, .5]]] + exp_cls_weights = [1, 1, 1, 1] + exp_reg_targets = [[0, 0, 0, 0], + [0, 0, -1, 1], + [0, 0, 0, 0], + [0, 0, -.5, .2]] + exp_reg_weights = [1, 1, 0, 1] + (cls_targets_out, + cls_weights_out, reg_targets_out, reg_weights_out) = self.execute( + graph_fn, [anchor_means, groundtruth_box_corners, groundtruth_labels]) + self.assertAllClose(cls_targets_out, exp_cls_targets) + self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_targets_out, exp_reg_targets) + self.assertAllClose(reg_weights_out, exp_reg_weights) + self.assertEquals(cls_targets_out.dtype, np.float32) + self.assertEquals(cls_weights_out.dtype, np.float32) + self.assertEquals(reg_targets_out.dtype, np.float32) + self.assertEquals(reg_weights_out.dtype, np.float32) + + def test_assign_empty_groundtruth(self): + + def graph_fn(anchor_means, groundtruth_box_corners, groundtruth_labels): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + unmatched_threshold=0.5) + box_coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=0.1) + unmatched_cls_target = tf.constant([0, 0, 0], tf.float32) + anchors_boxlist = box_list.BoxList(anchor_means) + groundtruth_boxlist = box_list.BoxList(groundtruth_box_corners) + target_assigner = targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, + unmatched_cls_target=unmatched_cls_target) + result = target_assigner.assign(anchors_boxlist, groundtruth_boxlist, + groundtruth_labels) + (cls_targets, cls_weights, reg_targets, reg_weights, _) = result + return (cls_targets, cls_weights, reg_targets, reg_weights) + + groundtruth_box_corners = np.zeros((0, 4), dtype=np.float32) + groundtruth_labels = np.zeros((0, 3), dtype=np.float32) + anchor_means = np.array([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 1.0, 0.8], + [0, 0.5, .5, 1.0], + [.75, 0, 1.0, .25]], + dtype=np.float32) + 
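+    # With no groundtruth boxes, every anchor is unmatched, so each anchor is
+    # assigned the unmatched_cls_target ([0, 0, 0] here) and a regression
+    # weight of 0.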
exp_cls_targets = [[0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0]] + exp_cls_weights = [1, 1, 1, 1] + exp_reg_targets = [[0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0]] + exp_reg_weights = [0, 0, 0, 0] + (cls_targets_out, + cls_weights_out, reg_targets_out, reg_weights_out) = self.execute( + graph_fn, [anchor_means, groundtruth_box_corners, groundtruth_labels]) + self.assertAllClose(cls_targets_out, exp_cls_targets) + self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_targets_out, exp_reg_targets) + self.assertAllClose(reg_weights_out, exp_reg_weights) + self.assertEquals(cls_targets_out.dtype, np.float32) + self.assertEquals(cls_weights_out.dtype, np.float32) + self.assertEquals(reg_targets_out.dtype, np.float32) + self.assertEquals(reg_weights_out.dtype, np.float32) + + def test_raises_error_on_incompatible_groundtruth_boxes_and_labels(self): + similarity_calc = region_similarity_calculator.NegSqDistSimilarity() + matcher = bipartite_matcher.GreedyBipartiteMatcher() + box_coder = mean_stddev_box_coder.MeanStddevBoxCoder() + unmatched_cls_target = tf.constant([1, 0, 0, 0, 0, 0, 0], tf.float32) + target_assigner = targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, + unmatched_cls_target=unmatched_cls_target) + + prior_means = tf.constant([[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 1.0, 0.8], + [0, 0.5, .5, 1.0], + [.75, 0, 1.0, .25]]) + priors = box_list.BoxList(prior_means) + + box_corners = [[0.0, 0.0, 0.5, 0.5], + [0.0, 0.0, 0.5, 0.8], + [0.5, 0.5, 0.9, 0.9], + [.75, 0, .95, .27]] + boxes = box_list.BoxList(tf.constant(box_corners)) + + groundtruth_labels = tf.constant([[0, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0], + [0, 0, 0, 1, 0, 0, 0]], tf.float32) + with self.assertRaisesRegexp(ValueError, 'Unequal shapes'): + target_assigner.assign(priors, boxes, groundtruth_labels, + num_valid_rows=3) + + def test_raises_error_on_invalid_groundtruth_labels(self): + similarity_calc = region_similarity_calculator.NegSqDistSimilarity() + matcher = bipartite_matcher.GreedyBipartiteMatcher() + box_coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=1.0) + unmatched_cls_target = tf.constant([[0, 0], [0, 0], [0, 0]], tf.float32) + target_assigner = targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, + unmatched_cls_target=unmatched_cls_target) + + prior_means = tf.constant([[0.0, 0.0, 0.5, 0.5]]) + priors = box_list.BoxList(prior_means) + + box_corners = [[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.9, 0.9], + [.75, 0, .95, .27]] + boxes = box_list.BoxList(tf.constant(box_corners)) + groundtruth_labels = tf.constant([[[0, 1], [1, 0]]], tf.float32) + + with self.assertRaises(ValueError): + target_assigner.assign(priors, boxes, groundtruth_labels, + num_valid_rows=3) + + +class BatchTargetAssignerTest(test_case.TestCase): + + def _get_agnostic_target_assigner(self): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + unmatched_threshold=0.5) + box_coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=0.1) + return targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, + unmatched_cls_target=None) + + def _get_multi_class_target_assigner(self, num_classes): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + unmatched_threshold=0.5) + box_coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=0.1) + unmatched_cls_target = tf.constant([1] + num_classes * [0], 
tf.float32) + return targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, + unmatched_cls_target=unmatched_cls_target) + + def _get_multi_dimensional_target_assigner(self, target_dimensions): + similarity_calc = region_similarity_calculator.IouSimilarity() + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5, + unmatched_threshold=0.5) + box_coder = mean_stddev_box_coder.MeanStddevBoxCoder(stddev=0.1) + unmatched_cls_target = tf.constant(np.zeros(target_dimensions), + tf.float32) + return targetassigner.TargetAssigner( + similarity_calc, matcher, box_coder, + unmatched_cls_target=unmatched_cls_target) + + def test_batch_assign_targets(self): + + def graph_fn(anchor_means, groundtruth_boxlist1, groundtruth_boxlist2): + box_list1 = box_list.BoxList(groundtruth_boxlist1) + box_list2 = box_list.BoxList(groundtruth_boxlist2) + gt_box_batch = [box_list1, box_list2] + gt_class_targets = [None, None] + anchors_boxlist = box_list.BoxList(anchor_means) + agnostic_target_assigner = self._get_agnostic_target_assigner() + (cls_targets, cls_weights, reg_targets, reg_weights, + _) = targetassigner.batch_assign_targets( + agnostic_target_assigner, anchors_boxlist, gt_box_batch, + gt_class_targets) + return (cls_targets, cls_weights, reg_targets, reg_weights) + + groundtruth_boxlist1 = np.array([[0., 0., 0.2, 0.2]], dtype=np.float32) + groundtruth_boxlist2 = np.array([[0, 0.25123152, 1, 1], + [0.015789, 0.0985, 0.55789, 0.3842]], + dtype=np.float32) + anchor_means = np.array([[0, 0, .25, .25], + [0, .25, 1, 1], + [0, .1, .5, .5], + [.75, .75, 1, 1]], dtype=np.float32) + + exp_reg_targets = [[[0, 0, -0.5, -0.5], + [0, 0, 0, 0], + [0, 0, 0, 0,], + [0, 0, 0, 0,],], + [[0, 0, 0, 0,], + [0, 0.01231521, 0, 0], + [0.15789001, -0.01500003, 0.57889998, -1.15799987], + [0, 0, 0, 0]]] + exp_cls_weights = [[1, 1, 1, 1], + [1, 1, 1, 1]] + exp_cls_targets = [[[1], [0], [0], [0]], + [[0], [1], [1], [0]]] + exp_reg_weights = [[1, 0, 0, 0], + [0, 1, 1, 0]] + + (cls_targets_out, + cls_weights_out, reg_targets_out, reg_weights_out) = self.execute( + graph_fn, [anchor_means, groundtruth_boxlist1, groundtruth_boxlist2]) + self.assertAllClose(cls_targets_out, exp_cls_targets) + self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_targets_out, exp_reg_targets) + self.assertAllClose(reg_weights_out, exp_reg_weights) + + def test_batch_assign_multiclass_targets(self): + + def graph_fn(anchor_means, groundtruth_boxlist1, groundtruth_boxlist2, + class_targets1, class_targets2): + box_list1 = box_list.BoxList(groundtruth_boxlist1) + box_list2 = box_list.BoxList(groundtruth_boxlist2) + gt_box_batch = [box_list1, box_list2] + gt_class_targets = [class_targets1, class_targets2] + anchors_boxlist = box_list.BoxList(anchor_means) + multiclass_target_assigner = self._get_multi_class_target_assigner( + num_classes=3) + (cls_targets, cls_weights, reg_targets, reg_weights, + _) = targetassigner.batch_assign_targets( + multiclass_target_assigner, anchors_boxlist, gt_box_batch, + gt_class_targets) + return (cls_targets, cls_weights, reg_targets, reg_weights) + + groundtruth_boxlist1 = np.array([[0., 0., 0.2, 0.2]], dtype=np.float32) + groundtruth_boxlist2 = np.array([[0, 0.25123152, 1, 1], + [0.015789, 0.0985, 0.55789, 0.3842]], + dtype=np.float32) + class_targets1 = np.array([[0, 1, 0, 0]], dtype=np.float32) + class_targets2 = np.array([[0, 0, 0, 1], + [0, 0, 1, 0]], dtype=np.float32) + + anchor_means = np.array([[0, 0, .25, .25], + [0, .25, 1, 1], + [0, .1, .5, .5], + [.75, .75, 1, 1]], 
dtype=np.float32) + + exp_reg_targets = [[[0, 0, -0.5, -0.5], + [0, 0, 0, 0], + [0, 0, 0, 0,], + [0, 0, 0, 0,],], + [[0, 0, 0, 0,], + [0, 0.01231521, 0, 0], + [0.15789001, -0.01500003, 0.57889998, -1.15799987], + [0, 0, 0, 0]]] + exp_cls_weights = [[1, 1, 1, 1], + [1, 1, 1, 1]] + exp_cls_targets = [[[0, 1, 0, 0], + [1, 0, 0, 0], + [1, 0, 0, 0], + [1, 0, 0, 0]], + [[1, 0, 0, 0], + [0, 0, 0, 1], + [0, 0, 1, 0], + [1, 0, 0, 0]]] + exp_reg_weights = [[1, 0, 0, 0], + [0, 1, 1, 0]] + + (cls_targets_out, cls_weights_out, reg_targets_out, + reg_weights_out) = self.execute(graph_fn, [ + anchor_means, groundtruth_boxlist1, groundtruth_boxlist2, + class_targets1, class_targets2 + ]) + self.assertAllClose(cls_targets_out, exp_cls_targets) + self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_targets_out, exp_reg_targets) + self.assertAllClose(reg_weights_out, exp_reg_weights) + + def test_batch_assign_multiclass_targets_with_padded_groundtruth(self): + + def graph_fn(anchor_means, groundtruth_boxlist1, groundtruth_boxlist2, + class_targets1, class_targets2, groundtruth_weights1, + groundtruth_weights2): + box_list1 = box_list.BoxList(groundtruth_boxlist1) + box_list2 = box_list.BoxList(groundtruth_boxlist2) + gt_box_batch = [box_list1, box_list2] + gt_class_targets = [class_targets1, class_targets2] + gt_weights = [groundtruth_weights1, groundtruth_weights2] + anchors_boxlist = box_list.BoxList(anchor_means) + multiclass_target_assigner = self._get_multi_class_target_assigner( + num_classes=3) + (cls_targets, cls_weights, reg_targets, reg_weights, + _) = targetassigner.batch_assign_targets( + multiclass_target_assigner, anchors_boxlist, gt_box_batch, + gt_class_targets, gt_weights) + return (cls_targets, cls_weights, reg_targets, reg_weights) + + groundtruth_boxlist1 = np.array([[0., 0., 0.2, 0.2], + [0., 0., 0., 0.]], dtype=np.float32) + groundtruth_weights1 = np.array([1, 0], dtype=np.float32) + groundtruth_boxlist2 = np.array([[0, 0.25123152, 1, 1], + [0.015789, 0.0985, 0.55789, 0.3842], + [0, 0, 0, 0]], + dtype=np.float32) + groundtruth_weights2 = np.array([1, 1, 0], dtype=np.float32) + class_targets1 = np.array([[0, 1, 0, 0], [0, 0, 0, 0]], dtype=np.float32) + class_targets2 = np.array([[0, 0, 0, 1], + [0, 0, 1, 0], + [0, 0, 0, 0]], dtype=np.float32) + + anchor_means = np.array([[0, 0, .25, .25], + [0, .25, 1, 1], + [0, .1, .5, .5], + [.75, .75, 1, 1]], dtype=np.float32) + + exp_reg_targets = [[[0, 0, -0.5, -0.5], + [0, 0, 0, 0], + [0, 0, 0, 0,], + [0, 0, 0, 0,],], + [[0, 0, 0, 0,], + [0, 0.01231521, 0, 0], + [0.15789001, -0.01500003, 0.57889998, -1.15799987], + [0, 0, 0, 0]]] + exp_cls_weights = [[1, 1, 1, 1], + [1, 1, 1, 1]] + exp_cls_targets = [[[0, 1, 0, 0], + [1, 0, 0, 0], + [1, 0, 0, 0], + [1, 0, 0, 0]], + [[1, 0, 0, 0], + [0, 0, 0, 1], + [0, 0, 1, 0], + [1, 0, 0, 0]]] + exp_reg_weights = [[1, 0, 0, 0], + [0, 1, 1, 0]] + + (cls_targets_out, cls_weights_out, reg_targets_out, + reg_weights_out) = self.execute(graph_fn, [ + anchor_means, groundtruth_boxlist1, groundtruth_boxlist2, + class_targets1, class_targets2, groundtruth_weights1, + groundtruth_weights2 + ]) + self.assertAllClose(cls_targets_out, exp_cls_targets) + self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_targets_out, exp_reg_targets) + self.assertAllClose(reg_weights_out, exp_reg_weights) + + def test_batch_assign_multidimensional_targets(self): + + def graph_fn(anchor_means, groundtruth_boxlist1, groundtruth_boxlist2, + class_targets1, class_targets2): + box_list1 = 
box_list.BoxList(groundtruth_boxlist1) + box_list2 = box_list.BoxList(groundtruth_boxlist2) + gt_box_batch = [box_list1, box_list2] + gt_class_targets = [class_targets1, class_targets2] + anchors_boxlist = box_list.BoxList(anchor_means) + multiclass_target_assigner = self._get_multi_dimensional_target_assigner( + target_dimensions=(2, 3)) + (cls_targets, cls_weights, reg_targets, reg_weights, + _) = targetassigner.batch_assign_targets( + multiclass_target_assigner, anchors_boxlist, gt_box_batch, + gt_class_targets) + return (cls_targets, cls_weights, reg_targets, reg_weights) + + groundtruth_boxlist1 = np.array([[0., 0., 0.2, 0.2]], dtype=np.float32) + groundtruth_boxlist2 = np.array([[0, 0.25123152, 1, 1], + [0.015789, 0.0985, 0.55789, 0.3842]], + dtype=np.float32) + class_targets1 = np.array([[0, 1, 0, 0]], dtype=np.float32) + class_targets2 = np.array([[0, 0, 0, 1], + [0, 0, 1, 0]], dtype=np.float32) + class_targets1 = np.array([[[0, 1, 1], + [1, 1, 0]]], dtype=np.float32) + class_targets2 = np.array([[[0, 1, 1], + [1, 1, 0]], + [[0, 0, 1], + [0, 0, 1]]], dtype=np.float32) + + anchor_means = np.array([[0, 0, .25, .25], + [0, .25, 1, 1], + [0, .1, .5, .5], + [.75, .75, 1, 1]], dtype=np.float32) + + exp_reg_targets = [[[0, 0, -0.5, -0.5], + [0, 0, 0, 0], + [0, 0, 0, 0,], + [0, 0, 0, 0,],], + [[0, 0, 0, 0,], + [0, 0.01231521, 0, 0], + [0.15789001, -0.01500003, 0.57889998, -1.15799987], + [0, 0, 0, 0]]] + exp_cls_weights = [[1, 1, 1, 1], + [1, 1, 1, 1]] + exp_cls_targets = [[[[0., 1., 1.], + [1., 1., 0.]], + [[0., 0., 0.], + [0., 0., 0.]], + [[0., 0., 0.], + [0., 0., 0.]], + [[0., 0., 0.], + [0., 0., 0.]]], + [[[0., 0., 0.], + [0., 0., 0.]], + [[0., 1., 1.], + [1., 1., 0.]], + [[0., 0., 1.], + [0., 0., 1.]], + [[0., 0., 0.], + [0., 0., 0.]]]] + exp_reg_weights = [[1, 0, 0, 0], + [0, 1, 1, 0]] + + (cls_targets_out, cls_weights_out, reg_targets_out, + reg_weights_out) = self.execute(graph_fn, [ + anchor_means, groundtruth_boxlist1, groundtruth_boxlist2, + class_targets1, class_targets2 + ]) + self.assertAllClose(cls_targets_out, exp_cls_targets) + self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_targets_out, exp_reg_targets) + self.assertAllClose(reg_weights_out, exp_reg_weights) + + def test_batch_assign_empty_groundtruth(self): + + def graph_fn(anchor_means, groundtruth_box_corners, gt_class_targets): + groundtruth_boxlist = box_list.BoxList(groundtruth_box_corners) + gt_box_batch = [groundtruth_boxlist] + gt_class_targets_batch = [gt_class_targets] + anchors_boxlist = box_list.BoxList(anchor_means) + + multiclass_target_assigner = self._get_multi_class_target_assigner( + num_classes=3) + + (cls_targets, cls_weights, reg_targets, reg_weights, + _) = targetassigner.batch_assign_targets( + multiclass_target_assigner, anchors_boxlist, + gt_box_batch, gt_class_targets_batch) + return (cls_targets, cls_weights, reg_targets, reg_weights) + + groundtruth_box_corners = np.zeros((0, 4), dtype=np.float32) + anchor_means = np.array([[0, 0, .25, .25], + [0, .25, 1, 1]], dtype=np.float32) + exp_reg_targets = [[[0, 0, 0, 0], + [0, 0, 0, 0]]] + exp_cls_weights = [[1, 1]] + exp_cls_targets = [[[1, 0, 0, 0], + [1, 0, 0, 0]]] + exp_reg_weights = [[0, 0]] + num_classes = 3 + pad = 1 + gt_class_targets = np.zeros((0, num_classes + pad), dtype=np.float32) + + (cls_targets_out, + cls_weights_out, reg_targets_out, reg_weights_out) = self.execute( + graph_fn, [anchor_means, groundtruth_box_corners, gt_class_targets]) + self.assertAllClose(cls_targets_out, exp_cls_targets) + 
self.assertAllClose(cls_weights_out, exp_cls_weights) + self.assertAllClose(reg_targets_out, exp_reg_targets) + self.assertAllClose(reg_weights_out, exp_reg_weights) + + +class CreateTargetAssignerTest(tf.test.TestCase): + + def test_create_target_assigner(self): + """Tests that named constructor gives working target assigners. + + TODO(rathodv): Make this test more general. + """ + corners = [[0.0, 0.0, 1.0, 1.0]] + groundtruth = box_list.BoxList(tf.constant(corners)) + + priors = box_list.BoxList(tf.constant(corners)) + multibox_ta = (targetassigner + .create_target_assigner('Multibox', stage='proposal')) + multibox_ta.assign(priors, groundtruth) + # No tests on output, as that may vary arbitrarily as new target assigners + # are added. As long as it is constructed correctly and runs without errors, + # tests on the individual assigners cover correctness of the assignments. + + anchors = box_list.BoxList(tf.constant(corners)) + faster_rcnn_proposals_ta = (targetassigner + .create_target_assigner('FasterRCNN', + stage='proposal')) + faster_rcnn_proposals_ta.assign(anchors, groundtruth) + + fast_rcnn_ta = (targetassigner + .create_target_assigner('FastRCNN')) + fast_rcnn_ta.assign(anchors, groundtruth) + + faster_rcnn_detection_ta = (targetassigner + .create_target_assigner('FasterRCNN', + stage='detection')) + faster_rcnn_detection_ta.assign(anchors, groundtruth) + + with self.assertRaises(ValueError): + targetassigner.create_target_assigner('InvalidDetector', + stage='invalid_stage') + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/data/ava_label_map_v2.1.pbtxt b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/ava_label_map_v2.1.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..5e2c485682830919a09300ac851e6b0e4bdf3efb --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/ava_label_map_v2.1.pbtxt @@ -0,0 +1,240 @@ +item { + name: "bend/bow (at the waist)" + id: 1 +} +item { + name: "crouch/kneel" + id: 3 +} +item { + name: "dance" + id: 4 +} +item { + name: "fall down" + id: 5 +} +item { + name: "get up" + id: 6 +} +item { + name: "jump/leap" + id: 7 +} +item { + name: "lie/sleep" + id: 8 +} +item { + name: "martial art" + id: 9 +} +item { + name: "run/jog" + id: 10 +} +item { + name: "sit" + id: 11 +} +item { + name: "stand" + id: 12 +} +item { + name: "swim" + id: 13 +} +item { + name: "walk" + id: 14 +} +item { + name: "answer phone" + id: 15 +} +item { + name: "carry/hold (an object)" + id: 17 +} +item { + name: "climb (e.g., a mountain)" + id: 20 +} +item { + name: "close (e.g., a door, a box)" + id: 22 +} +item { + name: "cut" + id: 24 +} +item { + name: "dress/put on clothing" + id: 26 +} +item { + name: "drink" + id: 27 +} +item { + name: "drive (e.g., a car, a truck)" + id: 28 +} +item { + name: "eat" + id: 29 +} +item { + name: "enter" + id: 30 +} +item { + name: "hit (an object)" + id: 34 +} +item { + name: "lift/pick up" + id: 36 +} +item { + name: "listen (e.g., to music)" + id: 37 +} +item { + name: "open (e.g., a window, a car door)" + id: 38 +} +item { + name: "play musical instrument" + id: 41 +} +item { + name: "point to (an object)" + id: 43 +} +item { + name: "pull (an object)" + id: 45 +} +item { + name: "push (an object)" + id: 46 +} +item { + name: "put down" + id: 47 +} +item { + name: "read" + id: 48 +} +item { + name: "ride (e.g., a bike, a car, a horse)" + id: 49 +} +item { + name: "sail boat" + id: 51 +} +item { + 
name: "shoot" + id: 52 +} +item { + name: "smoke" + id: 54 +} +item { + name: "take a photo" + id: 56 +} +item { + name: "text on/look at a cellphone" + id: 57 +} +item { + name: "throw" + id: 58 +} +item { + name: "touch (an object)" + id: 59 +} +item { + name: "turn (e.g., a screwdriver)" + id: 60 +} +item { + name: "watch (e.g., TV)" + id: 61 +} +item { + name: "work on a computer" + id: 62 +} +item { + name: "write" + id: 63 +} +item { + name: "fight/hit (a person)" + id: 64 +} +item { + name: "give/serve (an object) to (a person)" + id: 65 +} +item { + name: "grab (a person)" + id: 66 +} +item { + name: "hand clap" + id: 67 +} +item { + name: "hand shake" + id: 68 +} +item { + name: "hand wave" + id: 69 +} +item { + name: "hug (a person)" + id: 70 +} +item { + name: "kiss (a person)" + id: 72 +} +item { + name: "lift (a person)" + id: 73 +} +item { + name: "listen to (a person)" + id: 74 +} +item { + name: "push (another person)" + id: 76 +} +item { + name: "sing to (e.g., self, a person, a group)" + id: 77 +} +item { + name: "take (an object) from (a person)" + id: 78 +} +item { + name: "talk to (e.g., self, a person, a group)" + id: 79 +} +item { + name: "watch (a person)" + id: 80 +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/data/kitti_label_map.pbtxt b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/kitti_label_map.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..0afcc6936ebdb37ecbc7c3245929fcf178a02c0b --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/kitti_label_map.pbtxt @@ -0,0 +1,9 @@ +item { + id: 1 + name: 'car' +} + +item { + id: 2 + name: 'pedestrian' +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/data/mscoco_label_map.pbtxt b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/mscoco_label_map.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..1f4872bd0c7f53e70beecf88af005c07a5df9e08 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/mscoco_label_map.pbtxt @@ -0,0 +1,400 @@ +item { + name: "/m/01g317" + id: 1 + display_name: "person" +} +item { + name: "/m/0199g" + id: 2 + display_name: "bicycle" +} +item { + name: "/m/0k4j" + id: 3 + display_name: "car" +} +item { + name: "/m/04_sv" + id: 4 + display_name: "motorcycle" +} +item { + name: "/m/05czz6l" + id: 5 + display_name: "airplane" +} +item { + name: "/m/01bjv" + id: 6 + display_name: "bus" +} +item { + name: "/m/07jdr" + id: 7 + display_name: "train" +} +item { + name: "/m/07r04" + id: 8 + display_name: "truck" +} +item { + name: "/m/019jd" + id: 9 + display_name: "boat" +} +item { + name: "/m/015qff" + id: 10 + display_name: "traffic light" +} +item { + name: "/m/01pns0" + id: 11 + display_name: "fire hydrant" +} +item { + name: "/m/02pv19" + id: 13 + display_name: "stop sign" +} +item { + name: "/m/015qbp" + id: 14 + display_name: "parking meter" +} +item { + name: "/m/0cvnqh" + id: 15 + display_name: "bench" +} +item { + name: "/m/015p6" + id: 16 + display_name: "bird" +} +item { + name: "/m/01yrx" + id: 17 + display_name: "cat" +} +item { + name: "/m/0bt9lr" + id: 18 + display_name: "dog" +} +item { + name: "/m/03k3r" + id: 19 + display_name: "horse" +} +item { + name: "/m/07bgp" + id: 20 + display_name: "sheep" +} +item { + name: "/m/01xq0k1" + id: 21 + display_name: "cow" +} +item { + name: "/m/0bwd_0j" + id: 22 + display_name: "elephant" +} +item { + name: "/m/01dws" + id: 23 + display_name: "bear" +} 
+item { + name: "/m/0898b" + id: 24 + display_name: "zebra" +} +item { + name: "/m/03bk1" + id: 25 + display_name: "giraffe" +} +item { + name: "/m/01940j" + id: 27 + display_name: "backpack" +} +item { + name: "/m/0hnnb" + id: 28 + display_name: "umbrella" +} +item { + name: "/m/080hkjn" + id: 31 + display_name: "handbag" +} +item { + name: "/m/01rkbr" + id: 32 + display_name: "tie" +} +item { + name: "/m/01s55n" + id: 33 + display_name: "suitcase" +} +item { + name: "/m/02wmf" + id: 34 + display_name: "frisbee" +} +item { + name: "/m/071p9" + id: 35 + display_name: "skis" +} +item { + name: "/m/06__v" + id: 36 + display_name: "snowboard" +} +item { + name: "/m/018xm" + id: 37 + display_name: "sports ball" +} +item { + name: "/m/02zt3" + id: 38 + display_name: "kite" +} +item { + name: "/m/03g8mr" + id: 39 + display_name: "baseball bat" +} +item { + name: "/m/03grzl" + id: 40 + display_name: "baseball glove" +} +item { + name: "/m/06_fw" + id: 41 + display_name: "skateboard" +} +item { + name: "/m/019w40" + id: 42 + display_name: "surfboard" +} +item { + name: "/m/0dv9c" + id: 43 + display_name: "tennis racket" +} +item { + name: "/m/04dr76w" + id: 44 + display_name: "bottle" +} +item { + name: "/m/09tvcd" + id: 46 + display_name: "wine glass" +} +item { + name: "/m/08gqpm" + id: 47 + display_name: "cup" +} +item { + name: "/m/0dt3t" + id: 48 + display_name: "fork" +} +item { + name: "/m/04ctx" + id: 49 + display_name: "knife" +} +item { + name: "/m/0cmx8" + id: 50 + display_name: "spoon" +} +item { + name: "/m/04kkgm" + id: 51 + display_name: "bowl" +} +item { + name: "/m/09qck" + id: 52 + display_name: "banana" +} +item { + name: "/m/014j1m" + id: 53 + display_name: "apple" +} +item { + name: "/m/0l515" + id: 54 + display_name: "sandwich" +} +item { + name: "/m/0cyhj_" + id: 55 + display_name: "orange" +} +item { + name: "/m/0hkxq" + id: 56 + display_name: "broccoli" +} +item { + name: "/m/0fj52s" + id: 57 + display_name: "carrot" +} +item { + name: "/m/01b9xk" + id: 58 + display_name: "hot dog" +} +item { + name: "/m/0663v" + id: 59 + display_name: "pizza" +} +item { + name: "/m/0jy4k" + id: 60 + display_name: "donut" +} +item { + name: "/m/0fszt" + id: 61 + display_name: "cake" +} +item { + name: "/m/01mzpv" + id: 62 + display_name: "chair" +} +item { + name: "/m/02crq1" + id: 63 + display_name: "couch" +} +item { + name: "/m/03fp41" + id: 64 + display_name: "potted plant" +} +item { + name: "/m/03ssj5" + id: 65 + display_name: "bed" +} +item { + name: "/m/04bcr3" + id: 67 + display_name: "dining table" +} +item { + name: "/m/09g1w" + id: 70 + display_name: "toilet" +} +item { + name: "/m/07c52" + id: 72 + display_name: "tv" +} +item { + name: "/m/01c648" + id: 73 + display_name: "laptop" +} +item { + name: "/m/020lf" + id: 74 + display_name: "mouse" +} +item { + name: "/m/0qjjc" + id: 75 + display_name: "remote" +} +item { + name: "/m/01m2v" + id: 76 + display_name: "keyboard" +} +item { + name: "/m/050k8" + id: 77 + display_name: "cell phone" +} +item { + name: "/m/0fx9l" + id: 78 + display_name: "microwave" +} +item { + name: "/m/029bxz" + id: 79 + display_name: "oven" +} +item { + name: "/m/01k6s3" + id: 80 + display_name: "toaster" +} +item { + name: "/m/0130jx" + id: 81 + display_name: "sink" +} +item { + name: "/m/040b_t" + id: 82 + display_name: "refrigerator" +} +item { + name: "/m/0bt_c3" + id: 84 + display_name: "book" +} +item { + name: "/m/01x3z" + id: 85 + display_name: "clock" +} +item { + name: "/m/02s195" + id: 86 + display_name: "vase" +} +item { + name: "/m/01lsmm" 
+ id: 87 + display_name: "scissors" +} +item { + name: "/m/0kmg4" + id: 88 + display_name: "teddy bear" +} +item { + name: "/m/03wvsk" + id: 89 + display_name: "hair drier" +} +item { + name: "/m/012xff" + id: 90 + display_name: "toothbrush" +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/data/oid_bbox_trainable_label_map.pbtxt b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/oid_bbox_trainable_label_map.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..863e4f31d719cd148fd56c981e219257334f9c7e --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/oid_bbox_trainable_label_map.pbtxt @@ -0,0 +1,2725 @@ +item { + name: "/m/01g317" + id: 1 + display_name: "Person" +} +item { + name: "/m/09j2d" + id: 2 + display_name: "Clothing" +} +item { + name: "/m/04yx4" + id: 3 + display_name: "Man" +} +item { + name: "/m/0dzct" + id: 4 + display_name: "Face" +} +item { + name: "/m/07j7r" + id: 5 + display_name: "Tree" +} +item { + name: "/m/05s2s" + id: 6 + display_name: "Plant" +} +item { + name: "/m/03bt1vf" + id: 7 + display_name: "Woman" +} +item { + name: "/m/07yv9" + id: 8 + display_name: "Vehicle" +} +item { + name: "/m/0cgh4" + id: 9 + display_name: "Building" +} +item { + name: "/m/01prls" + id: 10 + display_name: "Land vehicle" +} +item { + name: "/m/09j5n" + id: 11 + display_name: "Footwear" +} +item { + name: "/m/05r655" + id: 12 + display_name: "Girl" +} +item { + name: "/m/0jbk" + id: 13 + display_name: "Animal" +} +item { + name: "/m/0k4j" + id: 14 + display_name: "Car" +} +item { + name: "/m/02wbm" + id: 15 + display_name: "Food" +} +item { + name: "/m/083wq" + id: 16 + display_name: "Wheel" +} +item { + name: "/m/0c9ph5" + id: 17 + display_name: "Flower" +} +item { + name: "/m/0c_jw" + id: 18 + display_name: "Furniture" +} +item { + name: "/m/0d4v4" + id: 19 + display_name: "Window" +} +item { + name: "/m/03jm5" + id: 20 + display_name: "House" +} +item { + name: "/m/01bl7v" + id: 21 + display_name: "Boy" +} +item { + name: "/m/0463sg" + id: 22 + display_name: "Fashion accessory" +} +item { + name: "/m/04bcr3" + id: 23 + display_name: "Table" +} +item { + name: "/m/0jyfg" + id: 24 + display_name: "Glasses" +} +item { + name: "/m/01xyhv" + id: 25 + display_name: "Suit" +} +item { + name: "/m/08dz3q" + id: 26 + display_name: "Auto part" +} +item { + name: "/m/015p6" + id: 27 + display_name: "Bird" +} +item { + name: "/m/05y5lj" + id: 28 + display_name: "Sports equipment" +} +item { + name: "/m/01d40f" + id: 29 + display_name: "Dress" +} +item { + name: "/m/0bt9lr" + id: 30 + display_name: "Dog" +} +item { + name: "/m/01lrl" + id: 31 + display_name: "Carnivore" +} +item { + name: "/m/02p0tk3" + id: 32 + display_name: "Human body" +} +item { + name: "/m/0fly7" + id: 33 + display_name: "Jeans" +} +item { + name: "/m/04szw" + id: 34 + display_name: "Musical instrument" +} +item { + name: "/m/0271t" + id: 35 + display_name: "Drink" +} +item { + name: "/m/019jd" + id: 36 + display_name: "Boat" +} +item { + name: "/m/03q69" + id: 37 + display_name: "Hair" +} +item { + name: "/m/0h9mv" + id: 38 + display_name: "Tire" +} +item { + name: "/m/04hgtk" + id: 39 + display_name: "Head" +} +item { + name: "/m/01yrx" + id: 40 + display_name: "Cat" +} +item { + name: "/m/01rzcn" + id: 41 + display_name: "Watercraft" +} +item { + name: "/m/01mzpv" + id: 42 + display_name: "Chair" +} +item { + name: "/m/0199g" + id: 43 + display_name: "Bike" +} +item { + name: "/m/01fdzj" + id: 44 + display_name: "Tower" +} 
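Editor's note, illustrative only (not part of this patch): the label maps in this diff (kitti_label_map.pbtxt, mscoco_label_map.pbtxt, the OID maps below) are what the label_map_path fields in the configs point at. A hedged sketch of how such a file is typically turned into a category index, assuming the object_detection.utils.label_map_util helper from this code base is available; the path is a placeholder relative to the Mask_RCNN directory.

from object_detection.utils import label_map_util

# Parse the text-format proto and expand it into per-class category dicts.
label_map = label_map_util.load_labelmap(
    'object_detection/data/mscoco_label_map.pbtxt')
categories = label_map_util.convert_label_map_to_categories(
    label_map, max_num_classes=90, use_display_name=True)
category_index = label_map_util.create_category_index(categories)

# category_index maps the ids stored in the TFRecords to readable names,
# e.g. id 18 above is "dog".
print(category_index[18])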
+item { + name: "/m/04rky" + id: 45 + display_name: "Mammal" +} +item { + name: "/m/079cl" + id: 46 + display_name: "Skyscraper" +} +item { + name: "/m/0dzf4" + id: 47 + display_name: "Arm" +} +item { + name: "/m/0138tl" + id: 48 + display_name: "Toy" +} +item { + name: "/m/06msq" + id: 49 + display_name: "Sculpture" +} +item { + name: "/m/03xxp" + id: 50 + display_name: "Invertebrate" +} +item { + name: "/m/0hg7b" + id: 51 + display_name: "Microphone" +} +item { + name: "/m/01n5jq" + id: 52 + display_name: "Poster" +} +item { + name: "/m/03vt0" + id: 53 + display_name: "Insect" +} +item { + name: "/m/0342h" + id: 54 + display_name: "Guitar" +} +item { + name: "/m/0k0pj" + id: 55 + display_name: "Nose" +} +item { + name: "/m/02dl1y" + id: 56 + display_name: "Hat" +} +item { + name: "/m/04brg2" + id: 57 + display_name: "Tableware" +} +item { + name: "/m/02dgv" + id: 58 + display_name: "Door" +} +item { + name: "/m/01bqk0" + id: 59 + display_name: "Bicycle wheel" +} +item { + name: "/m/017ftj" + id: 60 + display_name: "Sunglasses" +} +item { + name: "/m/052lwg6" + id: 61 + display_name: "Baked goods" +} +item { + name: "/m/014sv8" + id: 62 + display_name: "Eye" +} +item { + name: "/m/0270h" + id: 63 + display_name: "Dessert" +} +item { + name: "/m/0283dt1" + id: 64 + display_name: "Mouth" +} +item { + name: "/m/0k5j" + id: 65 + display_name: "Aircraft" +} +item { + name: "/m/0cmf2" + id: 66 + display_name: "Airplane" +} +item { + name: "/m/07jdr" + id: 67 + display_name: "Train" +} +item { + name: "/m/032b3c" + id: 68 + display_name: "Jacket" +} +item { + name: "/m/033rq4" + id: 69 + display_name: "Street light" +} +item { + name: "/m/0k65p" + id: 70 + display_name: "Hand" +} +item { + name: "/m/01ww8y" + id: 71 + display_name: "Snack" +} +item { + name: "/m/0zvk5" + id: 72 + display_name: "Helmet" +} +item { + name: "/m/07mhn" + id: 73 + display_name: "Trousers" +} +item { + name: "/m/04dr76w" + id: 74 + display_name: "Bottle" +} +item { + name: "/m/03fp41" + id: 75 + display_name: "Houseplant" +} +item { + name: "/m/03k3r" + id: 76 + display_name: "Horse" +} +item { + name: "/m/01y9k5" + id: 77 + display_name: "Desk" +} +item { + name: "/m/0cdl1" + id: 78 + display_name: "Palm tree" +} +item { + name: "/m/0f4s2w" + id: 79 + display_name: "Vegetable" +} +item { + name: "/m/02xwb" + id: 80 + display_name: "Fruit" +} +item { + name: "/m/035r7c" + id: 81 + display_name: "Leg" +} +item { + name: "/m/0bt_c3" + id: 82 + display_name: "Book" +} +item { + name: "/m/01_bhs" + id: 83 + display_name: "Fast food" +} +item { + name: "/m/01599" + id: 84 + display_name: "Beer" +} +item { + name: "/m/03120" + id: 85 + display_name: "Flag" +} +item { + name: "/m/026t6" + id: 86 + display_name: "Drum" +} +item { + name: "/m/01bjv" + id: 87 + display_name: "Bus" +} +item { + name: "/m/07r04" + id: 88 + display_name: "Truck" +} +item { + name: "/m/018xm" + id: 89 + display_name: "Ball" +} +item { + name: "/m/01rkbr" + id: 90 + display_name: "Tie" +} +item { + name: "/m/0fm3zh" + id: 91 + display_name: "Flowerpot" +} +item { + name: "/m/02_n6y" + id: 92 + display_name: "Goggles" +} +item { + name: "/m/04_sv" + id: 93 + display_name: "Motorcycle" +} +item { + name: "/m/06z37_" + id: 94 + display_name: "Picture frame" +} +item { + name: "/m/01bfm9" + id: 95 + display_name: "Shorts" +} +item { + name: "/m/0h8mhzd" + id: 96 + display_name: "Sports uniform" +} +item { + name: "/m/0d_2m" + id: 97 + display_name: "Moths and butterflies" +} +item { + name: "/m/0gjbg72" + id: 98 + display_name: "Shelf" +} +item { + 
name: "/m/01n4qj" + id: 99 + display_name: "Shirt" +} +item { + name: "/m/0ch_cf" + id: 100 + display_name: "Fish" +} +item { + name: "/m/06m11" + id: 101 + display_name: "Rose" +} +item { + name: "/m/01jfm_" + id: 102 + display_name: "Licence plate" +} +item { + name: "/m/02crq1" + id: 103 + display_name: "Couch" +} +item { + name: "/m/083kb" + id: 104 + display_name: "Weapon" +} +item { + name: "/m/01c648" + id: 105 + display_name: "Laptop" +} +item { + name: "/m/09tvcd" + id: 106 + display_name: "Wine glass" +} +item { + name: "/m/0h2r6" + id: 107 + display_name: "Van" +} +item { + name: "/m/081qc" + id: 108 + display_name: "Wine" +} +item { + name: "/m/09ddx" + id: 109 + display_name: "Duck" +} +item { + name: "/m/03p3bw" + id: 110 + display_name: "Bicycle helmet" +} +item { + name: "/m/0cyf8" + id: 111 + display_name: "Butterfly" +} +item { + name: "/m/0b_rs" + id: 112 + display_name: "Swimming pool" +} +item { + name: "/m/039xj_" + id: 113 + display_name: "Ear" +} +item { + name: "/m/021sj1" + id: 114 + display_name: "Office" +} +item { + name: "/m/0dv5r" + id: 115 + display_name: "Camera" +} +item { + name: "/m/01lynh" + id: 116 + display_name: "Stairs" +} +item { + name: "/m/06bt6" + id: 117 + display_name: "Reptile" +} +item { + name: "/m/01226z" + id: 118 + display_name: "Football" +} +item { + name: "/m/0fszt" + id: 119 + display_name: "Cake" +} +item { + name: "/m/050k8" + id: 120 + display_name: "Mobile phone" +} +item { + name: "/m/02wbtzl" + id: 121 + display_name: "Sun hat" +} +item { + name: "/m/02p5f1q" + id: 122 + display_name: "Coffee cup" +} +item { + name: "/m/025nd" + id: 123 + display_name: "Christmas tree" +} +item { + name: "/m/02522" + id: 124 + display_name: "Computer monitor" +} +item { + name: "/m/09ct_" + id: 125 + display_name: "Helicopter" +} +item { + name: "/m/0cvnqh" + id: 126 + display_name: "Bench" +} +item { + name: "/m/0d5gx" + id: 127 + display_name: "Castle" +} +item { + name: "/m/01xygc" + id: 128 + display_name: "Coat" +} +item { + name: "/m/04m6gz" + id: 129 + display_name: "Porch" +} +item { + name: "/m/01gkx_" + id: 130 + display_name: "Swimwear" +} +item { + name: "/m/01s105" + id: 131 + display_name: "Cabinetry" +} +item { + name: "/m/01j61q" + id: 132 + display_name: "Tent" +} +item { + name: "/m/0hnnb" + id: 133 + display_name: "Umbrella" +} +item { + name: "/m/01j51" + id: 134 + display_name: "Balloon" +} +item { + name: "/m/01knjb" + id: 135 + display_name: "Billboard" +} +item { + name: "/m/03__z0" + id: 136 + display_name: "Bookcase" +} +item { + name: "/m/01m2v" + id: 137 + display_name: "Computer keyboard" +} +item { + name: "/m/0167gd" + id: 138 + display_name: "Doll" +} +item { + name: "/m/0284d" + id: 139 + display_name: "Dairy" +} +item { + name: "/m/03ssj5" + id: 140 + display_name: "Bed" +} +item { + name: "/m/02fq_6" + id: 141 + display_name: "Fedora" +} +item { + name: "/m/06nwz" + id: 142 + display_name: "Seafood" +} +item { + name: "/m/0220r2" + id: 143 + display_name: "Fountain" +} +item { + name: "/m/01mqdt" + id: 144 + display_name: "Traffic sign" +} +item { + name: "/m/0268lbt" + id: 145 + display_name: "Hiking equipment" +} +item { + name: "/m/07c52" + id: 146 + display_name: "Television" +} +item { + name: "/m/0grw1" + id: 147 + display_name: "Salad" +} +item { + name: "/m/01h3n" + id: 148 + display_name: "Bee" +} +item { + name: "/m/078n6m" + id: 149 + display_name: "Coffee table" +} +item { + name: "/m/01xq0k1" + id: 150 + display_name: "Cattle" +} +item { + name: "/m/0gd2v" + id: 151 + display_name: "Marine mammal" 
+} +item { + name: "/m/0dbvp" + id: 152 + display_name: "Goose" +} +item { + name: "/m/03rszm" + id: 153 + display_name: "Curtain" +} +item { + name: "/m/0h8n5zk" + id: 154 + display_name: "Kitchen & dining room table" +} +item { + name: "/m/019dx1" + id: 155 + display_name: "Home appliance" +} +item { + name: "/m/03hl4l9" + id: 156 + display_name: "Marine invertebrates" +} +item { + name: "/m/0b3fp9" + id: 157 + display_name: "Countertop" +} +item { + name: "/m/02rdsp" + id: 158 + display_name: "Office supplies" +} +item { + name: "/m/0hf58v5" + id: 159 + display_name: "Luggage and bags" +} +item { + name: "/m/04h7h" + id: 160 + display_name: "Lighthouse" +} +item { + name: "/m/024g6" + id: 161 + display_name: "Cocktail" +} +item { + name: "/m/0cffdh" + id: 162 + display_name: "Maple" +} +item { + name: "/m/03q5c7" + id: 163 + display_name: "Saucer" +} +item { + name: "/m/014y4n" + id: 164 + display_name: "Paddle" +} +item { + name: "/m/01yx86" + id: 165 + display_name: "Bronze sculpture" +} +item { + name: "/m/020jm" + id: 166 + display_name: "Beetle" +} +item { + name: "/m/025dyy" + id: 167 + display_name: "Box" +} +item { + name: "/m/01llwg" + id: 168 + display_name: "Necklace" +} +item { + name: "/m/08pbxl" + id: 169 + display_name: "Monkey" +} +item { + name: "/m/02d9qx" + id: 170 + display_name: "Whiteboard" +} +item { + name: "/m/02pkr5" + id: 171 + display_name: "Plumbing fixture" +} +item { + name: "/m/0h99cwc" + id: 172 + display_name: "Kitchen appliance" +} +item { + name: "/m/050gv4" + id: 173 + display_name: "Plate" +} +item { + name: "/m/02vqfm" + id: 174 + display_name: "Coffee" +} +item { + name: "/m/09kx5" + id: 175 + display_name: "Deer" +} +item { + name: "/m/019w40" + id: 176 + display_name: "Surfboard" +} +item { + name: "/m/09dzg" + id: 177 + display_name: "Turtle" +} +item { + name: "/m/07k1x" + id: 178 + display_name: "Tool" +} +item { + name: "/m/080hkjn" + id: 179 + display_name: "Handbag" +} +item { + name: "/m/07qxg_" + id: 180 + display_name: "Football helmet" +} +item { + name: "/m/0ph39" + id: 181 + display_name: "Canoe" +} +item { + name: "/m/018p4k" + id: 182 + display_name: "Cart" +} +item { + name: "/m/02h19r" + id: 183 + display_name: "Scarf" +} +item { + name: "/m/015h_t" + id: 184 + display_name: "Beard" +} +item { + name: "/m/0fqfqc" + id: 185 + display_name: "Drawer" +} +item { + name: "/m/025rp__" + id: 186 + display_name: "Cowboy hat" +} +item { + name: "/m/01x3z" + id: 187 + display_name: "Clock" +} +item { + name: "/m/0crjs" + id: 188 + display_name: "Convenience store" +} +item { + name: "/m/0l515" + id: 189 + display_name: "Sandwich" +} +item { + name: "/m/015qff" + id: 190 + display_name: "Traffic light" +} +item { + name: "/m/09kmb" + id: 191 + display_name: "Spider" +} +item { + name: "/m/09728" + id: 192 + display_name: "Bread" +} +item { + name: "/m/071qp" + id: 193 + display_name: "Squirrel" +} +item { + name: "/m/02s195" + id: 194 + display_name: "Vase" +} +item { + name: "/m/06c54" + id: 195 + display_name: "Rifle" +} +item { + name: "/m/01xqw" + id: 196 + display_name: "Cello" +} +item { + name: "/m/05zsy" + id: 197 + display_name: "Pumpkin" +} +item { + name: "/m/0bwd_0j" + id: 198 + display_name: "Elephant" +} +item { + name: "/m/04m9y" + id: 199 + display_name: "Lizard" +} +item { + name: "/m/052sf" + id: 200 + display_name: "Mushroom" +} +item { + name: "/m/03grzl" + id: 201 + display_name: "Baseball glove" +} +item { + name: "/m/01z1kdw" + id: 202 + display_name: "Juice" +} +item { + name: "/m/02wv6h6" + id: 203 + display_name: 
"Skirt" +} +item { + name: "/m/016m2d" + id: 204 + display_name: "Skull" +} +item { + name: "/m/0dtln" + id: 205 + display_name: "Lamp" +} +item { + name: "/m/057cc" + id: 206 + display_name: "Musical keyboard" +} +item { + name: "/m/06k2mb" + id: 207 + display_name: "High heels" +} +item { + name: "/m/0f6wt" + id: 208 + display_name: "Falcon" +} +item { + name: "/m/0cxn2" + id: 209 + display_name: "Ice cream" +} +item { + name: "/m/02jvh9" + id: 210 + display_name: "Mug" +} +item { + name: "/m/0gjkl" + id: 211 + display_name: "Watch" +} +item { + name: "/m/01b638" + id: 212 + display_name: "Boot" +} +item { + name: "/m/071p9" + id: 213 + display_name: "Ski" +} +item { + name: "/m/0pg52" + id: 214 + display_name: "Taxi" +} +item { + name: "/m/0ftb8" + id: 215 + display_name: "Sunflower" +} +item { + name: "/m/0hnyx" + id: 216 + display_name: "Pastry" +} +item { + name: "/m/02jz0l" + id: 217 + display_name: "Tap" +} +item { + name: "/m/04kkgm" + id: 218 + display_name: "Bowl" +} +item { + name: "/m/0174n1" + id: 219 + display_name: "Glove" +} +item { + name: "/m/0gv1x" + id: 220 + display_name: "Parrot" +} +item { + name: "/m/09csl" + id: 221 + display_name: "Eagle" +} +item { + name: "/m/02jnhm" + id: 222 + display_name: "Tin can" +} +item { + name: "/m/099ssp" + id: 223 + display_name: "Platter" +} +item { + name: "/m/03nfch" + id: 224 + display_name: "Sandal" +} +item { + name: "/m/07y_7" + id: 225 + display_name: "Violin" +} +item { + name: "/m/05z6w" + id: 226 + display_name: "Penguin" +} +item { + name: "/m/03m3pdh" + id: 227 + display_name: "Sofa bed" +} +item { + name: "/m/09ld4" + id: 228 + display_name: "Frog" +} +item { + name: "/m/09b5t" + id: 229 + display_name: "Chicken" +} +item { + name: "/m/054xkw" + id: 230 + display_name: "Lifejacket" +} +item { + name: "/m/0130jx" + id: 231 + display_name: "Sink" +} +item { + name: "/m/07fbm7" + id: 232 + display_name: "Strawberry" +} +item { + name: "/m/01dws" + id: 233 + display_name: "Bear" +} +item { + name: "/m/01tcjp" + id: 234 + display_name: "Muffin" +} +item { + name: "/m/0dftk" + id: 235 + display_name: "Swan" +} +item { + name: "/m/0c06p" + id: 236 + display_name: "Candle" +} +item { + name: "/m/034c16" + id: 237 + display_name: "Pillow" +} +item { + name: "/m/09d5_" + id: 238 + display_name: "Owl" +} +item { + name: "/m/03hlz0c" + id: 239 + display_name: "Kitchen utensil" +} +item { + name: "/m/0ft9s" + id: 240 + display_name: "Dragonfly" +} +item { + name: "/m/011k07" + id: 241 + display_name: "Tortoise" +} +item { + name: "/m/054_l" + id: 242 + display_name: "Mirror" +} +item { + name: "/m/0jqgx" + id: 243 + display_name: "Lily" +} +item { + name: "/m/0663v" + id: 244 + display_name: "Pizza" +} +item { + name: "/m/0242l" + id: 245 + display_name: "Coin" +} +item { + name: "/m/014trl" + id: 246 + display_name: "Cosmetics" +} +item { + name: "/m/05r5c" + id: 247 + display_name: "Piano" +} +item { + name: "/m/07j87" + id: 248 + display_name: "Tomato" +} +item { + name: "/m/05kyg_" + id: 249 + display_name: "Chest of drawers" +} +item { + name: "/m/0kmg4" + id: 250 + display_name: "Teddy bear" +} +item { + name: "/m/07cmd" + id: 251 + display_name: "Tank" +} +item { + name: "/m/0dv77" + id: 252 + display_name: "Squash" +} +item { + name: "/m/096mb" + id: 253 + display_name: "Lion" +} +item { + name: "/m/01gmv2" + id: 254 + display_name: "Brassiere" +} +item { + name: "/m/07bgp" + id: 255 + display_name: "Sheep" +} +item { + name: "/m/0cmx8" + id: 256 + display_name: "Spoon" +} +item { + name: "/m/029tx" + id: 257 + 
display_name: "Dinosaur" +} +item { + name: "/m/073bxn" + id: 258 + display_name: "Tripod" +} +item { + name: "/m/0bh9flk" + id: 259 + display_name: "Tablet computer" +} +item { + name: "/m/06mf6" + id: 260 + display_name: "Rabbit" +} +item { + name: "/m/06_fw" + id: 261 + display_name: "Skateboard" +} +item { + name: "/m/078jl" + id: 262 + display_name: "Snake" +} +item { + name: "/m/0fbdv" + id: 263 + display_name: "Shellfish" +} +item { + name: "/m/0h23m" + id: 264 + display_name: "Sparrow" +} +item { + name: "/m/014j1m" + id: 265 + display_name: "Apple" +} +item { + name: "/m/03fwl" + id: 266 + display_name: "Goat" +} +item { + name: "/m/02y6n" + id: 267 + display_name: "French fries" +} +item { + name: "/m/06c7f7" + id: 268 + display_name: "Lipstick" +} +item { + name: "/m/026qbn5" + id: 269 + display_name: "studio couch" +} +item { + name: "/m/0cdn1" + id: 270 + display_name: "Hamburger" +} +item { + name: "/m/07clx" + id: 271 + display_name: "Tea" +} +item { + name: "/m/07cx4" + id: 272 + display_name: "Telephone" +} +item { + name: "/m/03g8mr" + id: 273 + display_name: "Baseball bat" +} +item { + name: "/m/0cnyhnx" + id: 274 + display_name: "Bull" +} +item { + name: "/m/01b7fy" + id: 275 + display_name: "Headphones" +} +item { + name: "/m/04gth" + id: 276 + display_name: "Lavender" +} +item { + name: "/m/0cyfs" + id: 277 + display_name: "Parachute" +} +item { + name: "/m/021mn" + id: 278 + display_name: "Cookie" +} +item { + name: "/m/07dm6" + id: 279 + display_name: "Tiger" +} +item { + name: "/m/0k1tl" + id: 280 + display_name: "Pen" +} +item { + name: "/m/0dv9c" + id: 281 + display_name: "Racket" +} +item { + name: "/m/0dt3t" + id: 282 + display_name: "Fork" +} +item { + name: "/m/04yqq2" + id: 283 + display_name: "Bust" +} +item { + name: "/m/01cmb2" + id: 284 + display_name: "Miniskirt" +} +item { + name: "/m/0gd36" + id: 285 + display_name: "Sea lion" +} +item { + name: "/m/033cnk" + id: 286 + display_name: "Egg" +} +item { + name: "/m/06ncr" + id: 287 + display_name: "Saxophone" +} +item { + name: "/m/03bk1" + id: 288 + display_name: "Giraffe" +} +item { + name: "/m/0bjyj5" + id: 289 + display_name: "Waste container" +} +item { + name: "/m/06__v" + id: 290 + display_name: "Snowboard" +} +item { + name: "/m/0qmmr" + id: 291 + display_name: "Wheelchair" +} +item { + name: "/m/01xgg_" + id: 292 + display_name: "Medical equipment" +} +item { + name: "/m/0czz2" + id: 293 + display_name: "Antelope" +} +item { + name: "/m/02l8p9" + id: 294 + display_name: "Harbor seal" +} +item { + name: "/m/09g1w" + id: 295 + display_name: "Toilet" +} +item { + name: "/m/0ll1f78" + id: 296 + display_name: "Shrimp" +} +item { + name: "/m/0cyhj_" + id: 297 + display_name: "Orange" +} +item { + name: "/m/0642b4" + id: 298 + display_name: "Cupboard" +} +item { + name: "/m/0h8mzrc" + id: 299 + display_name: "Wall clock" +} +item { + name: "/m/068zj" + id: 300 + display_name: "Pig" +} +item { + name: "/m/02z51p" + id: 301 + display_name: "Nightstand" +} +item { + name: "/m/0h8nr_l" + id: 302 + display_name: "Bathroom accessory" +} +item { + name: "/m/0388q" + id: 303 + display_name: "Grape" +} +item { + name: "/m/02hj4" + id: 304 + display_name: "Dolphin" +} +item { + name: "/m/01jfsr" + id: 305 + display_name: "Lantern" +} +item { + name: "/m/07gql" + id: 306 + display_name: "Trumpet" +} +item { + name: "/m/0h8my_4" + id: 307 + display_name: "Tennis racket" +} +item { + name: "/m/0n28_" + id: 308 + display_name: "Crab" +} +item { + name: "/m/0120dh" + id: 309 + display_name: "Sea turtle" +} +item { + 
name: "/m/020kz" + id: 310 + display_name: "Cannon" +} +item { + name: "/m/0mkg" + id: 311 + display_name: "Accordion" +} +item { + name: "/m/03c7gz" + id: 312 + display_name: "Door handle" +} +item { + name: "/m/09k_b" + id: 313 + display_name: "Lemon" +} +item { + name: "/m/031n1" + id: 314 + display_name: "Foot" +} +item { + name: "/m/04rmv" + id: 315 + display_name: "Mouse" +} +item { + name: "/m/084rd" + id: 316 + display_name: "Wok" +} +item { + name: "/m/02rgn06" + id: 317 + display_name: "Volleyball" +} +item { + name: "/m/05z55" + id: 318 + display_name: "Pasta" +} +item { + name: "/m/01r546" + id: 319 + display_name: "Earrings" +} +item { + name: "/m/09qck" + id: 320 + display_name: "Banana" +} +item { + name: "/m/012w5l" + id: 321 + display_name: "Ladder" +} +item { + name: "/m/01940j" + id: 322 + display_name: "Backpack" +} +item { + name: "/m/09f_2" + id: 323 + display_name: "Crocodile" +} +item { + name: "/m/02p3w7d" + id: 324 + display_name: "Roller skates" +} +item { + name: "/m/057p5t" + id: 325 + display_name: "Scoreboard" +} +item { + name: "/m/0d8zb" + id: 326 + display_name: "Jellyfish" +} +item { + name: "/m/01nq26" + id: 327 + display_name: "Sock" +} +item { + name: "/m/01x_v" + id: 328 + display_name: "Camel" +} +item { + name: "/m/05gqfk" + id: 329 + display_name: "Plastic bag" +} +item { + name: "/m/0cydv" + id: 330 + display_name: "Caterpillar" +} +item { + name: "/m/07030" + id: 331 + display_name: "Sushi" +} +item { + name: "/m/084zz" + id: 332 + display_name: "Whale" +} +item { + name: "/m/0c29q" + id: 333 + display_name: "Leopard" +} +item { + name: "/m/02zn6n" + id: 334 + display_name: "Barrel" +} +item { + name: "/m/03tw93" + id: 335 + display_name: "Fireplace" +} +item { + name: "/m/0fqt361" + id: 336 + display_name: "Stool" +} +item { + name: "/m/0f9_l" + id: 337 + display_name: "Snail" +} +item { + name: "/m/0gm28" + id: 338 + display_name: "Candy" +} +item { + name: "/m/09rvcxw" + id: 339 + display_name: "Rocket" +} +item { + name: "/m/01nkt" + id: 340 + display_name: "Cheese" +} +item { + name: "/m/04p0qw" + id: 341 + display_name: "Billiard table" +} +item { + name: "/m/03hj559" + id: 342 + display_name: "Mixing bowl" +} +item { + name: "/m/07pj7bq" + id: 343 + display_name: "Bowling equipment" +} +item { + name: "/m/04ctx" + id: 344 + display_name: "Knife" +} +item { + name: "/m/0703r8" + id: 345 + display_name: "Loveseat" +} +item { + name: "/m/03qrc" + id: 346 + display_name: "Hamster" +} +item { + name: "/m/020lf" + id: 347 + display_name: "Mouse" +} +item { + name: "/m/0by6g" + id: 348 + display_name: "Shark" +} +item { + name: "/m/01fh4r" + id: 349 + display_name: "Teapot" +} +item { + name: "/m/07c6l" + id: 350 + display_name: "Trombone" +} +item { + name: "/m/03bj1" + id: 351 + display_name: "Panda" +} +item { + name: "/m/0898b" + id: 352 + display_name: "Zebra" +} +item { + name: "/m/02x984l" + id: 353 + display_name: "Mechanical fan" +} +item { + name: "/m/0fj52s" + id: 354 + display_name: "Carrot" +} +item { + name: "/m/0cd4d" + id: 355 + display_name: "Cheetah" +} +item { + name: "/m/02068x" + id: 356 + display_name: "Gondola" +} +item { + name: "/m/01vbnl" + id: 357 + display_name: "Bidet" +} +item { + name: "/m/0449p" + id: 358 + display_name: "Jaguar" +} +item { + name: "/m/0gj37" + id: 359 + display_name: "Ladybug" +} +item { + name: "/m/0nl46" + id: 360 + display_name: "Crown" +} +item { + name: "/m/0152hh" + id: 361 + display_name: "Snowman" +} +item { + name: "/m/03dnzn" + id: 362 + display_name: "Bathtub" +} +item { + name: 
"/m/05_5p_0" + id: 363 + display_name: "Table tennis racket" +} +item { + name: "/m/02jfl0" + id: 364 + display_name: "Sombrero" +} +item { + name: "/m/01dxs" + id: 365 + display_name: "Brown bear" +} +item { + name: "/m/0cjq5" + id: 366 + display_name: "Lobster" +} +item { + name: "/m/040b_t" + id: 367 + display_name: "Refrigerator" +} +item { + name: "/m/0_cp5" + id: 368 + display_name: "Oyster" +} +item { + name: "/m/0gxl3" + id: 369 + display_name: "Handgun" +} +item { + name: "/m/029bxz" + id: 370 + display_name: "Oven" +} +item { + name: "/m/02zt3" + id: 371 + display_name: "Kite" +} +item { + name: "/m/03d443" + id: 372 + display_name: "Rhinoceros" +} +item { + name: "/m/0306r" + id: 373 + display_name: "Fox" +} +item { + name: "/m/0h8l4fh" + id: 374 + display_name: "Light bulb" +} +item { + name: "/m/0633h" + id: 375 + display_name: "Polar bear" +} +item { + name: "/m/01s55n" + id: 376 + display_name: "Suitcase" +} +item { + name: "/m/0hkxq" + id: 377 + display_name: "Broccoli" +} +item { + name: "/m/0cn6p" + id: 378 + display_name: "Otter" +} +item { + name: "/m/0dbzx" + id: 379 + display_name: "Mule" +} +item { + name: "/m/01dy8n" + id: 380 + display_name: "Woodpecker" +} +item { + name: "/m/01h8tj" + id: 381 + display_name: "Starfish" +} +item { + name: "/m/03s_tn" + id: 382 + display_name: "Kettle" +} +item { + name: "/m/01xs3r" + id: 383 + display_name: "Jet ski" +} +item { + name: "/m/031b6r" + id: 384 + display_name: "Window blind" +} +item { + name: "/m/06j2d" + id: 385 + display_name: "Raven" +} +item { + name: "/m/0hqkz" + id: 386 + display_name: "Grapefruit" +} +item { + name: "/m/01_5g" + id: 387 + display_name: "Chopsticks" +} +item { + name: "/m/02zvsm" + id: 388 + display_name: "Tart" +} +item { + name: "/m/0kpqd" + id: 389 + display_name: "Watermelon" +} +item { + name: "/m/015x4r" + id: 390 + display_name: "Cucumber" +} +item { + name: "/m/061hd_" + id: 391 + display_name: "Infant bed" +} +item { + name: "/m/04ylt" + id: 392 + display_name: "Missile" +} +item { + name: "/m/02wv84t" + id: 393 + display_name: "Gas stove" +} +item { + name: "/m/04y4h8h" + id: 394 + display_name: "Bathroom cabinet" +} +item { + name: "/m/01gllr" + id: 395 + display_name: "Beehive" +} +item { + name: "/m/0pcr" + id: 396 + display_name: "Alpaca" +} +item { + name: "/m/0jy4k" + id: 397 + display_name: "Doughnut" +} +item { + name: "/m/09f20" + id: 398 + display_name: "Hippopotamus" +} +item { + name: "/m/0mcx2" + id: 399 + display_name: "Ipod" +} +item { + name: "/m/04c0y" + id: 400 + display_name: "Kangaroo" +} +item { + name: "/m/0_k2" + id: 401 + display_name: "Ant" +} +item { + name: "/m/0jg57" + id: 402 + display_name: "Bell pepper" +} +item { + name: "/m/03fj2" + id: 403 + display_name: "Goldfish" +} +item { + name: "/m/03ldnb" + id: 404 + display_name: "Ceiling fan" +} +item { + name: "/m/06nrc" + id: 405 + display_name: "Shotgun" +} +item { + name: "/m/01btn" + id: 406 + display_name: "Barge" +} +item { + name: "/m/05vtc" + id: 407 + display_name: "Potato" +} +item { + name: "/m/08hvt4" + id: 408 + display_name: "Jug" +} +item { + name: "/m/0fx9l" + id: 409 + display_name: "Microwave oven" +} +item { + name: "/m/01h44" + id: 410 + display_name: "Bat" +} +item { + name: "/m/05n4y" + id: 411 + display_name: "Ostrich" +} +item { + name: "/m/0jly1" + id: 412 + display_name: "Turkey" +} +item { + name: "/m/06y5r" + id: 413 + display_name: "Sword" +} +item { + name: "/m/05ctyq" + id: 414 + display_name: "Tennis ball" +} +item { + name: "/m/0fp6w" + id: 415 + display_name: "Pineapple" +} 
+item { + name: "/m/0d4w1" + id: 416 + display_name: "Closet" +} +item { + name: "/m/02pv19" + id: 417 + display_name: "Stop sign" +} +item { + name: "/m/07crc" + id: 418 + display_name: "Taco" +} +item { + name: "/m/01dwwc" + id: 419 + display_name: "Pancake" +} +item { + name: "/m/01b9xk" + id: 420 + display_name: "Hot dog" +} +item { + name: "/m/013y1f" + id: 421 + display_name: "Organ" +} +item { + name: "/m/0m53l" + id: 422 + display_name: "Rays and skates" +} +item { + name: "/m/0174k2" + id: 423 + display_name: "Washing machine" +} +item { + name: "/m/01dwsz" + id: 424 + display_name: "Waffle" +} +item { + name: "/m/04vv5k" + id: 425 + display_name: "Snowplow" +} +item { + name: "/m/04cp_" + id: 426 + display_name: "Koala" +} +item { + name: "/m/0fz0h" + id: 427 + display_name: "Honeycomb" +} +item { + name: "/m/0llzx" + id: 428 + display_name: "Sewing machine" +} +item { + name: "/m/0319l" + id: 429 + display_name: "Horn" +} +item { + name: "/m/04v6l4" + id: 430 + display_name: "Frying pan" +} +item { + name: "/m/0dkzw" + id: 431 + display_name: "Seat belt" +} +item { + name: "/m/027pcv" + id: 432 + display_name: "Zucchini" +} +item { + name: "/m/0323sq" + id: 433 + display_name: "Golf cart" +} +item { + name: "/m/054fyh" + id: 434 + display_name: "Pitcher" +} +item { + name: "/m/01pns0" + id: 435 + display_name: "Fire hydrant" +} +item { + name: "/m/012n7d" + id: 436 + display_name: "Ambulance" +} +item { + name: "/m/044r5d" + id: 437 + display_name: "Golf ball" +} +item { + name: "/m/01krhy" + id: 438 + display_name: "Tiara" +} +item { + name: "/m/0dq75" + id: 439 + display_name: "Raccoon" +} +item { + name: "/m/0176mf" + id: 440 + display_name: "Belt" +} +item { + name: "/m/0h8lkj8" + id: 441 + display_name: "Corded phone" +} +item { + name: "/m/04tn4x" + id: 442 + display_name: "Swim cap" +} +item { + name: "/m/06l9r" + id: 443 + display_name: "Red panda" +} +item { + name: "/m/0cjs7" + id: 444 + display_name: "Asparagus" +} +item { + name: "/m/01lsmm" + id: 445 + display_name: "Scissors" +} +item { + name: "/m/01lcw4" + id: 446 + display_name: "Limousine" +} +item { + name: "/m/047j0r" + id: 447 + display_name: "Filing cabinet" +} +item { + name: "/m/01fb_0" + id: 448 + display_name: "Bagel" +} +item { + name: "/m/04169hn" + id: 449 + display_name: "Wood-burning stove" +} +item { + name: "/m/076bq" + id: 450 + display_name: "Segway" +} +item { + name: "/m/0hdln" + id: 451 + display_name: "Ruler" +} +item { + name: "/m/01g3x7" + id: 452 + display_name: "Bow and arrow" +} +item { + name: "/m/0l3ms" + id: 453 + display_name: "Balance beam" +} +item { + name: "/m/058qzx" + id: 454 + display_name: "Kitchen knife" +} +item { + name: "/m/0h8n6ft" + id: 455 + display_name: "Cake stand" +} +item { + name: "/m/018j2" + id: 456 + display_name: "Banjo" +} +item { + name: "/m/0l14j_" + id: 457 + display_name: "Flute" +} +item { + name: "/m/0wdt60w" + id: 458 + display_name: "Rugby ball" +} +item { + name: "/m/02gzp" + id: 459 + display_name: "Dagger" +} +item { + name: "/m/0h8n6f9" + id: 460 + display_name: "Dog bed" +} +item { + name: "/m/0fbw6" + id: 461 + display_name: "Cabbage" +} +item { + name: "/m/07kng9" + id: 462 + display_name: "Picnic basket" +} +item { + name: "/m/0dj6p" + id: 463 + display_name: "Peach" +} +item { + name: "/m/06pcq" + id: 464 + display_name: "Submarine sandwich" +} +item { + name: "/m/061_f" + id: 465 + display_name: "Pear" +} +item { + name: "/m/04g2r" + id: 466 + display_name: "Lynx" +} +item { + name: "/m/0jwn_" + id: 467 + display_name: "Pomegranate" +} 
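Editor's note, illustrative only (not part of this patch): each of these .pbtxt files is a StringIntLabelMap proto in text format (repeated item blocks with name, id and optionally display_name), so it can also be parsed directly. This sketch assumes the generated object_detection.protos.string_int_label_map_pb2 module is available, which requires the repo's .proto files to have been compiled; the small KITTI map earlier in this diff is used as the example.

from google.protobuf import text_format
from object_detection.protos import string_int_label_map_pb2

with open('object_detection/data/kitti_label_map.pbtxt') as f:
    label_map = string_int_label_map_pb2.StringIntLabelMap()
    text_format.Merge(f.read(), label_map)

for item in label_map.item:
    # kitti_label_map.pbtxt defines id 1 -> 'car' and id 2 -> 'pedestrian'.
    print(item.id, item.name)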
+item { + name: "/m/02f9f_" + id: 468 + display_name: "Shower" +} +item { + name: "/m/01f8m5" + id: 469 + display_name: "Blue jay" +} +item { + name: "/m/01m4t" + id: 470 + display_name: "Printer" +} +item { + name: "/m/0cl4p" + id: 471 + display_name: "Hedgehog" +} +item { + name: "/m/07xyvk" + id: 472 + display_name: "Coffeemaker" +} +item { + name: "/m/084hf" + id: 473 + display_name: "Worm" +} +item { + name: "/m/03v5tg" + id: 474 + display_name: "Drinking straw" +} +item { + name: "/m/0qjjc" + id: 475 + display_name: "Remote control" +} +item { + name: "/m/015x5n" + id: 476 + display_name: "Radish" +} +item { + name: "/m/0ccs93" + id: 477 + display_name: "Canary" +} +item { + name: "/m/0nybt" + id: 478 + display_name: "Seahorse" +} +item { + name: "/m/02vkqh8" + id: 479 + display_name: "Wardrobe" +} +item { + name: "/m/09gtd" + id: 480 + display_name: "Toilet paper" +} +item { + name: "/m/019h78" + id: 481 + display_name: "Centipede" +} +item { + name: "/m/015wgc" + id: 482 + display_name: "Croissant" +} +item { + name: "/m/01x3jk" + id: 483 + display_name: "Snowmobile" +} +item { + name: "/m/01j3zr" + id: 484 + display_name: "Burrito" +} +item { + name: "/m/0c568" + id: 485 + display_name: "Porcupine" +} +item { + name: "/m/02pdsw" + id: 486 + display_name: "Cutting board" +} +item { + name: "/m/029b3" + id: 487 + display_name: "Dice" +} +item { + name: "/m/03q5t" + id: 488 + display_name: "Harpsichord" +} +item { + name: "/m/0p833" + id: 489 + display_name: "Perfume" +} +item { + name: "/m/01d380" + id: 490 + display_name: "Drill" +} +item { + name: "/m/024d2" + id: 491 + display_name: "Calculator" +} +item { + name: "/m/0mw_6" + id: 492 + display_name: "Willow" +} +item { + name: "/m/01f91_" + id: 493 + display_name: "Pretzel" +} +item { + name: "/m/02g30s" + id: 494 + display_name: "Guacamole" +} +item { + name: "/m/01hrv5" + id: 495 + display_name: "Popcorn" +} +item { + name: "/m/03m5k" + id: 496 + display_name: "Harp" +} +item { + name: "/m/0162_1" + id: 497 + display_name: "Towel" +} +item { + name: "/m/063rgb" + id: 498 + display_name: "Mixer" +} +item { + name: "/m/06_72j" + id: 499 + display_name: "Digital clock" +} +item { + name: "/m/046dlr" + id: 500 + display_name: "Alarm clock" +} +item { + name: "/m/047v4b" + id: 501 + display_name: "Artichoke" +} +item { + name: "/m/04zpv" + id: 502 + display_name: "Milk" +} +item { + name: "/m/043nyj" + id: 503 + display_name: "Common fig" +} +item { + name: "/m/03bbps" + id: 504 + display_name: "Power plugs and sockets" +} +item { + name: "/m/02w3r3" + id: 505 + display_name: "Paper towel" +} +item { + name: "/m/02pjr4" + id: 506 + display_name: "Blender" +} +item { + name: "/m/0755b" + id: 507 + display_name: "Scorpion" +} +item { + name: "/m/02lbcq" + id: 508 + display_name: "Stretcher" +} +item { + name: "/m/0fldg" + id: 509 + display_name: "Mango" +} +item { + name: "/m/012074" + id: 510 + display_name: "Magpie" +} +item { + name: "/m/035vxb" + id: 511 + display_name: "Isopod" +} +item { + name: "/m/02w3_ws" + id: 512 + display_name: "Personal care" +} +item { + name: "/m/0f6nr" + id: 513 + display_name: "Unicycle" +} +item { + name: "/m/0420v5" + id: 514 + display_name: "Punching bag" +} +item { + name: "/m/0frqm" + id: 515 + display_name: "Envelope" +} +item { + name: "/m/03txqz" + id: 516 + display_name: "Scale" +} +item { + name: "/m/0271qf7" + id: 517 + display_name: "Wine rack" +} +item { + name: "/m/074d1" + id: 518 + display_name: "Submarine" +} +item { + name: "/m/08p92x" + id: 519 + display_name: "Cream" +} +item { + 
name: "/m/01j4z9" + id: 520 + display_name: "Chainsaw" +} +item { + name: "/m/0kpt_" + id: 521 + display_name: "Cantaloupe" +} +item { + name: "/m/0h8n27j" + id: 522 + display_name: "Serving tray" +} +item { + name: "/m/03y6mg" + id: 523 + display_name: "Food processor" +} +item { + name: "/m/04h8sr" + id: 524 + display_name: "Dumbbell" +} +item { + name: "/m/065h6l" + id: 525 + display_name: "Jacuzzi" +} +item { + name: "/m/02tsc9" + id: 526 + display_name: "Slow cooker" +} +item { + name: "/m/012ysf" + id: 527 + display_name: "Syringe" +} +item { + name: "/m/0ky7b" + id: 528 + display_name: "Dishwasher" +} +item { + name: "/m/02wg_p" + id: 529 + display_name: "Tree house" +} +item { + name: "/m/0584n8" + id: 530 + display_name: "Briefcase" +} +item { + name: "/m/03kt2w" + id: 531 + display_name: "Stationary bicycle" +} +item { + name: "/m/05kms" + id: 532 + display_name: "Oboe" +} +item { + name: "/m/030610" + id: 533 + display_name: "Treadmill" +} +item { + name: "/m/0lt4_" + id: 534 + display_name: "Binoculars" +} +item { + name: "/m/076lb9" + id: 535 + display_name: "Bench" +} +item { + name: "/m/02ctlc" + id: 536 + display_name: "Cricket ball" +} +item { + name: "/m/02x8cch" + id: 537 + display_name: "Salt and pepper shakers" +} +item { + name: "/m/09gys" + id: 538 + display_name: "Squid" +} +item { + name: "/m/03jbxj" + id: 539 + display_name: "Light switch" +} +item { + name: "/m/012xff" + id: 540 + display_name: "Toothbrush" +} +item { + name: "/m/0h8kx63" + id: 541 + display_name: "Spice rack" +} +item { + name: "/m/073g6" + id: 542 + display_name: "Stethoscope" +} +item { + name: "/m/02cvgx" + id: 543 + display_name: "Winter melon" +} +item { + name: "/m/027rl48" + id: 544 + display_name: "Ladle" +} +item { + name: "/m/01kb5b" + id: 545 + display_name: "Flashlight" +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/data/oid_object_detection_challenge_500_label_map.pbtxt b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/oid_object_detection_challenge_500_label_map.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..044f6d4c813729a693cac761f43a2246e07f7b6a --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/oid_object_detection_challenge_500_label_map.pbtxt @@ -0,0 +1,2500 @@ +item { + name: "/m/061hd_" + id: 1 + display_name: "Infant bed" +} +item { + name: "/m/06m11" + id: 2 + display_name: "Rose" +} +item { + name: "/m/03120" + id: 3 + display_name: "Flag" +} +item { + name: "/m/01kb5b" + id: 4 + display_name: "Flashlight" +} +item { + name: "/m/0120dh" + id: 5 + display_name: "Sea turtle" +} +item { + name: "/m/0dv5r" + id: 6 + display_name: "Camera" +} +item { + name: "/m/0jbk" + id: 7 + display_name: "Animal" +} +item { + name: "/m/0174n1" + id: 8 + display_name: "Glove" +} +item { + name: "/m/09f_2" + id: 9 + display_name: "Crocodile" +} +item { + name: "/m/01xq0k1" + id: 10 + display_name: "Cattle" +} +item { + name: "/m/03jm5" + id: 11 + display_name: "House" +} +item { + name: "/m/02g30s" + id: 12 + display_name: "Guacamole" +} +item { + name: "/m/05z6w" + id: 13 + display_name: "Penguin" +} +item { + name: "/m/01jfm_" + id: 14 + display_name: "Vehicle registration plate" +} +item { + name: "/m/076lb9" + id: 15 + display_name: "Training bench" +} +item { + name: "/m/0gj37" + id: 16 + display_name: "Ladybug" +} +item { + name: "/m/0k0pj" + id: 17 + display_name: "Human nose" +} +item { + name: "/m/0kpqd" + id: 18 + display_name: "Watermelon" +} +item { + name: "/m/0l14j_" + 
id: 19 + display_name: "Flute" +} +item { + name: "/m/0cyf8" + id: 20 + display_name: "Butterfly" +} +item { + name: "/m/0174k2" + id: 21 + display_name: "Washing machine" +} +item { + name: "/m/0dq75" + id: 22 + display_name: "Raccoon" +} +item { + name: "/m/076bq" + id: 23 + display_name: "Segway" +} +item { + name: "/m/07crc" + id: 24 + display_name: "Taco" +} +item { + name: "/m/0d8zb" + id: 25 + display_name: "Jellyfish" +} +item { + name: "/m/0fszt" + id: 26 + display_name: "Cake" +} +item { + name: "/m/0k1tl" + id: 27 + display_name: "Pen" +} +item { + name: "/m/020kz" + id: 28 + display_name: "Cannon" +} +item { + name: "/m/09728" + id: 29 + display_name: "Bread" +} +item { + name: "/m/07j7r" + id: 30 + display_name: "Tree" +} +item { + name: "/m/0fbdv" + id: 31 + display_name: "Shellfish" +} +item { + name: "/m/03ssj5" + id: 32 + display_name: "Bed" +} +item { + name: "/m/03qrc" + id: 33 + display_name: "Hamster" +} +item { + name: "/m/02dl1y" + id: 34 + display_name: "Hat" +} +item { + name: "/m/01k6s3" + id: 35 + display_name: "Toaster" +} +item { + name: "/m/02jfl0" + id: 36 + display_name: "Sombrero" +} +item { + name: "/m/01krhy" + id: 37 + display_name: "Tiara" +} +item { + name: "/m/04kkgm" + id: 38 + display_name: "Bowl" +} +item { + name: "/m/0ft9s" + id: 39 + display_name: "Dragonfly" +} +item { + name: "/m/0d_2m" + id: 40 + display_name: "Moths and butterflies" +} +item { + name: "/m/0czz2" + id: 41 + display_name: "Antelope" +} +item { + name: "/m/0f4s2w" + id: 42 + display_name: "Vegetable" +} +item { + name: "/m/07dd4" + id: 43 + display_name: "Torch" +} +item { + name: "/m/0cgh4" + id: 44 + display_name: "Building" +} +item { + name: "/m/03bbps" + id: 45 + display_name: "Power plugs and sockets" +} +item { + name: "/m/02pjr4" + id: 46 + display_name: "Blender" +} +item { + name: "/m/04p0qw" + id: 47 + display_name: "Billiard table" +} +item { + name: "/m/02pdsw" + id: 48 + display_name: "Cutting board" +} +item { + name: "/m/01yx86" + id: 49 + display_name: "Bronze sculpture" +} +item { + name: "/m/09dzg" + id: 50 + display_name: "Turtle" +} +item { + name: "/m/0hkxq" + id: 51 + display_name: "Broccoli" +} +item { + name: "/m/07dm6" + id: 52 + display_name: "Tiger" +} +item { + name: "/m/054_l" + id: 53 + display_name: "Mirror" +} +item { + name: "/m/01dws" + id: 54 + display_name: "Bear" +} +item { + name: "/m/027pcv" + id: 55 + display_name: "Zucchini" +} +item { + name: "/m/01d40f" + id: 56 + display_name: "Dress" +} +item { + name: "/m/02rgn06" + id: 57 + display_name: "Volleyball" +} +item { + name: "/m/0342h" + id: 58 + display_name: "Guitar" +} +item { + name: "/m/06bt6" + id: 59 + display_name: "Reptile" +} +item { + name: "/m/0323sq" + id: 60 + display_name: "Golf cart" +} +item { + name: "/m/02zvsm" + id: 61 + display_name: "Tart" +} +item { + name: "/m/02fq_6" + id: 62 + display_name: "Fedora" +} +item { + name: "/m/01lrl" + id: 63 + display_name: "Carnivore" +} +item { + name: "/m/0k4j" + id: 64 + display_name: "Car" +} +item { + name: "/m/04h7h" + id: 65 + display_name: "Lighthouse" +} +item { + name: "/m/07xyvk" + id: 66 + display_name: "Coffeemaker" +} +item { + name: "/m/03y6mg" + id: 67 + display_name: "Food processor" +} +item { + name: "/m/07r04" + id: 68 + display_name: "Truck" +} +item { + name: "/m/03__z0" + id: 69 + display_name: "Bookcase" +} +item { + name: "/m/019w40" + id: 70 + display_name: "Surfboard" +} +item { + name: "/m/09j5n" + id: 71 + display_name: "Footwear" +} +item { + name: "/m/0cvnqh" + id: 72 + display_name: "Bench" +} +item 
{ + name: "/m/01llwg" + id: 73 + display_name: "Necklace" +} +item { + name: "/m/0c9ph5" + id: 74 + display_name: "Flower" +} +item { + name: "/m/015x5n" + id: 75 + display_name: "Radish" +} +item { + name: "/m/0gd2v" + id: 76 + display_name: "Marine mammal" +} +item { + name: "/m/04v6l4" + id: 77 + display_name: "Frying pan" +} +item { + name: "/m/02jz0l" + id: 78 + display_name: "Tap" +} +item { + name: "/m/0dj6p" + id: 79 + display_name: "Peach" +} +item { + name: "/m/04ctx" + id: 80 + display_name: "Knife" +} +item { + name: "/m/080hkjn" + id: 81 + display_name: "Handbag" +} +item { + name: "/m/01c648" + id: 82 + display_name: "Laptop" +} +item { + name: "/m/01j61q" + id: 83 + display_name: "Tent" +} +item { + name: "/m/012n7d" + id: 84 + display_name: "Ambulance" +} +item { + name: "/m/025nd" + id: 85 + display_name: "Christmas tree" +} +item { + name: "/m/09csl" + id: 86 + display_name: "Eagle" +} +item { + name: "/m/01lcw4" + id: 87 + display_name: "Limousine" +} +item { + name: "/m/0h8n5zk" + id: 88 + display_name: "Kitchen & dining room table" +} +item { + name: "/m/0633h" + id: 89 + display_name: "Polar bear" +} +item { + name: "/m/01fdzj" + id: 90 + display_name: "Tower" +} +item { + name: "/m/01226z" + id: 91 + display_name: "Football" +} +item { + name: "/m/0mw_6" + id: 92 + display_name: "Willow" +} +item { + name: "/m/04hgtk" + id: 93 + display_name: "Human head" +} +item { + name: "/m/02pv19" + id: 94 + display_name: "Stop sign" +} +item { + name: "/m/09qck" + id: 95 + display_name: "Banana" +} +item { + name: "/m/063rgb" + id: 96 + display_name: "Mixer" +} +item { + name: "/m/0lt4_" + id: 97 + display_name: "Binoculars" +} +item { + name: "/m/0270h" + id: 98 + display_name: "Dessert" +} +item { + name: "/m/01h3n" + id: 99 + display_name: "Bee" +} +item { + name: "/m/01mzpv" + id: 100 + display_name: "Chair" +} +item { + name: "/m/04169hn" + id: 101 + display_name: "Wood-burning stove" +} +item { + name: "/m/0fm3zh" + id: 102 + display_name: "Flowerpot" +} +item { + name: "/m/0d20w4" + id: 103 + display_name: "Beaker" +} +item { + name: "/m/0_cp5" + id: 104 + display_name: "Oyster" +} +item { + name: "/m/01dy8n" + id: 105 + display_name: "Woodpecker" +} +item { + name: "/m/03m5k" + id: 106 + display_name: "Harp" +} +item { + name: "/m/03dnzn" + id: 107 + display_name: "Bathtub" +} +item { + name: "/m/0h8mzrc" + id: 108 + display_name: "Wall clock" +} +item { + name: "/m/0h8mhzd" + id: 109 + display_name: "Sports uniform" +} +item { + name: "/m/03d443" + id: 110 + display_name: "Rhinoceros" +} +item { + name: "/m/01gllr" + id: 111 + display_name: "Beehive" +} +item { + name: "/m/0642b4" + id: 112 + display_name: "Cupboard" +} +item { + name: "/m/09b5t" + id: 113 + display_name: "Chicken" +} +item { + name: "/m/04yx4" + id: 114 + display_name: "Man" +} +item { + name: "/m/01f8m5" + id: 115 + display_name: "Blue jay" +} +item { + name: "/m/015x4r" + id: 116 + display_name: "Cucumber" +} +item { + name: "/m/01j51" + id: 117 + display_name: "Balloon" +} +item { + name: "/m/02zt3" + id: 118 + display_name: "Kite" +} +item { + name: "/m/03tw93" + id: 119 + display_name: "Fireplace" +} +item { + name: "/m/01jfsr" + id: 120 + display_name: "Lantern" +} +item { + name: "/m/04ylt" + id: 121 + display_name: "Missile" +} +item { + name: "/m/0bt_c3" + id: 122 + display_name: "Book" +} +item { + name: "/m/0cmx8" + id: 123 + display_name: "Spoon" +} +item { + name: "/m/0hqkz" + id: 124 + display_name: "Grapefruit" +} +item { + name: "/m/071qp" + id: 125 + display_name: "Squirrel" +} +item 
{ + name: "/m/0cyhj_" + id: 126 + display_name: "Orange" +} +item { + name: "/m/01xygc" + id: 127 + display_name: "Coat" +} +item { + name: "/m/0420v5" + id: 128 + display_name: "Punching bag" +} +item { + name: "/m/0898b" + id: 129 + display_name: "Zebra" +} +item { + name: "/m/01knjb" + id: 130 + display_name: "Billboard" +} +item { + name: "/m/0199g" + id: 131 + display_name: "Bicycle" +} +item { + name: "/m/03c7gz" + id: 132 + display_name: "Door handle" +} +item { + name: "/m/02x984l" + id: 133 + display_name: "Mechanical fan" +} +item { + name: "/m/04zwwv" + id: 134 + display_name: "Ring binder" +} +item { + name: "/m/04bcr3" + id: 135 + display_name: "Table" +} +item { + name: "/m/0gv1x" + id: 136 + display_name: "Parrot" +} +item { + name: "/m/01nq26" + id: 137 + display_name: "Sock" +} +item { + name: "/m/02s195" + id: 138 + display_name: "Vase" +} +item { + name: "/m/083kb" + id: 139 + display_name: "Weapon" +} +item { + name: "/m/06nrc" + id: 140 + display_name: "Shotgun" +} +item { + name: "/m/0jyfg" + id: 141 + display_name: "Glasses" +} +item { + name: "/m/0nybt" + id: 142 + display_name: "Seahorse" +} +item { + name: "/m/0176mf" + id: 143 + display_name: "Belt" +} +item { + name: "/m/01rzcn" + id: 144 + display_name: "Watercraft" +} +item { + name: "/m/0d4v4" + id: 145 + display_name: "Window" +} +item { + name: "/m/03bk1" + id: 146 + display_name: "Giraffe" +} +item { + name: "/m/096mb" + id: 147 + display_name: "Lion" +} +item { + name: "/m/0h9mv" + id: 148 + display_name: "Tire" +} +item { + name: "/m/07yv9" + id: 149 + display_name: "Vehicle" +} +item { + name: "/m/0ph39" + id: 150 + display_name: "Canoe" +} +item { + name: "/m/01rkbr" + id: 151 + display_name: "Tie" +} +item { + name: "/m/0gjbg72" + id: 152 + display_name: "Shelf" +} +item { + name: "/m/06z37_" + id: 153 + display_name: "Picture frame" +} +item { + name: "/m/01m4t" + id: 154 + display_name: "Printer" +} +item { + name: "/m/035r7c" + id: 155 + display_name: "Human leg" +} +item { + name: "/m/019jd" + id: 156 + display_name: "Boat" +} +item { + name: "/m/02tsc9" + id: 157 + display_name: "Slow cooker" +} +item { + name: "/m/015wgc" + id: 158 + display_name: "Croissant" +} +item { + name: "/m/0c06p" + id: 159 + display_name: "Candle" +} +item { + name: "/m/01dwwc" + id: 160 + display_name: "Pancake" +} +item { + name: "/m/034c16" + id: 161 + display_name: "Pillow" +} +item { + name: "/m/0242l" + id: 162 + display_name: "Coin" +} +item { + name: "/m/02lbcq" + id: 163 + display_name: "Stretcher" +} +item { + name: "/m/03nfch" + id: 164 + display_name: "Sandal" +} +item { + name: "/m/03bt1vf" + id: 165 + display_name: "Woman" +} +item { + name: "/m/01lynh" + id: 166 + display_name: "Stairs" +} +item { + name: "/m/03q5t" + id: 167 + display_name: "Harpsichord" +} +item { + name: "/m/0fqt361" + id: 168 + display_name: "Stool" +} +item { + name: "/m/01bjv" + id: 169 + display_name: "Bus" +} +item { + name: "/m/01s55n" + id: 170 + display_name: "Suitcase" +} +item { + name: "/m/0283dt1" + id: 171 + display_name: "Human mouth" +} +item { + name: "/m/01z1kdw" + id: 172 + display_name: "Juice" +} +item { + name: "/m/016m2d" + id: 173 + display_name: "Skull" +} +item { + name: "/m/02dgv" + id: 174 + display_name: "Door" +} +item { + name: "/m/07y_7" + id: 175 + display_name: "Violin" +} +item { + name: "/m/01_5g" + id: 176 + display_name: "Chopsticks" +} +item { + name: "/m/06_72j" + id: 177 + display_name: "Digital clock" +} +item { + name: "/m/0ftb8" + id: 178 + display_name: "Sunflower" +} +item { + name: 
"/m/0c29q" + id: 179 + display_name: "Leopard" +} +item { + name: "/m/0jg57" + id: 180 + display_name: "Bell pepper" +} +item { + name: "/m/02l8p9" + id: 181 + display_name: "Harbor seal" +} +item { + name: "/m/078jl" + id: 182 + display_name: "Snake" +} +item { + name: "/m/0llzx" + id: 183 + display_name: "Sewing machine" +} +item { + name: "/m/0dbvp" + id: 184 + display_name: "Goose" +} +item { + name: "/m/09ct_" + id: 185 + display_name: "Helicopter" +} +item { + name: "/m/0dkzw" + id: 186 + display_name: "Seat belt" +} +item { + name: "/m/02p5f1q" + id: 187 + display_name: "Coffee cup" +} +item { + name: "/m/0fx9l" + id: 188 + display_name: "Microwave oven" +} +item { + name: "/m/01b9xk" + id: 189 + display_name: "Hot dog" +} +item { + name: "/m/0b3fp9" + id: 190 + display_name: "Countertop" +} +item { + name: "/m/0h8n27j" + id: 191 + display_name: "Serving tray" +} +item { + name: "/m/0h8n6f9" + id: 192 + display_name: "Dog bed" +} +item { + name: "/m/01599" + id: 193 + display_name: "Beer" +} +item { + name: "/m/017ftj" + id: 194 + display_name: "Sunglasses" +} +item { + name: "/m/044r5d" + id: 195 + display_name: "Golf ball" +} +item { + name: "/m/01dwsz" + id: 196 + display_name: "Waffle" +} +item { + name: "/m/0cdl1" + id: 197 + display_name: "Palm tree" +} +item { + name: "/m/07gql" + id: 198 + display_name: "Trumpet" +} +item { + name: "/m/0hdln" + id: 199 + display_name: "Ruler" +} +item { + name: "/m/0zvk5" + id: 200 + display_name: "Helmet" +} +item { + name: "/m/012w5l" + id: 201 + display_name: "Ladder" +} +item { + name: "/m/021sj1" + id: 202 + display_name: "Office building" +} +item { + name: "/m/0bh9flk" + id: 203 + display_name: "Tablet computer" +} +item { + name: "/m/09gtd" + id: 204 + display_name: "Toilet paper" +} +item { + name: "/m/0jwn_" + id: 205 + display_name: "Pomegranate" +} +item { + name: "/m/02wv6h6" + id: 206 + display_name: "Skirt" +} +item { + name: "/m/02wv84t" + id: 207 + display_name: "Gas stove" +} +item { + name: "/m/021mn" + id: 208 + display_name: "Cookie" +} +item { + name: "/m/018p4k" + id: 209 + display_name: "Cart" +} +item { + name: "/m/06j2d" + id: 210 + display_name: "Raven" +} +item { + name: "/m/033cnk" + id: 211 + display_name: "Egg" +} +item { + name: "/m/01j3zr" + id: 212 + display_name: "Burrito" +} +item { + name: "/m/03fwl" + id: 213 + display_name: "Goat" +} +item { + name: "/m/058qzx" + id: 214 + display_name: "Kitchen knife" +} +item { + name: "/m/06_fw" + id: 215 + display_name: "Skateboard" +} +item { + name: "/m/02x8cch" + id: 216 + display_name: "Salt and pepper shakers" +} +item { + name: "/m/04g2r" + id: 217 + display_name: "Lynx" +} +item { + name: "/m/01b638" + id: 218 + display_name: "Boot" +} +item { + name: "/m/099ssp" + id: 219 + display_name: "Platter" +} +item { + name: "/m/071p9" + id: 220 + display_name: "Ski" +} +item { + name: "/m/01gkx_" + id: 221 + display_name: "Swimwear" +} +item { + name: "/m/0b_rs" + id: 222 + display_name: "Swimming pool" +} +item { + name: "/m/03v5tg" + id: 223 + display_name: "Drinking straw" +} +item { + name: "/m/01j5ks" + id: 224 + display_name: "Wrench" +} +item { + name: "/m/026t6" + id: 225 + display_name: "Drum" +} +item { + name: "/m/0_k2" + id: 226 + display_name: "Ant" +} +item { + name: "/m/039xj_" + id: 227 + display_name: "Human ear" +} +item { + name: "/m/01b7fy" + id: 228 + display_name: "Headphones" +} +item { + name: "/m/0220r2" + id: 229 + display_name: "Fountain" +} +item { + name: "/m/015p6" + id: 230 + display_name: "Bird" +} +item { + name: "/m/0fly7" + id: 231 
+ display_name: "Jeans" +} +item { + name: "/m/07c52" + id: 232 + display_name: "Television" +} +item { + name: "/m/0n28_" + id: 233 + display_name: "Crab" +} +item { + name: "/m/0hg7b" + id: 234 + display_name: "Microphone" +} +item { + name: "/m/019dx1" + id: 235 + display_name: "Home appliance" +} +item { + name: "/m/04vv5k" + id: 236 + display_name: "Snowplow" +} +item { + name: "/m/020jm" + id: 237 + display_name: "Beetle" +} +item { + name: "/m/047v4b" + id: 238 + display_name: "Artichoke" +} +item { + name: "/m/01xs3r" + id: 239 + display_name: "Jet ski" +} +item { + name: "/m/03kt2w" + id: 240 + display_name: "Stationary bicycle" +} +item { + name: "/m/03q69" + id: 241 + display_name: "Human hair" +} +item { + name: "/m/01dxs" + id: 242 + display_name: "Brown bear" +} +item { + name: "/m/01h8tj" + id: 243 + display_name: "Starfish" +} +item { + name: "/m/0dt3t" + id: 244 + display_name: "Fork" +} +item { + name: "/m/0cjq5" + id: 245 + display_name: "Lobster" +} +item { + name: "/m/0h8lkj8" + id: 246 + display_name: "Corded phone" +} +item { + name: "/m/0271t" + id: 247 + display_name: "Drink" +} +item { + name: "/m/03q5c7" + id: 248 + display_name: "Saucer" +} +item { + name: "/m/0fj52s" + id: 249 + display_name: "Carrot" +} +item { + name: "/m/03vt0" + id: 250 + display_name: "Insect" +} +item { + name: "/m/01x3z" + id: 251 + display_name: "Clock" +} +item { + name: "/m/0d5gx" + id: 252 + display_name: "Castle" +} +item { + name: "/m/0h8my_4" + id: 253 + display_name: "Tennis racket" +} +item { + name: "/m/03ldnb" + id: 254 + display_name: "Ceiling fan" +} +item { + name: "/m/0cjs7" + id: 255 + display_name: "Asparagus" +} +item { + name: "/m/0449p" + id: 256 + display_name: "Jaguar" +} +item { + name: "/m/04szw" + id: 257 + display_name: "Musical instrument" +} +item { + name: "/m/07jdr" + id: 258 + display_name: "Train" +} +item { + name: "/m/01yrx" + id: 259 + display_name: "Cat" +} +item { + name: "/m/06c54" + id: 260 + display_name: "Rifle" +} +item { + name: "/m/04h8sr" + id: 261 + display_name: "Dumbbell" +} +item { + name: "/m/050k8" + id: 262 + display_name: "Mobile phone" +} +item { + name: "/m/0pg52" + id: 263 + display_name: "Taxi" +} +item { + name: "/m/02f9f_" + id: 264 + display_name: "Shower" +} +item { + name: "/m/054fyh" + id: 265 + display_name: "Pitcher" +} +item { + name: "/m/09k_b" + id: 266 + display_name: "Lemon" +} +item { + name: "/m/03xxp" + id: 267 + display_name: "Invertebrate" +} +item { + name: "/m/0jly1" + id: 268 + display_name: "Turkey" +} +item { + name: "/m/06k2mb" + id: 269 + display_name: "High heels" +} +item { + name: "/m/04yqq2" + id: 270 + display_name: "Bust" +} +item { + name: "/m/0bwd_0j" + id: 271 + display_name: "Elephant" +} +item { + name: "/m/02h19r" + id: 272 + display_name: "Scarf" +} +item { + name: "/m/02zn6n" + id: 273 + display_name: "Barrel" +} +item { + name: "/m/07c6l" + id: 274 + display_name: "Trombone" +} +item { + name: "/m/05zsy" + id: 275 + display_name: "Pumpkin" +} +item { + name: "/m/025dyy" + id: 276 + display_name: "Box" +} +item { + name: "/m/07j87" + id: 277 + display_name: "Tomato" +} +item { + name: "/m/09ld4" + id: 278 + display_name: "Frog" +} +item { + name: "/m/01vbnl" + id: 279 + display_name: "Bidet" +} +item { + name: "/m/0dzct" + id: 280 + display_name: "Human face" +} +item { + name: "/m/03fp41" + id: 281 + display_name: "Houseplant" +} +item { + name: "/m/0h2r6" + id: 282 + display_name: "Van" +} +item { + name: "/m/0by6g" + id: 283 + display_name: "Shark" +} +item { + name: "/m/0cxn2" + id: 284 + 
display_name: "Ice cream" +} +item { + name: "/m/04tn4x" + id: 285 + display_name: "Swim cap" +} +item { + name: "/m/0f6wt" + id: 286 + display_name: "Falcon" +} +item { + name: "/m/05n4y" + id: 287 + display_name: "Ostrich" +} +item { + name: "/m/0gxl3" + id: 288 + display_name: "Handgun" +} +item { + name: "/m/02d9qx" + id: 289 + display_name: "Whiteboard" +} +item { + name: "/m/04m9y" + id: 290 + display_name: "Lizard" +} +item { + name: "/m/05z55" + id: 291 + display_name: "Pasta" +} +item { + name: "/m/01x3jk" + id: 292 + display_name: "Snowmobile" +} +item { + name: "/m/0h8l4fh" + id: 293 + display_name: "Light bulb" +} +item { + name: "/m/031b6r" + id: 294 + display_name: "Window blind" +} +item { + name: "/m/01tcjp" + id: 295 + display_name: "Muffin" +} +item { + name: "/m/01f91_" + id: 296 + display_name: "Pretzel" +} +item { + name: "/m/02522" + id: 297 + display_name: "Computer monitor" +} +item { + name: "/m/0319l" + id: 298 + display_name: "Horn" +} +item { + name: "/m/0c_jw" + id: 299 + display_name: "Furniture" +} +item { + name: "/m/0l515" + id: 300 + display_name: "Sandwich" +} +item { + name: "/m/0306r" + id: 301 + display_name: "Fox" +} +item { + name: "/m/0crjs" + id: 302 + display_name: "Convenience store" +} +item { + name: "/m/0ch_cf" + id: 303 + display_name: "Fish" +} +item { + name: "/m/02xwb" + id: 304 + display_name: "Fruit" +} +item { + name: "/m/01r546" + id: 305 + display_name: "Earrings" +} +item { + name: "/m/03rszm" + id: 306 + display_name: "Curtain" +} +item { + name: "/m/0388q" + id: 307 + display_name: "Grape" +} +item { + name: "/m/03m3pdh" + id: 308 + display_name: "Sofa bed" +} +item { + name: "/m/03k3r" + id: 309 + display_name: "Horse" +} +item { + name: "/m/0hf58v5" + id: 310 + display_name: "Luggage and bags" +} +item { + name: "/m/01y9k5" + id: 311 + display_name: "Desk" +} +item { + name: "/m/05441v" + id: 312 + display_name: "Crutch" +} +item { + name: "/m/03p3bw" + id: 313 + display_name: "Bicycle helmet" +} +item { + name: "/m/0175cv" + id: 314 + display_name: "Tick" +} +item { + name: "/m/0cmf2" + id: 315 + display_name: "Airplane" +} +item { + name: "/m/0ccs93" + id: 316 + display_name: "Canary" +} +item { + name: "/m/02d1br" + id: 317 + display_name: "Spatula" +} +item { + name: "/m/0gjkl" + id: 318 + display_name: "Watch" +} +item { + name: "/m/0jqgx" + id: 319 + display_name: "Lily" +} +item { + name: "/m/0h99cwc" + id: 320 + display_name: "Kitchen appliance" +} +item { + name: "/m/047j0r" + id: 321 + display_name: "Filing cabinet" +} +item { + name: "/m/0k5j" + id: 322 + display_name: "Aircraft" +} +item { + name: "/m/0h8n6ft" + id: 323 + display_name: "Cake stand" +} +item { + name: "/m/0gm28" + id: 324 + display_name: "Candy" +} +item { + name: "/m/0130jx" + id: 325 + display_name: "Sink" +} +item { + name: "/m/04rmv" + id: 326 + display_name: "Mouse" +} +item { + name: "/m/081qc" + id: 327 + display_name: "Wine" +} +item { + name: "/m/0qmmr" + id: 328 + display_name: "Wheelchair" +} +item { + name: "/m/03fj2" + id: 329 + display_name: "Goldfish" +} +item { + name: "/m/040b_t" + id: 330 + display_name: "Refrigerator" +} +item { + name: "/m/02y6n" + id: 331 + display_name: "French fries" +} +item { + name: "/m/0fqfqc" + id: 332 + display_name: "Drawer" +} +item { + name: "/m/030610" + id: 333 + display_name: "Treadmill" +} +item { + name: "/m/07kng9" + id: 334 + display_name: "Picnic basket" +} +item { + name: "/m/029b3" + id: 335 + display_name: "Dice" +} +item { + name: "/m/0fbw6" + id: 336 + display_name: "Cabbage" +} +item { + 
name: "/m/07qxg_" + id: 337 + display_name: "Football helmet" +} +item { + name: "/m/068zj" + id: 338 + display_name: "Pig" +} +item { + name: "/m/01g317" + id: 339 + display_name: "Person" +} +item { + name: "/m/01bfm9" + id: 340 + display_name: "Shorts" +} +item { + name: "/m/02068x" + id: 341 + display_name: "Gondola" +} +item { + name: "/m/0fz0h" + id: 342 + display_name: "Honeycomb" +} +item { + name: "/m/0jy4k" + id: 343 + display_name: "Doughnut" +} +item { + name: "/m/05kyg_" + id: 344 + display_name: "Chest of drawers" +} +item { + name: "/m/01prls" + id: 345 + display_name: "Land vehicle" +} +item { + name: "/m/01h44" + id: 346 + display_name: "Bat" +} +item { + name: "/m/08pbxl" + id: 347 + display_name: "Monkey" +} +item { + name: "/m/02gzp" + id: 348 + display_name: "Dagger" +} +item { + name: "/m/04brg2" + id: 349 + display_name: "Tableware" +} +item { + name: "/m/031n1" + id: 350 + display_name: "Human foot" +} +item { + name: "/m/02jvh9" + id: 351 + display_name: "Mug" +} +item { + name: "/m/046dlr" + id: 352 + display_name: "Alarm clock" +} +item { + name: "/m/0h8ntjv" + id: 353 + display_name: "Pressure cooker" +} +item { + name: "/m/0k65p" + id: 354 + display_name: "Human hand" +} +item { + name: "/m/011k07" + id: 355 + display_name: "Tortoise" +} +item { + name: "/m/03grzl" + id: 356 + display_name: "Baseball glove" +} +item { + name: "/m/06y5r" + id: 357 + display_name: "Sword" +} +item { + name: "/m/061_f" + id: 358 + display_name: "Pear" +} +item { + name: "/m/01cmb2" + id: 359 + display_name: "Miniskirt" +} +item { + name: "/m/01mqdt" + id: 360 + display_name: "Traffic sign" +} +item { + name: "/m/05r655" + id: 361 + display_name: "Girl" +} +item { + name: "/m/02p3w7d" + id: 362 + display_name: "Roller skates" +} +item { + name: "/m/029tx" + id: 363 + display_name: "Dinosaur" +} +item { + name: "/m/04m6gz" + id: 364 + display_name: "Porch" +} +item { + name: "/m/015h_t" + id: 365 + display_name: "Human beard" +} +item { + name: "/m/06pcq" + id: 366 + display_name: "Submarine sandwich" +} +item { + name: "/m/01bms0" + id: 367 + display_name: "Screwdriver" +} +item { + name: "/m/07fbm7" + id: 368 + display_name: "Strawberry" +} +item { + name: "/m/09tvcd" + id: 369 + display_name: "Wine glass" +} +item { + name: "/m/06nwz" + id: 370 + display_name: "Seafood" +} +item { + name: "/m/0dv9c" + id: 371 + display_name: "Racket" +} +item { + name: "/m/083wq" + id: 372 + display_name: "Wheel" +} +item { + name: "/m/0gd36" + id: 373 + display_name: "Sea lion" +} +item { + name: "/m/0138tl" + id: 374 + display_name: "Toy" +} +item { + name: "/m/07clx" + id: 375 + display_name: "Tea" +} +item { + name: "/m/05ctyq" + id: 376 + display_name: "Tennis ball" +} +item { + name: "/m/0bjyj5" + id: 377 + display_name: "Waste container" +} +item { + name: "/m/0dbzx" + id: 378 + display_name: "Mule" +} +item { + name: "/m/02ctlc" + id: 379 + display_name: "Cricket ball" +} +item { + name: "/m/0fp6w" + id: 380 + display_name: "Pineapple" +} +item { + name: "/m/0djtd" + id: 381 + display_name: "Coconut" +} +item { + name: "/m/0167gd" + id: 382 + display_name: "Doll" +} +item { + name: "/m/078n6m" + id: 383 + display_name: "Coffee table" +} +item { + name: "/m/0152hh" + id: 384 + display_name: "Snowman" +} +item { + name: "/m/04gth" + id: 385 + display_name: "Lavender" +} +item { + name: "/m/0ll1f78" + id: 386 + display_name: "Shrimp" +} +item { + name: "/m/0cffdh" + id: 387 + display_name: "Maple" +} +item { + name: "/m/025rp__" + id: 388 + display_name: "Cowboy hat" +} +item { + name: 
"/m/02_n6y" + id: 389 + display_name: "Goggles" +} +item { + name: "/m/0wdt60w" + id: 390 + display_name: "Rugby ball" +} +item { + name: "/m/0cydv" + id: 391 + display_name: "Caterpillar" +} +item { + name: "/m/01n5jq" + id: 392 + display_name: "Poster" +} +item { + name: "/m/09rvcxw" + id: 393 + display_name: "Rocket" +} +item { + name: "/m/013y1f" + id: 394 + display_name: "Organ" +} +item { + name: "/m/06ncr" + id: 395 + display_name: "Saxophone" +} +item { + name: "/m/015qff" + id: 396 + display_name: "Traffic light" +} +item { + name: "/m/024g6" + id: 397 + display_name: "Cocktail" +} +item { + name: "/m/05gqfk" + id: 398 + display_name: "Plastic bag" +} +item { + name: "/m/0dv77" + id: 399 + display_name: "Squash" +} +item { + name: "/m/052sf" + id: 400 + display_name: "Mushroom" +} +item { + name: "/m/0cdn1" + id: 401 + display_name: "Hamburger" +} +item { + name: "/m/03jbxj" + id: 402 + display_name: "Light switch" +} +item { + name: "/m/0cyfs" + id: 403 + display_name: "Parachute" +} +item { + name: "/m/0kmg4" + id: 404 + display_name: "Teddy bear" +} +item { + name: "/m/02cvgx" + id: 405 + display_name: "Winter melon" +} +item { + name: "/m/09kx5" + id: 406 + display_name: "Deer" +} +item { + name: "/m/057cc" + id: 407 + display_name: "Musical keyboard" +} +item { + name: "/m/02pkr5" + id: 408 + display_name: "Plumbing fixture" +} +item { + name: "/m/057p5t" + id: 409 + display_name: "Scoreboard" +} +item { + name: "/m/03g8mr" + id: 410 + display_name: "Baseball bat" +} +item { + name: "/m/0frqm" + id: 411 + display_name: "Envelope" +} +item { + name: "/m/03m3vtv" + id: 412 + display_name: "Adhesive tape" +} +item { + name: "/m/0584n8" + id: 413 + display_name: "Briefcase" +} +item { + name: "/m/014y4n" + id: 414 + display_name: "Paddle" +} +item { + name: "/m/01g3x7" + id: 415 + display_name: "Bow and arrow" +} +item { + name: "/m/07cx4" + id: 416 + display_name: "Telephone" +} +item { + name: "/m/07bgp" + id: 417 + display_name: "Sheep" +} +item { + name: "/m/032b3c" + id: 418 + display_name: "Jacket" +} +item { + name: "/m/01bl7v" + id: 419 + display_name: "Boy" +} +item { + name: "/m/0663v" + id: 420 + display_name: "Pizza" +} +item { + name: "/m/0cn6p" + id: 421 + display_name: "Otter" +} +item { + name: "/m/02rdsp" + id: 422 + display_name: "Office supplies" +} +item { + name: "/m/02crq1" + id: 423 + display_name: "Couch" +} +item { + name: "/m/01xqw" + id: 424 + display_name: "Cello" +} +item { + name: "/m/0cnyhnx" + id: 425 + display_name: "Bull" +} +item { + name: "/m/01x_v" + id: 426 + display_name: "Camel" +} +item { + name: "/m/018xm" + id: 427 + display_name: "Ball" +} +item { + name: "/m/09ddx" + id: 428 + display_name: "Duck" +} +item { + name: "/m/084zz" + id: 429 + display_name: "Whale" +} +item { + name: "/m/01n4qj" + id: 430 + display_name: "Shirt" +} +item { + name: "/m/07cmd" + id: 431 + display_name: "Tank" +} +item { + name: "/m/04_sv" + id: 432 + display_name: "Motorcycle" +} +item { + name: "/m/0mkg" + id: 433 + display_name: "Accordion" +} +item { + name: "/m/09d5_" + id: 434 + display_name: "Owl" +} +item { + name: "/m/0c568" + id: 435 + display_name: "Porcupine" +} +item { + name: "/m/02wbtzl" + id: 436 + display_name: "Sun hat" +} +item { + name: "/m/05bm6" + id: 437 + display_name: "Nail" +} +item { + name: "/m/01lsmm" + id: 438 + display_name: "Scissors" +} +item { + name: "/m/0dftk" + id: 439 + display_name: "Swan" +} +item { + name: "/m/0dtln" + id: 440 + display_name: "Lamp" +} +item { + name: "/m/0nl46" + id: 441 + display_name: "Crown" +} 
+item { + name: "/m/05r5c" + id: 442 + display_name: "Piano" +} +item { + name: "/m/06msq" + id: 443 + display_name: "Sculpture" +} +item { + name: "/m/0cd4d" + id: 444 + display_name: "Cheetah" +} +item { + name: "/m/05kms" + id: 445 + display_name: "Oboe" +} +item { + name: "/m/02jnhm" + id: 446 + display_name: "Tin can" +} +item { + name: "/m/0fldg" + id: 447 + display_name: "Mango" +} +item { + name: "/m/073bxn" + id: 448 + display_name: "Tripod" +} +item { + name: "/m/029bxz" + id: 449 + display_name: "Oven" +} +item { + name: "/m/020lf" + id: 450 + display_name: "Computer mouse" +} +item { + name: "/m/01btn" + id: 451 + display_name: "Barge" +} +item { + name: "/m/02vqfm" + id: 452 + display_name: "Coffee" +} +item { + name: "/m/06__v" + id: 453 + display_name: "Snowboard" +} +item { + name: "/m/043nyj" + id: 454 + display_name: "Common fig" +} +item { + name: "/m/0grw1" + id: 455 + display_name: "Salad" +} +item { + name: "/m/03hl4l9" + id: 456 + display_name: "Marine invertebrates" +} +item { + name: "/m/0hnnb" + id: 457 + display_name: "Umbrella" +} +item { + name: "/m/04c0y" + id: 458 + display_name: "Kangaroo" +} +item { + name: "/m/0dzf4" + id: 459 + display_name: "Human arm" +} +item { + name: "/m/07v9_z" + id: 460 + display_name: "Measuring cup" +} +item { + name: "/m/0f9_l" + id: 461 + display_name: "Snail" +} +item { + name: "/m/0703r8" + id: 462 + display_name: "Loveseat" +} +item { + name: "/m/01xyhv" + id: 463 + display_name: "Suit" +} +item { + name: "/m/01fh4r" + id: 464 + display_name: "Teapot" +} +item { + name: "/m/04dr76w" + id: 465 + display_name: "Bottle" +} +item { + name: "/m/0pcr" + id: 466 + display_name: "Alpaca" +} +item { + name: "/m/03s_tn" + id: 467 + display_name: "Kettle" +} +item { + name: "/m/07mhn" + id: 468 + display_name: "Trousers" +} +item { + name: "/m/01hrv5" + id: 469 + display_name: "Popcorn" +} +item { + name: "/m/019h78" + id: 470 + display_name: "Centipede" +} +item { + name: "/m/09kmb" + id: 471 + display_name: "Spider" +} +item { + name: "/m/0h23m" + id: 472 + display_name: "Sparrow" +} +item { + name: "/m/050gv4" + id: 473 + display_name: "Plate" +} +item { + name: "/m/01fb_0" + id: 474 + display_name: "Bagel" +} +item { + name: "/m/02w3_ws" + id: 475 + display_name: "Personal care" +} +item { + name: "/m/014j1m" + id: 476 + display_name: "Apple" +} +item { + name: "/m/01gmv2" + id: 477 + display_name: "Brassiere" +} +item { + name: "/m/04y4h8h" + id: 478 + display_name: "Bathroom cabinet" +} +item { + name: "/m/026qbn5" + id: 479 + display_name: "Studio couch" +} +item { + name: "/m/01m2v" + id: 480 + display_name: "Computer keyboard" +} +item { + name: "/m/05_5p_0" + id: 481 + display_name: "Table tennis racket" +} +item { + name: "/m/07030" + id: 482 + display_name: "Sushi" +} +item { + name: "/m/01s105" + id: 483 + display_name: "Cabinetry" +} +item { + name: "/m/033rq4" + id: 484 + display_name: "Street light" +} +item { + name: "/m/0162_1" + id: 485 + display_name: "Towel" +} +item { + name: "/m/02z51p" + id: 486 + display_name: "Nightstand" +} +item { + name: "/m/06mf6" + id: 487 + display_name: "Rabbit" +} +item { + name: "/m/02hj4" + id: 488 + display_name: "Dolphin" +} +item { + name: "/m/0bt9lr" + id: 489 + display_name: "Dog" +} +item { + name: "/m/08hvt4" + id: 490 + display_name: "Jug" +} +item { + name: "/m/084rd" + id: 491 + display_name: "Wok" +} +item { + name: "/m/01pns0" + id: 492 + display_name: "Fire hydrant" +} +item { + name: "/m/014sv8" + id: 493 + display_name: "Human eye" +} +item { + name: "/m/079cl" + id: 
494 + display_name: "Skyscraper" +} +item { + name: "/m/01940j" + id: 495 + display_name: "Backpack" +} +item { + name: "/m/05vtc" + id: 496 + display_name: "Potato" +} +item { + name: "/m/02w3r3" + id: 497 + display_name: "Paper towel" +} +item { + name: "/m/054xkw" + id: 498 + display_name: "Lifejacket" +} +item { + name: "/m/01bqk0" + id: 499 + display_name: "Bicycle wheel" +} +item { + name: "/m/09g1w" + id: 500 + display_name: "Toilet" +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/data/pascal_label_map.pbtxt b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/pascal_label_map.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..c9e9e2affcd73ae5cb272a51b44306a74cf22eea --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/pascal_label_map.pbtxt @@ -0,0 +1,99 @@ +item { + id: 1 + name: 'aeroplane' +} + +item { + id: 2 + name: 'bicycle' +} + +item { + id: 3 + name: 'bird' +} + +item { + id: 4 + name: 'boat' +} + +item { + id: 5 + name: 'bottle' +} + +item { + id: 6 + name: 'bus' +} + +item { + id: 7 + name: 'car' +} + +item { + id: 8 + name: 'cat' +} + +item { + id: 9 + name: 'chair' +} + +item { + id: 10 + name: 'cow' +} + +item { + id: 11 + name: 'diningtable' +} + +item { + id: 12 + name: 'dog' +} + +item { + id: 13 + name: 'horse' +} + +item { + id: 14 + name: 'motorbike' +} + +item { + id: 15 + name: 'person' +} + +item { + id: 16 + name: 'pottedplant' +} + +item { + id: 17 + name: 'sheep' +} + +item { + id: 18 + name: 'sofa' +} + +item { + id: 19 + name: 'train' +} + +item { + id: 20 + name: 'tvmonitor' +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/data/pet_label_map.pbtxt b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/pet_label_map.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..54d7d3518941ceb0d2dc3465bdf702d4eaac3f07 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/data/pet_label_map.pbtxt @@ -0,0 +1,184 @@ +item { + id: 1 + name: 'Abyssinian' +} + +item { + id: 2 + name: 'american_bulldog' +} + +item { + id: 3 + name: 'american_pit_bull_terrier' +} + +item { + id: 4 + name: 'basset_hound' +} + +item { + id: 5 + name: 'beagle' +} + +item { + id: 6 + name: 'Bengal' +} + +item { + id: 7 + name: 'Birman' +} + +item { + id: 8 + name: 'Bombay' +} + +item { + id: 9 + name: 'boxer' +} + +item { + id: 10 + name: 'British_Shorthair' +} + +item { + id: 11 + name: 'chihuahua' +} + +item { + id: 12 + name: 'Egyptian_Mau' +} + +item { + id: 13 + name: 'english_cocker_spaniel' +} + +item { + id: 14 + name: 'english_setter' +} + +item { + id: 15 + name: 'german_shorthaired' +} + +item { + id: 16 + name: 'great_pyrenees' +} + +item { + id: 17 + name: 'havanese' +} + +item { + id: 18 + name: 'japanese_chin' +} + +item { + id: 19 + name: 'keeshond' +} + +item { + id: 20 + name: 'leonberger' +} + +item { + id: 21 + name: 'Maine_Coon' +} + +item { + id: 22 + name: 'miniature_pinscher' +} + +item { + id: 23 + name: 'newfoundland' +} + +item { + id: 24 + name: 'Persian' +} + +item { + id: 25 + name: 'pomeranian' +} + +item { + id: 26 + name: 'pug' +} + +item { + id: 27 + name: 'Ragdoll' +} + +item { + id: 28 + name: 'Russian_Blue' +} + +item { + id: 29 + name: 'saint_bernard' +} + +item { + id: 30 + name: 'samoyed' +} + +item { + id: 31 + name: 'scottish_terrier' +} + +item { + id: 32 + name: 'shiba_inu' +} + +item { + id: 33 + name: 'Siamese' +} + +item { + id: 34 + name: 'Sphynx' +} + +item { + 
id: 35 + name: 'staffordshire_bull_terrier' +} + +item { + id: 36 + name: 'wheaten_terrier' +} + +item { + id: 37 + name: 'yorkshire_terrier' +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/data_decoders/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/data_decoders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/data_decoders/tf_example_decoder.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/data_decoders/tf_example_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..8480a14b4ade6497e57db505875ae0795b191063 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/data_decoders/tf_example_decoder.py @@ -0,0 +1,439 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tensorflow Example proto decoder for object detection. + +A decoder to decode string tensors containing serialized tensorflow.Example +protos for object detection. +""" +import tensorflow as tf + +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from object_detection.core import data_decoder +from object_detection.core import standard_fields as fields +from object_detection.protos import input_reader_pb2 +from object_detection.utils import label_map_util + +slim_example_decoder = tf.contrib.slim.tfexample_decoder + + +# TODO(lzc): keep LookupTensor and BackupHandler in sync with +# tf.contrib.slim.tfexample_decoder version. +class LookupTensor(slim_example_decoder.Tensor): + """An ItemHandler that returns a parsed Tensor, the result of a lookup.""" + + def __init__(self, + tensor_key, + table, + shape_keys=None, + shape=None, + default_value=''): + """Initializes the LookupTensor handler. + + Simply calls a vocabulary (most often, a label mapping) lookup. + + Args: + tensor_key: the name of the `TFExample` feature to read the tensor from. + table: A tf.lookup table. + shape_keys: Optional name or list of names of the TF-Example feature in + which the tensor shape is stored. If a list, then each corresponds to + one dimension of the shape. + shape: Optional output shape of the `Tensor`. If provided, the `Tensor` is + reshaped accordingly. + default_value: The value used when the `tensor_key` is not found in a + particular `TFExample`. + + Raises: + ValueError: if both `shape_keys` and `shape` are specified. 
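+
+    Example (illustrative sketch only; the feature key and the table contents
+    below are placeholders, not values required by this class):
+
+      # Maps class-text strings to integer ids; unknown strings map to -1.
+      table = tf.contrib.lookup.HashTable(
+          initializer=tf.contrib.lookup.KeyValueTensorInitializer(
+              keys=tf.constant(['cat', 'dog']),
+              values=tf.constant([1, 2], dtype=tf.int64)),
+          default_value=-1)
+      label_handler = LookupTensor('image/object/class/text', table,
+                                   default_value='')
+      # In TF 1.x graph mode the table must be initialized before use, e.g.
+      # by running tf.tables_initializer() in the session.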
+ """ + self._table = table + super(LookupTensor, self).__init__(tensor_key, shape_keys, shape, + default_value) + + def tensors_to_item(self, keys_to_tensors): + unmapped_tensor = super(LookupTensor, self).tensors_to_item(keys_to_tensors) + return self._table.lookup(unmapped_tensor) + + +class BackupHandler(slim_example_decoder.ItemHandler): + """An ItemHandler that tries two ItemHandlers in order.""" + + def __init__(self, handler, backup): + """Initializes the BackupHandler handler. + + If the first Handler's tensors_to_item returns a Tensor with no elements, + the second Handler is used. + + Args: + handler: The primary ItemHandler. + backup: The backup ItemHandler. + + Raises: + ValueError: if either is not an ItemHandler. + """ + if not isinstance(handler, slim_example_decoder.ItemHandler): + raise ValueError('Primary handler is of type %s instead of ItemHandler' % + type(handler)) + if not isinstance(backup, slim_example_decoder.ItemHandler): + raise ValueError( + 'Backup handler is of type %s instead of ItemHandler' % type(backup)) + self._handler = handler + self._backup = backup + super(BackupHandler, self).__init__(handler.keys + backup.keys) + + def tensors_to_item(self, keys_to_tensors): + item = self._handler.tensors_to_item(keys_to_tensors) + return control_flow_ops.cond( + pred=math_ops.equal(math_ops.reduce_prod(array_ops.shape(item)), 0), + true_fn=lambda: self._backup.tensors_to_item(keys_to_tensors), + false_fn=lambda: item) + + +class TfExampleDecoder(data_decoder.DataDecoder): + """Tensorflow Example proto decoder.""" + + def __init__(self, + load_instance_masks=False, + instance_mask_type=input_reader_pb2.NUMERICAL_MASKS, + label_map_proto_file=None, + use_display_name=False, + dct_method='', + num_keypoints=0, + num_additional_channels=0): + """Constructor sets keys_to_features and items_to_handlers. + + Args: + load_instance_masks: whether or not to load and handle instance masks. + instance_mask_type: type of instance masks. Options are provided in + input_reader.proto. This is only used if `load_instance_masks` is True. + label_map_proto_file: a file path to a + object_detection.protos.StringIntLabelMap proto. If provided, then the + mapped IDs of 'image/object/class/text' will take precedence over the + existing 'image/object/class/label' ID. Also, if provided, it is + assumed that 'image/object/class/text' will be in the data. + use_display_name: whether or not to use the `display_name` for label + mapping (instead of `name`). Only used if label_map_proto_file is + provided. + dct_method: An optional string. Defaults to None. It only takes + effect when image format is jpeg, used to specify a hint about the + algorithm used for jpeg decompression. Currently valid values + are ['INTEGER_FAST', 'INTEGER_ACCURATE']. The hint may be ignored, for + example, the jpeg library does not have that specific option. + num_keypoints: the number of keypoints per object. + num_additional_channels: how many additional channels to use. + + Raises: + ValueError: If `instance_mask_type` option is not one of + input_reader_pb2.DEFAULT, input_reader_pb2.NUMERICAL, or + input_reader_pb2.PNG_MASKS. 
+ """ + self.keys_to_features = { + 'image/encoded': + tf.FixedLenFeature((), tf.string, default_value=''), + 'image/format': + tf.FixedLenFeature((), tf.string, default_value='jpeg'), + 'image/filename': + tf.FixedLenFeature((), tf.string, default_value=''), + 'image/key/sha256': + tf.FixedLenFeature((), tf.string, default_value=''), + 'image/source_id': + tf.FixedLenFeature((), tf.string, default_value=''), + 'image/height': + tf.FixedLenFeature((), tf.int64, default_value=1), + 'image/width': + tf.FixedLenFeature((), tf.int64, default_value=1), + # Object boxes and classes. + 'image/object/bbox/xmin': + tf.VarLenFeature(tf.float32), + 'image/object/bbox/xmax': + tf.VarLenFeature(tf.float32), + 'image/object/bbox/ymin': + tf.VarLenFeature(tf.float32), + 'image/object/bbox/ymax': + tf.VarLenFeature(tf.float32), + 'image/object/class/label': + tf.VarLenFeature(tf.int64), + 'image/object/class/text': + tf.VarLenFeature(tf.string), + 'image/object/area': + tf.VarLenFeature(tf.float32), + 'image/object/is_crowd': + tf.VarLenFeature(tf.int64), + 'image/object/difficult': + tf.VarLenFeature(tf.int64), + 'image/object/group_of': + tf.VarLenFeature(tf.int64), + 'image/object/weight': + tf.VarLenFeature(tf.float32), + } + # We are checking `dct_method` instead of passing it directly in order to + # ensure TF version 1.6 compatibility. + if dct_method: + image = slim_example_decoder.Image( + image_key='image/encoded', + format_key='image/format', + channels=3, + dct_method=dct_method) + additional_channel_image = slim_example_decoder.Image( + image_key='image/additional_channels/encoded', + format_key='image/format', + channels=1, + repeated=True, + dct_method=dct_method) + else: + image = slim_example_decoder.Image( + image_key='image/encoded', format_key='image/format', channels=3) + additional_channel_image = slim_example_decoder.Image( + image_key='image/additional_channels/encoded', + format_key='image/format', + channels=1, + repeated=True) + self.items_to_handlers = { + fields.InputDataFields.image: + image, + fields.InputDataFields.source_id: ( + slim_example_decoder.Tensor('image/source_id')), + fields.InputDataFields.key: ( + slim_example_decoder.Tensor('image/key/sha256')), + fields.InputDataFields.filename: ( + slim_example_decoder.Tensor('image/filename')), + # Object boxes and classes. + fields.InputDataFields.groundtruth_boxes: ( + slim_example_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'], + 'image/object/bbox/')), + fields.InputDataFields.groundtruth_area: + slim_example_decoder.Tensor('image/object/area'), + fields.InputDataFields.groundtruth_is_crowd: ( + slim_example_decoder.Tensor('image/object/is_crowd')), + fields.InputDataFields.groundtruth_difficult: ( + slim_example_decoder.Tensor('image/object/difficult')), + fields.InputDataFields.groundtruth_group_of: ( + slim_example_decoder.Tensor('image/object/group_of')), + fields.InputDataFields.groundtruth_weights: ( + slim_example_decoder.Tensor('image/object/weight')), + } + if num_additional_channels > 0: + self.keys_to_features[ + 'image/additional_channels/encoded'] = tf.FixedLenFeature( + (num_additional_channels,), tf.string) + self.items_to_handlers[ + fields.InputDataFields. 
+ image_additional_channels] = additional_channel_image + self._num_keypoints = num_keypoints + if num_keypoints > 0: + self.keys_to_features['image/object/keypoint/x'] = ( + tf.VarLenFeature(tf.float32)) + self.keys_to_features['image/object/keypoint/y'] = ( + tf.VarLenFeature(tf.float32)) + self.items_to_handlers[fields.InputDataFields.groundtruth_keypoints] = ( + slim_example_decoder.ItemHandlerCallback( + ['image/object/keypoint/y', 'image/object/keypoint/x'], + self._reshape_keypoints)) + if load_instance_masks: + if instance_mask_type in (input_reader_pb2.DEFAULT, + input_reader_pb2.NUMERICAL_MASKS): + self.keys_to_features['image/object/mask'] = ( + tf.VarLenFeature(tf.float32)) + self.items_to_handlers[ + fields.InputDataFields.groundtruth_instance_masks] = ( + slim_example_decoder.ItemHandlerCallback( + ['image/object/mask', 'image/height', 'image/width'], + self._reshape_instance_masks)) + elif instance_mask_type == input_reader_pb2.PNG_MASKS: + self.keys_to_features['image/object/mask'] = tf.VarLenFeature(tf.string) + self.items_to_handlers[ + fields.InputDataFields.groundtruth_instance_masks] = ( + slim_example_decoder.ItemHandlerCallback( + ['image/object/mask', 'image/height', 'image/width'], + self._decode_png_instance_masks)) + else: + raise ValueError('Did not recognize the `instance_mask_type` option.') + if label_map_proto_file: + label_map = label_map_util.get_label_map_dict(label_map_proto_file, + use_display_name) + # We use a default_value of -1, but we expect all labels to be contained + # in the label map. + table = tf.contrib.lookup.HashTable( + initializer=tf.contrib.lookup.KeyValueTensorInitializer( + keys=tf.constant(list(label_map.keys())), + values=tf.constant(list(label_map.values()), dtype=tf.int64)), + default_value=-1) + # If the label_map_proto is provided, try to use it in conjunction with + # the class text, and fall back to a materialized ID. + # TODO(lzc): note that here we are using BackupHandler defined in this + # file(which is branching slim_example_decoder.BackupHandler). Need to + # switch back to slim_example_decoder.BackupHandler once tf 1.5 becomes + # more popular. + label_handler = BackupHandler( + LookupTensor('image/object/class/text', table, default_value=''), + slim_example_decoder.Tensor('image/object/class/label')) + else: + label_handler = slim_example_decoder.Tensor('image/object/class/label') + self.items_to_handlers[ + fields.InputDataFields.groundtruth_classes] = label_handler + + def decode(self, tf_example_string_tensor): + """Decodes serialized tensorflow example and returns a tensor dictionary. + + Args: + tf_example_string_tensor: a string tensor holding a serialized tensorflow + example proto. + + Returns: + A dictionary of the following tensors. + fields.InputDataFields.image - 3D uint8 tensor of shape [None, None, 3] + containing image. + fields.InputDataFields.source_id - string tensor containing original + image id. + fields.InputDataFields.key - string tensor with unique sha256 hash key. + fields.InputDataFields.filename - string tensor with original dataset + filename. + fields.InputDataFields.groundtruth_boxes - 2D float32 tensor of shape + [None, 4] containing box corners. + fields.InputDataFields.groundtruth_classes - 1D int64 tensor of shape + [None] containing classes for the boxes. + fields.InputDataFields.groundtruth_weights - 1D float32 tensor of + shape [None] indicating the weights of groundtruth boxes. 
+ fields.InputDataFields.num_groundtruth_boxes - int32 scalar indicating + the number of groundtruth_boxes. + fields.InputDataFields.groundtruth_area - 1D float32 tensor of shape + [None] containing object mask area in pixels squared. + fields.InputDataFields.groundtruth_is_crowd - 1D bool tensor of shape + [None] indicating if the boxes enclose a crowd. + + Optional: + fields.InputDataFields.image_additional_channels - 3D uint8 tensor of + shape [None, None, num_additional_channels]. 1st dim is height; 2nd dim + is width; 3rd dim is the number of additional channels. + fields.InputDataFields.groundtruth_difficult - 1D bool tensor of shape + [None] indicating if the boxes represent `difficult` instances. + fields.InputDataFields.groundtruth_group_of - 1D bool tensor of shape + [None] indicating if the boxes represent `group_of` instances. + fields.InputDataFields.groundtruth_keypoints - 3D float32 tensor of + shape [None, None, 2] containing keypoints, where the coordinates of + the keypoints are ordered (y, x). + fields.InputDataFields.groundtruth_instance_masks - 3D float32 tensor of + shape [None, None, None] containing instance masks. + """ + serialized_example = tf.reshape(tf_example_string_tensor, shape=[]) + decoder = slim_example_decoder.TFExampleDecoder(self.keys_to_features, + self.items_to_handlers) + keys = decoder.list_items() + tensors = decoder.decode(serialized_example, items=keys) + tensor_dict = dict(zip(keys, tensors)) + is_crowd = fields.InputDataFields.groundtruth_is_crowd + tensor_dict[is_crowd] = tf.cast(tensor_dict[is_crowd], dtype=tf.bool) + tensor_dict[fields.InputDataFields.image].set_shape([None, None, 3]) + tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = tf.shape( + tensor_dict[fields.InputDataFields.groundtruth_boxes])[0] + + if fields.InputDataFields.image_additional_channels in tensor_dict: + channels = tensor_dict[fields.InputDataFields.image_additional_channels] + channels = tf.squeeze(channels, axis=3) + channels = tf.transpose(channels, perm=[1, 2, 0]) + tensor_dict[fields.InputDataFields.image_additional_channels] = channels + + def default_groundtruth_weights(): + return tf.ones( + [tf.shape(tensor_dict[fields.InputDataFields.groundtruth_boxes])[0]], + dtype=tf.float32) + + tensor_dict[fields.InputDataFields.groundtruth_weights] = tf.cond( + tf.greater( + tf.shape( + tensor_dict[fields.InputDataFields.groundtruth_weights])[0], + 0), lambda: tensor_dict[fields.InputDataFields.groundtruth_weights], + default_groundtruth_weights) + return tensor_dict + + def _reshape_keypoints(self, keys_to_tensors): + """Reshape keypoints. + + The keypoints are reshaped to [num_instances, + num_keypoints, 2]. + + Args: + keys_to_tensors: a dictionary from keys to tensors. + + Returns: + A 3-D float tensor of shape [num_instances, num_keypoints, 2] containing + keypoint coordinates, ordered (y, x). + """ + y = keys_to_tensors['image/object/keypoint/y'] + if isinstance(y, tf.SparseTensor): + y = tf.sparse_tensor_to_dense(y) + y = tf.expand_dims(y, 1) + x = keys_to_tensors['image/object/keypoint/x'] + if isinstance(x, tf.SparseTensor): + x = tf.sparse_tensor_to_dense(x) + x = tf.expand_dims(x, 1) + keypoints = tf.concat([y, x], 1) + keypoints = tf.reshape(keypoints, [-1, self._num_keypoints, 2]) + return keypoints + + def _reshape_instance_masks(self, keys_to_tensors): + """Reshape instance segmentation masks. + + The instance segmentation masks are reshaped to [num_instances, height, + width]. + + Args: + keys_to_tensors: a dictionary from keys to tensors.
+ + Returns: + A 3-D float tensor of shape [num_instances, height, width] with values + in {0, 1}. + """ + height = keys_to_tensors['image/height'] + width = keys_to_tensors['image/width'] + to_shape = tf.cast(tf.stack([-1, height, width]), tf.int32) + masks = keys_to_tensors['image/object/mask'] + if isinstance(masks, tf.SparseTensor): + masks = tf.sparse_tensor_to_dense(masks) + masks = tf.reshape(tf.to_float(tf.greater(masks, 0.0)), to_shape) + return tf.cast(masks, tf.float32) + + def _decode_png_instance_masks(self, keys_to_tensors): + """Decode PNG instance segmentation masks and stack into dense tensor. + + The instance segmentation masks are reshaped to [num_instances, height, + width]. + + Args: + keys_to_tensors: a dictionary from keys to tensors. + + Returns: + A 3-D float tensor of shape [num_instances, height, width] with values + in {0, 1}. + """ + + def decode_png_mask(image_buffer): + image = tf.squeeze( + tf.image.decode_image(image_buffer, channels=1), axis=2) + image.set_shape([None, None]) + image = tf.to_float(tf.greater(image, 0)) + return image + + png_masks = keys_to_tensors['image/object/mask'] + height = keys_to_tensors['image/height'] + width = keys_to_tensors['image/width'] + if isinstance(png_masks, tf.SparseTensor): + png_masks = tf.sparse_tensor_to_dense(png_masks, default_value='') + return tf.cond( + tf.greater(tf.size(png_masks), 0), + lambda: tf.map_fn(decode_png_mask, png_masks, dtype=tf.float32), + lambda: tf.zeros(tf.to_int32(tf.stack([0, height, width])))) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/data_decoders/tf_example_decoder_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/data_decoders/tf_example_decoder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b567b8c20f442f135653b49ace7e85088fd67ad1 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/data_decoders/tf_example_decoder_test.py @@ -0,0 +1,767 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for object_detection.data_decoders.tf_example_decoder.""" + +import os +import numpy as np +import tensorflow as tf + +from tensorflow.core.example import example_pb2 +from tensorflow.core.example import feature_pb2 +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import lookup_ops +from tensorflow.python.ops import parsing_ops +from object_detection.core import standard_fields as fields +from object_detection.data_decoders import tf_example_decoder +from object_detection.protos import input_reader_pb2 + +slim_example_decoder = tf.contrib.slim.tfexample_decoder + + +class TfExampleDecoderTest(tf.test.TestCase): + + def _EncodeImage(self, image_tensor, encoding_type='jpeg'): + with self.test_session(): + if encoding_type == 'jpeg': + image_encoded = tf.image.encode_jpeg(tf.constant(image_tensor)).eval() + elif encoding_type == 'png': + image_encoded = tf.image.encode_png(tf.constant(image_tensor)).eval() + else: + raise ValueError('Invalid encoding type.') + return image_encoded + + def _DecodeImage(self, image_encoded, encoding_type='jpeg'): + with self.test_session(): + if encoding_type == 'jpeg': + image_decoded = tf.image.decode_jpeg(tf.constant(image_encoded)).eval() + elif encoding_type == 'png': + image_decoded = tf.image.decode_png(tf.constant(image_encoded)).eval() + else: + raise ValueError('Invalid encoding type.') + return image_decoded + + def _Int64Feature(self, value): + return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) + + def _FloatFeature(self, value): + return tf.train.Feature(float_list=tf.train.FloatList(value=value)) + + def _BytesFeature(self, value): + if isinstance(value, list): + return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + + def _Int64FeatureFromList(self, ndarray): + return feature_pb2.Feature( + int64_list=feature_pb2.Int64List(value=ndarray.flatten().tolist())) + + def _BytesFeatureFromList(self, ndarray): + values = ndarray.flatten().tolist() + return feature_pb2.Feature(bytes_list=feature_pb2.BytesList(value=values)) + + def testDecodeAdditionalChannels(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + + additional_channel_tensor = np.random.randint( + 256, size=(4, 5, 1)).astype(np.uint8) + encoded_additional_channel = self._EncodeImage(additional_channel_tensor) + decoded_additional_channel = self._DecodeImage(encoded_additional_channel) + + example = tf.train.Example( + features=tf.train.Features( + feature={ + 'image/encoded': + self._BytesFeature(encoded_jpeg), + 'image/additional_channels/encoded': + self._BytesFeatureFromList( + np.array([encoded_additional_channel] * 2)), + 'image/format': + self._BytesFeature('jpeg'), + 'image/source_id': + self._BytesFeature('image_id'), + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder( + num_additional_channels=2) + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + self.assertAllEqual( + np.concatenate([decoded_additional_channel] * 2, axis=2), + tensor_dict[fields.InputDataFields.image_additional_channels]) + + def 
testDecodeExampleWithBranchedBackupHandler(self): + example1 = example_pb2.Example( + features=feature_pb2.Features( + feature={ + 'image/object/class/text': + self._BytesFeatureFromList( + np.array(['cat', 'dog', 'guinea pig'])), + 'image/object/class/label': + self._Int64FeatureFromList(np.array([42, 10, 900])) + })) + example2 = example_pb2.Example( + features=feature_pb2.Features( + feature={ + 'image/object/class/text': + self._BytesFeatureFromList( + np.array(['cat', 'dog', 'guinea pig'])), + })) + example3 = example_pb2.Example( + features=feature_pb2.Features( + feature={ + 'image/object/class/label': + self._Int64FeatureFromList(np.array([42, 10, 901])) + })) + # 'dog' -> 0, 'guinea pig' -> 1, 'cat' -> 2 + table = lookup_ops.index_table_from_tensor( + constant_op.constant(['dog', 'guinea pig', 'cat'])) + keys_to_features = { + 'image/object/class/text': parsing_ops.VarLenFeature(dtypes.string), + 'image/object/class/label': parsing_ops.VarLenFeature(dtypes.int64), + } + backup_handler = tf_example_decoder.BackupHandler( + handler=slim_example_decoder.Tensor('image/object/class/label'), + backup=tf_example_decoder.LookupTensor('image/object/class/text', + table)) + items_to_handlers = { + 'labels': backup_handler, + } + decoder = slim_example_decoder.TFExampleDecoder(keys_to_features, + items_to_handlers) + obtained_class_ids_each_example = [] + with self.test_session() as sess: + sess.run(lookup_ops.tables_initializer()) + for example in [example1, example2, example3]: + serialized_example = array_ops.reshape( + example.SerializeToString(), shape=[]) + obtained_class_ids_each_example.append( + decoder.decode(serialized_example)[0].eval()) + + self.assertAllClose([42, 10, 900], obtained_class_ids_each_example[0]) + self.assertAllClose([2, 0, 1], obtained_class_ids_each_example[1]) + self.assertAllClose([42, 10, 901], obtained_class_ids_each_example[2]) + + def testDecodeExampleWithBranchedLookup(self): + + example = example_pb2.Example(features=feature_pb2.Features(feature={ + 'image/object/class/text': self._BytesFeatureFromList( + np.array(['cat', 'dog', 'guinea pig'])), + })) + serialized_example = example.SerializeToString() + # 'dog' -> 0, 'guinea pig' -> 1, 'cat' -> 2 + table = lookup_ops.index_table_from_tensor( + constant_op.constant(['dog', 'guinea pig', 'cat'])) + + with self.test_session() as sess: + sess.run(lookup_ops.tables_initializer()) + + serialized_example = array_ops.reshape(serialized_example, shape=[]) + + keys_to_features = { + 'image/object/class/text': parsing_ops.VarLenFeature(dtypes.string), + } + + items_to_handlers = { + 'labels': + tf_example_decoder.LookupTensor('image/object/class/text', table), + } + + decoder = slim_example_decoder.TFExampleDecoder(keys_to_features, + items_to_handlers) + obtained_class_ids = decoder.decode(serialized_example)[0].eval() + + self.assertAllClose([2, 0, 1], obtained_class_ids) + + def testDecodeJpegImage(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + decoded_jpeg = self._DecodeImage(encoded_jpeg) + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/source_id': self._BytesFeature('image_id'), + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder() + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[fields.InputDataFields.image]. 
+ get_shape().as_list()), [None, None, 3]) + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual(decoded_jpeg, tensor_dict[fields.InputDataFields.image]) + self.assertEqual('image_id', tensor_dict[fields.InputDataFields.source_id]) + + def testDecodeImageKeyAndFilename(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/key/sha256': self._BytesFeature('abc'), + 'image/filename': self._BytesFeature('filename') + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder() + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + self.assertEqual('abc', tensor_dict[fields.InputDataFields.key]) + self.assertEqual('filename', tensor_dict[fields.InputDataFields.filename]) + + def testDecodePngImage(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_png = self._EncodeImage(image_tensor, encoding_type='png') + decoded_png = self._DecodeImage(encoded_png, encoding_type='png') + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_png), + 'image/format': self._BytesFeature('png'), + 'image/source_id': self._BytesFeature('image_id') + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder() + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[fields.InputDataFields.image]. + get_shape().as_list()), [None, None, 3]) + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual(decoded_png, tensor_dict[fields.InputDataFields.image]) + self.assertEqual('image_id', tensor_dict[fields.InputDataFields.source_id]) + + def testDecodePngInstanceMasks(self): + image_tensor = np.random.randint(256, size=(10, 10, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + mask_1 = np.random.randint(0, 2, size=(10, 10, 1)).astype(np.uint8) + mask_2 = np.random.randint(0, 2, size=(10, 10, 1)).astype(np.uint8) + encoded_png_1 = self._EncodeImage(mask_1, encoding_type='png') + decoded_png_1 = np.squeeze(mask_1.astype(np.float32)) + encoded_png_2 = self._EncodeImage(mask_2, encoding_type='png') + decoded_png_2 = np.squeeze(mask_2.astype(np.float32)) + encoded_masks = [encoded_png_1, encoded_png_2] + decoded_masks = np.stack([decoded_png_1, decoded_png_2]) + example = tf.train.Example( + features=tf.train.Features( + feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/object/mask': self._BytesFeature(encoded_masks) + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder( + load_instance_masks=True, instance_mask_type=input_reader_pb2.PNG_MASKS) + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual( + decoded_masks, + tensor_dict[fields.InputDataFields.groundtruth_instance_masks]) + + def testDecodeEmptyPngInstanceMasks(self): + image_tensor = np.random.randint(256, size=(10, 10, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + encoded_masks = [] + example = tf.train.Example( + features=tf.train.Features( + 
feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/object/mask': self._BytesFeature(encoded_masks), + 'image/height': self._Int64Feature([10]), + 'image/width': self._Int64Feature([10]), + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder( + load_instance_masks=True, instance_mask_type=input_reader_pb2.PNG_MASKS) + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + self.assertAllEqual( + tensor_dict[fields.InputDataFields.groundtruth_instance_masks].shape, + [0, 10, 10]) + + def testDecodeBoundingBox(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + bbox_ymins = [0.0, 4.0] + bbox_xmins = [1.0, 5.0] + bbox_ymaxs = [2.0, 6.0] + bbox_xmaxs = [3.0, 7.0] + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/object/bbox/ymin': self._FloatFeature(bbox_ymins), + 'image/object/bbox/xmin': self._FloatFeature(bbox_xmins), + 'image/object/bbox/ymax': self._FloatFeature(bbox_ymaxs), + 'image/object/bbox/xmax': self._FloatFeature(bbox_xmaxs), + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder() + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[fields.InputDataFields.groundtruth_boxes]. + get_shape().as_list()), [None, 4]) + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + expected_boxes = np.vstack([bbox_ymins, bbox_xmins, + bbox_ymaxs, bbox_xmaxs]).transpose() + self.assertAllEqual(expected_boxes, + tensor_dict[fields.InputDataFields.groundtruth_boxes]) + self.assertAllEqual( + 2, tensor_dict[fields.InputDataFields.num_groundtruth_boxes]) + + @test_util.enable_c_shapes + def testDecodeKeypoint(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + bbox_ymins = [0.0, 4.0] + bbox_xmins = [1.0, 5.0] + bbox_ymaxs = [2.0, 6.0] + bbox_xmaxs = [3.0, 7.0] + keypoint_ys = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + keypoint_xs = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/object/bbox/ymin': self._FloatFeature(bbox_ymins), + 'image/object/bbox/xmin': self._FloatFeature(bbox_xmins), + 'image/object/bbox/ymax': self._FloatFeature(bbox_ymaxs), + 'image/object/bbox/xmax': self._FloatFeature(bbox_xmaxs), + 'image/object/keypoint/y': self._FloatFeature(keypoint_ys), + 'image/object/keypoint/x': self._FloatFeature(keypoint_xs), + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder(num_keypoints=3) + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[fields.InputDataFields.groundtruth_boxes]. + get_shape().as_list()), [None, 4]) + self.assertAllEqual((tensor_dict[fields.InputDataFields. + groundtruth_keypoints]. 
+ get_shape().as_list()), [2, 3, 2]) + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + expected_boxes = np.vstack([bbox_ymins, bbox_xmins, + bbox_ymaxs, bbox_xmaxs]).transpose() + self.assertAllEqual(expected_boxes, + tensor_dict[fields.InputDataFields.groundtruth_boxes]) + self.assertAllEqual( + 2, tensor_dict[fields.InputDataFields.num_groundtruth_boxes]) + + expected_keypoints = ( + np.vstack([keypoint_ys, keypoint_xs]).transpose().reshape((2, 3, 2))) + self.assertAllEqual(expected_keypoints, + tensor_dict[ + fields.InputDataFields.groundtruth_keypoints]) + + def testDecodeDefaultGroundtruthWeights(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + bbox_ymins = [0.0, 4.0] + bbox_xmins = [1.0, 5.0] + bbox_ymaxs = [2.0, 6.0] + bbox_xmaxs = [3.0, 7.0] + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/object/bbox/ymin': self._FloatFeature(bbox_ymins), + 'image/object/bbox/xmin': self._FloatFeature(bbox_xmins), + 'image/object/bbox/ymax': self._FloatFeature(bbox_ymaxs), + 'image/object/bbox/xmax': self._FloatFeature(bbox_xmaxs), + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder() + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[fields.InputDataFields.groundtruth_boxes]. + get_shape().as_list()), [None, 4]) + + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + self.assertAllClose(tensor_dict[fields.InputDataFields.groundtruth_weights], + np.ones(2, dtype=np.float32)) + + @test_util.enable_c_shapes + def testDecodeObjectLabel(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + bbox_classes = [0, 1] + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/object/class/label': self._Int64Feature(bbox_classes), + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder() + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[ + fields.InputDataFields.groundtruth_classes].get_shape().as_list()), + [2]) + + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual(bbox_classes, + tensor_dict[fields.InputDataFields.groundtruth_classes]) + + def testDecodeObjectLabelNoText(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + bbox_classes = [1, 2] + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/object/class/label': self._Int64Feature(bbox_classes), + })).SerializeToString() + label_map_string = """ + item { + id:1 + name:'cat' + } + item { + id:2 + name:'dog' + } + """ + label_map_path = os.path.join(self.get_temp_dir(), 'label_map.pbtxt') + with tf.gfile.Open(label_map_path, 'wb') as f: + f.write(label_map_string) + + example_decoder = tf_example_decoder.TfExampleDecoder( + label_map_proto_file=label_map_path) + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[ + 
fields.InputDataFields.groundtruth_classes].get_shape().as_list()), + [None]) + + init = tf.tables_initializer() + with self.test_session() as sess: + sess.run(init) + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual(bbox_classes, + tensor_dict[fields.InputDataFields.groundtruth_classes]) + + def testDecodeObjectLabelUnrecognizedName(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + bbox_classes_text = ['cat', 'cheetah'] + example = tf.train.Example( + features=tf.train.Features( + feature={ + 'image/encoded': + self._BytesFeature(encoded_jpeg), + 'image/format': + self._BytesFeature('jpeg'), + 'image/object/class/text': + self._BytesFeature(bbox_classes_text), + })).SerializeToString() + + label_map_string = """ + item { + id:2 + name:'cat' + } + item { + id:1 + name:'dog' + } + """ + label_map_path = os.path.join(self.get_temp_dir(), 'label_map.pbtxt') + with tf.gfile.Open(label_map_path, 'wb') as f: + f.write(label_map_string) + example_decoder = tf_example_decoder.TfExampleDecoder( + label_map_proto_file=label_map_path) + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[fields.InputDataFields.groundtruth_classes] + .get_shape().as_list()), [None]) + + with self.test_session() as sess: + sess.run(tf.tables_initializer()) + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual([2, -1], + tensor_dict[fields.InputDataFields.groundtruth_classes]) + + def testDecodeObjectLabelWithMapping(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + bbox_classes_text = ['cat', 'dog'] + example = tf.train.Example( + features=tf.train.Features( + feature={ + 'image/encoded': + self._BytesFeature(encoded_jpeg), + 'image/format': + self._BytesFeature('jpeg'), + 'image/object/class/text': + self._BytesFeature(bbox_classes_text), + })).SerializeToString() + + label_map_string = """ + item { + id:3 + name:'cat' + } + item { + id:1 + name:'dog' + } + """ + label_map_path = os.path.join(self.get_temp_dir(), 'label_map.pbtxt') + with tf.gfile.Open(label_map_path, 'wb') as f: + f.write(label_map_string) + example_decoder = tf_example_decoder.TfExampleDecoder( + label_map_proto_file=label_map_path) + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[fields.InputDataFields.groundtruth_classes] + .get_shape().as_list()), [None]) + + with self.test_session() as sess: + sess.run(tf.tables_initializer()) + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual([3, 1], + tensor_dict[fields.InputDataFields.groundtruth_classes]) + + @test_util.enable_c_shapes + def testDecodeObjectArea(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + object_area = [100., 174.] + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/object/area': self._FloatFeature(object_area), + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder() + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[fields.InputDataFields.groundtruth_area]. 
+ get_shape().as_list()), [2]) + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual(object_area, + tensor_dict[fields.InputDataFields.groundtruth_area]) + + @test_util.enable_c_shapes + def testDecodeObjectIsCrowd(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + object_is_crowd = [0, 1] + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/object/is_crowd': self._Int64Feature(object_is_crowd), + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder() + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[ + fields.InputDataFields.groundtruth_is_crowd].get_shape().as_list()), + [2]) + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual([bool(item) for item in object_is_crowd], + tensor_dict[ + fields.InputDataFields.groundtruth_is_crowd]) + + @test_util.enable_c_shapes + def testDecodeObjectDifficult(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + object_difficult = [0, 1] + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/object/difficult': self._Int64Feature(object_difficult), + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder() + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[ + fields.InputDataFields.groundtruth_difficult].get_shape().as_list()), + [2]) + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual([bool(item) for item in object_difficult], + tensor_dict[ + fields.InputDataFields.groundtruth_difficult]) + + @test_util.enable_c_shapes + def testDecodeObjectGroupOf(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + object_group_of = [0, 1] + example = tf.train.Example(features=tf.train.Features( + feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/object/group_of': self._Int64Feature(object_group_of), + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder() + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual((tensor_dict[ + fields.InputDataFields.groundtruth_group_of].get_shape().as_list()), + [2]) + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual( + [bool(item) for item in object_group_of], + tensor_dict[fields.InputDataFields.groundtruth_group_of]) + + def testDecodeObjectWeight(self): + image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + object_weights = [0.75, 1.0] + example = tf.train.Example(features=tf.train.Features( + feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/object/weight': self._FloatFeature(object_weights), + })).SerializeToString() + + example_decoder = tf_example_decoder.TfExampleDecoder() + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + 
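These tests run the decoder on one hand-built serialized example at a time; in an input pipeline the same decoder is typically mapped over a whole file of serialized records. A minimal sketch, assuming the imports at the top of this test file and a placeholder record path:

def decode_record_file_sketch(record_path='path/to/train.record'):
  # Placeholder path; any TFRecord written by the dataset_tools scripts works.
  decoder = tf_example_decoder.TfExampleDecoder(
      load_instance_masks=True,
      instance_mask_type=input_reader_pb2.PNG_MASKS)
  dataset = tf.data.TFRecordDataset([record_path])
  # Each element becomes the same tensor_dict these assertions inspect
  # (image, source_id, groundtruth_boxes, groundtruth_classes, masks, ...).
  return dataset.map(decoder.decode)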
self.assertAllEqual((tensor_dict[ + fields.InputDataFields.groundtruth_weights].get_shape().as_list()), + [None]) + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual( + object_weights, + tensor_dict[fields.InputDataFields.groundtruth_weights]) + + @test_util.enable_c_shapes + def testDecodeInstanceSegmentation(self): + num_instances = 4 + image_height = 5 + image_width = 3 + + # Randomly generate image. + image_tensor = np.random.randint(256, size=(image_height, + image_width, + 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + + # Randomly generate instance segmentation masks. + instance_masks = ( + np.random.randint(2, size=(num_instances, + image_height, + image_width)).astype(np.float32)) + instance_masks_flattened = np.reshape(instance_masks, [-1]) + + # Randomly generate class labels for each instance. + object_classes = np.random.randint( + 100, size=(num_instances)).astype(np.int64) + + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/height': self._Int64Feature([image_height]), + 'image/width': self._Int64Feature([image_width]), + 'image/object/mask': self._FloatFeature(instance_masks_flattened), + 'image/object/class/label': self._Int64Feature( + object_classes)})).SerializeToString() + example_decoder = tf_example_decoder.TfExampleDecoder( + load_instance_masks=True) + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + + self.assertAllEqual(( + tensor_dict[fields.InputDataFields.groundtruth_instance_masks]. + get_shape().as_list()), [4, 5, 3]) + + self.assertAllEqual(( + tensor_dict[fields.InputDataFields.groundtruth_classes]. + get_shape().as_list()), [4]) + + with self.test_session() as sess: + tensor_dict = sess.run(tensor_dict) + + self.assertAllEqual( + instance_masks.astype(np.float32), + tensor_dict[fields.InputDataFields.groundtruth_instance_masks]) + self.assertAllEqual( + object_classes, + tensor_dict[fields.InputDataFields.groundtruth_classes]) + + def testInstancesNotAvailableByDefault(self): + num_instances = 4 + image_height = 5 + image_width = 3 + # Randomly generate image. + image_tensor = np.random.randint(256, size=(image_height, + image_width, + 3)).astype(np.uint8) + encoded_jpeg = self._EncodeImage(image_tensor) + + # Randomly generate instance segmentation masks. + instance_masks = ( + np.random.randint(2, size=(num_instances, + image_height, + image_width)).astype(np.float32)) + instance_masks_flattened = np.reshape(instance_masks, [-1]) + + # Randomly generate class labels for each instance. 
+ object_classes = np.random.randint( + 100, size=(num_instances)).astype(np.int64) + + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': self._BytesFeature(encoded_jpeg), + 'image/format': self._BytesFeature('jpeg'), + 'image/height': self._Int64Feature([image_height]), + 'image/width': self._Int64Feature([image_width]), + 'image/object/mask': self._FloatFeature(instance_masks_flattened), + 'image/object/class/label': self._Int64Feature( + object_classes)})).SerializeToString() + example_decoder = tf_example_decoder.TfExampleDecoder() + tensor_dict = example_decoder.decode(tf.convert_to_tensor(example)) + self.assertTrue(fields.InputDataFields.groundtruth_instance_masks + not in tensor_dict) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_coco_tf_record.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_coco_tf_record.py new file mode 100644 index 0000000000000000000000000000000000000000..9928443d805effb24b46f599929c4b7db73fb2c8 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_coco_tf_record.py @@ -0,0 +1,273 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +r"""Convert raw COCO dataset to TFRecord for object_detection. + +Example usage: + python create_coco_tf_record.py --logtostderr \ + --train_image_dir="${TRAIN_IMAGE_DIR}" \ + --val_image_dir="${VAL_IMAGE_DIR}" \ + --test_image_dir="${TEST_IMAGE_DIR}" \ + --train_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \ + --val_annotations_file="${VAL_ANNOTATIONS_FILE}" \ + --testdev_annotations_file="${TESTDEV_ANNOTATIONS_FILE}" \ + --output_dir="${OUTPUT_DIR}" +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import hashlib +import io +import json +import os +import numpy as np +import PIL.Image + +from pycocotools import mask +import tensorflow as tf + +from object_detection.utils import dataset_util +from object_detection.utils import label_map_util + + +flags = tf.app.flags +tf.flags.DEFINE_boolean('include_masks', False, + 'Whether to include instance segmentations masks ' + '(PNG encoded) in the result. 
default: False.') +tf.flags.DEFINE_string('train_image_dir', '', + 'Training image directory.') +tf.flags.DEFINE_string('val_image_dir', '', + 'Validation image directory.') +tf.flags.DEFINE_string('test_image_dir', '', + 'Test image directory.') +tf.flags.DEFINE_string('train_annotations_file', '', + 'Training annotations JSON file.') +tf.flags.DEFINE_string('val_annotations_file', '', + 'Validation annotations JSON file.') +tf.flags.DEFINE_string('testdev_annotations_file', '', + 'Test-dev annotations JSON file.') +tf.flags.DEFINE_string('output_dir', '/tmp/', 'Output data directory.') + +FLAGS = flags.FLAGS + +tf.logging.set_verbosity(tf.logging.INFO) + + +def create_tf_example(image, + annotations_list, + image_dir, + category_index, + include_masks=False): + """Converts image and annotations to a tf.Example proto. + + Args: + image: dict with keys: + [u'license', u'file_name', u'coco_url', u'height', u'width', + u'date_captured', u'flickr_url', u'id'] + annotations_list: + list of dicts with keys: + [u'segmentation', u'area', u'iscrowd', u'image_id', + u'bbox', u'category_id', u'id'] + Notice that bounding box coordinates in the official COCO dataset are + given as [x, y, width, height] tuples using absolute coordinates where + x, y represent the top-left (0-indexed) corner. This function converts + to the format expected by the Tensorflow Object Detection API (which is + which is [ymin, xmin, ymax, xmax] with coordinates normalized relative + to image size). + image_dir: directory containing the image files. + category_index: a dict containing COCO category information keyed + by the 'id' field of each category. See the + label_map_util.create_category_index function. + include_masks: Whether to include instance segmentations masks + (PNG encoded) in the result. default: False. + Returns: + example: The converted tf.Example + num_annotations_skipped: Number of (invalid) annotations that were ignored. 
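As a concrete illustration of the coordinate conversion described above (all numbers are made up):

# COCO stores absolute [x, y, width, height]; this function writes normalized corners.
image_width, image_height = 100, 200
x, y, width, height = 10., 20., 30., 40.
xmin, xmax = x / image_width, (x + width) / image_width        # 0.1, 0.4
ymin, ymax = y / image_height, (y + height) / image_height     # 0.1, 0.3
# Readers reassemble these as [ymin, xmin, ymax, xmax] = [0.1, 0.1, 0.3, 0.4].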
+ + Raises: + ValueError: if the image pointed to by data['filename'] is not a valid JPEG + """ + image_height = image['height'] + image_width = image['width'] + filename = image['file_name'] + image_id = image['id'] + + full_path = os.path.join(image_dir, filename) + with tf.gfile.GFile(full_path, 'rb') as fid: + encoded_jpg = fid.read() + encoded_jpg_io = io.BytesIO(encoded_jpg) + image = PIL.Image.open(encoded_jpg_io) + key = hashlib.sha256(encoded_jpg).hexdigest() + + xmin = [] + xmax = [] + ymin = [] + ymax = [] + is_crowd = [] + category_names = [] + category_ids = [] + area = [] + encoded_mask_png = [] + num_annotations_skipped = 0 + for object_annotations in annotations_list: + (x, y, width, height) = tuple(object_annotations['bbox']) + if width <= 0 or height <= 0: + num_annotations_skipped += 1 + continue + if x + width > image_width or y + height > image_height: + num_annotations_skipped += 1 + continue + xmin.append(float(x) / image_width) + xmax.append(float(x + width) / image_width) + ymin.append(float(y) / image_height) + ymax.append(float(y + height) / image_height) + is_crowd.append(object_annotations['iscrowd']) + category_id = int(object_annotations['category_id']) + category_ids.append(category_id) + category_names.append(category_index[category_id]['name'].encode('utf8')) + area.append(object_annotations['area']) + + if include_masks: + run_len_encoding = mask.frPyObjects(object_annotations['segmentation'], + image_height, image_width) + binary_mask = mask.decode(run_len_encoding) + if not object_annotations['iscrowd']: + binary_mask = np.amax(binary_mask, axis=2) + pil_image = PIL.Image.fromarray(binary_mask) + output_io = io.BytesIO() + pil_image.save(output_io, format='PNG') + encoded_mask_png.append(output_io.getvalue()) + feature_dict = { + 'image/height': + dataset_util.int64_feature(image_height), + 'image/width': + dataset_util.int64_feature(image_width), + 'image/filename': + dataset_util.bytes_feature(filename.encode('utf8')), + 'image/source_id': + dataset_util.bytes_feature(str(image_id).encode('utf8')), + 'image/key/sha256': + dataset_util.bytes_feature(key.encode('utf8')), + 'image/encoded': + dataset_util.bytes_feature(encoded_jpg), + 'image/format': + dataset_util.bytes_feature('jpeg'.encode('utf8')), + 'image/object/bbox/xmin': + dataset_util.float_list_feature(xmin), + 'image/object/bbox/xmax': + dataset_util.float_list_feature(xmax), + 'image/object/bbox/ymin': + dataset_util.float_list_feature(ymin), + 'image/object/bbox/ymax': + dataset_util.float_list_feature(ymax), + 'image/object/class/label': + dataset_util.int64_list_feature(category_ids), + 'image/object/is_crowd': + dataset_util.int64_list_feature(is_crowd), + 'image/object/area': + dataset_util.float_list_feature(area), + } + if include_masks: + feature_dict['image/object/mask'] = ( + dataset_util.bytes_list_feature(encoded_mask_png)) + example = tf.train.Example(features=tf.train.Features(feature=feature_dict)) + return key, example, num_annotations_skipped + + +def _create_tf_record_from_coco_annotations( + annotations_file, image_dir, output_path, include_masks): + """Loads COCO annotation json files and converts to tf.Record format. + + Args: + annotations_file: JSON file containing bounding box annotations. + image_dir: Directory containing the image files. + output_path: Path to output tf.Record file. + include_masks: Whether to include instance segmentations masks + (PNG encoded) in the result. default: False. 
+ """ + with tf.gfile.GFile(annotations_file, 'r') as fid: + groundtruth_data = json.load(fid) + images = groundtruth_data['images'] + category_index = label_map_util.create_category_index( + groundtruth_data['categories']) + + annotations_index = {} + if 'annotations' in groundtruth_data: + tf.logging.info( + 'Found groundtruth annotations. Building annotations index.') + for annotation in groundtruth_data['annotations']: + image_id = annotation['image_id'] + if image_id not in annotations_index: + annotations_index[image_id] = [] + annotations_index[image_id].append(annotation) + missing_annotation_count = 0 + for image in images: + image_id = image['id'] + if image_id not in annotations_index: + missing_annotation_count += 1 + annotations_index[image_id] = [] + tf.logging.info('%d images are missing annotations.', + missing_annotation_count) + + tf.logging.info('writing to output path: %s', output_path) + writer = tf.python_io.TFRecordWriter(output_path) + total_num_annotations_skipped = 0 + for idx, image in enumerate(images): + if idx % 100 == 0: + tf.logging.info('On image %d of %d', idx, len(images)) + annotations_list = annotations_index[image['id']] + _, tf_example, num_annotations_skipped = create_tf_example( + image, annotations_list, image_dir, category_index, include_masks) + total_num_annotations_skipped += num_annotations_skipped + writer.write(tf_example.SerializeToString()) + writer.close() + tf.logging.info('Finished writing, skipped %d annotations.', + total_num_annotations_skipped) + + +def main(_): + assert FLAGS.train_image_dir, '`train_image_dir` missing.' + assert FLAGS.val_image_dir, '`val_image_dir` missing.' + assert FLAGS.test_image_dir, '`test_image_dir` missing.' + assert FLAGS.train_annotations_file, '`train_annotations_file` missing.' + assert FLAGS.val_annotations_file, '`val_annotations_file` missing.' + assert FLAGS.testdev_annotations_file, '`testdev_annotations_file` missing.' + + if not tf.gfile.IsDirectory(FLAGS.output_dir): + tf.gfile.MakeDirs(FLAGS.output_dir) + train_output_path = os.path.join(FLAGS.output_dir, 'coco_train.record') + val_output_path = os.path.join(FLAGS.output_dir, 'coco_val.record') + testdev_output_path = os.path.join(FLAGS.output_dir, 'coco_testdev.record') + + _create_tf_record_from_coco_annotations( + FLAGS.train_annotations_file, + FLAGS.train_image_dir, + train_output_path, + FLAGS.include_masks) + _create_tf_record_from_coco_annotations( + FLAGS.val_annotations_file, + FLAGS.val_image_dir, + val_output_path, + FLAGS.include_masks) + _create_tf_record_from_coco_annotations( + FLAGS.testdev_annotations_file, + FLAGS.test_image_dir, + testdev_output_path, + FLAGS.include_masks) + + +if __name__ == '__main__': + tf.app.run() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_coco_tf_record_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_coco_tf_record_test.py new file mode 100644 index 0000000000000000000000000000000000000000..45697eeff5bc9f103621fda2cb729ee71ef7c4d6 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_coco_tf_record_test.py @@ -0,0 +1,188 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Test for create_coco_tf_record.py.""" + +import io +import os + +import numpy as np +import PIL.Image +import tensorflow as tf + +from object_detection.dataset_tools import create_coco_tf_record + + +class CreateCocoTFRecordTest(tf.test.TestCase): + + def _assertProtoEqual(self, proto_field, expectation): + """Helper function to assert if a proto field equals some value. + + Args: + proto_field: The protobuf field to compare. + expectation: The expected value of the protobuf field. + """ + proto_list = [p for p in proto_field] + self.assertListEqual(proto_list, expectation) + + def test_create_tf_example(self): + image_file_name = 'tmp_image.jpg' + image_data = np.random.rand(256, 256, 3) + tmp_dir = self.get_temp_dir() + save_path = os.path.join(tmp_dir, image_file_name) + image = PIL.Image.fromarray(image_data, 'RGB') + image.save(save_path) + + image = { + 'file_name': image_file_name, + 'height': 256, + 'width': 256, + 'id': 11, + } + + annotations_list = [{ + 'area': .5, + 'iscrowd': False, + 'image_id': 11, + 'bbox': [64, 64, 128, 128], + 'category_id': 2, + 'id': 1000, + }] + + image_dir = tmp_dir + category_index = { + 1: { + 'name': 'dog', + 'id': 1 + }, + 2: { + 'name': 'cat', + 'id': 2 + }, + 3: { + 'name': 'human', + 'id': 3 + } + } + + (_, example, + num_annotations_skipped) = create_coco_tf_record.create_tf_example( + image, annotations_list, image_dir, category_index) + + self.assertEqual(num_annotations_skipped, 0) + self._assertProtoEqual( + example.features.feature['image/height'].int64_list.value, [256]) + self._assertProtoEqual( + example.features.feature['image/width'].int64_list.value, [256]) + self._assertProtoEqual( + example.features.feature['image/filename'].bytes_list.value, + [image_file_name]) + self._assertProtoEqual( + example.features.feature['image/source_id'].bytes_list.value, + [str(image['id'])]) + self._assertProtoEqual( + example.features.feature['image/format'].bytes_list.value, ['jpeg']) + self._assertProtoEqual( + example.features.feature['image/object/bbox/xmin'].float_list.value, + [0.25]) + self._assertProtoEqual( + example.features.feature['image/object/bbox/ymin'].float_list.value, + [0.25]) + self._assertProtoEqual( + example.features.feature['image/object/bbox/xmax'].float_list.value, + [0.75]) + self._assertProtoEqual( + example.features.feature['image/object/bbox/ymax'].float_list.value, + [0.75]) + + def test_create_tf_example_with_instance_masks(self): + image_file_name = 'tmp_image.jpg' + image_data = np.random.rand(8, 8, 3) + tmp_dir = self.get_temp_dir() + save_path = os.path.join(tmp_dir, image_file_name) + image = PIL.Image.fromarray(image_data, 'RGB') + image.save(save_path) + + image = { + 'file_name': image_file_name, + 'height': 8, + 'width': 8, + 'id': 11, + } + + annotations_list = [{ + 'area': .5, + 'iscrowd': False, + 'image_id': 11, + 'bbox': [0, 0, 8, 8], + 'segmentation': [[4, 0, 0, 0, 0, 4], [8, 4, 4, 8, 8, 8]], + 'category_id': 1, + 'id': 1000, + }] + + image_dir = tmp_dir + category_index = { + 1: { + 'name': 'dog', + 'id': 1 + 
}, + } + + (_, example, + num_annotations_skipped) = create_coco_tf_record.create_tf_example( + image, annotations_list, image_dir, category_index, include_masks=True) + + self.assertEqual(num_annotations_skipped, 0) + self._assertProtoEqual( + example.features.feature['image/height'].int64_list.value, [8]) + self._assertProtoEqual( + example.features.feature['image/width'].int64_list.value, [8]) + self._assertProtoEqual( + example.features.feature['image/filename'].bytes_list.value, + [image_file_name]) + self._assertProtoEqual( + example.features.feature['image/source_id'].bytes_list.value, + [str(image['id'])]) + self._assertProtoEqual( + example.features.feature['image/format'].bytes_list.value, ['jpeg']) + self._assertProtoEqual( + example.features.feature['image/object/bbox/xmin'].float_list.value, + [0]) + self._assertProtoEqual( + example.features.feature['image/object/bbox/ymin'].float_list.value, + [0]) + self._assertProtoEqual( + example.features.feature['image/object/bbox/xmax'].float_list.value, + [1]) + self._assertProtoEqual( + example.features.feature['image/object/bbox/ymax'].float_list.value, + [1]) + encoded_mask_pngs = [ + io.BytesIO(encoded_masks) for encoded_masks in example.features.feature[ + 'image/object/mask'].bytes_list.value + ] + pil_masks = [ + np.array(PIL.Image.open(encoded_mask_png)) + for encoded_mask_png in encoded_mask_pngs + ] + self.assertTrue(len(pil_masks) == 1) + self.assertAllEqual(pil_masks[0], + [[1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1, 1], + [0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1]]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_kitti_tf_record.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_kitti_tf_record.py new file mode 100644 index 0000000000000000000000000000000000000000..c612db99166114689b8c40112bc03be53db44eef --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_kitti_tf_record.py @@ -0,0 +1,310 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +r"""Convert raw KITTI detection dataset to TFRecord for object_detection. + +Converts KITTI detection dataset to TFRecords with a standard format allowing + to use this dataset to train object detectors. The raw dataset can be + downloaded from: + http://kitti.is.tue.mpg.de/kitti/data_object_image_2.zip. + http://kitti.is.tue.mpg.de/kitti/data_object_label_2.zip + Permission can be requested at the main website. + + KITTI detection dataset contains 7481 training images. Using this code with + the default settings will set aside the first 500 images as a validation set. + This can be altered using the flags, see details below. 
+ +Example usage: + python object_detection/dataset_tools/create_kitti_tf_record.py \ + --data_dir=/home/user/kitti \ + --output_path=/home/user/kitti.record +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +import hashlib +import io +import os + +import numpy as np +import PIL.Image as pil +import tensorflow as tf + +from object_detection.utils import dataset_util +from object_detection.utils import label_map_util +from object_detection.utils.np_box_ops import iou + +tf.app.flags.DEFINE_string('data_dir', '', 'Location of root directory for the ' + 'data. Folder structure is assumed to be:' + '/training/label_2 (annotations) and' + '/data_object_image_2/training/image_2' + '(images).') +tf.app.flags.DEFINE_string('output_path', '', 'Path to which TFRecord files' + 'will be written. The TFRecord with the training set' + 'will be located at: _train.tfrecord.' + 'And the TFRecord with the validation set will be' + 'located at: _val.tfrecord') +tf.app.flags.DEFINE_string('classes_to_use', 'car,pedestrian,dontcare', + 'Comma separated list of class names that will be' + 'used. Adding the dontcare class will remove all' + 'bboxs in the dontcare regions.') +tf.app.flags.DEFINE_string('label_map_path', 'data/kitti_label_map.pbtxt', + 'Path to label map proto.') +tf.app.flags.DEFINE_integer('validation_set_size', '500', 'Number of images to' + 'be used as a validation set.') +FLAGS = tf.app.flags.FLAGS + + +def convert_kitti_to_tfrecords(data_dir, output_path, classes_to_use, + label_map_path, validation_set_size): + """Convert the KITTI detection dataset to TFRecords. + + Args: + data_dir: The full path to the unzipped folder containing the unzipped data + from data_object_image_2 and data_object_label_2.zip. + Folder structure is assumed to be: data_dir/training/label_2 (annotations) + and data_dir/data_object_image_2/training/image_2 (images). + output_path: The path to which TFRecord files will be written. The TFRecord + with the training set will be located at: _train.tfrecord + And the TFRecord with the validation set will be located at: + _val.tfrecord + classes_to_use: List of strings naming the classes for which data should be + converted. Use the same names as presented in the KIITI README file. + Adding dontcare class will remove all other bounding boxes that overlap + with areas marked as dontcare regions. + label_map_path: Path to label map proto + validation_set_size: How many images should be left as the validation set. + (Ffirst `validation_set_size` examples are selected to be in the + validation set). + """ + label_map_dict = label_map_util.get_label_map_dict(label_map_path) + train_count = 0 + val_count = 0 + + annotation_dir = os.path.join(data_dir, + 'training', + 'label_2') + + image_dir = os.path.join(data_dir, + 'data_object_image_2', + 'training', + 'image_2') + + train_writer = tf.python_io.TFRecordWriter('%s_train.tfrecord'% + output_path) + val_writer = tf.python_io.TFRecordWriter('%s_val.tfrecord'% + output_path) + + images = sorted(tf.gfile.ListDirectory(image_dir)) + for img_name in images: + img_num = int(img_name.split('.')[0]) + is_validation_img = img_num < validation_set_size + img_anno = read_annotation_file(os.path.join(annotation_dir, + str(img_num).zfill(6)+'.txt')) + + image_path = os.path.join(image_dir, img_name) + + # Filter all bounding boxes of this frame that are of a legal class, and + # don't overlap with a dontcare region. 
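For reference, each line of a KITTI label_2 file carries 15 whitespace-separated fields in the order parsed by read_annotation_file below: type, truncated, occluded, alpha, the 2D box (left, top, right, bottom), the 3D box (height, width, length, x, y, z) and rotation_y. A made-up line and a few of the fields this converter reads:

# Hypothetical label line; values are illustrative, not taken from the dataset.
sample_line = 'Car 0.00 0 -1.58 587.01 173.33 614.12 200.12 1.65 1.67 3.64 -0.65 1.71 46.70 -1.59'
sample_fields = sample_line.strip().split(' ')
sample_type = sample_fields[0].lower()                                   # 'car'
bbox_left, bbox_top = float(sample_fields[4]), float(sample_fields[5])   # 587.01, 173.33
bbox_right, bbox_bottom = float(sample_fields[6]), float(sample_fields[7])
# prepare_example() later normalizes these corners by the image width/height;
# the 3D box and rotation fields are stored as-is in the tf.Example.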
+ # TODO(talremez) filter out targets that are truncated or heavily occluded. + annotation_for_image = filter_annotations(img_anno, classes_to_use) + + example = prepare_example(image_path, annotation_for_image, label_map_dict) + if is_validation_img: + val_writer.write(example.SerializeToString()) + val_count += 1 + else: + train_writer.write(example.SerializeToString()) + train_count += 1 + + train_writer.close() + val_writer.close() + + +def prepare_example(image_path, annotations, label_map_dict): + """Converts a dictionary with annotations for an image to tf.Example proto. + + Args: + image_path: The complete path to image. + annotations: A dictionary representing the annotation of a single object + that appears in the image. + label_map_dict: A map from string label names to integer ids. + + Returns: + example: The converted tf.Example. + """ + with tf.gfile.GFile(image_path, 'rb') as fid: + encoded_png = fid.read() + encoded_png_io = io.BytesIO(encoded_png) + image = pil.open(encoded_png_io) + image = np.asarray(image) + + key = hashlib.sha256(encoded_png).hexdigest() + + width = int(image.shape[1]) + height = int(image.shape[0]) + + xmin_norm = annotations['2d_bbox_left'] / float(width) + ymin_norm = annotations['2d_bbox_top'] / float(height) + xmax_norm = annotations['2d_bbox_right'] / float(width) + ymax_norm = annotations['2d_bbox_bottom'] / float(height) + + difficult_obj = [0]*len(xmin_norm) + + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/height': dataset_util.int64_feature(height), + 'image/width': dataset_util.int64_feature(width), + 'image/filename': dataset_util.bytes_feature(image_path.encode('utf8')), + 'image/source_id': dataset_util.bytes_feature(image_path.encode('utf8')), + 'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')), + 'image/encoded': dataset_util.bytes_feature(encoded_png), + 'image/format': dataset_util.bytes_feature('png'.encode('utf8')), + 'image/object/bbox/xmin': dataset_util.float_list_feature(xmin_norm), + 'image/object/bbox/xmax': dataset_util.float_list_feature(xmax_norm), + 'image/object/bbox/ymin': dataset_util.float_list_feature(ymin_norm), + 'image/object/bbox/ymax': dataset_util.float_list_feature(ymax_norm), + 'image/object/class/text': dataset_util.bytes_list_feature( + [x.encode('utf8') for x in annotations['type']]), + 'image/object/class/label': dataset_util.int64_list_feature( + [label_map_dict[x] for x in annotations['type']]), + 'image/object/difficult': dataset_util.int64_list_feature(difficult_obj), + 'image/object/truncated': dataset_util.float_list_feature( + annotations['truncated']), + 'image/object/alpha': dataset_util.float_list_feature( + annotations['alpha']), + 'image/object/3d_bbox/height': dataset_util.float_list_feature( + annotations['3d_bbox_height']), + 'image/object/3d_bbox/width': dataset_util.float_list_feature( + annotations['3d_bbox_width']), + 'image/object/3d_bbox/length': dataset_util.float_list_feature( + annotations['3d_bbox_length']), + 'image/object/3d_bbox/x': dataset_util.float_list_feature( + annotations['3d_bbox_x']), + 'image/object/3d_bbox/y': dataset_util.float_list_feature( + annotations['3d_bbox_y']), + 'image/object/3d_bbox/z': dataset_util.float_list_feature( + annotations['3d_bbox_z']), + 'image/object/3d_bbox/rot_y': dataset_util.float_list_feature( + annotations['3d_bbox_rot_y']), + })) + + return example + + +def filter_annotations(img_all_annotations, used_classes): + """Filters out annotations from the unused classes and dontcare regions. 
+ + Filters out the annotations that belong to classes we do now wish to use and + (optionally) also removes all boxes that overlap with dontcare regions. + + Args: + img_all_annotations: A list of annotation dictionaries. See documentation of + read_annotation_file for more details about the format of the annotations. + used_classes: A list of strings listing the classes we want to keep, if the + list contains "dontcare", all bounding boxes with overlapping with dont + care regions will also be filtered out. + + Returns: + img_filtered_annotations: A list of annotation dictionaries that have passed + the filtering. + """ + + img_filtered_annotations = {} + + # Filter the type of the objects. + relevant_annotation_indices = [ + i for i, x in enumerate(img_all_annotations['type']) if x in used_classes + ] + + for key in img_all_annotations.keys(): + img_filtered_annotations[key] = ( + img_all_annotations[key][relevant_annotation_indices]) + + if 'dontcare' in used_classes: + dont_care_indices = [i for i, + x in enumerate(img_filtered_annotations['type']) + if x == 'dontcare'] + + # bounding box format [y_min, x_min, y_max, x_max] + all_boxes = np.stack([img_filtered_annotations['2d_bbox_top'], + img_filtered_annotations['2d_bbox_left'], + img_filtered_annotations['2d_bbox_bottom'], + img_filtered_annotations['2d_bbox_right']], + axis=1) + + ious = iou(boxes1=all_boxes, + boxes2=all_boxes[dont_care_indices]) + + # Remove all bounding boxes that overlap with a dontcare region. + if ious.size > 0: + boxes_to_remove = np.amax(ious, axis=1) > 0.0 + for key in img_all_annotations.keys(): + img_filtered_annotations[key] = ( + img_filtered_annotations[key][np.logical_not(boxes_to_remove)]) + + return img_filtered_annotations + + +def read_annotation_file(filename): + """Reads a KITTI annotation file. + + Converts a KITTI annotation file into a dictionary containing all the + relevant information. + + Args: + filename: the path to the annotataion text file. + + Returns: + anno: A dictionary with the converted annotation information. See annotation + README file for details on the different fields. 
+ """ + with open(filename) as f: + content = f.readlines() + content = [x.strip().split(' ') for x in content] + + anno = {} + anno['type'] = np.array([x[0].lower() for x in content]) + anno['truncated'] = np.array([float(x[1]) for x in content]) + anno['occluded'] = np.array([int(x[2]) for x in content]) + anno['alpha'] = np.array([float(x[3]) for x in content]) + + anno['2d_bbox_left'] = np.array([float(x[4]) for x in content]) + anno['2d_bbox_top'] = np.array([float(x[5]) for x in content]) + anno['2d_bbox_right'] = np.array([float(x[6]) for x in content]) + anno['2d_bbox_bottom'] = np.array([float(x[7]) for x in content]) + + anno['3d_bbox_height'] = np.array([float(x[8]) for x in content]) + anno['3d_bbox_width'] = np.array([float(x[9]) for x in content]) + anno['3d_bbox_length'] = np.array([float(x[10]) for x in content]) + anno['3d_bbox_x'] = np.array([float(x[11]) for x in content]) + anno['3d_bbox_y'] = np.array([float(x[12]) for x in content]) + anno['3d_bbox_z'] = np.array([float(x[13]) for x in content]) + anno['3d_bbox_rot_y'] = np.array([float(x[14]) for x in content]) + + return anno + + +def main(_): + convert_kitti_to_tfrecords( + data_dir=FLAGS.data_dir, + output_path=FLAGS.output_path, + classes_to_use=FLAGS.classes_to_use.split(','), + label_map_path=FLAGS.label_map_path, + validation_set_size=FLAGS.validation_set_size) + +if __name__ == '__main__': + tf.app.run() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_kitti_tf_record_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_kitti_tf_record_test.py new file mode 100644 index 0000000000000000000000000000000000000000..37ac4b8b19d65f8533ecefec318b409df12bce5f --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_kitti_tf_record_test.py @@ -0,0 +1,130 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Test for create_kitti_tf_record.py.""" + +import os + +import numpy as np +import PIL.Image +import tensorflow as tf + +from object_detection.dataset_tools import create_kitti_tf_record + + +class CreateKittiTFRecordTest(tf.test.TestCase): + + def _assertProtoEqual(self, proto_field, expectation): + """Helper function to assert if a proto field equals some value. + + Args: + proto_field: The protobuf field to compare. + expectation: The expected value of the protobuf field. 
+ """ + proto_list = [p for p in proto_field] + self.assertListEqual(proto_list, expectation) + + def test_dict_to_tf_example(self): + image_file_name = 'tmp_image.jpg' + image_data = np.random.rand(256, 256, 3) + save_path = os.path.join(self.get_temp_dir(), image_file_name) + image = PIL.Image.fromarray(image_data, 'RGB') + image.save(save_path) + + annotations = {} + annotations['2d_bbox_left'] = np.array([64]) + annotations['2d_bbox_top'] = np.array([64]) + annotations['2d_bbox_right'] = np.array([192]) + annotations['2d_bbox_bottom'] = np.array([192]) + annotations['type'] = ['car'] + annotations['truncated'] = np.array([1]) + annotations['alpha'] = np.array([2]) + annotations['3d_bbox_height'] = np.array([10]) + annotations['3d_bbox_width'] = np.array([11]) + annotations['3d_bbox_length'] = np.array([12]) + annotations['3d_bbox_x'] = np.array([13]) + annotations['3d_bbox_y'] = np.array([14]) + annotations['3d_bbox_z'] = np.array([15]) + annotations['3d_bbox_rot_y'] = np.array([4]) + + label_map_dict = { + 'background': 0, + 'car': 1, + } + + example = create_kitti_tf_record.prepare_example( + save_path, + annotations, + label_map_dict) + + self._assertProtoEqual( + example.features.feature['image/height'].int64_list.value, [256]) + self._assertProtoEqual( + example.features.feature['image/width'].int64_list.value, [256]) + self._assertProtoEqual( + example.features.feature['image/filename'].bytes_list.value, + [save_path]) + self._assertProtoEqual( + example.features.feature['image/source_id'].bytes_list.value, + [save_path]) + self._assertProtoEqual( + example.features.feature['image/format'].bytes_list.value, ['png']) + self._assertProtoEqual( + example.features.feature['image/object/bbox/xmin'].float_list.value, + [0.25]) + self._assertProtoEqual( + example.features.feature['image/object/bbox/ymin'].float_list.value, + [0.25]) + self._assertProtoEqual( + example.features.feature['image/object/bbox/xmax'].float_list.value, + [0.75]) + self._assertProtoEqual( + example.features.feature['image/object/bbox/ymax'].float_list.value, + [0.75]) + self._assertProtoEqual( + example.features.feature['image/object/class/text'].bytes_list.value, + ['car']) + self._assertProtoEqual( + example.features.feature['image/object/class/label'].int64_list.value, + [1]) + self._assertProtoEqual( + example.features.feature['image/object/truncated'].float_list.value, + [1]) + self._assertProtoEqual( + example.features.feature['image/object/alpha'].float_list.value, + [2]) + self._assertProtoEqual(example.features.feature[ + 'image/object/3d_bbox/height'].float_list.value, [10]) + self._assertProtoEqual( + example.features.feature['image/object/3d_bbox/width'].float_list.value, + [11]) + self._assertProtoEqual(example.features.feature[ + 'image/object/3d_bbox/length'].float_list.value, [12]) + self._assertProtoEqual( + example.features.feature['image/object/3d_bbox/x'].float_list.value, + [13]) + self._assertProtoEqual( + example.features.feature['image/object/3d_bbox/y'].float_list.value, + [14]) + self._assertProtoEqual( + example.features.feature['image/object/3d_bbox/z'].float_list.value, + [15]) + self._assertProtoEqual( + example.features.feature['image/object/3d_bbox/rot_y'].float_list.value, + [4]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_oid_tf_record.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_oid_tf_record.py new file mode 100644 index 
0000000000000000000000000000000000000000..26d9699c8ee4ec17ef329f91e0df31ca79d50c99 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_oid_tf_record.py @@ -0,0 +1,117 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +r"""Creates TFRecords of Open Images dataset for object detection. + +Example usage: + python object_detection/dataset_tools/create_oid_tf_record.py \ + --input_box_annotations_csv=/path/to/input/annotations-human-bbox.csv \ + --input_image_label_annotations_csv=/path/to/input/annotations-label.csv \ + --input_images_directory=/path/to/input/image_pixels_directory \ + --input_label_map=/path/to/input/labels_bbox_545.labelmap \ + --output_tf_record_path_prefix=/path/to/output/prefix.tfrecord + +CSVs with bounding box annotations and image metadata (including the image URLs) +can be downloaded from the Open Images GitHub repository: +https://github.com/openimages/dataset + +This script will include every image found in the input_images_directory in the +output TFRecord, even if the image has no corresponding bounding box annotations +in the input_annotations_csv. If input_image_label_annotations_csv is specified, +it will add image-level labels as well. Note that the information of whether a +label is positivelly or negativelly verified is NOT added to tfrecord. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import contextlib2 +import pandas as pd +import tensorflow as tf + +from object_detection.dataset_tools import oid_tfrecord_creation +from object_detection.dataset_tools import tf_record_creation_util +from object_detection.utils import label_map_util + +tf.flags.DEFINE_string('input_box_annotations_csv', None, + 'Path to CSV containing image bounding box annotations') +tf.flags.DEFINE_string('input_images_directory', None, + 'Directory containing the image pixels ' + 'downloaded from the OpenImages GitHub repository.') +tf.flags.DEFINE_string('input_image_label_annotations_csv', None, + 'Path to CSV containing image-level labels annotations') +tf.flags.DEFINE_string('input_label_map', None, 'Path to the label map proto') +tf.flags.DEFINE_string( + 'output_tf_record_path_prefix', None, + 'Path to the output TFRecord. 
The shard index and the number of shards ' + 'will be appended for each output shard.') +tf.flags.DEFINE_integer('num_shards', 100, 'Number of TFRecord shards') + +FLAGS = tf.flags.FLAGS + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + required_flags = [ + 'input_box_annotations_csv', 'input_images_directory', 'input_label_map', + 'output_tf_record_path_prefix' + ] + for flag_name in required_flags: + if not getattr(FLAGS, flag_name): + raise ValueError('Flag --{} is required'.format(flag_name)) + + label_map = label_map_util.get_label_map_dict(FLAGS.input_label_map) + all_box_annotations = pd.read_csv(FLAGS.input_box_annotations_csv) + if FLAGS.input_image_label_annotations_csv: + all_label_annotations = pd.read_csv(FLAGS.input_image_label_annotations_csv) + all_label_annotations.rename( + columns={'Confidence': 'ConfidenceImageLabel'}, inplace=True) + else: + all_label_annotations = None + all_images = tf.gfile.Glob( + os.path.join(FLAGS.input_images_directory, '*.jpg')) + all_image_ids = [os.path.splitext(os.path.basename(v))[0] for v in all_images] + all_image_ids = pd.DataFrame({'ImageID': all_image_ids}) + all_annotations = pd.concat( + [all_box_annotations, all_image_ids, all_label_annotations]) + + tf.logging.log(tf.logging.INFO, 'Found %d images...', len(all_image_ids)) + + with contextlib2.ExitStack() as tf_record_close_stack: + output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords( + tf_record_close_stack, FLAGS.output_tf_record_path_prefix, + FLAGS.num_shards) + + for counter, image_data in enumerate(all_annotations.groupby('ImageID')): + tf.logging.log_every_n(tf.logging.INFO, 'Processed %d images...', 1000, + counter) + + image_id, image_annotations = image_data + # In OID image file names are formed by appending ".jpg" to the image ID. + image_path = os.path.join(FLAGS.input_images_directory, image_id + '.jpg') + with tf.gfile.Open(image_path) as image_file: + encoded_image = image_file.read() + + tf_example = oid_tfrecord_creation.tf_example_from_annotations_data_frame( + image_annotations, label_map, encoded_image) + if tf_example: + shard_idx = int(image_id, 16) % FLAGS.num_shards + output_tfrecords[shard_idx].write(tf_example.SerializeToString()) + + +if __name__ == '__main__': + tf.app.run() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_pascal_tf_record.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_pascal_tf_record.py new file mode 100644 index 0000000000000000000000000000000000000000..813071c924ae457453190710181be2d702b439ce --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_pascal_tf_record.py @@ -0,0 +1,185 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +r"""Convert raw PASCAL dataset to TFRecord for object_detection. 
+ +Example usage: + python object_detection/dataset_tools/create_pascal_tf_record.py \ + --data_dir=/home/user/VOCdevkit \ + --year=VOC2012 \ + --output_path=/home/user/pascal.record +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import hashlib +import io +import logging +import os + +from lxml import etree +import PIL.Image +import tensorflow as tf + +from object_detection.utils import dataset_util +from object_detection.utils import label_map_util + + +flags = tf.app.flags +flags.DEFINE_string('data_dir', '', 'Root directory to raw PASCAL VOC dataset.') +flags.DEFINE_string('set', 'train', 'Convert training set, validation set or ' + 'merged set.') +flags.DEFINE_string('annotations_dir', 'Annotations', + '(Relative) path to annotations directory.') +flags.DEFINE_string('year', 'VOC2007', 'Desired challenge year.') +flags.DEFINE_string('output_path', '', 'Path to output TFRecord') +flags.DEFINE_string('label_map_path', 'data/pascal_label_map.pbtxt', + 'Path to label map proto') +flags.DEFINE_boolean('ignore_difficult_instances', False, 'Whether to ignore ' + 'difficult instances') +FLAGS = flags.FLAGS + +SETS = ['train', 'val', 'trainval', 'test'] +YEARS = ['VOC2007', 'VOC2012', 'merged'] + + +def dict_to_tf_example(data, + dataset_directory, + label_map_dict, + ignore_difficult_instances=False, + image_subdirectory='JPEGImages'): + """Convert XML derived dict to tf.Example proto. + + Notice that this function normalizes the bounding box coordinates provided + by the raw data. + + Args: + data: dict holding PASCAL XML fields for a single image (obtained by + running dataset_util.recursive_parse_xml_to_dict) + dataset_directory: Path to root directory holding PASCAL dataset + label_map_dict: A map from string label names to integers ids. + ignore_difficult_instances: Whether to skip difficult instances in the + dataset (default: False). + image_subdirectory: String specifying subdirectory within the + PASCAL dataset directory holding the actual image data. + + Returns: + example: The converted tf.Example. 
+ + Raises: + ValueError: if the image pointed to by data['filename'] is not a valid JPEG + """ + img_path = os.path.join(data['folder'], image_subdirectory, data['filename']) + full_path = os.path.join(dataset_directory, img_path) + with tf.gfile.GFile(full_path, 'rb') as fid: + encoded_jpg = fid.read() + encoded_jpg_io = io.BytesIO(encoded_jpg) + image = PIL.Image.open(encoded_jpg_io) + if image.format != 'JPEG': + raise ValueError('Image format not JPEG') + key = hashlib.sha256(encoded_jpg).hexdigest() + + width = int(data['size']['width']) + height = int(data['size']['height']) + + xmin = [] + ymin = [] + xmax = [] + ymax = [] + classes = [] + classes_text = [] + truncated = [] + poses = [] + difficult_obj = [] + if 'object' in data: + for obj in data['object']: + difficult = bool(int(obj['difficult'])) + if ignore_difficult_instances and difficult: + continue + + difficult_obj.append(int(difficult)) + + xmin.append(float(obj['bndbox']['xmin']) / width) + ymin.append(float(obj['bndbox']['ymin']) / height) + xmax.append(float(obj['bndbox']['xmax']) / width) + ymax.append(float(obj['bndbox']['ymax']) / height) + classes_text.append(obj['name'].encode('utf8')) + classes.append(label_map_dict[obj['name']]) + truncated.append(int(obj['truncated'])) + poses.append(obj['pose'].encode('utf8')) + + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/height': dataset_util.int64_feature(height), + 'image/width': dataset_util.int64_feature(width), + 'image/filename': dataset_util.bytes_feature( + data['filename'].encode('utf8')), + 'image/source_id': dataset_util.bytes_feature( + data['filename'].encode('utf8')), + 'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')), + 'image/encoded': dataset_util.bytes_feature(encoded_jpg), + 'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')), + 'image/object/bbox/xmin': dataset_util.float_list_feature(xmin), + 'image/object/bbox/xmax': dataset_util.float_list_feature(xmax), + 'image/object/bbox/ymin': dataset_util.float_list_feature(ymin), + 'image/object/bbox/ymax': dataset_util.float_list_feature(ymax), + 'image/object/class/text': dataset_util.bytes_list_feature(classes_text), + 'image/object/class/label': dataset_util.int64_list_feature(classes), + 'image/object/difficult': dataset_util.int64_list_feature(difficult_obj), + 'image/object/truncated': dataset_util.int64_list_feature(truncated), + 'image/object/view': dataset_util.bytes_list_feature(poses), + })) + return example + + +def main(_): + if FLAGS.set not in SETS: + raise ValueError('set must be in : {}'.format(SETS)) + if FLAGS.year not in YEARS: + raise ValueError('year must be in : {}'.format(YEARS)) + + data_dir = FLAGS.data_dir + years = ['VOC2007', 'VOC2012'] + if FLAGS.year != 'merged': + years = [FLAGS.year] + + writer = tf.python_io.TFRecordWriter(FLAGS.output_path) + + label_map_dict = label_map_util.get_label_map_dict(FLAGS.label_map_path) + + for year in years: + logging.info('Reading from PASCAL %s dataset.', year) + examples_path = os.path.join(data_dir, year, 'ImageSets', 'Main', + 'aeroplane_' + FLAGS.set + '.txt') + annotations_dir = os.path.join(data_dir, year, FLAGS.annotations_dir) + examples_list = dataset_util.read_examples_list(examples_path) + for idx, example in enumerate(examples_list): + if idx % 100 == 0: + logging.info('On image %d of %d', idx, len(examples_list)) + path = os.path.join(annotations_dir, example + '.xml') + with tf.gfile.GFile(path, 'r') as fid: + xml_str = fid.read() + xml = etree.fromstring(xml_str) + 
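For orientation, here is a hand-written, abbreviated VOC annotation and the nested dict it becomes, using the imports already present in this module; real annotation files carry more fields, and the unit test for this script builds the same dict shape by hand.

sample_xml = """<annotation>
  <folder>VOC2007</folder>
  <filename>000001.jpg</filename>
  <size><width>353</width><height>500</height><depth>3</depth></size>
  <object>
    <name>dog</name><pose>Left</pose><truncated>1</truncated><difficult>0</difficult>
    <bndbox><xmin>48</xmin><ymin>240</ymin><xmax>195</xmax><ymax>371</ymax></bndbox>
  </object>
</annotation>"""
sample_data = dataset_util.recursive_parse_xml_to_dict(
    etree.fromstring(sample_xml))['annotation']
# sample_data['size']['width'] == '353' (values stay strings), and
# sample_data['object'] is a list of per-object dicts whose 'bndbox' corners
# dict_to_tf_example() normalizes by the image width and height.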
data = dataset_util.recursive_parse_xml_to_dict(xml)['annotation'] + + tf_example = dict_to_tf_example(data, FLAGS.data_dir, label_map_dict, + FLAGS.ignore_difficult_instances) + writer.write(tf_example.SerializeToString()) + + writer.close() + + +if __name__ == '__main__': + tf.app.run() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_pascal_tf_record_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_pascal_tf_record_test.py new file mode 100644 index 0000000000000000000000000000000000000000..66929bd466a3db5acc9b79460993486c1cd10f34 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_pascal_tf_record_test.py @@ -0,0 +1,118 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Test for create_pascal_tf_record.py.""" + +import os + +import numpy as np +import PIL.Image +import tensorflow as tf + +from object_detection.dataset_tools import create_pascal_tf_record + + +class CreatePascalTFRecordTest(tf.test.TestCase): + + def _assertProtoEqual(self, proto_field, expectation): + """Helper function to assert if a proto field equals some value. + + Args: + proto_field: The protobuf field to compare. + expectation: The expected value of the protobuf field. 
+ """ + proto_list = [p for p in proto_field] + self.assertListEqual(proto_list, expectation) + + def test_dict_to_tf_example(self): + image_file_name = 'tmp_image.jpg' + image_data = np.random.rand(256, 256, 3) + save_path = os.path.join(self.get_temp_dir(), image_file_name) + image = PIL.Image.fromarray(image_data, 'RGB') + image.save(save_path) + + data = { + 'folder': '', + 'filename': image_file_name, + 'size': { + 'height': 256, + 'width': 256, + }, + 'object': [ + { + 'difficult': 1, + 'bndbox': { + 'xmin': 64, + 'ymin': 64, + 'xmax': 192, + 'ymax': 192, + }, + 'name': 'person', + 'truncated': 0, + 'pose': '', + }, + ], + } + + label_map_dict = { + 'background': 0, + 'person': 1, + 'notperson': 2, + } + + example = create_pascal_tf_record.dict_to_tf_example( + data, self.get_temp_dir(), label_map_dict, image_subdirectory='') + self._assertProtoEqual( + example.features.feature['image/height'].int64_list.value, [256]) + self._assertProtoEqual( + example.features.feature['image/width'].int64_list.value, [256]) + self._assertProtoEqual( + example.features.feature['image/filename'].bytes_list.value, + [image_file_name]) + self._assertProtoEqual( + example.features.feature['image/source_id'].bytes_list.value, + [image_file_name]) + self._assertProtoEqual( + example.features.feature['image/format'].bytes_list.value, ['jpeg']) + self._assertProtoEqual( + example.features.feature['image/object/bbox/xmin'].float_list.value, + [0.25]) + self._assertProtoEqual( + example.features.feature['image/object/bbox/ymin'].float_list.value, + [0.25]) + self._assertProtoEqual( + example.features.feature['image/object/bbox/xmax'].float_list.value, + [0.75]) + self._assertProtoEqual( + example.features.feature['image/object/bbox/ymax'].float_list.value, + [0.75]) + self._assertProtoEqual( + example.features.feature['image/object/class/text'].bytes_list.value, + ['person']) + self._assertProtoEqual( + example.features.feature['image/object/class/label'].int64_list.value, + [1]) + self._assertProtoEqual( + example.features.feature['image/object/difficult'].int64_list.value, + [1]) + self._assertProtoEqual( + example.features.feature['image/object/truncated'].int64_list.value, + [0]) + self._assertProtoEqual( + example.features.feature['image/object/view'].bytes_list.value, ['']) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_pet_tf_record.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_pet_tf_record.py new file mode 100644 index 0000000000000000000000000000000000000000..9b3b55c60009fb14d7384097d8c7fad02c5d345a --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/create_pet_tf_record.py @@ -0,0 +1,318 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +r"""Convert the Oxford pet dataset to TFRecord for object_detection. + +See: O. M. Parkhi, A. Vedaldi, A. Zisserman, C. V. Jawahar + Cats and Dogs + IEEE Conference on Computer Vision and Pattern Recognition, 2012 + http://www.robots.ox.ac.uk/~vgg/data/pets/ + +Example usage: + python object_detection/dataset_tools/create_pet_tf_record.py \ + --data_dir=/home/user/pet \ + --output_dir=/home/user/pet/output +""" + +import hashlib +import io +import logging +import os +import random +import re + +import contextlib2 +from lxml import etree +import numpy as np +import PIL.Image +import tensorflow as tf + +from object_detection.dataset_tools import tf_record_creation_util +from object_detection.utils import dataset_util +from object_detection.utils import label_map_util + +flags = tf.app.flags +flags.DEFINE_string('data_dir', '', 'Root directory to raw pet dataset.') +flags.DEFINE_string('output_dir', '', 'Path to directory to output TFRecords.') +flags.DEFINE_string('label_map_path', 'data/pet_label_map.pbtxt', + 'Path to label map proto') +flags.DEFINE_boolean('faces_only', True, 'If True, generates bounding boxes ' + 'for pet faces. Otherwise generates bounding boxes (as ' + 'well as segmentations for full pet bodies). Note that ' + 'in the latter case, the resulting files are much larger.') +flags.DEFINE_string('mask_type', 'png', 'How to represent instance ' + 'segmentation masks. Options are "png" or "numerical".') +flags.DEFINE_integer('num_shards', 10, 'Number of TFRecord shards') + +FLAGS = flags.FLAGS + + +def get_class_name_from_filename(file_name): + """Gets the class name from a file. + + Args: + file_name: The file name to get the class name from. + ie. "american_pit_bull_terrier_105.jpg" + + Returns: + A string of the class name. + """ + match = re.match(r'([A-Za-z_]+)(_[0-9]+\.jpg)', file_name, re.I) + return match.groups()[0] + + +def dict_to_tf_example(data, + mask_path, + label_map_dict, + image_subdirectory, + ignore_difficult_instances=False, + faces_only=True, + mask_type='png'): + """Convert XML derived dict to tf.Example proto. + + Notice that this function normalizes the bounding box coordinates provided + by the raw data. + + Args: + data: dict holding PASCAL XML fields for a single image (obtained by + running dataset_util.recursive_parse_xml_to_dict) + mask_path: String path to PNG encoded mask. + label_map_dict: A map from string label names to integers ids. + image_subdirectory: String specifying subdirectory within the + Pascal dataset directory holding the actual image data. + ignore_difficult_instances: Whether to skip difficult instances in the + dataset (default: False). + faces_only: If True, generates bounding boxes for pet faces. Otherwise + generates bounding boxes (as well as segmentations for full pet bodies). + mask_type: 'numerical' or 'png'. 'png' is recommended because it leads to + smaller file sizes. + + Returns: + example: The converted tf.Example. 
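+      When faces_only is False the box is not taken from the XML bndbox;
+      it is derived from the trimap instead: pixels whose trimap value is
+      not 2 (background) are treated as pet pixels and the tightest box
+      around them is used, as implemented in the function body below.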
+ + Raises: + ValueError: if the image pointed to by data['filename'] is not a valid JPEG + """ + img_path = os.path.join(image_subdirectory, data['filename']) + with tf.gfile.GFile(img_path, 'rb') as fid: + encoded_jpg = fid.read() + encoded_jpg_io = io.BytesIO(encoded_jpg) + image = PIL.Image.open(encoded_jpg_io) + if image.format != 'JPEG': + raise ValueError('Image format not JPEG') + key = hashlib.sha256(encoded_jpg).hexdigest() + + with tf.gfile.GFile(mask_path, 'rb') as fid: + encoded_mask_png = fid.read() + encoded_png_io = io.BytesIO(encoded_mask_png) + mask = PIL.Image.open(encoded_png_io) + if mask.format != 'PNG': + raise ValueError('Mask format not PNG') + + mask_np = np.asarray(mask) + nonbackground_indices_x = np.any(mask_np != 2, axis=0) + nonbackground_indices_y = np.any(mask_np != 2, axis=1) + nonzero_x_indices = np.where(nonbackground_indices_x) + nonzero_y_indices = np.where(nonbackground_indices_y) + + width = int(data['size']['width']) + height = int(data['size']['height']) + + xmins = [] + ymins = [] + xmaxs = [] + ymaxs = [] + classes = [] + classes_text = [] + truncated = [] + poses = [] + difficult_obj = [] + masks = [] + if 'object' in data: + for obj in data['object']: + difficult = bool(int(obj['difficult'])) + if ignore_difficult_instances and difficult: + continue + difficult_obj.append(int(difficult)) + + if faces_only: + xmin = float(obj['bndbox']['xmin']) + xmax = float(obj['bndbox']['xmax']) + ymin = float(obj['bndbox']['ymin']) + ymax = float(obj['bndbox']['ymax']) + else: + xmin = float(np.min(nonzero_x_indices)) + xmax = float(np.max(nonzero_x_indices)) + ymin = float(np.min(nonzero_y_indices)) + ymax = float(np.max(nonzero_y_indices)) + + xmins.append(xmin / width) + ymins.append(ymin / height) + xmaxs.append(xmax / width) + ymaxs.append(ymax / height) + class_name = get_class_name_from_filename(data['filename']) + classes_text.append(class_name.encode('utf8')) + classes.append(label_map_dict[class_name]) + truncated.append(int(obj['truncated'])) + poses.append(obj['pose'].encode('utf8')) + if not faces_only: + mask_remapped = (mask_np != 2).astype(np.uint8) + masks.append(mask_remapped) + + feature_dict = { + 'image/height': dataset_util.int64_feature(height), + 'image/width': dataset_util.int64_feature(width), + 'image/filename': dataset_util.bytes_feature( + data['filename'].encode('utf8')), + 'image/source_id': dataset_util.bytes_feature( + data['filename'].encode('utf8')), + 'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')), + 'image/encoded': dataset_util.bytes_feature(encoded_jpg), + 'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')), + 'image/object/bbox/xmin': dataset_util.float_list_feature(xmins), + 'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs), + 'image/object/bbox/ymin': dataset_util.float_list_feature(ymins), + 'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs), + 'image/object/class/text': dataset_util.bytes_list_feature(classes_text), + 'image/object/class/label': dataset_util.int64_list_feature(classes), + 'image/object/difficult': dataset_util.int64_list_feature(difficult_obj), + 'image/object/truncated': dataset_util.int64_list_feature(truncated), + 'image/object/view': dataset_util.bytes_list_feature(poses), + } + if not faces_only: + if mask_type == 'numerical': + mask_stack = np.stack(masks).astype(np.float32) + masks_flattened = np.reshape(mask_stack, [-1]) + feature_dict['image/object/mask'] = ( + dataset_util.float_list_feature(masks_flattened.tolist())) + elif 
mask_type == 'png': + encoded_mask_png_list = [] + for mask in masks: + img = PIL.Image.fromarray(mask) + output = io.BytesIO() + img.save(output, format='PNG') + encoded_mask_png_list.append(output.getvalue()) + feature_dict['image/object/mask'] = ( + dataset_util.bytes_list_feature(encoded_mask_png_list)) + + example = tf.train.Example(features=tf.train.Features(feature=feature_dict)) + return example + + +def create_tf_record(output_filename, + num_shards, + label_map_dict, + annotations_dir, + image_dir, + examples, + faces_only=True, + mask_type='png'): + """Creates a TFRecord file from examples. + + Args: + output_filename: Path to where output file is saved. + num_shards: Number of shards for output file. + label_map_dict: The label map dictionary. + annotations_dir: Directory where annotation files are stored. + image_dir: Directory where image files are stored. + examples: Examples to parse and save to tf record. + faces_only: If True, generates bounding boxes for pet faces. Otherwise + generates bounding boxes (as well as segmentations for full pet bodies). + mask_type: 'numerical' or 'png'. 'png' is recommended because it leads to + smaller file sizes. + """ + with contextlib2.ExitStack() as tf_record_close_stack: + output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords( + tf_record_close_stack, output_filename, num_shards) + for idx, example in enumerate(examples): + if idx % 100 == 0: + logging.info('On image %d of %d', idx, len(examples)) + xml_path = os.path.join(annotations_dir, 'xmls', example + '.xml') + mask_path = os.path.join(annotations_dir, 'trimaps', example + '.png') + + if not os.path.exists(xml_path): + logging.warning('Could not find %s, ignoring example.', xml_path) + continue + with tf.gfile.GFile(xml_path, 'r') as fid: + xml_str = fid.read() + xml = etree.fromstring(xml_str) + data = dataset_util.recursive_parse_xml_to_dict(xml)['annotation'] + + try: + tf_example = dict_to_tf_example( + data, + mask_path, + label_map_dict, + image_dir, + faces_only=faces_only, + mask_type=mask_type) + if tf_example: + shard_idx = idx % num_shards + output_tfrecords[shard_idx].write(tf_example.SerializeToString()) + except ValueError: + logging.warning('Invalid example: %s, ignoring.', xml_path) + + +# TODO(derekjchow): Add test for pet/PASCAL main files. +def main(_): + data_dir = FLAGS.data_dir + label_map_dict = label_map_util.get_label_map_dict(FLAGS.label_map_path) + + logging.info('Reading from Pet dataset.') + image_dir = os.path.join(data_dir, 'images') + annotations_dir = os.path.join(data_dir, 'annotations') + examples_path = os.path.join(annotations_dir, 'trainval.txt') + examples_list = dataset_util.read_examples_list(examples_path) + + # Test images are not included in the downloaded data set, so we shall perform + # our own split. 
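+  # The split below is deterministic (fixed seed) and uses a 70/30 ratio.
+  # As a rough illustration: if trainval.txt lists 3,680 examples (the
+  # usual size of the Oxford-IIIT Pet trainval split), this yields 2,576
+  # training and 1,104 validation examples.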
+ random.seed(42) + random.shuffle(examples_list) + num_examples = len(examples_list) + num_train = int(0.7 * num_examples) + train_examples = examples_list[:num_train] + val_examples = examples_list[num_train:] + logging.info('%d training and %d validation examples.', + len(train_examples), len(val_examples)) + + train_output_path = os.path.join(FLAGS.output_dir, 'pet_faces_train.record') + val_output_path = os.path.join(FLAGS.output_dir, 'pet_faces_val.record') + if not FLAGS.faces_only: + train_output_path = os.path.join(FLAGS.output_dir, + 'pets_fullbody_with_masks_train.record') + val_output_path = os.path.join(FLAGS.output_dir, + 'pets_fullbody_with_masks_val.record') + create_tf_record( + train_output_path, + FLAGS.num_shards, + label_map_dict, + annotations_dir, + image_dir, + train_examples, + faces_only=FLAGS.faces_only, + mask_type=FLAGS.mask_type) + create_tf_record( + val_output_path, + FLAGS.num_shards, + label_map_dict, + annotations_dir, + image_dir, + val_examples, + faces_only=FLAGS.faces_only, + mask_type=FLAGS.mask_type) + + +if __name__ == '__main__': + tf.app.run() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/download_and_preprocess_mscoco.sh b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/download_and_preprocess_mscoco.sh new file mode 100644 index 0000000000000000000000000000000000000000..843ba86938d35eed18dd6f7968ea87c90551fc13 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/download_and_preprocess_mscoco.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# Script to download and preprocess the MSCOCO data set for detection. +# +# The outputs of this script are TFRecord files containing serialized +# tf.Example protocol buffers. See create_coco_tf_record.py for details of how +# the tf.Example protocol buffers are constructed and see +# http://cocodataset.org/#overview for an overview of the dataset. +# +# usage: +# bash object_detection/dataset_tools/download_and_preprocess_mscoco.sh \ +# /tmp/mscoco +set -e + +if [ -z "$1" ]; then + echo "usage download_and_preprocess_mscoco.sh [data dir]" + exit +fi + +if [ "$(uname)" == "Darwin" ]; then + UNZIP="tar -xf" +else + UNZIP="unzip -nq" +fi + +# Create the output directories. +OUTPUT_DIR="${1%/}" +SCRATCH_DIR="${OUTPUT_DIR}/raw-data" +mkdir -p "${OUTPUT_DIR}" +mkdir -p "${SCRATCH_DIR}" +CURRENT_DIR=$(pwd) + +# Helper function to download and unpack a .zip file. +function download_and_unzip() { + local BASE_URL=${1} + local FILENAME=${2} + + if [ ! -f ${FILENAME} ]; then + echo "Downloading ${FILENAME} to $(pwd)" + wget -nd -c "${BASE_URL}/${FILENAME}" + else + echo "Skipping download of ${FILENAME}" + fi + echo "Unzipping ${FILENAME}" + ${UNZIP} ${FILENAME} +} + +cd ${SCRATCH_DIR} + +# Download the images. 
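+# Rough archive sizes, for planning disk space (approximate figures):
+#   train2017.zip ~18 GB, val2017.zip ~1 GB, test2017.zip ~6 GB,
+#   annotations_trainval2017.zip ~250 MB.
+# Budget roughly twice the archive size for the unpacked copies that end
+# up under ${SCRATCH_DIR}.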
+BASE_IMAGE_URL="http://images.cocodataset.org/zips" + +TRAIN_IMAGE_FILE="train2017.zip" +download_and_unzip ${BASE_IMAGE_URL} ${TRAIN_IMAGE_FILE} +TRAIN_IMAGE_DIR="${SCRATCH_DIR}/train2017" + +VAL_IMAGE_FILE="val2017.zip" +download_and_unzip ${BASE_IMAGE_URL} ${VAL_IMAGE_FILE} +VAL_IMAGE_DIR="${SCRATCH_DIR}/val2017" + +TEST_IMAGE_FILE="test2017.zip" +download_and_unzip ${BASE_IMAGE_URL} ${TEST_IMAGE_FILE} +TEST_IMAGE_DIR="${SCRATCH_DIR}/test2017" + +# Download the annotations. +BASE_INSTANCES_URL="http://images.cocodataset.org/annotations" +INSTANCES_FILE="annotations_trainval2017.zip" +download_and_unzip ${BASE_INSTANCES_URL} ${INSTANCES_FILE} + +TRAIN_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/instances_train2017.json" +VAL_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/instances_val2017.json" + +# Download the test image info. +BASE_IMAGE_INFO_URL="http://images.cocodataset.org/annotations" +IMAGE_INFO_FILE="image_info_test2017.zip" +download_and_unzip ${BASE_IMAGE_INFO_URL} ${IMAGE_INFO_FILE} + +TESTDEV_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/image_info_test-dev2017.json" + +# Build TFRecords of the image data. +cd "${CURRENT_DIR}" +python object_detection/dataset_tools/create_coco_tf_record.py \ + --logtostderr \ + --include_masks \ + --train_image_dir="${TRAIN_IMAGE_DIR}" \ + --val_image_dir="${VAL_IMAGE_DIR}" \ + --test_image_dir="${TEST_IMAGE_DIR}" \ + --train_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \ + --val_annotations_file="${VAL_ANNOTATIONS_FILE}" \ + --testdev_annotations_file="${TESTDEV_ANNOTATIONS_FILE}" \ + --output_dir="${OUTPUT_DIR}" + diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/oid_hierarchical_labels_expansion.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/oid_hierarchical_labels_expansion.py new file mode 100644 index 0000000000000000000000000000000000000000..6c00ac429102841ccff77de78e5bf06a0d3d6a5a --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/oid_hierarchical_labels_expansion.py @@ -0,0 +1,172 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A class and executable to expand hierarchically image-level labels and boxes. + +Example usage: + ./hierarchical_labels_expansion + [optional]labels_file +""" + +import json +import sys + + +def _update_dict(initial_dict, update): + """Updates dictionary with update content. + + Args: + initial_dict: initial dictionary. + update: updated dictionary. + """ + + for key, value_list in update.iteritems(): + if key in initial_dict: + initial_dict[key].extend(value_list) + else: + initial_dict[key] = value_list + + +def _build_plain_hierarchy(hierarchy, skip_root=False): + """Expands tree hierarchy representation to parent-child dictionary. + + Args: + hierarchy: labels hierarchy as JSON file. 
+ skip_root: if true skips root from the processing (done for the case when all + classes under hierarchy are collected under virtual node). + + Returns: + keyed_parent - dictionary of parent - all its children nodes. + keyed_child - dictionary of children - all its parent nodes + children - all children of the current node. + """ + all_children = [] + all_keyed_parent = {} + all_keyed_child = {} + if 'Subcategory' in hierarchy: + for node in hierarchy['Subcategory']: + keyed_parent, keyed_child, children = _build_plain_hierarchy(node) + # Update is not done through dict.update() since some children have multi- + # ple parents in the hiearchy. + _update_dict(all_keyed_parent, keyed_parent) + _update_dict(all_keyed_child, keyed_child) + all_children.extend(children) + + if not skip_root: + all_keyed_parent[hierarchy['LabelName']] = all_children + all_children = [hierarchy['LabelName']] + all_children + for child, _ in all_keyed_child.iteritems(): + all_keyed_child[child].append(hierarchy['LabelName']) + all_keyed_child[hierarchy['LabelName']] = [] + + return all_keyed_parent, all_keyed_child, all_children + + +class OIDHierarchicalLabelsExpansion(object): + """ Main class to perform labels hierachical expansion.""" + + def __init__(self, hierarchy): + """Constructor. + + Args: + hierarchy: labels hierarchy as JSON file. + """ + + self._hierarchy_keyed_parent, self._hierarchy_keyed_child, _ = ( + _build_plain_hierarchy(hierarchy, skip_root=True)) + + def expand_boxes_from_csv(self, csv_row): + """Expands a row containing bounding boxes from CSV file. + + Args: + csv_row: a single row of Open Images released groundtruth file. + + Returns: + a list of strings (including the initial row) corresponding to the ground + truth expanded to multiple annotation for evaluation with Open Images + Challenge 2018 metric. + """ + # Row header is expected to be exactly: + # ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded, + # IsTruncated,IsGroupOf,IsDepiction,IsInside + cvs_row_splited = csv_row.split(',') + assert len(cvs_row_splited) == 13 + result = [csv_row] + assert cvs_row_splited[2] in self._hierarchy_keyed_child + parent_nodes = self._hierarchy_keyed_child[cvs_row_splited[2]] + for parent_node in parent_nodes: + cvs_row_splited[2] = parent_node + result.append(','.join(cvs_row_splited)) + return result + + def expand_labels_from_csv(self, csv_row): + """Expands a row containing bounding boxes from CSV file. + + Args: + csv_row: a single row of Open Images released groundtruth file. + + Returns: + a list of strings (including the initial row) corresponding to the ground + truth expanded to multiple annotation for evaluation with Open Images + Challenge 2018 metric. + """ + # Row header is expected to be exactly: + # ImageID,Source,LabelName,Confidence + cvs_row_splited = csv_row.split(',') + assert len(cvs_row_splited) == 4 + result = [csv_row] + if int(cvs_row_splited[3]) == 1: + assert cvs_row_splited[2] in self._hierarchy_keyed_child + parent_nodes = self._hierarchy_keyed_child[cvs_row_splited[2]] + for parent_node in parent_nodes: + cvs_row_splited[2] = parent_node + result.append(','.join(cvs_row_splited)) + else: + assert cvs_row_splited[2] in self._hierarchy_keyed_parent + child_nodes = self._hierarchy_keyed_parent[cvs_row_splited[2]] + for child_node in child_nodes: + cvs_row_splited[2] = child_node + result.append(','.join(cvs_row_splited)) + return result + + +def main(argv): + + if len(argv) < 4: + print """Missing arguments. 
\n + Usage: ./hierarchical_labels_expansion + [optional]labels_file""" + return + with open(argv[1]) as f: + hierarchy = json.load(f) + expansion_generator = OIDHierarchicalLabelsExpansion(hierarchy) + labels_file = False + if len(argv) > 4 and argv[4] == 'labels_file': + labels_file = True + with open(argv[2], 'r') as source: + with open(argv[3], 'w') as target: + header_skipped = False + for line in source: + if not header_skipped: + header_skipped = True + continue + if labels_file: + expanded_lines = expansion_generator.expand_labels_from_csv(line) + else: + expanded_lines = expansion_generator.expand_boxes_from_csv(line) + target.writelines(expanded_lines) + + +if __name__ == '__main__': + main(sys.argv) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/oid_hierarchical_labels_expansion_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/oid_hierarchical_labels_expansion_test.py new file mode 100644 index 0000000000000000000000000000000000000000..cd62b9cff28b052dba542a354eb7c87a8f332f8b --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/oid_hierarchical_labels_expansion_test.py @@ -0,0 +1,88 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for the OpenImages label expansion (OIDHierarchicalLabelsExpansion).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from object_detection.dataset_tools import oid_hierarchical_labels_expansion + + +def create_test_data(): + hierarchy = { + 'LabelName': + 'a', + 'Subcategory': [{ + 'LabelName': 'b' + }, { + 'LabelName': 'c', + 'Subcategory': [{ + 'LabelName': 'd' + }, { + 'LabelName': 'e' + }] + }, { + 'LabelName': 'f', + 'Subcategory': [{ + 'LabelName': 'd' + },] + }] + } + bbox_rows = [ + '123,xclick,b,1,0.1,0.2,0.1,0.2,1,1,0,0,0', + '123,xclick,d,1,0.2,0.3,0.1,0.2,1,1,0,0,0' + ] + label_rows = [ + '123,verification,b,0', '123,verification,c,0', '124,verification,d,1' + ] + return hierarchy, bbox_rows, label_rows + + +class HierarchicalLabelsExpansionTest(tf.test.TestCase): + + def test_bbox_expansion(self): + hierarchy, bbox_rows, _ = create_test_data() + expansion_generator = ( + oid_hierarchical_labels_expansion.OIDHierarchicalLabelsExpansion( + hierarchy)) + all_result_rows = [] + for row in bbox_rows: + all_result_rows.extend(expansion_generator.expand_boxes_from_csv(row)) + self.assertItemsEqual([ + '123,xclick,b,1,0.1,0.2,0.1,0.2,1,1,0,0,0', + '123,xclick,d,1,0.2,0.3,0.1,0.2,1,1,0,0,0', + '123,xclick,f,1,0.2,0.3,0.1,0.2,1,1,0,0,0', + '123,xclick,c,1,0.2,0.3,0.1,0.2,1,1,0,0,0' + ], all_result_rows) + + def test_labels_expansion(self): + hierarchy, _, label_rows = create_test_data() + expansion_generator = ( + oid_hierarchical_labels_expansion.OIDHierarchicalLabelsExpansion( + hierarchy)) + all_result_rows = [] + for row in label_rows: + all_result_rows.extend(expansion_generator.expand_labels_from_csv(row)) + self.assertItemsEqual([ + '123,verification,b,0', '123,verification,c,0', '123,verification,d,0', + '123,verification,e,0', '124,verification,d,1', '124,verification,f,1', + '124,verification,c,1' + ], all_result_rows) + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/oid_tfrecord_creation.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/oid_tfrecord_creation.py new file mode 100644 index 0000000000000000000000000000000000000000..706280985d98bf4caaef8f9a2e30735913f33420 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/oid_tfrecord_creation.py @@ -0,0 +1,106 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +r"""Utilities for creating TFRecords of TF examples for the Open Images dataset. 
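+
+The single helper defined here, tf_example_from_annotations_data_frame,
+takes the annotation rows of one image (a pandas DataFrame), a label map
+and the encoded image bytes, and returns a tf.train.Example populated
+with the image-level and box-level features used by the Open Images
+tooling.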
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from object_detection.core import standard_fields +from object_detection.utils import dataset_util + + +def tf_example_from_annotations_data_frame(annotations_data_frame, label_map, + encoded_image): + """Populates a TF Example message with image annotations from a data frame. + + Args: + annotations_data_frame: Data frame containing the annotations for a single + image. + label_map: String to integer label map. + encoded_image: The encoded image string + + Returns: + The populated TF Example, if the label of at least one object is present in + label_map. Otherwise, returns None. + """ + + filtered_data_frame = annotations_data_frame[ + annotations_data_frame.LabelName.isin(label_map)] + filtered_data_frame_boxes = filtered_data_frame[ + ~filtered_data_frame.YMin.isnull()] + filtered_data_frame_labels = filtered_data_frame[ + filtered_data_frame.YMin.isnull()] + image_id = annotations_data_frame.ImageID.iloc[0] + + feature_map = { + standard_fields.TfExampleFields.object_bbox_ymin: + dataset_util.float_list_feature( + filtered_data_frame_boxes.YMin.as_matrix()), + standard_fields.TfExampleFields.object_bbox_xmin: + dataset_util.float_list_feature( + filtered_data_frame_boxes.XMin.as_matrix()), + standard_fields.TfExampleFields.object_bbox_ymax: + dataset_util.float_list_feature( + filtered_data_frame_boxes.YMax.as_matrix()), + standard_fields.TfExampleFields.object_bbox_xmax: + dataset_util.float_list_feature( + filtered_data_frame_boxes.XMax.as_matrix()), + standard_fields.TfExampleFields.object_class_text: + dataset_util.bytes_list_feature( + filtered_data_frame_boxes.LabelName.as_matrix()), + standard_fields.TfExampleFields.object_class_label: + dataset_util.int64_list_feature( + filtered_data_frame_boxes.LabelName.map(lambda x: label_map[x]) + .as_matrix()), + standard_fields.TfExampleFields.filename: + dataset_util.bytes_feature('{}.jpg'.format(image_id)), + standard_fields.TfExampleFields.source_id: + dataset_util.bytes_feature(image_id), + standard_fields.TfExampleFields.image_encoded: + dataset_util.bytes_feature(encoded_image), + } + + if 'IsGroupOf' in filtered_data_frame.columns: + feature_map[standard_fields.TfExampleFields. + object_group_of] = dataset_util.int64_list_feature( + filtered_data_frame_boxes.IsGroupOf.as_matrix().astype(int)) + if 'IsOccluded' in filtered_data_frame.columns: + feature_map[standard_fields.TfExampleFields. + object_occluded] = dataset_util.int64_list_feature( + filtered_data_frame_boxes.IsOccluded.as_matrix().astype( + int)) + if 'IsTruncated' in filtered_data_frame.columns: + feature_map[standard_fields.TfExampleFields. + object_truncated] = dataset_util.int64_list_feature( + filtered_data_frame_boxes.IsTruncated.as_matrix().astype( + int)) + if 'IsDepiction' in filtered_data_frame.columns: + feature_map[standard_fields.TfExampleFields. + object_depiction] = dataset_util.int64_list_feature( + filtered_data_frame_boxes.IsDepiction.as_matrix().astype( + int)) + + if 'ConfidenceImageLabel' in filtered_data_frame_labels.columns: + feature_map[standard_fields.TfExampleFields. + image_class_label] = dataset_util.int64_list_feature( + filtered_data_frame_labels.LabelName.map( + lambda x: label_map[x]).as_matrix()) + feature_map[standard_fields.TfExampleFields. 
+ image_class_text] = dataset_util.bytes_list_feature( + filtered_data_frame_labels.LabelName.as_matrix()), + return tf.train.Example(features=tf.train.Features(feature=feature_map)) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/oid_tfrecord_creation_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/oid_tfrecord_creation_test.py new file mode 100644 index 0000000000000000000000000000000000000000..44ef852165ca115ba109e665f27352935386cb69 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/oid_tfrecord_creation_test.py @@ -0,0 +1,200 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for oid_tfrecord_creation.py.""" + +import pandas as pd +import tensorflow as tf + +from object_detection.dataset_tools import oid_tfrecord_creation + + +def create_test_data(): + data = { + 'ImageID': ['i1', 'i1', 'i1', 'i1', 'i1', 'i2', 'i2'], + 'LabelName': ['a', 'a', 'b', 'b', 'c', 'b', 'c'], + 'YMin': [0.3, 0.6, 0.8, 0.1, None, 0.0, 0.0], + 'XMin': [0.1, 0.3, 0.7, 0.0, None, 0.1, 0.1], + 'XMax': [0.2, 0.3, 0.8, 0.5, None, 0.9, 0.9], + 'YMax': [0.3, 0.6, 1, 0.8, None, 0.8, 0.8], + 'IsOccluded': [0, 1, 1, 0, None, 0, 0], + 'IsTruncated': [0, 0, 0, 1, None, 0, 0], + 'IsGroupOf': [0, 0, 0, 0, None, 0, 1], + 'IsDepiction': [1, 0, 0, 0, None, 0, 0], + 'ConfidenceImageLabel': [None, None, None, None, 0, None, None], + } + df = pd.DataFrame(data=data) + label_map = {'a': 0, 'b': 1, 'c': 2} + return label_map, df + + +class TfExampleFromAnnotationsDataFrameTests(tf.test.TestCase): + + def test_simple(self): + label_map, df = create_test_data() + + tf_example = oid_tfrecord_creation.tf_example_from_annotations_data_frame( + df[df.ImageID == 'i1'], label_map, 'encoded_image_test') + self.assertProtoEquals( + """ + features { + feature { + key: "image/encoded" + value { bytes_list { value: "encoded_image_test" } } } + feature { + key: "image/filename" + value { bytes_list { value: "i1.jpg" } } } + feature { + key: "image/object/bbox/ymin" + value { float_list { value: [0.3, 0.6, 0.8, 0.1] } } } + feature { + key: "image/object/bbox/xmin" + value { float_list { value: [0.1, 0.3, 0.7, 0.0] } } } + feature { + key: "image/object/bbox/ymax" + value { float_list { value: [0.3, 0.6, 1.0, 0.8] } } } + feature { + key: "image/object/bbox/xmax" + value { float_list { value: [0.2, 0.3, 0.8, 0.5] } } } + feature { + key: "image/object/class/label" + value { int64_list { value: [0, 0, 1, 1] } } } + feature { + key: "image/object/class/text" + value { bytes_list { value: ["a", "a", "b", "b"] } } } + feature { + key: "image/source_id" + value { bytes_list { value: "i1" } } } + feature { + key: "image/object/depiction" + value { int64_list { value: [1, 0, 0, 0] } } } + feature { + key: "image/object/group_of" + value { int64_list { value: [0, 0, 0, 0] } } } + feature { + 
key: "image/object/occluded" + value { int64_list { value: [0, 1, 1, 0] } } } + feature { + key: "image/object/truncated" + value { int64_list { value: [0, 0, 0, 1] } } } + feature { + key: "image/class/label" + value { int64_list { value: [2] } } } + feature { + key: "image/class/text" + value { bytes_list { value: ["c"] } } } } + """, tf_example) + + def test_no_attributes(self): + label_map, df = create_test_data() + + del df['IsDepiction'] + del df['IsGroupOf'] + del df['IsOccluded'] + del df['IsTruncated'] + del df['ConfidenceImageLabel'] + + tf_example = oid_tfrecord_creation.tf_example_from_annotations_data_frame( + df[df.ImageID == 'i2'], label_map, 'encoded_image_test') + self.assertProtoEquals(""" + features { + feature { + key: "image/encoded" + value { bytes_list { value: "encoded_image_test" } } } + feature { + key: "image/filename" + value { bytes_list { value: "i2.jpg" } } } + feature { + key: "image/object/bbox/ymin" + value { float_list { value: [0.0, 0.0] } } } + feature { + key: "image/object/bbox/xmin" + value { float_list { value: [0.1, 0.1] } } } + feature { + key: "image/object/bbox/ymax" + value { float_list { value: [0.8, 0.8] } } } + feature { + key: "image/object/bbox/xmax" + value { float_list { value: [0.9, 0.9] } } } + feature { + key: "image/object/class/label" + value { int64_list { value: [1, 2] } } } + feature { + key: "image/object/class/text" + value { bytes_list { value: ["b", "c"] } } } + feature { + key: "image/source_id" + value { bytes_list { value: "i2" } } } } + """, tf_example) + + def test_label_filtering(self): + label_map, df = create_test_data() + + label_map = {'a': 0} + + tf_example = oid_tfrecord_creation.tf_example_from_annotations_data_frame( + df[df.ImageID == 'i1'], label_map, 'encoded_image_test') + self.assertProtoEquals( + """ + features { + feature { + key: "image/encoded" + value { bytes_list { value: "encoded_image_test" } } } + feature { + key: "image/filename" + value { bytes_list { value: "i1.jpg" } } } + feature { + key: "image/object/bbox/ymin" + value { float_list { value: [0.3, 0.6] } } } + feature { + key: "image/object/bbox/xmin" + value { float_list { value: [0.1, 0.3] } } } + feature { + key: "image/object/bbox/ymax" + value { float_list { value: [0.3, 0.6] } } } + feature { + key: "image/object/bbox/xmax" + value { float_list { value: [0.2, 0.3] } } } + feature { + key: "image/object/class/label" + value { int64_list { value: [0, 0] } } } + feature { + key: "image/object/class/text" + value { bytes_list { value: ["a", "a"] } } } + feature { + key: "image/source_id" + value { bytes_list { value: "i1" } } } + feature { + key: "image/object/depiction" + value { int64_list { value: [1, 0] } } } + feature { + key: "image/object/group_of" + value { int64_list { value: [0, 0] } } } + feature { + key: "image/object/occluded" + value { int64_list { value: [0, 1] } } } + feature { + key: "image/object/truncated" + value { int64_list { value: [0, 0] } } } + feature { + key: "image/class/label" + value { int64_list { } } } + feature { + key: "image/class/text" + value { bytes_list { } } } } + """, tf_example) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/tf_record_creation_util.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/tf_record_creation_util.py new file mode 100644 index 0000000000000000000000000000000000000000..e8da2291d620b2218cbf724dedb6464e8050f1f7 --- /dev/null +++ 
b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/tf_record_creation_util.py @@ -0,0 +1,46 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +r"""Utilities for creating TFRecords of TF examples for the Open Images dataset. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +def open_sharded_output_tfrecords(exit_stack, base_path, num_shards): + """Opens all TFRecord shards for writing and adds them to an exit stack. + + Args: + exit_stack: A context2.ExitStack used to automatically closed the TFRecords + opened in this function. + base_path: The base path for all shards + num_shards: The number of shards + + Returns: + The list of opened TFRecords. Position k in the list corresponds to shard k. + """ + tf_record_output_filenames = [ + '{}-{:05d}-of-{:05d}'.format(base_path, idx, num_shards) + for idx in range(num_shards) + ] + + tfrecords = [ + exit_stack.enter_context(tf.python_io.TFRecordWriter(file_name)) + for file_name in tf_record_output_filenames + ] + + return tfrecords diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/tf_record_creation_util_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/tf_record_creation_util_test.py new file mode 100644 index 0000000000000000000000000000000000000000..f1231f8bb7b86ec6a066a6ec373e3e6e8af63386 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/dataset_tools/tf_record_creation_util_test.py @@ -0,0 +1,42 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for tf_record_creation_util.py.""" + +import os +import contextlib2 +import tensorflow as tf + +from object_detection.dataset_tools import tf_record_creation_util + + +class OpenOutputTfrecordsTests(tf.test.TestCase): + + def test_sharded_tfrecord_writes(self): + with contextlib2.ExitStack() as tf_record_close_stack: + output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords( + tf_record_close_stack, + os.path.join(tf.test.get_temp_dir(), 'test.tfrec'), 10) + for idx in range(10): + output_tfrecords[idx].write('test_{}'.format(idx)) + + for idx in range(10): + tf_record_path = '{}-{:05d}-of-00010'.format( + os.path.join(tf.test.get_temp_dir(), 'test.tfrec'), idx) + records = list(tf.python_io.tf_record_iterator(tf_record_path)) + self.assertAllEqual(records, ['test_{}'.format(idx)]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/eval.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..f546442e27d19b41a121c68022549367ba36bc90 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/eval.py @@ -0,0 +1,147 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +r"""Evaluation executable for detection models. + +This executable is used to evaluate DetectionModels. There are two ways of +configuring the eval job. + +1) A single pipeline_pb2.TrainEvalPipelineConfig file maybe specified instead. +In this mode, the --eval_training_data flag may be given to force the pipeline +to evaluate on training data instead. + +Example usage: + ./eval \ + --logtostderr \ + --checkpoint_dir=path/to/checkpoint_dir \ + --eval_dir=path/to/eval_dir \ + --pipeline_config_path=pipeline_config.pbtxt + +2) Three configuration files may be provided: a model_pb2.DetectionModel +configuration file to define what type of DetectionModel is being evaluated, an +input_reader_pb2.InputReader file to specify what data the model is evaluating +and an eval_pb2.EvalConfig file to configure evaluation parameters. 
+ +Example usage: + ./eval \ + --logtostderr \ + --checkpoint_dir=path/to/checkpoint_dir \ + --eval_dir=path/to/eval_dir \ + --eval_config_path=eval_config.pbtxt \ + --model_config_path=model_config.pbtxt \ + --input_config_path=eval_input_config.pbtxt +""" +import functools +import os +import tensorflow as tf + +from object_detection import evaluator +from object_detection.builders import dataset_builder +from object_detection.builders import graph_rewriter_builder +from object_detection.builders import model_builder +from object_detection.utils import config_util +from object_detection.utils import dataset_util +from object_detection.utils import label_map_util + + +tf.logging.set_verbosity(tf.logging.INFO) + +flags = tf.app.flags +flags.DEFINE_boolean('eval_training_data', False, + 'If training data should be evaluated for this job.') +flags.DEFINE_string('checkpoint_dir', '', + 'Directory containing checkpoints to evaluate, typically ' + 'set to `train_dir` used in the training job.') +flags.DEFINE_string('eval_dir', '', + 'Directory to write eval summaries to.') +flags.DEFINE_string('pipeline_config_path', '', + 'Path to a pipeline_pb2.TrainEvalPipelineConfig config ' + 'file. If provided, other configs are ignored') +flags.DEFINE_string('eval_config_path', '', + 'Path to an eval_pb2.EvalConfig config file.') +flags.DEFINE_string('input_config_path', '', + 'Path to an input_reader_pb2.InputReader config file.') +flags.DEFINE_string('model_config_path', '', + 'Path to a model_pb2.DetectionModel config file.') +flags.DEFINE_boolean('run_once', False, 'Option to only run a single pass of ' + 'evaluation. Overrides the `max_evals` parameter in the ' + 'provided config.') +FLAGS = flags.FLAGS + + +def main(unused_argv): + assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.' + assert FLAGS.eval_dir, '`eval_dir` is missing.' 
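+  # Both branches below produce the same `configs` dictionary of config
+  # protos; the keys consumed later in this function are 'model',
+  # 'eval_config', 'eval_input_config', 'train_input_config' and, when
+  # present, 'graph_rewriter_config'.  With --pipeline_config_path they
+  # all come from a single TrainEvalPipelineConfig file; otherwise they
+  # are assembled from the three separate config files.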
+ tf.gfile.MakeDirs(FLAGS.eval_dir) + if FLAGS.pipeline_config_path: + configs = config_util.get_configs_from_pipeline_file( + FLAGS.pipeline_config_path) + tf.gfile.Copy(FLAGS.pipeline_config_path, + os.path.join(FLAGS.eval_dir, 'pipeline.config'), + overwrite=True) + else: + configs = config_util.get_configs_from_multiple_files( + model_config_path=FLAGS.model_config_path, + eval_config_path=FLAGS.eval_config_path, + eval_input_config_path=FLAGS.input_config_path) + for name, config in [('model.config', FLAGS.model_config_path), + ('eval.config', FLAGS.eval_config_path), + ('input.config', FLAGS.input_config_path)]: + tf.gfile.Copy(config, + os.path.join(FLAGS.eval_dir, name), + overwrite=True) + + model_config = configs['model'] + eval_config = configs['eval_config'] + input_config = configs['eval_input_config'] + if FLAGS.eval_training_data: + input_config = configs['train_input_config'] + + model_fn = functools.partial( + model_builder.build, + model_config=model_config, + is_training=False) + + def get_next(config): + return dataset_util.make_initializable_iterator( + dataset_builder.build(config)).get_next() + + create_input_dict_fn = functools.partial(get_next, input_config) + + label_map = label_map_util.load_labelmap(input_config.label_map_path) + max_num_classes = max([item.id for item in label_map.item]) + categories = label_map_util.convert_label_map_to_categories( + label_map, max_num_classes) + + if FLAGS.run_once: + eval_config.max_evals = 1 + + graph_rewriter_fn = None + if 'graph_rewriter_config' in configs: + graph_rewriter_fn = graph_rewriter_builder.build( + configs['graph_rewriter_config'], is_training=False) + + evaluator.evaluate( + create_input_dict_fn, + model_fn, + eval_config, + categories, + FLAGS.checkpoint_dir, + FLAGS.eval_dir, + graph_hook_fn=graph_rewriter_fn) + + +if __name__ == '__main__': + tf.app.run() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/eval_util.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/eval_util.py new file mode 100644 index 0000000000000000000000000000000000000000..67b62a4421a963dbd4c09266315252d172594ba9 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/eval_util.py @@ -0,0 +1,645 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Common utility functions for evaluation.""" +import collections +import logging +import os +import time + +import numpy as np +import tensorflow as tf + +from object_detection.core import box_list +from object_detection.core import box_list_ops +from object_detection.core import keypoint_ops +from object_detection.core import standard_fields as fields +from object_detection.metrics import coco_evaluation +from object_detection.utils import label_map_util +from object_detection.utils import ops +from object_detection.utils import visualization_utils as vis_utils + +slim = tf.contrib.slim + + +def write_metrics(metrics, global_step, summary_dir): + """Write metrics to a summary directory. + + Args: + metrics: A dictionary containing metric names and values. + global_step: Global step at which the metrics are computed. + summary_dir: Directory to write tensorflow summaries to. + """ + logging.info('Writing metrics to tf summary.') + summary_writer = tf.summary.FileWriterCache.get(summary_dir) + for key in sorted(metrics): + summary = tf.Summary(value=[ + tf.Summary.Value(tag=key, simple_value=metrics[key]), + ]) + summary_writer.add_summary(summary, global_step) + logging.info('%s: %f', key, metrics[key]) + logging.info('Metrics written to tf summary.') + + +# TODO(rathodv): Add tests. +def visualize_detection_results(result_dict, + tag, + global_step, + categories, + summary_dir='', + export_dir='', + agnostic_mode=False, + show_groundtruth=False, + groundtruth_box_visualization_color='black', + min_score_thresh=.5, + max_num_predictions=20, + skip_scores=False, + skip_labels=False, + keep_image_id_for_visualization_export=False): + """Visualizes detection results and writes visualizations to image summaries. + + This function visualizes an image with its detected bounding boxes and writes + to image summaries which can be viewed on tensorboard. It optionally also + writes images to a directory. In the case of missing entry in the label map, + unknown class name in the visualization is shown as "N/A". + + Args: + result_dict: a dictionary holding groundtruth and detection + data corresponding to each image being evaluated. The following keys + are required: + 'original_image': a numpy array representing the image with shape + [1, height, width, 3] or [1, height, width, 1] + 'detection_boxes': a numpy array of shape [N, 4] + 'detection_scores': a numpy array of shape [N] + 'detection_classes': a numpy array of shape [N] + The following keys are optional: + 'groundtruth_boxes': a numpy array of shape [N, 4] + 'groundtruth_keypoints': a numpy array of shape [N, num_keypoints, 2] + Detections are assumed to be provided in decreasing order of score and for + display, and we assume that scores are probabilities between 0 and 1. + tag: tensorboard tag (string) to associate with image. + global_step: global step at which the visualization are generated. + categories: a list of dictionaries representing all possible categories. + Each dict in this list has the following keys: + 'id': (required) an integer id uniquely identifying this category + 'name': (required) string representing category name + e.g., 'cat', 'dog', 'pizza' + 'supercategory': (optional) string representing the supercategory + e.g., 'animal', 'vehicle', 'food', etc + summary_dir: the output directory to which the image summaries are written. + export_dir: the output directory to which images are written. 
If this is + empty (default), then images are not exported. + agnostic_mode: boolean (default: False) controlling whether to evaluate in + class-agnostic mode or not. + show_groundtruth: boolean (default: False) controlling whether to show + groundtruth boxes in addition to detected boxes + groundtruth_box_visualization_color: box color for visualizing groundtruth + boxes + min_score_thresh: minimum score threshold for a box to be visualized + max_num_predictions: maximum number of detections to visualize + skip_scores: whether to skip score when drawing a single detection + skip_labels: whether to skip label when drawing a single detection + keep_image_id_for_visualization_export: whether to keep image identifier in + filename when exported to export_dir + Raises: + ValueError: if result_dict does not contain the expected keys (i.e., + 'original_image', 'detection_boxes', 'detection_scores', + 'detection_classes') + """ + detection_fields = fields.DetectionResultFields + input_fields = fields.InputDataFields + if not set([ + input_fields.original_image, + detection_fields.detection_boxes, + detection_fields.detection_scores, + detection_fields.detection_classes, + ]).issubset(set(result_dict.keys())): + raise ValueError('result_dict does not contain all expected keys.') + if show_groundtruth and input_fields.groundtruth_boxes not in result_dict: + raise ValueError('If show_groundtruth is enabled, result_dict must contain ' + 'groundtruth_boxes.') + logging.info('Creating detection visualizations.') + category_index = label_map_util.create_category_index(categories) + + image = np.squeeze(result_dict[input_fields.original_image], axis=0) + if image.shape[2] == 1: # If one channel image, repeat in RGB. + image = np.tile(image, [1, 1, 3]) + detection_boxes = result_dict[detection_fields.detection_boxes] + detection_scores = result_dict[detection_fields.detection_scores] + detection_classes = np.int32((result_dict[ + detection_fields.detection_classes])) + detection_keypoints = result_dict.get(detection_fields.detection_keypoints) + detection_masks = result_dict.get(detection_fields.detection_masks) + detection_boundaries = result_dict.get(detection_fields.detection_boundaries) + + # Plot groundtruth underneath detections + if show_groundtruth: + groundtruth_boxes = result_dict[input_fields.groundtruth_boxes] + groundtruth_keypoints = result_dict.get(input_fields.groundtruth_keypoints) + vis_utils.visualize_boxes_and_labels_on_image_array( + image=image, + boxes=groundtruth_boxes, + classes=None, + scores=None, + category_index=category_index, + keypoints=groundtruth_keypoints, + use_normalized_coordinates=False, + max_boxes_to_draw=None, + groundtruth_box_visualization_color=groundtruth_box_visualization_color) + vis_utils.visualize_boxes_and_labels_on_image_array( + image, + detection_boxes, + detection_classes, + detection_scores, + category_index, + instance_masks=detection_masks, + instance_boundaries=detection_boundaries, + keypoints=detection_keypoints, + use_normalized_coordinates=False, + max_boxes_to_draw=max_num_predictions, + min_score_thresh=min_score_thresh, + agnostic_mode=agnostic_mode, + skip_scores=skip_scores, + skip_labels=skip_labels) + + if export_dir: + if keep_image_id_for_visualization_export and result_dict[fields. 
+ InputDataFields() + .key]: + export_path = os.path.join(export_dir, 'export-{}-{}.png'.format( + tag, result_dict[fields.InputDataFields().key])) + else: + export_path = os.path.join(export_dir, 'export-{}.png'.format(tag)) + vis_utils.save_image_array_as_png(image, export_path) + + summary = tf.Summary(value=[ + tf.Summary.Value( + tag=tag, + image=tf.Summary.Image( + encoded_image_string=vis_utils.encode_image_array_as_png_str( + image))) + ]) + summary_writer = tf.summary.FileWriterCache.get(summary_dir) + summary_writer.add_summary(summary, global_step) + + logging.info('Detection visualizations written to summary with tag %s.', tag) + + +def _run_checkpoint_once(tensor_dict, + evaluators=None, + batch_processor=None, + checkpoint_dirs=None, + variables_to_restore=None, + restore_fn=None, + num_batches=1, + master='', + save_graph=False, + save_graph_dir='', + losses_dict=None): + """Evaluates metrics defined in evaluators and returns summaries. + + This function loads the latest checkpoint in checkpoint_dirs and evaluates + all metrics defined in evaluators. The metrics are processed in batch by the + batch_processor. + + Args: + tensor_dict: a dictionary holding tensors representing a batch of detections + and corresponding groundtruth annotations. + evaluators: a list of object of type DetectionEvaluator to be used for + evaluation. Note that the metric names produced by different evaluators + must be unique. + batch_processor: a function taking four arguments: + 1. tensor_dict: the same tensor_dict that is passed in as the first + argument to this function. + 2. sess: a tensorflow session + 3. batch_index: an integer representing the index of the batch amongst + all batches + By default, batch_processor is None, which defaults to running: + return sess.run(tensor_dict) + To skip an image, it suffices to return an empty dictionary in place of + result_dict. + checkpoint_dirs: list of directories to load into an EnsembleModel. If it + has only one directory, EnsembleModel will not be used -- + a DetectionModel + will be instantiated directly. Not used if restore_fn is set. + variables_to_restore: None, or a dictionary mapping variable names found in + a checkpoint to model variables. The dictionary would normally be + generated by creating a tf.train.ExponentialMovingAverage object and + calling its variables_to_restore() method. Not used if restore_fn is set. + restore_fn: None, or a function that takes a tf.Session object and correctly + restores all necessary variables from the correct checkpoint file. If + None, attempts to restore from the first directory in checkpoint_dirs. + num_batches: the number of batches to use for evaluation. + master: the location of the Tensorflow session. + save_graph: whether or not the Tensorflow graph is stored as a pbtxt file. + save_graph_dir: where to store the Tensorflow graph on disk. If save_graph + is True this must be non-empty. + losses_dict: optional dictionary of scalar detection losses. + + Returns: + global_step: the count of global steps. + all_evaluator_metrics: A dictionary containing metric names and values. + + Raises: + ValueError: if restore_fn is None and checkpoint_dirs doesn't have at least + one element. + ValueError: if save_graph is True and save_graph_dir is not defined. 
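+
+  Example usage (a sketch only; `my_tensor_dict`, `my_evaluators` and the
+  checkpoint directory are placeholders, not names defined in this module):
+
+    global_step, metrics = _run_checkpoint_once(
+        my_tensor_dict,
+        evaluators=my_evaluators,
+        checkpoint_dirs=['/path/to/train_dir'],
+        num_batches=100)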
+ """ + if save_graph and not save_graph_dir: + raise ValueError('`save_graph_dir` must be defined.') + sess = tf.Session(master, graph=tf.get_default_graph()) + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + sess.run(tf.tables_initializer()) + if restore_fn: + restore_fn(sess) + else: + if not checkpoint_dirs: + raise ValueError('`checkpoint_dirs` must have at least one entry.') + checkpoint_file = tf.train.latest_checkpoint(checkpoint_dirs[0]) + saver = tf.train.Saver(variables_to_restore) + saver.restore(sess, checkpoint_file) + + if save_graph: + tf.train.write_graph(sess.graph_def, save_graph_dir, 'eval.pbtxt') + + counters = {'skipped': 0, 'success': 0} + aggregate_result_losses_dict = collections.defaultdict(list) + with tf.contrib.slim.queues.QueueRunners(sess): + try: + for batch in range(int(num_batches)): + if (batch + 1) % 100 == 0: + logging.info('Running eval ops batch %d/%d', batch + 1, num_batches) + if not batch_processor: + try: + if not losses_dict: + losses_dict = {} + result_dict, result_losses_dict = sess.run([tensor_dict, + losses_dict]) + counters['success'] += 1 + except tf.errors.InvalidArgumentError: + logging.info('Skipping image') + counters['skipped'] += 1 + result_dict = {} + else: + result_dict, result_losses_dict = batch_processor( + tensor_dict, sess, batch, counters, losses_dict=losses_dict) + if not result_dict: + continue + for key, value in iter(result_losses_dict.items()): + aggregate_result_losses_dict[key].append(value) + for evaluator in evaluators: + # TODO(b/65130867): Use image_id tensor once we fix the input data + # decoders to return correct image_id. + # TODO(akuznetsa): result_dict contains batches of images, while + # add_single_ground_truth_image_info expects a single image. Fix + evaluator.add_single_ground_truth_image_info( + image_id=batch, groundtruth_dict=result_dict) + evaluator.add_single_detected_image_info( + image_id=batch, detections_dict=result_dict) + logging.info('Running eval batches done.') + except tf.errors.OutOfRangeError: + logging.info('Done evaluating -- epoch limit reached') + finally: + # When done, ask the threads to stop. + logging.info('# success: %d', counters['success']) + logging.info('# skipped: %d', counters['skipped']) + all_evaluator_metrics = {} + for evaluator in evaluators: + metrics = evaluator.evaluate() + evaluator.clear() + if any(key in all_evaluator_metrics for key in metrics): + raise ValueError('Metric names between evaluators must not collide.') + all_evaluator_metrics.update(metrics) + global_step = tf.train.global_step(sess, tf.train.get_global_step()) + + for key, value in iter(aggregate_result_losses_dict.items()): + all_evaluator_metrics['Losses/' + key] = np.mean(value) + sess.close() + return (global_step, all_evaluator_metrics) + + +# TODO(rathodv): Add tests. +def repeated_checkpoint_run(tensor_dict, + summary_dir, + evaluators, + batch_processor=None, + checkpoint_dirs=None, + variables_to_restore=None, + restore_fn=None, + num_batches=1, + eval_interval_secs=120, + max_number_of_evaluations=None, + master='', + save_graph=False, + save_graph_dir='', + losses_dict=None): + """Periodically evaluates desired tensors using checkpoint_dirs or restore_fn. + + This function repeatedly loads a checkpoint and evaluates a desired + set of tensors (provided by tensor_dict) and hands the resulting numpy + arrays to a function result_processor which can be used to further + process/save/visualize the results. 
+ + Args: + tensor_dict: a dictionary holding tensors representing a batch of detections + and corresponding groundtruth annotations. + summary_dir: a directory to write metrics summaries. + evaluators: a list of object of type DetectionEvaluator to be used for + evaluation. Note that the metric names produced by different evaluators + must be unique. + batch_processor: a function taking three arguments: + 1. tensor_dict: the same tensor_dict that is passed in as the first + argument to this function. + 2. sess: a tensorflow session + 3. batch_index: an integer representing the index of the batch amongst + all batches + By default, batch_processor is None, which defaults to running: + return sess.run(tensor_dict) + checkpoint_dirs: list of directories to load into a DetectionModel or an + EnsembleModel if restore_fn isn't set. Also used to determine when to run + next evaluation. Must have at least one element. + variables_to_restore: None, or a dictionary mapping variable names found in + a checkpoint to model variables. The dictionary would normally be + generated by creating a tf.train.ExponentialMovingAverage object and + calling its variables_to_restore() method. Not used if restore_fn is set. + restore_fn: a function that takes a tf.Session object and correctly restores + all necessary variables from the correct checkpoint file. + num_batches: the number of batches to use for evaluation. + eval_interval_secs: the number of seconds between each evaluation run. + max_number_of_evaluations: the max number of iterations of the evaluation. + If the value is left as None the evaluation continues indefinitely. + master: the location of the Tensorflow session. + save_graph: whether or not the Tensorflow graph is saved as a pbtxt file. + save_graph_dir: where to save on disk the Tensorflow graph. If store_graph + is True this must be non-empty. + losses_dict: optional dictionary of scalar detection losses. + + Returns: + metrics: A dictionary containing metric names and values in the latest + evaluation. + + Raises: + ValueError: if max_num_of_evaluations is not None or a positive number. + ValueError: if checkpoint_dirs doesn't have at least one element. + """ + if max_number_of_evaluations and max_number_of_evaluations <= 0: + raise ValueError( + '`number_of_steps` must be either None or a positive number.') + + if not checkpoint_dirs: + raise ValueError('`checkpoint_dirs` must have at least one entry.') + + last_evaluated_model_path = None + number_of_evaluations = 0 + while True: + start = time.time() + logging.info('Starting evaluation at ' + time.strftime( + '%Y-%m-%d-%H:%M:%S', time.gmtime())) + model_path = tf.train.latest_checkpoint(checkpoint_dirs[0]) + if not model_path: + logging.info('No model found in %s. Will try again in %d seconds', + checkpoint_dirs[0], eval_interval_secs) + elif model_path == last_evaluated_model_path: + logging.info('Found already evaluated checkpoint. 
Will try again in %d ' + 'seconds', eval_interval_secs) + else: + last_evaluated_model_path = model_path + global_step, metrics = _run_checkpoint_once(tensor_dict, evaluators, + batch_processor, + checkpoint_dirs, + variables_to_restore, + restore_fn, num_batches, + master, save_graph, + save_graph_dir, + losses_dict=losses_dict) + write_metrics(metrics, global_step, summary_dir) + number_of_evaluations += 1 + + if (max_number_of_evaluations and + number_of_evaluations >= max_number_of_evaluations): + logging.info('Finished evaluation!') + break + time_to_next_eval = start + eval_interval_secs - time.time() + if time_to_next_eval > 0: + time.sleep(time_to_next_eval) + + return metrics + + +def result_dict_for_single_example(image, + key, + detections, + groundtruth=None, + class_agnostic=False, + scale_to_absolute=False): + """Merges all detection and groundtruth information for a single example. + + Note that evaluation tools require classes that are 1-indexed, and so this + function performs the offset. If `class_agnostic` is True, all output classes + have label 1. + + Args: + image: A single 4D uint8 image tensor of shape [1, H, W, C]. + key: A single string tensor identifying the image. + detections: A dictionary of detections, returned from + DetectionModel.postprocess(). + groundtruth: (Optional) Dictionary of groundtruth items, with fields: + 'groundtruth_boxes': [num_boxes, 4] float32 tensor of boxes, in + normalized coordinates. + 'groundtruth_classes': [num_boxes] int64 tensor of 1-indexed classes. + 'groundtruth_area': [num_boxes] float32 tensor of bbox area. (Optional) + 'groundtruth_is_crowd': [num_boxes] int64 tensor. (Optional) + 'groundtruth_difficult': [num_boxes] int64 tensor. (Optional) + 'groundtruth_group_of': [num_boxes] int64 tensor. (Optional) + 'groundtruth_instance_masks': 3D int64 tensor of instance masks + (Optional). + class_agnostic: Boolean indicating whether the detections are class-agnostic + (i.e. binary). Default False. + scale_to_absolute: Boolean indicating whether boxes and keypoints should be + scaled to absolute coordinates. Note that for IoU based evaluations, it + does not matter whether boxes are expressed in absolute or relative + coordinates. Default False. + + Returns: + A dictionary with: + 'original_image': A [1, H, W, C] uint8 image tensor. + 'key': A string tensor with image identifier. + 'detection_boxes': [max_detections, 4] float32 tensor of boxes, in + normalized or absolute coordinates, depending on the value of + `scale_to_absolute`. + 'detection_scores': [max_detections] float32 tensor of scores. + 'detection_classes': [max_detections] int64 tensor of 1-indexed classes. + 'detection_masks': [max_detections, H, W] float32 tensor of binarized + masks, reframed to full image masks. + 'groundtruth_boxes': [num_boxes, 4] float32 tensor of boxes, in + normalized or absolute coordinates, depending on the value of + `scale_to_absolute`. (Optional) + 'groundtruth_classes': [num_boxes] int64 tensor of 1-indexed classes. + (Optional) + 'groundtruth_area': [num_boxes] float32 tensor of bbox area. (Optional) + 'groundtruth_is_crowd': [num_boxes] int64 tensor. (Optional) + 'groundtruth_difficult': [num_boxes] int64 tensor. (Optional) + 'groundtruth_group_of': [num_boxes] int64 tensor. (Optional) + 'groundtruth_instance_masks': 3D int64 tensor of instance masks + (Optional). 
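+
+  Example usage (a minimal sketch; `model`, `image`, `key`, `groundtruth` and
+  the prediction tensors are assumed to be built by the caller, as is done in
+  evaluator.py):
+
+    detections = model.postprocess(prediction_dict, true_image_shapes)
+    eval_dict = result_dict_for_single_example(
+        image, key, detections, groundtruth, scale_to_absolute=True)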
+ + """ + label_id_offset = 1 # Applying label id offset (b/63711816) + + input_data_fields = fields.InputDataFields + output_dict = { + input_data_fields.original_image: image, + input_data_fields.key: key, + } + + detection_fields = fields.DetectionResultFields + detection_boxes = detections[detection_fields.detection_boxes][0] + image_shape = tf.shape(image) + detection_scores = detections[detection_fields.detection_scores][0] + + if class_agnostic: + detection_classes = tf.ones_like(detection_scores, dtype=tf.int64) + else: + detection_classes = ( + tf.to_int64(detections[detection_fields.detection_classes][0]) + + label_id_offset) + + num_detections = tf.to_int32(detections[detection_fields.num_detections][0]) + detection_boxes = tf.slice( + detection_boxes, begin=[0, 0], size=[num_detections, -1]) + detection_classes = tf.slice( + detection_classes, begin=[0], size=[num_detections]) + detection_scores = tf.slice( + detection_scores, begin=[0], size=[num_detections]) + + if scale_to_absolute: + absolute_detection_boxlist = box_list_ops.to_absolute_coordinates( + box_list.BoxList(detection_boxes), image_shape[1], image_shape[2]) + output_dict[detection_fields.detection_boxes] = ( + absolute_detection_boxlist.get()) + else: + output_dict[detection_fields.detection_boxes] = detection_boxes + output_dict[detection_fields.detection_classes] = detection_classes + output_dict[detection_fields.detection_scores] = detection_scores + + if detection_fields.detection_masks in detections: + detection_masks = detections[detection_fields.detection_masks][0] + # TODO(rathodv): This should be done in model's postprocess + # function ideally. + detection_masks = tf.slice( + detection_masks, begin=[0, 0, 0], size=[num_detections, -1, -1]) + detection_masks_reframed = ops.reframe_box_masks_to_image_masks( + detection_masks, detection_boxes, image_shape[1], image_shape[2]) + detection_masks_reframed = tf.cast( + tf.greater(detection_masks_reframed, 0.5), tf.uint8) + output_dict[detection_fields.detection_masks] = detection_masks_reframed + if detection_fields.detection_keypoints in detections: + detection_keypoints = detections[detection_fields.detection_keypoints][0] + output_dict[detection_fields.detection_keypoints] = detection_keypoints + if scale_to_absolute: + absolute_detection_keypoints = keypoint_ops.scale( + detection_keypoints, image_shape[1], image_shape[2]) + output_dict[detection_fields.detection_keypoints] = ( + absolute_detection_keypoints) + + if groundtruth: + if input_data_fields.groundtruth_instance_masks in groundtruth: + groundtruth[input_data_fields.groundtruth_instance_masks] = tf.cast( + groundtruth[input_data_fields.groundtruth_instance_masks], tf.uint8) + output_dict.update(groundtruth) + if scale_to_absolute: + groundtruth_boxes = groundtruth[input_data_fields.groundtruth_boxes] + absolute_gt_boxlist = box_list_ops.to_absolute_coordinates( + box_list.BoxList(groundtruth_boxes), image_shape[1], image_shape[2]) + output_dict[input_data_fields.groundtruth_boxes] = ( + absolute_gt_boxlist.get()) + # For class-agnostic models, groundtruth classes all become 1. 
+ if class_agnostic: + groundtruth_classes = groundtruth[input_data_fields.groundtruth_classes] + groundtruth_classes = tf.ones_like(groundtruth_classes, dtype=tf.int64) + output_dict[input_data_fields.groundtruth_classes] = groundtruth_classes + + return output_dict + + +def get_eval_metric_ops_for_evaluators(evaluation_metrics, + categories, + eval_dict, + include_metrics_per_category=False): + """Returns a dictionary of eval metric ops to use with `tf.EstimatorSpec`. + + Args: + evaluation_metrics: List of evaluation metric names. Current options are + 'coco_detection_metrics' and 'coco_mask_metrics'. + categories: A list of dicts, each of which has the following keys - + 'id': (required) an integer id uniquely identifying this category. + 'name': (required) string representing category name e.g., 'cat', 'dog'. + eval_dict: An evaluation dictionary, returned from + result_dict_for_single_example(). + include_metrics_per_category: If True, additionally include per-category + metrics. + + Returns: + A dictionary of metric names to tuple of value_op and update_op that can be + used as eval metric ops in tf.EstimatorSpec. + + Raises: + ValueError: If any of the metrics in `evaluation_metric` is not + 'coco_detection_metrics' or 'coco_mask_metrics'. + """ + evaluation_metrics = list(set(evaluation_metrics)) + + input_data_fields = fields.InputDataFields + detection_fields = fields.DetectionResultFields + eval_metric_ops = {} + for metric in evaluation_metrics: + if metric == 'coco_detection_metrics': + coco_evaluator = coco_evaluation.CocoDetectionEvaluator( + categories, include_metrics_per_category=include_metrics_per_category) + eval_metric_ops.update( + coco_evaluator.get_estimator_eval_metric_ops( + image_id=eval_dict[input_data_fields.key], + groundtruth_boxes=eval_dict[input_data_fields.groundtruth_boxes], + groundtruth_classes=eval_dict[ + input_data_fields.groundtruth_classes], + detection_boxes=eval_dict[detection_fields.detection_boxes], + detection_scores=eval_dict[detection_fields.detection_scores], + detection_classes=eval_dict[detection_fields.detection_classes], + groundtruth_is_crowd=eval_dict.get( + input_data_fields.groundtruth_is_crowd))) + elif metric == 'coco_mask_metrics': + coco_mask_evaluator = coco_evaluation.CocoMaskEvaluator( + categories, include_metrics_per_category=include_metrics_per_category) + eval_metric_ops.update( + coco_mask_evaluator.get_estimator_eval_metric_ops( + image_id=eval_dict[input_data_fields.key], + groundtruth_boxes=eval_dict[input_data_fields.groundtruth_boxes], + groundtruth_classes=eval_dict[ + input_data_fields.groundtruth_classes], + groundtruth_instance_masks=eval_dict[ + input_data_fields.groundtruth_instance_masks], + detection_scores=eval_dict[detection_fields.detection_scores], + detection_classes=eval_dict[detection_fields.detection_classes], + detection_masks=eval_dict[detection_fields.detection_masks], + groundtruth_is_crowd=eval_dict.get( + input_data_fields.groundtruth_is_crowd),)) + else: + raise ValueError('The only evaluation metrics supported are ' + '"coco_detection_metrics" and "coco_mask_metrics". 
' + 'Found {} in the evaluation metrics'.format(metric)) + + return eval_metric_ops + + diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/eval_util_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/eval_util_test.py new file mode 100644 index 0000000000000000000000000000000000000000..e4b0ca3d866774189731658be2ee7b5fa7293113 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/eval_util_test.py @@ -0,0 +1,112 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for eval_util.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +from object_detection import eval_util +from object_detection.core import standard_fields as fields + + +class EvalUtilTest(tf.test.TestCase): + + def _get_categories_list(self): + return [{'id': 0, 'name': 'person'}, + {'id': 1, 'name': 'dog'}, + {'id': 2, 'name': 'cat'}] + + def _make_evaluation_dict(self): + input_data_fields = fields.InputDataFields + detection_fields = fields.DetectionResultFields + + image = tf.zeros(shape=[1, 20, 20, 3], dtype=tf.uint8) + key = tf.constant('image1') + detection_boxes = tf.constant([[[0., 0., 1., 1.]]]) + detection_scores = tf.constant([[0.8]]) + detection_classes = tf.constant([[0]]) + detection_masks = tf.ones(shape=[1, 1, 20, 20], dtype=tf.float32) + num_detections = tf.constant([1]) + groundtruth_boxes = tf.constant([[0., 0., 1., 1.]]) + groundtruth_classes = tf.constant([1]) + groundtruth_instance_masks = tf.ones(shape=[1, 20, 20], dtype=tf.uint8) + detections = { + detection_fields.detection_boxes: detection_boxes, + detection_fields.detection_scores: detection_scores, + detection_fields.detection_classes: detection_classes, + detection_fields.detection_masks: detection_masks, + detection_fields.num_detections: num_detections + } + groundtruth = { + input_data_fields.groundtruth_boxes: groundtruth_boxes, + input_data_fields.groundtruth_classes: groundtruth_classes, + input_data_fields.groundtruth_instance_masks: groundtruth_instance_masks + } + return eval_util.result_dict_for_single_example(image, key, detections, + groundtruth) + + def test_get_eval_metric_ops_for_coco_detections(self): + evaluation_metrics = ['coco_detection_metrics'] + categories = self._get_categories_list() + eval_dict = self._make_evaluation_dict() + metric_ops = eval_util.get_eval_metric_ops_for_evaluators( + evaluation_metrics, categories, eval_dict) + _, update_op = metric_ops['DetectionBoxes_Precision/mAP'] + + with self.test_session() as sess: + metrics = {} + for key, (value_op, _) in metric_ops.iteritems(): + metrics[key] = value_op + sess.run(update_op) + metrics = sess.run(metrics) + print(metrics) + self.assertAlmostEqual(1.0, metrics['DetectionBoxes_Precision/mAP']) + self.assertNotIn('DetectionMasks_Precision/mAP', metrics) + + 
def test_get_eval_metric_ops_for_coco_detections_and_masks(self): + evaluation_metrics = ['coco_detection_metrics', + 'coco_mask_metrics'] + categories = self._get_categories_list() + eval_dict = self._make_evaluation_dict() + metric_ops = eval_util.get_eval_metric_ops_for_evaluators( + evaluation_metrics, categories, eval_dict) + _, update_op_boxes = metric_ops['DetectionBoxes_Precision/mAP'] + _, update_op_masks = metric_ops['DetectionMasks_Precision/mAP'] + + with self.test_session() as sess: + metrics = {} + for key, (value_op, _) in metric_ops.iteritems(): + metrics[key] = value_op + sess.run(update_op_boxes) + sess.run(update_op_masks) + metrics = sess.run(metrics) + self.assertAlmostEqual(1.0, metrics['DetectionBoxes_Precision/mAP']) + self.assertAlmostEqual(1.0, metrics['DetectionMasks_Precision/mAP']) + + def test_get_eval_metric_ops_raises_error_with_unsupported_metric(self): + evaluation_metrics = ['unsupported_metrics'] + categories = self._get_categories_list() + eval_dict = self._make_evaluation_dict() + with self.assertRaises(ValueError): + eval_util.get_eval_metric_ops_for_evaluators( + evaluation_metrics, categories, eval_dict) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/evaluator.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..7352af52208451ca70b8db99266f76cba800e384 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/evaluator.py @@ -0,0 +1,278 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Detection model evaluator. + +This file provides a generic evaluation method that can be used to evaluate a +DetectionModel. +""" + +import logging +import tensorflow as tf + +from object_detection import eval_util +from object_detection.core import prefetcher +from object_detection.core import standard_fields as fields +from object_detection.metrics import coco_evaluation +from object_detection.utils import object_detection_evaluation + +# A dictionary of metric names to classes that implement the metric. The classes +# in the dictionary must implement +# utils.object_detection_evaluation.DetectionEvaluator interface. 
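+# The keys of this dictionary are the metric names accepted in
+# eval_config.metrics_set (see get_evaluators below); e.g.
+# 'coco_detection_metrics' selects coco_evaluation.CocoDetectionEvaluator and
+# 'coco_mask_metrics' selects coco_evaluation.CocoMaskEvaluator.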
+EVAL_METRICS_CLASS_DICT = { + 'pascal_voc_detection_metrics': + object_detection_evaluation.PascalDetectionEvaluator, + 'weighted_pascal_voc_detection_metrics': + object_detection_evaluation.WeightedPascalDetectionEvaluator, + 'pascal_voc_instance_segmentation_metrics': + object_detection_evaluation.PascalInstanceSegmentationEvaluator, + 'weighted_pascal_voc_instance_segmentation_metrics': + object_detection_evaluation.WeightedPascalInstanceSegmentationEvaluator, + 'open_images_V2_detection_metrics': + object_detection_evaluation.OpenImagesDetectionEvaluator, + 'coco_detection_metrics': + coco_evaluation.CocoDetectionEvaluator, + 'coco_mask_metrics': + coco_evaluation.CocoMaskEvaluator, + 'oid_challenge_object_detection_metrics': + object_detection_evaluation.OpenImagesDetectionChallengeEvaluator, +} + +EVAL_DEFAULT_METRIC = 'pascal_voc_detection_metrics' + + +def _extract_predictions_and_losses(model, + create_input_dict_fn, + ignore_groundtruth=False): + """Constructs tensorflow detection graph and returns output tensors. + + Args: + model: model to perform predictions with. + create_input_dict_fn: function to create input tensor dictionaries. + ignore_groundtruth: whether groundtruth should be ignored. + + Returns: + prediction_groundtruth_dict: A dictionary with postprocessed tensors (keyed + by standard_fields.DetectionResultsFields) and optional groundtruth + tensors (keyed by standard_fields.InputDataFields). + losses_dict: A dictionary containing detection losses. This is empty when + ignore_groundtruth is true. + """ + input_dict = create_input_dict_fn() + prefetch_queue = prefetcher.prefetch(input_dict, capacity=500) + input_dict = prefetch_queue.dequeue() + original_image = tf.expand_dims(input_dict[fields.InputDataFields.image], 0) + preprocessed_image, true_image_shapes = model.preprocess( + tf.to_float(original_image)) + prediction_dict = model.predict(preprocessed_image, true_image_shapes) + detections = model.postprocess(prediction_dict, true_image_shapes) + + groundtruth = None + losses_dict = {} + if not ignore_groundtruth: + groundtruth = { + fields.InputDataFields.groundtruth_boxes: + input_dict[fields.InputDataFields.groundtruth_boxes], + fields.InputDataFields.groundtruth_classes: + input_dict[fields.InputDataFields.groundtruth_classes], + fields.InputDataFields.groundtruth_area: + input_dict[fields.InputDataFields.groundtruth_area], + fields.InputDataFields.groundtruth_is_crowd: + input_dict[fields.InputDataFields.groundtruth_is_crowd], + fields.InputDataFields.groundtruth_difficult: + input_dict[fields.InputDataFields.groundtruth_difficult] + } + if fields.InputDataFields.groundtruth_group_of in input_dict: + groundtruth[fields.InputDataFields.groundtruth_group_of] = ( + input_dict[fields.InputDataFields.groundtruth_group_of]) + groundtruth_masks_list = None + if fields.DetectionResultFields.detection_masks in detections: + groundtruth[fields.InputDataFields.groundtruth_instance_masks] = ( + input_dict[fields.InputDataFields.groundtruth_instance_masks]) + groundtruth_masks_list = [ + input_dict[fields.InputDataFields.groundtruth_instance_masks]] + groundtruth_keypoints_list = None + if fields.DetectionResultFields.detection_keypoints in detections: + groundtruth[fields.InputDataFields.groundtruth_keypoints] = ( + input_dict[fields.InputDataFields.groundtruth_keypoints]) + groundtruth_keypoints_list = [ + input_dict[fields.InputDataFields.groundtruth_keypoints]] + label_id_offset = 1 + model.provide_groundtruth( + 
[input_dict[fields.InputDataFields.groundtruth_boxes]], + [tf.one_hot(input_dict[fields.InputDataFields.groundtruth_classes] + - label_id_offset, depth=model.num_classes)], + groundtruth_masks_list, groundtruth_keypoints_list) + losses_dict.update(model.loss(prediction_dict, true_image_shapes)) + + result_dict = eval_util.result_dict_for_single_example( + original_image, + input_dict[fields.InputDataFields.source_id], + detections, + groundtruth, + class_agnostic=( + fields.DetectionResultFields.detection_classes not in detections), + scale_to_absolute=True) + return result_dict, losses_dict + + +def get_evaluators(eval_config, categories): + """Returns the evaluator class according to eval_config, valid for categories. + + Args: + eval_config: evaluation configurations. + categories: a list of categories to evaluate. + Returns: + An list of instances of DetectionEvaluator. + + Raises: + ValueError: if metric is not in the metric class dictionary. + """ + eval_metric_fn_keys = eval_config.metrics_set + if not eval_metric_fn_keys: + eval_metric_fn_keys = [EVAL_DEFAULT_METRIC] + evaluators_list = [] + for eval_metric_fn_key in eval_metric_fn_keys: + if eval_metric_fn_key not in EVAL_METRICS_CLASS_DICT: + raise ValueError('Metric not found: {}'.format(eval_metric_fn_key)) + evaluators_list.append( + EVAL_METRICS_CLASS_DICT[eval_metric_fn_key](categories=categories)) + return evaluators_list + + +def evaluate(create_input_dict_fn, create_model_fn, eval_config, categories, + checkpoint_dir, eval_dir, graph_hook_fn=None, evaluator_list=None): + """Evaluation function for detection models. + + Args: + create_input_dict_fn: a function to create a tensor input dictionary. + create_model_fn: a function that creates a DetectionModel. + eval_config: a eval_pb2.EvalConfig protobuf. + categories: a list of category dictionaries. Each dict in the list should + have an integer 'id' field and string 'name' field. + checkpoint_dir: directory to load the checkpoints to evaluate from. + eval_dir: directory to write evaluation metrics summary to. + graph_hook_fn: Optional function that is called after the training graph is + completely built. This is helpful to perform additional changes to the + training graph such as optimizing batchnorm. The function should modify + the default graph. + evaluator_list: Optional list of instances of DetectionEvaluator. If not + given, this list of metrics is created according to the eval_config. + + Returns: + metrics: A dictionary containing metric names and values from the latest + run. + """ + with tf.Graph().as_default(): + model = create_model_fn() + + if eval_config.ignore_groundtruth and not eval_config.export_path: + logging.fatal('If ignore_groundtruth=True then an export_path is ' + 'required. Aborting!!!') + + tensor_dict, losses_dict = _extract_predictions_and_losses( + model=model, + create_input_dict_fn=create_input_dict_fn, + ignore_groundtruth=eval_config.ignore_groundtruth) + + def _process_batch(tensor_dict, sess, batch_index, counters, + losses_dict=None): + """Evaluates tensors in tensor_dict, losses_dict and visualizes examples. + + This function calls sess.run on tensor_dict, evaluating the original_image + tensor only on the first K examples and visualizing detections overlaid + on this original_image. + + Args: + tensor_dict: a dictionary of tensors + sess: tensorflow session + batch_index: the index of the batch amongst all batches in the run. 
+ counters: a dictionary holding 'success' and 'skipped' fields which can + be updated to keep track of number of successful and failed runs, + respectively. If these fields are not updated, then the success/skipped + counter values shown at the end of evaluation will be incorrect. + losses_dict: Optional dictonary of scalar loss tensors. + + Returns: + result_dict: a dictionary of numpy arrays + result_losses_dict: a dictionary of scalar losses. This is empty if input + losses_dict is None. + """ + try: + if not losses_dict: + losses_dict = {} + result_dict, result_losses_dict = sess.run([tensor_dict, losses_dict]) + counters['success'] += 1 + except tf.errors.InvalidArgumentError: + logging.info('Skipping image') + counters['skipped'] += 1 + return {}, {} + global_step = tf.train.global_step(sess, tf.train.get_global_step()) + if batch_index < eval_config.num_visualizations: + tag = 'image-{}'.format(batch_index) + eval_util.visualize_detection_results( + result_dict, + tag, + global_step, + categories=categories, + summary_dir=eval_dir, + export_dir=eval_config.visualization_export_dir, + show_groundtruth=eval_config.visualize_groundtruth_boxes, + groundtruth_box_visualization_color=eval_config. + groundtruth_box_visualization_color, + min_score_thresh=eval_config.min_score_threshold, + max_num_predictions=eval_config.max_num_boxes_to_visualize, + skip_scores=eval_config.skip_scores, + skip_labels=eval_config.skip_labels, + keep_image_id_for_visualization_export=eval_config. + keep_image_id_for_visualization_export) + return result_dict, result_losses_dict + + variables_to_restore = tf.global_variables() + global_step = tf.train.get_or_create_global_step() + variables_to_restore.append(global_step) + + if graph_hook_fn: graph_hook_fn() + + if eval_config.use_moving_averages: + variable_averages = tf.train.ExponentialMovingAverage(0.0) + variables_to_restore = variable_averages.variables_to_restore() + saver = tf.train.Saver(variables_to_restore) + + def _restore_latest_checkpoint(sess): + latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir) + saver.restore(sess, latest_checkpoint) + + if not evaluator_list: + evaluator_list = get_evaluators(eval_config, categories) + + metrics = eval_util.repeated_checkpoint_run( + tensor_dict=tensor_dict, + summary_dir=eval_dir, + evaluators=evaluator_list, + batch_processor=_process_batch, + checkpoint_dirs=[checkpoint_dir], + variables_to_restore=None, + restore_fn=_restore_latest_checkpoint, + num_batches=eval_config.num_examples, + eval_interval_secs=eval_config.eval_interval_secs, + max_number_of_evaluations=(1 if eval_config.ignore_groundtruth else + eval_config.max_evals + if eval_config.max_evals else None), + master=eval_config.eval_master, + save_graph=eval_config.save_graph, + save_graph_dir=(eval_dir if eval_config.save_graph else ''), + losses_dict=losses_dict) + + return metrics diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/export_inference_graph.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/export_inference_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..5d0699f199848255c2989f7f46f4282ee4674765 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/export_inference_graph.py @@ -0,0 +1,147 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +r"""Tool to export an object detection model for inference. + +Prepares an object detection tensorflow graph for inference using model +configuration and an optional trained checkpoint. Outputs inference +graph, associated checkpoint files, a frozen inference graph and a +SavedModel (https://tensorflow.github.io/serving/serving_basic.html). + +The inference graph contains one of three input nodes depending on the user +specified option. + * `image_tensor`: Accepts a uint8 4-D tensor of shape [None, None, None, 3] + * `encoded_image_string_tensor`: Accepts a 1-D string tensor of shape [None] + containing encoded PNG or JPEG images. Image resolutions are expected to be + the same if more than 1 image is provided. + * `tf_example`: Accepts a 1-D string tensor of shape [None] containing + serialized TFExample protos. Image resolutions are expected to be the same + if more than 1 image is provided. + +and the following output nodes returned by the model.postprocess(..): + * `num_detections`: Outputs float32 tensors of the form [batch] + that specifies the number of valid boxes per image in the batch. + * `detection_boxes`: Outputs float32 tensors of the form + [batch, num_boxes, 4] containing detected boxes. + * `detection_scores`: Outputs float32 tensors of the form + [batch, num_boxes] containing class scores for the detections. + * `detection_classes`: Outputs float32 tensors of the form + [batch, num_boxes] containing classes for the detections. + * `detection_masks`: Outputs float32 tensors of the form + [batch, num_boxes, mask_height, mask_width] containing predicted instance + masks for each box if its present in the dictionary of postprocessed + tensors returned by the model. + +Notes: + * This tool uses `use_moving_averages` from eval_config to decide which + weights to freeze. + +Example Usage: +-------------- +python export_inference_graph \ + --input_type image_tensor \ + --pipeline_config_path path/to/ssd_inception_v2.config \ + --trained_checkpoint_prefix path/to/model.ckpt \ + --output_directory path/to/exported_model_directory + +The expected output would be in the directory +path/to/exported_model_directory (which is created if it does not exist) +with contents: + - graph.pbtxt + - model.ckpt.data-00000-of-00001 + - model.ckpt.info + - model.ckpt.meta + - frozen_inference_graph.pb + + saved_model (a directory) + +Config overrides (see the `config_override` flag) are text protobufs +(also of type pipeline_pb2.TrainEvalPipelineConfig) which are used to override +certain fields in the provided pipeline_config_path. These are useful for +making small changes to the inference graph that differ from the training or +eval config. 
+ +Example Usage (in which we change the second stage post-processing score +threshold to be 0.5): + +python export_inference_graph \ + --input_type image_tensor \ + --pipeline_config_path path/to/ssd_inception_v2.config \ + --trained_checkpoint_prefix path/to/model.ckpt \ + --output_directory path/to/exported_model_directory \ + --config_override " \ + model{ \ + faster_rcnn { \ + second_stage_post_processing { \ + batch_non_max_suppression { \ + score_threshold: 0.5 \ + } \ + } \ + } \ + }" +""" +import tensorflow as tf +from google.protobuf import text_format +from object_detection import exporter +from object_detection.protos import pipeline_pb2 + +slim = tf.contrib.slim +flags = tf.app.flags + +flags.DEFINE_string('input_type', 'image_tensor', 'Type of input node. Can be ' + 'one of [`image_tensor`, `encoded_image_string_tensor`, ' + '`tf_example`]') +flags.DEFINE_string('input_shape', None, + 'If input_type is `image_tensor`, this can explicitly set ' + 'the shape of this input tensor to a fixed size. The ' + 'dimensions are to be provided as a comma-separated list ' + 'of integers. A value of -1 can be used for unknown ' + 'dimensions. If not specified, for an `image_tensor, the ' + 'default shape will be partially specified as ' + '`[None, None, None, 3]`.') +flags.DEFINE_string('pipeline_config_path', None, + 'Path to a pipeline_pb2.TrainEvalPipelineConfig config ' + 'file.') +flags.DEFINE_string('trained_checkpoint_prefix', None, + 'Path to trained checkpoint, typically of the form ' + 'path/to/model.ckpt') +flags.DEFINE_string('output_directory', None, 'Path to write outputs.') +flags.DEFINE_string('config_override', '', + 'pipeline_pb2.TrainEvalPipelineConfig ' + 'text proto to override pipeline_config_path.') +tf.app.flags.mark_flag_as_required('pipeline_config_path') +tf.app.flags.mark_flag_as_required('trained_checkpoint_prefix') +tf.app.flags.mark_flag_as_required('output_directory') +FLAGS = flags.FLAGS + + +def main(_): + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + with tf.gfile.GFile(FLAGS.pipeline_config_path, 'r') as f: + text_format.Merge(f.read(), pipeline_config) + text_format.Merge(FLAGS.config_override, pipeline_config) + if FLAGS.input_shape: + input_shape = [ + int(dim) if dim != '-1' else None + for dim in FLAGS.input_shape.split(',') + ] + else: + input_shape = None + exporter.export_inference_graph(FLAGS.input_type, pipeline_config, + FLAGS.trained_checkpoint_prefix, + FLAGS.output_directory, input_shape) + + +if __name__ == '__main__': + tf.app.run() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/exporter.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/exporter.py new file mode 100644 index 0000000000000000000000000000000000000000..05b09b1f72ae12682c77bbe4e790469ab5864b43 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/exporter.py @@ -0,0 +1,465 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Functions to export object detection inference graph.""" +import logging +import os +import tempfile +import tensorflow as tf +from google.protobuf import text_format +from tensorflow.core.protobuf import saver_pb2 +from tensorflow.python import pywrap_tensorflow +from tensorflow.python.client import session +from tensorflow.python.framework import graph_util +from tensorflow.python.platform import gfile +from tensorflow.python.saved_model import signature_constants +from tensorflow.python.training import saver as saver_lib +from object_detection.builders import model_builder +from object_detection.core import standard_fields as fields +from object_detection.data_decoders import tf_example_decoder + +slim = tf.contrib.slim + + +# TODO(derekjchow): Replace with freeze_graph.freeze_graph_with_def_protos when +# newer version of Tensorflow becomes more common. +def freeze_graph_with_def_protos( + input_graph_def, + input_saver_def, + input_checkpoint, + output_node_names, + restore_op_name, + filename_tensor_name, + clear_devices, + initializer_nodes, + variable_names_blacklist=''): + """Converts all variables in a graph and checkpoint into constants.""" + del restore_op_name, filename_tensor_name # Unused by updated loading code. + + # 'input_checkpoint' may be a prefix if we're using Saver V2 format + if not saver_lib.checkpoint_exists(input_checkpoint): + raise ValueError( + 'Input checkpoint "' + input_checkpoint + '" does not exist!') + + if not output_node_names: + raise ValueError( + 'You must supply the name of a node to --output_node_names.') + + # Remove all the explicit device specifications for this node. This helps to + # make the graph more portable. + if clear_devices: + for node in input_graph_def.node: + node.device = '' + + with tf.Graph().as_default(): + tf.import_graph_def(input_graph_def, name='') + config = tf.ConfigProto(graph_options=tf.GraphOptions()) + with session.Session(config=config) as sess: + if input_saver_def: + saver = saver_lib.Saver(saver_def=input_saver_def) + saver.restore(sess, input_checkpoint) + else: + var_list = {} + reader = pywrap_tensorflow.NewCheckpointReader(input_checkpoint) + var_to_shape_map = reader.get_variable_to_shape_map() + for key in var_to_shape_map: + try: + tensor = sess.graph.get_tensor_by_name(key + ':0') + except KeyError: + # This tensor doesn't exist in the graph (for example it's + # 'global_step' or a similar housekeeping element) so skip it. + continue + var_list[key] = tensor + saver = saver_lib.Saver(var_list=var_list) + saver.restore(sess, input_checkpoint) + if initializer_nodes: + sess.run(initializer_nodes) + + variable_names_blacklist = (variable_names_blacklist.split(',') if + variable_names_blacklist else None) + output_graph_def = graph_util.convert_variables_to_constants( + sess, + input_graph_def, + output_node_names.split(','), + variable_names_blacklist=variable_names_blacklist) + + return output_graph_def + + +def replace_variable_values_with_moving_averages(graph, + current_checkpoint_file, + new_checkpoint_file): + """Replaces variable values in the checkpoint with their moving averages. + + If the current checkpoint has shadow variables maintaining moving averages of + the variables defined in the graph, this function generates a new checkpoint + where the variables contain the values of their moving averages. + + Args: + graph: a tf.Graph object. 
+ current_checkpoint_file: a checkpoint containing both original variables and + their moving averages. + new_checkpoint_file: file path to write a new checkpoint. + """ + with graph.as_default(): + variable_averages = tf.train.ExponentialMovingAverage(0.0) + ema_variables_to_restore = variable_averages.variables_to_restore() + with tf.Session() as sess: + read_saver = tf.train.Saver(ema_variables_to_restore) + read_saver.restore(sess, current_checkpoint_file) + write_saver = tf.train.Saver() + write_saver.save(sess, new_checkpoint_file) + + +def _image_tensor_input_placeholder(input_shape=None): + """Returns input placeholder and a 4-D uint8 image tensor.""" + if input_shape is None: + input_shape = (None, None, None, 3) + input_tensor = tf.placeholder( + dtype=tf.uint8, shape=input_shape, name='image_tensor') + return input_tensor, input_tensor + + +def _tf_example_input_placeholder(): + """Returns input that accepts a batch of strings with tf examples. + + Returns: + a tuple of input placeholder and the output decoded images. + """ + batch_tf_example_placeholder = tf.placeholder( + tf.string, shape=[None], name='tf_example') + def decode(tf_example_string_tensor): + tensor_dict = tf_example_decoder.TfExampleDecoder().decode( + tf_example_string_tensor) + image_tensor = tensor_dict[fields.InputDataFields.image] + return image_tensor + return (batch_tf_example_placeholder, + tf.map_fn(decode, + elems=batch_tf_example_placeholder, + dtype=tf.uint8, + parallel_iterations=32, + back_prop=False)) + + +def _encoded_image_string_tensor_input_placeholder(): + """Returns input that accepts a batch of PNG or JPEG strings. + + Returns: + a tuple of input placeholder and the output decoded images. + """ + batch_image_str_placeholder = tf.placeholder( + dtype=tf.string, + shape=[None], + name='encoded_image_string_tensor') + def decode(encoded_image_string_tensor): + image_tensor = tf.image.decode_image(encoded_image_string_tensor, + channels=3) + image_tensor.set_shape((None, None, 3)) + return image_tensor + return (batch_image_str_placeholder, + tf.map_fn( + decode, + elems=batch_image_str_placeholder, + dtype=tf.uint8, + parallel_iterations=32, + back_prop=False)) + + +input_placeholder_fn_map = { + 'image_tensor': _image_tensor_input_placeholder, + 'encoded_image_string_tensor': + _encoded_image_string_tensor_input_placeholder, + 'tf_example': _tf_example_input_placeholder, +} + + +def _add_output_tensor_nodes(postprocessed_tensors, + output_collection_name='inference_op'): + """Adds output nodes for detection boxes and scores. + + Adds the following nodes for output tensors - + * num_detections: float32 tensor of shape [batch_size]. + * detection_boxes: float32 tensor of shape [batch_size, num_boxes, 4] + containing detected boxes. + * detection_scores: float32 tensor of shape [batch_size, num_boxes] + containing scores for the detected boxes. + * detection_classes: float32 tensor of shape [batch_size, num_boxes] + containing class predictions for the detected boxes. + * detection_keypoints: (Optional) float32 tensor of shape + [batch_size, num_boxes, num_keypoints, 2] containing keypoints for each + detection box. + * detection_masks: (Optional) float32 tensor of shape + [batch_size, num_boxes, mask_height, mask_width] containing masks for each + detection box. 
+ + Args: + postprocessed_tensors: a dictionary containing the following fields + 'detection_boxes': [batch, max_detections, 4] + 'detection_scores': [batch, max_detections] + 'detection_classes': [batch, max_detections] + 'detection_masks': [batch, max_detections, mask_height, mask_width] + (optional). + 'num_detections': [batch] + output_collection_name: Name of collection to add output tensors to. + + Returns: + A tensor dict containing the added output tensor nodes. + """ + detection_fields = fields.DetectionResultFields + label_id_offset = 1 + boxes = postprocessed_tensors.get(detection_fields.detection_boxes) + scores = postprocessed_tensors.get(detection_fields.detection_scores) + classes = postprocessed_tensors.get( + detection_fields.detection_classes) + label_id_offset + keypoints = postprocessed_tensors.get(detection_fields.detection_keypoints) + masks = postprocessed_tensors.get(detection_fields.detection_masks) + num_detections = postprocessed_tensors.get(detection_fields.num_detections) + outputs = {} + outputs[detection_fields.detection_boxes] = tf.identity( + boxes, name=detection_fields.detection_boxes) + outputs[detection_fields.detection_scores] = tf.identity( + scores, name=detection_fields.detection_scores) + outputs[detection_fields.detection_classes] = tf.identity( + classes, name=detection_fields.detection_classes) + outputs[detection_fields.num_detections] = tf.identity( + num_detections, name=detection_fields.num_detections) + if keypoints is not None: + outputs[detection_fields.detection_keypoints] = tf.identity( + keypoints, name=detection_fields.detection_keypoints) + if masks is not None: + outputs[detection_fields.detection_masks] = tf.identity( + masks, name=detection_fields.detection_masks) + for output_key in outputs: + tf.add_to_collection(output_collection_name, outputs[output_key]) + if masks is not None: + tf.add_to_collection(output_collection_name, + outputs[detection_fields.detection_masks]) + return outputs + + +def write_frozen_graph(frozen_graph_path, frozen_graph_def): + """Writes frozen graph to disk. + + Args: + frozen_graph_path: Path to write inference graph. + frozen_graph_def: tf.GraphDef holding frozen graph. + """ + with gfile.GFile(frozen_graph_path, 'wb') as f: + f.write(frozen_graph_def.SerializeToString()) + logging.info('%d ops in the final graph.', len(frozen_graph_def.node)) + + +def write_saved_model(saved_model_path, + frozen_graph_def, + inputs, + outputs): + """Writes SavedModel to disk. + + If checkpoint_path is not None bakes the weights into the graph thereby + eliminating the need of checkpoint files during inference. If the model + was trained with moving averages, setting use_moving_averages to true + restores the moving averages, otherwise the original set of variables + is restored. + + Args: + saved_model_path: Path to write SavedModel. + frozen_graph_def: tf.GraphDef holding frozen graph. + inputs: The input image tensor to use for detection. + outputs: A tensor dictionary containing the outputs of a DetectionModel. 
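+
+  Example usage (a sketch; these arguments are the ones produced by
+  _export_inference_graph below):
+
+    write_saved_model(os.path.join(output_directory, 'saved_model'),
+                      frozen_graph_def, placeholder_tensor, outputs)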
+ """ + with tf.Graph().as_default(): + with session.Session() as sess: + + tf.import_graph_def(frozen_graph_def, name='') + + builder = tf.saved_model.builder.SavedModelBuilder(saved_model_path) + + tensor_info_inputs = { + 'inputs': tf.saved_model.utils.build_tensor_info(inputs)} + tensor_info_outputs = {} + for k, v in outputs.items(): + tensor_info_outputs[k] = tf.saved_model.utils.build_tensor_info(v) + + detection_signature = ( + tf.saved_model.signature_def_utils.build_signature_def( + inputs=tensor_info_inputs, + outputs=tensor_info_outputs, + method_name=signature_constants.PREDICT_METHOD_NAME)) + + builder.add_meta_graph_and_variables( + sess, [tf.saved_model.tag_constants.SERVING], + signature_def_map={ + signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: + detection_signature, + }, + ) + builder.save() + + +def write_graph_and_checkpoint(inference_graph_def, + model_path, + input_saver_def, + trained_checkpoint_prefix): + """Writes the graph and the checkpoint into disk.""" + for node in inference_graph_def.node: + node.device = '' + with tf.Graph().as_default(): + tf.import_graph_def(inference_graph_def, name='') + with session.Session() as sess: + saver = saver_lib.Saver(saver_def=input_saver_def, + save_relative_paths=True) + saver.restore(sess, trained_checkpoint_prefix) + saver.save(sess, model_path) + + +def _get_outputs_from_inputs(input_tensors, detection_model, + output_collection_name): + inputs = tf.to_float(input_tensors) + preprocessed_inputs, true_image_shapes = detection_model.preprocess(inputs) + output_tensors = detection_model.predict( + preprocessed_inputs, true_image_shapes) + postprocessed_tensors = detection_model.postprocess( + output_tensors, true_image_shapes) + return _add_output_tensor_nodes(postprocessed_tensors, + output_collection_name) + + +def _build_detection_graph(input_type, detection_model, input_shape, + output_collection_name, graph_hook_fn): + """Build the detection graph.""" + if input_type not in input_placeholder_fn_map: + raise ValueError('Unknown input type: {}'.format(input_type)) + placeholder_args = {} + if input_shape is not None: + if input_type != 'image_tensor': + raise ValueError('Can only specify input shape for `image_tensor` ' + 'inputs.') + placeholder_args['input_shape'] = input_shape + placeholder_tensor, input_tensors = input_placeholder_fn_map[input_type]( + **placeholder_args) + outputs = _get_outputs_from_inputs( + input_tensors=input_tensors, + detection_model=detection_model, + output_collection_name=output_collection_name) + + # Add global step to the graph. 
+ slim.get_or_create_global_step() + + if graph_hook_fn: graph_hook_fn() + + return outputs, placeholder_tensor + + +def _export_inference_graph(input_type, + detection_model, + use_moving_averages, + trained_checkpoint_prefix, + output_directory, + additional_output_tensor_names=None, + input_shape=None, + output_collection_name='inference_op', + graph_hook_fn=None): + """Export helper.""" + tf.gfile.MakeDirs(output_directory) + frozen_graph_path = os.path.join(output_directory, + 'frozen_inference_graph.pb') + saved_model_path = os.path.join(output_directory, 'saved_model') + model_path = os.path.join(output_directory, 'model.ckpt') + + outputs, placeholder_tensor = _build_detection_graph( + input_type=input_type, + detection_model=detection_model, + input_shape=input_shape, + output_collection_name=output_collection_name, + graph_hook_fn=graph_hook_fn) + + saver_kwargs = {} + if use_moving_averages: + # This check is to be compatible with both version of SaverDef. + if os.path.isfile(trained_checkpoint_prefix): + saver_kwargs['write_version'] = saver_pb2.SaverDef.V1 + temp_checkpoint_prefix = tempfile.NamedTemporaryFile().name + else: + temp_checkpoint_prefix = tempfile.mkdtemp() + replace_variable_values_with_moving_averages( + tf.get_default_graph(), trained_checkpoint_prefix, + temp_checkpoint_prefix) + checkpoint_to_use = temp_checkpoint_prefix + else: + checkpoint_to_use = trained_checkpoint_prefix + + saver = tf.train.Saver(**saver_kwargs) + input_saver_def = saver.as_saver_def() + + write_graph_and_checkpoint( + inference_graph_def=tf.get_default_graph().as_graph_def(), + model_path=model_path, + input_saver_def=input_saver_def, + trained_checkpoint_prefix=checkpoint_to_use) + + if additional_output_tensor_names is not None: + output_node_names = ','.join(outputs.keys()+additional_output_tensor_names) + else: + output_node_names = ','.join(outputs.keys()) + + frozen_graph_def = freeze_graph_with_def_protos( + input_graph_def=tf.get_default_graph().as_graph_def(), + input_saver_def=input_saver_def, + input_checkpoint=checkpoint_to_use, + output_node_names=output_node_names, + restore_op_name='save/restore_all', + filename_tensor_name='save/Const:0', + clear_devices=True, + initializer_nodes='') + write_frozen_graph(frozen_graph_path, frozen_graph_def) + write_saved_model(saved_model_path, frozen_graph_def, + placeholder_tensor, outputs) + + +def export_inference_graph(input_type, + pipeline_config, + trained_checkpoint_prefix, + output_directory, + input_shape=None, + output_collection_name='inference_op', + additional_output_tensor_names=None): + """Exports inference graph for the model specified in the pipeline config. + + Args: + input_type: Type of input for the graph. Can be one of [`image_tensor`, + `tf_example`]. + pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto. + trained_checkpoint_prefix: Path to the trained checkpoint file. + output_directory: Path to write outputs. + input_shape: Sets a fixed shape for an `image_tensor` input. If not + specified, will default to [None, None, None, 3]. + output_collection_name: Name of collection to add output tensors to. + If None, does not add output tensors to a collection. + additional_output_tensor_names: list of additional output + tensors to include in the frozen graph. 
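+
+  Example usage (illustrative; the argument values mirror those passed in by
+  export_inference_graph.py, and the paths are placeholders):
+
+    export_inference_graph(
+        input_type='image_tensor',
+        pipeline_config=pipeline_config,
+        trained_checkpoint_prefix='path/to/model.ckpt',
+        output_directory='path/to/exported_model_directory')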
+ """ + detection_model = model_builder.build(pipeline_config.model, + is_training=False) + _export_inference_graph(input_type, detection_model, + pipeline_config.eval_config.use_moving_averages, + trained_checkpoint_prefix, + output_directory, additional_output_tensor_names, + input_shape, output_collection_name, + graph_hook_fn=None) + pipeline_config.eval_config.use_moving_averages = False + config_text = text_format.MessageToString(pipeline_config) + with tf.gfile.Open( + os.path.join(output_directory, 'pipeline.config'), 'wb') as f: + f.write(config_text) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/exporter_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/exporter_test.py new file mode 100644 index 0000000000000000000000000000000000000000..cf6e85b200f881ec5f1db6b9d1ce5e5e074904b0 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/exporter_test.py @@ -0,0 +1,864 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.export_inference_graph.""" +import os +import numpy as np +import six +import tensorflow as tf +from google.protobuf import text_format +from object_detection import exporter +from object_detection.builders import model_builder +from object_detection.core import model +from object_detection.protos import pipeline_pb2 + +if six.PY2: + import mock # pylint: disable=g-import-not-at-top +else: + from unittest import mock # pylint: disable=g-import-not-at-top + +slim = tf.contrib.slim + + +class FakeModel(model.DetectionModel): + + def __init__(self, add_detection_keypoints=False, add_detection_masks=False): + self._add_detection_keypoints = add_detection_keypoints + self._add_detection_masks = add_detection_masks + + def preprocess(self, inputs): + true_image_shapes = [] # Doesn't matter for the fake model. 
+ return tf.identity(inputs), true_image_shapes + + def predict(self, preprocessed_inputs, true_image_shapes): + return {'image': tf.layers.conv2d(preprocessed_inputs, 3, 1)} + + def postprocess(self, prediction_dict, true_image_shapes): + with tf.control_dependencies(prediction_dict.values()): + postprocessed_tensors = { + 'detection_boxes': tf.constant([[[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.8, 0.8]], + [[0.5, 0.5, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0]]], tf.float32), + 'detection_scores': tf.constant([[0.7, 0.6], + [0.9, 0.0]], tf.float32), + 'detection_classes': tf.constant([[0, 1], + [1, 0]], tf.float32), + 'num_detections': tf.constant([2, 1], tf.float32) + } + if self._add_detection_keypoints: + postprocessed_tensors['detection_keypoints'] = tf.constant( + np.arange(48).reshape([2, 2, 6, 2]), tf.float32) + if self._add_detection_masks: + postprocessed_tensors['detection_masks'] = tf.constant( + np.arange(64).reshape([2, 2, 4, 4]), tf.float32) + return postprocessed_tensors + + def restore_map(self, checkpoint_path, fine_tune_checkpoint_type): + pass + + def loss(self, prediction_dict, true_image_shapes): + pass + + +class ExportInferenceGraphTest(tf.test.TestCase): + + def _save_checkpoint_from_mock_model(self, checkpoint_path, + use_moving_averages): + g = tf.Graph() + with g.as_default(): + mock_model = FakeModel() + preprocessed_inputs, true_image_shapes = mock_model.preprocess( + tf.placeholder(tf.float32, shape=[None, None, None, 3])) + predictions = mock_model.predict(preprocessed_inputs, true_image_shapes) + mock_model.postprocess(predictions, true_image_shapes) + if use_moving_averages: + tf.train.ExponentialMovingAverage(0.0).apply() + slim.get_or_create_global_step() + saver = tf.train.Saver() + init = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init) + saver.save(sess, checkpoint_path) + + def _load_inference_graph(self, inference_graph_path): + od_graph = tf.Graph() + with od_graph.as_default(): + od_graph_def = tf.GraphDef() + with tf.gfile.GFile(inference_graph_path) as fid: + serialized_graph = fid.read() + od_graph_def.ParseFromString(serialized_graph) + tf.import_graph_def(od_graph_def, name='') + return od_graph + + def _create_tf_example(self, image_array): + with self.test_session(): + encoded_image = tf.image.encode_jpeg(tf.constant(image_array)).eval() + def _bytes_feature(value): + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + example = tf.train.Example(features=tf.train.Features(feature={ + 'image/encoded': _bytes_feature(encoded_image), + 'image/format': _bytes_feature('jpg'), + 'image/source_id': _bytes_feature('image_id') + })).SerializeToString() + return example + + def test_export_graph_with_image_tensor_input(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=False) + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel() + output_directory = os.path.join(tmp_dir, 'output') + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = False + exporter.export_inference_graph( + input_type='image_tensor', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + self.assertTrue(os.path.exists(os.path.join( + output_directory, 'saved_model', 'saved_model.pb'))) + + def 
test_export_graph_with_fixed_size_image_tensor_input(self): + input_shape = [1, 320, 320, 3] + + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model( + trained_checkpoint_prefix, use_moving_averages=False) + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel() + output_directory = os.path.join(tmp_dir, 'output') + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = False + exporter.export_inference_graph( + input_type='image_tensor', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory, + input_shape=input_shape) + saved_model_path = os.path.join(output_directory, 'saved_model') + self.assertTrue( + os.path.exists(os.path.join(saved_model_path, 'saved_model.pb'))) + + with tf.Graph().as_default() as od_graph: + with self.test_session(graph=od_graph) as sess: + meta_graph = tf.saved_model.loader.load( + sess, [tf.saved_model.tag_constants.SERVING], saved_model_path) + signature = meta_graph.signature_def['serving_default'] + input_tensor_name = signature.inputs['inputs'].name + image_tensor = od_graph.get_tensor_by_name(input_tensor_name) + self.assertSequenceEqual(image_tensor.get_shape().as_list(), + input_shape) + + def test_export_graph_with_tf_example_input(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=False) + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel() + output_directory = os.path.join(tmp_dir, 'output') + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = False + exporter.export_inference_graph( + input_type='tf_example', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + self.assertTrue(os.path.exists(os.path.join( + output_directory, 'saved_model', 'saved_model.pb'))) + + def test_export_graph_with_encoded_image_string_input(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=False) + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel() + output_directory = os.path.join(tmp_dir, 'output') + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = False + exporter.export_inference_graph( + input_type='encoded_image_string_tensor', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + self.assertTrue(os.path.exists(os.path.join( + output_directory, 'saved_model', 'saved_model.pb'))) + + def _get_variables_in_checkpoint(self, checkpoint_file): + return set([ + var_name + for var_name, _ in tf.train.list_variables(checkpoint_file)]) + + def test_replace_variable_values_with_moving_averages(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + new_checkpoint_prefix = os.path.join(tmp_dir, 'new.ckpt') + 
self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=True) + graph = tf.Graph() + with graph.as_default(): + fake_model = FakeModel() + preprocessed_inputs, true_image_shapes = fake_model.preprocess( + tf.placeholder(dtype=tf.float32, shape=[None, None, None, 3])) + predictions = fake_model.predict(preprocessed_inputs, true_image_shapes) + fake_model.postprocess(predictions, true_image_shapes) + exporter.replace_variable_values_with_moving_averages( + graph, trained_checkpoint_prefix, new_checkpoint_prefix) + + expected_variables = set(['conv2d/bias', 'conv2d/kernel']) + variables_in_old_ckpt = self._get_variables_in_checkpoint( + trained_checkpoint_prefix) + self.assertIn('conv2d/bias/ExponentialMovingAverage', + variables_in_old_ckpt) + self.assertIn('conv2d/kernel/ExponentialMovingAverage', + variables_in_old_ckpt) + variables_in_new_ckpt = self._get_variables_in_checkpoint( + new_checkpoint_prefix) + self.assertTrue(expected_variables.issubset(variables_in_new_ckpt)) + self.assertNotIn('conv2d/bias/ExponentialMovingAverage', + variables_in_new_ckpt) + self.assertNotIn('conv2d/kernel/ExponentialMovingAverage', + variables_in_new_ckpt) + + def test_export_graph_with_moving_averages(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=True) + output_directory = os.path.join(tmp_dir, 'output') + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel() + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = True + exporter.export_inference_graph( + input_type='image_tensor', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + self.assertTrue(os.path.exists(os.path.join( + output_directory, 'saved_model', 'saved_model.pb'))) + expected_variables = set(['conv2d/bias', 'conv2d/kernel', 'global_step']) + actual_variables = set( + [var_name for var_name, _ in tf.train.list_variables(output_directory)]) + self.assertTrue(expected_variables.issubset(actual_variables)) + + def test_export_model_with_all_output_nodes(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=True) + output_directory = os.path.join(tmp_dir, 'output') + inference_graph_path = os.path.join(output_directory, + 'frozen_inference_graph.pb') + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel( + add_detection_keypoints=True, add_detection_masks=True) + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + exporter.export_inference_graph( + input_type='image_tensor', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + inference_graph = self._load_inference_graph(inference_graph_path) + with self.test_session(graph=inference_graph): + inference_graph.get_tensor_by_name('image_tensor:0') + inference_graph.get_tensor_by_name('detection_boxes:0') + inference_graph.get_tensor_by_name('detection_scores:0') + inference_graph.get_tensor_by_name('detection_classes:0') + inference_graph.get_tensor_by_name('detection_keypoints:0') + 
inference_graph.get_tensor_by_name('detection_masks:0') + inference_graph.get_tensor_by_name('num_detections:0') + + def test_export_model_with_detection_only_nodes(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=True) + output_directory = os.path.join(tmp_dir, 'output') + inference_graph_path = os.path.join(output_directory, + 'frozen_inference_graph.pb') + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel(add_detection_masks=False) + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + exporter.export_inference_graph( + input_type='image_tensor', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + inference_graph = self._load_inference_graph(inference_graph_path) + with self.test_session(graph=inference_graph): + inference_graph.get_tensor_by_name('image_tensor:0') + inference_graph.get_tensor_by_name('detection_boxes:0') + inference_graph.get_tensor_by_name('detection_scores:0') + inference_graph.get_tensor_by_name('detection_classes:0') + inference_graph.get_tensor_by_name('num_detections:0') + with self.assertRaises(KeyError): + inference_graph.get_tensor_by_name('detection_keypoints:0') + inference_graph.get_tensor_by_name('detection_masks:0') + + def test_export_and_run_inference_with_image_tensor(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=True) + output_directory = os.path.join(tmp_dir, 'output') + inference_graph_path = os.path.join(output_directory, + 'frozen_inference_graph.pb') + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel( + add_detection_keypoints=True, add_detection_masks=True) + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = False + exporter.export_inference_graph( + input_type='image_tensor', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + + inference_graph = self._load_inference_graph(inference_graph_path) + with self.test_session(graph=inference_graph) as sess: + image_tensor = inference_graph.get_tensor_by_name('image_tensor:0') + boxes = inference_graph.get_tensor_by_name('detection_boxes:0') + scores = inference_graph.get_tensor_by_name('detection_scores:0') + classes = inference_graph.get_tensor_by_name('detection_classes:0') + keypoints = inference_graph.get_tensor_by_name('detection_keypoints:0') + masks = inference_graph.get_tensor_by_name('detection_masks:0') + num_detections = inference_graph.get_tensor_by_name('num_detections:0') + (boxes_np, scores_np, classes_np, keypoints_np, masks_np, + num_detections_np) = sess.run( + [boxes, scores, classes, keypoints, masks, num_detections], + feed_dict={image_tensor: np.ones((2, 4, 4, 3)).astype(np.uint8)}) + self.assertAllClose(boxes_np, [[[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.8, 0.8]], + [[0.5, 0.5, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0]]]) + self.assertAllClose(scores_np, [[0.7, 0.6], + [0.9, 0.0]]) + self.assertAllClose(classes_np, [[1, 2], + [2, 1]]) + self.assertAllClose(keypoints_np, np.arange(48).reshape([2, 2, 6, 2])) + self.assertAllClose(masks_np, 
np.arange(64).reshape([2, 2, 4, 4])) + self.assertAllClose(num_detections_np, [2, 1]) + + def _create_encoded_image_string(self, image_array_np, encoding_format): + od_graph = tf.Graph() + with od_graph.as_default(): + if encoding_format == 'jpg': + encoded_string = tf.image.encode_jpeg(image_array_np) + elif encoding_format == 'png': + encoded_string = tf.image.encode_png(image_array_np) + else: + raise ValueError('Supports only the following formats: `jpg`, `png`') + with self.test_session(graph=od_graph): + return encoded_string.eval() + + def test_export_and_run_inference_with_encoded_image_string_tensor(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=True) + output_directory = os.path.join(tmp_dir, 'output') + inference_graph_path = os.path.join(output_directory, + 'frozen_inference_graph.pb') + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel( + add_detection_keypoints=True, add_detection_masks=True) + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = False + exporter.export_inference_graph( + input_type='encoded_image_string_tensor', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + + inference_graph = self._load_inference_graph(inference_graph_path) + jpg_image_str = self._create_encoded_image_string( + np.ones((4, 4, 3)).astype(np.uint8), 'jpg') + png_image_str = self._create_encoded_image_string( + np.ones((4, 4, 3)).astype(np.uint8), 'png') + with self.test_session(graph=inference_graph) as sess: + image_str_tensor = inference_graph.get_tensor_by_name( + 'encoded_image_string_tensor:0') + boxes = inference_graph.get_tensor_by_name('detection_boxes:0') + scores = inference_graph.get_tensor_by_name('detection_scores:0') + classes = inference_graph.get_tensor_by_name('detection_classes:0') + keypoints = inference_graph.get_tensor_by_name('detection_keypoints:0') + masks = inference_graph.get_tensor_by_name('detection_masks:0') + num_detections = inference_graph.get_tensor_by_name('num_detections:0') + for image_str in [jpg_image_str, png_image_str]: + image_str_batch_np = np.hstack([image_str]* 2) + (boxes_np, scores_np, classes_np, keypoints_np, masks_np, + num_detections_np) = sess.run( + [boxes, scores, classes, keypoints, masks, num_detections], + feed_dict={image_str_tensor: image_str_batch_np}) + self.assertAllClose(boxes_np, [[[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.8, 0.8]], + [[0.5, 0.5, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0]]]) + self.assertAllClose(scores_np, [[0.7, 0.6], + [0.9, 0.0]]) + self.assertAllClose(classes_np, [[1, 2], + [2, 1]]) + self.assertAllClose(keypoints_np, np.arange(48).reshape([2, 2, 6, 2])) + self.assertAllClose(masks_np, np.arange(64).reshape([2, 2, 4, 4])) + self.assertAllClose(num_detections_np, [2, 1]) + + def test_raise_runtime_error_on_images_with_different_sizes(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=True) + output_directory = os.path.join(tmp_dir, 'output') + inference_graph_path = os.path.join(output_directory, + 'frozen_inference_graph.pb') + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + 
mock_builder.return_value = FakeModel( + add_detection_keypoints=True, add_detection_masks=True) + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = False + exporter.export_inference_graph( + input_type='encoded_image_string_tensor', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + + inference_graph = self._load_inference_graph(inference_graph_path) + large_image = self._create_encoded_image_string( + np.ones((4, 4, 3)).astype(np.uint8), 'jpg') + small_image = self._create_encoded_image_string( + np.ones((2, 2, 3)).astype(np.uint8), 'jpg') + + image_str_batch_np = np.hstack([large_image, small_image]) + with self.test_session(graph=inference_graph) as sess: + image_str_tensor = inference_graph.get_tensor_by_name( + 'encoded_image_string_tensor:0') + boxes = inference_graph.get_tensor_by_name('detection_boxes:0') + scores = inference_graph.get_tensor_by_name('detection_scores:0') + classes = inference_graph.get_tensor_by_name('detection_classes:0') + keypoints = inference_graph.get_tensor_by_name('detection_keypoints:0') + masks = inference_graph.get_tensor_by_name('detection_masks:0') + num_detections = inference_graph.get_tensor_by_name('num_detections:0') + with self.assertRaisesRegexp(tf.errors.InvalidArgumentError, + 'TensorArray.*shape'): + sess.run( + [boxes, scores, classes, keypoints, masks, num_detections], + feed_dict={image_str_tensor: image_str_batch_np}) + + def test_export_and_run_inference_with_tf_example(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=True) + output_directory = os.path.join(tmp_dir, 'output') + inference_graph_path = os.path.join(output_directory, + 'frozen_inference_graph.pb') + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel( + add_detection_keypoints=True, add_detection_masks=True) + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = False + exporter.export_inference_graph( + input_type='tf_example', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + + inference_graph = self._load_inference_graph(inference_graph_path) + tf_example_np = np.expand_dims(self._create_tf_example( + np.ones((4, 4, 3)).astype(np.uint8)), axis=0) + with self.test_session(graph=inference_graph) as sess: + tf_example = inference_graph.get_tensor_by_name('tf_example:0') + boxes = inference_graph.get_tensor_by_name('detection_boxes:0') + scores = inference_graph.get_tensor_by_name('detection_scores:0') + classes = inference_graph.get_tensor_by_name('detection_classes:0') + keypoints = inference_graph.get_tensor_by_name('detection_keypoints:0') + masks = inference_graph.get_tensor_by_name('detection_masks:0') + num_detections = inference_graph.get_tensor_by_name('num_detections:0') + (boxes_np, scores_np, classes_np, keypoints_np, masks_np, + num_detections_np) = sess.run( + [boxes, scores, classes, keypoints, masks, num_detections], + feed_dict={tf_example: tf_example_np}) + self.assertAllClose(boxes_np, [[[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.8, 0.8]], + [[0.5, 0.5, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0]]]) + self.assertAllClose(scores_np, [[0.7, 0.6], + [0.9, 0.0]]) + self.assertAllClose(classes_np, 
[[1, 2], + [2, 1]]) + self.assertAllClose(keypoints_np, np.arange(48).reshape([2, 2, 6, 2])) + self.assertAllClose(masks_np, np.arange(64).reshape([2, 2, 4, 4])) + self.assertAllClose(num_detections_np, [2, 1]) + + def test_write_frozen_graph(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=True) + output_directory = os.path.join(tmp_dir, 'output') + inference_graph_path = os.path.join(output_directory, + 'frozen_inference_graph.pb') + tf.gfile.MakeDirs(output_directory) + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel( + add_detection_keypoints=True, add_detection_masks=True) + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = False + detection_model = model_builder.build(pipeline_config.model, + is_training=False) + outputs, _ = exporter._build_detection_graph( + input_type='tf_example', + detection_model=detection_model, + input_shape=None, + output_collection_name='inference_op', + graph_hook_fn=None) + output_node_names = ','.join(outputs.keys()) + saver = tf.train.Saver() + input_saver_def = saver.as_saver_def() + frozen_graph_def = exporter.freeze_graph_with_def_protos( + input_graph_def=tf.get_default_graph().as_graph_def(), + input_saver_def=input_saver_def, + input_checkpoint=trained_checkpoint_prefix, + output_node_names=output_node_names, + restore_op_name='save/restore_all', + filename_tensor_name='save/Const:0', + clear_devices=True, + initializer_nodes='') + exporter.write_frozen_graph(inference_graph_path, frozen_graph_def) + + inference_graph = self._load_inference_graph(inference_graph_path) + tf_example_np = np.expand_dims(self._create_tf_example( + np.ones((4, 4, 3)).astype(np.uint8)), axis=0) + with self.test_session(graph=inference_graph) as sess: + tf_example = inference_graph.get_tensor_by_name('tf_example:0') + boxes = inference_graph.get_tensor_by_name('detection_boxes:0') + scores = inference_graph.get_tensor_by_name('detection_scores:0') + classes = inference_graph.get_tensor_by_name('detection_classes:0') + keypoints = inference_graph.get_tensor_by_name('detection_keypoints:0') + masks = inference_graph.get_tensor_by_name('detection_masks:0') + num_detections = inference_graph.get_tensor_by_name('num_detections:0') + (boxes_np, scores_np, classes_np, keypoints_np, masks_np, + num_detections_np) = sess.run( + [boxes, scores, classes, keypoints, masks, num_detections], + feed_dict={tf_example: tf_example_np}) + self.assertAllClose(boxes_np, [[[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.8, 0.8]], + [[0.5, 0.5, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0]]]) + self.assertAllClose(scores_np, [[0.7, 0.6], + [0.9, 0.0]]) + self.assertAllClose(classes_np, [[1, 2], + [2, 1]]) + self.assertAllClose(keypoints_np, np.arange(48).reshape([2, 2, 6, 2])) + self.assertAllClose(masks_np, np.arange(64).reshape([2, 2, 4, 4])) + self.assertAllClose(num_detections_np, [2, 1]) + + def test_export_graph_saves_pipeline_file(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=True) + output_directory = os.path.join(tmp_dir, 'output') + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel() + pipeline_config = 
pipeline_pb2.TrainEvalPipelineConfig() + exporter.export_inference_graph( + input_type='image_tensor', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + expected_pipeline_path = os.path.join( + output_directory, 'pipeline.config') + self.assertTrue(os.path.exists(expected_pipeline_path)) + + written_pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + with tf.gfile.GFile(expected_pipeline_path, 'r') as f: + proto_str = f.read() + text_format.Merge(proto_str, written_pipeline_config) + self.assertProtoEquals(pipeline_config, written_pipeline_config) + + def test_export_saved_model_and_run_inference(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=False) + output_directory = os.path.join(tmp_dir, 'output') + saved_model_path = os.path.join(output_directory, 'saved_model') + + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel( + add_detection_keypoints=True, add_detection_masks=True) + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = False + exporter.export_inference_graph( + input_type='tf_example', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + + tf_example_np = np.hstack([self._create_tf_example( + np.ones((4, 4, 3)).astype(np.uint8))] * 2) + with tf.Graph().as_default() as od_graph: + with self.test_session(graph=od_graph) as sess: + meta_graph = tf.saved_model.loader.load( + sess, [tf.saved_model.tag_constants.SERVING], saved_model_path) + + signature = meta_graph.signature_def['serving_default'] + input_tensor_name = signature.inputs['inputs'].name + tf_example = od_graph.get_tensor_by_name(input_tensor_name) + + boxes = od_graph.get_tensor_by_name( + signature.outputs['detection_boxes'].name) + scores = od_graph.get_tensor_by_name( + signature.outputs['detection_scores'].name) + classes = od_graph.get_tensor_by_name( + signature.outputs['detection_classes'].name) + keypoints = od_graph.get_tensor_by_name( + signature.outputs['detection_keypoints'].name) + masks = od_graph.get_tensor_by_name( + signature.outputs['detection_masks'].name) + num_detections = od_graph.get_tensor_by_name( + signature.outputs['num_detections'].name) + + (boxes_np, scores_np, classes_np, keypoints_np, masks_np, + num_detections_np) = sess.run( + [boxes, scores, classes, keypoints, masks, num_detections], + feed_dict={tf_example: tf_example_np}) + self.assertAllClose(boxes_np, [[[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.8, 0.8]], + [[0.5, 0.5, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0]]]) + self.assertAllClose(scores_np, [[0.7, 0.6], + [0.9, 0.0]]) + self.assertAllClose(classes_np, [[1, 2], + [2, 1]]) + self.assertAllClose(keypoints_np, np.arange(48).reshape([2, 2, 6, 2])) + self.assertAllClose(masks_np, np.arange(64).reshape([2, 2, 4, 4])) + self.assertAllClose(num_detections_np, [2, 1]) + + def test_write_saved_model(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=False) + output_directory = os.path.join(tmp_dir, 'output') + saved_model_path = os.path.join(output_directory, 'saved_model') + tf.gfile.MakeDirs(output_directory) + with 
mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel( + add_detection_keypoints=True, add_detection_masks=True) + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = False + detection_model = model_builder.build(pipeline_config.model, + is_training=False) + outputs, placeholder_tensor = exporter._build_detection_graph( + input_type='tf_example', + detection_model=detection_model, + input_shape=None, + output_collection_name='inference_op', + graph_hook_fn=None) + output_node_names = ','.join(outputs.keys()) + saver = tf.train.Saver() + input_saver_def = saver.as_saver_def() + frozen_graph_def = exporter.freeze_graph_with_def_protos( + input_graph_def=tf.get_default_graph().as_graph_def(), + input_saver_def=input_saver_def, + input_checkpoint=trained_checkpoint_prefix, + output_node_names=output_node_names, + restore_op_name='save/restore_all', + filename_tensor_name='save/Const:0', + clear_devices=True, + initializer_nodes='') + exporter.write_saved_model( + saved_model_path=saved_model_path, + frozen_graph_def=frozen_graph_def, + inputs=placeholder_tensor, + outputs=outputs) + + tf_example_np = np.hstack([self._create_tf_example( + np.ones((4, 4, 3)).astype(np.uint8))] * 2) + with tf.Graph().as_default() as od_graph: + with self.test_session(graph=od_graph) as sess: + meta_graph = tf.saved_model.loader.load( + sess, [tf.saved_model.tag_constants.SERVING], saved_model_path) + + signature = meta_graph.signature_def['serving_default'] + input_tensor_name = signature.inputs['inputs'].name + tf_example = od_graph.get_tensor_by_name(input_tensor_name) + + boxes = od_graph.get_tensor_by_name( + signature.outputs['detection_boxes'].name) + scores = od_graph.get_tensor_by_name( + signature.outputs['detection_scores'].name) + classes = od_graph.get_tensor_by_name( + signature.outputs['detection_classes'].name) + keypoints = od_graph.get_tensor_by_name( + signature.outputs['detection_keypoints'].name) + masks = od_graph.get_tensor_by_name( + signature.outputs['detection_masks'].name) + num_detections = od_graph.get_tensor_by_name( + signature.outputs['num_detections'].name) + + (boxes_np, scores_np, classes_np, keypoints_np, masks_np, + num_detections_np) = sess.run( + [boxes, scores, classes, keypoints, masks, num_detections], + feed_dict={tf_example: tf_example_np}) + self.assertAllClose(boxes_np, [[[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.8, 0.8]], + [[0.5, 0.5, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0]]]) + self.assertAllClose(scores_np, [[0.7, 0.6], + [0.9, 0.0]]) + self.assertAllClose(classes_np, [[1, 2], + [2, 1]]) + self.assertAllClose(keypoints_np, np.arange(48).reshape([2, 2, 6, 2])) + self.assertAllClose(masks_np, np.arange(64).reshape([2, 2, 4, 4])) + self.assertAllClose(num_detections_np, [2, 1]) + + def test_export_checkpoint_and_run_inference(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=False) + output_directory = os.path.join(tmp_dir, 'output') + model_path = os.path.join(output_directory, 'model.ckpt') + meta_graph_path = model_path + '.meta' + + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel( + add_detection_keypoints=True, add_detection_masks=True) + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + 
pipeline_config.eval_config.use_moving_averages = False + exporter.export_inference_graph( + input_type='tf_example', + pipeline_config=pipeline_config, + trained_checkpoint_prefix=trained_checkpoint_prefix, + output_directory=output_directory) + + tf_example_np = np.hstack([self._create_tf_example( + np.ones((4, 4, 3)).astype(np.uint8))] * 2) + with tf.Graph().as_default() as od_graph: + with self.test_session(graph=od_graph) as sess: + new_saver = tf.train.import_meta_graph(meta_graph_path) + new_saver.restore(sess, model_path) + + tf_example = od_graph.get_tensor_by_name('tf_example:0') + boxes = od_graph.get_tensor_by_name('detection_boxes:0') + scores = od_graph.get_tensor_by_name('detection_scores:0') + classes = od_graph.get_tensor_by_name('detection_classes:0') + keypoints = od_graph.get_tensor_by_name('detection_keypoints:0') + masks = od_graph.get_tensor_by_name('detection_masks:0') + num_detections = od_graph.get_tensor_by_name('num_detections:0') + (boxes_np, scores_np, classes_np, keypoints_np, masks_np, + num_detections_np) = sess.run( + [boxes, scores, classes, keypoints, masks, num_detections], + feed_dict={tf_example: tf_example_np}) + self.assertAllClose(boxes_np, [[[0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 0.8, 0.8]], + [[0.5, 0.5, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0]]]) + self.assertAllClose(scores_np, [[0.7, 0.6], + [0.9, 0.0]]) + self.assertAllClose(classes_np, [[1, 2], + [2, 1]]) + self.assertAllClose(keypoints_np, np.arange(48).reshape([2, 2, 6, 2])) + self.assertAllClose(masks_np, np.arange(64).reshape([2, 2, 4, 4])) + self.assertAllClose(num_detections_np, [2, 1]) + + def test_write_graph_and_checkpoint(self): + tmp_dir = self.get_temp_dir() + trained_checkpoint_prefix = os.path.join(tmp_dir, 'model.ckpt') + self._save_checkpoint_from_mock_model(trained_checkpoint_prefix, + use_moving_averages=False) + output_directory = os.path.join(tmp_dir, 'output') + model_path = os.path.join(output_directory, 'model.ckpt') + meta_graph_path = model_path + '.meta' + tf.gfile.MakeDirs(output_directory) + with mock.patch.object( + model_builder, 'build', autospec=True) as mock_builder: + mock_builder.return_value = FakeModel( + add_detection_keypoints=True, add_detection_masks=True) + pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() + pipeline_config.eval_config.use_moving_averages = False + detection_model = model_builder.build(pipeline_config.model, + is_training=False) + exporter._build_detection_graph( + input_type='tf_example', + detection_model=detection_model, + input_shape=None, + output_collection_name='inference_op', + graph_hook_fn=None) + saver = tf.train.Saver() + input_saver_def = saver.as_saver_def() + exporter.write_graph_and_checkpoint( + inference_graph_def=tf.get_default_graph().as_graph_def(), + model_path=model_path, + input_saver_def=input_saver_def, + trained_checkpoint_prefix=trained_checkpoint_prefix) + + tf_example_np = np.hstack([self._create_tf_example( + np.ones((4, 4, 3)).astype(np.uint8))] * 2) + with tf.Graph().as_default() as od_graph: + with self.test_session(graph=od_graph) as sess: + new_saver = tf.train.import_meta_graph(meta_graph_path) + new_saver.restore(sess, model_path) + + tf_example = od_graph.get_tensor_by_name('tf_example:0') + boxes = od_graph.get_tensor_by_name('detection_boxes:0') + scores = od_graph.get_tensor_by_name('detection_scores:0') + classes = od_graph.get_tensor_by_name('detection_classes:0') + keypoints = od_graph.get_tensor_by_name('detection_keypoints:0') + masks = od_graph.get_tensor_by_name('detection_masks:0') + 
        num_detections = od_graph.get_tensor_by_name('num_detections:0')
+        (boxes_np, scores_np, classes_np, keypoints_np, masks_np,
+         num_detections_np) = sess.run(
+             [boxes, scores, classes, keypoints, masks, num_detections],
+             feed_dict={tf_example: tf_example_np})
+        self.assertAllClose(boxes_np, [[[0.0, 0.0, 0.5, 0.5],
+                                        [0.5, 0.5, 0.8, 0.8]],
+                                       [[0.5, 0.5, 1.0, 1.0],
+                                        [0.0, 0.0, 0.0, 0.0]]])
+        self.assertAllClose(scores_np, [[0.7, 0.6],
+                                        [0.9, 0.0]])
+        self.assertAllClose(classes_np, [[1, 2],
+                                         [2, 1]])
+        self.assertAllClose(keypoints_np, np.arange(48).reshape([2, 2, 6, 2]))
+        self.assertAllClose(masks_np, np.arange(64).reshape([2, 2, 4, 4]))
+        self.assertAllClose(num_detections_np, [2, 1])
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/configuring_jobs.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/configuring_jobs.md
new file mode 100644
index 0000000000000000000000000000000000000000..9b042625b21591c79f0de98dea275491218f1877
--- /dev/null
+++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/configuring_jobs.md
@@ -0,0 +1,162 @@
+# Configuring the Object Detection Training Pipeline
+
+## Overview
+
+The Tensorflow Object Detection API uses protobuf files to configure the
+training and evaluation process. The schema for the training pipeline can be
+found in object_detection/protos/pipeline.proto. At a high level, the config
+file is split into 5 parts:
+
+1. The `model` configuration. This defines what type of model will be trained
+(i.e. the meta-architecture and feature extractor).
+2. The `train_config`, which decides how model parameters should be trained
+(i.e. SGD parameters, input preprocessing and feature extractor
+initialization values).
+3. The `eval_config`, which determines what set of metrics will be reported for
+evaluation (see [evaluation_protocols.md](evaluation_protocols.md) for the
+supported protocols).
+4. The `train_input_config`, which defines what dataset the model should be
+trained on.
+5. The `eval_input_config`, which defines what dataset the model will be
+evaluated on. Typically this should be different from the training input
+dataset.
+
+A skeleton configuration file is shown below:
+
+```
+model {
+(... Add model config here...)
+}
+
+train_config: {
+(... Add train_config here...)
+}
+
+train_input_reader: {
+(... Add train_input configuration here...)
+}
+
+eval_config: {
+}
+
+eval_input_reader: {
+(... Add eval_input configuration here...)
+}
+```
+
+## Picking Model Parameters
+
+There are a large number of model parameters to configure. The best settings
+will depend on your given application. Faster R-CNN models are better suited to
+cases where high accuracy is desired and latency is of lower priority.
+Conversely, if processing time is the most important factor, SSD models are
+recommended. Read [our paper](https://arxiv.org/abs/1611.10012) for a more
+detailed discussion on the speed vs. accuracy tradeoff.
+
+To help new users get started, sample model configurations have been provided
+in the object_detection/samples/configs folder. The contents of these
+configuration files can be pasted into the `model` field of the skeleton
+configuration. Users should note that the `num_classes` field should be changed
+to a value suited for the dataset the user is training on.
+
+## Defining Inputs
+
+The Tensorflow Object Detection API accepts inputs in the TFRecord file format.
+Users must specify the locations of both the training and evaluation files.
+Users should also specify a label map, which defines the mapping between
+class ids and class names. The label map should be identical between
+training and evaluation datasets.
+
+An example input configuration looks as follows:
+
+```
+tf_record_input_reader {
+  input_path: "/usr/home/username/data/train.record"
+}
+label_map_path: "/usr/home/username/data/label_map.pbtxt"
+```
+
+Users should substitute the `input_path` and `label_map_path` arguments and
+insert the input configuration into the `train_input_reader` and
+`eval_input_reader` fields in the skeleton configuration. Note that the paths
+can also point to Google Cloud Storage buckets (e.g.
+"gs://project_bucket/train.record") for use on Google Cloud.
+
+## Configuring the Trainer
+
+The `train_config` defines parts of the training process:
+
+1. Model parameter initialization.
+2. Input preprocessing.
+3. SGD parameters.
+
+A sample `train_config` is below:
+
+```
+batch_size: 1
+optimizer {
+  momentum_optimizer: {
+    learning_rate: {
+      manual_step_learning_rate {
+        initial_learning_rate: 0.0002
+        schedule {
+          step: 0
+          learning_rate: .0002
+        }
+        schedule {
+          step: 900000
+          learning_rate: .00002
+        }
+        schedule {
+          step: 1200000
+          learning_rate: .000002
+        }
+      }
+    }
+    momentum_optimizer_value: 0.9
+  }
+  use_moving_average: false
+}
+fine_tune_checkpoint: "/usr/home/username/tmp/model.ckpt-#####"
+from_detection_checkpoint: true
+gradient_clipping_by_norm: 10.0
+data_augmentation_options {
+  random_horizontal_flip {
+  }
+}
+```
+
+### Model Parameter Initialization
+
+While optional, it is highly recommended that users utilize other object
+detection checkpoints. Training an object detector from scratch can take days.
+To speed up the training process, it is recommended that users re-use the
+feature extractor parameters from a pre-existing object classification or
+detection checkpoint. `train_config` provides two fields to specify
+pre-existing checkpoints: `fine_tune_checkpoint` and
+`from_detection_checkpoint`. `fine_tune_checkpoint` should provide a path to
+the pre-existing checkpoint
+(e.g. "/usr/home/username/checkpoint/model.ckpt-#####").
+`from_detection_checkpoint` is a boolean value. If false, the checkpoint is
+assumed to be from an object classification model. Note that starting from a
+detection checkpoint will usually result in a faster training job than
+starting from a classification checkpoint.
+
+The list of provided checkpoints can be found [here](detection_model_zoo.md).
+
+### Input Preprocessing
+
+The `data_augmentation_options` in `train_config` can be used to specify
+how training data should be modified. This field is optional.
+
+### SGD Parameters
+
+The remaining parameters in `train_config` are hyperparameters for gradient
+descent. Please note that the optimal learning rates provided in these
+configuration files may depend on the specifics of the training setup (e.g.
+number of workers, GPU type).
+
+## Configuring the Evaluator
+
+By default, evaluation generates metrics as defined by the PASCAL VOC
+challenge; other protocols (such as the Open Images or instance segmentation
+metrics) can be selected through the `metrics_set` field, as described in
+[evaluation_protocols.md](evaluation_protocols.md). The remaining parameters
+in `eval_config` are set to reasonable defaults and typically do not need to
+be configured.
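+
+## Editing a Configuration Programmatically
+
+Because the configuration is an ordinary protocol buffer, it can also be
+inspected and edited from Python. The snippet below is a minimal sketch (not
+part of the API itself); it assumes the `object_detection.protos` package is
+on the Python path, and the paths and field values are placeholders that users
+should replace:
+
+```
+import tensorflow as tf
+from google.protobuf import text_format
+from object_detection.protos import pipeline_pb2
+
+config_path = '/usr/home/username/pipeline.config'  # Placeholder path.
+
+# Parse the text-format config into a TrainEvalPipelineConfig proto.
+pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
+with tf.gfile.GFile(config_path, 'r') as f:
+  text_format.Merge(f.read(), pipeline_config)
+
+# Adjust a few commonly edited fields (values shown are examples only).
+pipeline_config.model.faster_rcnn.num_classes = 81
+pipeline_config.train_config.fine_tune_checkpoint = (
+    '/usr/home/username/checkpoint/model.ckpt')  # Placeholder path.
+pipeline_config.train_input_reader.label_map_path = (
+    '/usr/home/username/data/label_map.pbtxt')  # Placeholder path.
+
+# Write the modified config back out in text format.
+with tf.gfile.GFile(config_path, 'w') as f:
+  f.write(text_format.MessageToString(pipeline_config))
+```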
diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/defining_your_own_model.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/defining_your_own_model.md new file mode 100644 index 0000000000000000000000000000000000000000..865f6af169bfe35a41765d91d36bbcfbac0a937a --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/defining_your_own_model.md @@ -0,0 +1,137 @@ +# So you want to create a new model! + +In this section, we discuss some of the abstractions that we use +for defining detection models. If you would like to define a new model +architecture for detection and use it in the Tensorflow Detection API, +then this section should also serve as a high level guide to the files that you +will need to edit to get your new model working. + +## DetectionModels (`object_detection/core/model.py`) + +In order to be trained, evaluated, and exported for serving using our +provided binaries, all models under the Tensorflow Object Detection API must +implement the `DetectionModel` interface (see the full definition in `object_detection/core/model.py`). In particular, +each of these models are responsible for implementing 5 functions: + +* `preprocess`: Run any preprocessing (e.g., scaling/shifting/reshaping) of + input values that is necessary prior to running the detector on an input + image. +* `predict`: Produce “raw” prediction tensors that can be passed to loss or + postprocess functions. +* `postprocess`: Convert predicted output tensors to final detections. +* `loss`: Compute scalar loss tensors with respect to provided groundtruth. +* `restore`: Load a checkpoint into the Tensorflow graph. + +Given a `DetectionModel` at training time, we pass each image batch through +the following sequence of functions to compute a loss which can be optimized via +SGD: + +``` +inputs (images tensor) -> preprocess -> predict -> loss -> outputs (loss tensor) +``` + +And at eval time, we pass each image batch through the following sequence of +functions to produce a set of detections: + +``` +inputs (images tensor) -> preprocess -> predict -> postprocess -> + outputs (boxes tensor, scores tensor, classes tensor, num_detections tensor) +``` + +Some conventions to be aware of: + +* `DetectionModel`s should make no assumptions about the input size or aspect + ratio --- they are responsible for doing any resize/reshaping necessary + (see docstring for the `preprocess` function). +* Output classes are always integers in the range `[0, num_classes)`. + Any mapping of these integers to semantic labels is to be handled outside + of this class. We never explicitly emit a “background class” --- thus 0 is + the first non-background class and any logic of predicting and removing + implicit background classes must be handled internally by the implementation. +* Detected boxes are to be interpreted as being in + `[y_min, x_min, y_max, x_max]` format and normalized relative to the + image window. +* We do not specifically assume any kind of probabilistic interpretation of the + scores --- the only important thing is their relative ordering. Thus + implementations of the postprocess function are free to output logits, + probabilities, calibrated probabilities, or anything else. + +## Defining a new Faster R-CNN or SSD Feature Extractor + +In most cases, you probably will not implement a `DetectionModel` from scratch +--- instead you might create a new feature extractor to be used by one of the +SSD or Faster R-CNN meta-architectures. 
(We think of meta-architectures as +classes that define entire families of models using the `DetectionModel` +abstraction). + +Note: For the following discussion to make sense, we recommend first becoming +familiar with the [Faster R-CNN](https://arxiv.org/abs/1506.01497) paper. + +Let’s now imagine that you have invented a brand new network architecture +(say, “InceptionV100”) for classification and want to see how InceptionV100 +would behave as a feature extractor for detection (say, with Faster R-CNN). +A similar procedure would hold for SSD models, but we’ll discuss Faster R-CNN. + +To use InceptionV100, we will have to define a new +`FasterRCNNFeatureExtractor` and pass it to our `FasterRCNNMetaArch` +constructor as input. See +`object_detection/meta_architectures/faster_rcnn_meta_arch.py` for definitions +of `FasterRCNNFeatureExtractor` and `FasterRCNNMetaArch`, respectively. +A `FasterRCNNFeatureExtractor` must define a few +functions: + +* `preprocess`: Run any preprocessing of input values that is necessary prior + to running the detector on an input image. +* `_extract_proposal_features`: Extract first stage Region Proposal Network + (RPN) features. +* `_extract_box_classifier_features`: Extract second stage Box Classifier + features. +* `restore_from_classification_checkpoint_fn`: Load a checkpoint into the + Tensorflow graph. + +See the `object_detection/models/faster_rcnn_resnet_v1_feature_extractor.py` +definition as one example. Some remarks: + +* We typically initialize the weights of this feature extractor + using those from the + [Slim Resnet-101 classification checkpoint](https://github.com/tensorflow/models/tree/master/research/slim#pre-trained-models), + and we know + that images were preprocessed when training this checkpoint + by subtracting a channel mean from each input + image. Thus, we implement the preprocess function to replicate the same + channel mean subtraction behavior. +* The “full” resnet classification network defined in slim is cut into two + parts --- all but the last “resnet block” is put into the + `_extract_proposal_features` function and the final block is separately + defined in the `_extract_box_classifier_features function`. In general, + some experimentation may be required to decide on an optimal layer at + which to “cut” your feature extractor into these two pieces for Faster R-CNN. + +## Register your model for configuration + +Assuming that your new feature extractor does not require nonstandard +configuration, you will want to ideally be able to simply change the +“feature_extractor.type” fields in your configuration protos to point to a +new feature extractor. In order for our API to know how to understand this +new type though, you will first have to register your new feature +extractor with the model builder (`object_detection/builders/model_builder.py`), +whose job is to create models from config protos.. + +Registration is simple --- just add a pointer to the new Feature Extractor +class that you have defined in one of the SSD or Faster R-CNN Feature +Extractor Class maps at the top of the +`object_detection/builders/model_builder.py` file. +We recommend adding a test in `object_detection/builders/model_builder_test.py` +to make sure that parsing your proto will work as expected. + +## Taking your new model for a spin + +After registration you are ready to go with your model! Some final tips: + +* To save time debugging, try running your configuration file locally first + (both training and evaluation). 
+* Do a sweep of learning rates to figure out which learning rate is best + for your model. +* A small but often important detail: you may find it necessary to disable + batchnorm training (that is, load the batch norm parameters from the + classification checkpoint, but do not update them during gradient descent). diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/detection_model_zoo.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/detection_model_zoo.md new file mode 100644 index 0000000000000000000000000000000000000000..37cea8ad8dd8ede25f4fef09d0fbc1f09d0354e9 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/detection_model_zoo.md @@ -0,0 +1,116 @@ +# Tensorflow detection model zoo + +We provide a collection of detection models pre-trained on the [COCO +dataset](http://mscoco.org), the [Kitti dataset](http://www.cvlibs.net/datasets/kitti/), and the +[Open Images dataset](https://github.com/openimages/dataset). These models can +be useful for +out-of-the-box inference if you are interested in categories already in COCO +(e.g., humans, cars, etc) or in Open Images (e.g., +surfboard, jacuzzi, etc). They are also useful for initializing your models when +training on novel datasets. + +In the table below, we list each such pre-trained model including: + +* a model name that corresponds to a config file that was used to train this + model in the `samples/configs` directory, +* a download link to a tar.gz file containing the pre-trained model, +* model speed --- we report running time in ms per 600x600 image (including all + pre and post-processing), but please be + aware that these timings depend highly on one's specific hardware + configuration (these timings were performed using an Nvidia + GeForce GTX TITAN X card) and should be treated more as relative timings in + many cases. Also note that desktop GPU timing does not always reflect mobile + run time. For example Mobilenet V2 is faster on mobile devices than Mobilenet + V1, but is slightly slower on desktop GPU. +* detector performance on subset of the COCO validation set or Open Images test split as measured by the dataset-specific mAP measure. + Here, higher is better, and we only report bounding box mAP rounded to the + nearest integer. +* Output types (`Boxes`, and `Masks` if applicable ) + +You can un-tar each tar.gz file via, e.g.,: + +``` +tar -xzvf ssd_mobilenet_v1_coco.tar.gz +``` + +Inside the un-tar'ed directory, you will find: + +* a graph proto (`graph.pbtxt`) +* a checkpoint + (`model.ckpt.data-00000-of-00001`, `model.ckpt.index`, `model.ckpt.meta`) +* a frozen graph proto with weights baked into the graph as constants + (`frozen_inference_graph.pb`) to be used for out of the box inference + (try this out in the Jupyter notebook!) +* a config file (`pipeline.config`) which was used to generate the graph. These + directly correspond to a config file in the + [samples/configs](https://github.com/tensorflow/models/tree/master/research/object_detection/samples/configs)) directory but often with a modified score threshold. In the case + of the heavier Faster R-CNN models, we also provide a version of the model + that uses a highly reduced number of proposals for speed. + +Some remarks on frozen inference graphs: + +* If you try to evaluate the frozen graph, you may find performance numbers for + some of the models to be slightly lower than what we report in the below + tables. 
This is because we discard detections with scores below a + threshold (typically 0.3) when creating the frozen graph. This corresponds + effectively to picking a point on the precision recall curve of + a detector (and discarding the part past that point), which negatively impacts + standard mAP metrics. +* Our frozen inference graphs are generated using the + [v1.5.0](https://github.com/tensorflow/tensorflow/tree/v1.5.0) + release version of Tensorflow and we do not guarantee that these will work + with other versions; this being said, each frozen inference graph can be + regenerated using your current version of Tensorflow by re-running the + [exporter](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/exporting_models.md), + pointing it at the model directory as well as the config file inside of it. + + +## COCO-trained models {#coco-models} + +| Model name | Speed (ms) | COCO mAP[^1] | Outputs | +| ------------ | :--------------: | :--------------: | :-------------: | +| [ssd_mobilenet_v1_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2017_11_17.tar.gz) | 30 | 21 | Boxes | +| [ssd_mobilenet_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_coco_2018_03_29.tar.gz) | 31 | 22 | Boxes | +| [ssdlite_mobilenet_v2_coco](http://download.tensorflow.org/models/object_detection/ssdlite_mobilenet_v2_coco_2018_05_09.tar.gz) | 27 | 22 | Boxes | +| [ssd_inception_v2_coco](http://download.tensorflow.org/models/object_detection/ssd_inception_v2_coco_2017_11_17.tar.gz) | 42 | 24 | Boxes | +| [faster_rcnn_inception_v2_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_v2_coco_2018_01_28.tar.gz) | 58 | 28 | Boxes | +| [faster_rcnn_resnet50_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_coco_2018_01_28.tar.gz) | 89 | 30 | Boxes | +| [faster_rcnn_resnet50_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet50_lowproposals_coco_2018_01_28.tar.gz) | 64 | | Boxes | +| [rfcn_resnet101_coco](http://download.tensorflow.org/models/object_detection/rfcn_resnet101_coco_2018_01_28.tar.gz) | 92 | 30 | Boxes | +| [faster_rcnn_resnet101_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_coco_2018_01_28.tar.gz) | 106 | 32 | Boxes | +| [faster_rcnn_resnet101_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_lowproposals_coco_2018_01_28.tar.gz) | 82 | | Boxes | +| [faster_rcnn_inception_resnet_v2_atrous_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_coco_2018_01_28.tar.gz) | 620 | 37 | Boxes | +| [faster_rcnn_inception_resnet_v2_atrous_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_lowproposals_coco_2018_01_28.tar.gz) | 241 | | Boxes | +| [faster_rcnn_nas](http://download.tensorflow.org/models/object_detection/faster_rcnn_nas_coco_2018_01_28.tar.gz) | 1833 | 43 | Boxes | +| [faster_rcnn_nas_lowproposals_coco](http://download.tensorflow.org/models/object_detection/faster_rcnn_nas_lowproposals_coco_2018_01_28.tar.gz) | 540 | | Boxes | +| [mask_rcnn_inception_resnet_v2_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_inception_resnet_v2_atrous_coco_2018_01_28.tar.gz) | 771 | 36 | Masks | +| 
[mask_rcnn_inception_v2_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_inception_v2_coco_2018_01_28.tar.gz) | 79 | 25 | Masks | +| [mask_rcnn_resnet101_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_resnet101_atrous_coco_2018_01_28.tar.gz) | 470 | 33 | Masks | +| [mask_rcnn_resnet50_atrous_coco](http://download.tensorflow.org/models/object_detection/mask_rcnn_resnet50_atrous_coco_2018_01_28.tar.gz) | 343 | 29 | Masks | + + + +## Kitti-trained models {#kitti-models} + +Model name | Speed (ms) | Pascal mAP@0.5 | Outputs +----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----: +[faster_rcnn_resnet101_kitti](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_kitti_2018_01_28.tar.gz) | 79 | 87 | Boxes + +## Open Images-trained models {#open-images-models} + +Model name | Speed (ms) | Open Images mAP@0.5[^2] | Outputs +----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----: +[faster_rcnn_inception_resnet_v2_atrous_oid](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_oid_2018_01_28.tar.gz) | 727 | 37 | Boxes +[faster_rcnn_inception_resnet_v2_atrous_lowproposals_oid](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_lowproposals_oid_2018_01_28.tar.gz) | 347 | | Boxes + + +## AVA v2.1 trained models {#ava-models} + +Model name | Speed (ms) | Pascal mAP@0.5 | Outputs +----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----: +[faster_rcnn_resnet101_ava_v2.1](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_ava_v2.1_2018_04_30.tar.gz) | 93 | 11 | Boxes + + +[^1]: See [MSCOCO evaluation protocol](http://cocodataset.org/#detections-eval). +[^2]: This is PASCAL mAP with a slightly different way of true positives computation: see [Open Images evaluation protocol](evaluation_protocols.md#open-images). + diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/evaluation_protocols.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/evaluation_protocols.md new file mode 100644 index 0000000000000000000000000000000000000000..ec960058bcebd3313bd60a992b818ff256a951f7 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/evaluation_protocols.md @@ -0,0 +1,129 @@ +# Supported object detection evaluation protocols + +The Tensorflow Object Detection API currently supports three evaluation protocols, +that can be configured in `EvalConfig` by setting `metrics_set` to the +corresponding value. + +## PASCAL VOC 2010 detection metric + +`EvalConfig.metrics_set='pascal_voc_detection_metrics'` + +The commonly used mAP metric for evaluating the quality of object detectors, +computed according to the protocol of the PASCAL VOC Challenge 2010-2012. The +protocol is available +[here](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/devkit_doc_08-May-2010.pdf). 
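+
+For instance, selecting this protocol amounts to an `eval_config` fragment
+along the following lines in the pipeline config (the `num_examples` value is
+just a placeholder, not a recommendation):
+
+```
+eval_config: {
+  metrics_set: 'pascal_voc_detection_metrics'
+  num_examples: 5000
+}
+```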
+ +## Weighted PASCAL VOC detection metric + +`EvalConfig.metrics_set='weighted_pascal_voc_detection_metrics'` + +The weighted PASCAL metric computes the mean average precision as the average +precision when treating all classes as a single class. In comparison, +PASCAL metrics computes the mean average precision as the mean of the +per-class average precisions. + +For example, the test set consists of two classes, "cat" and "dog", and there +are ten times more boxes of "cat" than those of "dog". According to PASCAL VOC +2010 metric, performance on each of the two classes would contribute equally +towards the final mAP value, while for the Weighted PASCAL VOC metric the final +mAP value will be influenced by frequency of each class. + +## PASCAL VOC 2010 instance segmentation metric + +`EvalConfig.metrics_set='pascal_voc_instance_segmentation_metrics'` + +Similar to Pascal VOC 2010 detection metric, but computes the intersection over +union based on the object masks instead of object boxes. + +## Weighted PASCAL VOC instance segmentation metric + +`EvalConfig.metrics_set='weighted_pascal_voc_instance_segmentation_metrics'` + +Similar to the weighted pascal voc 2010 detection metric, but computes the +intersection over union based on the object masks instead of object boxes. + +## Open Images V2 detection metric + +`EvalConfig.metrics_set='open_images_V2_detection_metrics'` + +This metric is defined originally for evaluating detector performance on [Open +Images V2 dataset](https://github.com/openimages/dataset) and is fairly similar +to the PASCAL VOC 2010 metric mentioned above. It computes interpolated average +precision (AP) for each class and averages it among all classes (mAP). + +The difference to the PASCAL VOC 2010 metric is the following: Open Images +annotations contain `group-of` ground-truth boxes (see [Open Images data +description](https://github.com/openimages/dataset#annotations-human-bboxcsv)), +that are treated differently for the purpose of deciding whether detections are +"true positives", "ignored", "false positives". Here we define these three +cases: + +A detection is a "true positive" if there is a non-group-of ground-truth box, +such that: + +* The detection box and the ground-truth box are of the same class, and + intersection-over-union (IoU) between the detection box and the ground-truth + box is greater than the IoU threshold (default value 0.5). \ + Illustration of handling non-group-of boxes: \ + ![alt + groupof_case_eval](img/nongroupof_case_eval.png "illustration of handling non-group-of boxes: yellow box - ground truth bounding box; green box - true positive; red box - false positives.") + + * yellow box - ground-truth box; + * green box - true positive; + * red boxes - false positives. + +* This is the highest scoring detection for this ground truth box that + satisfies the criteria above. + +A detection is "ignored" if it is not a true positive, and there is a `group-of` +ground-truth box such that: + +* The detection box and the ground-truth box are of the same class, and the + area of intersection between the detection box and the ground-truth box + divided by the area of the detection is greater than 0.5. This is intended + to measure whether the detection box is approximately inside the group-of + ground-truth box. 
\ + Illustration of handling `group-of` boxes: \ + ![alt + groupof_case_eval](img/groupof_case_eval.png "illustration of handling group-of boxes: yellow box - ground truth bounding box; grey boxes - two detections of cars, that are ignored; red box - false positive.") + + * yellow box - ground-truth box; + * grey boxes - two detections on cars, that are ignored; + * red box - false positive. + +A detection is a "false positive" if it is neither a "true positive" nor +"ignored". + +Precision and recall are defined as: + +* Precision = number-of-true-positives/(number-of-true-positives + number-of-false-positives) +* Recall = number-of-true-positives/number-of-non-group-of-boxes + +Note that detections ignored as firing on a `group-of` ground-truth box do not +contribute to the number of true positives. + +The labels in Open Images are organized in a +[hierarchy](https://storage.googleapis.com/openimages/2017_07/bbox_labels_vis/bbox_labels_vis.html). +Ground-truth bounding-boxes are annotated with the most specific class available +in the hierarchy. For example, "car" has two children "limousine" and "van". Any +other kind of car is annotated as "car" (for example, a sedan). Given this +convention, the evaluation software treats all classes independently, ignoring +the hierarchy. To achieve high performance values, object detectors should +output bounding-boxes labelled in the same manner. + +## OID Challenge Object Detection Metric 2018 + +`EvalConfig.metrics_set='oid_challenge_object_detection_metrics'` + +The metric for the OID Challenge Object Detection Metric 2018, Object Detection +track. The description is provided on the [Open Images Challenge +website](https://storage.googleapis.com/openimages/web/challenge.html). + +## OID Challenge Visual Relationship Detection Metric 2018 + +The metric for the OID Challenge Visual Relationship Detection Metric 2018, Visual +Relationship Detection track. The description is provided on the [Open Images +Challenge +website](https://storage.googleapis.com/openimages/web/challenge.html). Note: +this is currently a stand-alone metric, that can be used only through the +`metrics/oid_vrd_challenge_evaluation.py` util. diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/exporting_models.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/exporting_models.md new file mode 100644 index 0000000000000000000000000000000000000000..d4735b978d82c0d239a42f80121218b6b68f0759 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/exporting_models.md @@ -0,0 +1,27 @@ +# Exporting a trained model for inference + +After your model has been trained, you should export it to a Tensorflow +graph proto. 
A checkpoint will typically consist of three files:
+
+* model.ckpt-${CHECKPOINT_NUMBER}.data-00000-of-00001
+* model.ckpt-${CHECKPOINT_NUMBER}.index
+* model.ckpt-${CHECKPOINT_NUMBER}.meta
+
+After you've identified a candidate checkpoint to export, run the following
+command from tensorflow/models/research:
+
+``` bash
+# From tensorflow/models/research/
+python object_detection/export_inference_graph.py \
+    --input_type image_tensor \
+    --pipeline_config_path ${PIPELINE_CONFIG_PATH} \
+    --trained_checkpoint_prefix ${TRAIN_PATH} \
+    --output_directory ${EXPORT_DIR}
+```
+
+Afterwards, you should see the directory ${EXPORT_DIR} containing the following:
+
+* output_inference_graph.pb, the frozen graph format of the exported model
+* saved_model/, a directory containing the saved model format of the exported model
+* model.ckpt.*, the model checkpoints used for exporting
+* checkpoint, a file listing which of the included checkpoint files to restore
diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/faq.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/faq.md
new file mode 100644
index 0000000000000000000000000000000000000000..2f4097f0d06eedc9ab87c8a3958559450b1c1f00
--- /dev/null
+++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/faq.md
@@ -0,0 +1,19 @@
+# Frequently Asked Questions
+
+## Q: AttributeError: 'module' object has no attribute 'BackupHandler'
+A: This BackupHandler (tf.contrib.slim.tfexample_decoder.BackupHandler) was
+introduced in tensorflow 1.5.0, so running with earlier versions may cause this
+issue. It has since been replaced by
+object_detection.data_decoders.tf_example_decoder.BackupHandler. If you see
+this issue, you should be able to resolve it by syncing your fork to HEAD.
+The same applies to LookupTensor.
+
+## Q: AttributeError: 'module' object has no attribute 'LookupTensor'
+A: As with BackupHandler, syncing your fork to HEAD should make it work.
+
+## Q: Why can't I get the inference time as reported in the model zoo?
+A: The inference time reported in the model zoo is the mean time over hundreds of
+test images, measured on an internal machine. As mentioned in the
+[Tensorflow detection model zoo](detection_model_zoo.md), this speed depends
+highly on one's specific hardware configuration and should be treated as a
+relative timing.
diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/dogs_detections_output.jpg b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/dogs_detections_output.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9e88a7010fa90f5c4a74f6caee78f5c975f77e40 Binary files /dev/null and b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/dogs_detections_output.jpg differ diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/example_cat.jpg b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/example_cat.jpg new file mode 100644 index 0000000000000000000000000000000000000000..74c7ef4b0849ce1b1f3b8061f172cb98ce06ef5e Binary files /dev/null and b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/example_cat.jpg differ diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/groupof_case_eval.png b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/groupof_case_eval.png new file mode 100644 index 0000000000000000000000000000000000000000..5abc9b6984fb5816ca4f2e6f40e38ec6e6ea9cfc Binary files /dev/null and b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/groupof_case_eval.png differ diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/kites_detections_output.jpg b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/kites_detections_output.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7c0f3364deda6614b5bf6fdddad7e7a578f0f6eb Binary files /dev/null and b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/kites_detections_output.jpg differ diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/kites_with_segment_overlay.png b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/kites_with_segment_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..a52e57de193e53edbb1a49643e8c77609abdc79d Binary files /dev/null and b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/kites_with_segment_overlay.png differ diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/nongroupof_case_eval.png b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/nongroupof_case_eval.png new file mode 100644 index 0000000000000000000000000000000000000000..cbb76f493adfa725cd0b2ab323f89fdfc57a57ec Binary files /dev/null and b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/nongroupof_case_eval.png differ diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/oid_bus_72e19c28aac34ed8.jpg b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/oid_bus_72e19c28aac34ed8.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1e9412ad545c0a1e1e7dcfa35a168c2a61cf2012 Binary files /dev/null and b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/oid_bus_72e19c28aac34ed8.jpg differ diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/oid_monkey_3b4168c89cecbc5b.jpg b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/oid_monkey_3b4168c89cecbc5b.jpg new file mode 100644 index 0000000000000000000000000000000000000000..46b1fb282a428fe1169a7ff1d30e963bc085e733 Binary files /dev/null and 
b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/oid_monkey_3b4168c89cecbc5b.jpg differ diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/oxford_pet.png b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/oxford_pet.png new file mode 100644 index 0000000000000000000000000000000000000000..ddac415f5ef079f8d6fde8dd4c9838735fd96325 Binary files /dev/null and b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/oxford_pet.png differ diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/tensorboard.png b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/tensorboard.png new file mode 100644 index 0000000000000000000000000000000000000000..fbcdbeb38cf5594681c0e206a08b6d06bd1e86a9 Binary files /dev/null and b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/tensorboard.png differ diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/tensorboard2.png b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/tensorboard2.png new file mode 100644 index 0000000000000000000000000000000000000000..97ad22daa11870ecebbbe7cadfb2d8bb30d738f6 Binary files /dev/null and b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/tensorboard2.png differ diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/tf-od-api-logo.png b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/tf-od-api-logo.png new file mode 100644 index 0000000000000000000000000000000000000000..9fa9cc9dba228c1effabfa5c1474052ed8bad3fd Binary files /dev/null and b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/img/tf-od-api-logo.png differ diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/installation.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..27786c7dfbe86e1f1bb9d06db316011e8a8a3ae3 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/installation.md @@ -0,0 +1,104 @@ +# Installation + +## Dependencies + +Tensorflow Object Detection API depends on the following libraries: + +* Protobuf 3+ +* Python-tk +* Pillow 1.0 +* lxml +* tf Slim (which is included in the "tensorflow/models/research/" checkout) +* Jupyter notebook +* Matplotlib +* Tensorflow +* Cython +* cocoapi + +For detailed steps to install Tensorflow, follow the [Tensorflow installation +instructions](https://www.tensorflow.org/install/). A typical user can install +Tensorflow using one of the following commands: + +``` bash +# For CPU +pip install tensorflow +# For GPU +pip install tensorflow-gpu +``` + +The remaining libraries can be installed on Ubuntu 16.04 using via apt-get: + +``` bash +sudo apt-get install protobuf-compiler python-pil python-lxml python-tk +sudo pip install Cython +sudo pip install jupyter +sudo pip install matplotlib +``` + +Alternatively, users can install dependencies using pip: + +``` bash +sudo pip install Cython +sudo pip install pillow +sudo pip install lxml +sudo pip install jupyter +sudo pip install matplotlib +``` + +## COCO API installation + +Download the +cocoapi and +copy the pycocotools subfolder to the tensorflow/models/research directory if +you are interested in using COCO evaluation metrics. The default metrics are +based on those used in Pascal VOC evaluation. 
To use the COCO object detection +metrics add `metrics_set: "coco_detection_metrics"` to the `eval_config` message +in the config file. To use the COCO instance segmentation metrics add +`metrics_set: "coco_mask_metrics"` to the `eval_config` message in the config +file. + +```bash +git clone https://github.com/cocodataset/cocoapi.git +cd cocoapi/PythonAPI +make +cp -r pycocotools /models/research/ +``` + +## Protobuf Compilation + +The Tensorflow Object Detection API uses Protobufs to configure model and +training parameters. Before the framework can be used, the Protobuf libraries +must be compiled. This should be done by running the following command from +the tensorflow/models/research/ directory: + + +``` bash +# From tensorflow/models/research/ +protoc object_detection/protos/*.proto --python_out=. +``` + +## Add Libraries to PYTHONPATH + +When running locally, the tensorflow/models/research/ and slim directories +should be appended to PYTHONPATH. This can be done by running the following from +tensorflow/models/research/: + + +``` bash +# From tensorflow/models/research/ +export PYTHONPATH=$PYTHONPATH:`pwd`:`pwd`/slim +``` + +Note: This command needs to run from every new terminal you start. If you wish +to avoid running this manually, you can add it as a new line to the end of your +~/.bashrc file, replacing \`pwd\` with the absolute path of +tensorflow/models/research on your system. + +# Testing the Installation + +You can test that you have correctly installed the Tensorflow Object Detection\ +API by running the following command: + +```bash +python object_detection/builders/model_builder_test.py +``` diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/instance_segmentation.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/instance_segmentation.md new file mode 100644 index 0000000000000000000000000000000000000000..8ebf7d8c3d7329b95b81fbb14d272f6b0134e138 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/instance_segmentation.md @@ -0,0 +1,105 @@ +## Run an Instance Segmentation Model + +For some applications it isn't adequate enough to localize an object with a +simple bounding box. For instance, you might want to segment an object region +once it is detected. This class of problems is called **instance segmentation**. + +

+ +

+ +### Materializing data for instance segmentation {#materializing-instance-seg} + +Instance segmentation is an extension of object detection, where a binary mask +(i.e. object vs. background) is associated with every bounding box. This allows +for more fine-grained information about the extent of the object within the box. +To train an instance segmentation model, a groundtruth mask must be supplied for +every groundtruth bounding box. In additional to the proto fields listed in the +section titled [Using your own dataset](using_your_own_dataset.md), one must +also supply `image/object/mask`, which can either be a repeated list of +single-channel encoded PNG strings, or a single dense 3D binary tensor where +masks corresponding to each object are stacked along the first dimension. Each +is described in more detail below. + +#### PNG Instance Segmentation Masks + +Instance segmentation masks can be supplied as serialized PNG images. + +```shell +image/object/mask = ["\x89PNG\r\n\x1A\n\x00\x00\x00\rIHDR\...", ...] +``` + +These masks are whole-image masks, one for each object instance. The spatial +dimensions of each mask must agree with the image. Each mask has only a single +channel, and the pixel values are either 0 (background) or 1 (object mask). +**PNG masks are the preferred parameterization since they offer considerable +space savings compared to dense numerical masks.** + +#### Dense Numerical Instance Segmentation Masks + +Masks can also be specified via a dense numerical tensor. + +```shell +image/object/mask = [0.0, 0.0, 1.0, 1.0, 0.0, ...] +``` + +For an image with dimensions `H` x `W` and `num_boxes` groundtruth boxes, the +mask corresponds to a [`num_boxes`, `H`, `W`] float32 tensor, flattened into a +single vector of shape `num_boxes` * `H` * `W`. In TensorFlow, examples are read +in row-major format, so the elements are organized as: + +```shell +... mask 0 row 0 ... mask 0 row 1 ... // ... mask 0 row H-1 ... mask 1 row 0 ... +``` + +where each row has W contiguous binary values. + +To see an example tf-records with mask labels, see the examples under the +[Preparing Inputs](preparing_inputs.md) section. + +### Pre-existing config files + +We provide four instance segmentation config files that you can use to train +your own models: + +1. mask_rcnn_inception_resnet_v2_atrous_coco +1. mask_rcnn_resnet101_atrous_coco +1. mask_rcnn_resnet50_atrous_coco +1. mask_rcnn_inception_v2_coco + +For more details see the [detection model zoo](detection_model_zoo.md). + +### Updating a Faster R-CNN config file + +Currently, the only supported instance segmentation model is [Mask +R-CNN](https://arxiv.org/abs/1703.06870), which requires Faster R-CNN as the +backbone object detector. + +Once you have a baseline Faster R-CNN pipeline configuration, you can make the +following modifications in order to convert it into a Mask R-CNN model. + +1. Within `train_input_reader` and `eval_input_reader`, set + `load_instance_masks` to `True`. If using PNG masks, set `mask_type` to + `PNG_MASKS`, otherwise you can leave it as the default 'NUMERICAL_MASKS'. +1. Within the `faster_rcnn` config, use a `MaskRCNNBoxPredictor` as the + `second_stage_box_predictor`. +1. Within the `MaskRCNNBoxPredictor` message, set `predict_instance_masks` to + `True`. You must also define `conv_hyperparams`. +1. Within the `faster_rcnn` message, set `number_of_stages` to `3`. +1. Add instance segmentation metrics to the set of metrics: + `'coco_mask_metrics'`. +1. Update the `input_path`s to point at your data. 
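+
+Taken together, the modifications above correspond to pipeline config fragments
+along the following lines. This is only an abbreviated sketch: the omitted
+hyperparameters and the `PATH_TO_BE_CONFIGURED` placeholders must still be
+filled in for your own setup, and `eval_input_reader` takes the same mask
+fields as `train_input_reader`.
+
+```
+model {
+  faster_rcnn {
+    number_of_stages: 3
+    second_stage_box_predictor {
+      mask_rcnn_box_predictor {
+        predict_instance_masks: true
+        conv_hyperparams {
+          # op, regularizer and initializer settings go here.
+        }
+      }
+    }
+  }
+}
+eval_config: {
+  metrics_set: 'coco_detection_metrics'
+  metrics_set: 'coco_mask_metrics'
+}
+train_input_reader: {
+  load_instance_masks: true
+  mask_type: PNG_MASKS
+  tf_record_input_reader { input_path: "PATH_TO_BE_CONFIGURED" }
+  label_map_path: "PATH_TO_BE_CONFIGURED"
+}
+```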
+ +Please refer to the section on [Running the pets dataset](running_pets.md) for +additional details. + +> Note: The mask prediction branch consists of a sequence of convolution layers. +> You can set the number of convolution layers and their depth as follows: +> +> 1. Within the `MaskRCNNBoxPredictor` message, set the +> `mask_prediction_conv_depth` to your value of interest. The default value +> is 256. If you set it to `0` (recommended), the depth is computed +> automatically based on the number of classes in the dataset. +> 1. Within the `MaskRCNNBoxPredictor` message, set the +> `mask_prediction_num_conv_layers` to your value of interest. The default +> value is 2. diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/oid_inference_and_evaluation.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/oid_inference_and_evaluation.md new file mode 100644 index 0000000000000000000000000000000000000000..93aedf2918f26101bf341821bb939d3eb9ec31db --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/oid_inference_and_evaluation.md @@ -0,0 +1,255 @@ +# Inference and evaluation on the Open Images dataset + +This page presents a tutorial for running object detector inference and +evaluation measure computations on the [Open Images +dataset](https://github.com/openimages/dataset), using tools from the +[TensorFlow Object Detection +API](https://github.com/tensorflow/models/tree/master/research/object_detection). +It shows how to download the images and annotations for the validation and test +sets of Open Images; how to package the downloaded data in a format understood +by the Object Detection API; where to find a trained object detector model for +Open Images; how to run inference; and how to compute evaluation measures on the +inferred detections. + +Inferred detections will look like the following: + +![](img/oid_bus_72e19c28aac34ed8.jpg){height="300"} +![](img/oid_monkey_3b4168c89cecbc5b.jpg){height="300"} + +On the validation set of Open Images, this tutorial requires 27GB of free disk +space and the inference step takes approximately 9 hours on a single NVIDIA +Tesla P100 GPU. On the test set -- 75GB and 27 hours respectively. All other +steps require less than two hours in total on both sets. + +## Installing TensorFlow, the Object Detection API, and Google Cloud SDK + +Please run through the [installation instructions](installation.md) to install +TensorFlow and all its dependencies. Ensure the Protobuf libraries are compiled +and the library directories are added to `PYTHONPATH`. You will also need to +`pip` install `pandas` and `contextlib2`. + +Some of the data used in this tutorial lives in Google Cloud buckets. To access +it, you will have to [install the Google Cloud +SDK](https://cloud.google.com/sdk/downloads) on your workstation or laptop. + +## Preparing the Open Images validation and test sets + +In order to run inference and subsequent evaluation measure computations, we +require a dataset of images and ground truth boxes, packaged as TFRecords of +TFExamples. To create such a dataset for Open Images, you will need to first +download ground truth boxes from the [Open Images +website](https://github.com/openimages/dataset): + +```bash +# From tensorflow/models/research +mkdir oid +cd oid +wget https://storage.googleapis.com/openimages/2017_07/annotations_human_bbox_2017_07.tar.gz +tar -xvf annotations_human_bbox_2017_07.tar.gz +``` + +Next, download the images. 
In this tutorial, we will use lower resolution images +provided by [CVDF](http://www.cvdfoundation.org). Please follow the instructions +on [CVDF's Open Images repository +page](https://github.com/cvdfoundation/open-images-dataset) in order to gain +access to the cloud bucket with the images. Then run: + +```bash +# From tensorflow/models/research/oid +SPLIT=validation # Set SPLIT to "test" to download the images in the test set +mkdir raw_images_${SPLIT} +gsutil -m rsync -r gs://open-images-dataset/$SPLIT raw_images_${SPLIT} +``` + +Another option for downloading the images is to follow the URLs contained in the +[image URLs and metadata CSV +files](https://storage.googleapis.com/openimages/2017_07/images_2017_07.tar.gz) +on the Open Images website. + +At this point, your `tensorflow/models/research/oid` directory should appear as +follows: + +```lang-none +|-- 2017_07 +| |-- test +| | `-- annotations-human-bbox.csv +| |-- train +| | `-- annotations-human-bbox.csv +| `-- validation +| `-- annotations-human-bbox.csv +|-- raw_images_validation (if you downloaded the validation split) +| `-- ... (41,620 files matching regex "[0-9a-f]{16}.jpg") +|-- raw_images_test (if you downloaded the test split) +| `-- ... (125,436 files matching regex "[0-9a-f]{16}.jpg") +`-- annotations_human_bbox_2017_07.tar.gz +``` + +Next, package the data into TFRecords of TFExamples by running: + +```bash +# From tensorflow/models/research/oid +SPLIT=validation # Set SPLIT to "test" to create TFRecords for the test split +mkdir ${SPLIT}_tfrecords + +PYTHONPATH=$PYTHONPATH:$(readlink -f ..) \ +python -m object_detection/dataset_tools/create_oid_tf_record \ + --input_box_annotations_csv 2017_07/$SPLIT/annotations-human-bbox.csv \ + --input_images_directory raw_images_${SPLIT} \ + --input_label_map ../object_detection/data/oid_bbox_trainable_label_map.pbtxt \ + --output_tf_record_path_prefix ${SPLIT}_tfrecords/$SPLIT.tfrecord \ + --num_shards=100 +``` + +This results in 100 TFRecord files (shards), written to +`oid/${SPLIT}_tfrecords`, with filenames matching +`${SPLIT}.tfrecord-000[0-9][0-9]-of-00100`. Each shard contains approximately +the same number of images and is defacto a representative random sample of the +input data. [This enables](#accelerating_inference) a straightforward work +division scheme for distributing inference and also approximate measure +computations on subsets of the validation and test sets. + +## Inferring detections + +Inference requires a trained object detection model. In this tutorial we will +use a model from the [detections model zoo](detection_model_zoo.md), which can +be downloaded and unpacked by running the commands below. More information about +the model, such as its architecture and how it was trained, is available in the +[model zoo page](detection_model_zoo.md). + +```bash +# From tensorflow/models/research/oid +wget http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_oid_14_10_2017.tar.gz +tar -zxvf faster_rcnn_inception_resnet_v2_atrous_oid_14_10_2017.tar.gz +``` + +At this point, data is packed into TFRecords and we have an object detector +model. We can run inference using: + +```bash +# From tensorflow/models/research/oid +SPLIT=validation # or test +TF_RECORD_FILES=$(ls -1 ${SPLIT}_tfrecords/* | tr '\n' ',') + +PYTHONPATH=$PYTHONPATH:$(readlink -f ..) 
\ +python -m object_detection/inference/infer_detections \ + --input_tfrecord_paths=$TF_RECORD_FILES \ + --output_tfrecord_path=${SPLIT}_detections.tfrecord-00000-of-00001 \ + --inference_graph=faster_rcnn_inception_resnet_v2_atrous_oid/frozen_inference_graph.pb \ + --discard_image_pixels +``` + +Inference preserves all fields of the input TFExamples, and adds new fields to +store the inferred detections. This allows [computing evaluation +measures](#compute_evaluation_measures) on the output TFRecord alone, as ground +truth boxes are preserved as well. Since measure computations don't require +access to the images, `infer_detections` can optionally discard them with the +`--discard_image_pixels` flag. Discarding the images drastically reduces the +size of the output TFRecord. + +### Accelerating inference {#accelerating_inference} + +Running inference on the whole validation or test set can take a long time to +complete due to the large number of images present in these sets (41,620 and +125,436 respectively). For quick but approximate evaluation, inference and the +subsequent measure computations can be run on a small number of shards. To run +for example on 2% of all the data, it is enough to set `TF_RECORD_FILES` as +shown below before running `infer_detections`: + +```bash +TF_RECORD_FILES=$(ls ${SPLIT}_tfrecords/${SPLIT}.tfrecord-0000[0-1]-of-00100 | tr '\n' ',') +``` + +Please note that computing evaluation measures on a small subset of the data +introduces variance and bias, since some classes of objects won't be seen during +evaluation. In the example above, this leads to 13.2% higher mAP on the first +two shards of the validation set compared to the mAP for the full set ([see mAP +results](#expected-maps)). + +Another way to accelerate inference is to run it in parallel on multiple +TensorFlow devices on possibly multiple machines. The script below uses +[tmux](https://github.com/tmux/tmux/wiki) to run a separate `infer_detections` +process for each GPU on different partition of the input data. + +```bash +# From tensorflow/models/research/oid +SPLIT=validation # or test +NUM_GPUS=4 +NUM_SHARDS=100 + +tmux new-session -d -s "inference" +function tmux_start { tmux new-window -d -n "inference:GPU$1" "${*:2}; exec bash"; } +for gpu_index in $(seq 0 $(($NUM_GPUS-1))); do + start_shard=$(( $gpu_index * $NUM_SHARDS / $NUM_GPUS )) + end_shard=$(( ($gpu_index + 1) * $NUM_SHARDS / $NUM_GPUS - 1)) + TF_RECORD_FILES=$(seq -s, -f "${SPLIT}_tfrecords/${SPLIT}.tfrecord-%05.0f-of-$(printf '%05d' $NUM_SHARDS)" $start_shard $end_shard) + tmux_start ${gpu_index} \ + PYTHONPATH=$PYTHONPATH:$(readlink -f ..) CUDA_VISIBLE_DEVICES=$gpu_index \ + python -m object_detection/inference/infer_detections \ + --input_tfrecord_paths=$TF_RECORD_FILES \ + --output_tfrecord_path=${SPLIT}_detections.tfrecord-$(printf "%05d" $gpu_index)-of-$(printf "%05d" $NUM_GPUS) \ + --inference_graph=faster_rcnn_inception_resnet_v2_atrous_oid/frozen_inference_graph.pb \ + --discard_image_pixels +done +``` + +After all `infer_detections` processes finish, `tensorflow/models/research/oid` +will contain one output TFRecord from each process, with name matching +`validation_detections.tfrecord-0000[0-3]-of-00004`. 
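+
+Before computing evaluation measures, it can be worth a quick, optional check
+that every worker wrote its shard; the expected count below assumes the
+`NUM_GPUS=4` example above:
+
+```bash
+# From tensorflow/models/research/oid
+SPLIT=validation  # or test
+ls ${SPLIT}_detections.tfrecord-*-of-* | wc -l  # should print NUM_GPUS (4 here)
+```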
+ +## Computing evaluation measures {#compute_evaluation_measures} + +To compute evaluation measures on the inferred detections you first need to +create the appropriate configuration files: + +```bash +# From tensorflow/models/research/oid +SPLIT=validation # or test +NUM_SHARDS=1 # Set to NUM_GPUS if using the parallel evaluation script above + +mkdir -p ${SPLIT}_eval_metrics + +echo " +label_map_path: '../object_detection/data/oid_bbox_trainable_label_map.pbtxt' +tf_record_input_reader: { input_path: '${SPLIT}_detections.tfrecord@${NUM_SHARDS}' } +" > ${SPLIT}_eval_metrics/${SPLIT}_input_config.pbtxt + +echo " +metrics_set: 'open_images_V2_detection_metrics' +" > ${SPLIT}_eval_metrics/${SPLIT}_eval_config.pbtxt +``` + +And then run: + +```bash +# From tensorflow/models/research/oid +SPLIT=validation # or test + +PYTHONPATH=$PYTHONPATH:$(readlink -f ..) \ +python -m object_detection/metrics/offline_eval_map_corloc \ + --eval_dir=${SPLIT}_eval_metrics \ + --eval_config_path=${SPLIT}_eval_metrics/${SPLIT}_eval_config.pbtxt \ + --input_config_path=${SPLIT}_eval_metrics/${SPLIT}_input_config.pbtxt +``` + +The first configuration file contains an `object_detection.protos.InputReader` +message that describes the location of the necessary input files. The second +file contains an `object_detection.protos.EvalConfig` message that describes the +evaluation metric. For more information about these protos see the corresponding +source files. + +### Expected mAPs {#expected-maps} + +The result of running `offline_eval_map_corloc` is a CSV file located at +`${SPLIT}_eval_metrics/metrics.csv`. With the above configuration, the file will +contain average precision at IoU≥0.5 for each of the classes present in the +dataset. It will also contain the mAP@IoU≥0.5. Both the per-class average +precisions and the mAP are computed according to the [Open Images evaluation +protocol](evaluation_protocols.md). The expected mAPs for the validation and +test sets of Open Images in this case are: + +Set | Fraction of data | Images | mAP@IoU≥0.5 +---------: | :--------------: | :-----: | ----------- +validation | everything | 41,620 | 39.2% +validation | first 2 shards | 884 | 52.4% +test | everything | 125,436 | 37.7% +test | first 2 shards | 2,476 | 50.8% diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/preparing_inputs.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/preparing_inputs.md new file mode 100644 index 0000000000000000000000000000000000000000..d9d290d2928c1a163e250e4bfc3d05e01cb12d99 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/preparing_inputs.md @@ -0,0 +1,57 @@ +# Preparing Inputs + +Tensorflow Object Detection API reads data using the TFRecord file format. Two +sample scripts (`create_pascal_tf_record.py` and `create_pet_tf_record.py`) are +provided to convert from the PASCAL VOC dataset and Oxford-IIIT Pet dataset to +TFRecords. + +## Generating the PASCAL VOC TFRecord files. + +The raw 2012 PASCAL VOC data set is located +[here](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar). 
+To download, extract and convert it to TFRecords, run the following
+commands:
+
+```bash
+# From tensorflow/models/research/
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
+tar -xvf VOCtrainval_11-May-2012.tar
+python object_detection/dataset_tools/create_pascal_tf_record.py \
+    --label_map_path=object_detection/data/pascal_label_map.pbtxt \
+    --data_dir=VOCdevkit --year=VOC2012 --set=train \
+    --output_path=pascal_train.record
+python object_detection/dataset_tools/create_pascal_tf_record.py \
+    --label_map_path=object_detection/data/pascal_label_map.pbtxt \
+    --data_dir=VOCdevkit --year=VOC2012 --set=val \
+    --output_path=pascal_val.record
+```
+
+You should end up with two TFRecord files named `pascal_train.record` and
+`pascal_val.record` in the `tensorflow/models/research/` directory.
+
+The label map for the PASCAL VOC data set can be found at
+`object_detection/data/pascal_label_map.pbtxt`.
+
+## Generating the Oxford-IIIT Pet TFRecord files.
+
+The Oxford-IIIT Pet data set is located
+[here](http://www.robots.ox.ac.uk/~vgg/data/pets/). To download, extract and
+convert it to TFRecords, run the following commands:
+
+```bash
+# From tensorflow/models/research/
+wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz
+wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz
+tar -xvf annotations.tar.gz
+tar -xvf images.tar.gz
+python object_detection/dataset_tools/create_pet_tf_record.py \
+    --label_map_path=object_detection/data/pet_label_map.pbtxt \
+    --data_dir=`pwd` \
+    --output_dir=`pwd`
+```
+
+You should end up with two TFRecord files named `pet_train.record` and
+`pet_val.record` in the `tensorflow/models/research/` directory.
+
+The label map for the Pet dataset can be found at
+`object_detection/data/pet_label_map.pbtxt`.
diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/running_locally.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/running_locally.md
new file mode 100644
index 0000000000000000000000000000000000000000..b143a9b7b5bebe7a70363ac3ea118504d7eb75d8
--- /dev/null
+++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/running_locally.md
@@ -0,0 +1,81 @@
+# Running Locally
+
+This page walks through the steps required to train an object detection model
+on a local machine. It assumes the reader has completed the
+following prerequisites:
+
+1. The Tensorflow Object Detection API has been installed as documented in the
+[installation instructions](installation.md). This includes installing library
+dependencies, compiling the configuration protobufs and setting up the Python
+environment.
+2. A valid data set has been created. See [this page](preparing_inputs.md) for
+instructions on how to generate a dataset for the PASCAL VOC challenge or the
+Oxford-IIIT Pet dataset.
+3. An Object Detection pipeline configuration has been written. See
+[this page](configuring_jobs.md) for details on how to write a pipeline configuration.
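+
+The training and evaluation commands below refer to the pipeline config and the
+train/eval directories through shell variables. One hypothetical way to set
+them, assuming the directory layout recommended in the next section and a
+config file named `pipeline.config`:
+
+```bash
+PATH_TO_YOUR_PIPELINE_CONFIG=models/model/pipeline.config
+PATH_TO_TRAIN_DIR=models/model/train
+PATH_TO_EVAL_DIR=models/model/eval
+```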
+ +## Recommended Directory Structure for Training and Evaluation + +``` ++data + -label_map file + -train TFRecord file + -eval TFRecord file ++models + + model + -pipeline config file + +train + +eval +``` + +## Running the Training Job + +A local training job can be run with the following command: + +```bash +# From the tensorflow/models/research/ directory +python object_detection/train.py \ + --logtostderr \ + --pipeline_config_path=${PATH_TO_YOUR_PIPELINE_CONFIG} \ + --train_dir=${PATH_TO_TRAIN_DIR} +``` + +where `${PATH_TO_YOUR_PIPELINE_CONFIG}` points to the pipeline config and +`${PATH_TO_TRAIN_DIR}` points to the directory in which training checkpoints +and events will be written to. By default, the training job will +run indefinitely until the user kills it. + +## Running the Evaluation Job + +Evaluation is run as a separate job. The eval job will periodically poll the +train directory for new checkpoints and evaluate them on a test dataset. The +job can be run using the following command: + +```bash +# From the tensorflow/models/research/ directory +python object_detection/eval.py \ + --logtostderr \ + --pipeline_config_path=${PATH_TO_YOUR_PIPELINE_CONFIG} \ + --checkpoint_dir=${PATH_TO_TRAIN_DIR} \ + --eval_dir=${PATH_TO_EVAL_DIR} +``` + +where `${PATH_TO_YOUR_PIPELINE_CONFIG}` points to the pipeline config, +`${PATH_TO_TRAIN_DIR}` points to the directory in which training checkpoints +were saved (same as the training job) and `${PATH_TO_EVAL_DIR}` points to the +directory in which evaluation events will be saved. As with the training job, +the eval job run until terminated by default. + +## Running Tensorboard + +Progress for training and eval jobs can be inspected using Tensorboard. If +using the recommended directory structure, Tensorboard can be run using the +following command: + +```bash +tensorboard --logdir=${PATH_TO_MODEL_DIRECTORY} +``` + +where `${PATH_TO_MODEL_DIRECTORY}` points to the directory that contains the +train and eval directories. Please note it may take Tensorboard a couple minutes +to populate with data. diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/running_notebook.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/running_notebook.md new file mode 100644 index 0000000000000000000000000000000000000000..c2b8ad1876258d023a997d1166b0d269c2f10f48 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/running_notebook.md @@ -0,0 +1,15 @@ +# Quick Start: Jupyter notebook for off-the-shelf inference + +If you'd like to hit the ground running and run detection on a few example +images right out of the box, we recommend trying out the Jupyter notebook demo. +To run the Jupyter notebook, run the following command from +`tensorflow/models/research/object_detection`: + +``` +# From tensorflow/models/research/object_detection +jupyter notebook +``` + +The notebook should open in your favorite web browser. Click the +[`object_detection_tutorial.ipynb`](../object_detection_tutorial.ipynb) link to +open the demo. 
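+
+If you are working on a machine without a browser (for example, over SSH), one
+optional alternative, assuming `nbconvert` is available in your Jupyter
+installation, is to execute the notebook non-interactively:
+
+```
+# From tensorflow/models/research/object_detection
+jupyter nbconvert --to notebook --execute object_detection_tutorial.ipynb \
+  --output tutorial_executed.ipynb
+```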
diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/running_on_cloud.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/running_on_cloud.md
new file mode 100644
index 0000000000000000000000000000000000000000..b96e2419f1e34bf4e3765334160d100a2fff2bfc
--- /dev/null
+++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/running_on_cloud.md
@@ -0,0 +1,132 @@
+# Running on Google Cloud Platform
+
+The Tensorflow Object Detection API supports distributed training on Google
+Cloud ML Engine. This section documents instructions on how to train and
+evaluate your model using Cloud ML. The reader should complete the following
+prerequisites:
+
+1. The reader has created and configured a project on Google Cloud Platform.
+See [the Cloud ML quick start guide](https://cloud.google.com/ml-engine/docs/quickstarts/command-line).
+2. The reader has installed the Tensorflow Object Detection API as documented
+in the [installation instructions](installation.md).
+3. The reader has a valid data set and stored it in a Google Cloud Storage
+bucket. See [this page](preparing_inputs.md) for instructions on how to generate
+a dataset for the PASCAL VOC challenge or the Oxford-IIIT Pet dataset.
+4. The reader has configured a valid Object Detection pipeline, and stored it
+in a Google Cloud Storage bucket. See [this page](configuring_jobs.md) for
+details on how to write a pipeline configuration.
+
+Additionally, it is recommended users test their job by running training and
+evaluation jobs for a few iterations
+[locally on their own machines](running_locally.md).
+
+## Packaging
+
+In order to run the Tensorflow Object Detection API on Cloud ML, it must be
+packaged (along with its TF-Slim dependency). The required packages can be
+created with the following command:
+
+``` bash
+# From tensorflow/models/research/
+python setup.py sdist
+(cd slim && python setup.py sdist)
+```
+
+This will create python packages in dist/object_detection-0.1.tar.gz and
+slim/dist/slim-0.1.tar.gz.
+
+## Running a Multiworker Training Job
+
+Google Cloud ML requires a YAML configuration file for a multiworker training
+job using GPUs. A sample YAML file is given below:
+
+```
+trainingInput:
+  runtimeVersion: "1.2"
+  scaleTier: CUSTOM
+  masterType: standard_gpu
+  workerCount: 9
+  workerType: standard_gpu
+  parameterServerCount: 3
+  parameterServerType: standard
+```
+
+Please keep the following guidelines in mind when writing the YAML
+configuration:
+
+* A job with n workers will have n + 1 training machines (n workers + 1 master).
+* The number of parameter servers used should be an odd number to prevent
+  a parameter server from storing only weight variables or only bias variables
+  (due to round robin parameter scheduling).
+* The learning rate in the training config should be decreased when using a
+  larger number of workers. Some experimentation is required to find the
+  optimal learning rate.
+
+The YAML file should be saved on the local machine (not on GCP).
Once it has +been written, a user can start a training job on Cloud ML Engine using the +following command: + +``` bash +# From tensorflow/models/research/ +gcloud ml-engine jobs submit training object_detection_`date +%s` \ + --runtime-version 1.2 \ + --job-dir=gs://${TRAIN_DIR} \ + --packages dist/object_detection-0.1.tar.gz,slim/dist/slim-0.1.tar.gz \ + --module-name object_detection.train \ + --region us-central1 \ + --config ${PATH_TO_LOCAL_YAML_FILE} \ + -- \ + --train_dir=gs://${TRAIN_DIR} \ + --pipeline_config_path=gs://${PIPELINE_CONFIG_PATH} +``` + +Where `${PATH_TO_LOCAL_YAML_FILE}` is the local path to the YAML configuration, +`gs://${TRAIN_DIR}` specifies the directory on Google Cloud Storage where the +training checkpoints and events will be written to and +`gs://${PIPELINE_CONFIG_PATH}` points to the pipeline configuration stored on +Google Cloud Storage. + +Users can monitor the progress of their training job on the [ML Engine +Dashboard](https://console.cloud.google.com/mlengine/jobs). + +Note: This sample is supported for use with 1.2 runtime version. + +## Running an Evaluation Job on Cloud + +Evaluation jobs run on a single machine, so it is not necessary to write a YAML +configuration for evaluation. Run the following command to start the evaluation +job: + +``` bash +gcloud ml-engine jobs submit training object_detection_eval_`date +%s` \ + --runtime-version 1.2 \ + --job-dir=gs://${TRAIN_DIR} \ + --packages dist/object_detection-0.1.tar.gz,slim/dist/slim-0.1.tar.gz \ + --module-name object_detection.eval \ + --region us-central1 \ + --scale-tier BASIC_GPU \ + -- \ + --checkpoint_dir=gs://${TRAIN_DIR} \ + --eval_dir=gs://${EVAL_DIR} \ + --pipeline_config_path=gs://${PIPELINE_CONFIG_PATH} +``` + +Where `gs://${TRAIN_DIR}` points to the directory on Google Cloud Storage where +training checkpoints are saved (same as the training job), `gs://${EVAL_DIR}` +points to where evaluation events will be saved on Google Cloud Storage and +`gs://${PIPELINE_CONFIG_PATH}` points to where the pipeline configuration is +stored on Google Cloud Storage. + +## Running Tensorboard + +You can run Tensorboard locally on your own machine to view progress of your +training and eval jobs on Google Cloud ML. Run the following command to start +Tensorboard: + +``` bash +tensorboard --logdir=gs://${YOUR_CLOUD_BUCKET} +``` + +Note it may Tensorboard a few minutes to populate with results. diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/running_pets.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/running_pets.md new file mode 100644 index 0000000000000000000000000000000000000000..d8ccac4132e24af978b223860ff79d34f2e8dd26 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/running_pets.md @@ -0,0 +1,336 @@ +# Quick Start: Distributed Training on the Oxford-IIIT Pets Dataset on Google Cloud + +This page is a walkthrough for training an object detector using the Tensorflow +Object Detection API. In this tutorial, we'll be training on the Oxford-IIIT Pets +dataset to build a system to detect various breeds of cats and dogs. The output +of the detector will look like the following: + +![](img/oxford_pet.png) + +## Setting up a Project on Google Cloud + +To accelerate the process, we'll run training and evaluation on [Google Cloud +ML Engine](https://cloud.google.com/ml-engine/) to leverage multiple GPUs. 
To +begin, you will have to set up Google Cloud via the following steps (if you have +already done this, feel free to skip to the next section): + +1. [Create a GCP project](https://cloud.google.com/resource-manager/docs/creating-managing-projects). +2. [Install the Google Cloud SDK](https://cloud.google.com/sdk/downloads) on +your workstation or laptop. +This will provide the tools you need to upload files to Google Cloud Storage and +start ML training jobs. +3. [Enable the ML Engine +APIs](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component&_ga=1.73374291.1570145678.1496689256). +By default, a new GCP project does not enable APIs to start ML Engine training +jobs. Use the above link to explicitly enable them. +4. [Set up a Google Cloud Storage (GCS) +bucket](https://cloud.google.com/storage/docs/creating-buckets). ML Engine +training jobs can only access files on a Google Cloud Storage bucket. In this +tutorial, we'll be required to upload our dataset and configuration to GCS. + +Please remember the name of your GCS bucket, as we will reference it multiple +times in this document. Substitute `${YOUR_GCS_BUCKET}` with the name of +your bucket in this document. For your convenience, you should define the +environment variable below: + +``` bash +export YOUR_GCS_BUCKET=${YOUR_GCS_BUCKET} +``` + +It is also possible to run locally by following +[the running locally instructions](running_locally.md). + +## Installing Tensorflow and the Tensorflow Object Detection API + +Please run through the [installation instructions](installation.md) to install +Tensorflow and all it dependencies. Ensure the Protobuf libraries are +compiled and the library directories are added to `PYTHONPATH`. + +## Getting the Oxford-IIIT Pets Dataset and Uploading it to Google Cloud Storage + +In order to train a detector, we require a dataset of images, bounding boxes and +classifications. For this demo, we'll use the Oxford-IIIT Pets dataset. The raw +dataset for Oxford-IIIT Pets lives +[here](http://www.robots.ox.ac.uk/~vgg/data/pets/). You will need to download +both the image dataset [`images.tar.gz`](http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz) +and the groundtruth data [`annotations.tar.gz`](http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz) +to the `tensorflow/models/research/` directory and unzip them. This may take +some time. + +``` bash +# From tensorflow/models/research/ +wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz +wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz +tar -xvf images.tar.gz +tar -xvf annotations.tar.gz +``` + +After downloading the tarballs, your `tensorflow/models/research/` directory +should appear as follows: + +```lang-none +- images.tar.gz +- annotations.tar.gz ++ images/ ++ annotations/ ++ object_detection/ +... other files and directories +``` + +The Tensorflow Object Detection API expects data to be in the TFRecord format, +so we'll now run the `create_pet_tf_record` script to convert from the raw +Oxford-IIIT Pet dataset into TFRecords. Run the following commands from the +`tensorflow/models/research/` directory: + +``` bash +# From tensorflow/models/research/ +python object_detection/dataset_tools/create_pet_tf_record.py \ + --label_map_path=object_detection/data/pet_label_map.pbtxt \ + --data_dir=`pwd` \ + --output_dir=`pwd` +``` + +Note: It is normal to see some warnings when running this script. You may ignore +them. 
+ +Two TFRecord files named `pet_train.record` and `pet_val.record` should be +generated in the `tensorflow/models/research/` directory. + +Now that the data has been generated, we'll need to upload it to Google Cloud +Storage so the data can be accessed by ML Engine. Run the following command to +copy the files into your GCS bucket (substituting `${YOUR_GCS_BUCKET}`): + +``` bash +# From tensorflow/models/research/ +gsutil cp pet_train.record gs://${YOUR_GCS_BUCKET}/data/pet_train.record +gsutil cp pet_val.record gs://${YOUR_GCS_BUCKET}/data/pet_val.record +gsutil cp object_detection/data/pet_label_map.pbtxt gs://${YOUR_GCS_BUCKET}/data/pet_label_map.pbtxt +``` + +Please remember the path where you upload the data to, as we will need this +information when configuring the pipeline in a following step. + +## Downloading a COCO-pretrained Model for Transfer Learning + +Training a state of the art object detector from scratch can take days, even +when using multiple GPUs! In order to speed up training, we'll take an object +detector trained on a different dataset (COCO), and reuse some of it's +parameters to initialize our new model. + +Download our [COCO-pretrained Faster R-CNN with Resnet-101 +model](http://storage.googleapis.com/download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_coco_11_06_2017.tar.gz). +Unzip the contents of the folder and copy the `model.ckpt*` files into your GCS +Bucket. + +``` bash +wget http://storage.googleapis.com/download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_coco_11_06_2017.tar.gz +tar -xvf faster_rcnn_resnet101_coco_11_06_2017.tar.gz +gsutil cp faster_rcnn_resnet101_coco_11_06_2017/model.ckpt.* gs://${YOUR_GCS_BUCKET}/data/ +``` + +Remember the path where you uploaded the model checkpoint to, as we will need it +in the following step. + +## Configuring the Object Detection Pipeline + +In the Tensorflow Object Detection API, the model parameters, training +parameters and eval parameters are all defined by a config file. More details +can be found [here](configuring_jobs.md). For this tutorial, we will use some +predefined templates provided with the source code. In the +`object_detection/samples/configs` folder, there are skeleton object_detection +configuration files. We will use `faster_rcnn_resnet101_pets.config` as a +starting point for configuring the pipeline. Open the file with your favourite +text editor. + +We'll need to configure some paths in order for the template to work. Search the +file for instances of `PATH_TO_BE_CONFIGURED` and replace them with the +appropriate value (typically `gs://${YOUR_GCS_BUCKET}/data/`). Afterwards +upload your edited file onto GCS, making note of the path it was uploaded to +(we'll need it when starting the training/eval jobs). + +``` bash +# From tensorflow/models/research/ + +# Edit the faster_rcnn_resnet101_pets.config template. Please note that there +# are multiple places where PATH_TO_BE_CONFIGURED needs to be set. +sed -i "s|PATH_TO_BE_CONFIGURED|"gs://${YOUR_GCS_BUCKET}"/data|g" \ + object_detection/samples/configs/faster_rcnn_resnet101_pets.config + +# Copy edited template to cloud. 
+gsutil cp object_detection/samples/configs/faster_rcnn_resnet101_pets.config \
+    gs://${YOUR_GCS_BUCKET}/data/faster_rcnn_resnet101_pets.config
+```
+
+## Checking Your Google Cloud Storage Bucket
+
+At this point in the tutorial, you should have uploaded the training/validation
+datasets (including the label map), the COCO-pretrained Faster R-CNN fine-tune
+checkpoint, and your job configuration to your Google Cloud Storage bucket.
+Your bucket should look like the following:
+
+```lang-none
++ ${YOUR_GCS_BUCKET}/
+  + data/
+    - faster_rcnn_resnet101_pets.config
+    - model.ckpt.index
+    - model.ckpt.meta
+    - model.ckpt.data-00000-of-00001
+    - pet_label_map.pbtxt
+    - pet_train.record
+    - pet_val.record
+```
+
+You can inspect your bucket using the [Google Cloud Storage
+browser](https://console.cloud.google.com/storage/browser).
+
+## Starting Training and Evaluation Jobs on Google Cloud ML Engine
+
+Before we can start a job on Google Cloud ML Engine, we must:
+
+1. Package the Tensorflow Object Detection code.
+2. Write a cluster configuration for our Google Cloud ML job.
+
+To package the Tensorflow Object Detection code, run the following commands from
+the `tensorflow/models/research/` directory:
+
+``` bash
+# From tensorflow/models/research/
+python setup.py sdist
+(cd slim && python setup.py sdist)
+```
+
+You should see two tar.gz files created at `dist/object_detection-0.1.tar.gz`
+and `slim/dist/slim-0.1.tar.gz`.
+
+For the training Cloud ML job, we'll configure the cluster to use 10 training
+machines (1 master + 9 workers) and three parameter servers. The configuration
+file can be found at `object_detection/samples/cloud/cloud.yml`.
+
+Note: This sample is supported for use with the 1.2 runtime version.
+
+To start training, execute the following command from the
+`tensorflow/models/research/` directory:
+
+``` bash
+# From tensorflow/models/research/
+gcloud ml-engine jobs submit training `whoami`_object_detection_`date +%s` \
+    --runtime-version 1.2 \
+    --job-dir=gs://${YOUR_GCS_BUCKET}/train \
+    --packages dist/object_detection-0.1.tar.gz,slim/dist/slim-0.1.tar.gz \
+    --module-name object_detection.train \
+    --region us-central1 \
+    --config object_detection/samples/cloud/cloud.yml \
+    -- \
+    --train_dir=gs://${YOUR_GCS_BUCKET}/train \
+    --pipeline_config_path=gs://${YOUR_GCS_BUCKET}/data/faster_rcnn_resnet101_pets.config
+```
+
+Once training has started, we can run an evaluation concurrently:
+
+``` bash
+# From tensorflow/models/research/
+gcloud ml-engine jobs submit training `whoami`_object_detection_eval_`date +%s` \
+    --runtime-version 1.2 \
+    --job-dir=gs://${YOUR_GCS_BUCKET}/train \
+    --packages dist/object_detection-0.1.tar.gz,slim/dist/slim-0.1.tar.gz \
+    --module-name object_detection.eval \
+    --region us-central1 \
+    --scale-tier BASIC_GPU \
+    -- \
+    --checkpoint_dir=gs://${YOUR_GCS_BUCKET}/train \
+    --eval_dir=gs://${YOUR_GCS_BUCKET}/eval \
+    --pipeline_config_path=gs://${YOUR_GCS_BUCKET}/data/faster_rcnn_resnet101_pets.config
+```
+
+Note: Even though we're running an evaluation job, the `gcloud ml-engine jobs
+submit training` command is correct. ML Engine does not distinguish between
+training and evaluation jobs.
+
+Users can monitor and stop training and evaluation jobs on the [ML Engine
+Dashboard](https://console.cloud.google.com/mlengine/jobs).
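+
+If you prefer the command line, the same jobs can be inspected and stopped with
+`gcloud`. A few illustrative commands follow; `${JOB_ID}` is a placeholder for
+the job name printed by the submit commands above:
+
+``` bash
+# List recently submitted ML Engine jobs and their current states.
+gcloud ml-engine jobs list
+
+# Stream the logs of a running job.
+gcloud ml-engine jobs stream-logs ${JOB_ID}
+
+# Stop a job that is no longer needed.
+gcloud ml-engine jobs cancel ${JOB_ID}
+```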
+
+## Monitoring Progress with Tensorboard
+
+You can monitor progress of the training and eval jobs by running Tensorboard on
+your local machine:
+
+``` bash
+# This command needs to be run once to allow your local machine to access your
+# GCS bucket.
+gcloud auth application-default login
+
+tensorboard --logdir=gs://${YOUR_GCS_BUCKET}
+```
+
+Once Tensorboard is running, navigate to `localhost:6006` from your favourite
+web browser. You should see something similar to the following:
+
+![](img/tensorboard.png)
+
+You will also want to click on the images tab to see example detections made by
+the model while it trains. After about an hour and a half of training, you can
+expect to see something like this:
+
+![](img/tensorboard2.png)
+
+Note: It takes roughly 10 minutes for a job to get started on ML Engine, and
+roughly an hour for the system to evaluate the validation dataset. It may take
+some time to populate the dashboards. If you do not see any entries after half
+an hour, check the logs from the [ML Engine
+Dashboard](https://console.cloud.google.com/mlengine/jobs). Note that by default
+the training jobs are configured to run for much longer than is necessary for
+convergence. To save money, we recommend killing your jobs once you've seen
+that they've converged.
+
+## Exporting the Tensorflow Graph
+
+After your model has been trained, you should export it to a Tensorflow
+graph proto. First, you need to identify a candidate checkpoint to export. You
+can search your bucket using the [Google Cloud Storage
+Browser](https://console.cloud.google.com/storage/browser). The checkpoint files
+are stored under `${YOUR_GCS_BUCKET}/train`. A checkpoint will typically consist
+of three files:
+
+* `model.ckpt-${CHECKPOINT_NUMBER}.data-00000-of-00001`
+* `model.ckpt-${CHECKPOINT_NUMBER}.index`
+* `model.ckpt-${CHECKPOINT_NUMBER}.meta`
+
+After you've identified a candidate checkpoint to export, run the following
+command from `tensorflow/models/research/`:
+
+``` bash
+# From tensorflow/models/research/
+gsutil cp gs://${YOUR_GCS_BUCKET}/train/model.ckpt-${CHECKPOINT_NUMBER}.* .
+python object_detection/export_inference_graph.py \
+    --input_type image_tensor \
+    --pipeline_config_path object_detection/samples/configs/faster_rcnn_resnet101_pets.config \
+    --trained_checkpoint_prefix model.ckpt-${CHECKPOINT_NUMBER} \
+    --output_directory exported_graphs
+```
+
+Afterwards, you should see a directory named `exported_graphs` containing the
+SavedModel and frozen graph.
+
+## Configuring the Instance Segmentation Pipeline
+
+Mask prediction can be turned on for an object detection config by adding
+`predict_instance_masks: true` within the `MaskRCNNBoxPredictor`. Other
+parameters, such as the mask size, the number of convolutions in the mask
+layer, and the convolution hyperparameters, can also be defined there. We will
+use `mask_rcnn_resnet101_pets.config` as a starting point for configuring the
+instance segmentation pipeline. Everything mentioned above about object
+detection also holds for instance segmentation: setting the training and other
+details aside, an instance segmentation model is simply an object detection
+model with an additional head that predicts an object mask inside each
+predicted box. Please refer to the section on [Running an Instance Segmentation
+Model](instance_segmentation.md) for instructions on how to configure a model
+that predicts masks in addition to object bounding boxes.
+
+## What's Next
+
+Congratulations, you have now trained an object detector for various cats and
+dogs!
There are a few different things you can do now:
+
+1. [Test your exported model using the provided Jupyter notebook.](running_notebook.md)
+2. [Experiment with different model configurations.](configuring_jobs.md)
+3. Train an object detector using your own data.
diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/using_your_own_dataset.md b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/using_your_own_dataset.md
new file mode 100644
index 0000000000000000000000000000000000000000..397e394ca41ef55695bdfbb8b0e52a138978e798
--- /dev/null
+++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/g3doc/using_your_own_dataset.md
@@ -0,0 +1,161 @@
+# Preparing Inputs
+
+To use your own dataset with the Tensorflow Object Detection API, you must
+convert it into the [TFRecord file format](https://www.tensorflow.org/api_guides/python/python_io#tfrecords_format_details).
+This document outlines how to write a script to generate the TFRecord file.
+
+## Label Maps
+
+Each dataset is required to have a label map associated with it. This label map
+defines a mapping from string class names to integer class IDs. The label map
+should be a `StringIntLabelMap` text protobuf. Sample label maps can be found in
+object_detection/data. Label maps should always start from id 1.
+
+## Dataset Requirements
+
+For every example in your dataset, you should have the following information:
+
+1. An RGB image for the dataset encoded as jpeg or png.
+2. A list of bounding boxes for the image. Each bounding box should contain:
+    1. The bounding box coordinates (with origin in the top left corner), given
+       as 4 floating point numbers [ymin, xmin, ymax, xmax]. Note that we store
+       the _normalized_ coordinates (x / width, y / height) in the TFRecord
+       dataset.
+    2. The class of the object in the bounding box.
+
+## Example Image
+
+Consider the following image:
+
+![Example Image](img/example_cat.jpg "Example Image")
+
+with the following label map:
+
+```
+item {
+  id: 1
+  name: 'Cat'
+}
+
+
+item {
+  id: 2
+  name: 'Dog'
+}
+```
+
+We can generate a tf.Example proto for this image using the following code:
+
+```python
+
+def create_cat_tf_example(encoded_cat_image_data):
+  """Creates a tf.Example proto from sample cat image.
+
+  Args:
+    encoded_cat_image_data: The jpg encoded data of the cat image.
+
+  Returns:
+    example: The created tf.Example.
+ """ + + height = 1032.0 + width = 1200.0 + filename = 'example_cat.jpg' + image_format = b'jpg' + + xmins = [322.0 / 1200.0] + xmaxs = [1062.0 / 1200.0] + ymins = [174.0 / 1032.0] + ymaxs = [761.0 / 1032.0] + classes_text = ['Cat'] + classes = [1] + + tf_example = tf.train.Example(features=tf.train.Features(feature={ + 'image/height': dataset_util.int64_feature(height), + 'image/width': dataset_util.int64_feature(width), + 'image/filename': dataset_util.bytes_feature(filename), + 'image/source_id': dataset_util.bytes_feature(filename), + 'image/encoded': dataset_util.bytes_feature(encoded_image_data), + 'image/format': dataset_util.bytes_feature(image_format), + 'image/object/bbox/xmin': dataset_util.float_list_feature(xmins), + 'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs), + 'image/object/bbox/ymin': dataset_util.float_list_feature(ymins), + 'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs), + 'image/object/class/text': dataset_util.bytes_list_feature(classes_text), + 'image/object/class/label': dataset_util.int64_list_feature(classes), + })) + return tf_example +``` + +## Conversion Script Outline + +A typical conversion script will look like the following: + +```python + +import tensorflow as tf + +from object_detection.utils import dataset_util + + +flags = tf.app.flags +flags.DEFINE_string('output_path', '', 'Path to output TFRecord') +FLAGS = flags.FLAGS + + +def create_tf_example(example): + # TODO(user): Populate the following variables from your example. + height = None # Image height + width = None # Image width + filename = None # Filename of the image. Empty if image is not from file + encoded_image_data = None # Encoded image bytes + image_format = None # b'jpeg' or b'png' + + xmins = [] # List of normalized left x coordinates in bounding box (1 per box) + xmaxs = [] # List of normalized right x coordinates in bounding box + # (1 per box) + ymins = [] # List of normalized top y coordinates in bounding box (1 per box) + ymaxs = [] # List of normalized bottom y coordinates in bounding box + # (1 per box) + classes_text = [] # List of string class name of bounding box (1 per box) + classes = [] # List of integer class id of bounding box (1 per box) + + tf_example = tf.train.Example(features=tf.train.Features(feature={ + 'image/height': dataset_util.int64_feature(height), + 'image/width': dataset_util.int64_feature(width), + 'image/filename': dataset_util.bytes_feature(filename), + 'image/source_id': dataset_util.bytes_feature(filename), + 'image/encoded': dataset_util.bytes_feature(encoded_image_data), + 'image/format': dataset_util.bytes_feature(image_format), + 'image/object/bbox/xmin': dataset_util.float_list_feature(xmins), + 'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs), + 'image/object/bbox/ymin': dataset_util.float_list_feature(ymins), + 'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs), + 'image/object/class/text': dataset_util.bytes_list_feature(classes_text), + 'image/object/class/label': dataset_util.int64_list_feature(classes), + })) + return tf_example + + +def main(_): + writer = tf.python_io.TFRecordWriter(FLAGS.output_path) + + # TODO(user): Write code to read in your dataset to examples variable + + for example in examples: + tf_example = create_tf_example(example) + writer.write(tf_example.SerializeToString()) + + writer.close() + + +if __name__ == '__main__': + tf.app.run() + +``` + +Note: You may notice additional fields in some other datasets. 
They are +currently unused by the API and are optional. + +Note: Please refer to the section on [Running an Instance Segmentation +Model](instance_segmentation.md) for instructions on how to configure a model +that predicts masks in addition to object bounding boxes. diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/inference/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/inference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/inference/detection_inference.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/inference/detection_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..dc66686ff1f496935ad2cd05ee5f969fa1306a8d --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/inference/detection_inference.py @@ -0,0 +1,141 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utility functions for detection inference.""" +from __future__ import division + +import tensorflow as tf + +from object_detection.core import standard_fields + + +def build_input(tfrecord_paths): + """Builds the graph's input. + + Args: + tfrecord_paths: List of paths to the input TFRecords + + Returns: + serialized_example_tensor: The next serialized example. String scalar Tensor + image_tensor: The decoded image of the example. Uint8 tensor, + shape=[1, None, None,3] + """ + filename_queue = tf.train.string_input_producer( + tfrecord_paths, shuffle=False, num_epochs=1) + + tf_record_reader = tf.TFRecordReader() + _, serialized_example_tensor = tf_record_reader.read(filename_queue) + features = tf.parse_single_example( + serialized_example_tensor, + features={ + standard_fields.TfExampleFields.image_encoded: + tf.FixedLenFeature([], tf.string), + }) + encoded_image = features[standard_fields.TfExampleFields.image_encoded] + image_tensor = tf.image.decode_image(encoded_image, channels=3) + image_tensor.set_shape([None, None, 3]) + image_tensor = tf.expand_dims(image_tensor, 0) + + return serialized_example_tensor, image_tensor + + +def build_inference_graph(image_tensor, inference_graph_path): + """Loads the inference graph and connects it to the input image. + + Args: + image_tensor: The input image. uint8 tensor, shape=[1, None, None, 3] + inference_graph_path: Path to the inference graph with embedded weights + + Returns: + detected_boxes_tensor: Detected boxes. Float tensor, + shape=[num_detections, 4] + detected_scores_tensor: Detected scores. Float tensor, + shape=[num_detections] + detected_labels_tensor: Detected labels. 
Int64 tensor, + shape=[num_detections] + """ + with tf.gfile.Open(inference_graph_path, 'r') as graph_def_file: + graph_content = graph_def_file.read() + graph_def = tf.GraphDef() + graph_def.MergeFromString(graph_content) + + tf.import_graph_def( + graph_def, name='', input_map={'image_tensor': image_tensor}) + + g = tf.get_default_graph() + + num_detections_tensor = tf.squeeze( + g.get_tensor_by_name('num_detections:0'), 0) + num_detections_tensor = tf.cast(num_detections_tensor, tf.int32) + + detected_boxes_tensor = tf.squeeze( + g.get_tensor_by_name('detection_boxes:0'), 0) + detected_boxes_tensor = detected_boxes_tensor[:num_detections_tensor] + + detected_scores_tensor = tf.squeeze( + g.get_tensor_by_name('detection_scores:0'), 0) + detected_scores_tensor = detected_scores_tensor[:num_detections_tensor] + + detected_labels_tensor = tf.squeeze( + g.get_tensor_by_name('detection_classes:0'), 0) + detected_labels_tensor = tf.cast(detected_labels_tensor, tf.int64) + detected_labels_tensor = detected_labels_tensor[:num_detections_tensor] + + return detected_boxes_tensor, detected_scores_tensor, detected_labels_tensor + + +def infer_detections_and_add_to_example( + serialized_example_tensor, detected_boxes_tensor, detected_scores_tensor, + detected_labels_tensor, discard_image_pixels): + """Runs the supplied tensors and adds the inferred detections to the example. + + Args: + serialized_example_tensor: Serialized TF example. Scalar string tensor + detected_boxes_tensor: Detected boxes. Float tensor, + shape=[num_detections, 4] + detected_scores_tensor: Detected scores. Float tensor, + shape=[num_detections] + detected_labels_tensor: Detected labels. Int64 tensor, + shape=[num_detections] + discard_image_pixels: If true, discards the image from the result + Returns: + The de-serialized TF example augmented with the inferred detections. + """ + tf_example = tf.train.Example() + (serialized_example, detected_boxes, detected_scores, + detected_classes) = tf.get_default_session().run([ + serialized_example_tensor, detected_boxes_tensor, detected_scores_tensor, + detected_labels_tensor + ]) + detected_boxes = detected_boxes.T + + tf_example.ParseFromString(serialized_example) + feature = tf_example.features.feature + feature[standard_fields.TfExampleFields. + detection_score].float_list.value[:] = detected_scores + feature[standard_fields.TfExampleFields. + detection_bbox_ymin].float_list.value[:] = detected_boxes[0] + feature[standard_fields.TfExampleFields. + detection_bbox_xmin].float_list.value[:] = detected_boxes[1] + feature[standard_fields.TfExampleFields. + detection_bbox_ymax].float_list.value[:] = detected_boxes[2] + feature[standard_fields.TfExampleFields. + detection_bbox_xmax].float_list.value[:] = detected_boxes[3] + feature[standard_fields.TfExampleFields. + detection_class_label].int64_list.value[:] = detected_classes + + if discard_image_pixels: + del feature[standard_fields.TfExampleFields.image_encoded] + + return tf_example diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/inference/detection_inference_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/inference/detection_inference_test.py new file mode 100644 index 0000000000000000000000000000000000000000..eabb6b474d672a48139cb4cdeebd388a4d5c4fca --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/inference/detection_inference_test.py @@ -0,0 +1,176 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +r"""Tests for detection_inference.py.""" + +import os +import StringIO + +import numpy as np +from PIL import Image +import tensorflow as tf + +from object_detection.core import standard_fields +from object_detection.inference import detection_inference +from object_detection.utils import dataset_util + + +def get_mock_tfrecord_path(): + return os.path.join(tf.test.get_temp_dir(), 'mock.tfrec') + + +def create_mock_tfrecord(): + pil_image = Image.fromarray(np.array([[[123, 0, 0]]], dtype=np.uint8), 'RGB') + image_output_stream = StringIO.StringIO() + pil_image.save(image_output_stream, format='png') + encoded_image = image_output_stream.getvalue() + + feature_map = { + 'test_field': + dataset_util.float_list_feature([1, 2, 3, 4]), + standard_fields.TfExampleFields.image_encoded: + dataset_util.bytes_feature(encoded_image), + } + + tf_example = tf.train.Example(features=tf.train.Features(feature=feature_map)) + with tf.python_io.TFRecordWriter(get_mock_tfrecord_path()) as writer: + writer.write(tf_example.SerializeToString()) + + +def get_mock_graph_path(): + return os.path.join(tf.test.get_temp_dir(), 'mock_graph.pb') + + +def create_mock_graph(): + g = tf.Graph() + with g.as_default(): + in_image_tensor = tf.placeholder( + tf.uint8, shape=[1, None, None, 3], name='image_tensor') + tf.constant([2.0], name='num_detections') + tf.constant( + [[[0, 0.8, 0.7, 1], [0.1, 0.2, 0.8, 0.9], [0.2, 0.3, 0.4, 0.5]]], + name='detection_boxes') + tf.constant([[0.1, 0.2, 0.3]], name='detection_scores') + tf.identity( + tf.constant([[1.0, 2.0, 3.0]]) * + tf.reduce_sum(tf.cast(in_image_tensor, dtype=tf.float32)), + name='detection_classes') + graph_def = g.as_graph_def() + + with tf.gfile.Open(get_mock_graph_path(), 'w') as fl: + fl.write(graph_def.SerializeToString()) + + +class InferDetectionsTests(tf.test.TestCase): + + def test_simple(self): + create_mock_graph() + create_mock_tfrecord() + + serialized_example_tensor, image_tensor = detection_inference.build_input( + [get_mock_tfrecord_path()]) + self.assertAllEqual(image_tensor.get_shape().as_list(), [1, None, None, 3]) + + (detected_boxes_tensor, detected_scores_tensor, + detected_labels_tensor) = detection_inference.build_inference_graph( + image_tensor, get_mock_graph_path()) + + with self.test_session(use_gpu=False) as sess: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + tf.train.start_queue_runners() + + tf_example = detection_inference.infer_detections_and_add_to_example( + serialized_example_tensor, detected_boxes_tensor, + detected_scores_tensor, detected_labels_tensor, False) + + self.assertProtoEquals(r""" + features { + feature { + key: "image/detection/bbox/ymin" + value { float_list { value: [0.0, 0.1] } } } + feature { + key: "image/detection/bbox/xmin" + value { float_list { value: [0.8, 0.2] } } } + feature { + key: "image/detection/bbox/ymax" + value { float_list { 
value: [0.7, 0.8] } } } + feature { + key: "image/detection/bbox/xmax" + value { float_list { value: [1.0, 0.9] } } } + feature { + key: "image/detection/label" + value { int64_list { value: [123, 246] } } } + feature { + key: "image/detection/score" + value { float_list { value: [0.1, 0.2] } } } + feature { + key: "image/encoded" + value { bytes_list { value: + "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\001\000\000" + "\000\001\010\002\000\000\000\220wS\336\000\000\000\022IDATx" + "\234b\250f`\000\000\000\000\377\377\003\000\001u\000|gO\242" + "\213\000\000\000\000IEND\256B`\202" } } } + feature { + key: "test_field" + value { float_list { value: [1.0, 2.0, 3.0, 4.0] } } } } + """, tf_example) + + def test_discard_image(self): + create_mock_graph() + create_mock_tfrecord() + + serialized_example_tensor, image_tensor = detection_inference.build_input( + [get_mock_tfrecord_path()]) + (detected_boxes_tensor, detected_scores_tensor, + detected_labels_tensor) = detection_inference.build_inference_graph( + image_tensor, get_mock_graph_path()) + + with self.test_session(use_gpu=False) as sess: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + tf.train.start_queue_runners() + + tf_example = detection_inference.infer_detections_and_add_to_example( + serialized_example_tensor, detected_boxes_tensor, + detected_scores_tensor, detected_labels_tensor, True) + + self.assertProtoEquals(r""" + features { + feature { + key: "image/detection/bbox/ymin" + value { float_list { value: [0.0, 0.1] } } } + feature { + key: "image/detection/bbox/xmin" + value { float_list { value: [0.8, 0.2] } } } + feature { + key: "image/detection/bbox/ymax" + value { float_list { value: [0.7, 0.8] } } } + feature { + key: "image/detection/bbox/xmax" + value { float_list { value: [1.0, 0.9] } } } + feature { + key: "image/detection/label" + value { int64_list { value: [123, 246] } } } + feature { + key: "image/detection/score" + value { float_list { value: [0.1, 0.2] } } } + feature { + key: "test_field" + value { float_list { value: [1.0, 2.0, 3.0, 4.0] } } } } + """, tf_example) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/inference/infer_detections.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/inference/infer_detections.py new file mode 100644 index 0000000000000000000000000000000000000000..a251009ef0d415e08395be038dbc4ed42d804ff7 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/inference/infer_detections.py @@ -0,0 +1,96 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +r"""Infers detections on a TFRecord of TFExamples given an inference graph. 
+ +Example usage: + ./infer_detections \ + --input_tfrecord_paths=/path/to/input/tfrecord1,/path/to/input/tfrecord2 \ + --output_tfrecord_path_prefix=/path/to/output/detections.tfrecord \ + --inference_graph=/path/to/frozen_weights_inference_graph.pb + +The output is a TFRecord of TFExamples. Each TFExample from the input is first +augmented with detections from the inference graph and then copied to the +output. + +The input and output nodes of the inference graph are expected to have the same +types, shapes, and semantics, as the input and output nodes of graphs produced +by export_inference_graph.py, when run with --input_type=image_tensor. + +The script can also discard the image pixels in the output. This greatly +reduces the output size and can potentially accelerate reading data in +subsequent processing steps that don't require the images (e.g. computing +metrics). +""" + +import itertools +import tensorflow as tf +from object_detection.inference import detection_inference + +tf.flags.DEFINE_string('input_tfrecord_paths', None, + 'A comma separated list of paths to input TFRecords.') +tf.flags.DEFINE_string('output_tfrecord_path', None, + 'Path to the output TFRecord.') +tf.flags.DEFINE_string('inference_graph', None, + 'Path to the inference graph with embedded weights.') +tf.flags.DEFINE_boolean('discard_image_pixels', False, + 'Discards the images in the output TFExamples. This' + ' significantly reduces the output size and is useful' + ' if the subsequent tools don\'t need access to the' + ' images (e.g. when computing evaluation measures).') + +FLAGS = tf.flags.FLAGS + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + required_flags = ['input_tfrecord_paths', 'output_tfrecord_path', + 'inference_graph'] + for flag_name in required_flags: + if not getattr(FLAGS, flag_name): + raise ValueError('Flag --{} is required'.format(flag_name)) + + with tf.Session() as sess: + input_tfrecord_paths = [ + v for v in FLAGS.input_tfrecord_paths.split(',') if v] + tf.logging.info('Reading input from %d files', len(input_tfrecord_paths)) + serialized_example_tensor, image_tensor = detection_inference.build_input( + input_tfrecord_paths) + tf.logging.info('Reading graph and building model...') + (detected_boxes_tensor, detected_scores_tensor, + detected_labels_tensor) = detection_inference.build_inference_graph( + image_tensor, FLAGS.inference_graph) + + tf.logging.info('Running inference and writing output to {}'.format( + FLAGS.output_tfrecord_path)) + sess.run(tf.local_variables_initializer()) + tf.train.start_queue_runners() + with tf.python_io.TFRecordWriter( + FLAGS.output_tfrecord_path) as tf_record_writer: + try: + for counter in itertools.count(): + tf.logging.log_every_n(tf.logging.INFO, 'Processed %d images...', 10, + counter) + tf_example = detection_inference.infer_detections_and_add_to_example( + serialized_example_tensor, detected_boxes_tensor, + detected_scores_tensor, detected_labels_tensor, + FLAGS.discard_image_pixels) + tf_record_writer.write(tf_example.SerializeToString()) + except tf.errors.OutOfRangeError: + tf.logging.info('Finished processing records') + + +if __name__ == '__main__': + tf.app.run() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/inputs.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/inputs.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b3ae259d7deb1bea92b15f2bb3bdd5aa58ffb0 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/inputs.py @@ -0,0 
+1,440 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Model input function for tf-learn object detection model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools + +import tensorflow as tf +from object_detection.builders import dataset_builder +from object_detection.builders import image_resizer_builder +from object_detection.builders import model_builder +from object_detection.builders import preprocessor_builder +from object_detection.core import preprocessor +from object_detection.core import standard_fields as fields +from object_detection.data_decoders import tf_example_decoder +from object_detection.protos import eval_pb2 +from object_detection.protos import input_reader_pb2 +from object_detection.protos import model_pb2 +from object_detection.protos import train_pb2 +from object_detection.utils import config_util +from object_detection.utils import dataset_util +from object_detection.utils import ops as util_ops + +HASH_KEY = 'hash' +HASH_BINS = 1 << 31 +SERVING_FED_EXAMPLE_KEY = 'serialized_example' + +# A map of names to methods that help build the input pipeline. +INPUT_BUILDER_UTIL_MAP = { + 'dataset_build': dataset_builder.build, +} + + +def transform_input_data(tensor_dict, + model_preprocess_fn, + image_resizer_fn, + num_classes, + data_augmentation_fn=None, + merge_multiple_boxes=False, + retain_original_image=False): + """A single function that is responsible for all input data transformations. + + Data transformation functions are applied in the following order. + 1. If key fields.InputDataFields.image_additional_channels is present in + tensor_dict, the additional channels will be merged into + fields.InputDataFields.image. + 2. data_augmentation_fn (optional): applied on tensor_dict. + 3. model_preprocess_fn: applied only on image tensor in tensor_dict. + 4. image_resizer_fn: applied on original image and instance mask tensor in + tensor_dict. + 5. one_hot_encoding: applied to classes tensor in tensor_dict. + 6. merge_multiple_boxes (optional): when groundtruth boxes are exactly the + same they can be merged into a single box with an associated k-hot class + label. + + Args: + tensor_dict: dictionary containing input tensors keyed by + fields.InputDataFields. + model_preprocess_fn: model's preprocess function to apply on image tensor. + This function must take in a 4-D float tensor and return a 4-D preprocess + float tensor and a tensor containing the true image shape. + image_resizer_fn: image resizer function to apply on groundtruth instance + `masks. This function must take a 3-D float tensor of an image and a 3-D + tensor of instance masks and return a resized version of these along with + the true shapes. + num_classes: number of max classes to one-hot (or k-hot) encode the class + labels. 
+ data_augmentation_fn: (optional) data augmentation function to apply on + input `tensor_dict`. + merge_multiple_boxes: (optional) whether to merge multiple groundtruth boxes + and classes for a given image if the boxes are exactly the same. + retain_original_image: (optional) whether to retain original image in the + output dictionary. + + Returns: + A dictionary keyed by fields.InputDataFields containing the tensors obtained + after applying all the transformations. + """ + if fields.InputDataFields.image_additional_channels in tensor_dict: + channels = tensor_dict[fields.InputDataFields.image_additional_channels] + tensor_dict[fields.InputDataFields.image] = tf.concat( + [tensor_dict[fields.InputDataFields.image], channels], axis=2) + + if retain_original_image: + tensor_dict[fields.InputDataFields.original_image] = tf.cast( + tensor_dict[fields.InputDataFields.image], tf.uint8) + + # Apply data augmentation ops. + if data_augmentation_fn is not None: + tensor_dict = data_augmentation_fn(tensor_dict) + + # Apply model preprocessing ops and resize instance masks. + image = tensor_dict[fields.InputDataFields.image] + preprocessed_resized_image, true_image_shape = model_preprocess_fn( + tf.expand_dims(tf.to_float(image), axis=0)) + tensor_dict[fields.InputDataFields.image] = tf.squeeze( + preprocessed_resized_image, axis=0) + tensor_dict[fields.InputDataFields.true_image_shape] = tf.squeeze( + true_image_shape, axis=0) + if fields.InputDataFields.groundtruth_instance_masks in tensor_dict: + masks = tensor_dict[fields.InputDataFields.groundtruth_instance_masks] + _, resized_masks, _ = image_resizer_fn(image, masks) + tensor_dict[fields.InputDataFields. + groundtruth_instance_masks] = resized_masks + + # Transform groundtruth classes to one hot encodings. + label_offset = 1 + zero_indexed_groundtruth_classes = tensor_dict[ + fields.InputDataFields.groundtruth_classes] - label_offset + tensor_dict[fields.InputDataFields.groundtruth_classes] = tf.one_hot( + zero_indexed_groundtruth_classes, num_classes) + + if merge_multiple_boxes: + merged_boxes, merged_classes, _ = util_ops.merge_boxes_with_multiple_labels( + tensor_dict[fields.InputDataFields.groundtruth_boxes], + zero_indexed_groundtruth_classes, num_classes) + tensor_dict[fields.InputDataFields.groundtruth_boxes] = merged_boxes + tensor_dict[fields.InputDataFields.groundtruth_classes] = merged_classes + + return tensor_dict + + +def augment_input_data(tensor_dict, data_augmentation_options): + """Applies data augmentation ops to input tensors. + + Args: + tensor_dict: A dictionary of input tensors keyed by fields.InputDataFields. + data_augmentation_options: A list of tuples, where each tuple contains a + function and a dictionary that contains arguments and their values. + Usually, this is the output of core/preprocessor.build. + + Returns: + A dictionary of tensors obtained by applying data augmentation ops to the + input tensor dictionary. 
+ """ + tensor_dict[fields.InputDataFields.image] = tf.expand_dims( + tf.to_float(tensor_dict[fields.InputDataFields.image]), 0) + + include_instance_masks = (fields.InputDataFields.groundtruth_instance_masks + in tensor_dict) + include_keypoints = (fields.InputDataFields.groundtruth_keypoints + in tensor_dict) + tensor_dict = preprocessor.preprocess( + tensor_dict, data_augmentation_options, + func_arg_map=preprocessor.get_default_func_arg_map( + include_instance_masks=include_instance_masks, + include_keypoints=include_keypoints)) + tensor_dict[fields.InputDataFields.image] = tf.squeeze( + tensor_dict[fields.InputDataFields.image], axis=0) + return tensor_dict + + +def _get_labels_dict(input_dict): + """Extracts labels dict from input dict.""" + required_label_keys = [ + fields.InputDataFields.num_groundtruth_boxes, + fields.InputDataFields.groundtruth_boxes, + fields.InputDataFields.groundtruth_classes, + fields.InputDataFields.groundtruth_weights + ] + labels_dict = {} + for key in required_label_keys: + labels_dict[key] = input_dict[key] + + optional_label_keys = [ + fields.InputDataFields.groundtruth_keypoints, + fields.InputDataFields.groundtruth_instance_masks, + fields.InputDataFields.groundtruth_area, + fields.InputDataFields.groundtruth_is_crowd, + fields.InputDataFields.groundtruth_difficult + ] + + for key in optional_label_keys: + if key in input_dict: + labels_dict[key] = input_dict[key] + if fields.InputDataFields.groundtruth_difficult in labels_dict: + labels_dict[fields.InputDataFields.groundtruth_difficult] = tf.cast( + labels_dict[fields.InputDataFields.groundtruth_difficult], tf.int32) + return labels_dict + + +def _get_features_dict(input_dict): + """Extracts features dict from input dict.""" + hash_from_source_id = tf.string_to_hash_bucket_fast( + input_dict[fields.InputDataFields.source_id], HASH_BINS) + features = { + fields.InputDataFields.image: + input_dict[fields.InputDataFields.image], + HASH_KEY: tf.cast(hash_from_source_id, tf.int32), + fields.InputDataFields.true_image_shape: + input_dict[fields.InputDataFields.true_image_shape] + } + if fields.InputDataFields.original_image in input_dict: + features[fields.InputDataFields.original_image] = input_dict[ + fields.InputDataFields.original_image] + return features + + +def create_train_input_fn(train_config, train_input_config, + model_config): + """Creates a train `input` function for `Estimator`. + + Args: + train_config: A train_pb2.TrainConfig. + train_input_config: An input_reader_pb2.InputReader. + model_config: A model_pb2.DetectionModel. + + Returns: + `input_fn` for `Estimator` in TRAIN mode. + """ + + def _train_input_fn(params=None): + """Returns `features` and `labels` tensor dictionaries for training. + + Args: + params: Parameter dictionary passed from the estimator. + + Returns: + features: Dictionary of feature tensors. + features[fields.InputDataFields.image] is a [batch_size, H, W, C] + float32 tensor with preprocessed images. + features[HASH_KEY] is a [batch_size] int32 tensor representing unique + identifiers for the images. + features[fields.InputDataFields.true_image_shape] is a [batch_size, 3] + int32 tensor representing the true image shapes, as preprocessed + images could be padded. + features[fields.InputDataFields.original_image] (optional) is a + [batch_size, H, W, C] float32 tensor with original images. + labels: Dictionary of groundtruth tensors. + labels[fields.InputDataFields.num_groundtruth_boxes] is a [batch_size] + int32 tensor indicating the number of groundtruth boxes. 
+ labels[fields.InputDataFields.groundtruth_boxes] is a + [batch_size, num_boxes, 4] float32 tensor containing the corners of + the groundtruth boxes. + labels[fields.InputDataFields.groundtruth_classes] is a + [batch_size, num_boxes, num_classes] float32 one-hot tensor of + classes. + labels[fields.InputDataFields.groundtruth_weights] is a + [batch_size, num_boxes] float32 tensor containing groundtruth weights + for the boxes. + -- Optional -- + labels[fields.InputDataFields.groundtruth_instance_masks] is a + [batch_size, num_boxes, H, W] float32 tensor containing only binary + values, which represent instance masks for objects. + labels[fields.InputDataFields.groundtruth_keypoints] is a + [batch_size, num_boxes, num_keypoints, 2] float32 tensor containing + keypoints for each box. + + Raises: + TypeError: if the `train_config`, `train_input_config` or `model_config` + are not of the correct type. + """ + if not isinstance(train_config, train_pb2.TrainConfig): + raise TypeError('For training mode, the `train_config` must be a ' + 'train_pb2.TrainConfig.') + if not isinstance(train_input_config, input_reader_pb2.InputReader): + raise TypeError('The `train_input_config` must be a ' + 'input_reader_pb2.InputReader.') + if not isinstance(model_config, model_pb2.DetectionModel): + raise TypeError('The `model_config` must be a ' + 'model_pb2.DetectionModel.') + + data_augmentation_options = [ + preprocessor_builder.build(step) + for step in train_config.data_augmentation_options + ] + data_augmentation_fn = functools.partial( + augment_input_data, data_augmentation_options=data_augmentation_options) + + model = model_builder.build(model_config, is_training=True) + image_resizer_config = config_util.get_image_resizer_config(model_config) + image_resizer_fn = image_resizer_builder.build(image_resizer_config) + + transform_data_fn = functools.partial( + transform_input_data, model_preprocess_fn=model.preprocess, + image_resizer_fn=image_resizer_fn, + num_classes=config_util.get_number_of_classes(model_config), + data_augmentation_fn=data_augmentation_fn, + retain_original_image=train_config.retain_original_images) + dataset = INPUT_BUILDER_UTIL_MAP['dataset_build']( + train_input_config, + transform_input_data_fn=transform_data_fn, + batch_size=params['batch_size'] if params else train_config.batch_size, + max_num_boxes=train_config.max_number_of_boxes, + num_classes=config_util.get_number_of_classes(model_config), + spatial_image_shape=config_util.get_spatial_image_size( + image_resizer_config)) + input_dict = dataset_util.make_initializable_iterator(dataset).get_next() + return (_get_features_dict(input_dict), _get_labels_dict(input_dict)) + + return _train_input_fn + + +def create_eval_input_fn(eval_config, eval_input_config, model_config): + """Creates an eval `input` function for `Estimator`. + + Args: + eval_config: An eval_pb2.EvalConfig. + eval_input_config: An input_reader_pb2.InputReader. + model_config: A model_pb2.DetectionModel. + + Returns: + `input_fn` for `Estimator` in EVAL mode. + """ + + def _eval_input_fn(params=None): + """Returns `features` and `labels` tensor dictionaries for evaluation. + + Args: + params: Parameter dictionary passed from the estimator. + + Returns: + features: Dictionary of feature tensors. + features[fields.InputDataFields.image] is a [1, H, W, C] float32 tensor + with preprocessed images. + features[HASH_KEY] is a [1] int32 tensor representing unique + identifiers for the images. 
+ features[fields.InputDataFields.true_image_shape] is a [1, 3] + int32 tensor representing the true image shapes, as preprocessed + images could be padded. + features[fields.InputDataFields.original_image] is a [1, H', W', C] + float32 tensor with the original image. + labels: Dictionary of groundtruth tensors. + labels[fields.InputDataFields.groundtruth_boxes] is a [1, num_boxes, 4] + float32 tensor containing the corners of the groundtruth boxes. + labels[fields.InputDataFields.groundtruth_classes] is a + [num_boxes, num_classes] float32 one-hot tensor of classes. + labels[fields.InputDataFields.groundtruth_area] is a [1, num_boxes] + float32 tensor containing object areas. + labels[fields.InputDataFields.groundtruth_is_crowd] is a [1, num_boxes] + bool tensor indicating if the boxes enclose a crowd. + labels[fields.InputDataFields.groundtruth_difficult] is a [1, num_boxes] + int32 tensor indicating if the boxes represent difficult instances. + -- Optional -- + labels[fields.InputDataFields.groundtruth_instance_masks] is a + [1, num_boxes, H, W] float32 tensor containing only binary values, + which represent instance masks for objects. + + Raises: + TypeError: if the `eval_config`, `eval_input_config` or `model_config` + are not of the correct type. + """ + params = params or {} + if not isinstance(eval_config, eval_pb2.EvalConfig): + raise TypeError('For eval mode, the `eval_config` must be a ' + 'train_pb2.EvalConfig.') + if not isinstance(eval_input_config, input_reader_pb2.InputReader): + raise TypeError('The `eval_input_config` must be a ' + 'input_reader_pb2.InputReader.') + if not isinstance(model_config, model_pb2.DetectionModel): + raise TypeError('The `model_config` must be a ' + 'model_pb2.DetectionModel.') + + num_classes = config_util.get_number_of_classes(model_config) + model = model_builder.build(model_config, is_training=False) + image_resizer_config = config_util.get_image_resizer_config(model_config) + image_resizer_fn = image_resizer_builder.build(image_resizer_config) + + transform_data_fn = functools.partial( + transform_input_data, model_preprocess_fn=model.preprocess, + image_resizer_fn=image_resizer_fn, + num_classes=num_classes, + data_augmentation_fn=None, + retain_original_image=eval_config.retain_original_images) + dataset = INPUT_BUILDER_UTIL_MAP['dataset_build']( + eval_input_config, + transform_input_data_fn=transform_data_fn, + batch_size=params.get('batch_size', 1), + num_classes=config_util.get_number_of_classes(model_config), + spatial_image_shape=config_util.get_spatial_image_size( + image_resizer_config)) + input_dict = dataset_util.make_initializable_iterator(dataset).get_next() + + return (_get_features_dict(input_dict), _get_labels_dict(input_dict)) + + return _eval_input_fn + + +def create_predict_input_fn(model_config): + """Creates a predict `input` function for `Estimator`. + + Args: + model_config: A model_pb2.DetectionModel. + + Returns: + `input_fn` for `Estimator` in PREDICT mode. + """ + + def _predict_input_fn(params=None): + """Decodes serialized tf.Examples and returns `ServingInputReceiver`. + + Args: + params: Parameter dictionary passed from the estimator. + + Returns: + `ServingInputReceiver`. 
+ """ + del params + example = tf.placeholder(dtype=tf.string, shape=[], name='input_feature') + + num_classes = config_util.get_number_of_classes(model_config) + model = model_builder.build(model_config, is_training=False) + image_resizer_config = config_util.get_image_resizer_config(model_config) + image_resizer_fn = image_resizer_builder.build(image_resizer_config) + + transform_fn = functools.partial( + transform_input_data, model_preprocess_fn=model.preprocess, + image_resizer_fn=image_resizer_fn, + num_classes=num_classes, + data_augmentation_fn=None) + + decoder = tf_example_decoder.TfExampleDecoder(load_instance_masks=False) + input_dict = transform_fn(decoder.decode(example)) + images = tf.to_float(input_dict[fields.InputDataFields.image]) + images = tf.expand_dims(images, axis=0) + true_image_shape = tf.expand_dims( + input_dict[fields.InputDataFields.true_image_shape], axis=0) + + return tf.estimator.export.ServingInputReceiver( + features={ + fields.InputDataFields.image: images, + fields.InputDataFields.true_image_shape: true_image_shape}, + receiver_tensors={SERVING_FED_EXAMPLE_KEY: example}) + + return _predict_input_fn diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/inputs_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/inputs_test.py new file mode 100644 index 0000000000000000000000000000000000000000..8326633597922a568c768d70c64ceea1d90156fa --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/inputs_test.py @@ -0,0 +1,601 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for object_detection.tflearn.inputs.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os + +import numpy as np +import tensorflow as tf + +from object_detection import inputs +from object_detection.core import preprocessor +from object_detection.core import standard_fields as fields +from object_detection.utils import config_util + +FLAGS = tf.flags.FLAGS + + +def _get_configs_for_model(model_name): + """Returns configurations for model.""" + fname = os.path.join(tf.resource_loader.get_data_files_path(), + 'samples/configs/' + model_name + '.config') + label_map_path = os.path.join(tf.resource_loader.get_data_files_path(), + 'data/pet_label_map.pbtxt') + data_path = os.path.join(tf.resource_loader.get_data_files_path(), + 'test_data/pets_examples.record') + configs = config_util.get_configs_from_pipeline_file(fname) + return config_util.merge_external_params_with_configs( + configs, + train_input_path=data_path, + eval_input_path=data_path, + label_map_path=label_map_path) + + +class InputsTest(tf.test.TestCase): + + def test_faster_rcnn_resnet50_train_input(self): + """Tests the training input function for FasterRcnnResnet50.""" + configs = _get_configs_for_model('faster_rcnn_resnet50_pets') + configs['train_config'].unpad_groundtruth_tensors = True + model_config = configs['model'] + model_config.faster_rcnn.num_classes = 37 + train_input_fn = inputs.create_train_input_fn( + configs['train_config'], configs['train_input_config'], model_config) + features, labels = train_input_fn() + + self.assertAllEqual([1, None, None, 3], + features[fields.InputDataFields.image].shape.as_list()) + self.assertEqual(tf.float32, features[fields.InputDataFields.image].dtype) + self.assertAllEqual([1], + features[inputs.HASH_KEY].shape.as_list()) + self.assertEqual(tf.int32, features[inputs.HASH_KEY].dtype) + self.assertAllEqual( + [1, 50, 4], + labels[fields.InputDataFields.groundtruth_boxes].shape.as_list()) + self.assertEqual(tf.float32, + labels[fields.InputDataFields.groundtruth_boxes].dtype) + self.assertAllEqual( + [1, 50, model_config.faster_rcnn.num_classes], + labels[fields.InputDataFields.groundtruth_classes].shape.as_list()) + self.assertEqual(tf.float32, + labels[fields.InputDataFields.groundtruth_classes].dtype) + self.assertAllEqual( + [1, 50], + labels[fields.InputDataFields.groundtruth_weights].shape.as_list()) + self.assertEqual(tf.float32, + labels[fields.InputDataFields.groundtruth_weights].dtype) + + def test_faster_rcnn_resnet50_eval_input(self): + """Tests the eval input function for FasterRcnnResnet50.""" + configs = _get_configs_for_model('faster_rcnn_resnet50_pets') + model_config = configs['model'] + model_config.faster_rcnn.num_classes = 37 + eval_input_fn = inputs.create_eval_input_fn( + configs['eval_config'], configs['eval_input_config'], model_config) + features, labels = eval_input_fn() + + self.assertAllEqual([1, None, None, 3], + features[fields.InputDataFields.image].shape.as_list()) + self.assertEqual(tf.float32, features[fields.InputDataFields.image].dtype) + self.assertAllEqual( + [1, None, None, 3], + features[fields.InputDataFields.original_image].shape.as_list()) + self.assertEqual(tf.uint8, + features[fields.InputDataFields.original_image].dtype) + self.assertAllEqual([1], features[inputs.HASH_KEY].shape.as_list()) + self.assertEqual(tf.int32, features[inputs.HASH_KEY].dtype) + 
self.assertAllEqual( + [1, None, 4], + labels[fields.InputDataFields.groundtruth_boxes].shape.as_list()) + self.assertEqual(tf.float32, + labels[fields.InputDataFields.groundtruth_boxes].dtype) + self.assertAllEqual( + [1, None, model_config.faster_rcnn.num_classes], + labels[fields.InputDataFields.groundtruth_classes].shape.as_list()) + self.assertEqual(tf.float32, + labels[fields.InputDataFields.groundtruth_classes].dtype) + self.assertAllEqual( + [1, None], + labels[fields.InputDataFields.groundtruth_area].shape.as_list()) + self.assertEqual(tf.float32, + labels[fields.InputDataFields.groundtruth_area].dtype) + self.assertAllEqual( + [1, None], + labels[fields.InputDataFields.groundtruth_is_crowd].shape.as_list()) + self.assertEqual( + tf.bool, labels[fields.InputDataFields.groundtruth_is_crowd].dtype) + self.assertAllEqual( + [1, None], + labels[fields.InputDataFields.groundtruth_difficult].shape.as_list()) + self.assertEqual( + tf.int32, labels[fields.InputDataFields.groundtruth_difficult].dtype) + + def test_ssd_inceptionV2_train_input(self): + """Tests the training input function for SSDInceptionV2.""" + configs = _get_configs_for_model('ssd_inception_v2_pets') + model_config = configs['model'] + model_config.ssd.num_classes = 37 + batch_size = configs['train_config'].batch_size + train_input_fn = inputs.create_train_input_fn( + configs['train_config'], configs['train_input_config'], model_config) + features, labels = train_input_fn() + + self.assertAllEqual([batch_size, 300, 300, 3], + features[fields.InputDataFields.image].shape.as_list()) + self.assertEqual(tf.float32, features[fields.InputDataFields.image].dtype) + self.assertAllEqual([batch_size], + features[inputs.HASH_KEY].shape.as_list()) + self.assertEqual(tf.int32, features[inputs.HASH_KEY].dtype) + self.assertAllEqual( + [batch_size], + labels[fields.InputDataFields.num_groundtruth_boxes].shape.as_list()) + self.assertEqual(tf.int32, + labels[fields.InputDataFields.num_groundtruth_boxes].dtype) + self.assertAllEqual( + [batch_size, 50, 4], + labels[fields.InputDataFields.groundtruth_boxes].shape.as_list()) + self.assertEqual(tf.float32, + labels[fields.InputDataFields.groundtruth_boxes].dtype) + self.assertAllEqual( + [batch_size, 50, model_config.ssd.num_classes], + labels[fields.InputDataFields.groundtruth_classes].shape.as_list()) + self.assertEqual(tf.float32, + labels[fields.InputDataFields.groundtruth_classes].dtype) + self.assertAllEqual( + [batch_size, 50], + labels[fields.InputDataFields.groundtruth_weights].shape.as_list()) + self.assertEqual(tf.float32, + labels[fields.InputDataFields.groundtruth_weights].dtype) + + def test_ssd_inceptionV2_eval_input(self): + """Tests the eval input function for SSDInceptionV2.""" + configs = _get_configs_for_model('ssd_inception_v2_pets') + model_config = configs['model'] + model_config.ssd.num_classes = 37 + eval_input_fn = inputs.create_eval_input_fn( + configs['eval_config'], configs['eval_input_config'], model_config) + features, labels = eval_input_fn() + + self.assertAllEqual([1, 300, 300, 3], + features[fields.InputDataFields.image].shape.as_list()) + self.assertEqual(tf.float32, features[fields.InputDataFields.image].dtype) + self.assertAllEqual( + [1, None, None, 3], + features[fields.InputDataFields.original_image].shape.as_list()) + self.assertEqual(tf.uint8, + features[fields.InputDataFields.original_image].dtype) + self.assertAllEqual([1], features[inputs.HASH_KEY].shape.as_list()) + self.assertEqual(tf.int32, features[inputs.HASH_KEY].dtype) + 
self.assertAllEqual( + [1, None, 4], + labels[fields.InputDataFields.groundtruth_boxes].shape.as_list()) + self.assertEqual(tf.float32, + labels[fields.InputDataFields.groundtruth_boxes].dtype) + self.assertAllEqual( + [1, None, model_config.ssd.num_classes], + labels[fields.InputDataFields.groundtruth_classes].shape.as_list()) + self.assertEqual(tf.float32, + labels[fields.InputDataFields.groundtruth_classes].dtype) + self.assertAllEqual( + [1, None], + labels[fields.InputDataFields.groundtruth_area].shape.as_list()) + self.assertEqual(tf.float32, + labels[fields.InputDataFields.groundtruth_area].dtype) + self.assertAllEqual( + [1, None], + labels[fields.InputDataFields.groundtruth_is_crowd].shape.as_list()) + self.assertEqual( + tf.bool, labels[fields.InputDataFields.groundtruth_is_crowd].dtype) + self.assertAllEqual( + [1, None], + labels[fields.InputDataFields.groundtruth_difficult].shape.as_list()) + self.assertEqual( + tf.int32, labels[fields.InputDataFields.groundtruth_difficult].dtype) + + def test_predict_input(self): + """Tests the predict input function.""" + configs = _get_configs_for_model('ssd_inception_v2_pets') + predict_input_fn = inputs.create_predict_input_fn( + model_config=configs['model']) + serving_input_receiver = predict_input_fn() + + image = serving_input_receiver.features[fields.InputDataFields.image] + receiver_tensors = serving_input_receiver.receiver_tensors[ + inputs.SERVING_FED_EXAMPLE_KEY] + self.assertEqual([1, 300, 300, 3], image.shape.as_list()) + self.assertEqual(tf.float32, image.dtype) + self.assertEqual(tf.string, receiver_tensors.dtype) + + def test_error_with_bad_train_config(self): + """Tests that a TypeError is raised with improper train config.""" + configs = _get_configs_for_model('ssd_inception_v2_pets') + configs['model'].ssd.num_classes = 37 + train_input_fn = inputs.create_train_input_fn( + train_config=configs['eval_config'], # Expecting `TrainConfig`. + train_input_config=configs['train_input_config'], + model_config=configs['model']) + with self.assertRaises(TypeError): + train_input_fn() + + def test_error_with_bad_train_input_config(self): + """Tests that a TypeError is raised with improper train input config.""" + configs = _get_configs_for_model('ssd_inception_v2_pets') + configs['model'].ssd.num_classes = 37 + train_input_fn = inputs.create_train_input_fn( + train_config=configs['train_config'], + train_input_config=configs['model'], # Expecting `InputReader`. + model_config=configs['model']) + with self.assertRaises(TypeError): + train_input_fn() + + def test_error_with_bad_train_model_config(self): + """Tests that a TypeError is raised with improper train model config.""" + configs = _get_configs_for_model('ssd_inception_v2_pets') + configs['model'].ssd.num_classes = 37 + train_input_fn = inputs.create_train_input_fn( + train_config=configs['train_config'], + train_input_config=configs['train_input_config'], + model_config=configs['train_config']) # Expecting `DetectionModel`. + with self.assertRaises(TypeError): + train_input_fn() + + def test_error_with_bad_eval_config(self): + """Tests that a TypeError is raised with improper eval config.""" + configs = _get_configs_for_model('ssd_inception_v2_pets') + configs['model'].ssd.num_classes = 37 + eval_input_fn = inputs.create_eval_input_fn( + eval_config=configs['train_config'], # Expecting `EvalConfig`. 
+ eval_input_config=configs['eval_input_config'], + model_config=configs['model']) + with self.assertRaises(TypeError): + eval_input_fn() + + def test_error_with_bad_eval_input_config(self): + """Tests that a TypeError is raised with improper eval input config.""" + configs = _get_configs_for_model('ssd_inception_v2_pets') + configs['model'].ssd.num_classes = 37 + eval_input_fn = inputs.create_eval_input_fn( + eval_config=configs['eval_config'], + eval_input_config=configs['model'], # Expecting `InputReader`. + model_config=configs['model']) + with self.assertRaises(TypeError): + eval_input_fn() + + def test_error_with_bad_eval_model_config(self): + """Tests that a TypeError is raised with improper eval model config.""" + configs = _get_configs_for_model('ssd_inception_v2_pets') + configs['model'].ssd.num_classes = 37 + eval_input_fn = inputs.create_eval_input_fn( + eval_config=configs['eval_config'], + eval_input_config=configs['eval_input_config'], + model_config=configs['eval_config']) # Expecting `DetectionModel`. + with self.assertRaises(TypeError): + eval_input_fn() + + +class DataAugmentationFnTest(tf.test.TestCase): + + def test_apply_image_and_box_augmentation(self): + data_augmentation_options = [ + (preprocessor.resize_image, { + 'new_height': 20, + 'new_width': 20, + 'method': tf.image.ResizeMethod.NEAREST_NEIGHBOR + }), + (preprocessor.scale_boxes_to_pixel_coordinates, {}), + ] + data_augmentation_fn = functools.partial( + inputs.augment_input_data, + data_augmentation_options=data_augmentation_options) + tensor_dict = { + fields.InputDataFields.image: + tf.constant(np.random.rand(10, 10, 3).astype(np.float32)), + fields.InputDataFields.groundtruth_boxes: + tf.constant(np.array([[.5, .5, 1., 1.]], np.float32)) + } + augmented_tensor_dict = data_augmentation_fn(tensor_dict=tensor_dict) + with self.test_session() as sess: + augmented_tensor_dict_out = sess.run(augmented_tensor_dict) + + self.assertAllEqual( + augmented_tensor_dict_out[fields.InputDataFields.image].shape, + [20, 20, 3] + ) + self.assertAllClose( + augmented_tensor_dict_out[fields.InputDataFields.groundtruth_boxes], + [[10, 10, 20, 20]] + ) + + def test_include_masks_in_data_augmentation(self): + data_augmentation_options = [ + (preprocessor.resize_image, { + 'new_height': 20, + 'new_width': 20, + 'method': tf.image.ResizeMethod.NEAREST_NEIGHBOR + }) + ] + data_augmentation_fn = functools.partial( + inputs.augment_input_data, + data_augmentation_options=data_augmentation_options) + tensor_dict = { + fields.InputDataFields.image: + tf.constant(np.random.rand(10, 10, 3).astype(np.float32)), + fields.InputDataFields.groundtruth_instance_masks: + tf.constant(np.zeros([2, 10, 10], np.uint8)) + } + augmented_tensor_dict = data_augmentation_fn(tensor_dict=tensor_dict) + with self.test_session() as sess: + augmented_tensor_dict_out = sess.run(augmented_tensor_dict) + + self.assertAllEqual( + augmented_tensor_dict_out[fields.InputDataFields.image].shape, + [20, 20, 3]) + self.assertAllEqual(augmented_tensor_dict_out[ + fields.InputDataFields.groundtruth_instance_masks].shape, [2, 20, 20]) + + def test_include_keypoints_in_data_augmentation(self): + data_augmentation_options = [ + (preprocessor.resize_image, { + 'new_height': 20, + 'new_width': 20, + 'method': tf.image.ResizeMethod.NEAREST_NEIGHBOR + }), + (preprocessor.scale_boxes_to_pixel_coordinates, {}), + ] + data_augmentation_fn = functools.partial( + inputs.augment_input_data, + data_augmentation_options=data_augmentation_options) + tensor_dict = { + 
fields.InputDataFields.image: + tf.constant(np.random.rand(10, 10, 3).astype(np.float32)), + fields.InputDataFields.groundtruth_boxes: + tf.constant(np.array([[.5, .5, 1., 1.]], np.float32)), + fields.InputDataFields.groundtruth_keypoints: + tf.constant(np.array([[[0.5, 1.0], [0.5, 0.5]]], np.float32)) + } + augmented_tensor_dict = data_augmentation_fn(tensor_dict=tensor_dict) + with self.test_session() as sess: + augmented_tensor_dict_out = sess.run(augmented_tensor_dict) + + self.assertAllEqual( + augmented_tensor_dict_out[fields.InputDataFields.image].shape, + [20, 20, 3] + ) + self.assertAllClose( + augmented_tensor_dict_out[fields.InputDataFields.groundtruth_boxes], + [[10, 10, 20, 20]] + ) + self.assertAllClose( + augmented_tensor_dict_out[fields.InputDataFields.groundtruth_keypoints], + [[[10, 20], [10, 10]]] + ) + + +def _fake_model_preprocessor_fn(image): + return (image, tf.expand_dims(tf.shape(image)[1:], axis=0)) + + +def _fake_image_resizer_fn(image, mask): + return (image, mask, tf.shape(image)) + + +class DataTransformationFnTest(tf.test.TestCase): + + def test_combine_additional_channels_if_present(self): + image = np.random.rand(4, 4, 3).astype(np.float32) + additional_channels = np.random.rand(4, 4, 2).astype(np.float32) + tensor_dict = { + fields.InputDataFields.image: + tf.constant(image), + fields.InputDataFields.image_additional_channels: + tf.constant(additional_channels), + fields.InputDataFields.groundtruth_classes: + tf.constant(np.array([1, 1], np.int32)) + } + + input_transformation_fn = functools.partial( + inputs.transform_input_data, + model_preprocess_fn=_fake_model_preprocessor_fn, + image_resizer_fn=_fake_image_resizer_fn, + num_classes=1) + with self.test_session() as sess: + transformed_inputs = sess.run( + input_transformation_fn(tensor_dict=tensor_dict)) + self.assertAllEqual(transformed_inputs[fields.InputDataFields.image].dtype, + tf.float32) + self.assertAllEqual(transformed_inputs[fields.InputDataFields.image].shape, + [4, 4, 5]) + self.assertAllClose(transformed_inputs[fields.InputDataFields.image], + np.concatenate((image, additional_channels), axis=2)) + + def test_returns_correct_class_label_encodings(self): + tensor_dict = { + fields.InputDataFields.image: + tf.constant(np.random.rand(4, 4, 3).astype(np.float32)), + fields.InputDataFields.groundtruth_boxes: + tf.constant(np.array([[0, 0, 1, 1], [.5, .5, 1, 1]], np.float32)), + fields.InputDataFields.groundtruth_classes: + tf.constant(np.array([3, 1], np.int32)) + } + num_classes = 3 + input_transformation_fn = functools.partial( + inputs.transform_input_data, + model_preprocess_fn=_fake_model_preprocessor_fn, + image_resizer_fn=_fake_image_resizer_fn, + num_classes=num_classes) + with self.test_session() as sess: + transformed_inputs = sess.run( + input_transformation_fn(tensor_dict=tensor_dict)) + + self.assertAllClose( + transformed_inputs[fields.InputDataFields.groundtruth_classes], + [[0, 0, 1], [1, 0, 0]]) + + def test_returns_correct_merged_boxes(self): + tensor_dict = { + fields.InputDataFields.image: + tf.constant(np.random.rand(4, 4, 3).astype(np.float32)), + fields.InputDataFields.groundtruth_boxes: + tf.constant(np.array([[.5, .5, 1, 1], [.5, .5, 1, 1]], np.float32)), + fields.InputDataFields.groundtruth_classes: + tf.constant(np.array([3, 1], np.int32)) + } + + num_classes = 3 + input_transformation_fn = functools.partial( + inputs.transform_input_data, + model_preprocess_fn=_fake_model_preprocessor_fn, + image_resizer_fn=_fake_image_resizer_fn, + num_classes=num_classes, + 
merge_multiple_boxes=True) + + with self.test_session() as sess: + transformed_inputs = sess.run( + input_transformation_fn(tensor_dict=tensor_dict)) + self.assertAllClose( + transformed_inputs[fields.InputDataFields.groundtruth_boxes], + [[.5, .5, 1., 1.]]) + self.assertAllClose( + transformed_inputs[fields.InputDataFields.groundtruth_classes], + [[1, 0, 1]]) + + def test_returns_resized_masks(self): + tensor_dict = { + fields.InputDataFields.image: + tf.constant(np.random.rand(4, 4, 3).astype(np.float32)), + fields.InputDataFields.groundtruth_instance_masks: + tf.constant(np.random.rand(2, 4, 4).astype(np.float32)), + fields.InputDataFields.groundtruth_classes: + tf.constant(np.array([3, 1], np.int32)) + } + def fake_image_resizer_fn(image, masks=None): + resized_image = tf.image.resize_images(image, [8, 8]) + results = [resized_image] + if masks is not None: + resized_masks = tf.transpose( + tf.image.resize_images(tf.transpose(masks, [1, 2, 0]), [8, 8]), + [2, 0, 1]) + results.append(resized_masks) + results.append(tf.shape(resized_image)) + return results + + num_classes = 3 + input_transformation_fn = functools.partial( + inputs.transform_input_data, + model_preprocess_fn=_fake_model_preprocessor_fn, + image_resizer_fn=fake_image_resizer_fn, + num_classes=num_classes, + retain_original_image=True) + with self.test_session() as sess: + transformed_inputs = sess.run( + input_transformation_fn(tensor_dict=tensor_dict)) + self.assertAllEqual(transformed_inputs[ + fields.InputDataFields.original_image].dtype, tf.uint8) + self.assertAllEqual(transformed_inputs[ + fields.InputDataFields.original_image].shape, [4, 4, 3]) + self.assertAllEqual(transformed_inputs[ + fields.InputDataFields.groundtruth_instance_masks].shape, [2, 8, 8]) + + def test_applies_model_preprocess_fn_to_image_tensor(self): + np_image = np.random.randint(256, size=(4, 4, 3)) + tensor_dict = { + fields.InputDataFields.image: + tf.constant(np_image), + fields.InputDataFields.groundtruth_classes: + tf.constant(np.array([3, 1], np.int32)) + } + def fake_model_preprocessor_fn(image): + return (image / 255., tf.expand_dims(tf.shape(image)[1:], axis=0)) + + num_classes = 3 + input_transformation_fn = functools.partial( + inputs.transform_input_data, + model_preprocess_fn=fake_model_preprocessor_fn, + image_resizer_fn=_fake_image_resizer_fn, + num_classes=num_classes) + + with self.test_session() as sess: + transformed_inputs = sess.run( + input_transformation_fn(tensor_dict=tensor_dict)) + self.assertAllClose(transformed_inputs[fields.InputDataFields.image], + np_image / 255.) + self.assertAllClose(transformed_inputs[fields.InputDataFields. 
+ true_image_shape], + [4, 4, 3]) + + def test_applies_data_augmentation_fn_to_tensor_dict(self): + np_image = np.random.randint(256, size=(4, 4, 3)) + tensor_dict = { + fields.InputDataFields.image: + tf.constant(np_image), + fields.InputDataFields.groundtruth_classes: + tf.constant(np.array([3, 1], np.int32)) + } + def add_one_data_augmentation_fn(tensor_dict): + return {key: value + 1 for key, value in tensor_dict.items()} + + num_classes = 4 + input_transformation_fn = functools.partial( + inputs.transform_input_data, + model_preprocess_fn=_fake_model_preprocessor_fn, + image_resizer_fn=_fake_image_resizer_fn, + num_classes=num_classes, + data_augmentation_fn=add_one_data_augmentation_fn) + with self.test_session() as sess: + augmented_tensor_dict = sess.run( + input_transformation_fn(tensor_dict=tensor_dict)) + + self.assertAllEqual(augmented_tensor_dict[fields.InputDataFields.image], + np_image + 1) + self.assertAllEqual( + augmented_tensor_dict[fields.InputDataFields.groundtruth_classes], + [[0, 0, 0, 1], [0, 1, 0, 0]]) + + def test_applies_data_augmentation_fn_before_model_preprocess_fn(self): + np_image = np.random.randint(256, size=(4, 4, 3)) + tensor_dict = { + fields.InputDataFields.image: + tf.constant(np_image), + fields.InputDataFields.groundtruth_classes: + tf.constant(np.array([3, 1], np.int32)) + } + def mul_two_model_preprocessor_fn(image): + return (image * 2, tf.expand_dims(tf.shape(image)[1:], axis=0)) + def add_five_to_image_data_augmentation_fn(tensor_dict): + tensor_dict[fields.InputDataFields.image] += 5 + return tensor_dict + + num_classes = 4 + input_transformation_fn = functools.partial( + inputs.transform_input_data, + model_preprocess_fn=mul_two_model_preprocessor_fn, + image_resizer_fn=_fake_image_resizer_fn, + num_classes=num_classes, + data_augmentation_fn=add_five_to_image_data_augmentation_fn) + with self.test_session() as sess: + augmented_tensor_dict = sess.run( + input_transformation_fn(tensor_dict=tensor_dict)) + + self.assertAllEqual(augmented_tensor_dict[fields.InputDataFields.image], + (np_image + 5) * 2) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/argmax_matcher.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/argmax_matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..d397ff41ac560180cacebfe906b092582b8e2fa6 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/argmax_matcher.py @@ -0,0 +1,204 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
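The data-augmentation and input-transformation tests above boil down to two pieces of arithmetic: scale_boxes_to_pixel_coordinates multiplies normalized [ymin, xmin, ymax, xmax] boxes (and [y, x] keypoints) by the resized image height and width, and transform_input_data applies the data-augmentation function before the model preprocess function. The following self-contained NumPy sketch reproduces the numbers the tests assert; it is an editorial illustration only, not part of the patch.

# Editorial sketch (not part of the patch): the arithmetic behind the
# augmentation tests above, restated with plain NumPy.
import numpy as np

def scale_to_pixel_coords(normalized, height, width):
  # Works for [ymin, xmin, ymax, xmax] boxes and [y, x] keypoints alike.
  scale = np.array([height, width] * (normalized.shape[-1] // 2), np.float32)
  return normalized * scale

boxes = np.array([[0.5, 0.5, 1.0, 1.0]], np.float32)
keypoints = np.array([[[0.5, 1.0], [0.5, 0.5]]], np.float32)
print(scale_to_pixel_coords(boxes, 20, 20))      # [[10. 10. 20. 20.]]
print(scale_to_pixel_coords(keypoints, 20, 20))  # [[[10. 20.] [10. 10.]]]

# Augmentation runs before the model preprocess function, so an augmentation
# that adds 5 followed by a preprocess function that multiplies by 2 yields
# (image + 5) * 2, which is what
# test_applies_data_augmentation_fn_before_model_preprocess_fn asserts.
image = np.random.randint(256, size=(4, 4, 3)).astype(np.float32)
assert np.allclose((image + 5) * 2, 2 * image + 10)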
+# ==============================================================================
+
+"""Argmax matcher implementation.
+
+This class takes a similarity matrix and matches columns to rows based on the
+maximum value per column. One can specify a matched_threshold and an
+unmatched_threshold: columns whose maximum similarity is below the
+unmatched_threshold are left unmatched (generally resulting in a negative
+training example), and columns whose maximum similarity falls between the two
+thresholds are ignored (generally resulting in neither a positive nor a
+negative training example).
+
+This matcher is used in Fast(er)-RCNN.
+
+Note: matchers are used in TargetAssigners. There is a create_target_assigner
+factory function for popular implementations.
+"""
+import tensorflow as tf
+
+from object_detection.core import matcher
+from object_detection.utils import shape_utils
+
+
+class ArgMaxMatcher(matcher.Matcher):
+ """Matcher based on highest value.
+
+ This class computes matches from a similarity matrix. Each column is matched
+ to a single row.
+
+ To support object detection target assignment this class enables setting both
+ matched_threshold (upper threshold) and unmatched_threshold (lower threshold),
+ yielding three categories of similarity that determine whether examples are
+ positive, negative, or ignored:
+ (1) similarity >= matched_threshold: Highest similarity. Matched/Positive!
+ (2) matched_threshold > similarity >= unmatched_threshold: Medium similarity.
+ Depending on negatives_lower_than_unmatched, this is either
+ Unmatched/Negative OR Ignore.
+ (3) unmatched_threshold > similarity: Lowest similarity. Depending on flag
+ negatives_lower_than_unmatched, either Unmatched/Negative OR Ignore.
+ For ignored matches this class sets the values in the Match object to -2.
+ """
+
+ def __init__(self,
+ matched_threshold,
+ unmatched_threshold=None,
+ negatives_lower_than_unmatched=True,
+ force_match_for_each_row=False,
+ use_matmul_gather=False):
+ """Construct ArgMaxMatcher.
+
+ Args:
+ matched_threshold: Threshold for positive matches. Positive if
+ sim >= matched_threshold, where sim is the maximum value of the
+ similarity matrix for a given column. Set to None for no threshold.
+ unmatched_threshold: Threshold for negative matches. Negative if
+ sim < unmatched_threshold. Defaults to matched_threshold
+ when set to None.
+ negatives_lower_than_unmatched: Boolean which defaults to True. If True
+ then negative matches are the ones below the unmatched_threshold,
+ whereas ignored matches are in between the matched and unmatched
+ threshold. If False, then negative matches are in between the matched
+ and unmatched threshold, and everything lower than unmatched is ignored.
+ force_match_for_each_row: If True, ensures that each row is matched to
+ at least one column (which is not guaranteed otherwise if the
+ matched_threshold is high). Defaults to False. See
+ argmax_matcher_test.testMatcherForceMatch() for an example.
+ use_matmul_gather: Force constructed match objects to use matrix
+ multiplication based gather instead of standard tf.gather.
+ (Default: False).
+
+ Raises:
+ ValueError: if unmatched_threshold is set but matched_threshold is not set
+ or if unmatched_threshold > matched_threshold.
+ """ + super(ArgMaxMatcher, self).__init__(use_matmul_gather=use_matmul_gather) + if (matched_threshold is None) and (unmatched_threshold is not None): + raise ValueError('Need to also define matched_threshold when' + 'unmatched_threshold is defined') + self._matched_threshold = matched_threshold + if unmatched_threshold is None: + self._unmatched_threshold = matched_threshold + else: + if unmatched_threshold > matched_threshold: + raise ValueError('unmatched_threshold needs to be smaller or equal' + 'to matched_threshold') + self._unmatched_threshold = unmatched_threshold + if not negatives_lower_than_unmatched: + if self._unmatched_threshold == self._matched_threshold: + raise ValueError('When negatives are in between matched and ' + 'unmatched thresholds, these cannot be of equal ' + 'value. matched: %s, unmatched: %s', + self._matched_threshold, self._unmatched_threshold) + self._force_match_for_each_row = force_match_for_each_row + self._negatives_lower_than_unmatched = negatives_lower_than_unmatched + + def _match(self, similarity_matrix): + """Tries to match each column of the similarity matrix to a row. + + Args: + similarity_matrix: tensor of shape [N, M] representing any similarity + metric. + + Returns: + Match object with corresponding matches for each of M columns. + """ + + def _match_when_rows_are_empty(): + """Performs matching when the rows of similarity matrix are empty. + + When the rows are empty, all detections are false positives. So we return + a tensor of -1's to indicate that the columns do not match to any rows. + + Returns: + matches: int32 tensor indicating the row each column matches to. + """ + similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape( + similarity_matrix) + return -1 * tf.ones([similarity_matrix_shape[1]], dtype=tf.int32) + + def _match_when_rows_are_non_empty(): + """Performs matching when the rows of similarity matrix are non empty. + + Returns: + matches: int32 tensor indicating the row each column matches to. 
+ """ + # Matches for each column + matches = tf.argmax(similarity_matrix, 0, output_type=tf.int32) + + # Deal with matched and unmatched threshold + if self._matched_threshold is not None: + # Get logical indices of ignored and unmatched columns as tf.int64 + matched_vals = tf.reduce_max(similarity_matrix, 0) + below_unmatched_threshold = tf.greater(self._unmatched_threshold, + matched_vals) + between_thresholds = tf.logical_and( + tf.greater_equal(matched_vals, self._unmatched_threshold), + tf.greater(self._matched_threshold, matched_vals)) + + if self._negatives_lower_than_unmatched: + matches = self._set_values_using_indicator(matches, + below_unmatched_threshold, + -1) + matches = self._set_values_using_indicator(matches, + between_thresholds, + -2) + else: + matches = self._set_values_using_indicator(matches, + below_unmatched_threshold, + -2) + matches = self._set_values_using_indicator(matches, + between_thresholds, + -1) + + if self._force_match_for_each_row: + similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape( + similarity_matrix) + force_match_column_ids = tf.argmax(similarity_matrix, 1, + output_type=tf.int32) + force_match_column_indicators = tf.one_hot( + force_match_column_ids, depth=similarity_matrix_shape[1]) + force_match_row_ids = tf.argmax(force_match_column_indicators, 0, + output_type=tf.int32) + force_match_column_mask = tf.cast( + tf.reduce_max(force_match_column_indicators, 0), tf.bool) + final_matches = tf.where(force_match_column_mask, + force_match_row_ids, matches) + return final_matches + else: + return matches + + if similarity_matrix.shape.is_fully_defined(): + if similarity_matrix.shape[0].value == 0: + return _match_when_rows_are_empty() + else: + return _match_when_rows_are_non_empty() + else: + return tf.cond( + tf.greater(tf.shape(similarity_matrix)[0], 0), + _match_when_rows_are_non_empty, _match_when_rows_are_empty) + + def _set_values_using_indicator(self, x, indicator, val): + """Set the indicated fields of x to val. + + Args: + x: tensor. + indicator: boolean with same shape as x. + val: scalar with value to set. + + Returns: + modified tensor. + """ + indicator = tf.cast(indicator, x.dtype) + return tf.add(tf.multiply(x, 1 - indicator), val * indicator) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/argmax_matcher_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/argmax_matcher_test.py new file mode 100644 index 0000000000000000000000000000000000000000..694bebdc72a5d0d7794cb7c845c97964f4dda517 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/argmax_matcher_test.py @@ -0,0 +1,207 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for object_detection.matchers.argmax_matcher.""" + +import numpy as np +import tensorflow as tf + +from object_detection.matchers import argmax_matcher +from object_detection.utils import test_case + + +class ArgMaxMatcherTest(test_case.TestCase): + + def test_return_correct_matches_with_default_thresholds(self): + + def graph_fn(similarity_matrix): + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=None) + match = matcher.match(similarity_matrix) + matched_cols = match.matched_column_indicator() + unmatched_cols = match.unmatched_column_indicator() + match_results = match.match_results + return (matched_cols, unmatched_cols, match_results) + + similarity = np.array([[1., 1, 1, 3, 1], + [2, -1, 2, 0, 4], + [3, 0, -1, 0, 0]], dtype=np.float32) + expected_matched_rows = np.array([2, 0, 1, 0, 1]) + (res_matched_cols, res_unmatched_cols, + res_match_results) = self.execute(graph_fn, [similarity]) + + self.assertAllEqual(res_match_results[res_matched_cols], + expected_matched_rows) + self.assertAllEqual(np.nonzero(res_matched_cols)[0], [0, 1, 2, 3, 4]) + self.assertFalse(np.all(res_unmatched_cols)) + + def test_return_correct_matches_with_empty_rows(self): + + def graph_fn(similarity_matrix): + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=None) + match = matcher.match(similarity_matrix) + return match.unmatched_column_indicator() + similarity = 0.2 * np.ones([0, 5], dtype=np.float32) + res_unmatched_cols = self.execute(graph_fn, [similarity]) + self.assertAllEqual(np.nonzero(res_unmatched_cols)[0], np.arange(5)) + + def test_return_correct_matches_with_matched_threshold(self): + + def graph_fn(similarity): + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3.) + match = matcher.match(similarity) + matched_cols = match.matched_column_indicator() + unmatched_cols = match.unmatched_column_indicator() + match_results = match.match_results + return (matched_cols, unmatched_cols, match_results) + + similarity = np.array([[1, 1, 1, 3, 1], + [2, -1, 2, 0, 4], + [3, 0, -1, 0, 0]], dtype=np.float32) + expected_matched_cols = np.array([0, 3, 4]) + expected_matched_rows = np.array([2, 0, 1]) + expected_unmatched_cols = np.array([1, 2]) + + (res_matched_cols, res_unmatched_cols, + match_results) = self.execute(graph_fn, [similarity]) + self.assertAllEqual(match_results[res_matched_cols], expected_matched_rows) + self.assertAllEqual(np.nonzero(res_matched_cols)[0], expected_matched_cols) + self.assertAllEqual(np.nonzero(res_unmatched_cols)[0], + expected_unmatched_cols) + + def test_return_correct_matches_with_matched_and_unmatched_threshold(self): + + def graph_fn(similarity): + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3., + unmatched_threshold=2.) 
+ match = matcher.match(similarity) + matched_cols = match.matched_column_indicator() + unmatched_cols = match.unmatched_column_indicator() + match_results = match.match_results + return (matched_cols, unmatched_cols, match_results) + + similarity = np.array([[1, 1, 1, 3, 1], + [2, -1, 2, 0, 4], + [3, 0, -1, 0, 0]], dtype=np.float32) + expected_matched_cols = np.array([0, 3, 4]) + expected_matched_rows = np.array([2, 0, 1]) + expected_unmatched_cols = np.array([1]) # col 2 has too high maximum val + + (res_matched_cols, res_unmatched_cols, + match_results) = self.execute(graph_fn, [similarity]) + self.assertAllEqual(match_results[res_matched_cols], expected_matched_rows) + self.assertAllEqual(np.nonzero(res_matched_cols)[0], expected_matched_cols) + self.assertAllEqual(np.nonzero(res_unmatched_cols)[0], + expected_unmatched_cols) + + def test_return_correct_matches_negatives_lower_than_unmatched_false(self): + + def graph_fn(similarity): + matcher = argmax_matcher.ArgMaxMatcher( + matched_threshold=3., + unmatched_threshold=2., + negatives_lower_than_unmatched=False) + match = matcher.match(similarity) + matched_cols = match.matched_column_indicator() + unmatched_cols = match.unmatched_column_indicator() + match_results = match.match_results + return (matched_cols, unmatched_cols, match_results) + + similarity = np.array([[1, 1, 1, 3, 1], + [2, -1, 2, 0, 4], + [3, 0, -1, 0, 0]], dtype=np.float32) + expected_matched_cols = np.array([0, 3, 4]) + expected_matched_rows = np.array([2, 0, 1]) + expected_unmatched_cols = np.array([2]) # col 1 has too low maximum val + + (res_matched_cols, res_unmatched_cols, + match_results) = self.execute(graph_fn, [similarity]) + self.assertAllEqual(match_results[res_matched_cols], expected_matched_rows) + self.assertAllEqual(np.nonzero(res_matched_cols)[0], expected_matched_cols) + self.assertAllEqual(np.nonzero(res_unmatched_cols)[0], + expected_unmatched_cols) + + def test_return_correct_matches_unmatched_row_not_using_force_match(self): + + def graph_fn(similarity): + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3., + unmatched_threshold=2.) 
+ match = matcher.match(similarity) + matched_cols = match.matched_column_indicator() + unmatched_cols = match.unmatched_column_indicator() + match_results = match.match_results + return (matched_cols, unmatched_cols, match_results) + + similarity = np.array([[1, 1, 1, 3, 1], + [-1, 0, -2, -2, -1], + [3, 0, -1, 2, 0]], dtype=np.float32) + expected_matched_cols = np.array([0, 3]) + expected_matched_rows = np.array([2, 0]) + expected_unmatched_cols = np.array([1, 2, 4]) + + (res_matched_cols, res_unmatched_cols, + match_results) = self.execute(graph_fn, [similarity]) + self.assertAllEqual(match_results[res_matched_cols], expected_matched_rows) + self.assertAllEqual(np.nonzero(res_matched_cols)[0], expected_matched_cols) + self.assertAllEqual(np.nonzero(res_unmatched_cols)[0], + expected_unmatched_cols) + + def test_return_correct_matches_unmatched_row_while_using_force_match(self): + def graph_fn(similarity): + matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=3., + unmatched_threshold=2., + force_match_for_each_row=True) + match = matcher.match(similarity) + matched_cols = match.matched_column_indicator() + unmatched_cols = match.unmatched_column_indicator() + match_results = match.match_results + return (matched_cols, unmatched_cols, match_results) + + similarity = np.array([[1, 1, 1, 3, 1], + [-1, 0, -2, -2, -1], + [3, 0, -1, 2, 0]], dtype=np.float32) + expected_matched_cols = np.array([0, 1, 3]) + expected_matched_rows = np.array([2, 1, 0]) + expected_unmatched_cols = np.array([2, 4]) # col 2 has too high max val + + (res_matched_cols, res_unmatched_cols, + match_results) = self.execute(graph_fn, [similarity]) + self.assertAllEqual(match_results[res_matched_cols], expected_matched_rows) + self.assertAllEqual(np.nonzero(res_matched_cols)[0], expected_matched_cols) + self.assertAllEqual(np.nonzero(res_unmatched_cols)[0], + expected_unmatched_cols) + + def test_valid_arguments_corner_case(self): + argmax_matcher.ArgMaxMatcher(matched_threshold=1, + unmatched_threshold=1) + + def test_invalid_arguments_corner_case_negatives_lower_than_thres_false(self): + with self.assertRaises(ValueError): + argmax_matcher.ArgMaxMatcher(matched_threshold=1, + unmatched_threshold=1, + negatives_lower_than_unmatched=False) + + def test_invalid_arguments_no_matched_threshold(self): + with self.assertRaises(ValueError): + argmax_matcher.ArgMaxMatcher(matched_threshold=None, + unmatched_threshold=4) + + def test_invalid_arguments_unmatched_thres_larger_than_matched_thres(self): + with self.assertRaises(ValueError): + argmax_matcher.ArgMaxMatcher(matched_threshold=1, + unmatched_threshold=2) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/bipartite_matcher.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/bipartite_matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..a1bb0b849ae77569af7d14beaeb159bedbd972bd --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/bipartite_matcher.py @@ -0,0 +1,64 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
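The force-match test above is the subtlest case, so here is a small editorial sketch (not part of the patch) of what force_match_for_each_row adds on top of the thresholded argmax: every row also claims its own best column. It omits the ignored (-2) band and tie handling, neither of which arises for this matrix.

# Editorial sketch (not part of the patch): force_match_for_each_row, applied
# to the matrix from the force-match test above (matched=3, unmatched=2).
import numpy as np

similarity = np.array([[1, 1, 1, 3, 1],
                       [-1, 0, -2, -2, -1],
                       [3, 0, -1, 2, 0]], np.float32)

column_max = similarity.max(axis=0)
matches = np.where(column_max < 2., -1, similarity.argmax(axis=0))
print(matches)  # [ 2 -1 -1  0 -1] without force matching: row 1 is unmatched.

for row, col in enumerate(similarity.argmax(axis=1)):
  matches[col] = row  # each row forces a match to its best column
print(matches)  # [ 2  1 -1  0 -1]: matched columns (0, 1, 3) -> rows (2, 1, 0),
                # columns 2 and 4 stay unmatched, as the test above expects.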
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Bipartite matcher implementation.""" + +import tensorflow as tf + +from tensorflow.contrib.image.python.ops import image_ops +from object_detection.core import matcher + + +class GreedyBipartiteMatcher(matcher.Matcher): + """Wraps a Tensorflow greedy bipartite matcher.""" + + def __init__(self, use_matmul_gather=False): + """Constructs a Matcher. + + Args: + use_matmul_gather: Force constructed match objects to use matrix + multiplication based gather instead of standard tf.gather. + (Default: False). + """ + super(GreedyBipartiteMatcher, self).__init__( + use_matmul_gather=use_matmul_gather) + + def _match(self, similarity_matrix, num_valid_rows=-1): + """Bipartite matches a collection rows and columns. A greedy bi-partite. + + TODO(rathodv): Add num_valid_columns options to match only that many columns + with all the rows. + + Args: + similarity_matrix: Float tensor of shape [N, M] with pairwise similarity + where higher values mean more similar. + num_valid_rows: A scalar or a 1-D tensor with one element describing the + number of valid rows of similarity_matrix to consider for the bipartite + matching. If set to be negative, then all rows from similarity_matrix + are used. + + Returns: + match_results: int32 tensor of shape [M] with match_results[i]=-1 + meaning that column i is not matched and otherwise that it is matched to + row match_results[i]. + """ + # Convert similarity matrix to distance matrix as tf.image.bipartite tries + # to find minimum distance matches. + distance_matrix = -1 * similarity_matrix + _, match_results = image_ops.bipartite_match( + distance_matrix, num_valid_rows) + match_results = tf.reshape(match_results, [-1]) + match_results = tf.cast(match_results, tf.int32) + return match_results diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/bipartite_matcher_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/bipartite_matcher_test.py new file mode 100644 index 0000000000000000000000000000000000000000..2ee45a80dfafc82b6ee4965a28719b9840296591 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/matchers/bipartite_matcher_test.py @@ -0,0 +1,71 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
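As a rough editorial illustration of the greedy behaviour described above (and of the num_valid_rows argument), the pure-Python sketch below repeatedly pairs the smallest remaining distance, i.e. the largest remaining similarity, until rows or columns run out. It reproduces the expected values in the tests that follow, although the exact tie-breaking of the underlying image_ops.bipartite_match op may differ; it is not part of the patch.

# Editorial sketch (not part of the patch): greedy bipartite matching on
# negated similarities, mimicking the semantics documented above.
def greedy_bipartite_match(similarity, num_valid_rows=-1):
  num_rows = len(similarity) if num_valid_rows < 0 else num_valid_rows
  num_cols = len(similarity[0]) if similarity else 0
  match_results = [-1] * num_cols          # -1 == column left unmatched
  pairs = sorted((-similarity[r][c], r, c)
                 for r in range(num_rows) for c in range(num_cols))
  used_rows, used_cols = set(), set()
  for _, row, col in pairs:                # smallest distance first
    if row not in used_rows and col not in used_cols:
      match_results[col] = row
      used_rows.add(row)
      used_cols.add(col)
  return match_results

similarity = [[0.50, 0.1, 0.8], [0.15, 0.2, 0.3]]
print(greedy_bipartite_match(similarity))                    # [-1, 1, 0]
print(greedy_bipartite_match(similarity, num_valid_rows=1))  # [-1, -1, 0]
print(greedy_bipartite_match(similarity, num_valid_rows=0))  # [-1, -1, -1]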
+# ============================================================================== + +"""Tests for object_detection.core.bipartite_matcher.""" + +import tensorflow as tf + +from object_detection.matchers import bipartite_matcher + + +class GreedyBipartiteMatcherTest(tf.test.TestCase): + + def test_get_expected_matches_when_all_rows_are_valid(self): + similarity_matrix = tf.constant([[0.50, 0.1, 0.8], [0.15, 0.2, 0.3]]) + num_valid_rows = 2 + expected_match_results = [-1, 1, 0] + + matcher = bipartite_matcher.GreedyBipartiteMatcher() + match = matcher.match(similarity_matrix, num_valid_rows=num_valid_rows) + with self.test_session() as sess: + match_results_out = sess.run(match._match_results) + self.assertAllEqual(match_results_out, expected_match_results) + + def test_get_expected_matches_with_valid_rows_set_to_minus_one(self): + similarity_matrix = tf.constant([[0.50, 0.1, 0.8], [0.15, 0.2, 0.3]]) + num_valid_rows = -1 + expected_match_results = [-1, 1, 0] + + matcher = bipartite_matcher.GreedyBipartiteMatcher() + match = matcher.match(similarity_matrix, num_valid_rows=num_valid_rows) + with self.test_session() as sess: + match_results_out = sess.run(match._match_results) + self.assertAllEqual(match_results_out, expected_match_results) + + def test_get_no_matches_with_zero_valid_rows(self): + similarity_matrix = tf.constant([[0.50, 0.1, 0.8], [0.15, 0.2, 0.3]]) + num_valid_rows = 0 + expected_match_results = [-1, -1, -1] + + matcher = bipartite_matcher.GreedyBipartiteMatcher() + match = matcher.match(similarity_matrix, num_valid_rows=num_valid_rows) + with self.test_session() as sess: + match_results_out = sess.run(match._match_results) + self.assertAllEqual(match_results_out, expected_match_results) + + def test_get_expected_matches_with_only_one_valid_row(self): + similarity_matrix = tf.constant([[0.50, 0.1, 0.8], [0.15, 0.2, 0.3]]) + num_valid_rows = 1 + expected_match_results = [-1, -1, 0] + + matcher = bipartite_matcher.GreedyBipartiteMatcher() + match = matcher.match(similarity_matrix, num_valid_rows=num_valid_rows) + with self.test_session() as sess: + match_results_out = sess.run(match._match_results) + self.assertAllEqual(match_results_out, expected_match_results) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/faster_rcnn_meta_arch.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/faster_rcnn_meta_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..6315d4dfeb3afc8581c5da7b99866c0b1e23d47a --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/faster_rcnn_meta_arch.py @@ -0,0 +1,2028 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Faster R-CNN meta-architecture definition. + +General tensorflow implementation of Faster R-CNN detection models. + +See Faster R-CNN: Ren, Shaoqing, et al. +"Faster R-CNN: Towards real-time object detection with region proposal +networks." Advances in neural information processing systems. 2015. + +We allow for three modes: number_of_stages={1, 2, 3}. In case of 1 stage, +all of the user facing methods (e.g., predict, postprocess, loss) can be used as +if the model consisted only of the RPN, returning class agnostic proposals +(these can be thought of as approximate detections with no associated class +information). In case of 2 stages, proposals are computed, then passed +through a second stage "box classifier" to yield (multi-class) detections. +Finally, in case of 3 stages which is only used during eval, proposals are +computed, then passed through a second stage "box classifier" that will compute +refined boxes and classes, and then features are pooled from the refined and +non-maximum suppressed boxes and are passed through the box classifier again. If +number of stages is 3 during training it will be reduced to two automatically. + +Implementations of Faster R-CNN models must define a new +FasterRCNNFeatureExtractor and override three methods: `preprocess`, +`_extract_proposal_features` (the first stage of the model), and +`_extract_box_classifier_features` (the second stage of the model). Optionally, +the `restore_fn` method can be overridden. See tests for an example. + +A few important notes: ++ Batching conventions: We support batched inference and training where +all images within a batch have the same resolution. Batch sizes are determined +dynamically via the shape of the input tensors (rather than being specified +directly as, e.g., a model constructor). + +A complication is that due to non-max suppression, we are not guaranteed to get +the same number of proposals from the first stage RPN (region proposal network) +for each image (though in practice, we should often get the same number of +proposals). For this reason we pad to a max number of proposals per image +within a batch. This `self.max_num_proposals` property is set to the +`first_stage_max_proposals` parameter at inference time and the +`second_stage_batch_size` at training time since we subsample the batch to +be sent through the box classifier during training. + +For the second stage of the pipeline, we arrange the proposals for all images +within the batch along a single batch dimension. For example, the input to +_extract_box_classifier_features is a tensor of shape +`[total_num_proposals, crop_height, crop_width, depth]` where +total_num_proposals is batch_size * self.max_num_proposals. (And note that per +the above comment, a subset of these entries correspond to zero paddings.) + ++ Coordinate representations: +Following the API (see model.DetectionModel definition), our outputs after +postprocessing operations are always normalized boxes however, internally, we +sometimes convert to absolute --- e.g. 
for loss computation. In particular, +anchors and proposal_boxes are both represented as absolute coordinates. + +Images are resized in the `preprocess` method. + +The Faster R-CNN meta architecture has two post-processing methods +`_postprocess_rpn` which is applied after first stage and +`_postprocess_box_classifier` which is applied after second stage. There are +three different ways post-processing can happen depending on number_of_stages +configured in the meta architecture: + +1. When number_of_stages is 1: + `_postprocess_rpn` is run as part of the `postprocess` method where + true_image_shapes is used to clip proposals, perform non-max suppression and + normalize them. +2. When number of stages is 2: + `_postprocess_rpn` is run as part of the `_predict_second_stage` method where + `resized_image_shapes` is used to clip proposals, perform non-max suppression + and normalize them. In this case `postprocess` method skips `_postprocess_rpn` + and only runs `_postprocess_box_classifier` using `true_image_shapes` to clip + detections, perform non-max suppression and normalize them. +3. When number of stages is 3: + `_postprocess_rpn` is run as part of the `_predict_second_stage` using + `resized_image_shapes` to clip proposals, perform non-max suppression and + normalize them. Subsequently, `_postprocess_box_classifier` is run as part of + `_predict_third_stage` using `true_image_shapes` to clip detections, peform + non-max suppression and normalize them. In this case, the `postprocess` method + skips both `_postprocess_rpn` and `_postprocess_box_classifier`. +""" +from abc import abstractmethod +from functools import partial +import tensorflow as tf + +from object_detection.anchor_generators import grid_anchor_generator +from object_detection.core import balanced_positive_negative_sampler as sampler +from object_detection.core import box_list +from object_detection.core import box_list_ops +from object_detection.core import box_predictor +from object_detection.core import losses +from object_detection.core import model +from object_detection.core import post_processing +from object_detection.core import standard_fields as fields +from object_detection.core import target_assigner +from object_detection.utils import ops +from object_detection.utils import shape_utils + +slim = tf.contrib.slim + + +class FasterRCNNFeatureExtractor(object): + """Faster R-CNN Feature Extractor definition.""" + + def __init__(self, + is_training, + first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0): + """Constructor. + + Args: + is_training: A boolean indicating whether the training version of the + computation graph should be constructed. + first_stage_features_stride: Output stride of extracted RPN feature map. + batch_norm_trainable: Whether to update batch norm parameters during + training or not. When training with a relative large batch size + (e.g. 8), it could be desirable to enable batch norm update. + reuse_weights: Whether to reuse variables. Default is None. + weight_decay: float weight decay for feature extractor (default: 0.0). 
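The module docstring above says concrete models must subclass FasterRCNNFeatureExtractor and override preprocess, _extract_proposal_features and _extract_box_classifier_features. As a hypothetical illustration only (the single conv layers are placeholders, not the ResNet-50 extractor used by the configs in this directory), such a subclass could look like the sketch below.

# Editorial sketch (not part of the patch): a deliberately minimal, made-up
# feature extractor showing the methods a concrete subclass overrides.
import tensorflow as tf
from object_detection.meta_architectures import faster_rcnn_meta_arch


class DummyFeatureExtractor(faster_rcnn_meta_arch.FasterRCNNFeatureExtractor):

  def preprocess(self, resized_inputs):
    # Scale pixel values from [0, 255] to [-1, 1].
    return (2.0 / 255.0) * resized_inputs - 1.0

  def _extract_proposal_features(self, preprocessed_inputs, scope):
    # First-stage (RPN) feature map plus an activations dict.
    features = tf.layers.conv2d(preprocessed_inputs, 64, 3, padding='same',
                                name='proposal_features')
    return features, {'proposal_features': features}

  def _extract_box_classifier_features(self, proposal_feature_maps, scope):
    # Second-stage features computed on the cropped proposals.
    return tf.layers.conv2d(proposal_feature_maps, 64, 3, padding='same',
                            name='box_classifier_features')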
+ """ + self._is_training = is_training + self._first_stage_features_stride = first_stage_features_stride + self._train_batch_norm = (batch_norm_trainable and is_training) + self._reuse_weights = reuse_weights + self._weight_decay = weight_decay + + @abstractmethod + def preprocess(self, resized_inputs): + """Feature-extractor specific preprocessing (minus image resizing).""" + pass + + def extract_proposal_features(self, preprocessed_inputs, scope): + """Extracts first stage RPN features. + + This function is responsible for extracting feature maps from preprocessed + images. These features are used by the region proposal network (RPN) to + predict proposals. + + Args: + preprocessed_inputs: A [batch, height, width, channels] float tensor + representing a batch of images. + scope: A scope name. + + Returns: + rpn_feature_map: A tensor with shape [batch, height, width, depth] + activations: A dictionary mapping activation tensor names to tensors. + """ + with tf.variable_scope(scope, values=[preprocessed_inputs]): + return self._extract_proposal_features(preprocessed_inputs, scope) + + @abstractmethod + def _extract_proposal_features(self, preprocessed_inputs, scope): + """Extracts first stage RPN features, to be overridden.""" + pass + + def extract_box_classifier_features(self, proposal_feature_maps, scope): + """Extracts second stage box classifier features. + + Args: + proposal_feature_maps: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, crop_height, crop_width, depth] + representing the feature map cropped to each proposal. + scope: A scope name. + + Returns: + proposal_classifier_features: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, height, width, depth] + representing box classifier features for each proposal. + """ + with tf.variable_scope( + scope, values=[proposal_feature_maps], reuse=tf.AUTO_REUSE): + return self._extract_box_classifier_features(proposal_feature_maps, scope) + + @abstractmethod + def _extract_box_classifier_features(self, proposal_feature_maps, scope): + """Extracts second stage box classifier features, to be overridden.""" + pass + + def restore_from_classification_checkpoint_fn( + self, + first_stage_feature_extractor_scope, + second_stage_feature_extractor_scope): + """Returns a map of variables to load from a foreign checkpoint. + + Args: + first_stage_feature_extractor_scope: A scope name for the first stage + feature extractor. + second_stage_feature_extractor_scope: A scope name for the second stage + feature extractor. + + Returns: + A dict mapping variable names (to load from a checkpoint) to variables in + the model graph. 
+ """ + variables_to_restore = {} + for variable in tf.global_variables(): + for scope_name in [first_stage_feature_extractor_scope, + second_stage_feature_extractor_scope]: + if variable.op.name.startswith(scope_name): + var_name = variable.op.name.replace(scope_name + '/', '') + variables_to_restore[var_name] = variable + return variables_to_restore + + +class FasterRCNNMetaArch(model.DetectionModel): + """Faster R-CNN Meta-architecture definition.""" + + def __init__(self, + is_training, + num_classes, + image_resizer_fn, + feature_extractor, + number_of_stages, + first_stage_anchor_generator, + first_stage_atrous_rate, + first_stage_box_predictor_arg_scope_fn, + first_stage_box_predictor_kernel_size, + first_stage_box_predictor_depth, + first_stage_minibatch_size, + first_stage_positive_balance_fraction, + first_stage_nms_score_threshold, + first_stage_nms_iou_threshold, + first_stage_max_proposals, + first_stage_localization_loss_weight, + first_stage_objectness_loss_weight, + initial_crop_size, + maxpool_kernel_size, + maxpool_stride, + second_stage_mask_rcnn_box_predictor, + second_stage_batch_size, + second_stage_balance_fraction, + second_stage_non_max_suppression_fn, + second_stage_score_conversion_fn, + second_stage_localization_loss_weight, + second_stage_classification_loss_weight, + second_stage_classification_loss, + second_stage_mask_prediction_loss_weight=1.0, + hard_example_miner=None, + parallel_iterations=16, + add_summaries=True): + """FasterRCNNMetaArch Constructor. + + Args: + is_training: A boolean indicating whether the training version of the + computation graph should be constructed. + num_classes: Number of classes. Note that num_classes *does not* + include the background category, so if groundtruth labels take values + in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the + assigned classification targets can range from {0,... K}). + image_resizer_fn: A callable for image resizing. This callable + takes a rank-3 image tensor of shape [height, width, channels] + (corresponding to a single image), an optional rank-3 instance mask + tensor of shape [num_masks, height, width] and returns a resized rank-3 + image tensor, a resized mask tensor if one was provided in the input. In + addition this callable must also return a 1-D tensor of the form + [height, width, channels] containing the size of the true image, as the + image resizer can perform zero padding. See protos/image_resizer.proto. + feature_extractor: A FasterRCNNFeatureExtractor object. + number_of_stages: An integer values taking values in {1, 2, 3}. If + 1, the function will construct only the Region Proposal Network (RPN) + part of the model. If 2, the function will perform box refinement and + other auxiliary predictions all in the second stage. If 3, it will + extract features from refined boxes and perform the auxiliary + predictions on the non-maximum suppressed refined boxes. + If is_training is true and the value of number_of_stages is 3, it is + reduced to 2 since all the model heads are trained in parallel in second + stage during training. + first_stage_anchor_generator: An anchor_generator.AnchorGenerator object + (note that currently we only support + grid_anchor_generator.GridAnchorGenerator objects) + first_stage_atrous_rate: A single integer indicating the atrous rate for + the single convolution op which is applied to the `rpn_features_to_crop` + tensor to obtain a tensor to be used for box prediction. 
Some feature + extractors optionally allow for producing feature maps computed at + denser resolutions. The atrous rate is used to compensate for the + denser feature maps by using an effectively larger receptive field. + (This should typically be set to 1). + first_stage_box_predictor_arg_scope_fn: A function to construct tf-slim + arg_scope for conv2d, separable_conv2d and fully_connected ops for the + RPN box predictor. + first_stage_box_predictor_kernel_size: Kernel size to use for the + convolution op just prior to RPN box predictions. + first_stage_box_predictor_depth: Output depth for the convolution op + just prior to RPN box predictions. + first_stage_minibatch_size: The "batch size" to use for computing the + objectness and location loss of the region proposal network. This + "batch size" refers to the number of anchors selected as contributing + to the loss function for any given image within the image batch and is + only called "batch_size" due to terminology from the Faster R-CNN paper. + first_stage_positive_balance_fraction: Fraction of positive examples + per image for the RPN. The recommended value for Faster RCNN is 0.5. + first_stage_nms_score_threshold: Score threshold for non max suppression + for the Region Proposal Network (RPN). This value is expected to be in + [0, 1] as it is applied directly after a softmax transformation. The + recommended value for Faster R-CNN is 0. + first_stage_nms_iou_threshold: The Intersection Over Union (IOU) threshold + for performing Non-Max Suppression (NMS) on the boxes predicted by the + Region Proposal Network (RPN). + first_stage_max_proposals: Maximum number of boxes to retain after + performing Non-Max Suppression (NMS) on the boxes predicted by the + Region Proposal Network (RPN). + first_stage_localization_loss_weight: A float + first_stage_objectness_loss_weight: A float + initial_crop_size: A single integer indicating the output size + (width and height are set to be the same) of the initial bilinear + interpolation based cropping during ROI pooling. + maxpool_kernel_size: A single integer indicating the kernel size of the + max pool op on the cropped feature map during ROI pooling. + maxpool_stride: A single integer indicating the stride of the max pool + op on the cropped feature map during ROI pooling. + second_stage_mask_rcnn_box_predictor: Mask R-CNN box predictor to use for + the second stage. + second_stage_batch_size: The batch size used for computing the + classification and refined location loss of the box classifier. This + "batch size" refers to the number of proposals selected as contributing + to the loss function for any given image within the image batch and is + only called "batch_size" due to terminology from the Faster R-CNN paper. + second_stage_balance_fraction: Fraction of positive examples to use + per image for the box classifier. The recommended value for Faster RCNN + is 0.25. + second_stage_non_max_suppression_fn: batch_multiclass_non_max_suppression + callable that takes `boxes`, `scores`, optional `clip_window` and + optional (kwarg) `mask` inputs (with all other inputs already set) + and returns a dictionary containing tensors with keys: + `detection_boxes`, `detection_scores`, `detection_classes`, + `num_detections`, and (optionally) `detection_masks`. See + `post_processing.batch_multiclass_non_max_suppression` for the type and + shape of these tensors. + second_stage_score_conversion_fn: Callable elementwise nonlinearity + (that takes tensors as inputs and returns tensors). 
This is usually + used to convert logits to probabilities. + second_stage_localization_loss_weight: A float indicating the scale factor + for second stage localization loss. + second_stage_classification_loss_weight: A float indicating the scale + factor for second stage classification loss. + second_stage_classification_loss: Classification loss used by the second + stage classifier. Either losses.WeightedSigmoidClassificationLoss or + losses.WeightedSoftmaxClassificationLoss. + second_stage_mask_prediction_loss_weight: A float indicating the scale + factor for second stage mask prediction loss. This is applicable only if + second stage box predictor is configured to predict masks. + hard_example_miner: A losses.HardExampleMiner object (can be None). + parallel_iterations: (Optional) The number of iterations allowed to run + in parallel for calls to tf.map_fn. + add_summaries: boolean (default: True) controlling whether summary ops + should be added to tensorflow graph. + + Raises: + ValueError: If `second_stage_batch_size` > `first_stage_max_proposals` at + training time. + ValueError: If first_stage_anchor_generator is not of type + grid_anchor_generator.GridAnchorGenerator. + """ + # TODO(rathodv): add_summaries is currently unused. Respect that directive + # in the future. + super(FasterRCNNMetaArch, self).__init__(num_classes=num_classes) + + if is_training and second_stage_batch_size > first_stage_max_proposals: + raise ValueError('second_stage_batch_size should be no greater than ' + 'first_stage_max_proposals.') + if not isinstance(first_stage_anchor_generator, + grid_anchor_generator.GridAnchorGenerator): + raise ValueError('first_stage_anchor_generator must be of type ' + 'grid_anchor_generator.GridAnchorGenerator.') + + self._is_training = is_training + self._image_resizer_fn = image_resizer_fn + self._feature_extractor = feature_extractor + self._number_of_stages = number_of_stages + + # The first class is reserved as background. 
+ unmatched_cls_target = tf.constant( + [1] + self._num_classes * [0], dtype=tf.float32) + self._proposal_target_assigner = target_assigner.create_target_assigner( + 'FasterRCNN', 'proposal') + self._detector_target_assigner = target_assigner.create_target_assigner( + 'FasterRCNN', 'detection', unmatched_cls_target=unmatched_cls_target) + # Both proposal and detector target assigners use the same box coder + self._box_coder = self._proposal_target_assigner.box_coder + + # (First stage) Region proposal network parameters + self._first_stage_anchor_generator = first_stage_anchor_generator + self._first_stage_atrous_rate = first_stage_atrous_rate + self._first_stage_box_predictor_arg_scope_fn = ( + first_stage_box_predictor_arg_scope_fn) + self._first_stage_box_predictor_kernel_size = ( + first_stage_box_predictor_kernel_size) + self._first_stage_box_predictor_depth = first_stage_box_predictor_depth + self._first_stage_minibatch_size = first_stage_minibatch_size + self._first_stage_sampler = sampler.BalancedPositiveNegativeSampler( + positive_fraction=first_stage_positive_balance_fraction) + self._first_stage_box_predictor = box_predictor.ConvolutionalBoxPredictor( + self._is_training, num_classes=1, + conv_hyperparams_fn=self._first_stage_box_predictor_arg_scope_fn, + min_depth=0, max_depth=0, num_layers_before_predictor=0, + use_dropout=False, dropout_keep_prob=1.0, kernel_size=1, + box_code_size=self._box_coder.code_size) + + self._first_stage_nms_score_threshold = first_stage_nms_score_threshold + self._first_stage_nms_iou_threshold = first_stage_nms_iou_threshold + self._first_stage_max_proposals = first_stage_max_proposals + + self._first_stage_localization_loss = ( + losses.WeightedSmoothL1LocalizationLoss()) + self._first_stage_objectness_loss = ( + losses.WeightedSoftmaxClassificationLoss()) + self._first_stage_loc_loss_weight = first_stage_localization_loss_weight + self._first_stage_obj_loss_weight = first_stage_objectness_loss_weight + + # Per-region cropping parameters + self._initial_crop_size = initial_crop_size + self._maxpool_kernel_size = maxpool_kernel_size + self._maxpool_stride = maxpool_stride + + self._mask_rcnn_box_predictor = second_stage_mask_rcnn_box_predictor + + self._second_stage_batch_size = second_stage_batch_size + self._second_stage_sampler = sampler.BalancedPositiveNegativeSampler( + positive_fraction=second_stage_balance_fraction) + + self._second_stage_nms_fn = second_stage_non_max_suppression_fn + self._second_stage_score_conversion_fn = second_stage_score_conversion_fn + + self._second_stage_localization_loss = ( + losses.WeightedSmoothL1LocalizationLoss()) + self._second_stage_classification_loss = second_stage_classification_loss + self._second_stage_mask_loss = ( + losses.WeightedSigmoidClassificationLoss()) + self._second_stage_loc_loss_weight = second_stage_localization_loss_weight + self._second_stage_cls_loss_weight = second_stage_classification_loss_weight + self._second_stage_mask_loss_weight = ( + second_stage_mask_prediction_loss_weight) + self._hard_example_miner = hard_example_miner + self._parallel_iterations = parallel_iterations + + if self._number_of_stages <= 0 or self._number_of_stages > 3: + raise ValueError('Number of stages should be a value in {1, 2, 3}.') + + @property + def first_stage_feature_extractor_scope(self): + return 'FirstStageFeatureExtractor' + + @property + def second_stage_feature_extractor_scope(self): + return 'SecondStageFeatureExtractor' + + @property + def first_stage_box_predictor_scope(self): + return 
'FirstStageBoxPredictor' + + @property + def second_stage_box_predictor_scope(self): + return 'SecondStageBoxPredictor' + + @property + def max_num_proposals(self): + """Max number of proposals (to pad to) for each image in the input batch. + + At training time, this is set to be the `second_stage_batch_size` if hard + example miner is not configured, else it is set to + `first_stage_max_proposals`. At inference time, this is always set to + `first_stage_max_proposals`. + + Returns: + A positive integer. + """ + if self._is_training and not self._hard_example_miner: + return self._second_stage_batch_size + return self._first_stage_max_proposals + + @property + def anchors(self): + if not self._anchors: + raise RuntimeError('anchors have not been constructed yet!') + if not isinstance(self._anchors, box_list.BoxList): + raise RuntimeError('anchors should be a BoxList object, but is not.') + return self._anchors + + def preprocess(self, inputs): + """Feature-extractor specific preprocessing. + + See base class. + + For Faster R-CNN, we perform image resizing in the base class --- each + class subclassing FasterRCNNMetaArch is responsible for any additional + preprocessing (e.g., scaling pixel values to be in [-1, 1]). + + Args: + inputs: a [batch, height_in, width_in, channels] float tensor representing + a batch of images with values between 0 and 255.0. + + Returns: + preprocessed_inputs: a [batch, height_out, width_out, channels] float + tensor representing a batch of images. + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + Raises: + ValueError: if inputs tensor does not have type tf.float32 + """ + if inputs.dtype is not tf.float32: + raise ValueError('`preprocess` expects a tf.float32 tensor') + with tf.name_scope('Preprocessor'): + outputs = shape_utils.static_or_dynamic_map_fn( + self._image_resizer_fn, + elems=inputs, + dtype=[tf.float32, tf.int32], + parallel_iterations=self._parallel_iterations) + resized_inputs = outputs[0] + true_image_shapes = outputs[1] + return (self._feature_extractor.preprocess(resized_inputs), + true_image_shapes) + + def _compute_clip_window(self, image_shapes): + """Computes clip window for non max suppression based on image shapes. + + This function assumes that the clip window's left top corner is at (0, 0). + + Args: + image_shapes: A 2-D int32 tensor of shape [batch_size, 3] containing + shapes of images in the batch. Each row represents [height, width, + channels] of an image. + + Returns: + A 2-D float32 tensor of shape [batch_size, 4] containing the clip window + for each image in the form [ymin, xmin, ymax, xmax]. + """ + clip_heights = image_shapes[:, 0] + clip_widths = image_shapes[:, 1] + clip_window = tf.to_float(tf.stack([tf.zeros_like(clip_heights), + tf.zeros_like(clip_heights), + clip_heights, clip_widths], axis=1)) + return clip_window + + def predict(self, preprocessed_inputs, true_image_shapes): + """Predicts unpostprocessed tensors from input tensor. + + This function takes an input batch of images and runs it through the + forward pass of the network to yield "raw" un-postprocessed predictions. + If `number_of_stages` is 1, this function only returns first stage + RPN predictions (un-postprocessed). Otherwise it returns both + first stage RPN predictions as well as second stage box classifier + predictions. + + Other remarks: + + Anchor pruning vs. 
clipping: following the recommendation of the Faster + R-CNN paper, we prune anchors that venture outside the image window at + training time and clip anchors to the image window at inference time. + + Proposal padding: as described at the top of the file, proposals are + padded to self._max_num_proposals and flattened so that proposals from all + images within the input batch are arranged along the same batch dimension. + + Args: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + + Returns: + prediction_dict: a dictionary holding "raw" prediction tensors: + 1) rpn_box_predictor_features: A 4-D float32 tensor with shape + [batch_size, height, width, depth] to be used for predicting proposal + boxes and corresponding objectness scores. + 2) rpn_features_to_crop: A 4-D float32 tensor with shape + [batch_size, height, width, depth] representing image features to crop + using the proposal boxes predicted by the RPN. + 3) image_shape: a 1-D tensor of shape [4] representing the input + image shape. + 4) rpn_box_encodings: 3-D float tensor of shape + [batch_size, num_anchors, self._box_coder.code_size] containing + predicted boxes. + 5) rpn_objectness_predictions_with_background: 3-D float tensor of shape + [batch_size, num_anchors, 2] containing class + predictions (logits) for each of the anchors. Note that this + tensor *includes* background class predictions (at class index 0). + 6) anchors: A 2-D tensor of shape [num_anchors, 4] representing anchors + for the first stage RPN (in absolute coordinates). Note that + `num_anchors` can differ depending on whether the model is created in + training or inference mode. + + (and if number_of_stages > 1): + 7) refined_box_encodings: a 3-D tensor with shape + [total_num_proposals, num_classes, self._box_coder.code_size] + representing predicted (final) refined box encodings, where + total_num_proposals=batch_size*self._max_num_proposals. If using + a shared box across classes the shape will instead be + [total_num_proposals, 1, self._box_coder.code_size]. + 8) class_predictions_with_background: a 3-D tensor with shape + [total_num_proposals, num_classes + 1] containing class + predictions (logits) for each of the anchors, where + total_num_proposals=batch_size*self._max_num_proposals. + Note that this tensor *includes* background class predictions + (at class index 0). + 9) num_proposals: An int32 tensor of shape [batch_size] representing the + number of proposals generated by the RPN. `num_proposals` allows us + to keep track of which entries are to be treated as zero paddings and + which are not since we always pad the number of proposals to be + `self.max_num_proposals` for each image. + 10) proposal_boxes: A float32 tensor of shape + [batch_size, self.max_num_proposals, 4] representing + decoded proposal bounding boxes in absolute coordinates. + 11) mask_predictions: (optional) a 4-D tensor with shape + [total_num_padded_proposals, num_classes, mask_height, mask_width] + containing instance mask predictions. + + Raises: + ValueError: If `predict` is called before `preprocess`. 
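# Illustrative sketch (not part of the patch; boxes and window are assumed):
# pruning drops anchors that stick out of the image window, while clipping
# shrinks them to fit, matching the training/inference behaviour described above.
import numpy as np

anchors = np.array([[-10., -10., 50., 50.],    # partially outside the window
                    [ 20.,  20., 80., 80.]])   # fully inside the window
window = np.array([0., 0., 100., 100.])        # [ymin, xmin, ymax, xmax]

inside = ((anchors[:, 0] >= window[0]) & (anchors[:, 1] >= window[1]) &
          (anchors[:, 2] <= window[2]) & (anchors[:, 3] <= window[3]))
pruned = anchors[inside]                       # training: first anchor is dropped
clipped = np.clip(anchors, window[[0, 1, 0, 1]], window[[2, 3, 2, 3]])
# inference: the first anchor becomes [0., 0., 50., 50.]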
+ """ + (rpn_box_predictor_features, rpn_features_to_crop, anchors_boxlist, + image_shape) = self._extract_rpn_feature_maps(preprocessed_inputs) + (rpn_box_encodings, rpn_objectness_predictions_with_background + ) = self._predict_rpn_proposals(rpn_box_predictor_features) + + # The Faster R-CNN paper recommends pruning anchors that venture outside + # the image window at training time and clipping at inference time. + clip_window = tf.to_float(tf.stack([0, 0, image_shape[1], image_shape[2]])) + if self._is_training: + (rpn_box_encodings, rpn_objectness_predictions_with_background, + anchors_boxlist) = self._remove_invalid_anchors_and_predictions( + rpn_box_encodings, rpn_objectness_predictions_with_background, + anchors_boxlist, clip_window) + else: + anchors_boxlist = box_list_ops.clip_to_window( + anchors_boxlist, clip_window) + + self._anchors = anchors_boxlist + prediction_dict = { + 'rpn_box_predictor_features': rpn_box_predictor_features, + 'rpn_features_to_crop': rpn_features_to_crop, + 'image_shape': image_shape, + 'rpn_box_encodings': rpn_box_encodings, + 'rpn_objectness_predictions_with_background': + rpn_objectness_predictions_with_background, + 'anchors': self._anchors.get() + } + + if self._number_of_stages >= 2: + prediction_dict.update(self._predict_second_stage( + rpn_box_encodings, + rpn_objectness_predictions_with_background, + rpn_features_to_crop, + self._anchors.get(), image_shape, true_image_shapes)) + + if self._number_of_stages == 3: + prediction_dict = self._predict_third_stage( + prediction_dict, true_image_shapes) + + return prediction_dict + + def _image_batch_shape_2d(self, image_batch_shape_1d): + """Takes a 1-D image batch shape tensor and converts it to a 2-D tensor. + + Example: + If 1-D image batch shape tensor is [2, 300, 300, 3]. The corresponding 2-D + image batch tensor would be [[300, 300, 3], [300, 300, 3]] + + Args: + image_batch_shape_1d: 1-D tensor of the form [batch_size, height, + width, channels]. + + Returns: + image_batch_shape_2d: 2-D tensor of shape [batch_size, 3] were each row is + of the form [height, width, channels]. + """ + return tf.tile(tf.expand_dims(image_batch_shape_1d[1:], 0), + [image_batch_shape_1d[0], 1]) + + def _predict_second_stage(self, rpn_box_encodings, + rpn_objectness_predictions_with_background, + rpn_features_to_crop, + anchors, + image_shape, + true_image_shapes): + """Predicts the output tensors from second stage of Faster R-CNN. + + Args: + rpn_box_encodings: 4-D float tensor of shape + [batch_size, num_valid_anchors, self._box_coder.code_size] containing + predicted boxes. + rpn_objectness_predictions_with_background: 2-D float tensor of shape + [batch_size, num_valid_anchors, 2] containing class + predictions (logits) for each of the anchors. Note that this + tensor *includes* background class predictions (at class index 0). + rpn_features_to_crop: A 4-D float32 tensor with shape + [batch_size, height, width, depth] representing image features to crop + using the proposal boxes predicted by the RPN. + anchors: 2-D float tensor of shape + [num_anchors, self._box_coder.code_size]. + image_shape: A 1D int32 tensors of size [4] containing the image shape. + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. 
+ + Returns: + prediction_dict: a dictionary holding "raw" prediction tensors: + 1) refined_box_encodings: a 3-D tensor with shape + [total_num_proposals, num_classes, self._box_coder.code_size] + representing predicted (final) refined box encodings, where + total_num_proposals=batch_size*self._max_num_proposals. If using a + shared box across classes the shape will instead be + [total_num_proposals, 1, self._box_coder.code_size]. + 2) class_predictions_with_background: a 3-D tensor with shape + [total_num_proposals, num_classes + 1] containing class + predictions (logits) for each of the anchors, where + total_num_proposals=batch_size*self._max_num_proposals. + Note that this tensor *includes* background class predictions + (at class index 0). + 3) num_proposals: An int32 tensor of shape [batch_size] representing the + number of proposals generated by the RPN. `num_proposals` allows us + to keep track of which entries are to be treated as zero paddings and + which are not since we always pad the number of proposals to be + `self.max_num_proposals` for each image. + 4) proposal_boxes: A float32 tensor of shape + [batch_size, self.max_num_proposals, 4] representing + decoded proposal bounding boxes in absolute coordinates. + 5) proposal_boxes_normalized: A float32 tensor of shape + [batch_size, self.max_num_proposals, 4] representing decoded proposal + bounding boxes in normalized coordinates. Can be used to override the + boxes proposed by the RPN, thus enabling one to extract features and + get box classification and prediction for externally selected areas + of the image. + 6) box_classifier_features: a 4-D float32 tensor representing the + features for each proposal. + """ + image_shape_2d = self._image_batch_shape_2d(image_shape) + proposal_boxes_normalized, _, num_proposals = self._postprocess_rpn( + rpn_box_encodings, rpn_objectness_predictions_with_background, + anchors, image_shape_2d, true_image_shapes) + + flattened_proposal_feature_maps = ( + self._compute_second_stage_input_feature_maps( + rpn_features_to_crop, proposal_boxes_normalized)) + + box_classifier_features = ( + self._feature_extractor.extract_box_classifier_features( + flattened_proposal_feature_maps, + scope=self.second_stage_feature_extractor_scope)) + + box_predictions = self._mask_rcnn_box_predictor.predict( + [box_classifier_features], + num_predictions_per_location=[1], + scope=self.second_stage_box_predictor_scope, + predict_boxes_and_classes=True) + + refined_box_encodings = tf.squeeze( + box_predictions[box_predictor.BOX_ENCODINGS], + axis=1, name='all_refined_box_encodings') + class_predictions_with_background = tf.squeeze( + box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], + axis=1, name='all_class_predictions_with_background') + + absolute_proposal_boxes = ops.normalized_to_image_coordinates( + proposal_boxes_normalized, image_shape, self._parallel_iterations) + + prediction_dict = { + 'refined_box_encodings': refined_box_encodings, + 'class_predictions_with_background': + class_predictions_with_background, + 'num_proposals': num_proposals, + 'proposal_boxes': absolute_proposal_boxes, + 'box_classifier_features': box_classifier_features, + 'proposal_boxes_normalized': proposal_boxes_normalized, + } + + return prediction_dict + + def _predict_third_stage(self, prediction_dict, image_shapes): + """Predicts non-box, non-class outputs using refined detections. 
+ + For training, masks are predicted directly on the box_classifier_features, + which are region-features from the initial anchor boxes. + For inference, this happens after calling the post-processing stage, such + that masks are only calculated for the top scored boxes. + + Args: + prediction_dict: a dictionary holding "raw" prediction tensors: + 1) refined_box_encodings: a 3-D tensor with shape + [total_num_proposals, num_classes, self._box_coder.code_size] + representing predicted (final) refined box encodings, where + total_num_proposals=batch_size*self._max_num_proposals. If using a + shared box across classes the shape will instead be + [total_num_proposals, 1, self._box_coder.code_size]. + 2) class_predictions_with_background: a 3-D tensor with shape + [total_num_proposals, num_classes + 1] containing class + predictions (logits) for each of the anchors, where + total_num_proposals=batch_size*self._max_num_proposals. + Note that this tensor *includes* background class predictions + (at class index 0). + 3) num_proposals: An int32 tensor of shape [batch_size] representing the + number of proposals generated by the RPN. `num_proposals` allows us + to keep track of which entries are to be treated as zero paddings and + which are not since we always pad the number of proposals to be + `self.max_num_proposals` for each image. + 4) proposal_boxes: A float32 tensor of shape + [batch_size, self.max_num_proposals, 4] representing + decoded proposal bounding boxes in absolute coordinates. + 5) box_classifier_features: a 4-D float32 tensor representing the + features for each proposal. + image_shapes: A 2-D int32 tensor of shape [batch_size, 3] containing + shapes of images in the batch. + + Returns: + prediction_dict: a dictionary that, in addition to the input predictions, + also holds the following predictions: + 1) mask_predictions: a 4-D tensor with shape + [batch_size, max_detection, mask_height, mask_width] containing + instance mask predictions.
+ """ + if self._is_training: + curr_box_classifier_features = prediction_dict['box_classifier_features'] + detection_classes = prediction_dict['class_predictions_with_background'] + mask_predictions = self._mask_rcnn_box_predictor.predict( + [curr_box_classifier_features], + num_predictions_per_location=[1], + scope=self.second_stage_box_predictor_scope, + predict_boxes_and_classes=False, + predict_auxiliary_outputs=True) + prediction_dict['mask_predictions'] = tf.squeeze(mask_predictions[ + box_predictor.MASK_PREDICTIONS], axis=1) + else: + detections_dict = self._postprocess_box_classifier( + prediction_dict['refined_box_encodings'], + prediction_dict['class_predictions_with_background'], + prediction_dict['proposal_boxes'], + prediction_dict['num_proposals'], + image_shapes) + prediction_dict.update(detections_dict) + detection_boxes = detections_dict[ + fields.DetectionResultFields.detection_boxes] + detection_classes = detections_dict[ + fields.DetectionResultFields.detection_classes] + rpn_features_to_crop = prediction_dict['rpn_features_to_crop'] + batch_size = tf.shape(detection_boxes)[0] + max_detection = tf.shape(detection_boxes)[1] + flattened_detected_feature_maps = ( + self._compute_second_stage_input_feature_maps( + rpn_features_to_crop, detection_boxes)) + curr_box_classifier_features = ( + self._feature_extractor.extract_box_classifier_features( + flattened_detected_feature_maps, + scope=self.second_stage_feature_extractor_scope)) + + mask_predictions = self._mask_rcnn_box_predictor.predict( + [curr_box_classifier_features], + num_predictions_per_location=[1], + scope=self.second_stage_box_predictor_scope, + predict_boxes_and_classes=False, + predict_auxiliary_outputs=True) + + detection_masks = tf.squeeze(mask_predictions[ + box_predictor.MASK_PREDICTIONS], axis=1) + + _, num_classes, mask_height, mask_width = ( + detection_masks.get_shape().as_list()) + _, max_detection = detection_classes.get_shape().as_list() + if num_classes > 1: + detection_masks = self._gather_instance_masks( + detection_masks, detection_classes) + + prediction_dict[fields.DetectionResultFields.detection_masks] = ( + tf.reshape(detection_masks, + [batch_size, max_detection, mask_height, mask_width])) + + return prediction_dict + + def _gather_instance_masks(self, instance_masks, classes): + """Gathers the masks that correspond to classes. + + Args: + instance_masks: A 4-D float32 tensor with shape + [K, num_classes, mask_height, mask_width]. + classes: A 2-D int32 tensor with shape [batch_size, max_detection]. + + Returns: + masks: a 3-D float32 tensor with shape [K, mask_height, mask_width]. + """ + _, num_classes, height, width = instance_masks.get_shape().as_list() + k = tf.shape(instance_masks)[0] + instance_masks = tf.reshape(instance_masks, [-1, height, width]) + classes = tf.to_int32(tf.reshape(classes, [-1])) + gather_idx = tf.range(k) * num_classes + classes + return tf.gather(instance_masks, gather_idx) + + def _extract_rpn_feature_maps(self, preprocessed_inputs): + """Extracts RPN features. + + This function extracts two feature maps: a feature map to be directly + fed to a box predictor (to predict location and objectness scores for + proposals) and a feature map from which to crop regions which will then + be sent to the second stage box classifier. + + Args: + preprocessed_inputs: a [batch, height, width, channels] image tensor. 
+ + Returns: + rpn_box_predictor_features: A 4-D float32 tensor with shape + [batch, height, width, depth] to be used for predicting proposal boxes + and corresponding objectness scores. + rpn_features_to_crop: A 4-D float32 tensor with shape + [batch, height, width, depth] representing image features to crop using + the proposals boxes. + anchors: A BoxList representing anchors (for the RPN) in + absolute coordinates. + image_shape: A 1-D tensor representing the input image shape. + """ + image_shape = tf.shape(preprocessed_inputs) + rpn_features_to_crop, _ = self._feature_extractor.extract_proposal_features( + preprocessed_inputs, scope=self.first_stage_feature_extractor_scope) + + feature_map_shape = tf.shape(rpn_features_to_crop) + anchors = box_list_ops.concatenate( + self._first_stage_anchor_generator.generate([(feature_map_shape[1], + feature_map_shape[2])])) + with slim.arg_scope(self._first_stage_box_predictor_arg_scope_fn()): + kernel_size = self._first_stage_box_predictor_kernel_size + rpn_box_predictor_features = slim.conv2d( + rpn_features_to_crop, + self._first_stage_box_predictor_depth, + kernel_size=[kernel_size, kernel_size], + rate=self._first_stage_atrous_rate, + activation_fn=tf.nn.relu6) + return (rpn_box_predictor_features, rpn_features_to_crop, + anchors, image_shape) + + def _predict_rpn_proposals(self, rpn_box_predictor_features): + """Adds box predictors to RPN feature map to predict proposals. + + Note resulting tensors will not have been postprocessed. + + Args: + rpn_box_predictor_features: A 4-D float32 tensor with shape + [batch, height, width, depth] to be used for predicting proposal boxes + and corresponding objectness scores. + + Returns: + box_encodings: 3-D float tensor of shape + [batch_size, num_anchors, self._box_coder.code_size] containing + predicted boxes. + objectness_predictions_with_background: 3-D float tensor of shape + [batch_size, num_anchors, 2] containing class + predictions (logits) for each of the anchors. Note that this + tensor *includes* background class predictions (at class index 0). + + Raises: + RuntimeError: if the anchor generator generates anchors corresponding to + multiple feature maps. We currently assume that a single feature map + is generated for the RPN. + """ + num_anchors_per_location = ( + self._first_stage_anchor_generator.num_anchors_per_location()) + if len(num_anchors_per_location) != 1: + raise RuntimeError('anchor_generator is expected to generate anchors ' + 'corresponding to a single feature map.') + box_predictions = self._first_stage_box_predictor.predict( + [rpn_box_predictor_features], + num_anchors_per_location, + scope=self.first_stage_box_predictor_scope) + + box_encodings = tf.concat( + box_predictions[box_predictor.BOX_ENCODINGS], axis=1) + objectness_predictions_with_background = tf.concat( + box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], + axis=1) + return (tf.squeeze(box_encodings, axis=2), + objectness_predictions_with_background) + + def _remove_invalid_anchors_and_predictions( + self, + box_encodings, + objectness_predictions_with_background, + anchors_boxlist, + clip_window): + """Removes anchors that (partially) fall outside an image. + + Also removes associated box encodings and objectness predictions. + + Args: + box_encodings: 3-D float tensor of shape + [batch_size, num_anchors, self._box_coder.code_size] containing + predicted boxes. 
+ objectness_predictions_with_background: 3-D float tensor of shape + [batch_size, num_anchors, 2] containing class + predictions (logits) for each of the anchors. Note that this + tensor *includes* background class predictions (at class index 0). + anchors_boxlist: A BoxList representing num_anchors anchors (for the RPN) + in absolute coordinates. + clip_window: a 1-D tensor representing the [ymin, xmin, ymax, xmax] + extent of the window to clip/prune to. + + Returns: + box_encodings: 4-D float tensor of shape + [batch_size, num_valid_anchors, self._box_coder.code_size] containing + predicted boxes, where num_valid_anchors <= num_anchors + objectness_predictions_with_background: 2-D float tensor of shape + [batch_size, num_valid_anchors, 2] containing class + predictions (logits) for each of the anchors, where + num_valid_anchors <= num_anchors. Note that this + tensor *includes* background class predictions (at class index 0). + anchors: A BoxList representing num_valid_anchors anchors (for the RPN) in + absolute coordinates. + """ + pruned_anchors_boxlist, keep_indices = box_list_ops.prune_outside_window( + anchors_boxlist, clip_window) + def _batch_gather_kept_indices(predictions_tensor): + return shape_utils.static_or_dynamic_map_fn( + partial(tf.gather, indices=keep_indices), + elems=predictions_tensor, + dtype=tf.float32, + parallel_iterations=self._parallel_iterations, + back_prop=True) + return (_batch_gather_kept_indices(box_encodings), + _batch_gather_kept_indices(objectness_predictions_with_background), + pruned_anchors_boxlist) + + def _flatten_first_two_dimensions(self, inputs): + """Flattens `K-d` tensor along batch dimension to be a `(K-1)-d` tensor. + + Converts `inputs` with shape [A, B, ..., depth] into a tensor of shape + [A * B, ..., depth]. + + Args: + inputs: A float tensor with shape [A, B, ..., depth]. Note that the first + two and last dimensions must be statically defined. + Returns: + A float tensor with shape [A * B, ..., depth] (where the first and last + dimension are statically defined. + """ + combined_shape = shape_utils.combined_static_and_dynamic_shape(inputs) + flattened_shape = tf.stack([combined_shape[0] * combined_shape[1]] + + combined_shape[2:]) + return tf.reshape(inputs, flattened_shape) + + def postprocess(self, prediction_dict, true_image_shapes): + """Convert prediction tensors to final detections. + + This function converts raw predictions tensors to final detection results. + See base class for output format conventions. Note also that by default, + scores are to be interpreted as logits, but if a score_converter is used, + then scores are remapped (and may thus have a different interpretation). + + If number_of_stages=1, the returned results represent proposals from the + first stage RPN and are padded to have self.max_num_proposals for each + image; otherwise, the results can be interpreted as multiclass detections + from the full two-stage model and are padded to self._max_detections. + + Args: + prediction_dict: a dictionary holding prediction tensors (see the + documentation for the predict method. If number_of_stages=1, we + expect prediction_dict to contain `rpn_box_encodings`, + `rpn_objectness_predictions_with_background`, `rpn_features_to_crop`, + and `anchors` fields. Otherwise we expect prediction_dict to + additionally contain `refined_box_encodings`, + `class_predictions_with_background`, `num_proposals`, + `proposal_boxes` and, optionally, `mask_predictions` fields. 
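# Illustrative sketch (not part of the patch; shapes are assumed): flattening
# the batch and proposal dimensions so proposals from every image share one
# batch dimension, as _flatten_first_two_dimensions does.
import numpy as np

batch_size, max_num_proposals, code_size = 2, 3, 4
boxes = np.zeros((batch_size, max_num_proposals, code_size))
flat_boxes = boxes.reshape(batch_size * max_num_proposals, code_size)
# flat_boxes.shape -> (6, 4)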
+ true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + + Returns: + detections: a dictionary containing the following fields + detection_boxes: [batch, max_detection, 4] + detection_scores: [batch, max_detections] + detection_classes: [batch, max_detections] + (this entry is only created if rpn_mode=False) + num_detections: [batch] + + Raises: + ValueError: If `predict` is called before `preprocess`. + """ + + with tf.name_scope('FirstStagePostprocessor'): + if self._number_of_stages == 1: + proposal_boxes, proposal_scores, num_proposals = self._postprocess_rpn( + prediction_dict['rpn_box_encodings'], + prediction_dict['rpn_objectness_predictions_with_background'], + prediction_dict['anchors'], + true_image_shapes, + true_image_shapes) + return { + fields.DetectionResultFields.detection_boxes: proposal_boxes, + fields.DetectionResultFields.detection_scores: proposal_scores, + fields.DetectionResultFields.num_detections: + tf.to_float(num_proposals), + } + + # TODO(jrru): Remove mask_predictions from _post_process_box_classifier. + with tf.name_scope('SecondStagePostprocessor'): + if (self._number_of_stages == 2 or + (self._number_of_stages == 3 and self._is_training)): + mask_predictions = prediction_dict.get(box_predictor.MASK_PREDICTIONS) + detections_dict = self._postprocess_box_classifier( + prediction_dict['refined_box_encodings'], + prediction_dict['class_predictions_with_background'], + prediction_dict['proposal_boxes'], + prediction_dict['num_proposals'], + true_image_shapes, + mask_predictions=mask_predictions) + return detections_dict + + if self._number_of_stages == 3: + # Post processing is already performed in 3rd stage. We need to transfer + # postprocessed tensors from `prediction_dict` to `detections_dict`. + detections_dict = {} + for key in prediction_dict: + if key == fields.DetectionResultFields.detection_masks: + detections_dict[key] = tf.sigmoid(prediction_dict[key]) + elif 'detection' in key: + detections_dict[key] = prediction_dict[key] + return detections_dict + + def _postprocess_rpn(self, + rpn_box_encodings_batch, + rpn_objectness_predictions_with_background_batch, + anchors, + image_shapes, + true_image_shapes): + """Converts first stage prediction tensors from the RPN to proposals. + + This function decodes the raw RPN predictions, runs non-max suppression + on the result. + + Note that the behavior of this function is slightly modified during + training --- specifically, we stop the gradient from passing through the + proposal boxes and we only return a balanced sampled subset of proposals + with size `second_stage_batch_size`. + + Args: + rpn_box_encodings_batch: A 3-D float32 tensor of shape + [batch_size, num_anchors, self._box_coder.code_size] containing + predicted proposal box encodings. + rpn_objectness_predictions_with_background_batch: A 3-D float tensor of + shape [batch_size, num_anchors, 2] containing objectness predictions + (logits) for each of the anchors with 0 corresponding to background + and 1 corresponding to object. + anchors: A 2-D tensor of shape [num_anchors, 4] representing anchors + for the first stage RPN. Note that `num_anchors` can differ depending + on whether the model is created in training or inference mode. + image_shapes: A 2-D tensor of shape [batch, 3] containing the shapes of + images in the batch. 
+ true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + + Returns: + proposal_boxes: A float tensor with shape + [batch_size, max_num_proposals, 4] representing the (potentially zero + padded) proposal boxes for all images in the batch. These boxes are + represented as normalized coordinates. + proposal_scores: A float tensor with shape + [batch_size, max_num_proposals] representing the (potentially zero + padded) proposal objectness scores for all images in the batch. + num_proposals: A Tensor of type `int32`. A 1-D tensor of shape [batch] + representing the number of proposals predicted for each image in + the batch. + """ + rpn_box_encodings_batch = tf.expand_dims(rpn_box_encodings_batch, axis=2) + rpn_encodings_shape = shape_utils.combined_static_and_dynamic_shape( + rpn_box_encodings_batch) + tiled_anchor_boxes = tf.tile( + tf.expand_dims(anchors, 0), [rpn_encodings_shape[0], 1, 1]) + proposal_boxes = self._batch_decode_boxes(rpn_box_encodings_batch, + tiled_anchor_boxes) + proposal_boxes = tf.squeeze(proposal_boxes, axis=2) + rpn_objectness_softmax_without_background = tf.nn.softmax( + rpn_objectness_predictions_with_background_batch)[:, :, 1] + clip_window = self._compute_clip_window(image_shapes) + (proposal_boxes, proposal_scores, _, _, _, + num_proposals) = post_processing.batch_multiclass_non_max_suppression( + tf.expand_dims(proposal_boxes, axis=2), + tf.expand_dims(rpn_objectness_softmax_without_background, + axis=2), + self._first_stage_nms_score_threshold, + self._first_stage_nms_iou_threshold, + self._first_stage_max_proposals, + self._first_stage_max_proposals, + clip_window=clip_window) + if self._is_training: + proposal_boxes = tf.stop_gradient(proposal_boxes) + if not self._hard_example_miner: + (groundtruth_boxlists, groundtruth_classes_with_background_list, _, + _) = self._format_groundtruth_data(true_image_shapes) + (proposal_boxes, proposal_scores, + num_proposals) = self._unpad_proposals_and_sample_box_classifier_batch( + proposal_boxes, proposal_scores, num_proposals, + groundtruth_boxlists, groundtruth_classes_with_background_list) + # normalize proposal boxes + def normalize_boxes(args): + proposal_boxes_per_image = args[0] + image_shape = args[1] + normalized_boxes_per_image = box_list_ops.to_normalized_coordinates( + box_list.BoxList(proposal_boxes_per_image), image_shape[0], + image_shape[1], check_range=False).get() + return normalized_boxes_per_image + normalized_proposal_boxes = shape_utils.static_or_dynamic_map_fn( + normalize_boxes, elems=[proposal_boxes, image_shapes], dtype=tf.float32) + return normalized_proposal_boxes, proposal_scores, num_proposals + + def _unpad_proposals_and_sample_box_classifier_batch( + self, + proposal_boxes, + proposal_scores, + num_proposals, + groundtruth_boxlists, + groundtruth_classes_with_background_list): + """Unpads proposals and samples a minibatch for second stage. + + Args: + proposal_boxes: A float tensor with shape + [batch_size, num_proposals, 4] representing the (potentially zero + padded) proposal boxes for all images in the batch. These boxes are + represented in absolute coordinates. + proposal_scores: A float tensor with shape + [batch_size, num_proposals] representing the (potentially zero + padded) proposal objectness scores for all images in the batch. + num_proposals: A Tensor of type `int32`. 
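# Illustrative sketch (not part of the patch; logits are assumed): the RPN
# emits (background, object) logits per anchor; after a softmax only the
# object column (index 1) is kept as the proposal score.
import numpy as np

logits = np.array([[ 2.0, -1.0],    # likely background
                   [-0.5,  3.0]])   # likely object
exps = np.exp(logits - logits.max(axis=1, keepdims=True))
softmax = exps / exps.sum(axis=1, keepdims=True)
objectness_scores = softmax[:, 1]   # -> approx [0.047, 0.971]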
A 1-D tensor of shape [batch] + representing the number of proposals predicted for each image in + the batch. + groundtruth_boxlists: A list of BoxLists containing (absolute) coordinates + of the groundtruth boxes. + groundtruth_classes_with_background_list: A list of 2-D one-hot + (or k-hot) tensors of shape [num_boxes, num_classes+1] containing the + class targets with the 0th index assumed to map to the background class. + + Returns: + proposal_boxes: A float tensor with shape + [batch_size, second_stage_batch_size, 4] representing the (potentially + zero padded) proposal boxes for all images in the batch. These boxes + are represented in absolute coordinates. + proposal_scores: A float tensor with shape + [batch_size, second_stage_batch_size] representing the (potentially zero + padded) proposal objectness scores for all images in the batch. + num_proposals: A Tensor of type `int32`. A 1-D tensor of shape [batch] + representing the number of proposals predicted for each image in + the batch. + """ + single_image_proposal_box_sample = [] + single_image_proposal_score_sample = [] + single_image_num_proposals_sample = [] + for (single_image_proposal_boxes, + single_image_proposal_scores, + single_image_num_proposals, + single_image_groundtruth_boxlist, + single_image_groundtruth_classes_with_background) in zip( + tf.unstack(proposal_boxes), + tf.unstack(proposal_scores), + tf.unstack(num_proposals), + groundtruth_boxlists, + groundtruth_classes_with_background_list): + static_shape = single_image_proposal_boxes.get_shape() + sliced_static_shape = tf.TensorShape([tf.Dimension(None), + static_shape.dims[-1]]) + single_image_proposal_boxes = tf.slice( + single_image_proposal_boxes, + [0, 0], + [single_image_num_proposals, -1]) + single_image_proposal_boxes.set_shape(sliced_static_shape) + + single_image_proposal_scores = tf.slice(single_image_proposal_scores, + [0], + [single_image_num_proposals]) + single_image_boxlist = box_list.BoxList(single_image_proposal_boxes) + single_image_boxlist.add_field(fields.BoxListFields.scores, + single_image_proposal_scores) + sampled_boxlist = self._sample_box_classifier_minibatch( + single_image_boxlist, + single_image_groundtruth_boxlist, + single_image_groundtruth_classes_with_background) + sampled_padded_boxlist = box_list_ops.pad_or_clip_box_list( + sampled_boxlist, + num_boxes=self._second_stage_batch_size) + single_image_num_proposals_sample.append(tf.minimum( + sampled_boxlist.num_boxes(), + self._second_stage_batch_size)) + bb = sampled_padded_boxlist.get() + single_image_proposal_box_sample.append(bb) + single_image_proposal_score_sample.append( + sampled_padded_boxlist.get_field(fields.BoxListFields.scores)) + return (tf.stack(single_image_proposal_box_sample), + tf.stack(single_image_proposal_score_sample), + tf.stack(single_image_num_proposals_sample)) + + def _format_groundtruth_data(self, true_image_shapes): + """Helper function for preparing groundtruth data for target assignment. + + In order to be consistent with the model.DetectionModel interface, + groundtruth boxes are specified in normalized coordinates and classes are + specified as label indices with no assumed background category. To prepare + for target assignment, we: + 1) convert boxes to absolute coordinates, + 2) add a background class at class index 0 + 3) groundtruth instance masks, if available, are resized to match + image_shape. 
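# Illustrative sketch (not part of the patch; labels are assumed): groundtruth
# classes arrive as one-hot vectors without a background column; padding one
# zero column on the left makes index 0 the background class, as done below
# with tf.pad.
import numpy as np

one_hot = np.array([[1, 0, 0],      # class 0
                    [0, 0, 1]])     # class 2
with_background = np.pad(one_hot, ((0, 0), (1, 0)), mode='constant')
# -> [[0, 1, 0, 0],
#     [0, 0, 0, 1]]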
+ + Args: + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + + Returns: + groundtruth_boxlists: A list of BoxLists containing (absolute) coordinates + of the groundtruth boxes. + groundtruth_classes_with_background_list: A list of 2-D one-hot + (or k-hot) tensors of shape [num_boxes, num_classes+1] containing the + class targets with the 0th index assumed to map to the background class. + groundtruth_masks_list: If present, a list of 3-D tf.float32 tensors of + shape [num_boxes, image_height, image_width] containing instance masks. + This is set to None if no masks exist in the provided groundtruth. + """ + groundtruth_boxlists = [ + box_list_ops.to_absolute_coordinates( + box_list.BoxList(boxes), true_image_shapes[i, 0], + true_image_shapes[i, 1]) + for i, boxes in enumerate( + self.groundtruth_lists(fields.BoxListFields.boxes)) + ] + groundtruth_classes_with_background_list = [ + tf.to_float( + tf.pad(one_hot_encoding, [[0, 0], [1, 0]], mode='CONSTANT')) + for one_hot_encoding in self.groundtruth_lists( + fields.BoxListFields.classes)] + + groundtruth_masks_list = self._groundtruth_lists.get( + fields.BoxListFields.masks) + if groundtruth_masks_list is not None: + resized_masks_list = [] + for mask in groundtruth_masks_list: + _, resized_mask, _ = self._image_resizer_fn( + # Reuse the given `image_resizer_fn` to resize groundtruth masks. + # `mask` tensor for an image is of the shape [num_masks, + # image_height, image_width]. Below we create a dummy image of the + # the shape [image_height, image_width, 1] to use with + # `image_resizer_fn`. + image=tf.zeros(tf.stack([tf.shape(mask)[1], tf.shape(mask)[2], 1])), + masks=mask) + resized_masks_list.append(resized_mask) + + groundtruth_masks_list = resized_masks_list + groundtruth_weights_list = None + if self.groundtruth_has_field(fields.BoxListFields.weights): + groundtruth_weights_list = self.groundtruth_lists( + fields.BoxListFields.weights) + + return (groundtruth_boxlists, groundtruth_classes_with_background_list, + groundtruth_masks_list, groundtruth_weights_list) + + def _sample_box_classifier_minibatch(self, + proposal_boxlist, + groundtruth_boxlist, + groundtruth_classes_with_background): + """Samples a mini-batch of proposals to be sent to the box classifier. + + Helper function for self._postprocess_rpn. + + Args: + proposal_boxlist: A BoxList containing K proposal boxes in absolute + coordinates. + groundtruth_boxlist: A Boxlist containing N groundtruth object boxes in + absolute coordinates. + groundtruth_classes_with_background: A tensor with shape + `[N, self.num_classes + 1]` representing groundtruth classes. The + classes are assumed to be k-hot encoded, and include background as the + zero-th class. + + Returns: + a BoxList contained sampled proposals. + """ + (cls_targets, cls_weights, _, _, _) = self._detector_target_assigner.assign( + proposal_boxlist, groundtruth_boxlist, + groundtruth_classes_with_background) + # Selects all boxes as candidates if none of them is selected according + # to cls_weights. This could happen as boxes within certain IOU ranges + # are ignored. If triggered, the selected boxes will still be ignored + # during loss computation. 
+ cls_weights += tf.to_float(tf.equal(tf.reduce_sum(cls_weights), 0)) + positive_indicator = tf.greater(tf.argmax(cls_targets, axis=1), 0) + sampled_indices = self._second_stage_sampler.subsample( + tf.cast(cls_weights, tf.bool), + self._second_stage_batch_size, + positive_indicator) + return box_list_ops.boolean_mask(proposal_boxlist, sampled_indices) + + def _compute_second_stage_input_feature_maps(self, features_to_crop, + proposal_boxes_normalized): + """Crops to a set of proposals from the feature map for a batch of images. + + Helper function for self._postprocess_rpn. This function calls + `tf.image.crop_and_resize` to create the feature map to be passed to the + second stage box classifier for each proposal. + + Args: + features_to_crop: A float32 tensor with shape + [batch_size, height, width, depth] + proposal_boxes_normalized: A float32 tensor with shape [batch_size, + num_proposals, box_code_size] containing proposal boxes in + normalized coordinates. + + Returns: + A float32 tensor with shape [K, new_height, new_width, depth]. + """ + def get_box_inds(proposals): + proposals_shape = proposals.get_shape().as_list() + if any(dim is None for dim in proposals_shape): + proposals_shape = tf.shape(proposals) + ones_mat = tf.ones(proposals_shape[:2], dtype=tf.int32) + multiplier = tf.expand_dims( + tf.range(start=0, limit=proposals_shape[0]), 1) + return tf.reshape(ones_mat * multiplier, [-1]) + + cropped_regions = tf.image.crop_and_resize( + features_to_crop, + self._flatten_first_two_dimensions(proposal_boxes_normalized), + get_box_inds(proposal_boxes_normalized), + (self._initial_crop_size, self._initial_crop_size)) + return slim.max_pool2d( + cropped_regions, + [self._maxpool_kernel_size, self._maxpool_kernel_size], + stride=self._maxpool_stride) + + def _postprocess_box_classifier(self, + refined_box_encodings, + class_predictions_with_background, + proposal_boxes, + num_proposals, + image_shapes, + mask_predictions=None): + """Converts predictions from the second stage box classifier to detections. + + Args: + refined_box_encodings: a 3-D float tensor with shape + [total_num_padded_proposals, num_classes, self._box_coder.code_size] + representing predicted (final) refined box encodings. If using a shared + box across classes the shape will instead be + [total_num_padded_proposals, 1, 4] + class_predictions_with_background: a 3-D tensor float with shape + [total_num_padded_proposals, num_classes + 1] containing class + predictions (logits) for each of the proposals. Note that this tensor + *includes* background class predictions (at class index 0). + proposal_boxes: a 3-D float tensor with shape + [batch_size, self.max_num_proposals, 4] representing decoded proposal + bounding boxes in absolute coordinates. + num_proposals: a 1-D int32 tensor of shape [batch] representing the number + of proposals predicted for each image in the batch. + image_shapes: a 2-D int32 tensor containing shapes of input image in the + batch. + mask_predictions: (optional) a 4-D float tensor with shape + [total_num_padded_proposals, num_classes, mask_height, mask_width] + containing instance mask prediction logits. + + Returns: + A dictionary containing: + `detection_boxes`: [batch, max_detection, 4] + `detection_scores`: [batch, max_detections] + `detection_classes`: [batch, max_detections] + `num_detections`: [batch] + `detection_masks`: + (optional) [batch, max_detections, mask_height, mask_width]. Note + that a pixel-wise sigmoid score converter is applied to the detection + masks. 
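# Illustrative sketch (not part of the patch; sizes are assumed): when proposals
# are flattened for tf.image.crop_and_resize, a box_ind vector records which
# image in the batch each box came from; get_box_inds builds it like this.
import numpy as np

batch_size, num_proposals = 2, 3
ones_mat = np.ones((batch_size, num_proposals), dtype=np.int32)
multiplier = np.arange(batch_size).reshape(-1, 1)
box_inds = (ones_mat * multiplier).reshape(-1)     # -> [0, 0, 0, 1, 1, 1]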
+ """ + refined_box_encodings_batch = tf.reshape( + refined_box_encodings, + [-1, + self.max_num_proposals, + refined_box_encodings.shape[1], + self._box_coder.code_size]) + class_predictions_with_background_batch = tf.reshape( + class_predictions_with_background, + [-1, self.max_num_proposals, self.num_classes + 1] + ) + refined_decoded_boxes_batch = self._batch_decode_boxes( + refined_box_encodings_batch, proposal_boxes) + class_predictions_with_background_batch = ( + self._second_stage_score_conversion_fn( + class_predictions_with_background_batch)) + class_predictions_batch = tf.reshape( + tf.slice(class_predictions_with_background_batch, + [0, 0, 1], [-1, -1, -1]), + [-1, self.max_num_proposals, self.num_classes]) + clip_window = self._compute_clip_window(image_shapes) + mask_predictions_batch = None + if mask_predictions is not None: + mask_height = mask_predictions.shape[2].value + mask_width = mask_predictions.shape[3].value + mask_predictions = tf.sigmoid(mask_predictions) + mask_predictions_batch = tf.reshape( + mask_predictions, [-1, self.max_num_proposals, + self.num_classes, mask_height, mask_width]) + (nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks, _, + num_detections) = self._second_stage_nms_fn( + refined_decoded_boxes_batch, + class_predictions_batch, + clip_window=clip_window, + change_coordinate_frame=True, + num_valid_boxes=num_proposals, + masks=mask_predictions_batch) + detections = { + fields.DetectionResultFields.detection_boxes: nmsed_boxes, + fields.DetectionResultFields.detection_scores: nmsed_scores, + fields.DetectionResultFields.detection_classes: nmsed_classes, + fields.DetectionResultFields.num_detections: tf.to_float(num_detections) + } + if nmsed_masks is not None: + detections[fields.DetectionResultFields.detection_masks] = nmsed_masks + return detections + + def _batch_decode_boxes(self, box_encodings, anchor_boxes): + """Decodes box encodings with respect to the anchor boxes. + + Args: + box_encodings: a 4-D tensor with shape + [batch_size, num_anchors, num_classes, self._box_coder.code_size] + representing box encodings. + anchor_boxes: [batch_size, num_anchors, self._box_coder.code_size] + representing decoded bounding boxes. If using a shared box across + classes the shape will instead be + [total_num_proposals, 1, self._box_coder.code_size]. + + Returns: + decoded_boxes: a + [batch_size, num_anchors, num_classes, self._box_coder.code_size] + float tensor representing bounding box predictions (for each image in + batch, proposal and class). If using a shared box across classes the + shape will instead be + [batch_size, num_anchors, 1, self._box_coder.code_size]. + """ + combined_shape = shape_utils.combined_static_and_dynamic_shape( + box_encodings) + num_classes = combined_shape[2] + tiled_anchor_boxes = tf.tile( + tf.expand_dims(anchor_boxes, 2), [1, 1, num_classes, 1]) + tiled_anchors_boxlist = box_list.BoxList( + tf.reshape(tiled_anchor_boxes, [-1, 4])) + decoded_boxes = self._box_coder.decode( + tf.reshape(box_encodings, [-1, self._box_coder.code_size]), + tiled_anchors_boxlist) + return tf.reshape(decoded_boxes.get(), + tf.stack([combined_shape[0], combined_shape[1], + num_classes, 4])) + + def loss(self, prediction_dict, true_image_shapes, scope=None): + """Compute scalar loss tensors given prediction tensors. + + If number_of_stages=1, only RPN related losses are computed (i.e., + `rpn_localization_loss` and `rpn_objectness_loss`). Otherwise all + losses are computed. 
+ + Args: + prediction_dict: a dictionary holding prediction tensors (see the + documentation for the predict method. If number_of_stages=1, we + expect prediction_dict to contain `rpn_box_encodings`, + `rpn_objectness_predictions_with_background`, `rpn_features_to_crop`, + `image_shape`, and `anchors` fields. Otherwise we expect + prediction_dict to additionally contain `refined_box_encodings`, + `class_predictions_with_background`, `num_proposals`, and + `proposal_boxes` fields. + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + scope: Optional scope name. + + Returns: + a dictionary mapping loss keys (`first_stage_localization_loss`, + `first_stage_objectness_loss`, 'second_stage_localization_loss', + 'second_stage_classification_loss') to scalar tensors representing + corresponding loss values. + """ + with tf.name_scope(scope, 'Loss', prediction_dict.values()): + (groundtruth_boxlists, groundtruth_classes_with_background_list, + groundtruth_masks_list, groundtruth_weights_list + ) = self._format_groundtruth_data(true_image_shapes) + loss_dict = self._loss_rpn( + prediction_dict['rpn_box_encodings'], + prediction_dict['rpn_objectness_predictions_with_background'], + prediction_dict['anchors'], groundtruth_boxlists, + groundtruth_classes_with_background_list, groundtruth_weights_list) + if self._number_of_stages > 1: + loss_dict.update( + self._loss_box_classifier( + prediction_dict['refined_box_encodings'], + prediction_dict['class_predictions_with_background'], + prediction_dict['proposal_boxes'], + prediction_dict['num_proposals'], + groundtruth_boxlists, + groundtruth_classes_with_background_list, + groundtruth_weights_list, + prediction_dict['image_shape'], + prediction_dict.get('mask_predictions'), + groundtruth_masks_list, + )) + return loss_dict + + def _loss_rpn(self, rpn_box_encodings, + rpn_objectness_predictions_with_background, anchors, + groundtruth_boxlists, groundtruth_classes_with_background_list, + groundtruth_weights_list): + """Computes scalar RPN loss tensors. + + Uses self._proposal_target_assigner to obtain regression and classification + targets for the first stage RPN, samples a "minibatch" of anchors to + participate in the loss computation, and returns the RPN losses. + + Args: + rpn_box_encodings: A 4-D float tensor of shape + [batch_size, num_anchors, self._box_coder.code_size] containing + predicted proposal box encodings. + rpn_objectness_predictions_with_background: A 2-D float tensor of shape + [batch_size, num_anchors, 2] containing objectness predictions + (logits) for each of the anchors with 0 corresponding to background + and 1 corresponding to object. + anchors: A 2-D tensor of shape [num_anchors, 4] representing anchors + for the first stage RPN. Note that `num_anchors` can differ depending + on whether the model is created in training or inference mode. + groundtruth_boxlists: A list of BoxLists containing coordinates of the + groundtruth boxes. + groundtruth_classes_with_background_list: A list of 2-D one-hot + (or k-hot) tensors of shape [num_boxes, num_classes+1] containing the + class targets with the 0th index assumed to map to the background class. + groundtruth_weights_list: A list of 1-D tf.float32 tensors of shape + [num_boxes] containing weights for groundtruth boxes. 
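# Illustrative sketch (not part of the patch; key names and values are
# assumed): `loss` returns a dictionary keyed by the loss ops' scoped names;
# a training loop would typically sum these scalars (plus any regularization
# losses) to form the total loss.
loss_dict = {'Loss/RPNLoss/localization_loss': 0.12,
             'Loss/RPNLoss/objectness_loss': 0.34,
             'Loss/BoxClassifierLoss/localization_loss': 0.21,
             'Loss/BoxClassifierLoss/classification_loss': 0.43}
total_loss = sum(loss_dict.values())   # ~1.10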
+ + Returns: + a dictionary mapping loss keys (`first_stage_localization_loss`, + `first_stage_objectness_loss`) to scalar tensors representing + corresponding loss values. + """ + with tf.name_scope('RPNLoss'): + (batch_cls_targets, batch_cls_weights, batch_reg_targets, + batch_reg_weights, _) = target_assigner.batch_assign_targets( + self._proposal_target_assigner, box_list.BoxList(anchors), + groundtruth_boxlists, + len(groundtruth_boxlists) * [None], groundtruth_weights_list) + batch_cls_targets = tf.squeeze(batch_cls_targets, axis=2) + + def _minibatch_subsample_fn(inputs): + cls_targets, cls_weights = inputs + return self._first_stage_sampler.subsample( + tf.cast(cls_weights, tf.bool), + self._first_stage_minibatch_size, tf.cast(cls_targets, tf.bool)) + batch_sampled_indices = tf.to_float(shape_utils.static_or_dynamic_map_fn( + _minibatch_subsample_fn, + [batch_cls_targets, batch_cls_weights], + dtype=tf.bool, + parallel_iterations=self._parallel_iterations, + back_prop=True)) + + # Normalize by number of examples in sampled minibatch + normalizer = tf.reduce_sum(batch_sampled_indices, axis=1) + batch_one_hot_targets = tf.one_hot( + tf.to_int32(batch_cls_targets), depth=2) + sampled_reg_indices = tf.multiply(batch_sampled_indices, + batch_reg_weights) + + localization_losses = self._first_stage_localization_loss( + rpn_box_encodings, batch_reg_targets, weights=sampled_reg_indices) + objectness_losses = self._first_stage_objectness_loss( + rpn_objectness_predictions_with_background, + batch_one_hot_targets, weights=batch_sampled_indices) + localization_loss = tf.reduce_mean( + tf.reduce_sum(localization_losses, axis=1) / normalizer) + objectness_loss = tf.reduce_mean( + tf.reduce_sum(objectness_losses, axis=1) / normalizer) + + localization_loss = tf.multiply(self._first_stage_loc_loss_weight, + localization_loss, + name='localization_loss') + objectness_loss = tf.multiply(self._first_stage_obj_loss_weight, + objectness_loss, name='objectness_loss') + loss_dict = {localization_loss.op.name: localization_loss, + objectness_loss.op.name: objectness_loss} + return loss_dict + + def _loss_box_classifier(self, + refined_box_encodings, + class_predictions_with_background, + proposal_boxes, + num_proposals, + groundtruth_boxlists, + groundtruth_classes_with_background_list, + groundtruth_weights_list, + image_shape, + prediction_masks=None, + groundtruth_masks_list=None): + """Computes scalar box classifier loss tensors. + + Uses self._detector_target_assigner to obtain regression and classification + targets for the second stage box classifier, optionally performs + hard mining, and returns losses. All losses are computed independently + for each image and then averaged across the batch. + Please note that for boxes and masks with multiple labels, the box + regression and mask prediction losses are only computed for one label. + + This function assumes that the proposal boxes in the "padded" regions are + actually zero (and thus should not be matched to). + + + Args: + refined_box_encodings: a 3-D tensor with shape + [total_num_proposals, num_classes, box_coder.code_size] representing + predicted (final) refined box encodings. If using a shared box across + classes this will instead have shape + [total_num_proposals, 1, box_coder.code_size]. + class_predictions_with_background: a 2-D tensor with shape + [total_num_proposals, num_classes + 1] containing class + predictions (logits) for each of the anchors. Note that this tensor + *includes* background class predictions (at class index 0). 
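# Illustrative sketch (not part of the patch; values are assumed): per-anchor
# RPN losses are summed per image, normalized by the number of anchors sampled
# into the minibatch, then averaged over the batch, as in _loss_rpn above.
import numpy as np

per_anchor_losses = np.array([[0.2, 0.0, 0.4],
                              [0.1, 0.3, 0.0]])
sampled_indicator = np.array([[1., 0., 1.],
                              [1., 1., 0.]])
normalizer = sampled_indicator.sum(axis=1)                        # [2., 2.]
per_image = (per_anchor_losses * sampled_indicator).sum(axis=1) / normalizer
rpn_loss = per_image.mean()                                       # 0.25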
+ proposal_boxes: [batch_size, self.max_num_proposals, 4] representing + decoded proposal bounding boxes. + num_proposals: A Tensor of type `int32`. A 1-D tensor of shape [batch] + representing the number of proposals predicted for each image in + the batch. + groundtruth_boxlists: a list of BoxLists containing coordinates of the + groundtruth boxes. + groundtruth_classes_with_background_list: a list of 2-D one-hot + (or k-hot) tensors of shape [num_boxes, num_classes + 1] containing the + class targets with the 0th index assumed to map to the background class. + groundtruth_weights_list: A list of 1-D tf.float32 tensors of shape + [num_boxes] containing weights for groundtruth boxes. + image_shape: a 1-D tensor of shape [4] representing the image shape. + prediction_masks: an optional 4-D tensor with shape [total_num_proposals, + num_classes, mask_height, mask_width] containing the instance masks for + each box. + groundtruth_masks_list: an optional list of 3-D tensors of shape + [num_boxes, image_height, image_width] containing the instance masks for + each of the boxes. + + Returns: + a dictionary mapping loss keys ('second_stage_localization_loss', + 'second_stage_classification_loss') to scalar tensors representing + corresponding loss values. + + Raises: + ValueError: if `predict_instance_masks` in + second_stage_mask_rcnn_box_predictor is True and + `groundtruth_masks_list` is not provided. + """ + with tf.name_scope('BoxClassifierLoss'): + paddings_indicator = self._padded_batched_proposals_indicator( + num_proposals, self.max_num_proposals) + proposal_boxlists = [ + box_list.BoxList(proposal_boxes_single_image) + for proposal_boxes_single_image in tf.unstack(proposal_boxes)] + batch_size = len(proposal_boxlists) + + num_proposals_or_one = tf.to_float(tf.expand_dims( + tf.maximum(num_proposals, tf.ones_like(num_proposals)), 1)) + normalizer = tf.tile(num_proposals_or_one, + [1, self.max_num_proposals]) * batch_size + + (batch_cls_targets_with_background, batch_cls_weights, batch_reg_targets, + batch_reg_weights, _) = target_assigner.batch_assign_targets( + self._detector_target_assigner, proposal_boxlists, + groundtruth_boxlists, groundtruth_classes_with_background_list, + groundtruth_weights_list) + + class_predictions_with_background = tf.reshape( + class_predictions_with_background, + [batch_size, self.max_num_proposals, -1]) + + flat_cls_targets_with_background = tf.reshape( + batch_cls_targets_with_background, + [batch_size * self.max_num_proposals, -1]) + one_hot_flat_cls_targets_with_background = tf.argmax( + flat_cls_targets_with_background, axis=1) + one_hot_flat_cls_targets_with_background = tf.one_hot( + one_hot_flat_cls_targets_with_background, + flat_cls_targets_with_background.get_shape()[1]) + + # If using a shared box across classes use directly + if refined_box_encodings.shape[1] == 1: + reshaped_refined_box_encodings = tf.reshape( + refined_box_encodings, + [batch_size, self.max_num_proposals, self._box_coder.code_size]) + # For anchors with multiple labels, picks refined_location_encodings + # for just one class to avoid over-counting for regression loss and + # (optionally) mask loss. 
+ else: + # We only predict refined location encodings for the non background + # classes, but we now pad it to make it compatible with the class + # predictions + refined_box_encodings_with_background = tf.pad( + refined_box_encodings, [[0, 0], [1, 0], [0, 0]]) + refined_box_encodings_masked_by_class_targets = tf.boolean_mask( + refined_box_encodings_with_background, + tf.greater(one_hot_flat_cls_targets_with_background, 0)) + reshaped_refined_box_encodings = tf.reshape( + refined_box_encodings_masked_by_class_targets, + [batch_size, self.max_num_proposals, self._box_coder.code_size]) + + second_stage_loc_losses = self._second_stage_localization_loss( + reshaped_refined_box_encodings, + batch_reg_targets, weights=batch_reg_weights) / normalizer + second_stage_cls_losses = ops.reduce_sum_trailing_dimensions( + self._second_stage_classification_loss( + class_predictions_with_background, + batch_cls_targets_with_background, + weights=batch_cls_weights), + ndims=2) / normalizer + + second_stage_loc_loss = tf.reduce_sum( + tf.boolean_mask(second_stage_loc_losses, paddings_indicator)) + second_stage_cls_loss = tf.reduce_sum( + tf.boolean_mask(second_stage_cls_losses, paddings_indicator)) + + if self._hard_example_miner: + (second_stage_loc_loss, second_stage_cls_loss + ) = self._unpad_proposals_and_apply_hard_mining( + proposal_boxlists, second_stage_loc_losses, + second_stage_cls_losses, num_proposals) + localization_loss = tf.multiply(self._second_stage_loc_loss_weight, + second_stage_loc_loss, + name='localization_loss') + + classification_loss = tf.multiply(self._second_stage_cls_loss_weight, + second_stage_cls_loss, + name='classification_loss') + + loss_dict = {localization_loss.op.name: localization_loss, + classification_loss.op.name: classification_loss} + second_stage_mask_loss = None + if prediction_masks is not None: + if groundtruth_masks_list is None: + raise ValueError('Groundtruth instance masks not provided. ' + 'Please configure input reader.') + + # Create a new target assigner that matches the proposals to groundtruth + # and returns the mask targets. + # TODO(rathodv): Move `unmatched_cls_target` from constructor to assign + # function. This will enable reuse of a single target assigner for both + # class targets and mask targets. + mask_target_assigner = target_assigner.create_target_assigner( + 'FasterRCNN', 'detection', + unmatched_cls_target=tf.zeros(image_shape[1:3], dtype=tf.float32)) + (batch_mask_targets, _, _, + batch_mask_target_weights, _) = target_assigner.batch_assign_targets( + mask_target_assigner, proposal_boxlists, groundtruth_boxlists, + groundtruth_masks_list, groundtruth_weights_list) + + # Pad the prediction_masks with to add zeros for background class to be + # consistent with class predictions. + if prediction_masks.get_shape().as_list()[1] == 1: + # Class agnostic masks or masks for one-class prediction. Logic for + # both cases is the same since background predictions are ignored + # through the batch_mask_target_weights. 
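Editor's aside (not part of the patch): the pad-then-boolean-mask pattern used in the surrounding branches, which keeps exactly one box encoding (or mask) per proposal, namely the one for its matched class, is easier to see on toy NumPy arrays. All shapes and names below are made up for illustration only.

```python
import numpy as np

# Toy stand-ins for refined_box_encodings and the one-hot class targets
# (background occupies column 0), mirroring the selection logic above.
num_proposals, num_classes, code_size = 3, 2, 4
refined = np.arange(num_proposals * num_classes * code_size,
                    dtype=np.float32).reshape(num_proposals, num_classes, code_size)
one_hot_targets = np.array([[0, 1, 0],   # proposal 0 matched to class 1
                            [0, 0, 1],   # proposal 1 matched to class 2
                            [1, 0, 0]])  # proposal 2 is background

# Pad a zero "background" slot in front so class indices line up with the
# targets, then keep one encoding per proposal: the one for its matched class.
refined_with_bg = np.pad(refined, [(0, 0), (1, 0), (0, 0)])
selected = refined_with_bg[one_hot_targets > 0]
print(selected.shape)  # (3, 4); the background proposal selects the zero padding
```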
+ prediction_masks_masked_by_class_targets = prediction_masks + else: + prediction_masks_with_background = tf.pad( + prediction_masks, [[0, 0], [1, 0], [0, 0], [0, 0]]) + prediction_masks_masked_by_class_targets = tf.boolean_mask( + prediction_masks_with_background, + tf.greater(one_hot_flat_cls_targets_with_background, 0)) + + mask_height = prediction_masks.shape[2].value + mask_width = prediction_masks.shape[3].value + reshaped_prediction_masks = tf.reshape( + prediction_masks_masked_by_class_targets, + [batch_size, -1, mask_height * mask_width]) + + batch_mask_targets_shape = tf.shape(batch_mask_targets) + flat_gt_masks = tf.reshape(batch_mask_targets, + [-1, batch_mask_targets_shape[2], + batch_mask_targets_shape[3]]) + + # Use normalized proposals to crop mask targets from image masks. + flat_normalized_proposals = box_list_ops.to_normalized_coordinates( + box_list.BoxList(tf.reshape(proposal_boxes, [-1, 4])), + image_shape[1], image_shape[2]).get() + + flat_cropped_gt_mask = tf.image.crop_and_resize( + tf.expand_dims(flat_gt_masks, -1), + flat_normalized_proposals, + tf.range(flat_normalized_proposals.shape[0].value), + [mask_height, mask_width]) + + batch_cropped_gt_mask = tf.reshape( + flat_cropped_gt_mask, + [batch_size, -1, mask_height * mask_width]) + + second_stage_mask_losses = ops.reduce_sum_trailing_dimensions( + self._second_stage_mask_loss( + reshaped_prediction_masks, + batch_cropped_gt_mask, + weights=batch_mask_target_weights), + ndims=2) / ( + mask_height * mask_width * tf.maximum( + tf.reduce_sum( + batch_mask_target_weights, axis=1, keep_dims=True + ), tf.ones((batch_size, 1)))) + second_stage_mask_loss = tf.reduce_sum( + tf.boolean_mask(second_stage_mask_losses, paddings_indicator)) + + if second_stage_mask_loss is not None: + mask_loss = tf.multiply(self._second_stage_mask_loss_weight, + second_stage_mask_loss, name='mask_loss') + loss_dict[mask_loss.op.name] = mask_loss + return loss_dict + + def _padded_batched_proposals_indicator(self, + num_proposals, + max_num_proposals): + """Creates indicator matrix of non-pad elements of padded batch proposals. + + Args: + num_proposals: Tensor of type tf.int32 with shape [batch_size]. + max_num_proposals: Maximum number of proposals per image (integer). + + Returns: + A Tensor of type tf.bool with shape [batch_size, max_num_proposals]. + """ + batch_size = tf.size(num_proposals) + tiled_num_proposals = tf.tile( + tf.expand_dims(num_proposals, 1), [1, max_num_proposals]) + tiled_proposal_index = tf.tile( + tf.expand_dims(tf.range(max_num_proposals), 0), [batch_size, 1]) + return tf.greater(tiled_num_proposals, tiled_proposal_index) + + def _unpad_proposals_and_apply_hard_mining(self, + proposal_boxlists, + second_stage_loc_losses, + second_stage_cls_losses, + num_proposals): + """Unpads proposals and applies hard mining. + + Args: + proposal_boxlists: A list of `batch_size` BoxLists each representing + `self.max_num_proposals` representing decoded proposal bounding boxes + for each image. + second_stage_loc_losses: A Tensor of type `float32`. A tensor of shape + `[batch_size, self.max_num_proposals]` representing per-anchor + second stage localization loss values. + second_stage_cls_losses: A Tensor of type `float32`. A tensor of shape + `[batch_size, self.max_num_proposals]` representing per-anchor + second stage classification loss values. + num_proposals: A Tensor of type `int32`. A 1-D tensor of shape [batch] + representing the number of proposals predicted for each image in + the batch. 
+ + Returns: + second_stage_loc_loss: A scalar float32 tensor representing the second + stage localization loss. + second_stage_cls_loss: A scalar float32 tensor representing the second + stage classification loss. + """ + for (proposal_boxlist, single_image_loc_loss, single_image_cls_loss, + single_image_num_proposals) in zip( + proposal_boxlists, + tf.unstack(second_stage_loc_losses), + tf.unstack(second_stage_cls_losses), + tf.unstack(num_proposals)): + proposal_boxlist = box_list.BoxList( + tf.slice(proposal_boxlist.get(), + [0, 0], [single_image_num_proposals, -1])) + single_image_loc_loss = tf.slice(single_image_loc_loss, + [0], [single_image_num_proposals]) + single_image_cls_loss = tf.slice(single_image_cls_loss, + [0], [single_image_num_proposals]) + return self._hard_example_miner( + location_losses=tf.expand_dims(single_image_loc_loss, 0), + cls_losses=tf.expand_dims(single_image_cls_loss, 0), + decoded_boxlist_list=[proposal_boxlist]) + + def restore_map(self, + fine_tune_checkpoint_type='detection', + load_all_detection_checkpoint_vars=False): + """Returns a map of variables to load from a foreign checkpoint. + + See parent class for details. + + Args: + fine_tune_checkpoint_type: whether to restore from a full detection + checkpoint (with compatible variable names) or to restore from a + classification checkpoint for initialization prior to training. + Valid values: `detection`, `classification`. Default 'detection'. + load_all_detection_checkpoint_vars: whether to load all variables (when + `fine_tune_checkpoint_type` is `detection`). If False, only variables + within the feature extractor scopes are included. Default False. + + Returns: + A dict mapping variable names (to load from a checkpoint) to variables in + the model graph. + Raises: + ValueError: if fine_tune_checkpoint_type is neither `classification` + nor `detection`. + """ + if fine_tune_checkpoint_type not in ['detection', 'classification']: + raise ValueError('Not supported fine_tune_checkpoint_type: {}'.format( + fine_tune_checkpoint_type)) + if fine_tune_checkpoint_type == 'classification': + return self._feature_extractor.restore_from_classification_checkpoint_fn( + self.first_stage_feature_extractor_scope, + self.second_stage_feature_extractor_scope) + + variables_to_restore = tf.global_variables() + variables_to_restore.append(slim.get_or_create_global_step()) + # Only load feature extractor variables to be consistent with loading from + # a classification checkpoint. + include_patterns = None + if not load_all_detection_checkpoint_vars: + include_patterns = [ + self.first_stage_feature_extractor_scope, + self.second_stage_feature_extractor_scope + ] + feature_extractor_variables = tf.contrib.framework.filter_variables( + variables_to_restore, include_patterns=include_patterns) + return {var.op.name: var for var in feature_extractor_variables} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/faster_rcnn_meta_arch_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/faster_rcnn_meta_arch_test.py new file mode 100644 index 0000000000000000000000000000000000000000..93bd34c915c848d0d44013a059031c14bdbff60c --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/faster_rcnn_meta_arch_test.py @@ -0,0 +1,370 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.meta_architectures.faster_rcnn_meta_arch.""" + +from absl.testing import parameterized +import numpy as np +import tensorflow as tf + +from object_detection.meta_architectures import faster_rcnn_meta_arch_test_lib + + +class FasterRCNNMetaArchTest( + faster_rcnn_meta_arch_test_lib.FasterRCNNMetaArchTestBase, + parameterized.TestCase): + + def test_postprocess_second_stage_only_inference_mode_with_masks(self): + model = self._build_model( + is_training=False, number_of_stages=2, second_stage_batch_size=6) + + batch_size = 2 + total_num_padded_proposals = batch_size * model.max_num_proposals + proposal_boxes = tf.constant( + [[[1, 1, 2, 3], + [0, 0, 1, 1], + [.5, .5, .6, .6], + 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]], + [[2, 3, 6, 8], + [1, 2, 5, 3], + 4*[0], 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]]], dtype=tf.float32) + num_proposals = tf.constant([3, 2], dtype=tf.int32) + refined_box_encodings = tf.zeros( + [total_num_padded_proposals, model.num_classes, 4], dtype=tf.float32) + class_predictions_with_background = tf.ones( + [total_num_padded_proposals, model.num_classes+1], dtype=tf.float32) + image_shape = tf.constant([batch_size, 36, 48, 3], dtype=tf.int32) + + mask_height = 2 + mask_width = 2 + mask_predictions = 30. 
* tf.ones( + [total_num_padded_proposals, model.num_classes, + mask_height, mask_width], dtype=tf.float32) + exp_detection_masks = np.array([[[[1, 1], [1, 1]], + [[1, 1], [1, 1]], + [[1, 1], [1, 1]], + [[1, 1], [1, 1]], + [[1, 1], [1, 1]]], + [[[1, 1], [1, 1]], + [[1, 1], [1, 1]], + [[1, 1], [1, 1]], + [[1, 1], [1, 1]], + [[0, 0], [0, 0]]]]) + + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + detections = model.postprocess({ + 'refined_box_encodings': refined_box_encodings, + 'class_predictions_with_background': class_predictions_with_background, + 'num_proposals': num_proposals, + 'proposal_boxes': proposal_boxes, + 'image_shape': image_shape, + 'mask_predictions': mask_predictions + }, true_image_shapes) + with self.test_session() as sess: + detections_out = sess.run(detections) + self.assertAllEqual(detections_out['detection_boxes'].shape, [2, 5, 4]) + self.assertAllClose(detections_out['detection_scores'], + [[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]) + self.assertAllClose(detections_out['detection_classes'], + [[0, 0, 0, 1, 1], [0, 0, 1, 1, 0]]) + self.assertAllClose(detections_out['num_detections'], [5, 4]) + self.assertAllClose(detections_out['detection_masks'], + exp_detection_masks) + self.assertTrue(np.amax(detections_out['detection_masks'] <= 1.0)) + self.assertTrue(np.amin(detections_out['detection_masks'] >= 0.0)) + + def test_postprocess_second_stage_only_inference_mode_with_shared_boxes(self): + model = self._build_model( + is_training=False, number_of_stages=2, second_stage_batch_size=6) + + batch_size = 2 + total_num_padded_proposals = batch_size * model.max_num_proposals + proposal_boxes = tf.constant( + [[[1, 1, 2, 3], + [0, 0, 1, 1], + [.5, .5, .6, .6], + 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]], + [[2, 3, 6, 8], + [1, 2, 5, 3], + 4*[0], 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]]], dtype=tf.float32) + num_proposals = tf.constant([3, 2], dtype=tf.int32) + + # This has 1 box instead of one for each class. 
+ refined_box_encodings = tf.zeros( + [total_num_padded_proposals, 1, 4], dtype=tf.float32) + class_predictions_with_background = tf.ones( + [total_num_padded_proposals, model.num_classes+1], dtype=tf.float32) + image_shape = tf.constant([batch_size, 36, 48, 3], dtype=tf.int32) + + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + detections = model.postprocess({ + 'refined_box_encodings': refined_box_encodings, + 'class_predictions_with_background': class_predictions_with_background, + 'num_proposals': num_proposals, + 'proposal_boxes': proposal_boxes, + 'image_shape': image_shape, + }, true_image_shapes) + with self.test_session() as sess: + detections_out = sess.run(detections) + self.assertAllEqual(detections_out['detection_boxes'].shape, [2, 5, 4]) + self.assertAllClose(detections_out['detection_scores'], + [[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]) + self.assertAllClose(detections_out['detection_classes'], + [[0, 0, 0, 1, 1], [0, 0, 1, 1, 0]]) + self.assertAllClose(detections_out['num_detections'], [5, 4]) + + @parameterized.parameters( + {'masks_are_class_agnostic': False}, + {'masks_are_class_agnostic': True}, + ) + def test_predict_correct_shapes_in_inference_mode_three_stages_with_masks( + self, masks_are_class_agnostic): + batch_size = 2 + image_size = 10 + max_num_proposals = 8 + initial_crop_size = 3 + maxpool_stride = 1 + + input_shapes = [(batch_size, image_size, image_size, 3), + (None, image_size, image_size, 3), + (batch_size, None, None, 3), + (None, None, None, 3)] + expected_num_anchors = image_size * image_size * 3 * 3 + expected_shapes = { + 'rpn_box_predictor_features': + (2, image_size, image_size, 512), + 'rpn_features_to_crop': (2, image_size, image_size, 3), + 'image_shape': (4,), + 'rpn_box_encodings': (2, expected_num_anchors, 4), + 'rpn_objectness_predictions_with_background': + (2, expected_num_anchors, 2), + 'anchors': (expected_num_anchors, 4), + 'refined_box_encodings': (2 * max_num_proposals, 2, 4), + 'class_predictions_with_background': (2 * max_num_proposals, 2 + 1), + 'num_proposals': (2,), + 'proposal_boxes': (2, max_num_proposals, 4), + 'proposal_boxes_normalized': (2, max_num_proposals, 4), + 'box_classifier_features': + self._get_box_classifier_features_shape(image_size, + batch_size, + max_num_proposals, + initial_crop_size, + maxpool_stride, + 3) + } + + for input_shape in input_shapes: + test_graph = tf.Graph() + with test_graph.as_default(): + model = self._build_model( + is_training=False, + number_of_stages=3, + second_stage_batch_size=2, + predict_masks=True, + masks_are_class_agnostic=masks_are_class_agnostic) + preprocessed_inputs = tf.placeholder(tf.float32, shape=input_shape) + _, true_image_shapes = model.preprocess(preprocessed_inputs) + result_tensor_dict = model.predict(preprocessed_inputs, + true_image_shapes) + init_op = tf.global_variables_initializer() + with self.test_session(graph=test_graph) as sess: + sess.run(init_op) + tensor_dict_out = sess.run(result_tensor_dict, feed_dict={ + preprocessed_inputs: + np.zeros((batch_size, image_size, image_size, 3))}) + self.assertEqual( + set(tensor_dict_out.keys()), + set(expected_shapes.keys()).union( + set([ + 'detection_boxes', 'detection_scores', 'detection_classes', + 'detection_masks', 'num_detections' + ]))) + for key in expected_shapes: + self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key]) + self.assertAllEqual(tensor_dict_out['detection_boxes'].shape, [2, 5, 4]) + self.assertAllEqual(tensor_dict_out['detection_masks'].shape, + [2, 5, 14, 14]) + 
self.assertAllEqual(tensor_dict_out['detection_classes'].shape, [2, 5]) + self.assertAllEqual(tensor_dict_out['detection_scores'].shape, [2, 5]) + self.assertAllEqual(tensor_dict_out['num_detections'].shape, [2]) + + @parameterized.parameters( + {'masks_are_class_agnostic': False}, + {'masks_are_class_agnostic': True}, + ) + def test_predict_gives_correct_shapes_in_train_mode_both_stages_with_masks( + self, masks_are_class_agnostic): + test_graph = tf.Graph() + with test_graph.as_default(): + model = self._build_model( + is_training=True, + number_of_stages=3, + second_stage_batch_size=7, + predict_masks=True, + masks_are_class_agnostic=masks_are_class_agnostic) + batch_size = 2 + image_size = 10 + max_num_proposals = 7 + initial_crop_size = 3 + maxpool_stride = 1 + + image_shape = (batch_size, image_size, image_size, 3) + preprocessed_inputs = tf.zeros(image_shape, dtype=tf.float32) + groundtruth_boxes_list = [ + tf.constant([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=tf.float32), + tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32) + ] + groundtruth_classes_list = [ + tf.constant([[1, 0], [0, 1]], dtype=tf.float32), + tf.constant([[1, 0], [1, 0]], dtype=tf.float32) + ] + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list) + + result_tensor_dict = model.predict(preprocessed_inputs, true_image_shapes) + mask_shape_1 = 1 if masks_are_class_agnostic else model._num_classes + expected_shapes = { + 'rpn_box_predictor_features': (2, image_size, image_size, 512), + 'rpn_features_to_crop': (2, image_size, image_size, 3), + 'image_shape': (4,), + 'refined_box_encodings': (2 * max_num_proposals, 2, 4), + 'class_predictions_with_background': (2 * max_num_proposals, 2 + 1), + 'num_proposals': (2,), + 'proposal_boxes': (2, max_num_proposals, 4), + 'proposal_boxes_normalized': (2, max_num_proposals, 4), + 'box_classifier_features': + self._get_box_classifier_features_shape( + image_size, batch_size, max_num_proposals, initial_crop_size, + maxpool_stride, 3), + 'mask_predictions': (2 * max_num_proposals, mask_shape_1, 14, 14) + } + + init_op = tf.global_variables_initializer() + with self.test_session(graph=test_graph) as sess: + sess.run(init_op) + tensor_dict_out = sess.run(result_tensor_dict) + self.assertEqual( + set(tensor_dict_out.keys()), + set(expected_shapes.keys()).union( + set([ + 'rpn_box_encodings', + 'rpn_objectness_predictions_with_background', + 'anchors', + ]))) + for key in expected_shapes: + self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key]) + + anchors_shape_out = tensor_dict_out['anchors'].shape + self.assertEqual(2, len(anchors_shape_out)) + self.assertEqual(4, anchors_shape_out[1]) + num_anchors_out = anchors_shape_out[0] + self.assertAllEqual(tensor_dict_out['rpn_box_encodings'].shape, + (2, num_anchors_out, 4)) + self.assertAllEqual( + tensor_dict_out['rpn_objectness_predictions_with_background'].shape, + (2, num_anchors_out, 2)) + + def test_postprocess_third_stage_only_inference_mode(self): + num_proposals_shapes = [(2), (None)] + refined_box_encodings_shapes = [(16, 2, 4), (None, 2, 4)] + class_predictions_with_background_shapes = [(16, 3), (None, 3)] + proposal_boxes_shapes = [(2, 8, 4), (None, 8, 4)] + batch_size = 2 + image_shape = np.array((2, 36, 48, 3), dtype=np.int32) + for (num_proposals_shape, refined_box_encoding_shape, + class_predictions_with_background_shape, + proposal_boxes_shape) in zip(num_proposals_shapes, + refined_box_encodings_shapes, + 
class_predictions_with_background_shapes, + proposal_boxes_shapes): + tf_graph = tf.Graph() + with tf_graph.as_default(): + model = self._build_model( + is_training=False, number_of_stages=3, + second_stage_batch_size=6, predict_masks=True) + total_num_padded_proposals = batch_size * model.max_num_proposals + proposal_boxes = np.array( + [[[1, 1, 2, 3], + [0, 0, 1, 1], + [.5, .5, .6, .6], + 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]], + [[2, 3, 6, 8], + [1, 2, 5, 3], + 4*[0], 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]]]) + num_proposals = np.array([3, 2], dtype=np.int32) + refined_box_encodings = np.zeros( + [total_num_padded_proposals, model.num_classes, 4]) + class_predictions_with_background = np.ones( + [total_num_padded_proposals, model.num_classes+1]) + + num_proposals_placeholder = tf.placeholder(tf.int32, + shape=num_proposals_shape) + refined_box_encodings_placeholder = tf.placeholder( + tf.float32, shape=refined_box_encoding_shape) + class_predictions_with_background_placeholder = tf.placeholder( + tf.float32, shape=class_predictions_with_background_shape) + proposal_boxes_placeholder = tf.placeholder( + tf.float32, shape=proposal_boxes_shape) + image_shape_placeholder = tf.placeholder(tf.int32, shape=(4)) + _, true_image_shapes = model.preprocess( + tf.zeros(image_shape_placeholder)) + detections = model.postprocess({ + 'refined_box_encodings': refined_box_encodings_placeholder, + 'class_predictions_with_background': + class_predictions_with_background_placeholder, + 'num_proposals': num_proposals_placeholder, + 'proposal_boxes': proposal_boxes_placeholder, + 'image_shape': image_shape_placeholder, + 'detection_boxes': tf.zeros([2, 5, 4]), + 'detection_masks': tf.zeros([2, 5, 14, 14]), + 'detection_scores': tf.zeros([2, 5]), + 'detection_classes': tf.zeros([2, 5]), + 'num_detections': tf.zeros([2]), + }, true_image_shapes) + with self.test_session(graph=tf_graph) as sess: + detections_out = sess.run( + detections, + feed_dict={ + refined_box_encodings_placeholder: refined_box_encodings, + class_predictions_with_background_placeholder: + class_predictions_with_background, + num_proposals_placeholder: num_proposals, + proposal_boxes_placeholder: proposal_boxes, + image_shape_placeholder: image_shape + }) + self.assertAllEqual(detections_out['detection_boxes'].shape, [2, 5, 4]) + self.assertAllEqual(detections_out['detection_masks'].shape, + [2, 5, 14, 14]) + self.assertAllClose(detections_out['detection_scores'].shape, [2, 5]) + self.assertAllClose(detections_out['detection_classes'].shape, [2, 5]) + self.assertAllClose(detections_out['num_detections'].shape, [2]) + self.assertTrue(np.amax(detections_out['detection_masks'] <= 1.0)) + self.assertTrue(np.amin(detections_out['detection_masks'] >= 0.0)) + + def _get_box_classifier_features_shape(self, + image_size, + batch_size, + max_num_proposals, + initial_crop_size, + maxpool_stride, + num_features): + return (batch_size * max_num_proposals, + initial_crop_size/maxpool_stride, + initial_crop_size/maxpool_stride, + num_features) + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/faster_rcnn_meta_arch_test_lib.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/faster_rcnn_meta_arch_test_lib.py new file mode 100644 index 0000000000000000000000000000000000000000..00c7054006b2a7d63f3bf77fa74e5dc4e170501e --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/faster_rcnn_meta_arch_test_lib.py 
@@ -0,0 +1,1512 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.meta_architectures.faster_rcnn_meta_arch.""" +import numpy as np +import tensorflow as tf +from google.protobuf import text_format +from object_detection.anchor_generators import grid_anchor_generator +from object_detection.builders import box_predictor_builder +from object_detection.builders import hyperparams_builder +from object_detection.builders import post_processing_builder +from object_detection.core import losses +from object_detection.meta_architectures import faster_rcnn_meta_arch +from object_detection.protos import box_predictor_pb2 +from object_detection.protos import hyperparams_pb2 +from object_detection.protos import post_processing_pb2 +from object_detection.utils import test_utils + +slim = tf.contrib.slim +BOX_CODE_SIZE = 4 + + +class FakeFasterRCNNFeatureExtractor( + faster_rcnn_meta_arch.FasterRCNNFeatureExtractor): + """Fake feature extracture to use in tests.""" + + def __init__(self): + super(FakeFasterRCNNFeatureExtractor, self).__init__( + is_training=False, + first_stage_features_stride=32, + reuse_weights=None, + weight_decay=0.0) + + def preprocess(self, resized_inputs): + return tf.identity(resized_inputs) + + def _extract_proposal_features(self, preprocessed_inputs, scope): + with tf.variable_scope('mock_model'): + proposal_features = 0 * slim.conv2d( + preprocessed_inputs, num_outputs=3, kernel_size=1, scope='layer1') + return proposal_features, {} + + def _extract_box_classifier_features(self, proposal_feature_maps, scope): + with tf.variable_scope('mock_model'): + return 0 * slim.conv2d(proposal_feature_maps, + num_outputs=3, kernel_size=1, scope='layer2') + + +class FasterRCNNMetaArchTestBase(tf.test.TestCase): + """Base class to test Faster R-CNN and R-FCN meta architectures.""" + + def _build_arg_scope_with_hyperparams(self, + hyperparams_text_proto, + is_training): + hyperparams = hyperparams_pb2.Hyperparams() + text_format.Merge(hyperparams_text_proto, hyperparams) + return hyperparams_builder.build(hyperparams, is_training=is_training) + + def _get_second_stage_box_predictor_text_proto(self): + box_predictor_text_proto = """ + mask_rcnn_box_predictor { + fc_hyperparams { + op: FC + activation: NONE + regularizer { + l2_regularizer { + weight: 0.0005 + } + } + initializer { + variance_scaling_initializer { + factor: 1.0 + uniform: true + mode: FAN_AVG + } + } + } + } + """ + return box_predictor_text_proto + + def _add_mask_to_second_stage_box_predictor_text_proto( + self, masks_are_class_agnostic=False): + agnostic = 'true' if masks_are_class_agnostic else 'false' + box_predictor_text_proto = """ + mask_rcnn_box_predictor { + predict_instance_masks: true + masks_are_class_agnostic: """ + agnostic + """ + mask_height: 14 + mask_width: 14 + conv_hyperparams { + op: CONV + regularizer { + l2_regularizer { + weight: 
0.0 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.01 + } + } + } + } + """ + return box_predictor_text_proto + + def _get_second_stage_box_predictor(self, num_classes, is_training, + predict_masks, masks_are_class_agnostic): + box_predictor_proto = box_predictor_pb2.BoxPredictor() + text_format.Merge(self._get_second_stage_box_predictor_text_proto(), + box_predictor_proto) + if predict_masks: + text_format.Merge( + self._add_mask_to_second_stage_box_predictor_text_proto( + masks_are_class_agnostic), + box_predictor_proto) + + return box_predictor_builder.build( + hyperparams_builder.build, + box_predictor_proto, + num_classes=num_classes, + is_training=is_training) + + def _get_model(self, box_predictor, **common_kwargs): + return faster_rcnn_meta_arch.FasterRCNNMetaArch( + initial_crop_size=3, + maxpool_kernel_size=1, + maxpool_stride=1, + second_stage_mask_rcnn_box_predictor=box_predictor, + **common_kwargs) + + def _build_model(self, + is_training, + number_of_stages, + second_stage_batch_size, + first_stage_max_proposals=8, + num_classes=2, + hard_mining=False, + softmax_second_stage_classification_loss=True, + predict_masks=False, + pad_to_max_dimension=None, + masks_are_class_agnostic=False): + + def image_resizer_fn(image, masks=None): + """Fake image resizer function.""" + resized_inputs = [] + resized_image = tf.identity(image) + if pad_to_max_dimension is not None: + resized_image = tf.image.pad_to_bounding_box(image, 0, 0, + pad_to_max_dimension, + pad_to_max_dimension) + resized_inputs.append(resized_image) + if masks is not None: + resized_masks = tf.identity(masks) + if pad_to_max_dimension is not None: + resized_masks = tf.image.pad_to_bounding_box(tf.transpose(masks, + [1, 2, 0]), + 0, 0, + pad_to_max_dimension, + pad_to_max_dimension) + resized_masks = tf.transpose(resized_masks, [2, 0, 1]) + resized_inputs.append(resized_masks) + resized_inputs.append(tf.shape(image)) + return resized_inputs + + # anchors in this test are designed so that a subset of anchors are inside + # the image and a subset of anchors are outside. 
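Editor's aside (not part of the patch): the grid anchor generator configured just below uses 3 scales and 3 aspect ratios at every feature-map cell, which is where the `image_size * image_size * 3 * 3` anchor counts asserted in the shape tests come from. A minimal sanity check of that bookkeeping, with toy numbers assumed to match the tests:

```python
# Assumed toy numbers mirroring the test configuration.
image_size = 10                      # the fake extractor preserves spatial size
scales = (0.001, 0.005, 0.1)
aspect_ratios = (0.5, 1.0, 2.0)

anchors_per_cell = len(scales) * len(aspect_ratios)        # 9
num_anchors = image_size * image_size * anchors_per_cell   # 900
print(anchors_per_cell, num_anchors)
```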
+ first_stage_anchor_scales = (0.001, 0.005, 0.1) + first_stage_anchor_aspect_ratios = (0.5, 1.0, 2.0) + first_stage_anchor_strides = (1, 1) + first_stage_anchor_generator = grid_anchor_generator.GridAnchorGenerator( + first_stage_anchor_scales, + first_stage_anchor_aspect_ratios, + anchor_stride=first_stage_anchor_strides) + + fake_feature_extractor = FakeFasterRCNNFeatureExtractor() + + first_stage_box_predictor_hyperparams_text_proto = """ + op: CONV + activation: RELU + regularizer { + l2_regularizer { + weight: 0.00004 + } + } + initializer { + truncated_normal_initializer { + stddev: 0.03 + } + } + """ + first_stage_box_predictor_arg_scope_fn = ( + self._build_arg_scope_with_hyperparams( + first_stage_box_predictor_hyperparams_text_proto, is_training)) + + first_stage_box_predictor_kernel_size = 3 + first_stage_atrous_rate = 1 + first_stage_box_predictor_depth = 512 + first_stage_minibatch_size = 3 + first_stage_positive_balance_fraction = .5 + + first_stage_nms_score_threshold = -1.0 + first_stage_nms_iou_threshold = 1.0 + first_stage_max_proposals = first_stage_max_proposals + + first_stage_localization_loss_weight = 1.0 + first_stage_objectness_loss_weight = 1.0 + + post_processing_text_proto = """ + batch_non_max_suppression { + score_threshold: -20.0 + iou_threshold: 1.0 + max_detections_per_class: 5 + max_total_detections: 5 + } + """ + post_processing_config = post_processing_pb2.PostProcessing() + text_format.Merge(post_processing_text_proto, post_processing_config) + second_stage_non_max_suppression_fn, _ = post_processing_builder.build( + post_processing_config) + second_stage_balance_fraction = 1.0 + + second_stage_score_conversion_fn = tf.identity + second_stage_localization_loss_weight = 1.0 + second_stage_classification_loss_weight = 1.0 + if softmax_second_stage_classification_loss: + second_stage_classification_loss = ( + losses.WeightedSoftmaxClassificationLoss()) + else: + second_stage_classification_loss = ( + losses.WeightedSigmoidClassificationLoss()) + + hard_example_miner = None + if hard_mining: + hard_example_miner = losses.HardExampleMiner( + num_hard_examples=1, + iou_threshold=0.99, + loss_type='both', + cls_loss_weight=second_stage_classification_loss_weight, + loc_loss_weight=second_stage_localization_loss_weight, + max_negatives_per_positive=None) + + common_kwargs = { + 'is_training': is_training, + 'num_classes': num_classes, + 'image_resizer_fn': image_resizer_fn, + 'feature_extractor': fake_feature_extractor, + 'number_of_stages': number_of_stages, + 'first_stage_anchor_generator': first_stage_anchor_generator, + 'first_stage_atrous_rate': first_stage_atrous_rate, + 'first_stage_box_predictor_arg_scope_fn': + first_stage_box_predictor_arg_scope_fn, + 'first_stage_box_predictor_kernel_size': + first_stage_box_predictor_kernel_size, + 'first_stage_box_predictor_depth': first_stage_box_predictor_depth, + 'first_stage_minibatch_size': first_stage_minibatch_size, + 'first_stage_positive_balance_fraction': + first_stage_positive_balance_fraction, + 'first_stage_nms_score_threshold': first_stage_nms_score_threshold, + 'first_stage_nms_iou_threshold': first_stage_nms_iou_threshold, + 'first_stage_max_proposals': first_stage_max_proposals, + 'first_stage_localization_loss_weight': + first_stage_localization_loss_weight, + 'first_stage_objectness_loss_weight': + first_stage_objectness_loss_weight, + 'second_stage_batch_size': second_stage_batch_size, + 'second_stage_balance_fraction': second_stage_balance_fraction, + 'second_stage_non_max_suppression_fn': + 
second_stage_non_max_suppression_fn, + 'second_stage_score_conversion_fn': second_stage_score_conversion_fn, + 'second_stage_localization_loss_weight': + second_stage_localization_loss_weight, + 'second_stage_classification_loss_weight': + second_stage_classification_loss_weight, + 'second_stage_classification_loss': + second_stage_classification_loss, + 'hard_example_miner': hard_example_miner} + + return self._get_model( + self._get_second_stage_box_predictor( + num_classes=num_classes, + is_training=is_training, + predict_masks=predict_masks, + masks_are_class_agnostic=masks_are_class_agnostic), **common_kwargs) + + def test_predict_gives_correct_shapes_in_inference_mode_first_stage_only( + self): + test_graph = tf.Graph() + with test_graph.as_default(): + model = self._build_model( + is_training=False, number_of_stages=1, second_stage_batch_size=2) + batch_size = 2 + height = 10 + width = 12 + input_image_shape = (batch_size, height, width, 3) + + _, true_image_shapes = model.preprocess(tf.zeros(input_image_shape)) + preprocessed_inputs = tf.placeholder( + dtype=tf.float32, shape=(batch_size, None, None, 3)) + prediction_dict = model.predict(preprocessed_inputs, true_image_shapes) + + # In inference mode, anchors are clipped to the image window, but not + # pruned. Since MockFasterRCNN.extract_proposal_features returns a + # tensor with the same shape as its input, the expected number of anchors + # is height * width * the number of anchors per location (i.e. 3x3). + expected_num_anchors = height * width * 3 * 3 + expected_output_keys = set([ + 'rpn_box_predictor_features', 'rpn_features_to_crop', 'image_shape', + 'rpn_box_encodings', 'rpn_objectness_predictions_with_background', + 'anchors']) + expected_output_shapes = { + 'rpn_box_predictor_features': (batch_size, height, width, 512), + 'rpn_features_to_crop': (batch_size, height, width, 3), + 'rpn_box_encodings': (batch_size, expected_num_anchors, 4), + 'rpn_objectness_predictions_with_background': + (batch_size, expected_num_anchors, 2), + 'anchors': (expected_num_anchors, 4) + } + + init_op = tf.global_variables_initializer() + with self.test_session(graph=test_graph) as sess: + sess.run(init_op) + prediction_out = sess.run(prediction_dict, + feed_dict={ + preprocessed_inputs: + np.zeros(input_image_shape) + }) + + self.assertEqual(set(prediction_out.keys()), expected_output_keys) + + self.assertAllEqual(prediction_out['image_shape'], input_image_shape) + for output_key, expected_shape in expected_output_shapes.items(): + self.assertAllEqual(prediction_out[output_key].shape, expected_shape) + + # Check that anchors are clipped to window. 
+ anchors = prediction_out['anchors'] + self.assertTrue(np.all(np.greater_equal(anchors, 0))) + self.assertTrue(np.all(np.less_equal(anchors[:, 0], height))) + self.assertTrue(np.all(np.less_equal(anchors[:, 1], width))) + self.assertTrue(np.all(np.less_equal(anchors[:, 2], height))) + self.assertTrue(np.all(np.less_equal(anchors[:, 3], width))) + + def test_predict_gives_valid_anchors_in_training_mode_first_stage_only(self): + test_graph = tf.Graph() + with test_graph.as_default(): + model = self._build_model( + is_training=True, number_of_stages=1, second_stage_batch_size=2) + batch_size = 2 + height = 10 + width = 12 + input_image_shape = (batch_size, height, width, 3) + _, true_image_shapes = model.preprocess(tf.zeros(input_image_shape)) + preprocessed_inputs = tf.placeholder( + dtype=tf.float32, shape=(batch_size, None, None, 3)) + prediction_dict = model.predict(preprocessed_inputs, true_image_shapes) + + expected_output_keys = set([ + 'rpn_box_predictor_features', 'rpn_features_to_crop', 'image_shape', + 'rpn_box_encodings', 'rpn_objectness_predictions_with_background', + 'anchors']) + # At training time, anchors that exceed image bounds are pruned. Thus + # the `expected_num_anchors` in the above inference mode test is now + # a strict upper bound on the number of anchors. + num_anchors_strict_upper_bound = height * width * 3 * 3 + + init_op = tf.global_variables_initializer() + with self.test_session(graph=test_graph) as sess: + sess.run(init_op) + prediction_out = sess.run(prediction_dict, + feed_dict={ + preprocessed_inputs: + np.zeros(input_image_shape) + }) + + self.assertEqual(set(prediction_out.keys()), expected_output_keys) + self.assertAllEqual(prediction_out['image_shape'], input_image_shape) + + # Check that anchors have less than the upper bound and + # are clipped to window. 
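Editor's aside (not part of the patch): per the comments in these two tests, inference clips anchors to the image window while training prunes anchors that stick out of it, so only the training-time count shrinks. A hedged NumPy sketch of the two behaviours on toy anchors:

```python
import numpy as np

# Toy anchors in [ymin, xmin, ymax, xmax] against a 10 x 12 window.
anchors = np.array([[-2.0, -3.0,  5.0,  6.0],    # sticks out of the window
                    [ 1.0,  1.0,  4.0,  4.0],    # fully inside
                    [ 8.0, 10.0, 15.0, 20.0]])   # sticks out of the window
ymin, xmin, ymax, xmax = 0.0, 0.0, 10.0, 12.0

# Inference-style clipping: coordinates are clamped, no anchors are dropped.
clipped = np.column_stack([np.clip(anchors[:, 0], ymin, ymax),
                           np.clip(anchors[:, 1], xmin, xmax),
                           np.clip(anchors[:, 2], ymin, ymax),
                           np.clip(anchors[:, 3], xmin, xmax)])

# Training-style pruning: keep only anchors completely inside the window.
inside = ((anchors[:, 0] >= ymin) & (anchors[:, 1] >= xmin) &
          (anchors[:, 2] <= ymax) & (anchors[:, 3] <= xmax))
pruned = anchors[inside]

print(clipped.shape, pruned.shape)   # (3, 4) versus (1, 4)
```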
+ anchors = prediction_out['anchors'] + self.assertTrue(len(anchors.shape) == 2 and anchors.shape[1] == 4) + num_anchors_out = anchors.shape[0] + self.assertTrue(num_anchors_out < num_anchors_strict_upper_bound) + + self.assertTrue(np.all(np.greater_equal(anchors, 0))) + self.assertTrue(np.all(np.less_equal(anchors[:, 0], height))) + self.assertTrue(np.all(np.less_equal(anchors[:, 1], width))) + self.assertTrue(np.all(np.less_equal(anchors[:, 2], height))) + self.assertTrue(np.all(np.less_equal(anchors[:, 3], width))) + + self.assertAllEqual(prediction_out['rpn_box_encodings'].shape, + (batch_size, num_anchors_out, 4)) + self.assertAllEqual( + prediction_out['rpn_objectness_predictions_with_background'].shape, + (batch_size, num_anchors_out, 2)) + + def test_predict_correct_shapes_in_inference_mode_two_stages(self): + batch_size = 2 + image_size = 10 + max_num_proposals = 8 + initial_crop_size = 3 + maxpool_stride = 1 + + input_shapes = [(batch_size, image_size, image_size, 3), + (None, image_size, image_size, 3), + (batch_size, None, None, 3), + (None, None, None, 3)] + expected_num_anchors = image_size * image_size * 3 * 3 + expected_shapes = { + 'rpn_box_predictor_features': + (2, image_size, image_size, 512), + 'rpn_features_to_crop': (2, image_size, image_size, 3), + 'image_shape': (4,), + 'rpn_box_encodings': (2, expected_num_anchors, 4), + 'rpn_objectness_predictions_with_background': + (2, expected_num_anchors, 2), + 'anchors': (expected_num_anchors, 4), + 'refined_box_encodings': (2 * max_num_proposals, 2, 4), + 'class_predictions_with_background': (2 * max_num_proposals, 2 + 1), + 'num_proposals': (2,), + 'proposal_boxes': (2, max_num_proposals, 4), + 'proposal_boxes_normalized': (2, max_num_proposals, 4), + 'box_classifier_features': + self._get_box_classifier_features_shape(image_size, + batch_size, + max_num_proposals, + initial_crop_size, + maxpool_stride, + 3) + } + + for input_shape in input_shapes: + test_graph = tf.Graph() + with test_graph.as_default(): + model = self._build_model( + is_training=False, + number_of_stages=2, + second_stage_batch_size=2, + predict_masks=False) + preprocessed_inputs = tf.placeholder(tf.float32, shape=input_shape) + _, true_image_shapes = model.preprocess(preprocessed_inputs) + result_tensor_dict = model.predict( + preprocessed_inputs, true_image_shapes) + init_op = tf.global_variables_initializer() + with self.test_session(graph=test_graph) as sess: + sess.run(init_op) + tensor_dict_out = sess.run(result_tensor_dict, feed_dict={ + preprocessed_inputs: + np.zeros((batch_size, image_size, image_size, 3))}) + self.assertEqual(set(tensor_dict_out.keys()), + set(expected_shapes.keys())) + for key in expected_shapes: + self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key]) + + def test_predict_gives_correct_shapes_in_train_mode_both_stages(self): + test_graph = tf.Graph() + with test_graph.as_default(): + model = self._build_model( + is_training=True, + number_of_stages=2, + second_stage_batch_size=7, + predict_masks=False) + + batch_size = 2 + image_size = 10 + max_num_proposals = 7 + initial_crop_size = 3 + maxpool_stride = 1 + + image_shape = (batch_size, image_size, image_size, 3) + preprocessed_inputs = tf.zeros(image_shape, dtype=tf.float32) + groundtruth_boxes_list = [ + tf.constant([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=tf.float32), + tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32)] + groundtruth_classes_list = [ + tf.constant([[1, 0], [0, 1]], dtype=tf.float32), + tf.constant([[1, 0], [1, 0]], 
dtype=tf.float32)] + + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list) + + result_tensor_dict = model.predict(preprocessed_inputs, true_image_shapes) + expected_shapes = { + 'rpn_box_predictor_features': + (2, image_size, image_size, 512), + 'rpn_features_to_crop': (2, image_size, image_size, 3), + 'image_shape': (4,), + 'refined_box_encodings': (2 * max_num_proposals, 2, 4), + 'class_predictions_with_background': (2 * max_num_proposals, 2 + 1), + 'num_proposals': (2,), + 'proposal_boxes': (2, max_num_proposals, 4), + 'proposal_boxes_normalized': (2, max_num_proposals, 4), + 'box_classifier_features': + self._get_box_classifier_features_shape(image_size, + batch_size, + max_num_proposals, + initial_crop_size, + maxpool_stride, + 3) + } + + init_op = tf.global_variables_initializer() + with self.test_session(graph=test_graph) as sess: + sess.run(init_op) + tensor_dict_out = sess.run(result_tensor_dict) + self.assertEqual(set(tensor_dict_out.keys()), + set(expected_shapes.keys()).union(set([ + 'rpn_box_encodings', + 'rpn_objectness_predictions_with_background', + 'anchors']))) + for key in expected_shapes: + self.assertAllEqual(tensor_dict_out[key].shape, expected_shapes[key]) + + anchors_shape_out = tensor_dict_out['anchors'].shape + self.assertEqual(2, len(anchors_shape_out)) + self.assertEqual(4, anchors_shape_out[1]) + num_anchors_out = anchors_shape_out[0] + self.assertAllEqual(tensor_dict_out['rpn_box_encodings'].shape, + (2, num_anchors_out, 4)) + self.assertAllEqual( + tensor_dict_out['rpn_objectness_predictions_with_background'].shape, + (2, num_anchors_out, 2)) + + def _test_postprocess_first_stage_only_inference_mode( + self, pad_to_max_dimension=None): + model = self._build_model( + is_training=False, number_of_stages=1, second_stage_batch_size=6, + pad_to_max_dimension=pad_to_max_dimension) + batch_size = 2 + anchors = tf.constant( + [[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [16, 16, 32, 32]], dtype=tf.float32) + rpn_box_encodings = tf.zeros( + [batch_size, anchors.get_shape().as_list()[0], + BOX_CODE_SIZE], dtype=tf.float32) + # use different numbers for the objectness category to break ties in + # order of boxes returned by NMS + rpn_objectness_predictions_with_background = tf.constant([ + [[-10, 13], + [10, -10], + [10, -11], + [-10, 12]], + [[10, -10], + [-10, 13], + [-10, 12], + [10, -11]]], dtype=tf.float32) + rpn_features_to_crop = tf.ones((batch_size, 8, 8, 10), dtype=tf.float32) + image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32) + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + proposals = model.postprocess({ + 'rpn_box_encodings': rpn_box_encodings, + 'rpn_objectness_predictions_with_background': + rpn_objectness_predictions_with_background, + 'rpn_features_to_crop': rpn_features_to_crop, + 'anchors': anchors}, true_image_shapes) + expected_proposal_boxes = [ + [[0, 0, .5, .5], [.5, .5, 1, 1], [0, .5, .5, 1], [.5, 0, 1.0, .5]] + + 4 * [4 * [0]], + [[0, .5, .5, 1], [.5, 0, 1.0, .5], [0, 0, .5, .5], [.5, .5, 1, 1]] + + 4 * [4 * [0]]] + expected_proposal_scores = [[1, 1, 0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0, 0, 0]] + expected_num_proposals = [4, 4] + + expected_output_keys = set(['detection_boxes', 'detection_scores', + 'num_detections']) + self.assertEqual(set(proposals.keys()), expected_output_keys) + with self.test_session() as sess: + proposals_out = sess.run(proposals) + 
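Editor's aside (not part of the patch): the expected proposal scores of `[1, 1, 0, 0, ...]` checked in the assertions that follow are what a softmax over the two objectness logits produces, which is what the expected values imply: logits like `[-10, 13]` give a foreground probability of essentially 1, and `[10, -10]` give essentially 0. A quick NumPy check:

```python
import numpy as np

def softmax(logits):
    e = np.exp(logits - logits.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

# Objectness logits for the four anchors of the first image in the test.
logits = np.array([[-10., 13.], [10., -10.], [10., -11.], [-10., 12.]])
print(np.round(softmax(logits)[:, 1], 3))  # [1. 0. 0. 1.] -> sorted by score: 1, 1, 0, 0
```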
self.assertAllClose(proposals_out['detection_boxes'], + expected_proposal_boxes) + self.assertAllClose(proposals_out['detection_scores'], + expected_proposal_scores) + self.assertAllEqual(proposals_out['num_detections'], + expected_num_proposals) + + def test_postprocess_first_stage_only_inference_mode(self): + self._test_postprocess_first_stage_only_inference_mode() + + def test_postprocess_first_stage_only_inference_mode_padded_image(self): + self._test_postprocess_first_stage_only_inference_mode( + pad_to_max_dimension=56) + + def _test_postprocess_first_stage_only_train_mode(self, + pad_to_max_dimension=None): + model = self._build_model( + is_training=True, number_of_stages=1, second_stage_batch_size=2, + pad_to_max_dimension=pad_to_max_dimension) + batch_size = 2 + anchors = tf.constant( + [[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [16, 16, 32, 32]], dtype=tf.float32) + rpn_box_encodings = tf.zeros( + [batch_size, anchors.get_shape().as_list()[0], + BOX_CODE_SIZE], dtype=tf.float32) + # use different numbers for the objectness category to break ties in + # order of boxes returned by NMS + rpn_objectness_predictions_with_background = tf.constant([ + [[-10, 13], + [-10, 12], + [-10, 11], + [-10, 10]], + [[-10, 13], + [-10, 12], + [-10, 11], + [-10, 10]]], dtype=tf.float32) + rpn_features_to_crop = tf.ones((batch_size, 8, 8, 10), dtype=tf.float32) + image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32) + groundtruth_boxes_list = [ + tf.constant([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=tf.float32), + tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32)] + groundtruth_classes_list = [tf.constant([[1, 0], [0, 1]], dtype=tf.float32), + tf.constant([[1, 0], [1, 0]], dtype=tf.float32)] + + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list) + proposals = model.postprocess({ + 'rpn_box_encodings': rpn_box_encodings, + 'rpn_objectness_predictions_with_background': + rpn_objectness_predictions_with_background, + 'rpn_features_to_crop': rpn_features_to_crop, + 'anchors': anchors}, true_image_shapes) + expected_proposal_boxes = [ + [[0, 0, .5, .5], [.5, .5, 1, 1]], [[0, .5, .5, 1], [.5, 0, 1, .5]]] + expected_proposal_scores = [[1, 1], + [1, 1]] + expected_num_proposals = [2, 2] + + expected_output_keys = set(['detection_boxes', 'detection_scores', + 'num_detections']) + self.assertEqual(set(proposals.keys()), expected_output_keys) + + with self.test_session() as sess: + proposals_out = sess.run(proposals) + for image_idx in range(batch_size): + self.assertTrue( + test_utils.first_rows_close_as_set( + proposals_out['detection_boxes'][image_idx].tolist(), + expected_proposal_boxes[image_idx])) + self.assertAllClose(proposals_out['detection_scores'], + expected_proposal_scores) + self.assertAllEqual(proposals_out['num_detections'], + expected_num_proposals) + + def test_postprocess_first_stage_only_train_mode(self): + self._test_postprocess_first_stage_only_train_mode() + + def test_postprocess_first_stage_only_train_mode_padded_image(self): + self._test_postprocess_first_stage_only_train_mode(pad_to_max_dimension=56) + + def _test_postprocess_second_stage_only_inference_mode( + self, pad_to_max_dimension=None): + num_proposals_shapes = [(2), (None,)] + refined_box_encodings_shapes = [(16, 2, 4), (None, 2, 4)] + class_predictions_with_background_shapes = [(16, 3), (None, 3)] + proposal_boxes_shapes = [(2, 8, 4), (None, 8, 4)] + batch_size = 2 + image_shape = np.array((2, 
36, 48, 3), dtype=np.int32) + for (num_proposals_shape, refined_box_encoding_shape, + class_predictions_with_background_shape, + proposal_boxes_shape) in zip(num_proposals_shapes, + refined_box_encodings_shapes, + class_predictions_with_background_shapes, + proposal_boxes_shapes): + tf_graph = tf.Graph() + with tf_graph.as_default(): + model = self._build_model( + is_training=False, number_of_stages=2, + second_stage_batch_size=6, + pad_to_max_dimension=pad_to_max_dimension) + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + total_num_padded_proposals = batch_size * model.max_num_proposals + proposal_boxes = np.array( + [[[1, 1, 2, 3], + [0, 0, 1, 1], + [.5, .5, .6, .6], + 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]], + [[2, 3, 6, 8], + [1, 2, 5, 3], + 4*[0], 4*[0], 4*[0], 4*[0], 4*[0], 4*[0]]]) + num_proposals = np.array([3, 2], dtype=np.int32) + refined_box_encodings = np.zeros( + [total_num_padded_proposals, model.num_classes, 4]) + class_predictions_with_background = np.ones( + [total_num_padded_proposals, model.num_classes+1]) + + num_proposals_placeholder = tf.placeholder(tf.int32, + shape=num_proposals_shape) + refined_box_encodings_placeholder = tf.placeholder( + tf.float32, shape=refined_box_encoding_shape) + class_predictions_with_background_placeholder = tf.placeholder( + tf.float32, shape=class_predictions_with_background_shape) + proposal_boxes_placeholder = tf.placeholder( + tf.float32, shape=proposal_boxes_shape) + image_shape_placeholder = tf.placeholder(tf.int32, shape=(4)) + + detections = model.postprocess({ + 'refined_box_encodings': refined_box_encodings_placeholder, + 'class_predictions_with_background': + class_predictions_with_background_placeholder, + 'num_proposals': num_proposals_placeholder, + 'proposal_boxes': proposal_boxes_placeholder, + }, true_image_shapes) + with self.test_session(graph=tf_graph) as sess: + detections_out = sess.run( + detections, + feed_dict={ + refined_box_encodings_placeholder: refined_box_encodings, + class_predictions_with_background_placeholder: + class_predictions_with_background, + num_proposals_placeholder: num_proposals, + proposal_boxes_placeholder: proposal_boxes, + image_shape_placeholder: image_shape + }) + self.assertAllEqual(detections_out['detection_boxes'].shape, [2, 5, 4]) + self.assertAllClose(detections_out['detection_scores'], + [[1, 1, 1, 1, 1], [1, 1, 1, 1, 0]]) + self.assertAllClose(detections_out['detection_classes'], + [[0, 0, 0, 1, 1], [0, 0, 1, 1, 0]]) + self.assertAllClose(detections_out['num_detections'], [5, 4]) + + def test_postprocess_second_stage_only_inference_mode(self): + self._test_postprocess_second_stage_only_inference_mode() + + def test_postprocess_second_stage_only_inference_mode_padded_image(self): + self._test_postprocess_second_stage_only_inference_mode( + pad_to_max_dimension=56) + + def test_preprocess_preserves_input_shapes(self): + image_shapes = [(3, None, None, 3), + (None, 10, 10, 3), + (None, None, None, 3)] + for image_shape in image_shapes: + model = self._build_model( + is_training=False, number_of_stages=2, second_stage_batch_size=6) + image_placeholder = tf.placeholder(tf.float32, shape=image_shape) + preprocessed_inputs, _ = model.preprocess(image_placeholder) + self.assertAllEqual(preprocessed_inputs.shape.as_list(), image_shape) + + # TODO(rathodv): Split test into two - with and without masks. 
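Editor's aside (not part of the patch): the RPN-loss test that follows expects both loss terms to be zero because the anchors coincide with the groundtruth boxes (so the all-zero box encodings already equal the regression targets) and the objectness logits have large margins in the right direction. A hedged sketch of why such margins make a softmax cross-entropy vanish:

```python
import numpy as np

def softmax_xent(logits, one_hot_target):
    # Numerically stable softmax cross-entropy for a single prediction.
    log_probs = logits - logits.max()
    log_probs -= np.log(np.exp(log_probs).sum())
    return -(one_hot_target * log_probs).sum()

# A matched anchor (target class 1) with logits strongly favouring "object".
print(softmax_xent(np.array([-10., 13.]), np.array([0., 1.])))  # ~1e-10
# A background anchor with logits strongly favouring "background".
print(softmax_xent(np.array([10., -10.]), np.array([1., 0.])))  # ~2e-9
```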
+ def test_loss_first_stage_only_mode(self): + model = self._build_model( + is_training=True, number_of_stages=1, second_stage_batch_size=6) + batch_size = 2 + anchors = tf.constant( + [[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [16, 16, 32, 32]], dtype=tf.float32) + + rpn_box_encodings = tf.zeros( + [batch_size, + anchors.get_shape().as_list()[0], + BOX_CODE_SIZE], dtype=tf.float32) + # use different numbers for the objectness category to break ties in + # order of boxes returned by NMS + rpn_objectness_predictions_with_background = tf.constant([ + [[-10, 13], + [10, -10], + [10, -11], + [-10, 12]], + [[10, -10], + [-10, 13], + [-10, 12], + [10, -11]]], dtype=tf.float32) + image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32) + + groundtruth_boxes_list = [ + tf.constant([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=tf.float32), + tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32)] + groundtruth_classes_list = [tf.constant([[1, 0], [0, 1]], dtype=tf.float32), + tf.constant([[1, 0], [1, 0]], dtype=tf.float32)] + + prediction_dict = { + 'rpn_box_encodings': rpn_box_encodings, + 'rpn_objectness_predictions_with_background': + rpn_objectness_predictions_with_background, + 'image_shape': image_shape, + 'anchors': anchors + } + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list) + loss_dict = model.loss(prediction_dict, true_image_shapes) + with self.test_session() as sess: + loss_dict_out = sess.run(loss_dict) + self.assertAllClose(loss_dict_out['Loss/RPNLoss/localization_loss'], 0) + self.assertAllClose(loss_dict_out['Loss/RPNLoss/objectness_loss'], 0) + self.assertTrue('Loss/BoxClassifierLoss/localization_loss' + not in loss_dict_out) + self.assertTrue('Loss/BoxClassifierLoss/classification_loss' + not in loss_dict_out) + + # TODO(rathodv): Split test into two - with and without masks. 
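Editor's aside (not part of the patch): `test_loss_full` below likewise expects zero second-stage localization loss because each groundtruth box coincides with the proposal it is matched to, which makes the regression target all zeros and hence equal to the all-zero `refined_box_encodings`. A minimal sketch of the (unscaled) Faster R-CNN box encoding shows this; the real box coder also applies scale factors, which do not change a zero target.

```python
import numpy as np

def encode(box, anchor):
    # [ymin, xmin, ymax, xmax] -> (ty, tx, th, tw) relative to the anchor,
    # with the usual scale factors omitted for brevity.
    ycenter_a, xcenter_a = (anchor[0] + anchor[2]) / 2., (anchor[1] + anchor[3]) / 2.
    ha, wa = anchor[2] - anchor[0], anchor[3] - anchor[1]
    ycenter, xcenter = (box[0] + box[2]) / 2., (box[1] + box[3]) / 2.
    h, w = box[2] - box[0], box[3] - box[1]
    return np.array([(ycenter - ycenter_a) / ha, (xcenter - xcenter_a) / wa,
                     np.log(h / ha), np.log(w / wa)])

proposal = np.array([0., 0., 16., 16.])
groundtruth = np.array([0., 0., 16., 16.])   # identical to the matched proposal
print(encode(groundtruth, proposal))         # [0. 0. 0. 0.]
```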
+ def test_loss_full(self): + model = self._build_model( + is_training=True, number_of_stages=2, second_stage_batch_size=6) + batch_size = 3 + anchors = tf.constant( + [[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [16, 16, 32, 32]], dtype=tf.float32) + rpn_box_encodings = tf.zeros( + [batch_size, + anchors.get_shape().as_list()[0], + BOX_CODE_SIZE], dtype=tf.float32) + # use different numbers for the objectness category to break ties in + # order of boxes returned by NMS + rpn_objectness_predictions_with_background = tf.constant( + [[[-10, 13], [10, -10], [10, -11], [-10, 12]], [[10, -10], [-10, 13], [ + -10, 12 + ], [10, -11]], [[10, -10], [-10, 13], [-10, 12], [10, -11]]], + dtype=tf.float32) + image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32) + + num_proposals = tf.constant([6, 6, 6], dtype=tf.int32) + proposal_boxes = tf.constant( + 3 * [[[0, 0, 16, 16], [0, 16, 16, 32], [16, 0, 32, 16], + [16, 16, 32, 32], [0, 0, 16, 16], [0, 16, 16, 32]]], + dtype=tf.float32) + refined_box_encodings = tf.zeros( + (batch_size * model.max_num_proposals, + model.num_classes, + BOX_CODE_SIZE), dtype=tf.float32) + class_predictions_with_background = tf.constant( + [ + [-10, 10, -10], # first image + [10, -10, -10], + [10, -10, -10], + [-10, -10, 10], + [-10, 10, -10], + [10, -10, -10], + [10, -10, -10], # second image + [-10, 10, -10], + [-10, 10, -10], + [10, -10, -10], + [10, -10, -10], + [-10, 10, -10], + [10, -10, -10], # third image + [-10, 10, -10], + [-10, 10, -10], + [10, -10, -10], + [10, -10, -10], + [-10, 10, -10] + ], + dtype=tf.float32) + + mask_predictions_logits = 20 * tf.ones((batch_size * + model.max_num_proposals, + model.num_classes, + 14, 14), + dtype=tf.float32) + + groundtruth_boxes_list = [ + tf.constant([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=tf.float32), + tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32), + tf.constant([[0, .5, .5, 1], [.5, 0, 1, 1]], dtype=tf.float32) + ] + groundtruth_classes_list = [ + tf.constant([[1, 0], [0, 1]], dtype=tf.float32), + tf.constant([[1, 0], [1, 0]], dtype=tf.float32), + tf.constant([[1, 0], [0, 1]], dtype=tf.float32) + ] + + # Set all elements of groundtruth mask to 1.0. In this case all proposal + # crops of the groundtruth masks should return a mask that covers the entire + # proposal. Thus, if mask_predictions_logits element values are all greater + # than 20, the loss should be zero. 
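Editor's aside (not part of the patch): the comment above states that all-ones groundtruth masks combined with mask logits of 20 give an effectively zero mask loss. Assuming a per-pixel sigmoid cross-entropy on the mask logits, each correct pixel then contributes about 2e-9, which rounds to zero in the assertions later in the test. A quick check:

```python
import numpy as np

def sigmoid_xent(logit, target):
    # Numerically stable per-pixel sigmoid cross-entropy.
    return max(logit, 0.) - logit * target + np.log1p(np.exp(-abs(logit)))

print(sigmoid_xent(20., 1.))   # ~2.06e-09: a correct pixel costs almost nothing
print(sigmoid_xent(20., 0.))   # ~20.0: what a wrong pixel would cost instead
```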
+ groundtruth_masks_list = [ + tf.convert_to_tensor(np.ones((2, 32, 32)), dtype=tf.float32), + tf.convert_to_tensor(np.ones((2, 32, 32)), dtype=tf.float32), + tf.convert_to_tensor(np.ones((2, 32, 32)), dtype=tf.float32) + ] + groundtruth_weights_list = [ + tf.constant([1, 1], dtype=tf.float32), + tf.constant([1, 1], dtype=tf.float32), + tf.constant([1, 0], dtype=tf.float32) + ] + prediction_dict = { + 'rpn_box_encodings': rpn_box_encodings, + 'rpn_objectness_predictions_with_background': + rpn_objectness_predictions_with_background, + 'image_shape': image_shape, + 'anchors': anchors, + 'refined_box_encodings': refined_box_encodings, + 'class_predictions_with_background': class_predictions_with_background, + 'proposal_boxes': proposal_boxes, + 'num_proposals': num_proposals, + 'mask_predictions': mask_predictions_logits + } + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + model.provide_groundtruth( + groundtruth_boxes_list, + groundtruth_classes_list, + groundtruth_masks_list, + groundtruth_weights_list=groundtruth_weights_list) + loss_dict = model.loss(prediction_dict, true_image_shapes) + + with self.test_session() as sess: + loss_dict_out = sess.run(loss_dict) + self.assertAllClose(loss_dict_out['Loss/RPNLoss/localization_loss'], 0) + self.assertAllClose(loss_dict_out['Loss/RPNLoss/objectness_loss'], 0) + self.assertAllClose(loss_dict_out[ + 'Loss/BoxClassifierLoss/localization_loss'], 0) + self.assertAllClose(loss_dict_out[ + 'Loss/BoxClassifierLoss/classification_loss'], 0) + self.assertAllClose(loss_dict_out['Loss/BoxClassifierLoss/mask_loss'], 0) + + def test_loss_full_zero_padded_proposals(self): + model = self._build_model( + is_training=True, number_of_stages=2, second_stage_batch_size=6) + batch_size = 1 + anchors = tf.constant( + [[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [16, 16, 32, 32]], dtype=tf.float32) + rpn_box_encodings = tf.zeros( + [batch_size, + anchors.get_shape().as_list()[0], + BOX_CODE_SIZE], dtype=tf.float32) + # use different numbers for the objectness category to break ties in + # order of boxes returned by NMS + rpn_objectness_predictions_with_background = tf.constant([ + [[-10, 13], + [10, -10], + [10, -11], + [10, -12]],], dtype=tf.float32) + image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32) + + # box_classifier_batch_size is 6, but here we assume that the number of + # actual proposals (not counting zero paddings) is fewer (3). + num_proposals = tf.constant([3], dtype=tf.int32) + proposal_boxes = tf.constant( + [[[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [0, 0, 0, 0], # begin paddings + [0, 0, 0, 0], + [0, 0, 0, 0]]], dtype=tf.float32) + + refined_box_encodings = tf.zeros( + (batch_size * model.max_num_proposals, + model.num_classes, + BOX_CODE_SIZE), dtype=tf.float32) + class_predictions_with_background = tf.constant( + [[-10, 10, -10], + [10, -10, -10], + [10, -10, -10], + [0, 0, 0], # begin paddings + [0, 0, 0], + [0, 0, 0]], dtype=tf.float32) + + mask_predictions_logits = 20 * tf.ones((batch_size * + model.max_num_proposals, + model.num_classes, + 14, 14), + dtype=tf.float32) + + groundtruth_boxes_list = [ + tf.constant([[0, 0, .5, .5]], dtype=tf.float32)] + groundtruth_classes_list = [tf.constant([[1, 0]], dtype=tf.float32)] + + # Set all elements of groundtruth mask to 1.0. In this case all proposal + # crops of the groundtruth masks should return a mask that covers the entire + # proposal. 
Thus, if mask_predictions_logits element values are all greater + # than 20, the loss should be zero. + groundtruth_masks_list = [tf.convert_to_tensor(np.ones((1, 32, 32)), + dtype=tf.float32)] + + prediction_dict = { + 'rpn_box_encodings': rpn_box_encodings, + 'rpn_objectness_predictions_with_background': + rpn_objectness_predictions_with_background, + 'image_shape': image_shape, + 'anchors': anchors, + 'refined_box_encodings': refined_box_encodings, + 'class_predictions_with_background': class_predictions_with_background, + 'proposal_boxes': proposal_boxes, + 'num_proposals': num_proposals, + 'mask_predictions': mask_predictions_logits + } + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list, + groundtruth_masks_list) + loss_dict = model.loss(prediction_dict, true_image_shapes) + + with self.test_session() as sess: + loss_dict_out = sess.run(loss_dict) + self.assertAllClose(loss_dict_out['Loss/RPNLoss/localization_loss'], 0) + self.assertAllClose(loss_dict_out['Loss/RPNLoss/objectness_loss'], 0) + self.assertAllClose(loss_dict_out[ + 'Loss/BoxClassifierLoss/localization_loss'], 0) + self.assertAllClose(loss_dict_out[ + 'Loss/BoxClassifierLoss/classification_loss'], 0) + self.assertAllClose(loss_dict_out['Loss/BoxClassifierLoss/mask_loss'], 0) + + def test_loss_full_multiple_label_groundtruth(self): + model = self._build_model( + is_training=True, number_of_stages=2, second_stage_batch_size=6, + softmax_second_stage_classification_loss=False) + batch_size = 1 + anchors = tf.constant( + [[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [16, 16, 32, 32]], dtype=tf.float32) + rpn_box_encodings = tf.zeros( + [batch_size, + anchors.get_shape().as_list()[0], + BOX_CODE_SIZE], dtype=tf.float32) + # use different numbers for the objectness category to break ties in + # order of boxes returned by NMS + rpn_objectness_predictions_with_background = tf.constant([ + [[-10, 13], + [10, -10], + [10, -11], + [10, -12]],], dtype=tf.float32) + image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32) + + # box_classifier_batch_size is 6, but here we assume that the number of + # actual proposals (not counting zero paddings) is fewer (3). + num_proposals = tf.constant([3], dtype=tf.int32) + proposal_boxes = tf.constant( + [[[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [0, 0, 0, 0], # begin paddings + [0, 0, 0, 0], + [0, 0, 0, 0]]], dtype=tf.float32) + + # second_stage_localization_loss should only be computed for predictions + # that match groundtruth. For multiple label groundtruth boxes, the loss + # should only be computed once for the label with the smaller index. + refined_box_encodings = tf.constant( + [[[0, 0, 0, 0], [1, 1, -1, -1]], + [[1, 1, -1, -1], [1, 1, 1, 1]], + [[1, 1, -1, -1], [1, 1, 1, 1]], + [[1, 1, -1, -1], [1, 1, 1, 1]], + [[1, 1, -1, -1], [1, 1, 1, 1]], + [[1, 1, -1, -1], [1, 1, 1, 1]]], dtype=tf.float32) + class_predictions_with_background = tf.constant( + [[-100, 100, 100], + [100, -100, -100], + [100, -100, -100], + [0, 0, 0], # begin paddings + [0, 0, 0], + [0, 0, 0]], dtype=tf.float32) + + mask_predictions_logits = 20 * tf.ones((batch_size * + model.max_num_proposals, + model.num_classes, + 14, 14), + dtype=tf.float32) + + groundtruth_boxes_list = [ + tf.constant([[0, 0, .5, .5]], dtype=tf.float32)] + # Box contains two ground truth labels. + groundtruth_classes_list = [tf.constant([[1, 1]], dtype=tf.float32)] + + # Set all elements of groundtruth mask to 1.0. 
In this case all proposal + # crops of the groundtruth masks should return a mask that covers the entire + # proposal. Thus, if mask_predictions_logits element values are all greater + # than 20, the loss should be zero. + groundtruth_masks_list = [tf.convert_to_tensor(np.ones((1, 32, 32)), + dtype=tf.float32)] + + prediction_dict = { + 'rpn_box_encodings': rpn_box_encodings, + 'rpn_objectness_predictions_with_background': + rpn_objectness_predictions_with_background, + 'image_shape': image_shape, + 'anchors': anchors, + 'refined_box_encodings': refined_box_encodings, + 'class_predictions_with_background': class_predictions_with_background, + 'proposal_boxes': proposal_boxes, + 'num_proposals': num_proposals, + 'mask_predictions': mask_predictions_logits + } + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list, + groundtruth_masks_list) + loss_dict = model.loss(prediction_dict, true_image_shapes) + + with self.test_session() as sess: + loss_dict_out = sess.run(loss_dict) + self.assertAllClose(loss_dict_out['Loss/RPNLoss/localization_loss'], 0) + self.assertAllClose(loss_dict_out['Loss/RPNLoss/objectness_loss'], 0) + self.assertAllClose(loss_dict_out[ + 'Loss/BoxClassifierLoss/localization_loss'], 0) + self.assertAllClose(loss_dict_out[ + 'Loss/BoxClassifierLoss/classification_loss'], 0) + self.assertAllClose(loss_dict_out['Loss/BoxClassifierLoss/mask_loss'], 0) + + def test_loss_full_zero_padded_proposals_nonzero_loss_with_two_images(self): + model = self._build_model( + is_training=True, number_of_stages=2, second_stage_batch_size=6) + batch_size = 2 + anchors = tf.constant( + [[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [16, 16, 32, 32]], dtype=tf.float32) + rpn_box_encodings = tf.zeros( + [batch_size, + anchors.get_shape().as_list()[0], + BOX_CODE_SIZE], dtype=tf.float32) + # use different numbers for the objectness category to break ties in + # order of boxes returned by NMS + rpn_objectness_predictions_with_background = tf.constant( + [[[-10, 13], + [10, -10], + [10, -11], + [10, -12]], + [[-10, 13], + [10, -10], + [10, -11], + [10, -12]]], dtype=tf.float32) + image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32) + + # box_classifier_batch_size is 6, but here we assume that the number of + # actual proposals (not counting zero paddings) is fewer. + num_proposals = tf.constant([3, 2], dtype=tf.int32) + proposal_boxes = tf.constant( + [[[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [0, 0, 0, 0], # begin paddings + [0, 0, 0, 0], + [0, 0, 0, 0]], + [[0, 0, 16, 16], + [0, 16, 16, 32], + [0, 0, 0, 0], # begin paddings + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0]]], dtype=tf.float32) + + refined_box_encodings = tf.zeros( + (batch_size * model.max_num_proposals, + model.num_classes, + BOX_CODE_SIZE), dtype=tf.float32) + class_predictions_with_background = tf.constant( + [[-10, 10, -10], # first image + [10, -10, -10], + [10, -10, -10], + [0, 0, 0], # begin paddings + [0, 0, 0], + [0, 0, 0], + [-10, -10, 10], # second image + [10, -10, -10], + [0, 0, 0], # begin paddings + [0, 0, 0], + [0, 0, 0], + [0, 0, 0],], dtype=tf.float32) + + # The first groundtruth box is 4/5 of the anchor size in both directions + # experiencing a loss of: + # 2 * SmoothL1(5 * log(4/5)) / num_proposals + # = 2 * (abs(5 * log(1/2)) - .5) / 3 + # The second groundtruth box is identical to the prediction and thus + # experiences zero loss. + # Total average loss is (abs(5 * log(1/2)) - .5) / 3. 
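+ # A rough numeric check of the expectation above (assuming natural logs and
+ # the SmoothL1 form |x| - 0.5 for |x| > 1 used in that comment):
+ #   |5 * log(4/5)| = 1.1157 > 1, so SmoothL1(5 * log(4/5)) = 0.6157;
+ #   image 1 contributes 2 * 0.6157 / 3, image 2 contributes 0, and the
+ #   average over the two images is 0.6157 / 3 ~= 0.205, which matches
+ #   exp_loc_loss = (-5 * np.log(.8) - 0.5) / 3.0 below.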
+ groundtruth_boxes_list = [ + tf.constant([[0.05, 0.05, 0.45, 0.45]], dtype=tf.float32), + tf.constant([[0.0, 0.0, 0.5, 0.5]], dtype=tf.float32)] + groundtruth_classes_list = [tf.constant([[1, 0]], dtype=tf.float32), + tf.constant([[0, 1]], dtype=tf.float32)] + exp_loc_loss = (-5 * np.log(.8) - 0.5) / 3.0 + + prediction_dict = { + 'rpn_box_encodings': rpn_box_encodings, + 'rpn_objectness_predictions_with_background': + rpn_objectness_predictions_with_background, + 'image_shape': image_shape, + 'anchors': anchors, + 'refined_box_encodings': refined_box_encodings, + 'class_predictions_with_background': class_predictions_with_background, + 'proposal_boxes': proposal_boxes, + 'num_proposals': num_proposals + } + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list) + loss_dict = model.loss(prediction_dict, true_image_shapes) + + with self.test_session() as sess: + loss_dict_out = sess.run(loss_dict) + self.assertAllClose(loss_dict_out['Loss/RPNLoss/localization_loss'], + exp_loc_loss) + self.assertAllClose(loss_dict_out['Loss/RPNLoss/objectness_loss'], 0) + self.assertAllClose(loss_dict_out[ + 'Loss/BoxClassifierLoss/localization_loss'], exp_loc_loss) + self.assertAllClose(loss_dict_out[ + 'Loss/BoxClassifierLoss/classification_loss'], 0) + + def test_loss_with_hard_mining(self): + model = self._build_model(is_training=True, + number_of_stages=2, + second_stage_batch_size=None, + first_stage_max_proposals=6, + hard_mining=True) + batch_size = 1 + anchors = tf.constant( + [[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [16, 16, 32, 32]], dtype=tf.float32) + rpn_box_encodings = tf.zeros( + [batch_size, + anchors.get_shape().as_list()[0], + BOX_CODE_SIZE], dtype=tf.float32) + # use different numbers for the objectness category to break ties in + # order of boxes returned by NMS + rpn_objectness_predictions_with_background = tf.constant( + [[[-10, 13], + [-10, 12], + [10, -11], + [10, -12]]], dtype=tf.float32) + image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32) + + # box_classifier_batch_size is 6, but here we assume that the number of + # actual proposals (not counting zero paddings) is fewer (3). + num_proposals = tf.constant([3], dtype=tf.int32) + proposal_boxes = tf.constant( + [[[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [0, 0, 0, 0], # begin paddings + [0, 0, 0, 0], + [0, 0, 0, 0]]], dtype=tf.float32) + + refined_box_encodings = tf.zeros( + (batch_size * model.max_num_proposals, + model.num_classes, + BOX_CODE_SIZE), dtype=tf.float32) + class_predictions_with_background = tf.constant( + [[-10, 10, -10], # first image + [-10, -10, 10], + [10, -10, -10], + [0, 0, 0], # begin paddings + [0, 0, 0], + [0, 0, 0]], dtype=tf.float32) + + # The first groundtruth box is 4/5 of the anchor size in both directions + # experiencing a loss of: + # 2 * SmoothL1(5 * log(4/5)) / num_proposals + # = 2 * (abs(5 * log(1/2)) - .5) / 3 + # The second groundtruth box is 46/50 of the anchor size in both directions + # experiencing a loss of: + # 2 * SmoothL1(5 * log(42/50)) / num_proposals + # = 2 * (.5(5 * log(.92))^2 - .5) / 3. + # Since the first groundtruth box experiences greater loss, and we have + # set num_hard_examples=1 in the HardMiner, the final localization loss + # corresponds to that of the first groundtruth box. 
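+ # Numerically (assuming natural logs): 5 * log(4/5) = -1.1157, so the first
+ # box's SmoothL1 term is 1.1157 - 0.5 = 0.6157 and its loss is
+ # 2 * 0.6157 / 3 ~= 0.41; 5 * log(46/50) = -0.4169, whose magnitude is below
+ # 1, so the second box's SmoothL1 term is 0.5 * 0.4169**2 ~= 0.087 and its
+ # loss is 2 * 0.087 / 3 ~= 0.058. The miner therefore keeps only the first
+ # box, giving exp_loc_loss = 2 * (-5 * np.log(.8) - 0.5) / 3.0 below.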
+ groundtruth_boxes_list = [ + tf.constant([[0.05, 0.05, 0.45, 0.45], + [0.02, 0.52, 0.48, 0.98],], dtype=tf.float32)] + groundtruth_classes_list = [tf.constant([[1, 0], [0, 1]], dtype=tf.float32)] + exp_loc_loss = 2 * (-5 * np.log(.8) - 0.5) / 3.0 + + prediction_dict = { + 'rpn_box_encodings': rpn_box_encodings, + 'rpn_objectness_predictions_with_background': + rpn_objectness_predictions_with_background, + 'image_shape': image_shape, + 'anchors': anchors, + 'refined_box_encodings': refined_box_encodings, + 'class_predictions_with_background': class_predictions_with_background, + 'proposal_boxes': proposal_boxes, + 'num_proposals': num_proposals + } + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list) + loss_dict = model.loss(prediction_dict, true_image_shapes) + + with self.test_session() as sess: + loss_dict_out = sess.run(loss_dict) + self.assertAllClose(loss_dict_out[ + 'Loss/BoxClassifierLoss/localization_loss'], exp_loc_loss) + self.assertAllClose(loss_dict_out[ + 'Loss/BoxClassifierLoss/classification_loss'], 0) + + def test_loss_full_with_shared_boxes(self): + model = self._build_model( + is_training=True, number_of_stages=2, second_stage_batch_size=6) + batch_size = 2 + anchors = tf.constant( + [[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [16, 16, 32, 32]], dtype=tf.float32) + rpn_box_encodings = tf.zeros( + [batch_size, + anchors.get_shape().as_list()[0], + BOX_CODE_SIZE], dtype=tf.float32) + # use different numbers for the objectness category to break ties in + # order of boxes returned by NMS + rpn_objectness_predictions_with_background = tf.constant([ + [[-10, 13], + [10, -10], + [10, -11], + [-10, 12]], + [[10, -10], + [-10, 13], + [-10, 12], + [10, -11]]], dtype=tf.float32) + image_shape = tf.constant([batch_size, 32, 32, 3], dtype=tf.int32) + + num_proposals = tf.constant([6, 6], dtype=tf.int32) + proposal_boxes = tf.constant( + 2 * [[[0, 0, 16, 16], + [0, 16, 16, 32], + [16, 0, 32, 16], + [16, 16, 32, 32], + [0, 0, 16, 16], + [0, 16, 16, 32]]], dtype=tf.float32) + refined_box_encodings = tf.zeros( + (batch_size * model.max_num_proposals, + 1, # one box shared among all the classes + BOX_CODE_SIZE), dtype=tf.float32) + class_predictions_with_background = tf.constant( + [[-10, 10, -10], # first image + [10, -10, -10], + [10, -10, -10], + [-10, -10, 10], + [-10, 10, -10], + [10, -10, -10], + [10, -10, -10], # second image + [-10, 10, -10], + [-10, 10, -10], + [10, -10, -10], + [10, -10, -10], + [-10, 10, -10]], dtype=tf.float32) + + mask_predictions_logits = 20 * tf.ones((batch_size * + model.max_num_proposals, + model.num_classes, + 14, 14), + dtype=tf.float32) + + groundtruth_boxes_list = [ + tf.constant([[0, 0, .5, .5], [.5, .5, 1, 1]], dtype=tf.float32), + tf.constant([[0, .5, .5, 1], [.5, 0, 1, .5]], dtype=tf.float32)] + groundtruth_classes_list = [tf.constant([[1, 0], [0, 1]], dtype=tf.float32), + tf.constant([[1, 0], [1, 0]], dtype=tf.float32)] + + # Set all elements of groundtruth mask to 1.0. In this case all proposal + # crops of the groundtruth masks should return a mask that covers the entire + # proposal. Thus, if mask_predictions_logits element values are all greater + # than 20, the loss should be zero. 
+ groundtruth_masks_list = [tf.convert_to_tensor(np.ones((2, 32, 32)), + dtype=tf.float32), + tf.convert_to_tensor(np.ones((2, 32, 32)), + dtype=tf.float32)] + prediction_dict = { + 'rpn_box_encodings': rpn_box_encodings, + 'rpn_objectness_predictions_with_background': + rpn_objectness_predictions_with_background, + 'image_shape': image_shape, + 'anchors': anchors, + 'refined_box_encodings': refined_box_encodings, + 'class_predictions_with_background': class_predictions_with_background, + 'proposal_boxes': proposal_boxes, + 'num_proposals': num_proposals, + 'mask_predictions': mask_predictions_logits + } + _, true_image_shapes = model.preprocess(tf.zeros(image_shape)) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list, + groundtruth_masks_list) + loss_dict = model.loss(prediction_dict, true_image_shapes) + + with self.test_session() as sess: + loss_dict_out = sess.run(loss_dict) + self.assertAllClose(loss_dict_out['Loss/RPNLoss/localization_loss'], 0) + self.assertAllClose(loss_dict_out['Loss/RPNLoss/objectness_loss'], 0) + self.assertAllClose(loss_dict_out[ + 'Loss/BoxClassifierLoss/localization_loss'], 0) + self.assertAllClose(loss_dict_out[ + 'Loss/BoxClassifierLoss/classification_loss'], 0) + self.assertAllClose(loss_dict_out['Loss/BoxClassifierLoss/mask_loss'], 0) + + def test_restore_map_for_classification_ckpt(self): + # Define mock tensorflow classification graph and save variables. + test_graph_classification = tf.Graph() + with test_graph_classification.as_default(): + image = tf.placeholder(dtype=tf.float32, shape=[1, 20, 20, 3]) + with tf.variable_scope('mock_model'): + net = slim.conv2d(image, num_outputs=3, kernel_size=1, scope='layer1') + slim.conv2d(net, num_outputs=3, kernel_size=1, scope='layer2') + + init_op = tf.global_variables_initializer() + saver = tf.train.Saver() + save_path = self.get_temp_dir() + with self.test_session(graph=test_graph_classification) as sess: + sess.run(init_op) + saved_model_path = saver.save(sess, save_path) + + # Create tensorflow detection graph and load variables from + # classification checkpoint. + test_graph_detection = tf.Graph() + with test_graph_detection.as_default(): + model = self._build_model( + is_training=False, number_of_stages=2, second_stage_batch_size=6) + + inputs_shape = (2, 20, 20, 3) + inputs = tf.to_float(tf.random_uniform( + inputs_shape, minval=0, maxval=255, dtype=tf.int32)) + preprocessed_inputs, true_image_shapes = model.preprocess(inputs) + prediction_dict = model.predict(preprocessed_inputs, true_image_shapes) + model.postprocess(prediction_dict, true_image_shapes) + var_map = model.restore_map(fine_tune_checkpoint_type='classification') + self.assertIsInstance(var_map, dict) + saver = tf.train.Saver(var_map) + with self.test_session(graph=test_graph_classification) as sess: + saver.restore(sess, saved_model_path) + for var in sess.run(tf.report_uninitialized_variables()): + self.assertNotIn(model.first_stage_feature_extractor_scope, var) + self.assertNotIn(model.second_stage_feature_extractor_scope, var) + + def test_restore_map_for_detection_ckpt(self): + # Define first detection graph and save variables. 
+ test_graph_detection1 = tf.Graph() + with test_graph_detection1.as_default(): + model = self._build_model( + is_training=False, number_of_stages=2, second_stage_batch_size=6) + inputs_shape = (2, 20, 20, 3) + inputs = tf.to_float(tf.random_uniform( + inputs_shape, minval=0, maxval=255, dtype=tf.int32)) + preprocessed_inputs, true_image_shapes = model.preprocess(inputs) + prediction_dict = model.predict(preprocessed_inputs, true_image_shapes) + model.postprocess(prediction_dict, true_image_shapes) + another_variable = tf.Variable([17.0], name='another_variable') # pylint: disable=unused-variable + init_op = tf.global_variables_initializer() + saver = tf.train.Saver() + save_path = self.get_temp_dir() + with self.test_session(graph=test_graph_detection1) as sess: + sess.run(init_op) + saved_model_path = saver.save(sess, save_path) + + # Define second detection graph and restore variables. + test_graph_detection2 = tf.Graph() + with test_graph_detection2.as_default(): + model2 = self._build_model(is_training=False, number_of_stages=2, + second_stage_batch_size=6, num_classes=42) + + inputs_shape2 = (2, 20, 20, 3) + inputs2 = tf.to_float(tf.random_uniform( + inputs_shape2, minval=0, maxval=255, dtype=tf.int32)) + preprocessed_inputs2, true_image_shapes = model2.preprocess(inputs2) + prediction_dict2 = model2.predict(preprocessed_inputs2, true_image_shapes) + model2.postprocess(prediction_dict2, true_image_shapes) + another_variable = tf.Variable([17.0], name='another_variable') # pylint: disable=unused-variable + var_map = model2.restore_map(fine_tune_checkpoint_type='detection') + self.assertIsInstance(var_map, dict) + saver = tf.train.Saver(var_map) + with self.test_session(graph=test_graph_detection2) as sess: + saver.restore(sess, saved_model_path) + uninitialized_vars_list = sess.run(tf.report_uninitialized_variables()) + self.assertIn('another_variable', uninitialized_vars_list) + for var in uninitialized_vars_list: + self.assertNotIn(model2.first_stage_feature_extractor_scope, var) + self.assertNotIn(model2.second_stage_feature_extractor_scope, var) + + def test_load_all_det_checkpoint_vars(self): + test_graph_detection = tf.Graph() + with test_graph_detection.as_default(): + model = self._build_model( + is_training=False, + number_of_stages=2, + second_stage_batch_size=6, + num_classes=42) + + inputs_shape = (2, 20, 20, 3) + inputs = tf.to_float( + tf.random_uniform(inputs_shape, minval=0, maxval=255, dtype=tf.int32)) + preprocessed_inputs, true_image_shapes = model.preprocess(inputs) + prediction_dict = model.predict(preprocessed_inputs, true_image_shapes) + model.postprocess(prediction_dict, true_image_shapes) + another_variable = tf.Variable([17.0], name='another_variable') # pylint: disable=unused-variable + var_map = model.restore_map( + fine_tune_checkpoint_type='detection', + load_all_detection_checkpoint_vars=True) + self.assertIsInstance(var_map, dict) + self.assertIn('another_variable', var_map) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/rfcn_meta_arch.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/rfcn_meta_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..3655449382a2d59009c251f5b7f571c2cc14a4f0 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/rfcn_meta_arch.py @@ -0,0 +1,299 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""R-FCN meta-architecture definition. + +R-FCN: Dai, Jifeng, et al. "R-FCN: Object Detection via Region-based +Fully Convolutional Networks." arXiv preprint arXiv:1605.06409 (2016). + +The R-FCN meta architecture is similar to Faster R-CNN and only differs in the +second stage. Hence this class inherits FasterRCNNMetaArch and overrides only +the `_predict_second_stage` method. + +Similar to Faster R-CNN we allow for two modes: number_of_stages=1 and +number_of_stages=2. In the former setting, all of the user facing methods +(e.g., predict, postprocess, loss) can be used as if the model consisted +only of the RPN, returning class agnostic proposals (these can be thought of as +approximate detections with no associated class information). In the latter +setting, proposals are computed, then passed through a second stage +"box classifier" to yield (multi-class) detections. + +Implementations of R-FCN models must define a new FasterRCNNFeatureExtractor and +override three methods: `preprocess`, `_extract_proposal_features` (the first +stage of the model), and `_extract_box_classifier_features` (the second stage of +the model). Optionally, the `restore_fn` method can be overridden. See tests +for an example. + +See notes in the documentation of Faster R-CNN meta-architecture as they all +apply here. +""" +import tensorflow as tf + +from object_detection.core import box_predictor +from object_detection.meta_architectures import faster_rcnn_meta_arch +from object_detection.utils import ops + + +class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch): + """R-FCN Meta-architecture definition.""" + + def __init__(self, + is_training, + num_classes, + image_resizer_fn, + feature_extractor, + number_of_stages, + first_stage_anchor_generator, + first_stage_atrous_rate, + first_stage_box_predictor_arg_scope_fn, + first_stage_box_predictor_kernel_size, + first_stage_box_predictor_depth, + first_stage_minibatch_size, + first_stage_positive_balance_fraction, + first_stage_nms_score_threshold, + first_stage_nms_iou_threshold, + first_stage_max_proposals, + first_stage_localization_loss_weight, + first_stage_objectness_loss_weight, + second_stage_rfcn_box_predictor, + second_stage_batch_size, + second_stage_balance_fraction, + second_stage_non_max_suppression_fn, + second_stage_score_conversion_fn, + second_stage_localization_loss_weight, + second_stage_classification_loss_weight, + second_stage_classification_loss, + hard_example_miner, + parallel_iterations=16, + add_summaries=True): + """RFCNMetaArch Constructor. + + Args: + is_training: A boolean indicating whether the training version of the + computation graph should be constructed. + num_classes: Number of classes. 
Note that num_classes *does not* + include the background category, so if groundtruth labels take values + in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the + assigned classification targets can range from {0,... K}). + image_resizer_fn: A callable for image resizing. This callable always + takes a rank-3 image tensor (corresponding to a single image) and + returns a rank-3 image tensor, possibly with new spatial dimensions. + See builders/image_resizer_builder.py. + feature_extractor: A FasterRCNNFeatureExtractor object. + number_of_stages: Valid values are {1, 2}. If 1 will only construct the + Region Proposal Network (RPN) part of the model. + first_stage_anchor_generator: An anchor_generator.AnchorGenerator object + (note that currently we only support + grid_anchor_generator.GridAnchorGenerator objects) + first_stage_atrous_rate: A single integer indicating the atrous rate for + the single convolution op which is applied to the `rpn_features_to_crop` + tensor to obtain a tensor to be used for box prediction. Some feature + extractors optionally allow for producing feature maps computed at + denser resolutions. The atrous rate is used to compensate for the + denser feature maps by using an effectively larger receptive field. + (This should typically be set to 1). + first_stage_box_predictor_arg_scope_fn: A function to generate tf-slim + arg_scope for conv2d, separable_conv2d and fully_connected ops for the + RPN box predictor. + first_stage_box_predictor_kernel_size: Kernel size to use for the + convolution op just prior to RPN box predictions. + first_stage_box_predictor_depth: Output depth for the convolution op + just prior to RPN box predictions. + first_stage_minibatch_size: The "batch size" to use for computing the + objectness and location loss of the region proposal network. This + "batch size" refers to the number of anchors selected as contributing + to the loss function for any given image within the image batch and is + only called "batch_size" due to terminology from the Faster R-CNN paper. + first_stage_positive_balance_fraction: Fraction of positive examples + per image for the RPN. The recommended value for Faster RCNN is 0.5. + first_stage_nms_score_threshold: Score threshold for non max suppression + for the Region Proposal Network (RPN). This value is expected to be in + [0, 1] as it is applied directly after a softmax transformation. The + recommended value for Faster R-CNN is 0. + first_stage_nms_iou_threshold: The Intersection Over Union (IOU) threshold + for performing Non-Max Suppression (NMS) on the boxes predicted by the + Region Proposal Network (RPN). + first_stage_max_proposals: Maximum number of boxes to retain after + performing Non-Max Suppression (NMS) on the boxes predicted by the + Region Proposal Network (RPN). + first_stage_localization_loss_weight: A float + first_stage_objectness_loss_weight: A float + second_stage_rfcn_box_predictor: RFCN box predictor to use for + second stage. + second_stage_batch_size: The batch size used for computing the + classification and refined location loss of the box classifier. This + "batch size" refers to the number of proposals selected as contributing + to the loss function for any given image within the image batch and is + only called "batch_size" due to terminology from the Faster R-CNN paper. + second_stage_balance_fraction: Fraction of positive examples to use + per image for the box classifier. The recommended value for Faster RCNN + is 0.25. 
+ second_stage_non_max_suppression_fn: batch_multiclass_non_max_suppression + callable that takes `boxes`, `scores`, optional `clip_window` and + optional (kwarg) `mask` inputs (with all other inputs already set) + and returns a dictionary containing tensors with keys: + `detection_boxes`, `detection_scores`, `detection_classes`, + `num_detections`, and (optionally) `detection_masks`. See + `post_processing.batch_multiclass_non_max_suppression` for the type and + shape of these tensors. + second_stage_score_conversion_fn: Callable elementwise nonlinearity + (that takes tensors as inputs and returns tensors). This is usually + used to convert logits to probabilities. + second_stage_localization_loss_weight: A float + second_stage_classification_loss_weight: A float + second_stage_classification_loss: A string indicating which loss function + to use, supports 'softmax' and 'sigmoid'. + hard_example_miner: A losses.HardExampleMiner object (can be None). + parallel_iterations: (Optional) The number of iterations allowed to run + in parallel for calls to tf.map_fn. + add_summaries: boolean (default: True) controlling whether summary ops + should be added to tensorflow graph. + + Raises: + ValueError: If `second_stage_batch_size` > `first_stage_max_proposals` + ValueError: If first_stage_anchor_generator is not of type + grid_anchor_generator.GridAnchorGenerator. + """ + # TODO(rathodv): add_summaries is currently unused. Respect that directive + # in the future. + super(RFCNMetaArch, self).__init__( + is_training, + num_classes, + image_resizer_fn, + feature_extractor, + number_of_stages, + first_stage_anchor_generator, + first_stage_atrous_rate, + first_stage_box_predictor_arg_scope_fn, + first_stage_box_predictor_kernel_size, + first_stage_box_predictor_depth, + first_stage_minibatch_size, + first_stage_positive_balance_fraction, + first_stage_nms_score_threshold, + first_stage_nms_iou_threshold, + first_stage_max_proposals, + first_stage_localization_loss_weight, + first_stage_objectness_loss_weight, + None, # initial_crop_size is not used in R-FCN + None, # maxpool_kernel_size is not use in R-FCN + None, # maxpool_stride is not use in R-FCN + None, # fully_connected_box_predictor is not used in R-FCN. + second_stage_batch_size, + second_stage_balance_fraction, + second_stage_non_max_suppression_fn, + second_stage_score_conversion_fn, + second_stage_localization_loss_weight, + second_stage_classification_loss_weight, + second_stage_classification_loss, + 1.0, # second stage mask prediction loss weight isn't used in R-FCN. + hard_example_miner, + parallel_iterations) + + self._rfcn_box_predictor = second_stage_rfcn_box_predictor + + def _predict_second_stage(self, rpn_box_encodings, + rpn_objectness_predictions_with_background, + rpn_features, + anchors, + image_shape, + true_image_shapes): + """Predicts the output tensors from 2nd stage of R-FCN. + + Args: + rpn_box_encodings: 3-D float tensor of shape + [batch_size, num_valid_anchors, self._box_coder.code_size] containing + predicted boxes. + rpn_objectness_predictions_with_background: 3-D float tensor of shape + [batch_size, num_valid_anchors, 2] containing class + predictions (logits) for each of the anchors. Note that this + tensor *includes* background class predictions (at class index 0). + rpn_features: A 4-D float32 tensor with shape + [batch_size, height, width, depth] representing image features from the + RPN. + anchors: 2-D float tensor of shape + [num_anchors, self._box_coder.code_size]. 
+ image_shape: A 1-D int32 tensor of size [4] containing the image shape.
+ true_image_shapes: int32 tensor of shape [batch, 3] where each row is
+ of the form [height, width, channels] indicating the shapes
+ of true images in the resized images, as resized images can be padded
+ with zeros.
+
+ Returns:
+ prediction_dict: a dictionary holding "raw" prediction tensors:
+ 1) refined_box_encodings: a 3-D tensor with shape
+ [total_num_proposals, num_classes, 4] representing predicted
+ (final) refined box encodings, where
+ total_num_proposals=batch_size*self._max_num_proposals
+ 2) class_predictions_with_background: a 2-D tensor with shape
+ [total_num_proposals, num_classes + 1] containing class
+ predictions (logits) for each of the anchors, where
+ total_num_proposals=batch_size*self._max_num_proposals.
+ Note that this tensor *includes* background class predictions
+ (at class index 0).
+ 3) num_proposals: An int32 tensor of shape [batch_size] representing the
+ number of proposals generated by the RPN. `num_proposals` allows us
+ to keep track of which entries are to be treated as zero paddings and
+ which are not since we always pad the number of proposals to be
+ `self.max_num_proposals` for each image.
+ 4) proposal_boxes: A float32 tensor of shape
+ [batch_size, self.max_num_proposals, 4] representing
+ decoded proposal bounding boxes (in absolute coordinates).
+ 5) proposal_boxes_normalized: A float32 tensor of shape
+ [batch_size, self.max_num_proposals, 4] representing decoded proposal
+ bounding boxes (in normalized coordinates). Can be used to override
+ the boxes proposed by the RPN, thus enabling one to extract box
+ classification and prediction for externally selected areas of the
+ image.
+ 6) box_classifier_features: a 4-D float32 tensor of shape
+ [batch_size, feature_map_height, feature_map_width, depth],
+ representing the box classifier features.
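+
+ For example (an illustrative case only), with batch_size=2 and
+ self.max_num_proposals=300, refined_box_encodings would have shape
+ [600, num_classes, 4] and proposal_boxes would have shape [2, 300, 4].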
+ """ + image_shape_2d = tf.tile(tf.expand_dims(image_shape[1:], 0), + [image_shape[0], 1]) + proposal_boxes_normalized, _, num_proposals = self._postprocess_rpn( + rpn_box_encodings, rpn_objectness_predictions_with_background, + anchors, image_shape_2d, true_image_shapes) + + box_classifier_features = ( + self._feature_extractor.extract_box_classifier_features( + rpn_features, + scope=self.second_stage_feature_extractor_scope)) + + box_predictions = self._rfcn_box_predictor.predict( + [box_classifier_features], + num_predictions_per_location=[1], + scope=self.second_stage_box_predictor_scope, + proposal_boxes=proposal_boxes_normalized) + refined_box_encodings = tf.squeeze( + tf.concat(box_predictions[box_predictor.BOX_ENCODINGS], axis=1), axis=1) + class_predictions_with_background = tf.squeeze( + tf.concat( + box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND], + axis=1), + axis=1) + + absolute_proposal_boxes = ops.normalized_to_image_coordinates( + proposal_boxes_normalized, image_shape, + parallel_iterations=self._parallel_iterations) + + prediction_dict = { + 'refined_box_encodings': refined_box_encodings, + 'class_predictions_with_background': + class_predictions_with_background, + 'num_proposals': num_proposals, + 'proposal_boxes': absolute_proposal_boxes, + 'box_classifier_features': box_classifier_features, + 'proposal_boxes_normalized': proposal_boxes_normalized, + } + return prediction_dict diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/rfcn_meta_arch_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/rfcn_meta_arch_test.py new file mode 100644 index 0000000000000000000000000000000000000000..829140ac968885bf200924bc08edfc170474d1bc --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/rfcn_meta_arch_test.py @@ -0,0 +1,65 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for object_detection.meta_architectures.rfcn_meta_arch.""" + +import tensorflow as tf + +from object_detection.meta_architectures import faster_rcnn_meta_arch_test_lib +from object_detection.meta_architectures import rfcn_meta_arch + + +class RFCNMetaArchTest( + faster_rcnn_meta_arch_test_lib.FasterRCNNMetaArchTestBase): + + def _get_second_stage_box_predictor_text_proto(self): + box_predictor_text_proto = """ + rfcn_box_predictor { + conv_hyperparams { + op: CONV + activation: NONE + regularizer { + l2_regularizer { + weight: 0.0005 + } + } + initializer { + variance_scaling_initializer { + factor: 1.0 + uniform: true + mode: FAN_AVG + } + } + } + } + """ + return box_predictor_text_proto + + def _get_model(self, box_predictor, **common_kwargs): + return rfcn_meta_arch.RFCNMetaArch( + second_stage_rfcn_box_predictor=box_predictor, **common_kwargs) + + def _get_box_classifier_features_shape(self, + image_size, + batch_size, + max_num_proposals, + initial_crop_size, + maxpool_stride, + num_features): + return (batch_size, image_size, image_size, num_features) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/ssd_meta_arch.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/ssd_meta_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..ffbe914837a47d7fa8426fb59bfdcde25ae771e7 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/ssd_meta_arch.py @@ -0,0 +1,870 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""SSD Meta-architecture definition. + +General tensorflow implementation of convolutional Multibox/SSD detection +models. +""" +from abc import abstractmethod + +import re +import tensorflow as tf + +from object_detection.core import box_list +from object_detection.core import box_list_ops +from object_detection.core import model +from object_detection.core import standard_fields as fields +from object_detection.core import target_assigner +from object_detection.utils import ops +from object_detection.utils import shape_utils +from object_detection.utils import visualization_utils + +slim = tf.contrib.slim + + +class SSDFeatureExtractor(object): + """SSD Feature Extractor definition.""" + + def __init__(self, + is_training, + depth_multiplier, + min_depth, + pad_to_multiple, + conv_hyperparams_fn, + reuse_weights=None, + use_explicit_padding=False, + use_depthwise=False, + override_base_feature_extractor_hyperparams=False): + """Constructor. + + Args: + is_training: whether the network is in training mode. + depth_multiplier: float depth multiplier for feature extractor. + min_depth: minimum feature extractor depth. 
+ pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d + and separable_conv2d ops in the layers that are added on top of the + base feature extractor. + reuse_weights: whether to reuse variables. Default is None. + use_explicit_padding: Whether to use explicit padding when extracting + features. Default is False. + use_depthwise: Whether to use depthwise convolutions. Default is False. + override_base_feature_extractor_hyperparams: Whether to override + hyperparameters of the base feature extractor with the one from + `conv_hyperparams_fn`. + """ + self._is_training = is_training + self._depth_multiplier = depth_multiplier + self._min_depth = min_depth + self._pad_to_multiple = pad_to_multiple + self._conv_hyperparams_fn = conv_hyperparams_fn + self._reuse_weights = reuse_weights + self._use_explicit_padding = use_explicit_padding + self._use_depthwise = use_depthwise + self._override_base_feature_extractor_hyperparams = ( + override_base_feature_extractor_hyperparams) + + @abstractmethod + def preprocess(self, resized_inputs): + """Preprocesses images for feature extraction (minus image resizing). + + Args: + resized_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + """ + pass + + @abstractmethod + def extract_features(self, preprocessed_inputs): + """Extracts features from preprocessed inputs. + + This function is responsible for extracting feature maps from preprocessed + images. + + Args: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + feature_maps: a list of tensors where the ith tensor has shape + [batch, height_i, width_i, depth_i] + """ + raise NotImplementedError + + +class SSDMetaArch(model.DetectionModel): + """SSD Meta-architecture definition.""" + + def __init__(self, + is_training, + anchor_generator, + box_predictor, + box_coder, + feature_extractor, + matcher, + region_similarity_calculator, + encode_background_as_zeros, + negative_class_weight, + image_resizer_fn, + non_max_suppression_fn, + score_conversion_fn, + classification_loss, + localization_loss, + classification_loss_weight, + localization_loss_weight, + normalize_loss_by_num_matches, + hard_example_miner, + add_summaries=True, + normalize_loc_loss_by_codesize=False, + freeze_batchnorm=False, + inplace_batchnorm_update=False, + add_background_class=True, + random_example_sampler=None): + """SSDMetaArch Constructor. + + TODO(rathodv,jonathanhuang): group NMS parameters + score converter into + a class and loss parameters into a class and write config protos for + postprocessing and losses. + + Args: + is_training: A boolean indicating whether the training version of the + computation graph should be constructed. + anchor_generator: an anchor_generator.AnchorGenerator object. + box_predictor: a box_predictor.BoxPredictor object. + box_coder: a box_coder.BoxCoder object. + feature_extractor: a SSDFeatureExtractor object. + matcher: a matcher.Matcher object. + region_similarity_calculator: a + region_similarity_calculator.RegionSimilarityCalculator object. 
+ encode_background_as_zeros: boolean determining whether background + targets are to be encoded as an all zeros vector or a one-hot + vector (where background is the 0th class). + negative_class_weight: Weight for confidence loss of negative anchors. + image_resizer_fn: a callable for image resizing. This callable always + takes a rank-3 image tensor (corresponding to a single image) and + returns a rank-3 image tensor, possibly with new spatial dimensions and + a 1-D tensor of shape [3] indicating shape of true image within + the resized image tensor as the resized image tensor could be padded. + See builders/image_resizer_builder.py. + non_max_suppression_fn: batch_multiclass_non_max_suppression + callable that takes `boxes`, `scores` and optional `clip_window` + inputs (with all other inputs already set) and returns a dictionary + hold tensors with keys: `detection_boxes`, `detection_scores`, + `detection_classes` and `num_detections`. See `post_processing. + batch_multiclass_non_max_suppression` for the type and shape of these + tensors. + score_conversion_fn: callable elementwise nonlinearity (that takes tensors + as inputs and returns tensors). This is usually used to convert logits + to probabilities. + classification_loss: an object_detection.core.losses.Loss object. + localization_loss: a object_detection.core.losses.Loss object. + classification_loss_weight: float + localization_loss_weight: float + normalize_loss_by_num_matches: boolean + hard_example_miner: a losses.HardExampleMiner object (can be None) + add_summaries: boolean (default: True) controlling whether summary ops + should be added to tensorflow graph. + normalize_loc_loss_by_codesize: whether to normalize localization loss + by code size of the box encoder. + freeze_batchnorm: Whether to freeze batch norm parameters during + training or not. When training with a small batch size (e.g. 1), it is + desirable to freeze batch norm update and use pretrained batch norm + params. + inplace_batchnorm_update: Whether to update batch norm moving average + values inplace. When this is false train op must add a control + dependency on tf.graphkeys.UPDATE_OPS collection in order to update + batch norm statistics. + add_background_class: Whether to add an implicit background class to + one-hot encodings of groundtruth labels. Set to false if using + groundtruth labels with an explicit background class or using multiclass + scores instead of truth in the case of distillation. + random_example_sampler: a BalancedPositiveNegativeSampler object that can + perform random example sampling when computing loss. If None, random + sampling process is skipped. Note that random example sampler and hard + example miner can both be applied to the model. In that case, random + sampler will take effect first and hard example miner can only process + the random sampled examples. + """ + super(SSDMetaArch, self).__init__(num_classes=box_predictor.num_classes) + self._is_training = is_training + self._freeze_batchnorm = freeze_batchnorm + self._inplace_batchnorm_update = inplace_batchnorm_update + + # Needed for fine-tuning from classification checkpoints whose + # variables do not have the feature extractor scope. 
+ self._extract_features_scope = 'FeatureExtractor' + + self._anchor_generator = anchor_generator + self._box_predictor = box_predictor + + self._box_coder = box_coder + self._feature_extractor = feature_extractor + self._matcher = matcher + self._region_similarity_calculator = region_similarity_calculator + self._add_background_class = add_background_class + + # TODO(jonathanhuang): handle agnostic mode + # weights + unmatched_cls_target = None + unmatched_cls_target = tf.constant([1] + self.num_classes * [0], + tf.float32) + if encode_background_as_zeros: + unmatched_cls_target = tf.constant((self.num_classes + 1) * [0], + tf.float32) + + self._target_assigner = target_assigner.TargetAssigner( + self._region_similarity_calculator, + self._matcher, + self._box_coder, + negative_class_weight=negative_class_weight, + unmatched_cls_target=unmatched_cls_target) + + self._classification_loss = classification_loss + self._localization_loss = localization_loss + self._classification_loss_weight = classification_loss_weight + self._localization_loss_weight = localization_loss_weight + self._normalize_loss_by_num_matches = normalize_loss_by_num_matches + self._normalize_loc_loss_by_codesize = normalize_loc_loss_by_codesize + self._hard_example_miner = hard_example_miner + self._random_example_sampler = random_example_sampler + self._parallel_iterations = 16 + + self._image_resizer_fn = image_resizer_fn + self._non_max_suppression_fn = non_max_suppression_fn + self._score_conversion_fn = score_conversion_fn + + self._anchors = None + self._add_summaries = add_summaries + self._batched_prediction_tensor_names = [] + + @property + def anchors(self): + if not self._anchors: + raise RuntimeError('anchors have not been constructed yet!') + if not isinstance(self._anchors, box_list.BoxList): + raise RuntimeError('anchors should be a BoxList object, but is not.') + return self._anchors + + @property + def batched_prediction_tensor_names(self): + if not self._batched_prediction_tensor_names: + raise RuntimeError('Must call predict() method to get batched prediction ' + 'tensor names.') + return self._batched_prediction_tensor_names + + def preprocess(self, inputs): + """Feature-extractor specific preprocessing. + + SSD meta architecture uses a default clip_window of [0, 0, 1, 1] during + post-processing. On calling `preprocess` method, clip_window gets updated + based on `true_image_shapes` returned by `image_resizer_fn`. + + Args: + inputs: a [batch, height_in, width_in, channels] float tensor representing + a batch of images with values between 0 and 255.0. + + Returns: + preprocessed_inputs: a [batch, height_out, width_out, channels] float + tensor representing a batch of images. + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + + Raises: + ValueError: if inputs tensor does not have type tf.float32 + """ + if inputs.dtype is not tf.float32: + raise ValueError('`preprocess` expects a tf.float32 tensor') + with tf.name_scope('Preprocessor'): + # TODO(jonathanhuang): revisit whether to always use batch size as + # the number of parallel iterations vs allow for dynamic batching. 
+ outputs = shape_utils.static_or_dynamic_map_fn( + self._image_resizer_fn, + elems=inputs, + dtype=[tf.float32, tf.int32]) + resized_inputs = outputs[0] + true_image_shapes = outputs[1] + + return (self._feature_extractor.preprocess(resized_inputs), + true_image_shapes) + + def _compute_clip_window(self, preprocessed_images, true_image_shapes): + """Computes clip window to use during post_processing. + + Computes a new clip window to use during post-processing based on + `resized_image_shapes` and `true_image_shapes` only if `preprocess` method + has been called. Otherwise returns a default clip window of [0, 0, 1, 1]. + + Args: + preprocessed_images: the [batch, height, width, channels] image + tensor. + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. Or None if the clip window should cover the full image. + + Returns: + a 2-D float32 tensor of the form [batch_size, 4] containing the clip + window for each image in the batch in normalized coordinates (relative to + the resized dimensions) where each clip window is of the form [ymin, xmin, + ymax, xmax] or a default clip window of [0, 0, 1, 1]. + + """ + if true_image_shapes is None: + return tf.constant([0, 0, 1, 1], dtype=tf.float32) + + resized_inputs_shape = shape_utils.combined_static_and_dynamic_shape( + preprocessed_images) + true_heights, true_widths, _ = tf.unstack( + tf.to_float(true_image_shapes), axis=1) + padded_height = tf.to_float(resized_inputs_shape[1]) + padded_width = tf.to_float(resized_inputs_shape[2]) + return tf.stack( + [ + tf.zeros_like(true_heights), + tf.zeros_like(true_widths), true_heights / padded_height, + true_widths / padded_width + ], + axis=1) + + def predict(self, preprocessed_inputs, true_image_shapes): + """Predicts unpostprocessed tensors from input tensor. + + This function takes an input batch of images and runs it through the forward + pass of the network to yield unpostprocessesed predictions. + + A side effect of calling the predict method is that self._anchors is + populated with a box_list.BoxList of anchors. These anchors must be + constructed before the postprocess or loss functions can be called. + + Args: + preprocessed_inputs: a [batch, height, width, channels] image tensor. + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + + Returns: + prediction_dict: a dictionary holding "raw" prediction tensors: + 1) preprocessed_inputs: the [batch, height, width, channels] image + tensor. + 2) box_encodings: 4-D float tensor of shape [batch_size, num_anchors, + box_code_dimension] containing predicted boxes. + 3) class_predictions_with_background: 3-D float tensor of shape + [batch_size, num_anchors, num_classes+1] containing class predictions + (logits) for each of the anchors. Note that this tensor *includes* + background class predictions (at class index 0). + 4) feature_maps: a list of tensors where the ith tensor has shape + [batch, height_i, width_i, depth_i]. + 5) anchors: 2-D float tensor of shape [num_anchors, 4] containing + the generated anchors in normalized coordinates. 
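+
+ Illustrative usage (a sketch only; `ssd_model` stands for a fully
+ constructed SSDMetaArch instance and `images` for a float32
+ [batch, height, width, 3] tensor with values in [0, 255]):
+
+ preprocessed, true_shapes = ssd_model.preprocess(images)
+ prediction_dict = ssd_model.predict(preprocessed, true_shapes)
+ detections = ssd_model.postprocess(prediction_dict, true_shapes)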
+ """ + batchnorm_updates_collections = (None if self._inplace_batchnorm_update + else tf.GraphKeys.UPDATE_OPS) + with slim.arg_scope([slim.batch_norm], + is_training=(self._is_training and + not self._freeze_batchnorm), + updates_collections=batchnorm_updates_collections): + with tf.variable_scope(None, self._extract_features_scope, + [preprocessed_inputs]): + feature_maps = self._feature_extractor.extract_features( + preprocessed_inputs) + feature_map_spatial_dims = self._get_feature_map_spatial_dims( + feature_maps) + image_shape = shape_utils.combined_static_and_dynamic_shape( + preprocessed_inputs) + self._anchors = box_list_ops.concatenate( + self._anchor_generator.generate( + feature_map_spatial_dims, + im_height=image_shape[1], + im_width=image_shape[2])) + prediction_dict = self._box_predictor.predict( + feature_maps, self._anchor_generator.num_anchors_per_location()) + box_encodings = tf.concat(prediction_dict['box_encodings'], axis=1) + if box_encodings.shape.ndims == 4 and box_encodings.shape[2] == 1: + box_encodings = tf.squeeze(box_encodings, axis=2) + class_predictions_with_background = tf.concat( + prediction_dict['class_predictions_with_background'], axis=1) + predictions_dict = { + 'preprocessed_inputs': preprocessed_inputs, + 'box_encodings': box_encodings, + 'class_predictions_with_background': + class_predictions_with_background, + 'feature_maps': feature_maps, + 'anchors': self._anchors.get() + } + self._batched_prediction_tensor_names = [x for x in predictions_dict + if x != 'anchors'] + return predictions_dict + + def _get_feature_map_spatial_dims(self, feature_maps): + """Return list of spatial dimensions for each feature map in a list. + + Args: + feature_maps: a list of tensors where the ith tensor has shape + [batch, height_i, width_i, depth_i]. + + Returns: + a list of pairs (height, width) for each feature map in feature_maps + """ + feature_map_shapes = [ + shape_utils.combined_static_and_dynamic_shape( + feature_map) for feature_map in feature_maps + ] + return [(shape[1], shape[2]) for shape in feature_map_shapes] + + def postprocess(self, prediction_dict, true_image_shapes): + """Converts prediction tensors to final detections. + + This function converts raw predictions tensors to final detection results by + slicing off the background class, decoding box predictions and applying + non max suppression and clipping to the image window. + + See base class for output format conventions. Note also that by default, + scores are to be interpreted as logits, but if a score_conversion_fn is + used, then scores are remapped (and may thus have a different + interpretation). + + Args: + prediction_dict: a dictionary holding prediction tensors with + 1) preprocessed_inputs: a [batch, height, width, channels] image + tensor. + 2) box_encodings: 3-D float tensor of shape [batch_size, num_anchors, + box_code_dimension] containing predicted boxes. + 3) class_predictions_with_background: 3-D float tensor of shape + [batch_size, num_anchors, num_classes+1] containing class predictions + (logits) for each of the anchors. Note that this tensor *includes* + background class predictions. + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. Or None, if the clip window should cover the full image. 
+ + Returns: + detections: a dictionary containing the following fields + detection_boxes: [batch, max_detections, 4] + detection_scores: [batch, max_detections] + detection_classes: [batch, max_detections] + detection_keypoints: [batch, max_detections, num_keypoints, 2] (if + encoded in the prediction_dict 'box_encodings') + num_detections: [batch] + Raises: + ValueError: if prediction_dict does not contain `box_encodings` or + `class_predictions_with_background` fields. + """ + if ('box_encodings' not in prediction_dict or + 'class_predictions_with_background' not in prediction_dict): + raise ValueError('prediction_dict does not contain expected entries.') + with tf.name_scope('Postprocessor'): + preprocessed_images = prediction_dict['preprocessed_inputs'] + box_encodings = prediction_dict['box_encodings'] + class_predictions = prediction_dict['class_predictions_with_background'] + detection_boxes, detection_keypoints = self._batch_decode(box_encodings) + detection_boxes = tf.expand_dims(detection_boxes, axis=2) + + detection_scores_with_background = self._score_conversion_fn( + class_predictions) + detection_scores = tf.slice(detection_scores_with_background, [0, 0, 1], + [-1, -1, -1]) + additional_fields = None + + if detection_keypoints is not None: + additional_fields = { + fields.BoxListFields.keypoints: detection_keypoints} + (nmsed_boxes, nmsed_scores, nmsed_classes, _, nmsed_additional_fields, + num_detections) = self._non_max_suppression_fn( + detection_boxes, + detection_scores, + clip_window=self._compute_clip_window( + preprocessed_images, true_image_shapes), + additional_fields=additional_fields) + detection_dict = { + fields.DetectionResultFields.detection_boxes: nmsed_boxes, + fields.DetectionResultFields.detection_scores: nmsed_scores, + fields.DetectionResultFields.detection_classes: nmsed_classes, + fields.DetectionResultFields.num_detections: + tf.to_float(num_detections) + } + if (nmsed_additional_fields is not None and + fields.BoxListFields.keypoints in nmsed_additional_fields): + detection_dict[fields.DetectionResultFields.detection_keypoints] = ( + nmsed_additional_fields[fields.BoxListFields.keypoints]) + return detection_dict + + def loss(self, prediction_dict, true_image_shapes, scope=None): + """Compute scalar loss tensors with respect to provided groundtruth. + + Calling this function requires that groundtruth tensors have been + provided via the provide_groundtruth function. + + Args: + prediction_dict: a dictionary holding prediction tensors with + 1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors, + box_code_dimension] containing predicted boxes. + 2) class_predictions_with_background: 3-D float tensor of shape + [batch_size, num_anchors, num_classes+1] containing class predictions + (logits) for each of the anchors. Note that this tensor *includes* + background class predictions. + true_image_shapes: int32 tensor of shape [batch, 3] where each row is + of the form [height, width, channels] indicating the shapes + of true images in the resized images, as resized images can be padded + with zeros. + scope: Optional scope name. + + Returns: + a dictionary mapping loss keys (`localization_loss` and + `classification_loss`) to scalar tensors representing corresponding loss + values. 
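+
+ Illustrative usage (a sketch only; assumes groundtruth has already been
+ provided via provide_groundtruth and `prediction_dict` comes from predict):
+
+ loss_dict = ssd_model.loss(prediction_dict, true_image_shapes)
+ total_loss = tf.add_n(list(loss_dict.values()))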
+ """ + with tf.name_scope(scope, 'Loss', prediction_dict.values()): + keypoints = None + if self.groundtruth_has_field(fields.BoxListFields.keypoints): + keypoints = self.groundtruth_lists(fields.BoxListFields.keypoints) + weights = None + if self.groundtruth_has_field(fields.BoxListFields.weights): + weights = self.groundtruth_lists(fields.BoxListFields.weights) + (batch_cls_targets, batch_cls_weights, batch_reg_targets, + batch_reg_weights, match_list) = self._assign_targets( + self.groundtruth_lists(fields.BoxListFields.boxes), + self.groundtruth_lists(fields.BoxListFields.classes), + keypoints, weights) + if self._add_summaries: + self._summarize_target_assignment( + self.groundtruth_lists(fields.BoxListFields.boxes), match_list) + + if self._random_example_sampler: + batch_sampled_indicator = tf.to_float( + shape_utils.static_or_dynamic_map_fn( + self._minibatch_subsample_fn, + [batch_cls_targets, batch_cls_weights], + dtype=tf.bool, + parallel_iterations=self._parallel_iterations, + back_prop=True)) + batch_reg_weights = tf.multiply(batch_sampled_indicator, + batch_reg_weights) + batch_cls_weights = tf.multiply(batch_sampled_indicator, + batch_cls_weights) + + location_losses = self._localization_loss( + prediction_dict['box_encodings'], + batch_reg_targets, + ignore_nan_targets=True, + weights=batch_reg_weights) + cls_losses = ops.reduce_sum_trailing_dimensions( + self._classification_loss( + prediction_dict['class_predictions_with_background'], + batch_cls_targets, + weights=batch_cls_weights), + ndims=2) + + if self._hard_example_miner: + (localization_loss, classification_loss) = self._apply_hard_mining( + location_losses, cls_losses, prediction_dict, match_list) + if self._add_summaries: + self._hard_example_miner.summarize() + else: + if self._add_summaries: + class_ids = tf.argmax(batch_cls_targets, axis=2) + flattened_class_ids = tf.reshape(class_ids, [-1]) + flattened_classification_losses = tf.reshape(cls_losses, [-1]) + self._summarize_anchor_classification_loss( + flattened_class_ids, flattened_classification_losses) + localization_loss = tf.reduce_sum(location_losses) + classification_loss = tf.reduce_sum(cls_losses) + + # Optionally normalize by number of positive matches + normalizer = tf.constant(1.0, dtype=tf.float32) + if self._normalize_loss_by_num_matches: + normalizer = tf.maximum(tf.to_float(tf.reduce_sum(batch_reg_weights)), + 1.0) + + localization_loss_normalizer = normalizer + if self._normalize_loc_loss_by_codesize: + localization_loss_normalizer *= self._box_coder.code_size + localization_loss = tf.multiply((self._localization_loss_weight / + localization_loss_normalizer), + localization_loss, + name='localization_loss') + classification_loss = tf.multiply((self._classification_loss_weight / + normalizer), classification_loss, + name='classification_loss') + + loss_dict = { + str(localization_loss.op.name): localization_loss, + str(classification_loss.op.name): classification_loss + } + return loss_dict + + def _minibatch_subsample_fn(self, inputs): + """Randomly samples anchors for one image. + + Args: + inputs: a list of 2 inputs. First one is a tensor of shape [num_anchors, + num_classes] indicating targets assigned to each anchor. Second one + is a tensor of shape [num_anchors] indicating the class weight of each + anchor. + + Returns: + batch_sampled_indicator: bool tensor of shape [num_anchors] indicating + whether the anchor should be selected for loss computation. 
+ """ + cls_targets, cls_weights = inputs + if self._add_background_class: + # Set background_class bits to 0 so that the positives_indicator + # computation would not consider background class. + background_class = tf.zeros_like(tf.slice(cls_targets, [0, 0], [-1, 1])) + regular_class = tf.slice(cls_targets, [0, 1], [-1, -1]) + cls_targets = tf.concat([background_class, regular_class], 1) + positives_indicator = tf.reduce_sum(cls_targets, axis=1) + return self._random_example_sampler.subsample( + tf.cast(cls_weights, tf.bool), + batch_size=None, + labels=tf.cast(positives_indicator, tf.bool)) + + def _summarize_anchor_classification_loss(self, class_ids, cls_losses): + positive_indices = tf.where(tf.greater(class_ids, 0)) + positive_anchor_cls_loss = tf.squeeze( + tf.gather(cls_losses, positive_indices), axis=1) + visualization_utils.add_cdf_image_summary(positive_anchor_cls_loss, + 'PositiveAnchorLossCDF') + negative_indices = tf.where(tf.equal(class_ids, 0)) + negative_anchor_cls_loss = tf.squeeze( + tf.gather(cls_losses, negative_indices), axis=1) + visualization_utils.add_cdf_image_summary(negative_anchor_cls_loss, + 'NegativeAnchorLossCDF') + + def _assign_targets(self, groundtruth_boxes_list, groundtruth_classes_list, + groundtruth_keypoints_list=None, + groundtruth_weights_list=None): + """Assign groundtruth targets. + + Adds a background class to each one-hot encoding of groundtruth classes + and uses target assigner to obtain regression and classification targets. + + Args: + groundtruth_boxes_list: a list of 2-D tensors of shape [num_boxes, 4] + containing coordinates of the groundtruth boxes. + Groundtruth boxes are provided in [y_min, x_min, y_max, x_max] + format and assumed to be normalized and clipped + relative to the image window with y_min <= y_max and x_min <= x_max. + groundtruth_classes_list: a list of 2-D one-hot (or k-hot) tensors of + shape [num_boxes, num_classes] containing the class targets with the 0th + index assumed to map to the first non-background class. + groundtruth_keypoints_list: (optional) a list of 3-D tensors of shape + [num_boxes, num_keypoints, 2] + groundtruth_weights_list: A list of 1-D tf.float32 tensors of shape + [num_boxes] containing weights for groundtruth boxes. + + Returns: + batch_cls_targets: a tensor with shape [batch_size, num_anchors, + num_classes], + batch_cls_weights: a tensor with shape [batch_size, num_anchors], + batch_reg_targets: a tensor with shape [batch_size, num_anchors, + box_code_dimension] + batch_reg_weights: a tensor with shape [batch_size, num_anchors], + match_list: a list of matcher.Match objects encoding the match between + anchors and groundtruth boxes for each image of the batch, + with rows of the Match objects corresponding to groundtruth boxes + and columns corresponding to anchors. 
+ """ + groundtruth_boxlists = [ + box_list.BoxList(boxes) for boxes in groundtruth_boxes_list + ] + if self._add_background_class: + groundtruth_classes_with_background_list = [ + tf.pad(one_hot_encoding, [[0, 0], [1, 0]], mode='CONSTANT') + for one_hot_encoding in groundtruth_classes_list + ] + else: + groundtruth_classes_with_background_list = groundtruth_classes_list + + if groundtruth_keypoints_list is not None: + for boxlist, keypoints in zip( + groundtruth_boxlists, groundtruth_keypoints_list): + boxlist.add_field(fields.BoxListFields.keypoints, keypoints) + return target_assigner.batch_assign_targets( + self._target_assigner, self.anchors, groundtruth_boxlists, + groundtruth_classes_with_background_list, groundtruth_weights_list) + + def _summarize_target_assignment(self, groundtruth_boxes_list, match_list): + """Creates tensorflow summaries for the input boxes and anchors. + + This function creates four summaries corresponding to the average + number (over images in a batch) of (1) groundtruth boxes, (2) anchors + marked as positive, (3) anchors marked as negative, and (4) anchors marked + as ignored. + + Args: + groundtruth_boxes_list: a list of 2-D tensors of shape [num_boxes, 4] + containing corners of the groundtruth boxes. + match_list: a list of matcher.Match objects encoding the match between + anchors and groundtruth boxes for each image of the batch, + with rows of the Match objects corresponding to groundtruth boxes + and columns corresponding to anchors. + """ + num_boxes_per_image = tf.stack( + [tf.shape(x)[0] for x in groundtruth_boxes_list]) + pos_anchors_per_image = tf.stack( + [match.num_matched_columns() for match in match_list]) + neg_anchors_per_image = tf.stack( + [match.num_unmatched_columns() for match in match_list]) + ignored_anchors_per_image = tf.stack( + [match.num_ignored_columns() for match in match_list]) + tf.summary.scalar('AvgNumGroundtruthBoxesPerImage', + tf.reduce_mean(tf.to_float(num_boxes_per_image)), + family='TargetAssignment') + tf.summary.scalar('AvgNumPositiveAnchorsPerImage', + tf.reduce_mean(tf.to_float(pos_anchors_per_image)), + family='TargetAssignment') + tf.summary.scalar('AvgNumNegativeAnchorsPerImage', + tf.reduce_mean(tf.to_float(neg_anchors_per_image)), + family='TargetAssignment') + tf.summary.scalar('AvgNumIgnoredAnchorsPerImage', + tf.reduce_mean(tf.to_float(ignored_anchors_per_image)), + family='TargetAssignment') + + def _apply_hard_mining(self, location_losses, cls_losses, prediction_dict, + match_list): + """Applies hard mining to anchorwise losses. + + Args: + location_losses: Float tensor of shape [batch_size, num_anchors] + representing anchorwise location losses. + cls_losses: Float tensor of shape [batch_size, num_anchors] + representing anchorwise classification losses. + prediction_dict: p a dictionary holding prediction tensors with + 1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors, + box_code_dimension] containing predicted boxes. + 2) class_predictions_with_background: 3-D float tensor of shape + [batch_size, num_anchors, num_classes+1] containing class predictions + (logits) for each of the anchors. Note that this tensor *includes* + background class predictions. + match_list: a list of matcher.Match objects encoding the match between + anchors and groundtruth boxes for each image of the batch, + with rows of the Match objects corresponding to groundtruth boxes + and columns corresponding to anchors. 
+ + Returns: + mined_location_loss: a float scalar with sum of localization losses from + selected hard examples. + mined_cls_loss: a float scalar with sum of classification losses from + selected hard examples. + """ + class_predictions = tf.slice( + prediction_dict['class_predictions_with_background'], [0, 0, + 1], [-1, -1, -1]) + + decoded_boxes, _ = self._batch_decode(prediction_dict['box_encodings']) + decoded_box_tensors_list = tf.unstack(decoded_boxes) + class_prediction_list = tf.unstack(class_predictions) + decoded_boxlist_list = [] + for box_location, box_score in zip(decoded_box_tensors_list, + class_prediction_list): + decoded_boxlist = box_list.BoxList(box_location) + decoded_boxlist.add_field('scores', box_score) + decoded_boxlist_list.append(decoded_boxlist) + return self._hard_example_miner( + location_losses=location_losses, + cls_losses=cls_losses, + decoded_boxlist_list=decoded_boxlist_list, + match_list=match_list) + + def _batch_decode(self, box_encodings): + """Decodes a batch of box encodings with respect to the anchors. + + Args: + box_encodings: A float32 tensor of shape + [batch_size, num_anchors, box_code_size] containing box encodings. + + Returns: + decoded_boxes: A float32 tensor of shape + [batch_size, num_anchors, 4] containing the decoded boxes. + decoded_keypoints: A float32 tensor of shape + [batch_size, num_anchors, num_keypoints, 2] containing the decoded + keypoints if present in the input `box_encodings`, None otherwise. + """ + combined_shape = shape_utils.combined_static_and_dynamic_shape( + box_encodings) + batch_size = combined_shape[0] + tiled_anchor_boxes = tf.tile( + tf.expand_dims(self.anchors.get(), 0), [batch_size, 1, 1]) + tiled_anchors_boxlist = box_list.BoxList( + tf.reshape(tiled_anchor_boxes, [-1, 4])) + decoded_boxes = self._box_coder.decode( + tf.reshape(box_encodings, [-1, self._box_coder.code_size]), + tiled_anchors_boxlist) + decoded_keypoints = None + if decoded_boxes.has_field(fields.BoxListFields.keypoints): + decoded_keypoints = decoded_boxes.get_field( + fields.BoxListFields.keypoints) + num_keypoints = decoded_keypoints.get_shape()[1] + decoded_keypoints = tf.reshape( + decoded_keypoints, + tf.stack([combined_shape[0], combined_shape[1], num_keypoints, 2])) + decoded_boxes = tf.reshape(decoded_boxes.get(), tf.stack( + [combined_shape[0], combined_shape[1], 4])) + return decoded_boxes, decoded_keypoints + + def restore_map(self, + fine_tune_checkpoint_type='detection', + load_all_detection_checkpoint_vars=False): + """Returns a map of variables to load from a foreign checkpoint. + + See parent class for details. + + Args: + fine_tune_checkpoint_type: whether to restore from a full detection + checkpoint (with compatible variable names) or to restore from a + classification checkpoint for initialization prior to training. + Valid values: `detection`, `classification`. Default 'detection'. + load_all_detection_checkpoint_vars: whether to load all variables (when + `fine_tune_checkpoint_type='detection'`). If False, only variables + within the appropriate scopes are included. Default False. + + Returns: + A dict mapping variable names (to load from a checkpoint) to variables in + the model graph. + Raises: + ValueError: if fine_tune_checkpoint_type is neither `classification` + nor `detection`. 
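+
+    A minimal restore sketch (illustrative; `detection_model`, `sess` and
+    `checkpoint_path` are assumed to be provided by the caller):
+
+      var_map = detection_model.restore_map(
+          fine_tune_checkpoint_type='classification')
+      init_saver = tf.train.Saver(var_map)
+      init_saver.restore(sess, checkpoint_path)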
+ """ + if fine_tune_checkpoint_type not in ['detection', 'classification']: + raise ValueError('Not supported fine_tune_checkpoint_type: {}'.format( + fine_tune_checkpoint_type)) + variables_to_restore = {} + for variable in tf.global_variables(): + var_name = variable.op.name + if (fine_tune_checkpoint_type == 'detection' and + load_all_detection_checkpoint_vars): + variables_to_restore[var_name] = variable + else: + if var_name.startswith(self._extract_features_scope): + if fine_tune_checkpoint_type == 'classification': + var_name = ( + re.split('^' + self._extract_features_scope + '/', + var_name)[-1]) + variables_to_restore[var_name] = variable + + return variables_to_restore diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/ssd_meta_arch_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/ssd_meta_arch_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b1b62a3c00bd602efc42c7359feee1206da19584 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/meta_architectures/ssd_meta_arch_test.py @@ -0,0 +1,554 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.meta_architectures.ssd_meta_arch.""" +import functools +import numpy as np +import tensorflow as tf + +from object_detection.core import anchor_generator +from object_detection.core import balanced_positive_negative_sampler as sampler +from object_detection.core import box_list +from object_detection.core import losses +from object_detection.core import post_processing +from object_detection.core import region_similarity_calculator as sim_calc +from object_detection.meta_architectures import ssd_meta_arch +from object_detection.utils import test_case +from object_detection.utils import test_utils + +slim = tf.contrib.slim + + +class FakeSSDFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): + + def __init__(self): + super(FakeSSDFeatureExtractor, self).__init__( + is_training=True, + depth_multiplier=0, + min_depth=0, + pad_to_multiple=1, + conv_hyperparams_fn=None) + + def preprocess(self, resized_inputs): + return tf.identity(resized_inputs) + + def extract_features(self, preprocessed_inputs): + with tf.variable_scope('mock_model'): + features = slim.conv2d(inputs=preprocessed_inputs, num_outputs=32, + kernel_size=1, scope='layer1') + return [features] + + +class MockAnchorGenerator2x2(anchor_generator.AnchorGenerator): + """Sets up a simple 2x2 anchor grid on the unit square.""" + + def name_scope(self): + return 'MockAnchorGenerator' + + def num_anchors_per_location(self): + return [1] + + def _generate(self, feature_map_shape_list, im_height, im_width): + return [box_list.BoxList( + tf.constant([[0, 0, .5, .5], + [0, .5, .5, 1], + [.5, 0, 1, .5], + [1., 1., 1.5, 1.5] # Anchor that is outside clip_window. 
+ ], tf.float32))] + + def num_anchors(self): + return 4 + + +def _get_value_for_matching_key(dictionary, suffix): + for key in dictionary.keys(): + if key.endswith(suffix): + return dictionary[key] + raise ValueError('key not found {}'.format(suffix)) + + +class SsdMetaArchTest(test_case.TestCase): + + def _create_model(self, + apply_hard_mining=True, + normalize_loc_loss_by_codesize=False, + add_background_class=True, + random_example_sampling=False): + is_training = False + num_classes = 1 + mock_anchor_generator = MockAnchorGenerator2x2() + mock_box_predictor = test_utils.MockBoxPredictor( + is_training, num_classes) + mock_box_coder = test_utils.MockBoxCoder() + fake_feature_extractor = FakeSSDFeatureExtractor() + mock_matcher = test_utils.MockMatcher() + region_similarity_calculator = sim_calc.IouSimilarity() + encode_background_as_zeros = False + def image_resizer_fn(image): + return [tf.identity(image), tf.shape(image)] + + classification_loss = losses.WeightedSigmoidClassificationLoss() + localization_loss = losses.WeightedSmoothL1LocalizationLoss() + non_max_suppression_fn = functools.partial( + post_processing.batch_multiclass_non_max_suppression, + score_thresh=-20.0, + iou_thresh=1.0, + max_size_per_class=5, + max_total_size=5) + classification_loss_weight = 1.0 + localization_loss_weight = 1.0 + negative_class_weight = 1.0 + normalize_loss_by_num_matches = False + + hard_example_miner = None + if apply_hard_mining: + # This hard example miner is expected to be a no-op. + hard_example_miner = losses.HardExampleMiner( + num_hard_examples=None, + iou_threshold=1.0) + + random_example_sampler = None + if random_example_sampling: + random_example_sampler = sampler.BalancedPositiveNegativeSampler( + positive_fraction=0.5) + + code_size = 4 + model = ssd_meta_arch.SSDMetaArch( + is_training, + mock_anchor_generator, + mock_box_predictor, + mock_box_coder, + fake_feature_extractor, + mock_matcher, + region_similarity_calculator, + encode_background_as_zeros, + negative_class_weight, + image_resizer_fn, + non_max_suppression_fn, + tf.identity, + classification_loss, + localization_loss, + classification_loss_weight, + localization_loss_weight, + normalize_loss_by_num_matches, + hard_example_miner, + add_summaries=False, + normalize_loc_loss_by_codesize=normalize_loc_loss_by_codesize, + freeze_batchnorm=False, + inplace_batchnorm_update=False, + add_background_class=add_background_class, + random_example_sampler=random_example_sampler) + return model, num_classes, mock_anchor_generator.num_anchors(), code_size + + def test_preprocess_preserves_shapes_with_dynamic_input_image(self): + image_shapes = [(3, None, None, 3), + (None, 10, 10, 3), + (None, None, None, 3)] + model, _, _, _ = self._create_model() + for image_shape in image_shapes: + image_placeholder = tf.placeholder(tf.float32, shape=image_shape) + preprocessed_inputs, _ = model.preprocess(image_placeholder) + self.assertAllEqual(preprocessed_inputs.shape.as_list(), image_shape) + + def test_preprocess_preserves_shape_with_static_input_image(self): + def graph_fn(input_image): + model, _, _, _ = self._create_model() + return model.preprocess(input_image) + input_image = np.random.rand(2, 3, 3, 3).astype(np.float32) + preprocessed_inputs, _ = self.execute(graph_fn, [input_image]) + self.assertAllEqual(preprocessed_inputs.shape, [2, 3, 3, 3]) + + def test_predict_result_shapes_on_image_with_dynamic_shape(self): + batch_size = 3 + image_size = 2 + input_shapes = [(None, image_size, image_size, 3), + (batch_size, None, None, 3), + 
(None, None, None, 3)] + + for input_shape in input_shapes: + tf_graph = tf.Graph() + with tf_graph.as_default(): + model, num_classes, num_anchors, code_size = self._create_model() + preprocessed_input_placeholder = tf.placeholder(tf.float32, + shape=input_shape) + prediction_dict = model.predict( + preprocessed_input_placeholder, true_image_shapes=None) + + self.assertTrue('box_encodings' in prediction_dict) + self.assertTrue('class_predictions_with_background' in prediction_dict) + self.assertTrue('feature_maps' in prediction_dict) + self.assertTrue('anchors' in prediction_dict) + + init_op = tf.global_variables_initializer() + with self.test_session(graph=tf_graph) as sess: + sess.run(init_op) + prediction_out = sess.run(prediction_dict, + feed_dict={ + preprocessed_input_placeholder: + np.random.uniform( + size=(batch_size, 2, 2, 3))}) + expected_box_encodings_shape_out = (batch_size, num_anchors, code_size) + expected_class_predictions_with_background_shape_out = (batch_size, + num_anchors, + num_classes + 1) + + self.assertAllEqual(prediction_out['box_encodings'].shape, + expected_box_encodings_shape_out) + self.assertAllEqual( + prediction_out['class_predictions_with_background'].shape, + expected_class_predictions_with_background_shape_out) + + def test_predict_result_shapes_on_image_with_static_shape(self): + + with tf.Graph().as_default(): + _, num_classes, num_anchors, code_size = self._create_model() + + def graph_fn(input_image): + model, _, _, _ = self._create_model() + predictions = model.predict(input_image, true_image_shapes=None) + return (predictions['box_encodings'], + predictions['class_predictions_with_background'], + predictions['feature_maps'], + predictions['anchors']) + batch_size = 3 + image_size = 2 + channels = 3 + input_image = np.random.rand(batch_size, image_size, image_size, + channels).astype(np.float32) + expected_box_encodings_shape = (batch_size, num_anchors, code_size) + expected_class_predictions_shape = (batch_size, num_anchors, num_classes+1) + (box_encodings, class_predictions, _, _) = self.execute(graph_fn, + [input_image]) + self.assertAllEqual(box_encodings.shape, expected_box_encodings_shape) + self.assertAllEqual(class_predictions.shape, + expected_class_predictions_shape) + + def test_postprocess_results_are_correct(self): + batch_size = 2 + image_size = 2 + input_shapes = [(batch_size, image_size, image_size, 3), + (None, image_size, image_size, 3), + (batch_size, None, None, 3), + (None, None, None, 3)] + + expected_boxes = [ + [ + [0, 0, .5, .5], + [0, .5, .5, 1], + [.5, 0, 1, .5], + [0, 0, 0, 0], # pruned prediction + [0, 0, 0, 0] + ], # padding + [ + [0, 0, .5, .5], + [0, .5, .5, 1], + [.5, 0, 1, .5], + [0, 0, 0, 0], # pruned prediction + [0, 0, 0, 0] + ] + ] # padding + expected_scores = [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]] + expected_classes = [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]] + expected_num_detections = np.array([3, 3]) + + for input_shape in input_shapes: + tf_graph = tf.Graph() + with tf_graph.as_default(): + model, _, _, _ = self._create_model() + input_placeholder = tf.placeholder(tf.float32, shape=input_shape) + preprocessed_inputs, true_image_shapes = model.preprocess( + input_placeholder) + prediction_dict = model.predict(preprocessed_inputs, + true_image_shapes) + detections = model.postprocess(prediction_dict, true_image_shapes) + self.assertTrue('detection_boxes' in detections) + self.assertTrue('detection_scores' in detections) + self.assertTrue('detection_classes' in detections) + self.assertTrue('num_detections' in 
detections) + init_op = tf.global_variables_initializer() + with self.test_session(graph=tf_graph) as sess: + sess.run(init_op) + detections_out = sess.run(detections, + feed_dict={ + input_placeholder: + np.random.uniform( + size=(batch_size, 2, 2, 3))}) + for image_idx in range(batch_size): + self.assertTrue( + test_utils.first_rows_close_as_set( + detections_out['detection_boxes'][image_idx].tolist(), + expected_boxes[image_idx])) + self.assertAllClose(detections_out['detection_scores'], expected_scores) + self.assertAllClose(detections_out['detection_classes'], expected_classes) + self.assertAllClose(detections_out['num_detections'], + expected_num_detections) + + def test_loss_results_are_correct(self): + + with tf.Graph().as_default(): + _, num_classes, num_anchors, _ = self._create_model() + def graph_fn(preprocessed_tensor, groundtruth_boxes1, groundtruth_boxes2, + groundtruth_classes1, groundtruth_classes2): + groundtruth_boxes_list = [groundtruth_boxes1, groundtruth_boxes2] + groundtruth_classes_list = [groundtruth_classes1, groundtruth_classes2] + model, _, _, _ = self._create_model(apply_hard_mining=False) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list) + prediction_dict = model.predict(preprocessed_tensor, + true_image_shapes=None) + loss_dict = model.loss(prediction_dict, true_image_shapes=None) + return ( + _get_value_for_matching_key(loss_dict, 'Loss/localization_loss'), + _get_value_for_matching_key(loss_dict, 'Loss/classification_loss')) + + batch_size = 2 + preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32) + groundtruth_boxes1 = np.array([[0, 0, .5, .5]], dtype=np.float32) + groundtruth_boxes2 = np.array([[0, 0, .5, .5]], dtype=np.float32) + groundtruth_classes1 = np.array([[1]], dtype=np.float32) + groundtruth_classes2 = np.array([[1]], dtype=np.float32) + expected_localization_loss = 0.0 + expected_classification_loss = (batch_size * num_anchors + * (num_classes+1) * np.log(2.0)) + (localization_loss, + classification_loss) = self.execute(graph_fn, [preprocessed_input, + groundtruth_boxes1, + groundtruth_boxes2, + groundtruth_classes1, + groundtruth_classes2]) + self.assertAllClose(localization_loss, expected_localization_loss) + self.assertAllClose(classification_loss, expected_classification_loss) + + def test_loss_results_are_correct_with_normalize_by_codesize_true(self): + + with tf.Graph().as_default(): + _, _, _, _ = self._create_model() + def graph_fn(preprocessed_tensor, groundtruth_boxes1, groundtruth_boxes2, + groundtruth_classes1, groundtruth_classes2): + groundtruth_boxes_list = [groundtruth_boxes1, groundtruth_boxes2] + groundtruth_classes_list = [groundtruth_classes1, groundtruth_classes2] + model, _, _, _ = self._create_model(apply_hard_mining=False, + normalize_loc_loss_by_codesize=True) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list) + prediction_dict = model.predict(preprocessed_tensor, + true_image_shapes=None) + loss_dict = model.loss(prediction_dict, true_image_shapes=None) + return (_get_value_for_matching_key(loss_dict, 'Loss/localization_loss'),) + + batch_size = 2 + preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32) + groundtruth_boxes1 = np.array([[0, 0, 1, 1]], dtype=np.float32) + groundtruth_boxes2 = np.array([[0, 0, 1, 1]], dtype=np.float32) + groundtruth_classes1 = np.array([[1]], dtype=np.float32) + groundtruth_classes2 = np.array([[1]], dtype=np.float32) + expected_localization_loss = 0.5 / 4 + localization_loss = 
self.execute(graph_fn, [preprocessed_input, + groundtruth_boxes1, + groundtruth_boxes2, + groundtruth_classes1, + groundtruth_classes2]) + self.assertAllClose(localization_loss, expected_localization_loss) + + def test_loss_results_are_correct_with_hard_example_mining(self): + + with tf.Graph().as_default(): + _, num_classes, num_anchors, _ = self._create_model() + def graph_fn(preprocessed_tensor, groundtruth_boxes1, groundtruth_boxes2, + groundtruth_classes1, groundtruth_classes2): + groundtruth_boxes_list = [groundtruth_boxes1, groundtruth_boxes2] + groundtruth_classes_list = [groundtruth_classes1, groundtruth_classes2] + model, _, _, _ = self._create_model() + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list) + prediction_dict = model.predict(preprocessed_tensor, + true_image_shapes=None) + loss_dict = model.loss(prediction_dict, true_image_shapes=None) + return ( + _get_value_for_matching_key(loss_dict, 'Loss/localization_loss'), + _get_value_for_matching_key(loss_dict, 'Loss/classification_loss')) + + batch_size = 2 + preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32) + groundtruth_boxes1 = np.array([[0, 0, .5, .5]], dtype=np.float32) + groundtruth_boxes2 = np.array([[0, 0, .5, .5]], dtype=np.float32) + groundtruth_classes1 = np.array([[1]], dtype=np.float32) + groundtruth_classes2 = np.array([[1]], dtype=np.float32) + expected_localization_loss = 0.0 + expected_classification_loss = (batch_size * num_anchors + * (num_classes+1) * np.log(2.0)) + (localization_loss, classification_loss) = self.execute_cpu( + graph_fn, [ + preprocessed_input, groundtruth_boxes1, groundtruth_boxes2, + groundtruth_classes1, groundtruth_classes2 + ]) + self.assertAllClose(localization_loss, expected_localization_loss) + self.assertAllClose(classification_loss, expected_classification_loss) + + def test_loss_results_are_correct_without_add_background_class(self): + + with tf.Graph().as_default(): + _, num_classes, num_anchors, _ = self._create_model( + add_background_class=False) + + def graph_fn(preprocessed_tensor, groundtruth_boxes1, groundtruth_boxes2, + groundtruth_classes1, groundtruth_classes2): + groundtruth_boxes_list = [groundtruth_boxes1, groundtruth_boxes2] + groundtruth_classes_list = [groundtruth_classes1, groundtruth_classes2] + model, _, _, _ = self._create_model( + apply_hard_mining=False, add_background_class=False) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list) + prediction_dict = model.predict( + preprocessed_tensor, true_image_shapes=None) + loss_dict = model.loss(prediction_dict, true_image_shapes=None) + return (loss_dict['Loss/localization_loss'], + loss_dict['Loss/classification_loss']) + + batch_size = 2 + preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32) + groundtruth_boxes1 = np.array([[0, 0, .5, .5]], dtype=np.float32) + groundtruth_boxes2 = np.array([[0, 0, .5, .5]], dtype=np.float32) + groundtruth_classes1 = np.array([[0, 1]], dtype=np.float32) + groundtruth_classes2 = np.array([[0, 1]], dtype=np.float32) + expected_localization_loss = 0.0 + expected_classification_loss = ( + batch_size * num_anchors * (num_classes + 1) * np.log(2.0)) + (localization_loss, classification_loss) = self.execute( + graph_fn, [ + preprocessed_input, groundtruth_boxes1, groundtruth_boxes2, + groundtruth_classes1, groundtruth_classes2 + ]) + self.assertAllClose(localization_loss, expected_localization_loss) + self.assertAllClose(classification_loss, expected_classification_loss) + + def 
test_restore_map_for_detection_ckpt(self):
+    model, _, _, _ = self._create_model()
+    model.predict(tf.constant(np.array([[[[0, 0], [1, 1]], [[1, 0], [0, 1]]]],
+                                       dtype=np.float32)),
+                  true_image_shapes=None)
+    init_op = tf.global_variables_initializer()
+    saver = tf.train.Saver()
+    save_path = self.get_temp_dir()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      saved_model_path = saver.save(sess, save_path)
+      var_map = model.restore_map(
+          fine_tune_checkpoint_type='detection',
+          load_all_detection_checkpoint_vars=False)
+      self.assertIsInstance(var_map, dict)
+      saver = tf.train.Saver(var_map)
+      saver.restore(sess, saved_model_path)
+      for var in sess.run(tf.report_uninitialized_variables()):
+        self.assertNotIn('FeatureExtractor', var)
+
+  def test_restore_map_for_classification_ckpt(self):
+    # Define mock tensorflow classification graph and save variables.
+    test_graph_classification = tf.Graph()
+    with test_graph_classification.as_default():
+      image = tf.placeholder(dtype=tf.float32, shape=[1, 20, 20, 3])
+      with tf.variable_scope('mock_model'):
+        net = slim.conv2d(image, num_outputs=32, kernel_size=1, scope='layer1')
+        slim.conv2d(net, num_outputs=3, kernel_size=1, scope='layer2')
+
+      init_op = tf.global_variables_initializer()
+      saver = tf.train.Saver()
+      save_path = self.get_temp_dir()
+      with self.test_session(graph=test_graph_classification) as sess:
+        sess.run(init_op)
+        saved_model_path = saver.save(sess, save_path)
+
+    # Create tensorflow detection graph and load variables from
+    # classification checkpoint.
+    test_graph_detection = tf.Graph()
+    with test_graph_detection.as_default():
+      model, _, _, _ = self._create_model()
+      inputs_shape = [2, 2, 2, 3]
+      inputs = tf.to_float(tf.random_uniform(
+          inputs_shape, minval=0, maxval=255, dtype=tf.int32))
+      preprocessed_inputs, true_image_shapes = model.preprocess(inputs)
+      prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
+      model.postprocess(prediction_dict, true_image_shapes)
+      another_variable = tf.Variable([17.0], name='another_variable')  # pylint: disable=unused-variable
+      var_map = model.restore_map(fine_tune_checkpoint_type='classification')
+      self.assertNotIn('another_variable', var_map)
+      self.assertIsInstance(var_map, dict)
+      saver = tf.train.Saver(var_map)
+      with self.test_session(graph=test_graph_detection) as sess:
+        saver.restore(sess, saved_model_path)
+        for var in sess.run(tf.report_uninitialized_variables()):
+          self.assertNotIn('FeatureExtractor', var)
+
+  def test_load_all_det_checkpoint_vars(self):
+    test_graph_detection = tf.Graph()
+    with test_graph_detection.as_default():
+      model, _, _, _ = self._create_model()
+      inputs_shape = [2, 2, 2, 3]
+      inputs = tf.to_float(
+          tf.random_uniform(inputs_shape, minval=0, maxval=255, dtype=tf.int32))
+      preprocessed_inputs, true_image_shapes = model.preprocess(inputs)
+      prediction_dict = model.predict(preprocessed_inputs, true_image_shapes)
+      model.postprocess(prediction_dict, true_image_shapes)
+      another_variable = tf.Variable([17.0], name='another_variable')  # pylint: disable=unused-variable
+      var_map = model.restore_map(
+          fine_tune_checkpoint_type='detection',
+          load_all_detection_checkpoint_vars=True)
+      self.assertIsInstance(var_map, dict)
+      self.assertIn('another_variable', var_map)
+
+  def test_loss_results_are_correct_with_random_example_sampling(self):
+
+    with tf.Graph().as_default():
+      _, num_classes, num_anchors, _ = self._create_model(
+          random_example_sampling=True)
+
+    def graph_fn(preprocessed_tensor,
groundtruth_boxes1, groundtruth_boxes2, + groundtruth_classes1, groundtruth_classes2): + groundtruth_boxes_list = [groundtruth_boxes1, groundtruth_boxes2] + groundtruth_classes_list = [groundtruth_classes1, groundtruth_classes2] + model, _, _, _ = self._create_model(random_example_sampling=True) + model.provide_groundtruth(groundtruth_boxes_list, + groundtruth_classes_list) + prediction_dict = model.predict( + preprocessed_tensor, true_image_shapes=None) + loss_dict = model.loss(prediction_dict, true_image_shapes=None) + return (_get_value_for_matching_key(loss_dict, 'Loss/localization_loss'), + _get_value_for_matching_key(loss_dict, + 'Loss/classification_loss')) + + batch_size = 2 + preprocessed_input = np.random.rand(batch_size, 2, 2, 3).astype(np.float32) + groundtruth_boxes1 = np.array([[0, 0, .5, .5]], dtype=np.float32) + groundtruth_boxes2 = np.array([[0, 0, .5, .5]], dtype=np.float32) + groundtruth_classes1 = np.array([[1]], dtype=np.float32) + groundtruth_classes2 = np.array([[1]], dtype=np.float32) + expected_localization_loss = 0.0 + # Among 4 anchors (1 positive, 3 negative) in this test, only 2 anchors are + # selected (1 positive, 1 negative) since random sampler will adjust number + # of negative examples to make sure positive example fraction in the batch + # is 0.5. + expected_classification_loss = ( + batch_size * 2 * (num_classes + 1) * np.log(2.0)) + (localization_loss, classification_loss) = self.execute_cpu( + graph_fn, [ + preprocessed_input, groundtruth_boxes1, groundtruth_boxes2, + groundtruth_classes1, groundtruth_classes2 + ]) + self.assertAllClose(localization_loss, expected_localization_loss) + self.assertAllClose(classification_loss, expected_classification_loss) + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/coco_evaluation.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/coco_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..9f9bc7c7acca40dc303cf6b632ff5b43311eb38e --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/coco_evaluation.py @@ -0,0 +1,658 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Class for evaluating object detections with COCO metrics.""" +import numpy as np +import tensorflow as tf + +from object_detection.core import standard_fields +from object_detection.metrics import coco_tools +from object_detection.utils import object_detection_evaluation + + +class CocoDetectionEvaluator(object_detection_evaluation.DetectionEvaluator): + """Class to evaluate COCO detection metrics.""" + + def __init__(self, + categories, + include_metrics_per_category=False, + all_metrics_per_category=False): + """Constructor. + + Args: + categories: A list of dicts, each of which has the following keys - + 'id': (required) an integer id uniquely identifying this category. + 'name': (required) string representing category name e.g., 'cat', 'dog'. + include_metrics_per_category: If True, include metrics for each category. + all_metrics_per_category: Whether to include all the summary metrics for + each category in per_category_ap. Be careful with setting it to true if + you have more than handful of categories, because it will pollute + your mldash. + """ + super(CocoDetectionEvaluator, self).__init__(categories) + # _image_ids is a dictionary that maps unique image ids to Booleans which + # indicate whether a corresponding detection has been added. + self._image_ids = {} + self._groundtruth_list = [] + self._detection_boxes_list = [] + self._category_id_set = set([cat['id'] for cat in self._categories]) + self._annotation_id = 1 + self._metrics = None + self._include_metrics_per_category = include_metrics_per_category + self._all_metrics_per_category = all_metrics_per_category + + def clear(self): + """Clears the state to prepare for a fresh evaluation.""" + self._image_ids.clear() + self._groundtruth_list = [] + self._detection_boxes_list = [] + + def add_single_ground_truth_image_info(self, + image_id, + groundtruth_dict): + """Adds groundtruth for a single image to be used for evaluation. + + If the image has already been added, a warning is logged, and groundtruth is + ignored. + + Args: + image_id: A unique string/integer identifier for the image. + groundtruth_dict: A dictionary containing - + InputDataFields.groundtruth_boxes: float32 numpy array of shape + [num_boxes, 4] containing `num_boxes` groundtruth boxes of the format + [ymin, xmin, ymax, xmax] in absolute image coordinates. + InputDataFields.groundtruth_classes: integer numpy array of shape + [num_boxes] containing 1-indexed groundtruth classes for the boxes. + InputDataFields.groundtruth_is_crowd (optional): integer numpy array of + shape [num_boxes] containing iscrowd flag for groundtruth boxes. + """ + if image_id in self._image_ids: + tf.logging.warning('Ignoring ground truth with image id %s since it was ' + 'previously added', image_id) + return + + groundtruth_is_crowd = groundtruth_dict.get( + standard_fields.InputDataFields.groundtruth_is_crowd) + # Drop groundtruth_is_crowd if empty tensor. 
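+    # A zero-length is_crowd tensor typically indicates that the input
+    # pipeline provided no is_crowd annotations for this image; resetting it
+    # to None below lets the COCO export treat every groundtruth box as
+    # not crowd.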
+ if groundtruth_is_crowd is not None and not groundtruth_is_crowd.shape[0]: + groundtruth_is_crowd = None + + self._groundtruth_list.extend( + coco_tools.ExportSingleImageGroundtruthToCoco( + image_id=image_id, + next_annotation_id=self._annotation_id, + category_id_set=self._category_id_set, + groundtruth_boxes=groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_boxes], + groundtruth_classes=groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_classes], + groundtruth_is_crowd=groundtruth_is_crowd)) + self._annotation_id += groundtruth_dict[standard_fields.InputDataFields. + groundtruth_boxes].shape[0] + # Boolean to indicate whether a detection has been added for this image. + self._image_ids[image_id] = False + + def add_single_detected_image_info(self, + image_id, + detections_dict): + """Adds detections for a single image to be used for evaluation. + + If a detection has already been added for this image id, a warning is + logged, and the detection is skipped. + + Args: + image_id: A unique string/integer identifier for the image. + detections_dict: A dictionary containing - + DetectionResultFields.detection_boxes: float32 numpy array of shape + [num_boxes, 4] containing `num_boxes` detection boxes of the format + [ymin, xmin, ymax, xmax] in absolute image coordinates. + DetectionResultFields.detection_scores: float32 numpy array of shape + [num_boxes] containing detection scores for the boxes. + DetectionResultFields.detection_classes: integer numpy array of shape + [num_boxes] containing 1-indexed detection classes for the boxes. + + Raises: + ValueError: If groundtruth for the image_id is not available. + """ + if image_id not in self._image_ids: + raise ValueError('Missing groundtruth for image id: {}'.format(image_id)) + + if self._image_ids[image_id]: + tf.logging.warning('Ignoring detection with image id %s since it was ' + 'previously added', image_id) + return + + self._detection_boxes_list.extend( + coco_tools.ExportSingleImageDetectionBoxesToCoco( + image_id=image_id, + category_id_set=self._category_id_set, + detection_boxes=detections_dict[standard_fields. + DetectionResultFields + .detection_boxes], + detection_scores=detections_dict[standard_fields. + DetectionResultFields. + detection_scores], + detection_classes=detections_dict[standard_fields. + DetectionResultFields. + detection_classes])) + self._image_ids[image_id] = True + + def evaluate(self): + """Evaluates the detection boxes and returns a dictionary of coco metrics. + + Returns: + A dictionary holding - + + 1. summary_metrics: + 'DetectionBoxes_Precision/mAP': mean average precision over classes + averaged over IOU thresholds ranging from .5 to .95 with .05 + increments. + 'DetectionBoxes_Precision/mAP@.50IOU': mean average precision at 50% IOU + 'DetectionBoxes_Precision/mAP@.75IOU': mean average precision at 75% IOU + 'DetectionBoxes_Precision/mAP (small)': mean average precision for small + objects (area < 32^2 pixels). + 'DetectionBoxes_Precision/mAP (medium)': mean average precision for + medium sized objects (32^2 pixels < area < 96^2 pixels). + 'DetectionBoxes_Precision/mAP (large)': mean average precision for large + objects (96^2 pixels < area < 10000^2 pixels). + 'DetectionBoxes_Recall/AR@1': average recall with 1 detection. + 'DetectionBoxes_Recall/AR@10': average recall with 10 detections. + 'DetectionBoxes_Recall/AR@100': average recall with 100 detections. + 'DetectionBoxes_Recall/AR@100 (small)': average recall for small objects + with 100. 
+ 'DetectionBoxes_Recall/AR@100 (medium)': average recall for medium objects + with 100. + 'DetectionBoxes_Recall/AR@100 (large)': average recall for large objects + with 100 detections. + + 2. per_category_ap: if include_metrics_per_category is True, category + specific results with keys of the form: + 'Precision mAP ByCategory/category' (without the supercategory part if + no supercategories exist). For backward compatibility + 'PerformanceByCategory' is included in the output regardless of + all_metrics_per_category. + """ + groundtruth_dict = { + 'annotations': self._groundtruth_list, + 'images': [{'id': image_id} for image_id in self._image_ids], + 'categories': self._categories + } + coco_wrapped_groundtruth = coco_tools.COCOWrapper(groundtruth_dict) + coco_wrapped_detections = coco_wrapped_groundtruth.LoadAnnotations( + self._detection_boxes_list) + box_evaluator = coco_tools.COCOEvalWrapper( + coco_wrapped_groundtruth, coco_wrapped_detections, agnostic_mode=False) + box_metrics, box_per_category_ap = box_evaluator.ComputeMetrics( + include_metrics_per_category=self._include_metrics_per_category, + all_metrics_per_category=self._all_metrics_per_category) + box_metrics.update(box_per_category_ap) + box_metrics = {'DetectionBoxes_'+ key: value + for key, value in iter(box_metrics.items())} + return box_metrics + + def get_estimator_eval_metric_ops(self, image_id, groundtruth_boxes, + groundtruth_classes, + detection_boxes, + detection_scores, detection_classes, + groundtruth_is_crowd=None, + num_gt_boxes_per_image=None, + num_det_boxes_per_image=None): + """Returns a dictionary of eval metric ops to use with `tf.EstimatorSpec`. + + Note that once value_op is called, the detections and groundtruth added via + update_op are cleared. + + This function can take in groundtruth and detections for a batch of images, + or for a single image. For the latter case, the batch dimension for input + tensors need not be present. + + Args: + image_id: string/integer tensor of shape [batch] with unique identifiers + for the images. + groundtruth_boxes: float32 tensor of shape [batch, num_boxes, 4] + containing `num_boxes` groundtruth boxes of the format + [ymin, xmin, ymax, xmax] in absolute image coordinates. + groundtruth_classes: int32 tensor of shape [batch, num_boxes] containing + 1-indexed groundtruth classes for the boxes. + detection_boxes: float32 tensor of shape [batch, num_boxes, 4] containing + `num_boxes` detection boxes of the format [ymin, xmin, ymax, xmax] + in absolute image coordinates. + detection_scores: float32 tensor of shape [batch, num_boxes] containing + detection scores for the boxes. + detection_classes: int32 tensor of shape [batch, num_boxes] containing + 1-indexed detection classes for the boxes. + groundtruth_is_crowd: bool tensor of shape [batch, num_boxes] containing + is_crowd annotations. This field is optional, and if not passed, then + all boxes are treated as *not* is_crowd. + num_gt_boxes_per_image: int32 tensor of shape [batch] containing the + number of groundtruth boxes per image. If None, will assume no padding + in groundtruth tensors. + num_det_boxes_per_image: int32 tensor of shape [batch] containing the + number of detection boxes per image. If None, will assume no padding in + the detection tensors. + + Returns: + a dictionary of metric names to tuple of value_op and update_op that can + be used as eval metric ops in tf.EstimatorSpec. 
Note that all update ops + must be run together and similarly all value ops must be run together to + guarantee correct behaviour. + """ + def update_op( + image_id_batched, + groundtruth_boxes_batched, + groundtruth_classes_batched, + groundtruth_is_crowd_batched, + num_gt_boxes_per_image, + detection_boxes_batched, + detection_scores_batched, + detection_classes_batched, + num_det_boxes_per_image): + """Update operation for adding batch of images to Coco evaluator.""" + + for (image_id, gt_box, gt_class, gt_is_crowd, num_gt_box, det_box, + det_score, det_class, num_det_box) in zip( + image_id_batched, groundtruth_boxes_batched, + groundtruth_classes_batched, groundtruth_is_crowd_batched, + num_gt_boxes_per_image, + detection_boxes_batched, detection_scores_batched, + detection_classes_batched, num_det_boxes_per_image): + self.add_single_ground_truth_image_info( + image_id, + {'groundtruth_boxes': gt_box[:num_gt_box], + 'groundtruth_classes': gt_class[:num_gt_box], + 'groundtruth_is_crowd': gt_is_crowd[:num_gt_box]}) + self.add_single_detected_image_info( + image_id, + {'detection_boxes': det_box[:num_det_box], + 'detection_scores': det_score[:num_det_box], + 'detection_classes': det_class[:num_det_box]}) + + if groundtruth_is_crowd is None: + groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool) + if not image_id.shape.as_list(): + # Apply a batch dimension to all tensors. + image_id = tf.expand_dims(image_id, 0) + groundtruth_boxes = tf.expand_dims(groundtruth_boxes, 0) + groundtruth_classes = tf.expand_dims(groundtruth_classes, 0) + groundtruth_is_crowd = tf.expand_dims(groundtruth_is_crowd, 0) + detection_boxes = tf.expand_dims(detection_boxes, 0) + detection_scores = tf.expand_dims(detection_scores, 0) + detection_classes = tf.expand_dims(detection_classes, 0) + + if num_gt_boxes_per_image is None: + num_gt_boxes_per_image = tf.shape(groundtruth_boxes)[1:2] + else: + num_gt_boxes_per_image = tf.expand_dims(num_gt_boxes_per_image, 0) + + if num_det_boxes_per_image is None: + num_det_boxes_per_image = tf.shape(detection_boxes)[1:2] + else: + num_det_boxes_per_image = tf.expand_dims(num_det_boxes_per_image, 0) + else: + if num_gt_boxes_per_image is None: + num_gt_boxes_per_image = tf.tile( + tf.shape(groundtruth_boxes)[1:2], + multiples=tf.shape(groundtruth_boxes)[0:1]) + if num_det_boxes_per_image is None: + num_det_boxes_per_image = tf.tile( + tf.shape(detection_boxes)[1:2], + multiples=tf.shape(detection_boxes)[0:1]) + + update_op = tf.py_func(update_op, [image_id, + groundtruth_boxes, + groundtruth_classes, + groundtruth_is_crowd, + num_gt_boxes_per_image, + detection_boxes, + detection_scores, + detection_classes, + num_det_boxes_per_image], []) + metric_names = ['DetectionBoxes_Precision/mAP', + 'DetectionBoxes_Precision/mAP@.50IOU', + 'DetectionBoxes_Precision/mAP@.75IOU', + 'DetectionBoxes_Precision/mAP (large)', + 'DetectionBoxes_Precision/mAP (medium)', + 'DetectionBoxes_Precision/mAP (small)', + 'DetectionBoxes_Recall/AR@1', + 'DetectionBoxes_Recall/AR@10', + 'DetectionBoxes_Recall/AR@100', + 'DetectionBoxes_Recall/AR@100 (large)', + 'DetectionBoxes_Recall/AR@100 (medium)', + 'DetectionBoxes_Recall/AR@100 (small)'] + if self._include_metrics_per_category: + for category_dict in self._categories: + metric_names.append('DetectionBoxes_PerformanceByCategory/mAP/' + + category_dict['name']) + + def first_value_func(): + self._metrics = self.evaluate() + self.clear() + return np.float32(self._metrics[metric_names[0]]) + + def value_func_factory(metric_name): + 
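+      """Returns a function that reads `metric_name` from the cached metrics.
+
+      The cache (self._metrics) is populated by `first_value_func`, so the
+      value ops built from this factory are only run after `first_value_op`
+      through the control dependency added below.
+      """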
def value_func(): + return np.float32(self._metrics[metric_name]) + return value_func + + # Ensure that the metrics are only evaluated once. + first_value_op = tf.py_func(first_value_func, [], tf.float32) + eval_metric_ops = {metric_names[0]: (first_value_op, update_op)} + with tf.control_dependencies([first_value_op]): + for metric_name in metric_names[1:]: + eval_metric_ops[metric_name] = (tf.py_func( + value_func_factory(metric_name), [], np.float32), update_op) + return eval_metric_ops + + +def _check_mask_type_and_value(array_name, masks): + """Checks whether mask dtype is uint8 and the values are either 0 or 1.""" + if masks.dtype != np.uint8: + raise ValueError('{} must be of type np.uint8. Found {}.'.format( + array_name, masks.dtype)) + if np.any(np.logical_and(masks != 0, masks != 1)): + raise ValueError('{} elements can only be either 0 or 1.'.format( + array_name)) + + +class CocoMaskEvaluator(object_detection_evaluation.DetectionEvaluator): + """Class to evaluate COCO detection metrics.""" + + def __init__(self, categories, include_metrics_per_category=False): + """Constructor. + + Args: + categories: A list of dicts, each of which has the following keys - + 'id': (required) an integer id uniquely identifying this category. + 'name': (required) string representing category name e.g., 'cat', 'dog'. + include_metrics_per_category: If True, include metrics for each category. + """ + super(CocoMaskEvaluator, self).__init__(categories) + self._image_id_to_mask_shape_map = {} + self._image_ids_with_detections = set([]) + self._groundtruth_list = [] + self._detection_masks_list = [] + self._category_id_set = set([cat['id'] for cat in self._categories]) + self._annotation_id = 1 + self._include_metrics_per_category = include_metrics_per_category + + def clear(self): + """Clears the state to prepare for a fresh evaluation.""" + self._image_id_to_mask_shape_map.clear() + self._image_ids_with_detections.clear() + self._groundtruth_list = [] + self._detection_masks_list = [] + + def add_single_ground_truth_image_info(self, + image_id, + groundtruth_dict): + """Adds groundtruth for a single image to be used for evaluation. + + If the image has already been added, a warning is logged, and groundtruth is + ignored. + + Args: + image_id: A unique string/integer identifier for the image. + groundtruth_dict: A dictionary containing - + InputDataFields.groundtruth_boxes: float32 numpy array of shape + [num_boxes, 4] containing `num_boxes` groundtruth boxes of the format + [ymin, xmin, ymax, xmax] in absolute image coordinates. + InputDataFields.groundtruth_classes: integer numpy array of shape + [num_boxes] containing 1-indexed groundtruth classes for the boxes. + InputDataFields.groundtruth_instance_masks: uint8 numpy array of shape + [num_boxes, image_height, image_width] containing groundtruth masks + corresponding to the boxes. The elements of the array must be in + {0, 1}. + """ + if image_id in self._image_id_to_mask_shape_map: + tf.logging.warning('Ignoring ground truth with image id %s since it was ' + 'previously added', image_id) + return + + groundtruth_instance_masks = groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_instance_masks] + _check_mask_type_and_value(standard_fields.InputDataFields. + groundtruth_instance_masks, + groundtruth_instance_masks) + self._groundtruth_list.extend( + coco_tools. 
+ ExportSingleImageGroundtruthToCoco( + image_id=image_id, + next_annotation_id=self._annotation_id, + category_id_set=self._category_id_set, + groundtruth_boxes=groundtruth_dict[standard_fields.InputDataFields. + groundtruth_boxes], + groundtruth_classes=groundtruth_dict[standard_fields. + InputDataFields. + groundtruth_classes], + groundtruth_masks=groundtruth_instance_masks)) + self._annotation_id += groundtruth_dict[standard_fields.InputDataFields. + groundtruth_boxes].shape[0] + self._image_id_to_mask_shape_map[image_id] = groundtruth_dict[ + standard_fields.InputDataFields.groundtruth_instance_masks].shape + + def add_single_detected_image_info(self, + image_id, + detections_dict): + """Adds detections for a single image to be used for evaluation. + + If a detection has already been added for this image id, a warning is + logged, and the detection is skipped. + + Args: + image_id: A unique string/integer identifier for the image. + detections_dict: A dictionary containing - + DetectionResultFields.detection_scores: float32 numpy array of shape + [num_boxes] containing detection scores for the boxes. + DetectionResultFields.detection_classes: integer numpy array of shape + [num_boxes] containing 1-indexed detection classes for the boxes. + DetectionResultFields.detection_masks: optional uint8 numpy array of + shape [num_boxes, image_height, image_width] containing instance + masks corresponding to the boxes. The elements of the array must be + in {0, 1}. + + Raises: + ValueError: If groundtruth for the image_id is not available or if + spatial shapes of groundtruth_instance_masks and detection_masks are + incompatible. + """ + if image_id not in self._image_id_to_mask_shape_map: + raise ValueError('Missing groundtruth for image id: {}'.format(image_id)) + + if image_id in self._image_ids_with_detections: + tf.logging.warning('Ignoring detection with image id %s since it was ' + 'previously added', image_id) + return + + groundtruth_masks_shape = self._image_id_to_mask_shape_map[image_id] + detection_masks = detections_dict[standard_fields.DetectionResultFields. + detection_masks] + if groundtruth_masks_shape[1:] != detection_masks.shape[1:]: + raise ValueError('Spatial shape of groundtruth masks and detection masks ' + 'are incompatible: {} vs {}'.format( + groundtruth_masks_shape, + detection_masks.shape)) + _check_mask_type_and_value(standard_fields.DetectionResultFields. + detection_masks, + detection_masks) + self._detection_masks_list.extend( + coco_tools.ExportSingleImageDetectionMasksToCoco( + image_id=image_id, + category_id_set=self._category_id_set, + detection_masks=detection_masks, + detection_scores=detections_dict[standard_fields. + DetectionResultFields. + detection_scores], + detection_classes=detections_dict[standard_fields. + DetectionResultFields. + detection_classes])) + self._image_ids_with_detections.update([image_id]) + + def evaluate(self): + """Evaluates the detection masks and returns a dictionary of coco metrics. + + Returns: + A dictionary holding - + + 1. summary_metrics: + 'DetectionMasks_Precision/mAP': mean average precision over classes + averaged over IOU thresholds ranging from .5 to .95 with .05 increments. + 'DetectionMasks_Precision/mAP@.50IOU': mean average precision at 50% IOU. + 'DetectionMasks_Precision/mAP@.75IOU': mean average precision at 75% IOU. + 'DetectionMasks_Precision/mAP (small)': mean average precision for small + objects (area < 32^2 pixels). 
+ 'DetectionMasks_Precision/mAP (medium)': mean average precision for medium + sized objects (32^2 pixels < area < 96^2 pixels). + 'DetectionMasks_Precision/mAP (large)': mean average precision for large + objects (96^2 pixels < area < 10000^2 pixels). + 'DetectionMasks_Recall/AR@1': average recall with 1 detection. + 'DetectionMasks_Recall/AR@10': average recall with 10 detections. + 'DetectionMasks_Recall/AR@100': average recall with 100 detections. + 'DetectionMasks_Recall/AR@100 (small)': average recall for small objects + with 100 detections. + 'DetectionMasks_Recall/AR@100 (medium)': average recall for medium objects + with 100 detections. + 'DetectionMasks_Recall/AR@100 (large)': average recall for large objects + with 100 detections. + + 2. per_category_ap: if include_metrics_per_category is True, category + specific results with keys of the form: + 'Precision mAP ByCategory/category' (without the supercategory part if + no supercategories exist). For backward compatibility + 'PerformanceByCategory' is included in the output regardless of + all_metrics_per_category. + """ + groundtruth_dict = { + 'annotations': self._groundtruth_list, + 'images': [{'id': image_id, 'height': shape[1], 'width': shape[2]} + for image_id, shape in self._image_id_to_mask_shape_map. + items()], + 'categories': self._categories + } + coco_wrapped_groundtruth = coco_tools.COCOWrapper( + groundtruth_dict, detection_type='segmentation') + coco_wrapped_detection_masks = coco_wrapped_groundtruth.LoadAnnotations( + self._detection_masks_list) + mask_evaluator = coco_tools.COCOEvalWrapper( + coco_wrapped_groundtruth, coco_wrapped_detection_masks, + agnostic_mode=False, iou_type='segm') + mask_metrics, mask_per_category_ap = mask_evaluator.ComputeMetrics( + include_metrics_per_category=self._include_metrics_per_category) + mask_metrics.update(mask_per_category_ap) + mask_metrics = {'DetectionMasks_'+ key: value + for key, value in mask_metrics.items()} + return mask_metrics + + def get_estimator_eval_metric_ops(self, image_id, groundtruth_boxes, + groundtruth_classes, + groundtruth_instance_masks, + detection_scores, detection_classes, + detection_masks, groundtruth_is_crowd=None): + """Returns a dictionary of eval metric ops to use with `tf.EstimatorSpec`. + + Note that once value_op is called, the detections and groundtruth added via + update_op are cleared. + + Args: + image_id: Unique string/integer identifier for the image. + groundtruth_boxes: float32 tensor of shape [num_boxes, 4] containing + `num_boxes` groundtruth boxes of the format + [ymin, xmin, ymax, xmax] in absolute image coordinates. + groundtruth_classes: int32 tensor of shape [num_boxes] containing + 1-indexed groundtruth classes for the boxes. + groundtruth_instance_masks: uint8 tensor array of shape + [num_boxes, image_height, image_width] containing groundtruth masks + corresponding to the boxes. The elements of the array must be in {0, 1}. + detection_scores: float32 tensor of shape [num_boxes] containing + detection scores for the boxes. + detection_classes: int32 tensor of shape [num_boxes] containing + 1-indexed detection classes for the boxes. + detection_masks: uint8 tensor array of shape + [num_boxes, image_height, image_width] containing instance masks + corresponding to the boxes. The elements of the array must be in {0, 1}. + groundtruth_is_crowd: bool tensor of shape [batch, num_boxes] containing + is_crowd annotations. This field is optional, and if not passed, then + all boxes are treated as *not* is_crowd. 
+ + Returns: + a dictionary of metric names to tuple of value_op and update_op that can + be used as eval metric ops in tf.EstimatorSpec. Note that all update ops + must be run together and similarly all value ops must be run together to + guarantee correct behaviour. + """ + def update_op( + image_id, + groundtruth_boxes, + groundtruth_classes, + groundtruth_instance_masks, + groundtruth_is_crowd, + detection_scores, + detection_classes, + detection_masks): + self.add_single_ground_truth_image_info( + image_id, + {'groundtruth_boxes': groundtruth_boxes, + 'groundtruth_classes': groundtruth_classes, + 'groundtruth_instance_masks': groundtruth_instance_masks, + 'groundtruth_is_crowd': groundtruth_is_crowd}) + self.add_single_detected_image_info( + image_id, + {'detection_scores': detection_scores, + 'detection_classes': detection_classes, + 'detection_masks': detection_masks}) + + if groundtruth_is_crowd is None: + groundtruth_is_crowd = tf.zeros_like(groundtruth_classes, dtype=tf.bool) + update_op = tf.py_func(update_op, [image_id, + groundtruth_boxes, + groundtruth_classes, + groundtruth_instance_masks, + groundtruth_is_crowd, + detection_scores, + detection_classes, + detection_masks], []) + metric_names = ['DetectionMasks_Precision/mAP', + 'DetectionMasks_Precision/mAP@.50IOU', + 'DetectionMasks_Precision/mAP@.75IOU', + 'DetectionMasks_Precision/mAP (large)', + 'DetectionMasks_Precision/mAP (medium)', + 'DetectionMasks_Precision/mAP (small)', + 'DetectionMasks_Recall/AR@1', + 'DetectionMasks_Recall/AR@10', + 'DetectionMasks_Recall/AR@100', + 'DetectionMasks_Recall/AR@100 (large)', + 'DetectionMasks_Recall/AR@100 (medium)', + 'DetectionMasks_Recall/AR@100 (small)'] + if self._include_metrics_per_category: + for category_dict in self._categories: + metric_names.append('DetectionMasks_PerformanceByCategory/mAP/' + + category_dict['name']) + + def first_value_func(): + self._metrics = self.evaluate() + self.clear() + return np.float32(self._metrics[metric_names[0]]) + + def value_func_factory(metric_name): + def value_func(): + return np.float32(self._metrics[metric_name]) + return value_func + + # Ensure that the metrics are only evaluated once. + first_value_op = tf.py_func(first_value_func, [], tf.float32) + eval_metric_ops = {metric_names[0]: (first_value_op, update_op)} + with tf.control_dependencies([first_value_op]): + for metric_name in metric_names[1:]: + eval_metric_ops[metric_name] = (tf.py_func( + value_func_factory(metric_name), [], np.float32), update_op) + return eval_metric_ops diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/coco_evaluation_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/coco_evaluation_test.py new file mode 100644 index 0000000000000000000000000000000000000000..47547e20389ad822a59dce318b019bf5a9fb5a67 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/coco_evaluation_test.py @@ -0,0 +1,727 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tensorflow_models.object_detection.metrics.coco_evaluation.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf +from object_detection.core import standard_fields +from object_detection.metrics import coco_evaluation + + +class CocoDetectionEvaluationTest(tf.test.TestCase): + + def testGetOneMAPWithMatchingGroundtruthAndDetections(self): + """Tests that mAP is calculated correctly on GT and Detections.""" + category_list = [{'id': 0, 'name': 'person'}, + {'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}] + coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list) + coco_evaluator.add_single_ground_truth_image_info( + image_id='image1', + groundtruth_dict={ + standard_fields.InputDataFields.groundtruth_boxes: + np.array([[100., 100., 200., 200.]]), + standard_fields.InputDataFields.groundtruth_classes: np.array([1]) + }) + coco_evaluator.add_single_detected_image_info( + image_id='image1', + detections_dict={ + standard_fields.DetectionResultFields.detection_boxes: + np.array([[100., 100., 200., 200.]]), + standard_fields.DetectionResultFields.detection_scores: + np.array([.8]), + standard_fields.DetectionResultFields.detection_classes: + np.array([1]) + }) + coco_evaluator.add_single_ground_truth_image_info( + image_id='image2', + groundtruth_dict={ + standard_fields.InputDataFields.groundtruth_boxes: + np.array([[50., 50., 100., 100.]]), + standard_fields.InputDataFields.groundtruth_classes: np.array([1]) + }) + coco_evaluator.add_single_detected_image_info( + image_id='image2', + detections_dict={ + standard_fields.DetectionResultFields.detection_boxes: + np.array([[50., 50., 100., 100.]]), + standard_fields.DetectionResultFields.detection_scores: + np.array([.8]), + standard_fields.DetectionResultFields.detection_classes: + np.array([1]) + }) + coco_evaluator.add_single_ground_truth_image_info( + image_id='image3', + groundtruth_dict={ + standard_fields.InputDataFields.groundtruth_boxes: + np.array([[25., 25., 50., 50.]]), + standard_fields.InputDataFields.groundtruth_classes: np.array([1]) + }) + coco_evaluator.add_single_detected_image_info( + image_id='image3', + detections_dict={ + standard_fields.DetectionResultFields.detection_boxes: + np.array([[25., 25., 50., 50.]]), + standard_fields.DetectionResultFields.detection_scores: + np.array([.8]), + standard_fields.DetectionResultFields.detection_classes: + np.array([1]) + }) + metrics = coco_evaluator.evaluate() + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0) + + def testGetOneMAPWithMatchingGroundtruthAndDetectionsSkipCrowd(self): + """Tests computing mAP with is_crowd GT boxes skipped.""" + category_list = [{ + 'id': 0, + 'name': 'person' + }, { + 'id': 1, + 'name': 'cat' + }, { + 'id': 2, + 'name': 'dog' + }] + coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list) + coco_evaluator.add_single_ground_truth_image_info( + image_id='image1', + groundtruth_dict={ + standard_fields.InputDataFields.groundtruth_boxes: + np.array([[100., 100., 200., 200.], [99., 99., 200., 200.]]), + standard_fields.InputDataFields.groundtruth_classes: + np.array([1, 2]), + standard_fields.InputDataFields.groundtruth_is_crowd: + np.array([0, 1]) + }) + 
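+    # The second groundtruth box is flagged is_crowd, so COCO evaluation
+    # excludes it from matching; the single detection below only has to cover
+    # the first (non-crowd) box for mAP to come out as 1.0.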
coco_evaluator.add_single_detected_image_info( + image_id='image1', + detections_dict={ + standard_fields.DetectionResultFields.detection_boxes: + np.array([[100., 100., 200., 200.]]), + standard_fields.DetectionResultFields.detection_scores: + np.array([.8]), + standard_fields.DetectionResultFields.detection_classes: + np.array([1]) + }) + metrics = coco_evaluator.evaluate() + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0) + + def testGetOneMAPWithMatchingGroundtruthAndDetectionsEmptyCrowd(self): + """Tests computing mAP with empty is_crowd array passed in.""" + category_list = [{ + 'id': 0, + 'name': 'person' + }, { + 'id': 1, + 'name': 'cat' + }, { + 'id': 2, + 'name': 'dog' + }] + coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list) + coco_evaluator.add_single_ground_truth_image_info( + image_id='image1', + groundtruth_dict={ + standard_fields.InputDataFields.groundtruth_boxes: + np.array([[100., 100., 200., 200.]]), + standard_fields.InputDataFields.groundtruth_classes: + np.array([1]), + standard_fields.InputDataFields.groundtruth_is_crowd: + np.array([]) + }) + coco_evaluator.add_single_detected_image_info( + image_id='image1', + detections_dict={ + standard_fields.DetectionResultFields.detection_boxes: + np.array([[100., 100., 200., 200.]]), + standard_fields.DetectionResultFields.detection_scores: + np.array([.8]), + standard_fields.DetectionResultFields.detection_classes: + np.array([1]) + }) + metrics = coco_evaluator.evaluate() + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0) + + def testRejectionOnDuplicateGroundtruth(self): + """Tests that groundtruth cannot be added more than once for an image.""" + categories = [{'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}, + {'id': 3, 'name': 'elephant'}] + # Add groundtruth + coco_evaluator = coco_evaluation.CocoDetectionEvaluator(categories) + image_key1 = 'img1' + groundtruth_boxes1 = np.array([[0, 0, 1, 1], [0, 0, 2, 2], [0, 0, 3, 3]], + dtype=float) + groundtruth_class_labels1 = np.array([1, 3, 1], dtype=int) + coco_evaluator.add_single_ground_truth_image_info(image_key1, { + standard_fields.InputDataFields.groundtruth_boxes: + groundtruth_boxes1, + standard_fields.InputDataFields.groundtruth_classes: + groundtruth_class_labels1 + }) + groundtruth_lists_len = len(coco_evaluator._groundtruth_list) + + # Add groundtruth with the same image id. 
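+    # The evaluator only logs a warning for a duplicate image id, so the
+    # length of the groundtruth list is expected to stay unchanged.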
+ coco_evaluator.add_single_ground_truth_image_info(image_key1, { + standard_fields.InputDataFields.groundtruth_boxes: + groundtruth_boxes1, + standard_fields.InputDataFields.groundtruth_classes: + groundtruth_class_labels1 + }) + self.assertEqual(groundtruth_lists_len, + len(coco_evaluator._groundtruth_list)) + + def testRejectionOnDuplicateDetections(self): + """Tests that detections cannot be added more than once for an image.""" + categories = [{'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}, + {'id': 3, 'name': 'elephant'}] + # Add groundtruth + coco_evaluator = coco_evaluation.CocoDetectionEvaluator(categories) + coco_evaluator.add_single_ground_truth_image_info( + image_id='image1', + groundtruth_dict={ + standard_fields.InputDataFields.groundtruth_boxes: + np.array([[99., 100., 200., 200.]]), + standard_fields.InputDataFields.groundtruth_classes: np.array([1]) + }) + coco_evaluator.add_single_detected_image_info( + image_id='image1', + detections_dict={ + standard_fields.DetectionResultFields.detection_boxes: + np.array([[100., 100., 200., 200.]]), + standard_fields.DetectionResultFields.detection_scores: + np.array([.8]), + standard_fields.DetectionResultFields.detection_classes: + np.array([1]) + }) + detections_lists_len = len(coco_evaluator._detection_boxes_list) + coco_evaluator.add_single_detected_image_info( + image_id='image1', # Note that this image id was previously added. + detections_dict={ + standard_fields.DetectionResultFields.detection_boxes: + np.array([[100., 100., 200., 200.]]), + standard_fields.DetectionResultFields.detection_scores: + np.array([.8]), + standard_fields.DetectionResultFields.detection_classes: + np.array([1]) + }) + self.assertEqual(detections_lists_len, + len(coco_evaluator._detection_boxes_list)) + + def testExceptionRaisedWithMissingGroundtruth(self): + """Tests that exception is raised for detection with missing groundtruth.""" + categories = [{'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}, + {'id': 3, 'name': 'elephant'}] + coco_evaluator = coco_evaluation.CocoDetectionEvaluator(categories) + with self.assertRaises(ValueError): + coco_evaluator.add_single_detected_image_info( + image_id='image1', + detections_dict={ + standard_fields.DetectionResultFields.detection_boxes: + np.array([[100., 100., 200., 200.]]), + standard_fields.DetectionResultFields.detection_scores: + np.array([.8]), + standard_fields.DetectionResultFields.detection_classes: + np.array([1]) + }) + + +class CocoEvaluationPyFuncTest(tf.test.TestCase): + + def testGetOneMAPWithMatchingGroundtruthAndDetections(self): + category_list = [{'id': 0, 'name': 'person'}, + {'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}] + coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list) + image_id = tf.placeholder(tf.string, shape=()) + groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4)) + groundtruth_classes = tf.placeholder(tf.float32, shape=(None)) + detection_boxes = tf.placeholder(tf.float32, shape=(None, 4)) + detection_scores = tf.placeholder(tf.float32, shape=(None)) + detection_classes = tf.placeholder(tf.float32, shape=(None)) + + eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops( + image_id, groundtruth_boxes, + groundtruth_classes, + detection_boxes, + detection_scores, + detection_classes) + + _, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP'] + + with self.test_session() as sess: + sess.run(update_op, + feed_dict={ + image_id: 'image1', + groundtruth_boxes: np.array([[100., 100., 200., 200.]]), + 
groundtruth_classes: np.array([1]), + detection_boxes: np.array([[100., 100., 200., 200.]]), + detection_scores: np.array([.8]), + detection_classes: np.array([1]) + }) + sess.run(update_op, + feed_dict={ + image_id: 'image2', + groundtruth_boxes: np.array([[50., 50., 100., 100.]]), + groundtruth_classes: np.array([3]), + detection_boxes: np.array([[50., 50., 100., 100.]]), + detection_scores: np.array([.7]), + detection_classes: np.array([3]) + }) + sess.run(update_op, + feed_dict={ + image_id: 'image3', + groundtruth_boxes: np.array([[25., 25., 50., 50.]]), + groundtruth_classes: np.array([2]), + detection_boxes: np.array([[25., 25., 50., 50.]]), + detection_scores: np.array([.9]), + detection_classes: np.array([2]) + }) + metrics = {} + for key, (value_op, _) in eval_metric_ops.iteritems(): + metrics[key] = value_op + metrics = sess.run(metrics) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'], + -1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'], + -1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0) + self.assertFalse(coco_evaluator._groundtruth_list) + self.assertFalse(coco_evaluator._detection_boxes_list) + self.assertFalse(coco_evaluator._image_ids) + + def testGetOneMAPWithMatchingGroundtruthAndDetectionsPadded(self): + category_list = [{ + 'id': 0, + 'name': 'person' + }, { + 'id': 1, + 'name': 'cat' + }, { + 'id': 2, + 'name': 'dog' + }] + coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list) + image_id = tf.placeholder(tf.string, shape=()) + groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4)) + groundtruth_classes = tf.placeholder(tf.float32, shape=(None)) + detection_boxes = tf.placeholder(tf.float32, shape=(None, 4)) + detection_scores = tf.placeholder(tf.float32, shape=(None)) + detection_classes = tf.placeholder(tf.float32, shape=(None)) + + eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops( + image_id, groundtruth_boxes, groundtruth_classes, detection_boxes, + detection_scores, detection_classes) + + _, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP'] + + with self.test_session() as sess: + sess.run( + update_op, + feed_dict={ + image_id: + 'image1', + groundtruth_boxes: + np.array([[100., 100., 200., 200.], [-1, -1, -1, -1]]), + groundtruth_classes: + np.array([1, -1]), + detection_boxes: + np.array([[100., 100., 200., 200.], [0., 0., 0., 0.]]), + detection_scores: + np.array([.8, 0.]), + detection_classes: + np.array([1, -1]) + }) + sess.run( + update_op, + feed_dict={ + image_id: + 'image2', + groundtruth_boxes: + np.array([[50., 50., 100., 100.], [-1, -1, -1, -1]]), + groundtruth_classes: + np.array([3, -1]), + detection_boxes: + np.array([[50., 50., 100., 100.], [0., 0., 0., 0.]]), + detection_scores: + np.array([.7, 0.]), + detection_classes: + np.array([3, -1]) + }) + 
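+      # Padding rows carry class -1, which is not in category_list, so both
+      # the padded groundtruth boxes and the all-zero padded detections are
+      # dropped before COCO export and do not affect the metrics.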
sess.run( + update_op, + feed_dict={ + image_id: + 'image3', + groundtruth_boxes: + np.array([[25., 25., 50., 50.], [10., 10., 15., 15.]]), + groundtruth_classes: + np.array([2, 2]), + detection_boxes: + np.array([[25., 25., 50., 50.], [10., 10., 15., 15.]]), + detection_scores: + np.array([.95, .9]), + detection_classes: + np.array([2, 2]) + }) + metrics = {} + for key, (value_op, _) in eval_metric_ops.iteritems(): + metrics[key] = value_op + metrics = sess.run(metrics) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'], + -1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 0.75) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'], + -1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0) + self.assertFalse(coco_evaluator._groundtruth_list) + self.assertFalse(coco_evaluator._detection_boxes_list) + self.assertFalse(coco_evaluator._image_ids) + + def testGetOneMAPWithMatchingGroundtruthAndDetectionsBatched(self): + category_list = [{'id': 0, 'name': 'person'}, + {'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}] + coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list) + batch_size = 3 + image_id = tf.placeholder(tf.string, shape=(batch_size)) + groundtruth_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4)) + groundtruth_classes = tf.placeholder(tf.float32, shape=(batch_size, None)) + detection_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4)) + detection_scores = tf.placeholder(tf.float32, shape=(batch_size, None)) + detection_classes = tf.placeholder(tf.float32, shape=(batch_size, None)) + + eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops( + image_id, groundtruth_boxes, + groundtruth_classes, + detection_boxes, + detection_scores, + detection_classes) + + _, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP'] + + with self.test_session() as sess: + sess.run(update_op, + feed_dict={ + image_id: ['image1', 'image2', 'image3'], + groundtruth_boxes: np.array([[[100., 100., 200., 200.]], + [[50., 50., 100., 100.]], + [[25., 25., 50., 50.]]]), + groundtruth_classes: np.array([[1], [3], [2]]), + detection_boxes: np.array([[[100., 100., 200., 200.]], + [[50., 50., 100., 100.]], + [[25., 25., 50., 50.]]]), + detection_scores: np.array([[.8], [.7], [.9]]), + detection_classes: np.array([[1], [3], [2]]) + }) + metrics = {} + for key, (value_op, _) in eval_metric_ops.iteritems(): + metrics[key] = value_op + metrics = sess.run(metrics) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'], + -1.0) + 
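+      # -1 marks an empty area bucket: image2 uses class 3, which is not in
+      # category_list, so its medium-sized box is dropped at export time.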
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'], + -1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0) + self.assertFalse(coco_evaluator._groundtruth_list) + self.assertFalse(coco_evaluator._detection_boxes_list) + self.assertFalse(coco_evaluator._image_ids) + + def testGetOneMAPWithMatchingGroundtruthAndDetectionsPaddedBatches(self): + category_list = [{'id': 0, 'name': 'person'}, + {'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}] + coco_evaluator = coco_evaluation.CocoDetectionEvaluator(category_list) + batch_size = 3 + image_id = tf.placeholder(tf.string, shape=(batch_size)) + groundtruth_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4)) + groundtruth_classes = tf.placeholder(tf.float32, shape=(batch_size, None)) + num_gt_boxes_per_image = tf.placeholder(tf.int32, shape=(None)) + detection_boxes = tf.placeholder(tf.float32, shape=(batch_size, None, 4)) + detection_scores = tf.placeholder(tf.float32, shape=(batch_size, None)) + detection_classes = tf.placeholder(tf.float32, shape=(batch_size, None)) + num_det_boxes_per_image = tf.placeholder(tf.int32, shape=(None)) + + eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops( + image_id, groundtruth_boxes, + groundtruth_classes, + detection_boxes, + detection_scores, + detection_classes, + num_gt_boxes_per_image=num_gt_boxes_per_image, + num_det_boxes_per_image=num_det_boxes_per_image) + + _, update_op = eval_metric_ops['DetectionBoxes_Precision/mAP'] + + with self.test_session() as sess: + sess.run(update_op, + feed_dict={ + image_id: ['image1', 'image2', 'image3'], + groundtruth_boxes: np.array([[[100., 100., 200., 200.], + [-1, -1, -1, -1]], + [[50., 50., 100., 100.], + [-1, -1, -1, -1]], + [[25., 25., 50., 50.], + [10., 10., 15., 15.]]]), + groundtruth_classes: np.array([[1, -1], [3, -1], [2, 2]]), + num_gt_boxes_per_image: np.array([1, 1, 2]), + detection_boxes: np.array([[[100., 100., 200., 200.], + [0., 0., 0., 0.]], + [[50., 50., 100., 100.], + [0., 0., 0., 0.]], + [[25., 25., 50., 50.], + [10., 10., 15., 15.]]]), + detection_scores: np.array([[.8, 0.], [.7, 0.], [.95, .9]]), + detection_classes: np.array([[1, -1], [3, -1], [2, 2]]), + num_det_boxes_per_image: np.array([1, 1, 2]), + }) + metrics = {} + for key, (value_op, _) in eval_metric_ops.iteritems(): + metrics[key] = value_op + metrics = sess.run(metrics) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.50IOU'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP@.75IOU'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (large)'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (medium)'], + -1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP (small)'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@1'], 0.75) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@10'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100'], 1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (large)'], 1.0) + 
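+      # num_gt_boxes_per_image / num_det_boxes_per_image tell the evaluator
+      # how many valid rows each batch element contains, so the padded
+      # entries are ignored when the annotations are exported.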
self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (medium)'], + -1.0) + self.assertAlmostEqual(metrics['DetectionBoxes_Recall/AR@100 (small)'], 1.0) + self.assertFalse(coco_evaluator._groundtruth_list) + self.assertFalse(coco_evaluator._detection_boxes_list) + self.assertFalse(coco_evaluator._image_ids) + + +class CocoMaskEvaluationTest(tf.test.TestCase): + + def testGetOneMAPWithMatchingGroundtruthAndDetections(self): + category_list = [{'id': 0, 'name': 'person'}, + {'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}] + coco_evaluator = coco_evaluation.CocoMaskEvaluator(category_list) + coco_evaluator.add_single_ground_truth_image_info( + image_id='image1', + groundtruth_dict={ + standard_fields.InputDataFields.groundtruth_boxes: + np.array([[100., 100., 200., 200.]]), + standard_fields.InputDataFields.groundtruth_classes: np.array([1]), + standard_fields.InputDataFields.groundtruth_instance_masks: + np.pad(np.ones([1, 100, 100], dtype=np.uint8), + ((0, 0), (10, 10), (10, 10)), mode='constant') + }) + coco_evaluator.add_single_detected_image_info( + image_id='image1', + detections_dict={ + standard_fields.DetectionResultFields.detection_boxes: + np.array([[100., 100., 200., 200.]]), + standard_fields.DetectionResultFields.detection_scores: + np.array([.8]), + standard_fields.DetectionResultFields.detection_classes: + np.array([1]), + standard_fields.DetectionResultFields.detection_masks: + np.pad(np.ones([1, 100, 100], dtype=np.uint8), + ((0, 0), (10, 10), (10, 10)), mode='constant') + }) + coco_evaluator.add_single_ground_truth_image_info( + image_id='image2', + groundtruth_dict={ + standard_fields.InputDataFields.groundtruth_boxes: + np.array([[50., 50., 100., 100.]]), + standard_fields.InputDataFields.groundtruth_classes: np.array([1]), + standard_fields.InputDataFields.groundtruth_instance_masks: + np.pad(np.ones([1, 50, 50], dtype=np.uint8), + ((0, 0), (10, 10), (10, 10)), mode='constant') + }) + coco_evaluator.add_single_detected_image_info( + image_id='image2', + detections_dict={ + standard_fields.DetectionResultFields.detection_boxes: + np.array([[50., 50., 100., 100.]]), + standard_fields.DetectionResultFields.detection_scores: + np.array([.8]), + standard_fields.DetectionResultFields.detection_classes: + np.array([1]), + standard_fields.DetectionResultFields.detection_masks: + np.pad(np.ones([1, 50, 50], dtype=np.uint8), + ((0, 0), (10, 10), (10, 10)), mode='constant') + }) + coco_evaluator.add_single_ground_truth_image_info( + image_id='image3', + groundtruth_dict={ + standard_fields.InputDataFields.groundtruth_boxes: + np.array([[25., 25., 50., 50.]]), + standard_fields.InputDataFields.groundtruth_classes: np.array([1]), + standard_fields.InputDataFields.groundtruth_instance_masks: + np.pad(np.ones([1, 25, 25], dtype=np.uint8), + ((0, 0), (10, 10), (10, 10)), mode='constant') + }) + coco_evaluator.add_single_detected_image_info( + image_id='image3', + detections_dict={ + standard_fields.DetectionResultFields.detection_boxes: + np.array([[25., 25., 50., 50.]]), + standard_fields.DetectionResultFields.detection_scores: + np.array([.8]), + standard_fields.DetectionResultFields.detection_classes: + np.array([1]), + standard_fields.DetectionResultFields.detection_masks: + np.pad(np.ones([1, 25, 25], dtype=np.uint8), + ((0, 0), (10, 10), (10, 10)), mode='constant') + }) + metrics = coco_evaluator.evaluate() + self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP'], 1.0) + coco_evaluator.clear() + self.assertFalse(coco_evaluator._image_id_to_mask_shape_map) + 
self.assertFalse(coco_evaluator._image_ids_with_detections) + self.assertFalse(coco_evaluator._groundtruth_list) + self.assertFalse(coco_evaluator._detection_masks_list) + + +class CocoMaskEvaluationPyFuncTest(tf.test.TestCase): + + def testGetOneMAPWithMatchingGroundtruthAndDetections(self): + category_list = [{'id': 0, 'name': 'person'}, + {'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}] + coco_evaluator = coco_evaluation.CocoMaskEvaluator(category_list) + image_id = tf.placeholder(tf.string, shape=()) + groundtruth_boxes = tf.placeholder(tf.float32, shape=(None, 4)) + groundtruth_classes = tf.placeholder(tf.float32, shape=(None)) + groundtruth_masks = tf.placeholder(tf.uint8, shape=(None, None, None)) + detection_scores = tf.placeholder(tf.float32, shape=(None)) + detection_classes = tf.placeholder(tf.float32, shape=(None)) + detection_masks = tf.placeholder(tf.uint8, shape=(None, None, None)) + + eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops( + image_id, groundtruth_boxes, + groundtruth_classes, + groundtruth_masks, + detection_scores, + detection_classes, + detection_masks) + + _, update_op = eval_metric_ops['DetectionMasks_Precision/mAP'] + + with self.test_session() as sess: + sess.run(update_op, + feed_dict={ + image_id: 'image1', + groundtruth_boxes: np.array([[100., 100., 200., 200.]]), + groundtruth_classes: np.array([1]), + groundtruth_masks: np.pad(np.ones([1, 100, 100], + dtype=np.uint8), + ((0, 0), (10, 10), (10, 10)), + mode='constant'), + detection_scores: np.array([.8]), + detection_classes: np.array([1]), + detection_masks: np.pad(np.ones([1, 100, 100], + dtype=np.uint8), + ((0, 0), (10, 10), (10, 10)), + mode='constant') + }) + sess.run(update_op, + feed_dict={ + image_id: 'image2', + groundtruth_boxes: np.array([[50., 50., 100., 100.]]), + groundtruth_classes: np.array([1]), + groundtruth_masks: np.pad(np.ones([1, 50, 50], + dtype=np.uint8), + ((0, 0), (10, 10), (10, 10)), + mode='constant'), + detection_scores: np.array([.8]), + detection_classes: np.array([1]), + detection_masks: np.pad(np.ones([1, 50, 50], dtype=np.uint8), + ((0, 0), (10, 10), (10, 10)), + mode='constant') + }) + sess.run(update_op, + feed_dict={ + image_id: 'image3', + groundtruth_boxes: np.array([[25., 25., 50., 50.]]), + groundtruth_classes: np.array([1]), + groundtruth_masks: np.pad(np.ones([1, 25, 25], + dtype=np.uint8), + ((0, 0), (10, 10), (10, 10)), + mode='constant'), + detection_scores: np.array([.8]), + detection_classes: np.array([1]), + detection_masks: np.pad(np.ones([1, 25, 25], + dtype=np.uint8), + ((0, 0), (10, 10), (10, 10)), + mode='constant') + }) + metrics = {} + for key, (value_op, _) in eval_metric_ops.iteritems(): + metrics[key] = value_op + metrics = sess.run(metrics) + self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP'], 1.0) + self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP@.50IOU'], 1.0) + self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP@.75IOU'], 1.0) + self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP (large)'], 1.0) + self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP (medium)'], + 1.0) + self.assertAlmostEqual(metrics['DetectionMasks_Precision/mAP (small)'], 1.0) + self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@1'], 1.0) + self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@10'], 1.0) + self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@100'], 1.0) + self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@100 (large)'], 1.0) + 
self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@100 (medium)'], + 1.0) + self.assertAlmostEqual(metrics['DetectionMasks_Recall/AR@100 (small)'], 1.0) + self.assertFalse(coco_evaluator._groundtruth_list) + self.assertFalse(coco_evaluator._image_ids_with_detections) + self.assertFalse(coco_evaluator._image_id_to_mask_shape_map) + self.assertFalse(coco_evaluator._detection_masks_list) + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/coco_tools.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/coco_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..71b747bcbde1ae6f39e92d2401079172b3b9a69b --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/coco_tools.py @@ -0,0 +1,850 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Wrappers for third party pycocotools to be used within object_detection. + +Note that nothing in this file is tensorflow related and thus cannot +be called directly as a slim metric, for example. + +TODO(jonathanhuang): wrap as a slim metric in metrics.py + + +Usage example: given a set of images with ids in the list image_ids +and corresponding lists of numpy arrays encoding groundtruth (boxes and classes) +and detections (boxes, scores and classes), where elements of each list +correspond to detections/annotations of a single image, +then evaluation (in multi-class mode) can be invoked as follows: + + groundtruth_dict = coco_tools.ExportGroundtruthToCOCO( + image_ids, groundtruth_boxes_list, groundtruth_classes_list, + max_num_classes, output_path=None) + detections_list = coco_tools.ExportDetectionsToCOCO( + image_ids, detection_boxes_list, detection_scores_list, + detection_classes_list, output_path=None) + groundtruth = coco_tools.COCOWrapper(groundtruth_dict) + detections = groundtruth.LoadAnnotations(detections_list) + evaluator = coco_tools.COCOEvalWrapper(groundtruth, detections, + agnostic_mode=False) + metrics = evaluator.ComputeMetrics() + +""" +from collections import OrderedDict +import copy +import time +import numpy as np + +from pycocotools import coco +from pycocotools import cocoeval +from pycocotools import mask + +import tensorflow as tf + +from object_detection.utils import json_utils + + +class COCOWrapper(coco.COCO): + """Wrapper for the pycocotools COCO class.""" + + def __init__(self, dataset, detection_type='bbox'): + """COCOWrapper constructor. + + See http://mscoco.org/dataset/#format for a description of the format. + By default, the coco.COCO class constructor reads from a JSON file. + This function duplicates the same behavior but loads from a dictionary, + allowing us to perform evaluation without writing to external storage. + + Args: + dataset: a dictionary holding bounding box annotations in the COCO format. 
+ detection_type: type of detections being wrapped. Can be one of ['bbox', + 'segmentation'] + + Raises: + ValueError: if detection_type is unsupported. + """ + supported_detection_types = ['bbox', 'segmentation'] + if detection_type not in supported_detection_types: + raise ValueError('Unsupported detection type: {}. ' + 'Supported values are: {}'.format( + detection_type, supported_detection_types)) + self._detection_type = detection_type + coco.COCO.__init__(self) + self.dataset = dataset + self.createIndex() + + def LoadAnnotations(self, annotations): + """Load annotations dictionary into COCO datastructure. + + See http://mscoco.org/dataset/#format for a description of the annotations + format. As above, this function replicates the default behavior of the API + but does not require writing to external storage. + + Args: + annotations: python list holding object detection results where each + detection is encoded as a dict with required keys ['image_id', + 'category_id', 'score'] and one of ['bbox', 'segmentation'] based on + `detection_type`. + + Returns: + a coco.COCO datastructure holding object detection annotations results + + Raises: + ValueError: if annotations is not a list + ValueError: if annotations do not correspond to the images contained + in self. + """ + results = coco.COCO() + results.dataset['images'] = [img for img in self.dataset['images']] + + tf.logging.info('Loading and preparing annotation results...') + tic = time.time() + + if not isinstance(annotations, list): + raise ValueError('annotations is not a list of objects') + annotation_img_ids = [ann['image_id'] for ann in annotations] + if (set(annotation_img_ids) != (set(annotation_img_ids) + & set(self.getImgIds()))): + raise ValueError('Results do not correspond to current coco set') + results.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + if self._detection_type == 'bbox': + for idx, ann in enumerate(annotations): + bb = ann['bbox'] + ann['area'] = bb[2] * bb[3] + ann['id'] = idx + 1 + ann['iscrowd'] = 0 + elif self._detection_type == 'segmentation': + for idx, ann in enumerate(annotations): + ann['area'] = mask.area(ann['segmentation']) + ann['bbox'] = mask.toBbox(ann['segmentation']) + ann['id'] = idx + 1 + ann['iscrowd'] = 0 + tf.logging.info('DONE (t=%0.2fs)', (time.time() - tic)) + + results.dataset['annotations'] = annotations + results.createIndex() + return results + + +class COCOEvalWrapper(cocoeval.COCOeval): + """Wrapper for the pycocotools COCOeval class. + + To evaluate, create two objects (groundtruth_dict and detections_list) + using the conventions listed at http://mscoco.org/dataset/#format. + Then call evaluation as follows: + + groundtruth = coco_tools.COCOWrapper(groundtruth_dict) + detections = groundtruth.LoadAnnotations(detections_list) + evaluator = coco_tools.COCOEvalWrapper(groundtruth, detections, + agnostic_mode=False) + + metrics = evaluator.ComputeMetrics() + """ + + def __init__(self, groundtruth=None, detections=None, agnostic_mode=False, + iou_type='bbox'): + """COCOEvalWrapper constructor. + + Note that for the area-based metrics to be meaningful, detection and + groundtruth boxes must be in image coordinates measured in pixels. + + Args: + groundtruth: a coco.COCO (or coco_tools.COCOWrapper) object holding + groundtruth annotations + detections: a coco.COCO (or coco_tools.COCOWrapper) object holding + detections + agnostic_mode: boolean (default: False). If True, evaluation ignores + class labels, treating all detections as proposals. 
+ iou_type: IOU type to use for evaluation. Supports `bbox` or `segm`. + """ + cocoeval.COCOeval.__init__(self, groundtruth, detections, + iouType=iou_type) + if agnostic_mode: + self.params.useCats = 0 + + def GetCategory(self, category_id): + """Fetches dictionary holding category information given category id. + + Args: + category_id: integer id + Returns: + dictionary holding 'id', 'name'. + """ + return self.cocoGt.cats[category_id] + + def GetAgnosticMode(self): + """Returns true if COCO Eval is configured to evaluate in agnostic mode.""" + return self.params.useCats == 0 + + def GetCategoryIdList(self): + """Returns list of valid category ids.""" + return self.params.catIds + + def ComputeMetrics(self, + include_metrics_per_category=False, + all_metrics_per_category=False): + """Computes detection metrics. + + Args: + include_metrics_per_category: If True, will include metrics per category. + all_metrics_per_category: If true, include all the summery metrics for + each category in per_category_ap. Be careful with setting it to true if + you have more than handful of categories, because it will pollute + your mldash. + + Returns: + 1. summary_metrics: a dictionary holding: + 'Precision/mAP': mean average precision over classes averaged over IOU + thresholds ranging from .5 to .95 with .05 increments + 'Precision/mAP@.50IOU': mean average precision at 50% IOU + 'Precision/mAP@.75IOU': mean average precision at 75% IOU + 'Precision/mAP (small)': mean average precision for small objects + (area < 32^2 pixels) + 'Precision/mAP (medium)': mean average precision for medium sized + objects (32^2 pixels < area < 96^2 pixels) + 'Precision/mAP (large)': mean average precision for large objects + (96^2 pixels < area < 10000^2 pixels) + 'Recall/AR@1': average recall with 1 detection + 'Recall/AR@10': average recall with 10 detections + 'Recall/AR@100': average recall with 100 detections + 'Recall/AR@100 (small)': average recall for small objects with 100 + detections + 'Recall/AR@100 (medium)': average recall for medium objects with 100 + detections + 'Recall/AR@100 (large)': average recall for large objects with 100 + detections + 2. per_category_ap: a dictionary holding category specific results with + keys of the form: 'Precision mAP ByCategory/category' + (without the supercategory part if no supercategories exist). + For backward compatibility 'PerformanceByCategory' is included in the + output regardless of all_metrics_per_category. + If evaluating class-agnostic mode, per_category_ap is an empty + dictionary. + + Raises: + ValueError: If category_stats does not exist. 
+ """ + self.evaluate() + self.accumulate() + self.summarize() + + summary_metrics = OrderedDict([ + ('Precision/mAP', self.stats[0]), + ('Precision/mAP@.50IOU', self.stats[1]), + ('Precision/mAP@.75IOU', self.stats[2]), + ('Precision/mAP (small)', self.stats[3]), + ('Precision/mAP (medium)', self.stats[4]), + ('Precision/mAP (large)', self.stats[5]), + ('Recall/AR@1', self.stats[6]), + ('Recall/AR@10', self.stats[7]), + ('Recall/AR@100', self.stats[8]), + ('Recall/AR@100 (small)', self.stats[9]), + ('Recall/AR@100 (medium)', self.stats[10]), + ('Recall/AR@100 (large)', self.stats[11]) + ]) + if not include_metrics_per_category: + return summary_metrics, {} + if not hasattr(self, 'category_stats'): + raise ValueError('Category stats do not exist') + per_category_ap = OrderedDict([]) + if self.GetAgnosticMode(): + return summary_metrics, per_category_ap + for category_index, category_id in enumerate(self.GetCategoryIdList()): + category = self.GetCategory(category_id)['name'] + # Kept for backward compatilbility + per_category_ap['PerformanceByCategory/mAP/{}'.format( + category)] = self.category_stats[0][category_index] + if all_metrics_per_category: + per_category_ap['Precision mAP ByCategory/{}'.format( + category)] = self.category_stats[0][category_index] + per_category_ap['Precision mAP@.50IOU ByCategory/{}'.format( + category)] = self.category_stats[1][category_index] + per_category_ap['Precision mAP@.75IOU ByCategory/{}'.format( + category)] = self.category_stats[2][category_index] + per_category_ap['Precision mAP (small) ByCategory/{}'.format( + category)] = self.category_stats[3][category_index] + per_category_ap['Precision mAP (medium) ByCategory/{}'.format( + category)] = self.category_stats[4][category_index] + per_category_ap['Precision mAP (large) ByCategory/{}'.format( + category)] = self.category_stats[5][category_index] + per_category_ap['Recall AR@1 ByCategory/{}'.format( + category)] = self.category_stats[6][category_index] + per_category_ap['Recall AR@10 ByCategory/{}'.format( + category)] = self.category_stats[7][category_index] + per_category_ap['Recall AR@100 ByCategory/{}'.format( + category)] = self.category_stats[8][category_index] + per_category_ap['Recall AR@100 (small) ByCategory/{}'.format( + category)] = self.category_stats[9][category_index] + per_category_ap['Recall AR@100 (medium) ByCategory/{}'.format( + category)] = self.category_stats[10][category_index] + per_category_ap['Recall AR@100 (large) ByCategory/{}'.format( + category)] = self.category_stats[11][category_index] + + return summary_metrics, per_category_ap + + +def _ConvertBoxToCOCOFormat(box): + """Converts a box in [ymin, xmin, ymax, xmax] format to COCO format. + + This is a utility function for converting from our internal + [ymin, xmin, ymax, xmax] convention to the convention used by the COCO API + i.e., [xmin, ymin, width, height]. + + Args: + box: a [ymin, xmin, ymax, xmax] numpy array + + Returns: + a list of floats representing [xmin, ymin, width, height] + """ + return [float(box[1]), float(box[0]), float(box[3] - box[1]), + float(box[2] - box[0])] + + +def _RleCompress(masks): + """Compresses mask using Run-length encoding provided by pycocotools. + + Args: + masks: uint8 numpy array of shape [mask_height, mask_width] with values in + {0, 1}. + + Returns: + A pycocotools Run-length encoding of the mask. 
+ """ + return mask.encode(np.asfortranarray(masks)) + + +def ExportSingleImageGroundtruthToCoco(image_id, + next_annotation_id, + category_id_set, + groundtruth_boxes, + groundtruth_classes, + groundtruth_masks=None, + groundtruth_is_crowd=None): + """Export groundtruth of a single image to COCO format. + + This function converts groundtruth detection annotations represented as numpy + arrays to dictionaries that can be ingested by the COCO evaluation API. Note + that the image_ids provided here must match the ones given to + ExportSingleImageDetectionsToCoco. We assume that boxes and classes are in + correspondence - that is: groundtruth_boxes[i, :], and + groundtruth_classes[i] are associated with the same groundtruth annotation. + + In the exported result, "area" fields are always set to the area of the + groundtruth bounding box. + + Args: + image_id: a unique image identifier either of type integer or string. + next_annotation_id: integer specifying the first id to use for the + groundtruth annotations. All annotations are assigned a continuous integer + id starting from this value. + category_id_set: A set of valid class ids. Groundtruth with classes not in + category_id_set are dropped. + groundtruth_boxes: numpy array (float32) with shape [num_gt_boxes, 4] + groundtruth_classes: numpy array (int) with shape [num_gt_boxes] + groundtruth_masks: optional uint8 numpy array of shape [num_detections, + image_height, image_width] containing detection_masks. + groundtruth_is_crowd: optional numpy array (int) with shape [num_gt_boxes] + indicating whether groundtruth boxes are crowd. + + Returns: + a list of groundtruth annotations for a single image in the COCO format. + + Raises: + ValueError: if (1) groundtruth_boxes and groundtruth_classes do not have the + right lengths or (2) if each of the elements inside these lists do not + have the correct shapes or (3) if image_ids are not integers + """ + + if len(groundtruth_classes.shape) != 1: + raise ValueError('groundtruth_classes is ' + 'expected to be of rank 1.') + if len(groundtruth_boxes.shape) != 2: + raise ValueError('groundtruth_boxes is expected to be of ' + 'rank 2.') + if groundtruth_boxes.shape[1] != 4: + raise ValueError('groundtruth_boxes should have ' + 'shape[1] == 4.') + num_boxes = groundtruth_classes.shape[0] + if num_boxes != groundtruth_boxes.shape[0]: + raise ValueError('Corresponding entries in groundtruth_classes, ' + 'and groundtruth_boxes should have ' + 'compatible shapes (i.e., agree on the 0th dimension).' + 'Classes shape: %d. Boxes shape: %d. 
Image ID: %s' % ( + groundtruth_classes.shape[0], + groundtruth_boxes.shape[0], image_id)) + has_is_crowd = groundtruth_is_crowd is not None + if has_is_crowd and len(groundtruth_is_crowd.shape) != 1: + raise ValueError('groundtruth_is_crowd is expected to be of rank 1.') + groundtruth_list = [] + for i in range(num_boxes): + if groundtruth_classes[i] in category_id_set: + iscrowd = groundtruth_is_crowd[i] if has_is_crowd else 0 + export_dict = { + 'id': + next_annotation_id + i, + 'image_id': + image_id, + 'category_id': + int(groundtruth_classes[i]), + 'bbox': + list(_ConvertBoxToCOCOFormat(groundtruth_boxes[i, :])), + 'area': + float((groundtruth_boxes[i, 2] - groundtruth_boxes[i, 0]) * + (groundtruth_boxes[i, 3] - groundtruth_boxes[i, 1])), + 'iscrowd': + iscrowd + } + if groundtruth_masks is not None: + export_dict['segmentation'] = _RleCompress(groundtruth_masks[i]) + groundtruth_list.append(export_dict) + return groundtruth_list + + +def ExportGroundtruthToCOCO(image_ids, + groundtruth_boxes, + groundtruth_classes, + categories, + output_path=None): + """Export groundtruth detection annotations in numpy arrays to COCO API. + + This function converts a set of groundtruth detection annotations represented + as numpy arrays to dictionaries that can be ingested by the COCO API. + Inputs to this function are three lists: image ids for each groundtruth image, + groundtruth boxes for each image and groundtruth classes respectively. + Note that the image_ids provided here must match the ones given to the + ExportDetectionsToCOCO function in order for evaluation to work properly. + We assume that for each image, boxes, scores and classes are in + correspondence --- that is: image_id[i], groundtruth_boxes[i, :] and + groundtruth_classes[i] are associated with the same groundtruth annotation. + + In the exported result, "area" fields are always set to the area of the + groundtruth bounding box and "iscrowd" fields are always set to 0. + TODO(jonathanhuang): pass in "iscrowd" array for evaluating on COCO dataset. + + Args: + image_ids: a list of unique image identifier either of type integer or + string. + groundtruth_boxes: list of numpy arrays with shape [num_gt_boxes, 4] + (note that num_gt_boxes can be different for each entry in the list) + groundtruth_classes: list of numpy arrays (int) with shape [num_gt_boxes] + (note that num_gt_boxes can be different for each entry in the list) + categories: a list of dictionaries representing all possible categories. + Each dict in this list has the following keys: + 'id': (required) an integer id uniquely identifying this category + 'name': (required) string representing category name + e.g., 'cat', 'dog', 'pizza' + 'supercategory': (optional) string representing the supercategory + e.g., 'animal', 'vehicle', 'food', etc + output_path: (optional) path for exporting result to JSON + Returns: + dictionary that can be read by COCO API + Raises: + ValueError: if (1) groundtruth_boxes and groundtruth_classes do not have the + right lengths or (2) if each of the elements inside these lists do not + have the correct shapes or (3) if image_ids are not integers + """ + category_id_set = set([cat['id'] for cat in categories]) + groundtruth_export_list = [] + image_export_list = [] + if not len(image_ids) == len(groundtruth_boxes) == len(groundtruth_classes): + raise ValueError('Input lists must have the same length') + + # For reasons internal to the COCO API, it is important that annotation ids + # are not equal to zero; we thus start counting from 1. 
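+  # (pycocotools records matches by annotation id and uses 0 to mean
+  # "unmatched", so a genuine annotation id of 0 would be ambiguous.)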
+ annotation_id = 1 + for image_id, boxes, classes in zip(image_ids, groundtruth_boxes, + groundtruth_classes): + image_export_list.append({'id': image_id}) + groundtruth_export_list.extend(ExportSingleImageGroundtruthToCoco( + image_id, + annotation_id, + category_id_set, + boxes, + classes)) + num_boxes = classes.shape[0] + annotation_id += num_boxes + + groundtruth_dict = { + 'annotations': groundtruth_export_list, + 'images': image_export_list, + 'categories': categories + } + if output_path: + with tf.gfile.GFile(output_path, 'w') as fid: + json_utils.Dump(groundtruth_dict, fid, float_digits=4, indent=2) + return groundtruth_dict + + +def ExportSingleImageDetectionBoxesToCoco(image_id, + category_id_set, + detection_boxes, + detection_scores, + detection_classes): + """Export detections of a single image to COCO format. + + This function converts detections represented as numpy arrays to dictionaries + that can be ingested by the COCO evaluation API. Note that the image_ids + provided here must match the ones given to the + ExporSingleImageDetectionBoxesToCoco. We assume that boxes, and classes are in + correspondence - that is: boxes[i, :], and classes[i] + are associated with the same groundtruth annotation. + + Args: + image_id: unique image identifier either of type integer or string. + category_id_set: A set of valid class ids. Detections with classes not in + category_id_set are dropped. + detection_boxes: float numpy array of shape [num_detections, 4] containing + detection boxes. + detection_scores: float numpy array of shape [num_detections] containing + scored for the detection boxes. + detection_classes: integer numpy array of shape [num_detections] containing + the classes for detection boxes. + + Returns: + a list of detection annotations for a single image in the COCO format. + + Raises: + ValueError: if (1) detection_boxes, detection_scores and detection_classes + do not have the right lengths or (2) if each of the elements inside these + lists do not have the correct shapes or (3) if image_ids are not integers. + """ + + if len(detection_classes.shape) != 1 or len(detection_scores.shape) != 1: + raise ValueError('All entries in detection_classes and detection_scores' + 'expected to be of rank 1.') + if len(detection_boxes.shape) != 2: + raise ValueError('All entries in detection_boxes expected to be of ' + 'rank 2.') + if detection_boxes.shape[1] != 4: + raise ValueError('All entries in detection_boxes should have ' + 'shape[1] == 4.') + num_boxes = detection_classes.shape[0] + if not num_boxes == detection_boxes.shape[0] == detection_scores.shape[0]: + raise ValueError('Corresponding entries in detection_classes, ' + 'detection_scores and detection_boxes should have ' + 'compatible shapes (i.e., agree on the 0th dimension). ' + 'Classes shape: %d. Boxes shape: %d. ' + 'Scores shape: %d' % ( + detection_classes.shape[0], detection_boxes.shape[0], + detection_scores.shape[0] + )) + detections_list = [] + for i in range(num_boxes): + if detection_classes[i] in category_id_set: + detections_list.append({ + 'image_id': image_id, + 'category_id': int(detection_classes[i]), + 'bbox': list(_ConvertBoxToCOCOFormat(detection_boxes[i, :])), + 'score': float(detection_scores[i]) + }) + return detections_list + + +def ExportSingleImageDetectionMasksToCoco(image_id, + category_id_set, + detection_masks, + detection_scores, + detection_classes): + """Export detection masks of a single image to COCO format. 
+ + This function converts detections represented as numpy arrays to dictionaries + that can be ingested by the COCO evaluation API. We assume that + detection_masks, detection_scores, and detection_classes are in correspondence + - that is: detection_masks[i, :], detection_classes[i] and detection_scores[i] + are associated with the same annotation. + + Args: + image_id: unique image identifier either of type integer or string. + category_id_set: A set of valid class ids. Detections with classes not in + category_id_set are dropped. + detection_masks: uint8 numpy array of shape [num_detections, image_height, + image_width] containing detection_masks. + detection_scores: float numpy array of shape [num_detections] containing + scores for detection masks. + detection_classes: integer numpy array of shape [num_detections] containing + the classes for detection masks. + + Returns: + a list of detection mask annotations for a single image in the COCO format. + + Raises: + ValueError: if (1) detection_masks, detection_scores and detection_classes + do not have the right lengths or (2) if each of the elements inside these + lists do not have the correct shapes or (3) if image_ids are not integers. + """ + + if len(detection_classes.shape) != 1 or len(detection_scores.shape) != 1: + raise ValueError('All entries in detection_classes and detection_scores' + 'expected to be of rank 1.') + num_boxes = detection_classes.shape[0] + if not num_boxes == len(detection_masks) == detection_scores.shape[0]: + raise ValueError('Corresponding entries in detection_classes, ' + 'detection_scores and detection_masks should have ' + 'compatible lengths and shapes ' + 'Classes length: %d. Masks length: %d. ' + 'Scores length: %d' % ( + detection_classes.shape[0], len(detection_masks), + detection_scores.shape[0] + )) + detections_list = [] + for i in range(num_boxes): + if detection_classes[i] in category_id_set: + detections_list.append({ + 'image_id': image_id, + 'category_id': int(detection_classes[i]), + 'segmentation': _RleCompress(detection_masks[i]), + 'score': float(detection_scores[i]) + }) + return detections_list + + +def ExportDetectionsToCOCO(image_ids, + detection_boxes, + detection_scores, + detection_classes, + categories, + output_path=None): + """Export detection annotations in numpy arrays to COCO API. + + This function converts a set of predicted detections represented + as numpy arrays to dictionaries that can be ingested by the COCO API. + Inputs to this function are lists, consisting of boxes, scores and + classes, respectively, corresponding to each image for which detections + have been produced. Note that the image_ids provided here must + match the ones given to the ExportGroundtruthToCOCO function in order + for evaluation to work properly. + + We assume that for each image, boxes, scores and classes are in + correspondence --- that is: detection_boxes[i, :], detection_scores[i] and + detection_classes[i] are associated with the same detection. + + Args: + image_ids: a list of unique image identifier either of type integer or + string. + detection_boxes: list of numpy arrays with shape [num_detection_boxes, 4] + detection_scores: list of numpy arrays (float) with shape + [num_detection_boxes]. Note that num_detection_boxes can be different + for each entry in the list. + detection_classes: list of numpy arrays (int) with shape + [num_detection_boxes]. Note that num_detection_boxes can be different + for each entry in the list. 
+ categories: a list of dictionaries representing all possible categories. + Each dict in this list must have an integer 'id' key uniquely identifying + this category. + output_path: (optional) path for exporting result to JSON + + Returns: + list of dictionaries that can be read by COCO API, where each entry + corresponds to a single detection and has keys from: + ['image_id', 'category_id', 'bbox', 'score']. + Raises: + ValueError: if (1) detection_boxes and detection_classes do not have the + right lengths or (2) if each of the elements inside these lists do not + have the correct shapes or (3) if image_ids are not integers. + """ + category_id_set = set([cat['id'] for cat in categories]) + detections_export_list = [] + if not (len(image_ids) == len(detection_boxes) == len(detection_scores) == + len(detection_classes)): + raise ValueError('Input lists must have the same length') + for image_id, boxes, scores, classes in zip(image_ids, detection_boxes, + detection_scores, + detection_classes): + detections_export_list.extend(ExportSingleImageDetectionBoxesToCoco( + image_id, + category_id_set, + boxes, + scores, + classes)) + if output_path: + with tf.gfile.GFile(output_path, 'w') as fid: + json_utils.Dump(detections_export_list, fid, float_digits=4, indent=2) + return detections_export_list + + +def ExportSegmentsToCOCO(image_ids, + detection_masks, + detection_scores, + detection_classes, + categories, + output_path=None): + """Export segmentation masks in numpy arrays to COCO API. + + This function converts a set of predicted instance masks represented + as numpy arrays to dictionaries that can be ingested by the COCO API. + Inputs to this function are lists, consisting of segments, scores and + classes, respectively, corresponding to each image for which detections + have been produced. + + Note this function is recommended to use for small dataset. + For large dataset, it should be used with a merge function + (e.g. in map reduce), otherwise the memory consumption is large. + + We assume that for each image, masks, scores and classes are in + correspondence --- that is: detection_masks[i, :, :, :], detection_scores[i] + and detection_classes[i] are associated with the same detection. + + Args: + image_ids: list of image ids (typically ints or strings) + detection_masks: list of numpy arrays with shape [num_detection, h, w, 1] + and type uint8. The height and width should match the shape of + corresponding image. + detection_scores: list of numpy arrays (float) with shape + [num_detection]. Note that num_detection can be different + for each entry in the list. + detection_classes: list of numpy arrays (int) with shape + [num_detection]. Note that num_detection can be different + for each entry in the list. + categories: a list of dictionaries representing all possible categories. + Each dict in this list must have an integer 'id' key uniquely identifying + this category. + output_path: (optional) path for exporting result to JSON + + Returns: + list of dictionaries that can be read by COCO API, where each entry + corresponds to a single detection and has keys from: + ['image_id', 'category_id', 'segmentation', 'score']. + + Raises: + ValueError: if detection_masks and detection_classes do not have the + right lengths or if each of the elements inside these lists do not + have the correct shapes. 
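For reference, a minimal sketch of exporting a whole set of detections with ExportDetectionsToCOCO above; the ids, arrays and output path are hypothetical:

import numpy as np
from object_detection.metrics import coco_tools

categories = [{'id': 1, 'name': 'cat'}, {'id': 2, 'name': 'dog'}]
# Image ids must match the ones passed to ExportGroundtruthToCOCO.
image_ids = ['img_0', 'img_1']
# One boxes/scores/classes array per image; lengths may differ between images.
detection_boxes = [np.array([[10., 10., 50., 60.]], dtype=np.float32),
                   np.array([[5., 5., 30., 40.]], dtype=np.float32)]
detection_scores = [np.array([0.9], dtype=np.float32),
                    np.array([0.6], dtype=np.float32)]
detection_classes = [np.array([1], dtype=np.int32),
                     np.array([2], dtype=np.int32)]

detections_list = coco_tools.ExportDetectionsToCOCO(
    image_ids, detection_boxes, detection_scores, detection_classes,
    categories, output_path='/tmp/detections.json')  # output_path is optional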
+ """ + if not (len(image_ids) == len(detection_masks) == len(detection_scores) == + len(detection_classes)): + raise ValueError('Input lists must have the same length') + + segment_export_list = [] + for image_id, masks, scores, classes in zip(image_ids, detection_masks, + detection_scores, + detection_classes): + + if len(classes.shape) != 1 or len(scores.shape) != 1: + raise ValueError('All entries in detection_classes and detection_scores' + 'expected to be of rank 1.') + if len(masks.shape) != 4: + raise ValueError('All entries in masks expected to be of ' + 'rank 4. Given {}'.format(masks.shape)) + + num_boxes = classes.shape[0] + if not num_boxes == masks.shape[0] == scores.shape[0]: + raise ValueError('Corresponding entries in segment_classes, ' + 'detection_scores and detection_boxes should have ' + 'compatible shapes (i.e., agree on the 0th dimension).') + + category_id_set = set([cat['id'] for cat in categories]) + segment_export_list.extend(ExportSingleImageDetectionMasksToCoco( + image_id, category_id_set, np.squeeze(masks, axis=3), scores, classes)) + + if output_path: + with tf.gfile.GFile(output_path, 'w') as fid: + json_utils.Dump(segment_export_list, fid, float_digits=4, indent=2) + return segment_export_list + + +def ExportKeypointsToCOCO(image_ids, + detection_keypoints, + detection_scores, + detection_classes, + categories, + output_path=None): + """Exports keypoints in numpy arrays to COCO API. + + This function converts a set of predicted keypoints represented + as numpy arrays to dictionaries that can be ingested by the COCO API. + Inputs to this function are lists, consisting of keypoints, scores and + classes, respectively, corresponding to each image for which detections + have been produced. + + We assume that for each image, keypoints, scores and classes are in + correspondence --- that is: detection_keypoints[i, :, :, :], + detection_scores[i] and detection_classes[i] are associated with the same + detection. + + Args: + image_ids: list of image ids (typically ints or strings) + detection_keypoints: list of numpy arrays with shape + [num_detection, num_keypoints, 2] and type float32 in absolute + x-y coordinates. + detection_scores: list of numpy arrays (float) with shape + [num_detection]. Note that num_detection can be different + for each entry in the list. + detection_classes: list of numpy arrays (int) with shape + [num_detection]. Note that num_detection can be different + for each entry in the list. + categories: a list of dictionaries representing all possible categories. + Each dict in this list must have an integer 'id' key uniquely identifying + this category and an integer 'num_keypoints' key specifying the number of + keypoints the category has. + output_path: (optional) path for exporting result to JSON + + Returns: + list of dictionaries that can be read by COCO API, where each entry + corresponds to a single detection and has keys from: + ['image_id', 'category_id', 'keypoints', 'score']. + + Raises: + ValueError: if detection_keypoints and detection_classes do not have the + right lengths or if each of the elements inside these lists do not + have the correct shapes. 
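Similarly, a small sketch of ExportSegmentsToCOCO above; the mask contents and ids are made up:

import numpy as np
from object_detection.metrics import coco_tools

categories = [{'id': 1, 'name': 'cat'}]
image_ids = ['img_0']
# One uint8 array of shape [num_detections, height, width, 1] per image.
masks = [np.zeros((1, 4, 4, 1), dtype=np.uint8)]
masks[0][0, 1:3, 1:3, 0] = 1        # a small square instance mask
scores = [np.array([0.8], dtype=np.float32)]
classes = [np.array([1], dtype=np.int32)]

segments = coco_tools.ExportSegmentsToCOCO(
    image_ids, masks, scores, classes, categories)
# The trailing mask axis is squeezed and each mask is RLE-compressed via
# _RleCompress before being written into the 'segmentation' field.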
+ """ + if not (len(image_ids) == len(detection_keypoints) == + len(detection_scores) == len(detection_classes)): + raise ValueError('Input lists must have the same length') + + keypoints_export_list = [] + for image_id, keypoints, scores, classes in zip( + image_ids, detection_keypoints, detection_scores, detection_classes): + + if len(classes.shape) != 1 or len(scores.shape) != 1: + raise ValueError('All entries in detection_classes and detection_scores' + 'expected to be of rank 1.') + if len(keypoints.shape) != 3: + raise ValueError('All entries in keypoints expected to be of ' + 'rank 3. Given {}'.format(keypoints.shape)) + + num_boxes = classes.shape[0] + if not num_boxes == keypoints.shape[0] == scores.shape[0]: + raise ValueError('Corresponding entries in detection_classes, ' + 'detection_keypoints, and detection_scores should have ' + 'compatible shapes (i.e., agree on the 0th dimension).') + + category_id_set = set([cat['id'] for cat in categories]) + category_id_to_num_keypoints_map = { + cat['id']: cat['num_keypoints'] for cat in categories + if 'num_keypoints' in cat} + + for i in range(num_boxes): + if classes[i] not in category_id_set: + raise ValueError('class id should be in category_id_set\n') + + if classes[i] in category_id_to_num_keypoints_map: + num_keypoints = category_id_to_num_keypoints_map[classes[i]] + # Adds extra ones to indicate the visibility for each keypoint as is + # recommended by MSCOCO. + instance_keypoints = np.concatenate( + [keypoints[i, 0:num_keypoints, :], + np.expand_dims(np.ones(num_keypoints), axis=1)], + axis=1).astype(int) + + instance_keypoints = instance_keypoints.flatten().tolist() + keypoints_export_list.append({ + 'image_id': image_id, + 'category_id': int(classes[i]), + 'keypoints': instance_keypoints, + 'score': float(scores[i]) + }) + + if output_path: + with tf.gfile.GFile(output_path, 'w') as fid: + json_utils.Dump(keypoints_export_list, fid, float_digits=4, indent=2) + return keypoints_export_list diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/coco_tools_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/coco_tools_test.py new file mode 100644 index 0000000000000000000000000000000000000000..cfb73d8c332420d93e19029f53e4068c9fc7b23b --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/coco_tools_test.py @@ -0,0 +1,295 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for tensorflow_model.object_detection.metrics.coco_tools.""" +import json +import os +import re +import numpy as np + +from pycocotools import mask + +import tensorflow as tf + +from object_detection.metrics import coco_tools + + +class CocoToolsTest(tf.test.TestCase): + + def setUp(self): + groundtruth_annotations_list = [ + { + 'id': 1, + 'image_id': 'first', + 'category_id': 1, + 'bbox': [100., 100., 100., 100.], + 'area': 100.**2, + 'iscrowd': 0 + }, + { + 'id': 2, + 'image_id': 'second', + 'category_id': 1, + 'bbox': [50., 50., 50., 50.], + 'area': 50.**2, + 'iscrowd': 0 + }, + ] + image_list = [{'id': 'first'}, {'id': 'second'}] + category_list = [{'id': 0, 'name': 'person'}, + {'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}] + self._groundtruth_dict = { + 'annotations': groundtruth_annotations_list, + 'images': image_list, + 'categories': category_list + } + + self._detections_list = [ + { + 'image_id': 'first', + 'category_id': 1, + 'bbox': [100., 100., 100., 100.], + 'score': .8 + }, + { + 'image_id': 'second', + 'category_id': 1, + 'bbox': [50., 50., 50., 50.], + 'score': .7 + }, + ] + + def testCocoWrappers(self): + groundtruth = coco_tools.COCOWrapper(self._groundtruth_dict) + detections = groundtruth.LoadAnnotations(self._detections_list) + evaluator = coco_tools.COCOEvalWrapper(groundtruth, detections) + summary_metrics, _ = evaluator.ComputeMetrics() + self.assertAlmostEqual(1.0, summary_metrics['Precision/mAP']) + + def testExportGroundtruthToCOCO(self): + image_ids = ['first', 'second'] + groundtruth_boxes = [np.array([[100, 100, 200, 200]], np.float), + np.array([[50, 50, 100, 100]], np.float)] + groundtruth_classes = [np.array([1], np.int32), np.array([1], np.int32)] + categories = [{'id': 0, 'name': 'person'}, + {'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}] + output_path = os.path.join(tf.test.get_temp_dir(), 'groundtruth.json') + result = coco_tools.ExportGroundtruthToCOCO( + image_ids, + groundtruth_boxes, + groundtruth_classes, + categories, + output_path=output_path) + self.assertDictEqual(result, self._groundtruth_dict) + with tf.gfile.GFile(output_path, 'r') as f: + written_result = f.read() + # The json output should have floats written to 4 digits of precision. + matcher = re.compile(r'"bbox":\s+\[\n\s+\d+.\d\d\d\d,', re.MULTILINE) + self.assertTrue(matcher.findall(written_result)) + written_result = json.loads(written_result) + self.assertAlmostEqual(result, written_result) + + def testExportDetectionsToCOCO(self): + image_ids = ['first', 'second'] + detections_boxes = [np.array([[100, 100, 200, 200]], np.float), + np.array([[50, 50, 100, 100]], np.float)] + detections_scores = [np.array([.8], np.float), np.array([.7], np.float)] + detections_classes = [np.array([1], np.int32), np.array([1], np.int32)] + categories = [{'id': 0, 'name': 'person'}, + {'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}] + output_path = os.path.join(tf.test.get_temp_dir(), 'detections.json') + result = coco_tools.ExportDetectionsToCOCO( + image_ids, + detections_boxes, + detections_scores, + detections_classes, + categories, + output_path=output_path) + self.assertListEqual(result, self._detections_list) + with tf.gfile.GFile(output_path, 'r') as f: + written_result = f.read() + # The json output should have floats written to 4 digits of precision. 
+ matcher = re.compile(r'"bbox":\s+\[\n\s+\d+.\d\d\d\d,', re.MULTILINE) + self.assertTrue(matcher.findall(written_result)) + written_result = json.loads(written_result) + self.assertAlmostEqual(result, written_result) + + def testExportSegmentsToCOCO(self): + image_ids = ['first', 'second'] + detection_masks = [np.array( + [[[0, 1, 0, 1], [0, 1, 1, 0], [0, 0, 0, 1], [0, 1, 0, 1]]], + dtype=np.uint8), np.array( + [[[0, 1, 0, 1], [0, 1, 1, 0], [0, 0, 0, 1], [0, 1, 0, 1]]], + dtype=np.uint8)] + + for i, detection_mask in enumerate(detection_masks): + detection_masks[i] = detection_mask[:, :, :, None] + + detection_scores = [np.array([.8], np.float), np.array([.7], np.float)] + detection_classes = [np.array([1], np.int32), np.array([1], np.int32)] + + categories = [{'id': 0, 'name': 'person'}, + {'id': 1, 'name': 'cat'}, + {'id': 2, 'name': 'dog'}] + output_path = os.path.join(tf.test.get_temp_dir(), 'segments.json') + result = coco_tools.ExportSegmentsToCOCO( + image_ids, + detection_masks, + detection_scores, + detection_classes, + categories, + output_path=output_path) + with tf.gfile.GFile(output_path, 'r') as f: + written_result = f.read() + written_result = json.loads(written_result) + mask_load = mask.decode([written_result[0]['segmentation']]) + self.assertTrue(np.allclose(mask_load, detection_masks[0])) + self.assertAlmostEqual(result, written_result) + + def testExportKeypointsToCOCO(self): + image_ids = ['first', 'second'] + detection_keypoints = [ + np.array( + [[[100, 200], [300, 400], [500, 600]], + [[50, 150], [250, 350], [450, 550]]], dtype=np.int32), + np.array( + [[[110, 210], [310, 410], [510, 610]], + [[60, 160], [260, 360], [460, 560]]], dtype=np.int32)] + + detection_scores = [np.array([.8, 0.2], np.float), + np.array([.7, 0.3], np.float)] + detection_classes = [np.array([1, 1], np.int32), np.array([1, 1], np.int32)] + + categories = [{'id': 1, 'name': 'person', 'num_keypoints': 3}, + {'id': 2, 'name': 'cat'}, + {'id': 3, 'name': 'dog'}] + + output_path = os.path.join(tf.test.get_temp_dir(), 'keypoints.json') + result = coco_tools.ExportKeypointsToCOCO( + image_ids, + detection_keypoints, + detection_scores, + detection_classes, + categories, + output_path=output_path) + + with tf.gfile.GFile(output_path, 'r') as f: + written_result = f.read() + written_result = json.loads(written_result) + self.assertAlmostEqual(result, written_result) + + def testSingleImageDetectionBoxesExport(self): + boxes = np.array([[0, 0, 1, 1], + [0, 0, .5, .5], + [.5, .5, 1, 1]], dtype=np.float32) + classes = np.array([1, 2, 3], dtype=np.int32) + scores = np.array([0.8, 0.2, 0.7], dtype=np.float32) + coco_boxes = np.array([[0, 0, 1, 1], + [0, 0, .5, .5], + [.5, .5, .5, .5]], dtype=np.float32) + coco_annotations = coco_tools.ExportSingleImageDetectionBoxesToCoco( + image_id='first_image', + category_id_set=set([1, 2, 3]), + detection_boxes=boxes, + detection_classes=classes, + detection_scores=scores) + for i, annotation in enumerate(coco_annotations): + self.assertEqual(annotation['image_id'], 'first_image') + self.assertEqual(annotation['category_id'], classes[i]) + self.assertAlmostEqual(annotation['score'], scores[i]) + self.assertTrue(np.all(np.isclose(annotation['bbox'], coco_boxes[i]))) + + def testSingleImageDetectionMaskExport(self): + masks = np.array( + [[[1, 1,], [1, 1]], + [[0, 0], [0, 1]], + [[0, 0], [0, 0]]], dtype=np.uint8) + classes = np.array([1, 2, 3], dtype=np.int32) + scores = np.array([0.8, 0.2, 0.7], dtype=np.float32) + coco_annotations = 
coco_tools.ExportSingleImageDetectionMasksToCoco( + image_id='first_image', + category_id_set=set([1, 2, 3]), + detection_classes=classes, + detection_scores=scores, + detection_masks=masks) + expected_counts = ['04', '31', '4'] + for i, mask_annotation in enumerate(coco_annotations): + self.assertEqual(mask_annotation['segmentation']['counts'], + expected_counts[i]) + self.assertTrue(np.all(np.equal(mask.decode( + mask_annotation['segmentation']), masks[i]))) + self.assertEqual(mask_annotation['image_id'], 'first_image') + self.assertEqual(mask_annotation['category_id'], classes[i]) + self.assertAlmostEqual(mask_annotation['score'], scores[i]) + + def testSingleImageGroundtruthExport(self): + masks = np.array( + [[[1, 1,], [1, 1]], + [[0, 0], [0, 1]], + [[0, 0], [0, 0]]], dtype=np.uint8) + boxes = np.array([[0, 0, 1, 1], + [0, 0, .5, .5], + [.5, .5, 1, 1]], dtype=np.float32) + coco_boxes = np.array([[0, 0, 1, 1], + [0, 0, .5, .5], + [.5, .5, .5, .5]], dtype=np.float32) + classes = np.array([1, 2, 3], dtype=np.int32) + is_crowd = np.array([0, 1, 0], dtype=np.int32) + next_annotation_id = 1 + expected_counts = ['04', '31', '4'] + + # Tests exporting without passing in is_crowd (for backward compatibility). + coco_annotations = coco_tools.ExportSingleImageGroundtruthToCoco( + image_id='first_image', + category_id_set=set([1, 2, 3]), + next_annotation_id=next_annotation_id, + groundtruth_boxes=boxes, + groundtruth_classes=classes, + groundtruth_masks=masks) + for i, annotation in enumerate(coco_annotations): + self.assertEqual(annotation['segmentation']['counts'], + expected_counts[i]) + self.assertTrue(np.all(np.equal(mask.decode( + annotation['segmentation']), masks[i]))) + self.assertTrue(np.all(np.isclose(annotation['bbox'], coco_boxes[i]))) + self.assertEqual(annotation['image_id'], 'first_image') + self.assertEqual(annotation['category_id'], classes[i]) + self.assertEqual(annotation['id'], i + next_annotation_id) + + # Tests exporting with is_crowd. + coco_annotations = coco_tools.ExportSingleImageGroundtruthToCoco( + image_id='first_image', + category_id_set=set([1, 2, 3]), + next_annotation_id=next_annotation_id, + groundtruth_boxes=boxes, + groundtruth_classes=classes, + groundtruth_masks=masks, + groundtruth_is_crowd=is_crowd) + for i, annotation in enumerate(coco_annotations): + self.assertEqual(annotation['segmentation']['counts'], + expected_counts[i]) + self.assertTrue(np.all(np.equal(mask.decode( + annotation['segmentation']), masks[i]))) + self.assertTrue(np.all(np.isclose(annotation['bbox'], coco_boxes[i]))) + self.assertEqual(annotation['image_id'], 'first_image') + self.assertEqual(annotation['category_id'], classes[i]) + self.assertEqual(annotation['iscrowd'], is_crowd[i]) + self.assertEqual(annotation['id'], i + next_annotation_id) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/offline_eval_map_corloc.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/offline_eval_map_corloc.py new file mode 100644 index 0000000000000000000000000000000000000000..b7b1eb696ee277858430533e56c43290171b9fdd --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/offline_eval_map_corloc.py @@ -0,0 +1,173 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +r"""Evaluation executable for detection data. + +This executable evaluates precomputed detections produced by a detection +model and writes the evaluation results into csv file metrics.csv, stored +in the directory, specified by --eval_dir. + +The evaluation metrics set is supplied in object_detection.protos.EvalConfig +in metrics_set field. +Currently two set of metrics are supported: +- pascal_voc_metrics: standard PASCAL VOC 2007 metric +- open_images_detection_metrics: Open Image V2 metric +All other field of object_detection.protos.EvalConfig are ignored. + +Example usage: + ./compute_metrics \ + --eval_dir=path/to/eval_dir \ + --eval_config_path=path/to/evaluation/configuration/file \ + --input_config_path=path/to/input/configuration/file +""" +import csv +import os +import re +import tensorflow as tf + +from object_detection import evaluator +from object_detection.core import standard_fields +from object_detection.metrics import tf_example_parser +from object_detection.utils import config_util +from object_detection.utils import label_map_util + +flags = tf.app.flags +tf.logging.set_verbosity(tf.logging.INFO) + +flags.DEFINE_string('eval_dir', None, 'Directory to write eval summaries to.') +flags.DEFINE_string('eval_config_path', None, + 'Path to an eval_pb2.EvalConfig config file.') +flags.DEFINE_string('input_config_path', None, + 'Path to an eval_pb2.InputConfig config file.') + +FLAGS = flags.FLAGS + + +def _generate_sharded_filenames(filename): + m = re.search(r'@(\d{1,})', filename) + if m: + num_shards = int(m.group(1)) + return [ + re.sub(r'@(\d{1,})', '-%.5d-of-%.5d' % (i, num_shards), filename) + for i in range(num_shards) + ] + else: + return [filename] + + +def _generate_filenames(filenames): + result = [] + for filename in filenames: + result += _generate_sharded_filenames(filename) + return result + + +def read_data_and_evaluate(input_config, eval_config): + """Reads pre-computed object detections and groundtruth from tf_record. + + Args: + input_config: input config proto of type + object_detection.protos.InputReader. + eval_config: evaluation config proto of type + object_detection.protos.EvalConfig. + + Returns: + Evaluated detections metrics. + + Raises: + ValueError: if input_reader type is not supported or metric type is unknown. 
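As a quick illustration of the sharded-filename helpers above (paths are hypothetical):

from object_detection.metrics import offline_eval_map_corloc as offline_eval

# An '@N' marker in a path is expanded into N shard names of the form
# -0000i-of-0000N.
offline_eval._generate_sharded_filenames('/tmp/detections@3.record')
# -> ['/tmp/detections-00000-of-00003.record',
#     '/tmp/detections-00001-of-00003.record',
#     '/tmp/detections-00002-of-00003.record']

# Filenames without an '@N' marker pass through unchanged.
offline_eval._generate_filenames(['/tmp/detections.record', '/tmp/gt@2'])
# -> ['/tmp/detections.record',
#     '/tmp/gt-00000-of-00002', '/tmp/gt-00001-of-00002']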
+ """ + if input_config.WhichOneof('input_reader') == 'tf_record_input_reader': + input_paths = input_config.tf_record_input_reader.input_path + + label_map = label_map_util.load_labelmap(input_config.label_map_path) + max_num_classes = max([item.id for item in label_map.item]) + categories = label_map_util.convert_label_map_to_categories( + label_map, max_num_classes) + + object_detection_evaluators = evaluator.get_evaluators( + eval_config, categories) + # Support a single evaluator + object_detection_evaluator = object_detection_evaluators[0] + + skipped_images = 0 + processed_images = 0 + for input_path in _generate_filenames(input_paths): + tf.logging.info('Processing file: {0}'.format(input_path)) + + record_iterator = tf.python_io.tf_record_iterator(path=input_path) + data_parser = tf_example_parser.TfExampleDetectionAndGTParser() + + for string_record in record_iterator: + tf.logging.log_every_n(tf.logging.INFO, 'Processed %d images...', 1000, + processed_images) + processed_images += 1 + + example = tf.train.Example() + example.ParseFromString(string_record) + decoded_dict = data_parser.parse(example) + + if decoded_dict: + object_detection_evaluator.add_single_ground_truth_image_info( + decoded_dict[standard_fields.DetectionResultFields.key], + decoded_dict) + object_detection_evaluator.add_single_detected_image_info( + decoded_dict[standard_fields.DetectionResultFields.key], + decoded_dict) + else: + skipped_images += 1 + tf.logging.info('Skipped images: {0}'.format(skipped_images)) + + return object_detection_evaluator.evaluate() + + raise ValueError('Unsupported input_reader_config.') + + +def write_metrics(metrics, output_dir): + """Write metrics to the output directory. + + Args: + metrics: A dictionary containing metric names and values. + output_dir: Directory to write metrics to. + """ + tf.logging.info('Writing metrics.') + + with open(os.path.join(output_dir, 'metrics.csv'), 'w') as csvfile: + metrics_writer = csv.writer(csvfile, delimiter=',') + for metric_name, metric_value in metrics.items(): + metrics_writer.writerow([metric_name, str(metric_value)]) + + +def main(argv): + del argv + required_flags = ['input_config_path', 'eval_config_path', 'eval_dir'] + for flag_name in required_flags: + if not getattr(FLAGS, flag_name): + raise ValueError('Flag --{} is required'.format(flag_name)) + + configs = config_util.get_configs_from_multiple_files( + eval_input_config_path=FLAGS.input_config_path, + eval_config_path=FLAGS.eval_config_path) + + eval_config = configs['eval_config'] + input_config = configs['eval_input_config'] + + metrics = read_data_and_evaluate(input_config, eval_config) + + # Save metrics + write_metrics(metrics, FLAGS.eval_dir) + + +if __name__ == '__main__': + tf.app.run(main) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/offline_eval_map_corloc_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/offline_eval_map_corloc_test.py new file mode 100644 index 0000000000000000000000000000000000000000..68ac3893530afecb6098a00c58811afe89e04554 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/offline_eval_map_corloc_test.py @@ -0,0 +1,58 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for utilities in offline_eval_map_corloc binary.""" + +import tensorflow as tf + +from object_detection.metrics import offline_eval_map_corloc as offline_eval + + +class OfflineEvalMapCorlocTest(tf.test.TestCase): + + def test_generateShardedFilenames(self): + test_filename = '/path/to/file' + result = offline_eval._generate_sharded_filenames(test_filename) + self.assertEqual(result, [test_filename]) + + test_filename = '/path/to/file-00000-of-00050' + result = offline_eval._generate_sharded_filenames(test_filename) + self.assertEqual(result, [test_filename]) + + result = offline_eval._generate_sharded_filenames('/path/to/@3.record') + self.assertEqual(result, [ + '/path/to/-00000-of-00003.record', '/path/to/-00001-of-00003.record', + '/path/to/-00002-of-00003.record' + ]) + + result = offline_eval._generate_sharded_filenames('/path/to/abc@3') + self.assertEqual(result, [ + '/path/to/abc-00000-of-00003', '/path/to/abc-00001-of-00003', + '/path/to/abc-00002-of-00003' + ]) + + result = offline_eval._generate_sharded_filenames('/path/to/@1') + self.assertEqual(result, ['/path/to/-00000-of-00001']) + + def test_generateFilenames(self): + test_filenames = ['/path/to/file', '/path/to/@3.record'] + result = offline_eval._generate_filenames(test_filenames) + self.assertEqual(result, [ + '/path/to/file', '/path/to/-00000-of-00003.record', + '/path/to/-00001-of-00003.record', '/path/to/-00002-of-00003.record' + ]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/oid_vrd_challenge_evaluation.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/oid_vrd_challenge_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..3c8bb54a2d286d641a3629bd1dce269cf7de00a6 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/oid_vrd_challenge_evaluation.py @@ -0,0 +1,151 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +r"""Runs evaluation using OpenImages groundtruth and predictions. 
+ +Example usage: + python third_party/tensorflow_models/object_detection/\ + metrics/oid_vrd_challenge_evaluation.py \ + --input_annotations_boxes=/path/to/input/annotations-human-bbox.csv \ + --input_annotations_labels=/path/to/input/annotations-label.csv \ + --input_class_labelmap=/path/to/input/class_labelmap.pbtxt \ + --input_relationship_labelmap=/path/to/input/relationship_labelmap.pbtxt \ + --input_predictions=/path/to/input/predictions.csv \ + --output_metrics=/path/to/output/metric.csv \ + +CSVs with bounding box annotations and image label (including the image URLs) +can be downloaded from the Open Images Challenge website: +https://storage.googleapis.com/openimages/web/challenge.html +The format of the input csv and the metrics itself are described on the +challenge website. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import pandas as pd +from google.protobuf import text_format + +from object_detection.metrics import oid_vrd_challenge_evaluation_utils as utils +from object_detection.protos import string_int_label_map_pb2 +from object_detection.utils import vrd_evaluation + + +def _load_labelmap(labelmap_path): + """Loads labelmap from the labelmap path. + + Args: + labelmap_path: Path to the labelmap. + + Returns: + A dictionary mapping class name to class numerical id. + """ + + label_map = string_int_label_map_pb2.StringIntLabelMap() + with open(labelmap_path, 'r') as fid: + label_map_string = fid.read() + text_format.Merge(label_map_string, label_map) + labelmap_dict = {} + for item in label_map.item: + labelmap_dict[item.name] = item.id + return labelmap_dict + + +def _swap_labelmap_dict(labelmap_dict): + """Swaps keys and labels in labelmap. + + Args: + labelmap_dict: Input dictionary. + + Returns: + A dictionary mapping class name to class numerical id. 
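For context, a small sketch of the labelmap helpers above; the path and entries are hypothetical, and the file is assumed to be in the usual StringIntLabelMap text format:

from object_detection.metrics import oid_vrd_challenge_evaluation as oid_vrd_eval

# /tmp/class_labelmap.pbtxt is assumed to contain entries such as:
#   item { name: '/m/04bcr3' id: 1 }
#   item { name: '/m/083vt' id: 2 }
labelmap = oid_vrd_eval._load_labelmap('/tmp/class_labelmap.pbtxt')
# {'/m/04bcr3': 1, '/m/083vt': 2}

# _swap_labelmap_dict inverts the mapping (numerical id -> class name); note
# that it relies on dict.iteritems() and is therefore Python 2 only as written.
id_to_name = oid_vrd_eval._swap_labelmap_dict(labelmap)
# {1: '/m/04bcr3', 2: '/m/083vt'}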
+ """ + return dict((v, k) for k, v in labelmap_dict.iteritems()) + + +def main(parsed_args): + all_box_annotations = pd.read_csv(parsed_args.input_annotations_boxes) + all_label_annotations = pd.read_csv(parsed_args.input_annotations_labels) + all_annotations = pd.concat([all_box_annotations, all_label_annotations]) + + class_label_map = _load_labelmap(parsed_args.input_class_labelmap) + relationship_label_map = _load_labelmap( + parsed_args.input_relationship_labelmap) + + relation_evaluator = vrd_evaluation.VRDRelationDetectionEvaluator() + phrase_evaluator = vrd_evaluation.VRDPhraseDetectionEvaluator() + + for _, groundtruth in enumerate(all_annotations.groupby('ImageID')): + image_id, image_groundtruth = groundtruth + groundtruth_dictionary = utils.build_groundtruth_vrd_dictionary( + image_groundtruth, class_label_map, relationship_label_map) + + relation_evaluator.add_single_ground_truth_image_info( + image_id, groundtruth_dictionary) + phrase_evaluator.add_single_ground_truth_image_info(image_id, + groundtruth_dictionary) + + all_predictions = pd.read_csv(parsed_args.input_predictions) + for _, prediction_data in enumerate(all_predictions.groupby('ImageID')): + image_id, image_predictions = prediction_data + prediction_dictionary = utils.build_predictions_vrd_dictionary( + image_predictions, class_label_map, relationship_label_map) + + relation_evaluator.add_single_detected_image_info(image_id, + prediction_dictionary) + phrase_evaluator.add_single_detected_image_info(image_id, + prediction_dictionary) + + relation_metrics = relation_evaluator.evaluate() + phrase_metrics = phrase_evaluator.evaluate() + + with open(parsed_args.output_metrics, 'w') as fid: + utils.write_csv(fid, relation_metrics) + utils.write_csv(fid, phrase_metrics) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description= + 'Evaluate Open Images Visual Relationship Detection predictions.') + parser.add_argument( + '--input_annotations_boxes', + required=True, + help='File with groundtruth vrd annotations.') + parser.add_argument( + '--input_annotations_labels', + required=True, + help='File with groundtruth labels annotations') + parser.add_argument( + '--input_predictions', + required=True, + help="""File with detection predictions; NOTE: no postprocessing is + applied in the evaluation script.""") + parser.add_argument( + '--input_class_labelmap', + required=True, + help="""OpenImages Challenge labelmap; note: it is expected to include + attributes.""") + parser.add_argument( + '--input_relationship_labelmap', + required=True, + help="""OpenImages Challenge relationship labelmap.""") + parser.add_argument( + '--output_metrics', required=True, help='Output file with csv metrics') + + args = parser.parse_args() + main(args) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/oid_vrd_challenge_evaluation_utils.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/oid_vrd_challenge_evaluation_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8c834775890df314cd09120d2003b34c66cdac94 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/oid_vrd_challenge_evaluation_utils.py @@ -0,0 +1,133 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +r"""Converts data from CSV format to the VRDDetectionEvaluator format.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import csv +import numpy as np +from object_detection.core import standard_fields +from object_detection.utils import vrd_evaluation + + +def build_groundtruth_vrd_dictionary(data, class_label_map, + relationship_label_map): + """Builds a groundtruth dictionary from groundtruth data in CSV file. + + Args: + data: Pandas DataFrame with the groundtruth data for a single image. + class_label_map: Class labelmap from string label name to an integer. + relationship_label_map: Relationship type labelmap from string name to an + integer. + + Returns: + A dictionary with keys suitable for passing to + VRDDetectionEvaluator.add_single_ground_truth_image_info: + standard_fields.InputDataFields.groundtruth_boxes: A numpy array + of structures with the shape [M, 1], representing M tuples, each tuple + containing the same number of named bounding boxes. + Each box is of the format [y_min, x_min, y_max, x_max] (see + datatype vrd_box_data_type, single_box_data_type above). + standard_fields.InputDataFields.groundtruth_classes: A numpy array of + structures shape [M, 1], representing the class labels of the + corresponding bounding boxes and possibly additional classes (see + datatype label_data_type above). + standard_fields.InputDataFields.verified_labels: numpy array + of shape [K] containing verified labels. + """ + data_boxes = data[data.LabelName.isnull()] + data_labels = data[data.LabelName1.isnull()] + + boxes = np.zeros(data_boxes.shape[0], dtype=vrd_evaluation.vrd_box_data_type) + boxes['subject'] = data_boxes[['YMin1', 'XMin1', 'YMax1', + 'XMax1']].as_matrix() + boxes['object'] = data_boxes[['YMin2', 'XMin2', 'YMax2', 'XMax2']].as_matrix() + + labels = np.zeros(data_boxes.shape[0], dtype=vrd_evaluation.label_data_type) + labels['subject'] = data_boxes['LabelName1'].map(lambda x: class_label_map[x]) + labels['object'] = data_boxes['LabelName2'].map(lambda x: class_label_map[x]) + labels['relation'] = data_boxes['RelationshipLabel'].map( + lambda x: relationship_label_map[x]) + + return { + standard_fields.InputDataFields.groundtruth_boxes: + boxes, + standard_fields.InputDataFields.groundtruth_classes: + labels, + standard_fields.InputDataFields.verified_labels: + data_labels['LabelName'].map(lambda x: class_label_map[x]), + } + + +def build_predictions_vrd_dictionary(data, class_label_map, + relationship_label_map): + """Builds a predictions dictionary from predictions data in CSV file. + + Args: + data: Pandas DataFrame with the predictions data for a single image. + class_label_map: Class labelmap from string label name to an integer. + relationship_label_map: Relationship type labelmap from string name to an + integer. 
+ + Returns: + Dictionary with keys suitable for passing to + VRDDetectionEvaluator.add_single_detected_image_info: + standard_fields.DetectionResultFields.detection_boxes: A numpy array of + structures with shape [N, 1], representing N tuples, each tuple + containing the same number of named bounding boxes. + Each box is of the format [y_min, x_min, y_max, x_max] (as an example + see datatype vrd_box_data_type, single_box_data_type above). + standard_fields.DetectionResultFields.detection_scores: float32 numpy + array of shape [N] containing detection scores for the boxes. + standard_fields.DetectionResultFields.detection_classes: A numpy array + of structures shape [N, 1], representing the class labels of the + corresponding bounding boxes and possibly additional classes (see + datatype label_data_type above). + """ + data_boxes = data + + boxes = np.zeros(data_boxes.shape[0], dtype=vrd_evaluation.vrd_box_data_type) + boxes['subject'] = data_boxes[['YMin1', 'XMin1', 'YMax1', + 'XMax1']].as_matrix() + boxes['object'] = data_boxes[['YMin2', 'XMin2', 'YMax2', 'XMax2']].as_matrix() + + labels = np.zeros(data_boxes.shape[0], dtype=vrd_evaluation.label_data_type) + labels['subject'] = data_boxes['LabelName1'].map(lambda x: class_label_map[x]) + labels['object'] = data_boxes['LabelName2'].map(lambda x: class_label_map[x]) + labels['relation'] = data_boxes['RelationshipLabel'].map( + lambda x: relationship_label_map[x]) + + return { + standard_fields.DetectionResultFields.detection_boxes: + boxes, + standard_fields.DetectionResultFields.detection_classes: + labels, + standard_fields.DetectionResultFields.detection_scores: + data_boxes['Score'].as_matrix() + } + + +def write_csv(fid, metrics): + """Writes metrics key-value pairs to CSV file. + + Args: + fid: File identifier of an opened file. + metrics: A dictionary with metrics to be written. + """ + metrics_writer = csv.writer(fid, delimiter=',') + for metric_name, metric_value in metrics.items(): + metrics_writer.writerow([metric_name, str(metric_value)]) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/oid_vrd_challenge_evaluation_utils_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/oid_vrd_challenge_evaluation_utils_test.py new file mode 100644 index 0000000000000000000000000000000000000000..49ce0898ea5a1afc0ab1c9ebd666c1373cbe54f9 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/oid_vrd_challenge_evaluation_utils_test.py @@ -0,0 +1,149 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for oid_vrd_challenge_evaluation_utils.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import pandas as pd +import tensorflow as tf +from object_detection.core import standard_fields +from object_detection.metrics import oid_vrd_challenge_evaluation_utils as utils +from object_detection.utils import vrd_evaluation + + +class OidVrdChallengeEvaluationUtilsTest(tf.test.TestCase): + + def testBuildGroundtruthDictionary(self): + np_data = pd.DataFrame( + [[ + 'fe58ec1b06db2bb7', '/m/04bcr3', '/m/083vt', 0.0, 0.3, 0.5, 0.6, + 0.0, 0.3, 0.5, 0.6, 'is', None, None + ], [ + 'fe58ec1b06db2bb7', '/m/04bcr3', '/m/02gy9n', 0.0, 0.3, 0.5, 0.6, + 0.1, 0.2, 0.3, 0.4, 'under', None, None + ], [ + 'fe58ec1b06db2bb7', '/m/04bcr3', '/m/083vt', 0.0, 0.1, 0.2, 0.3, + 0.0, 0.1, 0.2, 0.3, 'is', None, None + ], [ + 'fe58ec1b06db2bb7', '/m/083vt', '/m/04bcr3', 0.1, 0.2, 0.3, 0.4, + 0.5, 0.6, 0.7, 0.8, 'at', None, None + ], [ + 'fe58ec1b06db2bb7', None, None, None, None, None, None, None, None, + None, None, None, '/m/04bcr3', 1.0 + ], [ + 'fe58ec1b06db2bb7', None, None, None, None, None, None, None, None, + None, None, None, '/m/083vt', 0.0 + ], [ + 'fe58ec1b06db2bb7', None, None, None, None, None, None, None, None, + None, None, None, '/m/02gy9n', 0.0 + ]], + columns=[ + 'ImageID', 'LabelName1', 'LabelName2', 'XMin1', 'XMax1', 'YMin1', + 'YMax1', 'XMin2', 'XMax2', 'YMin2', 'YMax2', 'RelationshipLabel', + 'LabelName', 'Confidence' + ]) + class_label_map = {'/m/04bcr3': 1, '/m/083vt': 2, '/m/02gy9n': 3} + relationship_label_map = {'is': 1, 'under': 2, 'at': 3} + groundtruth_dictionary = utils.build_groundtruth_vrd_dictionary( + np_data, class_label_map, relationship_label_map) + + self.assertTrue(standard_fields.InputDataFields.groundtruth_boxes in + groundtruth_dictionary) + self.assertTrue(standard_fields.InputDataFields.groundtruth_classes in + groundtruth_dictionary) + self.assertTrue(standard_fields.InputDataFields.verified_labels in + groundtruth_dictionary) + + self.assertAllEqual( + np.array( + [(1, 2, 1), (1, 3, 2), (1, 2, 1), (2, 1, 3)], + dtype=vrd_evaluation.label_data_type), groundtruth_dictionary[ + standard_fields.InputDataFields.groundtruth_classes]) + expected_vrd_data = np.array( + [ + ([0.5, 0.0, 0.6, 0.3], [0.5, 0.0, 0.6, 0.3]), + ([0.5, 0.0, 0.6, 0.3], [0.3, 0.1, 0.4, 0.2]), + ([0.2, 0.0, 0.3, 0.1], [0.2, 0.0, 0.3, 0.1]), + ([0.3, 0.1, 0.4, 0.2], [0.7, 0.5, 0.8, 0.6]), + ], + dtype=vrd_evaluation.vrd_box_data_type) + for field in expected_vrd_data.dtype.fields: + self.assertNDArrayNear( + expected_vrd_data[field], groundtruth_dictionary[ + standard_fields.InputDataFields.groundtruth_boxes][field], 1e-5) + self.assertAllEqual( + np.array([1, 2, 3]), + groundtruth_dictionary[standard_fields.InputDataFields.verified_labels]) + + def testBuildPredictionDictionary(self): + np_data = pd.DataFrame( + [[ + 'fe58ec1b06db2bb7', '/m/04bcr3', '/m/083vt', 0.0, 0.3, 0.5, 0.6, + 0.0, 0.3, 0.5, 0.6, 'is', 0.1 + ], [ + 'fe58ec1b06db2bb7', '/m/04bcr3', '/m/02gy9n', 0.0, 0.3, 0.5, 0.6, + 0.1, 0.2, 0.3, 0.4, 'under', 0.2 + ], [ + 'fe58ec1b06db2bb7', '/m/04bcr3', '/m/083vt', 0.0, 0.1, 0.2, 0.3, + 0.0, 0.1, 0.2, 0.3, 'is', 0.3 + ], [ + 'fe58ec1b06db2bb7', '/m/083vt', '/m/04bcr3', 0.1, 0.2, 0.3, 0.4, + 0.5, 0.6, 0.7, 0.8, 'at', 0.4 + ]], + columns=[ + 'ImageID', 'LabelName1', 'LabelName2', 'XMin1', 'XMax1', 'YMin1', + 'YMax1', 'XMin2', 
'XMax2', 'YMin2', 'YMax2', 'RelationshipLabel', + 'Score' + ]) + class_label_map = {'/m/04bcr3': 1, '/m/083vt': 2, '/m/02gy9n': 3} + relationship_label_map = {'is': 1, 'under': 2, 'at': 3} + prediction_dictionary = utils.build_predictions_vrd_dictionary( + np_data, class_label_map, relationship_label_map) + + self.assertTrue(standard_fields.DetectionResultFields.detection_boxes in + prediction_dictionary) + self.assertTrue(standard_fields.DetectionResultFields.detection_classes in + prediction_dictionary) + self.assertTrue(standard_fields.DetectionResultFields.detection_scores in + prediction_dictionary) + + self.assertAllEqual( + np.array( + [(1, 2, 1), (1, 3, 2), (1, 2, 1), (2, 1, 3)], + dtype=vrd_evaluation.label_data_type), prediction_dictionary[ + standard_fields.DetectionResultFields.detection_classes]) + expected_vrd_data = np.array( + [ + ([0.5, 0.0, 0.6, 0.3], [0.5, 0.0, 0.6, 0.3]), + ([0.5, 0.0, 0.6, 0.3], [0.3, 0.1, 0.4, 0.2]), + ([0.2, 0.0, 0.3, 0.1], [0.2, 0.0, 0.3, 0.1]), + ([0.3, 0.1, 0.4, 0.2], [0.7, 0.5, 0.8, 0.6]), + ], + dtype=vrd_evaluation.vrd_box_data_type) + for field in expected_vrd_data.dtype.fields: + self.assertNDArrayNear( + expected_vrd_data[field], prediction_dictionary[ + standard_fields.DetectionResultFields.detection_boxes][field], + 1e-5) + self.assertNDArrayNear( + np.array([0.1, 0.2, 0.3, 0.4]), prediction_dictionary[ + standard_fields.DetectionResultFields.detection_scores], 1e-5) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/tf_example_parser.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/tf_example_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..22a28e8ad933eefc773e750efc60074251a48cce --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/tf_example_parser.py @@ -0,0 +1,157 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tensorflow Example proto parser for data loading. + +A parser to decode data containing serialized tensorflow.Example +protos into materialized tensors (numpy arrays). 
+""" + +import numpy as np + +from object_detection.core import data_parser +from object_detection.core import standard_fields as fields + + +class FloatParser(data_parser.DataToNumpyParser): + """Tensorflow Example float parser.""" + + def __init__(self, field_name): + self.field_name = field_name + + def parse(self, tf_example): + return np.array( + tf_example.features.feature[self.field_name].float_list.value, + dtype=np.float).transpose() if tf_example.features.feature[ + self.field_name].HasField("float_list") else None + + +class StringParser(data_parser.DataToNumpyParser): + """Tensorflow Example string parser.""" + + def __init__(self, field_name): + self.field_name = field_name + + def parse(self, tf_example): + return "".join(tf_example.features.feature[self.field_name] + .bytes_list.value) if tf_example.features.feature[ + self.field_name].HasField("bytes_list") else None + + +class Int64Parser(data_parser.DataToNumpyParser): + """Tensorflow Example int64 parser.""" + + def __init__(self, field_name): + self.field_name = field_name + + def parse(self, tf_example): + return np.array( + tf_example.features.feature[self.field_name].int64_list.value, + dtype=np.int64).transpose() if tf_example.features.feature[ + self.field_name].HasField("int64_list") else None + + +class BoundingBoxParser(data_parser.DataToNumpyParser): + """Tensorflow Example bounding box parser.""" + + def __init__(self, xmin_field_name, ymin_field_name, xmax_field_name, + ymax_field_name): + self.field_names = [ + ymin_field_name, xmin_field_name, ymax_field_name, xmax_field_name + ] + + def parse(self, tf_example): + result = [] + parsed = True + for field_name in self.field_names: + result.append(tf_example.features.feature[field_name].float_list.value) + parsed &= ( + tf_example.features.feature[field_name].HasField("float_list")) + + return np.array(result).transpose() if parsed else None + + +class TfExampleDetectionAndGTParser(data_parser.DataToNumpyParser): + """Tensorflow Example proto parser.""" + + def __init__(self): + self.items_to_handlers = { + fields.DetectionResultFields.key: + StringParser(fields.TfExampleFields.source_id), + # Object ground truth boxes and classes. + fields.InputDataFields.groundtruth_boxes: (BoundingBoxParser( + fields.TfExampleFields.object_bbox_xmin, + fields.TfExampleFields.object_bbox_ymin, + fields.TfExampleFields.object_bbox_xmax, + fields.TfExampleFields.object_bbox_ymax)), + fields.InputDataFields.groundtruth_classes: ( + Int64Parser(fields.TfExampleFields.object_class_label)), + # Object detections. + fields.DetectionResultFields.detection_boxes: (BoundingBoxParser( + fields.TfExampleFields.detection_bbox_xmin, + fields.TfExampleFields.detection_bbox_ymin, + fields.TfExampleFields.detection_bbox_xmax, + fields.TfExampleFields.detection_bbox_ymax)), + fields.DetectionResultFields.detection_classes: ( + Int64Parser(fields.TfExampleFields.detection_class_label)), + fields.DetectionResultFields.detection_scores: ( + FloatParser(fields.TfExampleFields.detection_score)), + } + + self.optional_items_to_handlers = { + fields.InputDataFields.groundtruth_difficult: + Int64Parser(fields.TfExampleFields.object_difficult), + fields.InputDataFields.groundtruth_group_of: + Int64Parser(fields.TfExampleFields.object_group_of), + fields.InputDataFields.verified_labels: + Int64Parser(fields.TfExampleFields.image_class_label), + } + + def parse(self, tf_example): + """Parses tensorflow example and returns a tensor dictionary. + + Args: + tf_example: a tf.Example object. 
+ + Returns: + A dictionary of the following numpy arrays: + fields.DetectionResultFields.source_id - string containing original image + id. + fields.InputDataFields.groundtruth_boxes - a numpy array containing + groundtruth boxes. + fields.InputDataFields.groundtruth_classes - a numpy array containing + groundtruth classes. + fields.InputDataFields.groundtruth_group_of - a numpy array containing + groundtruth group of flag (optional, None if not specified). + fields.InputDataFields.groundtruth_difficult - a numpy array containing + groundtruth difficult flag (optional, None if not specified). + fields.DetectionResultFields.detection_boxes - a numpy array containing + detection boxes. + fields.DetectionResultFields.detection_classes - a numpy array containing + detection class labels. + fields.DetectionResultFields.detection_scores - a numpy array containing + detection scores. + Returns None if tf.Example was not parsed or non-optional fields were not + found. + """ + results_dict = {} + parsed = True + for key, parser in self.items_to_handlers.items(): + results_dict[key] = parser.parse(tf_example) + parsed &= (results_dict[key] is not None) + + for key, parser in self.optional_items_to_handlers.items(): + results_dict[key] = parser.parse(tf_example) + + return results_dict if parsed else None diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/tf_example_parser_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/tf_example_parser_test.py new file mode 100644 index 0000000000000000000000000000000000000000..99a64d5fae33e2db80780122e03c8540e3c453c0 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/metrics/tf_example_parser_test.py @@ -0,0 +1,196 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for object_detection.data_decoders.tf_example_parser.""" + +import numpy as np +import numpy.testing as np_testing +import tensorflow as tf + +from object_detection.core import standard_fields as fields +from object_detection.metrics import tf_example_parser + + +class TfExampleDecoderTest(tf.test.TestCase): + + def _Int64Feature(self, value): + return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) + + def _FloatFeature(self, value): + return tf.train.Feature(float_list=tf.train.FloatList(value=value)) + + def _BytesFeature(self, value): + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + + def testParseDetectionsAndGT(self): + source_id = 'abc.jpg' + # y_min, x_min, y_max, x_max + object_bb = np.array([[0.0, 0.5, 0.3], [0.0, 0.1, 0.6], [1.0, 0.6, 0.8], + [1.0, 0.6, 0.7]]).transpose() + detection_bb = np.array([[0.1, 0.2], [0.0, 0.8], [1.0, 0.6], + [1.0, 0.85]]).transpose() + + object_class_label = [1, 1, 2] + object_difficult = [1, 0, 0] + object_group_of = [0, 0, 1] + verified_labels = [1, 2, 3, 4] + detection_class_label = [2, 1] + detection_score = [0.5, 0.3] + features = { + fields.TfExampleFields.source_id: + self._BytesFeature(source_id), + fields.TfExampleFields.object_bbox_ymin: + self._FloatFeature(object_bb[:, 0].tolist()), + fields.TfExampleFields.object_bbox_xmin: + self._FloatFeature(object_bb[:, 1].tolist()), + fields.TfExampleFields.object_bbox_ymax: + self._FloatFeature(object_bb[:, 2].tolist()), + fields.TfExampleFields.object_bbox_xmax: + self._FloatFeature(object_bb[:, 3].tolist()), + fields.TfExampleFields.detection_bbox_ymin: + self._FloatFeature(detection_bb[:, 0].tolist()), + fields.TfExampleFields.detection_bbox_xmin: + self._FloatFeature(detection_bb[:, 1].tolist()), + fields.TfExampleFields.detection_bbox_ymax: + self._FloatFeature(detection_bb[:, 2].tolist()), + fields.TfExampleFields.detection_bbox_xmax: + self._FloatFeature(detection_bb[:, 3].tolist()), + fields.TfExampleFields.detection_class_label: + self._Int64Feature(detection_class_label), + fields.TfExampleFields.detection_score: + self._FloatFeature(detection_score), + } + + example = tf.train.Example(features=tf.train.Features(feature=features)) + parser = tf_example_parser.TfExampleDetectionAndGTParser() + + results_dict = parser.parse(example) + self.assertIsNone(results_dict) + + features[fields.TfExampleFields.object_class_label] = ( + self._Int64Feature(object_class_label)) + features[fields.TfExampleFields.object_difficult] = ( + self._Int64Feature(object_difficult)) + + example = tf.train.Example(features=tf.train.Features(feature=features)) + results_dict = parser.parse(example) + + self.assertIsNotNone(results_dict) + self.assertEqual(source_id, results_dict[fields.DetectionResultFields.key]) + np_testing.assert_almost_equal( + object_bb, results_dict[fields.InputDataFields.groundtruth_boxes]) + np_testing.assert_almost_equal( + detection_bb, + results_dict[fields.DetectionResultFields.detection_boxes]) + np_testing.assert_almost_equal( + detection_score, + results_dict[fields.DetectionResultFields.detection_scores]) + np_testing.assert_almost_equal( + detection_class_label, + results_dict[fields.DetectionResultFields.detection_classes]) + np_testing.assert_almost_equal( + object_difficult, + results_dict[fields.InputDataFields.groundtruth_difficult]) + np_testing.assert_almost_equal( + object_class_label, + results_dict[fields.InputDataFields.groundtruth_classes]) + 
+ parser = tf_example_parser.TfExampleDetectionAndGTParser() + + features[fields.TfExampleFields.object_group_of] = ( + self._Int64Feature(object_group_of)) + + example = tf.train.Example(features=tf.train.Features(feature=features)) + results_dict = parser.parse(example) + self.assertIsNotNone(results_dict) + np_testing.assert_equal( + object_group_of, + results_dict[fields.InputDataFields.groundtruth_group_of]) + + features[fields.TfExampleFields.image_class_label] = ( + self._Int64Feature(verified_labels)) + + example = tf.train.Example(features=tf.train.Features(feature=features)) + results_dict = parser.parse(example) + self.assertIsNotNone(results_dict) + np_testing.assert_equal( + verified_labels, results_dict[fields.InputDataFields.verified_labels]) + + def testParseString(self): + string_val = 'abc' + features = {'string': self._BytesFeature(string_val)} + example = tf.train.Example(features=tf.train.Features(feature=features)) + + parser = tf_example_parser.StringParser('string') + result = parser.parse(example) + self.assertIsNotNone(result) + self.assertEqual(result, string_val) + + parser = tf_example_parser.StringParser('another_string') + result = parser.parse(example) + self.assertIsNone(result) + + def testParseFloat(self): + float_array_val = [1.5, 1.4, 2.0] + features = {'floats': self._FloatFeature(float_array_val)} + example = tf.train.Example(features=tf.train.Features(feature=features)) + + parser = tf_example_parser.FloatParser('floats') + result = parser.parse(example) + self.assertIsNotNone(result) + np_testing.assert_almost_equal(result, float_array_val) + + parser = tf_example_parser.StringParser('another_floats') + result = parser.parse(example) + self.assertIsNone(result) + + def testInt64Parser(self): + int_val = [1, 2, 3] + features = {'ints': self._Int64Feature(int_val)} + example = tf.train.Example(features=tf.train.Features(feature=features)) + + parser = tf_example_parser.Int64Parser('ints') + result = parser.parse(example) + self.assertIsNotNone(result) + np_testing.assert_almost_equal(result, int_val) + + parser = tf_example_parser.Int64Parser('another_ints') + result = parser.parse(example) + self.assertIsNone(result) + + def testBoundingBoxParser(self): + bounding_boxes = np.array([[0.0, 0.5, 0.3], [0.0, 0.1, 0.6], + [1.0, 0.6, 0.8], [1.0, 0.6, 0.7]]).transpose() + features = { + 'ymin': self._FloatFeature(bounding_boxes[:, 0]), + 'xmin': self._FloatFeature(bounding_boxes[:, 1]), + 'ymax': self._FloatFeature(bounding_boxes[:, 2]), + 'xmax': self._FloatFeature(bounding_boxes[:, 3]) + } + + example = tf.train.Example(features=tf.train.Features(feature=features)) + + parser = tf_example_parser.BoundingBoxParser('xmin', 'ymin', 'xmax', 'ymax') + result = parser.parse(example) + self.assertIsNotNone(result) + np_testing.assert_almost_equal(result, bounding_boxes) + + parser = tf_example_parser.BoundingBoxParser('xmin', 'ymin', 'xmax', + 'another_ymax') + result = parser.parse(example) + self.assertIsNone(result) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/model_hparams.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/model_hparams.py new file mode 100644 index 0000000000000000000000000000000000000000..b0d12fceadfb1f79723b8f35377b1a3240ff8c63 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/model_hparams.py @@ -0,0 +1,44 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
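Each of the small parsers exercised above (`StringParser`, `FloatParser`, `Int64Parser`, `BoundingBoxParser`) can also be used on its own; a parser pointed at a feature that is absent from the example returns `None` instead of raising. A minimal sketch, assuming the TF 1.x environment the rest of this patch targets:

```python
import tensorflow as tf

from object_detection.metrics import tf_example_parser

example = tf.train.Example(features=tf.train.Features(feature={
    'scores': tf.train.Feature(
        float_list=tf.train.FloatList(value=[0.9, 0.4])),
}))

# Parses the named float feature out of the example.
scores = tf_example_parser.FloatParser('scores').parse(example)
print(scores is not None)  # True -- holds the two float values

# Missing features yield None rather than an exception.
print(tf_example_parser.FloatParser('missing').parse(example))  # None
```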
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Hyperparameters for the object detection model in TF.learn. + +This file consolidates and documents the hyperparameters used by the model. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +def create_hparams(hparams_overrides=None): + """Returns hyperparameters, including any flag value overrides. + + Args: + hparams_overrides: Optional hparams overrides, represented as a + string containing comma-separated hparam_name=value pairs. + + Returns: + The hyperparameters as a tf.HParams object. + """ + hparams = tf.contrib.training.HParams( + # Whether a fine tuning checkpoint (provided in the pipeline config) + # should be loaded for training. + load_pretrained=True) + # Override any of the preceding hyperparameter values. + if hparams_overrides: + hparams = hparams.parse(hparams_overrides) + return hparams diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/model_lib.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/model_lib.py new file mode 100644 index 0000000000000000000000000000000000000000..973ca259d658ff052c253d57b2dd35b6d1da434e --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/model_lib.py @@ -0,0 +1,717 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +r"""Constructs model, inputs, and training environment.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os + +import tensorflow as tf + +from object_detection import eval_util +from object_detection import inputs +from object_detection.builders import graph_rewriter_builder +from object_detection.builders import model_builder +from object_detection.builders import optimizer_builder +from object_detection.core import standard_fields as fields +from object_detection.utils import config_util +from object_detection.utils import label_map_util +from object_detection.utils import shape_utils +from object_detection.utils import variables_helper +from object_detection.utils import visualization_utils as vis_utils + +# A map of names to methods that help build the model. 
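`create_hparams` above wraps a single boolean hyperparameter and accepts the same comma-separated `name=value` override string that the `--hparams_overrides` flag of the training binaries later in this patch passes through. A minimal usage sketch, assuming a TF 1.x environment where `tf.contrib.training.HParams` is available:

```python
from object_detection import model_hparams

# Default: the fine-tune checkpoint named in the pipeline config is loaded.
hparams = model_hparams.create_hparams()
print(hparams.load_pretrained)  # True

# Same override string the unit tests in this patch use.
hparams = model_hparams.create_hparams(
    hparams_overrides='load_pretrained=false')
print(hparams.load_pretrained)  # False
```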
+MODEL_BUILD_UTIL_MAP = { + 'get_configs_from_pipeline_file': + config_util.get_configs_from_pipeline_file, + 'create_pipeline_proto_from_configs': + config_util.create_pipeline_proto_from_configs, + 'merge_external_params_with_configs': + config_util.merge_external_params_with_configs, + 'create_train_input_fn': inputs.create_train_input_fn, + 'create_eval_input_fn': inputs.create_eval_input_fn, + 'create_predict_input_fn': inputs.create_predict_input_fn, +} + + +def _prepare_groundtruth_for_eval(detection_model, class_agnostic): + """Extracts groundtruth data from detection_model and prepares it for eval. + + Args: + detection_model: A `DetectionModel` object. + class_agnostic: Whether the detections are class_agnostic. + + Returns: + A tuple of: + groundtruth: Dictionary with the following fields: + 'groundtruth_boxes': [num_boxes, 4] float32 tensor of boxes, in + normalized coordinates. + 'groundtruth_classes': [num_boxes] int64 tensor of 1-indexed classes. + 'groundtruth_masks': 3D float32 tensor of instance masks (if provided in + groundtruth) + 'groundtruth_is_crowd': [num_boxes] bool tensor indicating is_crowd + annotations (if provided in groundtruth). + class_agnostic: Boolean indicating whether detections are class agnostic. + """ + input_data_fields = fields.InputDataFields() + groundtruth_boxes = detection_model.groundtruth_lists( + fields.BoxListFields.boxes)[0] + # For class-agnostic models, groundtruth one-hot encodings collapse to all + # ones. + if class_agnostic: + groundtruth_boxes_shape = tf.shape(groundtruth_boxes) + groundtruth_classes_one_hot = tf.ones([groundtruth_boxes_shape[0], 1]) + else: + groundtruth_classes_one_hot = detection_model.groundtruth_lists( + fields.BoxListFields.classes)[0] + label_id_offset = 1 # Applying label id offset (b/63711816) + groundtruth_classes = ( + tf.argmax(groundtruth_classes_one_hot, axis=1) + label_id_offset) + groundtruth = { + input_data_fields.groundtruth_boxes: groundtruth_boxes, + input_data_fields.groundtruth_classes: groundtruth_classes + } + if detection_model.groundtruth_has_field(fields.BoxListFields.masks): + groundtruth[input_data_fields.groundtruth_instance_masks] = ( + detection_model.groundtruth_lists(fields.BoxListFields.masks)[0]) + if detection_model.groundtruth_has_field(fields.BoxListFields.is_crowd): + groundtruth[input_data_fields.groundtruth_is_crowd] = ( + detection_model.groundtruth_lists(fields.BoxListFields.is_crowd)[0]) + return groundtruth + + +def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True): + """Unstacks all tensors in `tensor_dict` along 0th dimension. + + Unstacks tensor from the tensor dict along 0th dimension and returns a + tensor_dict containing values that are lists of unstacked tensors. + + Tensors in the `tensor_dict` are expected to be of one of the three shapes: + 1. [batch_size] + 2. [batch_size, height, width, channels] + 3. [batch_size, num_boxes, d1, d2, ... dn] + + When unpad_groundtruth_tensors is set to true, unstacked tensors of form 3 + above are sliced along the `num_boxes` dimension using the value in tensor + field.InputDataFields.num_groundtruth_boxes. + + Note that this function has a static list of input data fields and has to be + kept in sync with the InputDataFields defined in core/standard_fields.py + + Args: + tensor_dict: A dictionary of batched groundtruth tensors. + unpad_groundtruth_tensors: Whether to remove padding along `num_boxes` + dimension of the groundtruth tensors. 
+ + Returns: + A dictionary where the keys are from fields.InputDataFields and values are + a list of unstacked (optionally unpadded) tensors. + + Raises: + ValueError: If unpad_tensors is True and `tensor_dict` does not contain + `num_groundtruth_boxes` tensor. + """ + unbatched_tensor_dict = {key: tf.unstack(tensor) + for key, tensor in tensor_dict.items()} + if unpad_groundtruth_tensors: + if (fields.InputDataFields.num_groundtruth_boxes not in + unbatched_tensor_dict): + raise ValueError('`num_groundtruth_boxes` not found in tensor_dict. ' + 'Keys available: {}'.format( + unbatched_tensor_dict.keys())) + unbatched_unpadded_tensor_dict = {} + unpad_keys = set([ + # List of input data fields that are padded along the num_boxes + # dimension. This list has to be kept in sync with InputDataFields in + # standard_fields.py. + fields.InputDataFields.groundtruth_instance_masks, + fields.InputDataFields.groundtruth_classes, + fields.InputDataFields.groundtruth_boxes, + fields.InputDataFields.groundtruth_keypoints, + fields.InputDataFields.groundtruth_group_of, + fields.InputDataFields.groundtruth_difficult, + fields.InputDataFields.groundtruth_is_crowd, + fields.InputDataFields.groundtruth_area, + fields.InputDataFields.groundtruth_weights + ]).intersection(set(unbatched_tensor_dict.keys())) + + for key in unpad_keys: + unpadded_tensor_list = [] + for num_gt, padded_tensor in zip( + unbatched_tensor_dict[fields.InputDataFields.num_groundtruth_boxes], + unbatched_tensor_dict[key]): + tensor_shape = shape_utils.combined_static_and_dynamic_shape( + padded_tensor) + slice_begin = tf.zeros([len(tensor_shape)], dtype=tf.int32) + slice_size = tf.stack( + [num_gt] + [-1 if dim is None else dim for dim in tensor_shape[1:]]) + unpadded_tensor = tf.slice(padded_tensor, slice_begin, slice_size) + unpadded_tensor_list.append(unpadded_tensor) + unbatched_unpadded_tensor_dict[key] = unpadded_tensor_list + unbatched_tensor_dict.update(unbatched_unpadded_tensor_dict) + + return unbatched_tensor_dict + + +def create_model_fn(detection_model_fn, configs, hparams, use_tpu=False): + """Creates a model function for `Estimator`. + + Args: + detection_model_fn: Function that returns a `DetectionModel` instance. + configs: Dictionary of pipeline config objects. + hparams: `HParams` object. + use_tpu: Boolean indicating whether model should be constructed for + use on TPU. + + Returns: + `model_fn` for `Estimator`. + """ + train_config = configs['train_config'] + eval_input_config = configs['eval_input_config'] + eval_config = configs['eval_config'] + + def model_fn(features, labels, mode, params=None): + """Constructs the object detection model. + + Args: + features: Dictionary of feature tensors, returned from `input_fn`. + labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, + otherwise None. + mode: Mode key from tf.estimator.ModeKeys. + params: Parameter dictionary passed from the estimator. + + Returns: + An `EstimatorSpec` that encapsulates the model and its serving + configurations. 
+ """ + params = params or {} + total_loss, train_op, detections, export_outputs = None, None, None, None + is_training = mode == tf.estimator.ModeKeys.TRAIN + detection_model = detection_model_fn(is_training=is_training, + add_summaries=(not use_tpu)) + scaffold_fn = None + + if mode == tf.estimator.ModeKeys.TRAIN: + labels = unstack_batch( + labels, + unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors) + elif mode == tf.estimator.ModeKeys.EVAL: + # For evaling on train data, it is necessary to check whether groundtruth + # must be unpadded. + boxes_shape = ( + labels[fields.InputDataFields.groundtruth_boxes].get_shape() + .as_list()) + unpad_groundtruth_tensors = True if boxes_shape[1] is not None else False + labels = unstack_batch( + labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) + + if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): + gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] + gt_classes_list = labels[fields.InputDataFields.groundtruth_classes] + gt_masks_list = None + if fields.InputDataFields.groundtruth_instance_masks in labels: + gt_masks_list = labels[ + fields.InputDataFields.groundtruth_instance_masks] + gt_keypoints_list = None + if fields.InputDataFields.groundtruth_keypoints in labels: + gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints] + if fields.InputDataFields.groundtruth_is_crowd in labels: + gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd] + detection_model.provide_groundtruth( + groundtruth_boxes_list=gt_boxes_list, + groundtruth_classes_list=gt_classes_list, + groundtruth_masks_list=gt_masks_list, + groundtruth_keypoints_list=gt_keypoints_list, + groundtruth_weights_list=labels[ + fields.InputDataFields.groundtruth_weights], + groundtruth_is_crowd_list=gt_is_crowd_list) + + preprocessed_images = features[fields.InputDataFields.image] + prediction_dict = detection_model.predict( + preprocessed_images, features[fields.InputDataFields.true_image_shape]) + detections = detection_model.postprocess( + prediction_dict, features[fields.InputDataFields.true_image_shape]) + + if mode == tf.estimator.ModeKeys.TRAIN: + if train_config.fine_tune_checkpoint and hparams.load_pretrained: + if not train_config.fine_tune_checkpoint_type: + # train_config.from_detection_checkpoint field is deprecated. For + # backward compatibility, set train_config.fine_tune_checkpoint_type + # based on train_config.from_detection_checkpoint. 
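`unstack_batch`, called at the top of this branch for both TRAIN and EVAL, turns the batched `labels` dictionary into per-image lists, optionally slicing each groundtruth tensor down to its true number of boxes. A minimal sketch of that behaviour with illustrative shapes; only the field names come from `standard_fields`, the values are random:

```python
import numpy as np
import tensorflow as tf

from object_detection import model_lib
from object_detection.core import standard_fields as fields

# A batch of 2 images, padded to 3 boxes each; only 2 / 1 boxes are real.
tensor_dict = {
    fields.InputDataFields.groundtruth_boxes:
        tf.constant(np.random.rand(2, 3, 4), dtype=tf.float32),
    fields.InputDataFields.groundtruth_classes:
        tf.constant(np.random.rand(2, 3, 5), dtype=tf.float32),
    fields.InputDataFields.num_groundtruth_boxes:
        tf.constant([2, 1], dtype=tf.int32),
}

unbatched = model_lib.unstack_batch(
    tensor_dict, unpad_groundtruth_tensors=True)

with tf.Session() as sess:
  out = sess.run(unbatched)

# Each value is now a list with one tensor per image, unpadded along num_boxes.
print([b.shape for b in out[fields.InputDataFields.groundtruth_boxes]])
# [(2, 4), (1, 4)]
```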
+ if train_config.from_detection_checkpoint: + train_config.fine_tune_checkpoint_type = 'detection' + else: + train_config.fine_tune_checkpoint_type = 'classification' + asg_map = detection_model.restore_map( + fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type, + load_all_detection_checkpoint_vars=( + train_config.load_all_detection_checkpoint_vars)) + available_var_map = ( + variables_helper.get_variables_available_in_checkpoint( + asg_map, train_config.fine_tune_checkpoint, + include_global_step=False)) + if use_tpu: + def tpu_scaffold(): + tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, + available_var_map) + return tf.train.Scaffold() + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, + available_var_map) + + if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): + losses_dict = detection_model.loss( + prediction_dict, features[fields.InputDataFields.true_image_shape]) + losses = [loss_tensor for loss_tensor in losses_dict.itervalues()] + if train_config.add_regularization_loss: + regularization_losses = tf.get_collection( + tf.GraphKeys.REGULARIZATION_LOSSES) + if regularization_losses: + regularization_loss = tf.add_n(regularization_losses, + name='regularization_loss') + losses.append(regularization_loss) + losses_dict['Loss/regularization_loss'] = regularization_loss + total_loss = tf.add_n(losses, name='total_loss') + losses_dict['Loss/total_loss'] = total_loss + + if 'graph_rewriter_config' in configs: + graph_rewriter_fn = graph_rewriter_builder.build( + configs['graph_rewriter_config'], is_training=is_training) + graph_rewriter_fn() + + # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we + # can write learning rate summaries on TPU without host calls. + global_step = tf.train.get_or_create_global_step() + training_optimizer, optimizer_summary_vars = optimizer_builder.build( + train_config.optimizer) + + if mode == tf.estimator.ModeKeys.TRAIN: + if use_tpu: + training_optimizer = tf.contrib.tpu.CrossShardOptimizer( + training_optimizer) + + # Optionally freeze some layers by setting their gradients to be zero. + trainable_variables = None + if train_config.freeze_variables: + trainable_variables = tf.contrib.framework.filter_variables( + tf.trainable_variables(), + exclude_patterns=train_config.freeze_variables) + + clip_gradients_value = None + if train_config.gradient_clipping_by_norm > 0: + clip_gradients_value = train_config.gradient_clipping_by_norm + + if not use_tpu: + for var in optimizer_summary_vars: + tf.summary.scalar(var.op.name, var) + summaries = [] if use_tpu else None + train_op = tf.contrib.layers.optimize_loss( + loss=total_loss, + global_step=global_step, + learning_rate=None, + clip_gradients=clip_gradients_value, + optimizer=training_optimizer, + variables=trainable_variables, + summaries=summaries, + name='') # Preventing scope prefix on all variables. 
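The fine-tuning branch above reduces to building a map of variables that are actually present in the checkpoint and handing it to `tf.train.init_from_checkpoint`, which rewrites the variables' initializers in place (no `Saver` involved). A stripped-down sketch of that pattern; the checkpoint path, variable name, and scope mapping below are hypothetical stand-ins, not the exact map `restore_map` produces:

```python
import tensorflow as tf

with tf.Graph().as_default():
  # A graph variable assumed to correspond to `resnet_v1_50/conv1/weights`
  # in a classification checkpoint (hypothetical naming).
  tf.get_variable('FeatureExtractor/resnet_v1_50/conv1/weights',
                  shape=[7, 7, 3, 64])

  # Hypothetical checkpoint path; in model_lib the mapping comes from
  # detection_model.restore_map(...) filtered through
  # variables_helper.get_variables_available_in_checkpoint(...).
  fine_tune_checkpoint = '/path/to/resnet_v1_50.ckpt'
  assignment_map = {'resnet_v1_50/': 'FeatureExtractor/resnet_v1_50/'}

  # Keys are checkpoint scopes, values are graph scopes; initializers of the
  # matched variables are replaced with loads from the checkpoint.
  tf.train.init_from_checkpoint(fine_tune_checkpoint, assignment_map)
```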
+ + if mode == tf.estimator.ModeKeys.PREDICT: + export_outputs = { + tf.saved_model.signature_constants.PREDICT_METHOD_NAME: + tf.estimator.export.PredictOutput(detections) + } + + eval_metric_ops = None + scaffold = None + if mode == tf.estimator.ModeKeys.EVAL: + class_agnostic = (fields.DetectionResultFields.detection_classes + not in detections) + groundtruth = _prepare_groundtruth_for_eval( + detection_model, class_agnostic) + use_original_images = fields.InputDataFields.original_image in features + eval_images = ( + features[fields.InputDataFields.original_image] if use_original_images + else features[fields.InputDataFields.image]) + eval_dict = eval_util.result_dict_for_single_example( + eval_images[0:1], + features[inputs.HASH_KEY][0], + detections, + groundtruth, + class_agnostic=class_agnostic, + scale_to_absolute=True) + + if class_agnostic: + category_index = label_map_util.create_class_agnostic_category_index() + else: + category_index = label_map_util.create_category_index_from_labelmap( + eval_input_config.label_map_path) + img_summary = None + if not use_tpu and use_original_images: + detection_and_groundtruth = ( + vis_utils.draw_side_by_side_evaluation_image( + eval_dict, category_index, max_boxes_to_draw=20, + min_score_thresh=0.2, + use_normalized_coordinates=False)) + img_summary = tf.summary.image('Detections_Left_Groundtruth_Right', + detection_and_groundtruth) + + # Eval metrics on a single example. + eval_metrics = eval_config.metrics_set + if not eval_metrics: + eval_metrics = ['coco_detection_metrics'] + eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( + eval_metrics, + category_index.values(), + eval_dict, + include_metrics_per_category=eval_config.include_metrics_per_category) + for loss_key, loss_tensor in iter(losses_dict.items()): + eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) + for var in optimizer_summary_vars: + eval_metric_ops[var.op.name] = (var, tf.no_op()) + if img_summary is not None: + eval_metric_ops['Detections_Left_Groundtruth_Right'] = ( + img_summary, tf.no_op()) + eval_metric_ops = {str(k): v for k, v in eval_metric_ops.iteritems()} + + if eval_config.use_moving_averages: + variable_averages = tf.train.ExponentialMovingAverage(0.0) + variables_to_restore = variable_averages.variables_to_restore() + keep_checkpoint_every_n_hours = ( + train_config.keep_checkpoint_every_n_hours) + saver = tf.train.Saver( + variables_to_restore, + keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) + scaffold = tf.train.Scaffold(saver=saver) + + if use_tpu: + return tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + scaffold_fn=scaffold_fn, + predictions=detections, + loss=total_loss, + train_op=train_op, + eval_metrics=eval_metric_ops, + export_outputs=export_outputs) + else: + return tf.estimator.EstimatorSpec( + mode=mode, + predictions=detections, + loss=total_loss, + train_op=train_op, + eval_metric_ops=eval_metric_ops, + export_outputs=export_outputs, + scaffold=scaffold) + + return model_fn + + +def create_estimator_and_inputs(run_config, + hparams, + pipeline_config_path, + train_steps=None, + eval_steps=None, + model_fn_creator=create_model_fn, + use_tpu_estimator=False, + use_tpu=False, + num_shards=1, + params=None, + **kwargs): + """Creates `Estimator`, input functions, and steps. + + Args: + run_config: A `RunConfig`. + hparams: A `HParams`. + pipeline_config_path: A path to a pipeline config file. + train_steps: Number of training steps. If None, the number of training steps + is set from the `TrainConfig` proto. 
+ eval_steps: Number of evaluation steps per evaluation cycle. If None, the + number of evaluation steps is set from the `EvalConfig` proto. + model_fn_creator: A function that creates a `model_fn` for `Estimator`. + Follows the signature: + + * Args: + * `detection_model_fn`: Function that returns `DetectionModel` instance. + * `configs`: Dictionary of pipeline config objects. + * `hparams`: `HParams` object. + * Returns: + `model_fn` for `Estimator`. + + use_tpu_estimator: Whether a `TPUEstimator` should be returned. If False, + an `Estimator` will be returned. + use_tpu: Boolean, whether training and evaluation should run on TPU. Only + used if `use_tpu_estimator` is True. + num_shards: Number of shards (TPU cores). Only used if `use_tpu_estimator` + is True. + params: Parameter dictionary passed from the estimator. Only used if + `use_tpu_estimator` is True. + **kwargs: Additional keyword arguments for configuration override. + + Returns: + A dictionary with the following fields: + 'estimator': An `Estimator` or `TPUEstimator`. + 'train_input_fn': A training input function. + 'eval_input_fn': An evaluation input function. + 'eval_on_train_input_fn': An evaluation-on-train input function. + 'predict_input_fn': A prediction input function. + 'train_steps': Number of training steps. Either directly from input or from + configuration. + 'eval_steps': Number of evaluation steps. Either directly from input or from + configuration. + """ + get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ + 'get_configs_from_pipeline_file'] + merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ + 'merge_external_params_with_configs'] + create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ + 'create_pipeline_proto_from_configs'] + create_train_input_fn = MODEL_BUILD_UTIL_MAP['create_train_input_fn'] + create_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn'] + create_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn'] + + configs = get_configs_from_pipeline_file(pipeline_config_path) + configs = merge_external_params_with_configs( + configs, + hparams, + train_steps=train_steps, + eval_steps=eval_steps, + **kwargs) + model_config = configs['model'] + train_config = configs['train_config'] + train_input_config = configs['train_input_config'] + eval_config = configs['eval_config'] + eval_input_config = configs['eval_input_config'] + + if train_steps is None: + train_steps = configs['train_config'].num_steps + + if eval_steps is None: + eval_steps = configs['eval_config'].num_examples + + detection_model_fn = functools.partial( + model_builder.build, model_config=model_config) + + # Create the input functions for TRAIN/EVAL/PREDICT. + train_input_fn = create_train_input_fn( + train_config=train_config, + train_input_config=train_input_config, + model_config=model_config) + eval_input_fn = create_eval_input_fn( + eval_config=eval_config, + eval_input_config=eval_input_config, + model_config=model_config) + eval_on_train_input_fn = create_eval_input_fn( + eval_config=eval_config, + eval_input_config=train_input_config, + model_config=model_config) + predict_input_fn = create_predict_input_fn(model_config=model_config) + + model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu) + if use_tpu_estimator: + estimator = tf.contrib.tpu.TPUEstimator( + model_fn=model_fn, + train_batch_size=train_config.batch_size, + # For each core, only batch size 1 is supported for eval. 
+ eval_batch_size=num_shards * 1 if use_tpu else 1, + use_tpu=use_tpu, + config=run_config, + params=params if params else {}) + else: + estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) + + # Write the as-run pipeline config to disk. + if run_config.is_chief: + pipeline_config_final = create_pipeline_proto_from_configs( + configs) + config_util.save_pipeline_config(pipeline_config_final, estimator.model_dir) + + return dict( + estimator=estimator, + train_input_fn=train_input_fn, + eval_input_fn=eval_input_fn, + eval_on_train_input_fn=eval_on_train_input_fn, + predict_input_fn=predict_input_fn, + train_steps=train_steps, + eval_steps=eval_steps) + + +def create_train_and_eval_specs(train_input_fn, + eval_input_fn, + eval_on_train_input_fn, + predict_input_fn, + train_steps, + eval_steps, + eval_on_train_data=False, + eval_on_train_steps=None, + final_exporter_name='Servo', + eval_spec_name='eval'): + """Creates a `TrainSpec` and `EvalSpec`s. + + Args: + train_input_fn: Function that produces features and labels on train data. + eval_input_fn: Function that produces features and labels on eval data. + eval_on_train_input_fn: Function that produces features and labels for + evaluation on train data. + predict_input_fn: Function that produces features for inference. + train_steps: Number of training steps. + eval_steps: Number of eval steps. + eval_on_train_data: Whether to evaluate model on training data. Default is + False. + eval_on_train_steps: Number of eval steps for training data. If not given, + uses eval_steps. + final_exporter_name: String name given to `FinalExporter`. + eval_spec_name: String name given to main `EvalSpec`. + + Returns: + Tuple of `TrainSpec` and list of `EvalSpecs`. The first `EvalSpec` is for + evaluation data. If `eval_on_train_data` is True, the second `EvalSpec` in + the list will correspond to training data. + """ + + exporter = tf.estimator.FinalExporter( + name=final_exporter_name, serving_input_receiver_fn=predict_input_fn) + + train_spec = tf.estimator.TrainSpec( + input_fn=train_input_fn, max_steps=train_steps) + + eval_specs = [ + tf.estimator.EvalSpec( + name=eval_spec_name, + input_fn=eval_input_fn, + steps=eval_steps, + exporters=exporter) + ] + + if eval_on_train_data: + eval_specs.append( + tf.estimator.EvalSpec( + name='eval_on_train', input_fn=eval_on_train_input_fn, + steps=eval_on_train_steps or eval_steps)) + + return train_spec, eval_specs + + +def continuous_eval(estimator, model_dir, input_fn, eval_steps, train_steps, + name): + """Perform continuous evaluation on checkpoints written to a model directory. + + Args: + estimator: Estimator object to use for evaluation. + model_dir: Model directory to read checkpoints for continuous evaluation. + input_fn: Input function to use for evaluation. + eval_steps: Number of steps to run during each evaluation. + train_steps: Number of training steps. This is used to infer the last + checkpoint and stop evaluation loop. + name: Namescope for eval summary. 
+ """ + def terminate_eval(): + tf.logging.info('Terminating eval after 180 seconds of no checkpoints') + return True + + for ckpt in tf.contrib.training.checkpoints_iterator( + model_dir, min_interval_secs=180, timeout=None, + timeout_fn=terminate_eval): + + tf.logging.info('Starting Evaluation.') + try: + eval_results = estimator.evaluate( + input_fn=input_fn, + steps=eval_steps, + checkpoint_path=ckpt, + name=name) + tf.logging.info('Eval results: %s' % eval_results) + + # Terminate eval job when final checkpoint is reached + current_step = int(os.path.basename(ckpt).split('-')[1]) + if current_step >= train_steps: + tf.logging.info( + 'Evaluation finished after training step %d' % current_step) + break + + except tf.errors.NotFoundError: + tf.logging.info( + 'Checkpoint %s no longer exists, skipping checkpoint' % ckpt) + + +def populate_experiment(run_config, + hparams, + pipeline_config_path, + train_steps=None, + eval_steps=None, + model_fn_creator=create_model_fn, + **kwargs): + """Populates an `Experiment` object. + + EXPERIMENT CLASS IS DEPRECATED. Please switch to + tf.estimator.train_and_evaluate. As an example, see model_main.py. + + Args: + run_config: A `RunConfig`. + hparams: A `HParams`. + pipeline_config_path: A path to a pipeline config file. + train_steps: Number of training steps. If None, the number of training steps + is set from the `TrainConfig` proto. + eval_steps: Number of evaluation steps per evaluation cycle. If None, the + number of evaluation steps is set from the `EvalConfig` proto. + model_fn_creator: A function that creates a `model_fn` for `Estimator`. + Follows the signature: + + * Args: + * `detection_model_fn`: Function that returns `DetectionModel` instance. + * `configs`: Dictionary of pipeline config objects. + * `hparams`: `HParams` object. + * Returns: + `model_fn` for `Estimator`. + + **kwargs: Additional keyword arguments for configuration override. + + Returns: + An `Experiment` that defines all aspects of training, evaluation, and + export. + """ + tf.logging.warning('Experiment is being deprecated. Please use ' + 'tf.estimator.train_and_evaluate(). See model_main.py for ' + 'an example.') + train_and_eval_dict = create_estimator_and_inputs( + run_config, + hparams, + pipeline_config_path, + train_steps=train_steps, + eval_steps=eval_steps, + model_fn_creator=model_fn_creator, + **kwargs) + estimator = train_and_eval_dict['estimator'] + train_input_fn = train_and_eval_dict['train_input_fn'] + eval_input_fn = train_and_eval_dict['eval_input_fn'] + predict_input_fn = train_and_eval_dict['predict_input_fn'] + train_steps = train_and_eval_dict['train_steps'] + eval_steps = train_and_eval_dict['eval_steps'] + + export_strategies = [ + tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy( + serving_input_fn=predict_input_fn) + ] + + return tf.contrib.learn.Experiment( + estimator=estimator, + train_input_fn=train_input_fn, + eval_input_fn=eval_input_fn, + train_steps=train_steps, + eval_steps=eval_steps, + export_strategies=export_strategies, + eval_delay_secs=120,) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/model_lib_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/model_lib_test.py new file mode 100644 index 0000000000000000000000000000000000000000..ec571e051d3b96e4f9aa8ae069d47fc109416c89 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/model_lib_test.py @@ -0,0 +1,402 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
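Before the tests that follow, a condensed usage sketch of the `model_lib` entry points defined above: build the estimator and input functions from a pipeline config, wrap them in train/eval specs, and hand them to `tf.estimator.train_and_evaluate`. This mirrors `model_main.py` later in this patch; the paths below are placeholders:

```python
import tensorflow as tf

from object_detection import model_hparams
from object_detection import model_lib

# Placeholder paths; any valid pipeline config and writable model dir work.
pipeline_config_path = '/path/to/mask_rcnn_resnet50_atrous_coco.config'
model_dir = '/tmp/mask_rcnn_model'

run_config = tf.estimator.RunConfig(model_dir=model_dir)
train_and_eval_dict = model_lib.create_estimator_and_inputs(
    run_config=run_config,
    hparams=model_hparams.create_hparams(),
    pipeline_config_path=pipeline_config_path,
    train_steps=1000,
    eval_steps=50)

train_spec, eval_specs = model_lib.create_train_and_eval_specs(
    train_and_eval_dict['train_input_fn'],
    train_and_eval_dict['eval_input_fn'],
    train_and_eval_dict['eval_on_train_input_fn'],
    train_and_eval_dict['predict_input_fn'],
    train_and_eval_dict['train_steps'],
    train_and_eval_dict['eval_steps'],
    eval_on_train_data=False)

# train_and_evaluate takes a single EvalSpec.
tf.estimator.train_and_evaluate(
    train_and_eval_dict['estimator'], train_spec, eval_specs[0])
```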
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for object detection model library.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os + +import numpy as np +import tensorflow as tf + +from tensorflow.contrib.tpu.python.tpu import tpu_config +from tensorflow.contrib.tpu.python.tpu import tpu_estimator + +from object_detection import inputs +from object_detection import model_hparams +from object_detection import model_lib +from object_detection.builders import model_builder +from object_detection.core import standard_fields as fields +from object_detection.utils import config_util + + +# Model for test. Options are: +# 'ssd_inception_v2_pets', 'faster_rcnn_resnet50_pets' +MODEL_NAME_FOR_TEST = 'ssd_inception_v2_pets' + + +def _get_data_path(): + """Returns an absolute path to TFRecord file.""" + return os.path.join(tf.resource_loader.get_data_files_path(), 'test_data', + 'pets_examples.record') + + +def get_pipeline_config_path(model_name): + """Returns path to the local pipeline config file.""" + return os.path.join(tf.resource_loader.get_data_files_path(), 'samples', + 'configs', model_name + '.config') + + +def _get_labelmap_path(): + """Returns an absolute path to label map file.""" + return os.path.join(tf.resource_loader.get_data_files_path(), 'data', + 'pet_label_map.pbtxt') + + +def _get_configs_for_model(model_name): + """Returns configurations for model.""" + filename = get_pipeline_config_path(model_name) + data_path = _get_data_path() + label_map_path = _get_labelmap_path() + configs = config_util.get_configs_from_pipeline_file(filename) + configs = config_util.merge_external_params_with_configs( + configs, + train_input_path=data_path, + eval_input_path=data_path, + label_map_path=label_map_path) + return configs + + +class ModelLibTest(tf.test.TestCase): + + @classmethod + def setUpClass(cls): + tf.reset_default_graph() + + def _assert_model_fn_for_train_eval(self, configs, mode, + class_agnostic=False): + model_config = configs['model'] + train_config = configs['train_config'] + with tf.Graph().as_default(): + if mode == 'train': + features, labels = inputs.create_train_input_fn( + configs['train_config'], + configs['train_input_config'], + configs['model'])() + model_mode = tf.estimator.ModeKeys.TRAIN + batch_size = train_config.batch_size + elif mode == 'eval': + features, labels = inputs.create_eval_input_fn( + configs['eval_config'], + configs['eval_input_config'], + configs['model'])() + model_mode = tf.estimator.ModeKeys.EVAL + batch_size = 1 + elif mode == 'eval_on_train': + features, labels = inputs.create_eval_input_fn( + configs['eval_config'], + configs['train_input_config'], + configs['model'])() + model_mode = tf.estimator.ModeKeys.EVAL + batch_size = 1 + + detection_model_fn = functools.partial( + model_builder.build, model_config=model_config, is_training=True) + + hparams = 
model_hparams.create_hparams( + hparams_overrides='load_pretrained=false') + + model_fn = model_lib.create_model_fn(detection_model_fn, configs, hparams) + estimator_spec = model_fn(features, labels, model_mode) + + self.assertIsNotNone(estimator_spec.loss) + self.assertIsNotNone(estimator_spec.predictions) + if class_agnostic: + self.assertNotIn('detection_classes', estimator_spec.predictions) + else: + detection_classes = estimator_spec.predictions['detection_classes'] + self.assertEqual(batch_size, detection_classes.shape.as_list()[0]) + self.assertEqual(tf.float32, detection_classes.dtype) + detection_boxes = estimator_spec.predictions['detection_boxes'] + detection_scores = estimator_spec.predictions['detection_scores'] + num_detections = estimator_spec.predictions['num_detections'] + self.assertEqual(batch_size, detection_boxes.shape.as_list()[0]) + self.assertEqual(tf.float32, detection_boxes.dtype) + self.assertEqual(batch_size, detection_scores.shape.as_list()[0]) + self.assertEqual(tf.float32, detection_scores.dtype) + self.assertEqual(tf.float32, num_detections.dtype) + if model_mode == tf.estimator.ModeKeys.TRAIN: + self.assertIsNotNone(estimator_spec.train_op) + return estimator_spec + + def _assert_model_fn_for_predict(self, configs): + model_config = configs['model'] + + with tf.Graph().as_default(): + features, _ = inputs.create_eval_input_fn( + configs['eval_config'], + configs['eval_input_config'], + configs['model'])() + detection_model_fn = functools.partial( + model_builder.build, model_config=model_config, is_training=False) + + hparams = model_hparams.create_hparams( + hparams_overrides='load_pretrained=false') + + model_fn = model_lib.create_model_fn(detection_model_fn, configs, hparams) + estimator_spec = model_fn(features, None, tf.estimator.ModeKeys.PREDICT) + + self.assertIsNone(estimator_spec.loss) + self.assertIsNone(estimator_spec.train_op) + self.assertIsNotNone(estimator_spec.predictions) + self.assertIsNotNone(estimator_spec.export_outputs) + self.assertIn(tf.saved_model.signature_constants.PREDICT_METHOD_NAME, + estimator_spec.export_outputs) + + def test_model_fn_in_train_mode(self): + """Tests the model function in TRAIN mode.""" + configs = _get_configs_for_model(MODEL_NAME_FOR_TEST) + self._assert_model_fn_for_train_eval(configs, 'train') + + def test_model_fn_in_eval_mode(self): + """Tests the model function in EVAL mode.""" + configs = _get_configs_for_model(MODEL_NAME_FOR_TEST) + self._assert_model_fn_for_train_eval(configs, 'eval') + + def test_model_fn_in_eval_on_train_mode(self): + """Tests the model function in EVAL mode with train data.""" + configs = _get_configs_for_model(MODEL_NAME_FOR_TEST) + self._assert_model_fn_for_train_eval(configs, 'eval_on_train') + + def test_model_fn_in_predict_mode(self): + """Tests the model function in PREDICT mode.""" + configs = _get_configs_for_model(MODEL_NAME_FOR_TEST) + self._assert_model_fn_for_predict(configs) + + def test_create_estimator_and_inputs(self): + """Tests that Estimator and input function are constructed correctly.""" + run_config = tf.estimator.RunConfig() + hparams = model_hparams.create_hparams( + hparams_overrides='load_pretrained=false') + pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST) + train_steps = 20 + eval_steps = 10 + train_and_eval_dict = model_lib.create_estimator_and_inputs( + run_config, + hparams, + pipeline_config_path, + train_steps=train_steps, + eval_steps=eval_steps) + estimator = train_and_eval_dict['estimator'] + train_steps = 
train_and_eval_dict['train_steps'] + eval_steps = train_and_eval_dict['eval_steps'] + self.assertIsInstance(estimator, tf.estimator.Estimator) + self.assertEqual(20, train_steps) + self.assertEqual(10, eval_steps) + self.assertIn('train_input_fn', train_and_eval_dict) + self.assertIn('eval_input_fn', train_and_eval_dict) + self.assertIn('eval_on_train_input_fn', train_and_eval_dict) + + def test_create_estimator_with_default_train_eval_steps(self): + """Tests that number of train/eval defaults to config values.""" + run_config = tf.estimator.RunConfig() + hparams = model_hparams.create_hparams( + hparams_overrides='load_pretrained=false') + pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST) + configs = config_util.get_configs_from_pipeline_file(pipeline_config_path) + config_train_steps = configs['train_config'].num_steps + config_eval_steps = configs['eval_config'].num_examples + train_and_eval_dict = model_lib.create_estimator_and_inputs( + run_config, hparams, pipeline_config_path) + estimator = train_and_eval_dict['estimator'] + train_steps = train_and_eval_dict['train_steps'] + eval_steps = train_and_eval_dict['eval_steps'] + + self.assertIsInstance(estimator, tf.estimator.Estimator) + self.assertEqual(config_train_steps, train_steps) + self.assertEqual(config_eval_steps, eval_steps) + + def test_create_tpu_estimator_and_inputs(self): + """Tests that number of train/eval defaults to config values.""" + + run_config = tpu_config.RunConfig() + hparams = model_hparams.create_hparams( + hparams_overrides='load_pretrained=false') + pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST) + train_steps = 20 + eval_steps = 10 + train_and_eval_dict = model_lib.create_estimator_and_inputs( + run_config, + hparams, + pipeline_config_path, + train_steps=train_steps, + eval_steps=eval_steps, + use_tpu_estimator=True) + estimator = train_and_eval_dict['estimator'] + train_steps = train_and_eval_dict['train_steps'] + eval_steps = train_and_eval_dict['eval_steps'] + + self.assertIsInstance(estimator, tpu_estimator.TPUEstimator) + self.assertEqual(20, train_steps) + self.assertEqual(10, eval_steps) + + def test_create_train_and_eval_specs(self): + """Tests that `TrainSpec` and `EvalSpec` is created correctly.""" + run_config = tf.estimator.RunConfig() + hparams = model_hparams.create_hparams( + hparams_overrides='load_pretrained=false') + pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST) + train_steps = 20 + eval_steps = 10 + eval_on_train_steps = 15 + train_and_eval_dict = model_lib.create_estimator_and_inputs( + run_config, + hparams, + pipeline_config_path, + train_steps=train_steps, + eval_steps=eval_steps) + train_input_fn = train_and_eval_dict['train_input_fn'] + eval_input_fn = train_and_eval_dict['eval_input_fn'] + eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn'] + predict_input_fn = train_and_eval_dict['predict_input_fn'] + train_steps = train_and_eval_dict['train_steps'] + eval_steps = train_and_eval_dict['eval_steps'] + + train_spec, eval_specs = model_lib.create_train_and_eval_specs( + train_input_fn, + eval_input_fn, + eval_on_train_input_fn, + predict_input_fn, + train_steps, + eval_steps, + eval_on_train_data=True, + eval_on_train_steps=eval_on_train_steps, + final_exporter_name='exporter', + eval_spec_name='holdout') + self.assertEqual(train_steps, train_spec.max_steps) + self.assertEqual(2, len(eval_specs)) + self.assertEqual(eval_steps, eval_specs[0].steps) + self.assertEqual('holdout', eval_specs[0].name) + 
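The test above passes an explicit `eval_on_train_steps=15`; when that argument is left as `None`, the second `EvalSpec` falls back to `eval_steps`, because `create_train_and_eval_specs` builds it with `steps=eval_on_train_steps or eval_steps`. A tiny illustration of that defaulting:

```python
def pick_eval_on_train_steps(eval_steps, eval_on_train_steps=None):
  # Mirrors `steps=eval_on_train_steps or eval_steps` in
  # model_lib.create_train_and_eval_specs.
  return eval_on_train_steps or eval_steps

print(pick_eval_on_train_steps(10))      # 10 -- falls back to eval_steps
print(pick_eval_on_train_steps(10, 15))  # 15 -- explicit value wins
```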
self.assertEqual('exporter', eval_specs[0].exporters[0].name) + self.assertEqual(eval_on_train_steps, eval_specs[1].steps) + self.assertEqual('eval_on_train', eval_specs[1].name) + + def test_experiment(self): + """Tests that the `Experiment` object is constructed correctly.""" + run_config = tf.estimator.RunConfig() + hparams = model_hparams.create_hparams( + hparams_overrides='load_pretrained=false') + pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST) + experiment = model_lib.populate_experiment( + run_config, + hparams, + pipeline_config_path, + train_steps=10, + eval_steps=20) + self.assertEqual(10, experiment.train_steps) + self.assertEqual(20, experiment.eval_steps) + + +class UnbatchTensorsTest(tf.test.TestCase): + + def test_unbatch_without_unpadding(self): + image_placeholder = tf.placeholder(tf.float32, [2, None, None, None]) + groundtruth_boxes_placeholder = tf.placeholder(tf.float32, [2, None, None]) + groundtruth_classes_placeholder = tf.placeholder(tf.float32, + [2, None, None]) + groundtruth_weights_placeholder = tf.placeholder(tf.float32, [2, None]) + + tensor_dict = { + fields.InputDataFields.image: + image_placeholder, + fields.InputDataFields.groundtruth_boxes: + groundtruth_boxes_placeholder, + fields.InputDataFields.groundtruth_classes: + groundtruth_classes_placeholder, + fields.InputDataFields.groundtruth_weights: + groundtruth_weights_placeholder + } + unbatched_tensor_dict = model_lib.unstack_batch( + tensor_dict, unpad_groundtruth_tensors=False) + + with self.test_session() as sess: + unbatched_tensor_dict_out = sess.run( + unbatched_tensor_dict, + feed_dict={ + image_placeholder: + np.random.rand(2, 4, 4, 3).astype(np.float32), + groundtruth_boxes_placeholder: + np.random.rand(2, 5, 4).astype(np.float32), + groundtruth_classes_placeholder: + np.random.rand(2, 5, 6).astype(np.float32), + groundtruth_weights_placeholder: + np.random.rand(2, 5).astype(np.float32) + }) + for image_out in unbatched_tensor_dict_out[fields.InputDataFields.image]: + self.assertAllEqual(image_out.shape, [4, 4, 3]) + for groundtruth_boxes_out in unbatched_tensor_dict_out[ + fields.InputDataFields.groundtruth_boxes]: + self.assertAllEqual(groundtruth_boxes_out.shape, [5, 4]) + for groundtruth_classes_out in unbatched_tensor_dict_out[ + fields.InputDataFields.groundtruth_classes]: + self.assertAllEqual(groundtruth_classes_out.shape, [5, 6]) + for groundtruth_weights_out in unbatched_tensor_dict_out[ + fields.InputDataFields.groundtruth_weights]: + self.assertAllEqual(groundtruth_weights_out.shape, [5]) + + def test_unbatch_and_unpad_groundtruth_tensors(self): + image_placeholder = tf.placeholder(tf.float32, [2, None, None, None]) + groundtruth_boxes_placeholder = tf.placeholder(tf.float32, [2, 5, None]) + groundtruth_classes_placeholder = tf.placeholder(tf.float32, [2, 5, None]) + groundtruth_weights_placeholder = tf.placeholder(tf.float32, [2, 5]) + num_groundtruth_placeholder = tf.placeholder(tf.int32, [2]) + + tensor_dict = { + fields.InputDataFields.image: + image_placeholder, + fields.InputDataFields.groundtruth_boxes: + groundtruth_boxes_placeholder, + fields.InputDataFields.groundtruth_classes: + groundtruth_classes_placeholder, + fields.InputDataFields.groundtruth_weights: + groundtruth_weights_placeholder, + fields.InputDataFields.num_groundtruth_boxes: + num_groundtruth_placeholder + } + unbatched_tensor_dict = model_lib.unstack_batch( + tensor_dict, unpad_groundtruth_tensors=True) + with self.test_session() as sess: + unbatched_tensor_dict_out = sess.run( + 
unbatched_tensor_dict, + feed_dict={ + image_placeholder: + np.random.rand(2, 4, 4, 3).astype(np.float32), + groundtruth_boxes_placeholder: + np.random.rand(2, 5, 4).astype(np.float32), + groundtruth_classes_placeholder: + np.random.rand(2, 5, 6).astype(np.float32), + groundtruth_weights_placeholder: + np.random.rand(2, 5).astype(np.float32), + num_groundtruth_placeholder: + np.array([3, 3], np.int32) + }) + for image_out in unbatched_tensor_dict_out[fields.InputDataFields.image]: + self.assertAllEqual(image_out.shape, [4, 4, 3]) + for groundtruth_boxes_out in unbatched_tensor_dict_out[ + fields.InputDataFields.groundtruth_boxes]: + self.assertAllEqual(groundtruth_boxes_out.shape, [3, 4]) + for groundtruth_classes_out in unbatched_tensor_dict_out[ + fields.InputDataFields.groundtruth_classes]: + self.assertAllEqual(groundtruth_classes_out.shape, [3, 6]) + for groundtruth_weights_out in unbatched_tensor_dict_out[ + fields.InputDataFields.groundtruth_weights]: + self.assertAllEqual(groundtruth_weights_out.shape, [3]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/model_main.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/model_main.py new file mode 100644 index 0000000000000000000000000000000000000000..b4bfcf325eb59086cb0bf7965285c2507a4a089e --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/model_main.py @@ -0,0 +1,86 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Binary to run train and evaluation on object detection model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import flags + +import tensorflow as tf + +from object_detection import model_hparams +from object_detection import model_lib + +flags.DEFINE_string( + 'model_dir', None, 'Path to output model directory ' + 'where event and checkpoint files will be written.') +flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config ' + 'file.') +flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.') +flags.DEFINE_integer('num_eval_steps', None, 'Number of train steps.') +flags.DEFINE_string( + 'hparams_overrides', None, 'Hyperparameter overrides, ' + 'represented as a string containing comma-separated ' + 'hparam_name=value pairs.') +flags.DEFINE_string( + 'checkpoint_dir', None, 'Path to directory holding a checkpoint. 
If ' + '`checkpoint_dir` is provided, this binary operates in eval-only mode, ' + 'writing resulting metrics to `model_dir`.') + +FLAGS = flags.FLAGS + + +def main(unused_argv): + flags.mark_flag_as_required('model_dir') + flags.mark_flag_as_required('pipeline_config_path') + config = tf.estimator.RunConfig(model_dir=FLAGS.model_dir) + + train_and_eval_dict = model_lib.create_estimator_and_inputs( + run_config=config, + hparams=model_hparams.create_hparams(FLAGS.hparams_overrides), + pipeline_config_path=FLAGS.pipeline_config_path, + train_steps=FLAGS.num_train_steps, + eval_steps=FLAGS.num_eval_steps) + estimator = train_and_eval_dict['estimator'] + train_input_fn = train_and_eval_dict['train_input_fn'] + eval_input_fn = train_and_eval_dict['eval_input_fn'] + eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn'] + predict_input_fn = train_and_eval_dict['predict_input_fn'] + train_steps = train_and_eval_dict['train_steps'] + eval_steps = train_and_eval_dict['eval_steps'] + + if FLAGS.checkpoint_dir: + estimator.evaluate(eval_input_fn, + eval_steps, + checkpoint_path=tf.train.latest_checkpoint( + FLAGS.checkpoint_dir)) + else: + train_spec, eval_specs = model_lib.create_train_and_eval_specs( + train_input_fn, + eval_input_fn, + eval_on_train_input_fn, + predict_input_fn, + train_steps, + eval_steps, + eval_on_train_data=False) + + # Currently only a single Eval Spec is allowed. + tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0]) + + +if __name__ == '__main__': + tf.app.run() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/model_tpu_main.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/model_tpu_main.py new file mode 100644 index 0000000000000000000000000000000000000000..50f8fb9a29ad0147558ecf998e009323d7fe6458 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/model_tpu_main.py @@ -0,0 +1,135 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +r"""Creates and runs `Estimator` for object detection model on TPUs. + +This uses the TPUEstimator API to define and run a model in TRAIN/EVAL modes. +""" +# pylint: enable=line-too-long + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import flags +import tensorflow as tf + +from tensorflow.contrib.tpu.python.tpu import tpu_config + +from object_detection import model_hparams +from object_detection import model_lib + +tf.flags.DEFINE_bool('use_tpu', True, 'Use TPUs rather than plain CPUs') + +# Cloud TPU Cluster Resolvers +flags.DEFINE_string( + 'gcp_project', + default=None, + help='Project name for the Cloud TPU-enabled project. If not specified, we ' + 'will attempt to automatically detect the GCE project from metadata.') +flags.DEFINE_string( + 'tpu_zone', + default=None, + help='GCE zone where the Cloud TPU is located in. 
If not specified, we ' + 'will attempt to automatically detect the GCE project from metadata.') +flags.DEFINE_string( + 'tpu_name', + default=None, + help='Name of the Cloud TPU for Cluster Resolvers.') + +flags.DEFINE_integer('num_shards', 8, 'Number of shards (TPU cores).') +flags.DEFINE_integer('iterations_per_loop', 100, + 'Number of iterations per TPU training loop.') +# For mode=train_and_eval, evaluation occurs after training is finished. +# Note: independently of steps_per_checkpoint, estimator will save the most +# recent checkpoint every 10 minutes by default for train_and_eval +flags.DEFINE_string('mode', 'train', + 'Mode to run: train, eval') +flags.DEFINE_integer('train_batch_size', None, 'Batch size for training. If ' + 'this is not provided, batch size is read from training ' + 'config.') + +flags.DEFINE_string( + 'hparams_overrides', None, 'Comma-separated list of ' + 'hyperparameters to override defaults.') +flags.DEFINE_boolean('eval_training_data', False, + 'If training data should be evaluated for this job.') +flags.DEFINE_string( + 'model_dir', None, 'Path to output model directory ' + 'where event and checkpoint files will be written.') +flags.DEFINE_string('pipeline_config_path', None, 'Path to pipeline config ' + 'file.') +flags.DEFINE_integer('num_train_steps', None, 'Number of train steps.') +flags.DEFINE_integer('num_eval_steps', None, 'Number of train steps.') + +FLAGS = tf.flags.FLAGS + + +def main(unused_argv): + flags.mark_flag_as_required('model_dir') + flags.mark_flag_as_required('pipeline_config_path') + + tpu_cluster_resolver = ( + tf.contrib.cluster_resolver.python.training.TPUClusterResolver( + tpu_names=[FLAGS.tpu_name], + zone=FLAGS.tpu_zone, + project=FLAGS.gcp_project)) + tpu_grpc_url = tpu_cluster_resolver.get_master() + + config = tpu_config.RunConfig( + master=tpu_grpc_url, + evaluation_master=tpu_grpc_url, + model_dir=FLAGS.model_dir, + tpu_config=tpu_config.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_shards)) + + kwargs = {} + if FLAGS.train_batch_size: + kwargs['batch_size'] = FLAGS.train_batch_size + + train_and_eval_dict = model_lib.create_estimator_and_inputs( + run_config=config, + hparams=model_hparams.create_hparams(FLAGS.hparams_overrides), + pipeline_config_path=FLAGS.pipeline_config_path, + train_steps=FLAGS.num_train_steps, + eval_steps=FLAGS.num_eval_steps, + use_tpu_estimator=True, + use_tpu=FLAGS.use_tpu, + num_shards=FLAGS.num_shards, + **kwargs) + estimator = train_and_eval_dict['estimator'] + train_input_fn = train_and_eval_dict['train_input_fn'] + eval_input_fn = train_and_eval_dict['eval_input_fn'] + eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn'] + train_steps = train_and_eval_dict['train_steps'] + eval_steps = train_and_eval_dict['eval_steps'] + + if FLAGS.mode == 'train': + estimator.train(input_fn=train_input_fn, max_steps=train_steps) + + # Continuously evaluating. 
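The eval branch just below hands off to `model_lib.continuous_eval`, which decides when to stop by reading the global step out of the checkpoint filename. A quick sketch of that parsing (the checkpoint path is a placeholder):

```python
import os

# TF checkpoints are written as '<prefix>-<global_step>' plus data/index files.
ckpt = '/tmp/mask_rcnn_model/model.ckpt-120000'

# Same expression continuous_eval uses to decide whether train_steps is reached.
current_step = int(os.path.basename(ckpt).split('-')[1])
print(current_step)  # 120000
```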
+ if FLAGS.mode == 'eval': + if FLAGS.eval_training_data: + name = 'training_data' + input_fn = eval_on_train_input_fn + else: + name = 'validation_data' + input_fn = eval_input_fn + model_lib.continuous_eval(estimator, FLAGS.model_dir, input_fn, eval_steps, + train_steps, name) + + +if __name__ == '__main__': + tf.app.run() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..f0cad235408666d386454cccc64a264f25f58c29 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor.py @@ -0,0 +1,165 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Embedded-friendly SSDFeatureExtractor for MobilenetV1 features.""" + +import tensorflow as tf + +from object_detection.meta_architectures import ssd_meta_arch +from object_detection.models import feature_map_generators +from object_detection.utils import context_manager +from object_detection.utils import ops +from nets import mobilenet_v1 + +slim = tf.contrib.slim + + +class EmbeddedSSDMobileNetV1FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): + """Embedded-friendly SSD Feature Extractor using MobilenetV1 features. + + This feature extractor is similar to SSD MobileNetV1 feature extractor, and + it fixes input resolution to be 256x256, reduces the number of feature maps + used for box prediction and ensures convolution kernel to be no larger + than input tensor in spatial dimensions. + + This feature extractor requires support of the following ops if used in + embedded devices: + - Conv + - DepthwiseConv + - Relu6 + + All conv/depthwiseconv use SAME padding, and no additional spatial padding is + needed. + """ + + def __init__(self, + is_training, + depth_multiplier, + min_depth, + pad_to_multiple, + conv_hyperparams_fn, + reuse_weights=None, + use_explicit_padding=False, + use_depthwise=False, + override_base_feature_extractor_hyperparams=False): + """MobileNetV1 Feature Extractor for Embedded-friendly SSD Models. + + Args: + is_training: whether the network is in training mode. + depth_multiplier: float depth multiplier for feature extractor. + min_depth: minimum feature extractor depth. + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. For EmbeddedSSD it must be set to 1. 
+ conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d + and separable_conv2d ops in the layers that are added on top of the + base feature extractor. + reuse_weights: Whether to reuse variables. Default is None. + use_explicit_padding: Whether to use explicit padding when extracting + features. Default is False. + use_depthwise: Whether to use depthwise convolutions. Default is False. + override_base_feature_extractor_hyperparams: Whether to override + hyperparameters of the base feature extractor with the one from + `conv_hyperparams_fn`. + + Raises: + ValueError: upon invalid `pad_to_multiple` values. + """ + if pad_to_multiple != 1: + raise ValueError('Embedded-specific SSD only supports `pad_to_multiple` ' + 'of 1.') + + super(EmbeddedSSDMobileNetV1FeatureExtractor, self).__init__( + is_training, depth_multiplier, min_depth, pad_to_multiple, + conv_hyperparams_fn, reuse_weights, use_explicit_padding, use_depthwise, + override_base_feature_extractor_hyperparams) + + def preprocess(self, resized_inputs): + """SSD preprocessing. + + Maps pixel values to the range [-1, 1]. + + Args: + resized_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + """ + return (2.0 / 255.0) * resized_inputs - 1.0 + + def extract_features(self, preprocessed_inputs): + """Extract features from preprocessed inputs. + + Args: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + feature_maps: a list of tensors where the ith tensor has shape + [batch, height_i, width_i, depth_i] + + Raises: + ValueError: if image height or width are not 256 pixels. 
+ """ + image_shape = preprocessed_inputs.get_shape() + image_shape.assert_has_rank(4) + image_height = image_shape[1].value + image_width = image_shape[2].value + + if image_height is None or image_width is None: + shape_assert = tf.Assert( + tf.logical_and(tf.equal(tf.shape(preprocessed_inputs)[1], 256), + tf.equal(tf.shape(preprocessed_inputs)[2], 256)), + ['image size must be 256 in both height and width.']) + with tf.control_dependencies([shape_assert]): + preprocessed_inputs = tf.identity(preprocessed_inputs) + elif image_height != 256 or image_width != 256: + raise ValueError('image size must be = 256 in both height and width;' + ' image dim = %d,%d' % (image_height, image_width)) + + feature_map_layout = { + 'from_layer': [ + 'Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '', '' + ], + 'layer_depth': [-1, -1, 512, 256, 256], + 'conv_kernel_size': [-1, -1, 3, 3, 2], + 'use_explicit_padding': self._use_explicit_padding, + 'use_depthwise': self._use_depthwise, + } + + with tf.variable_scope('MobilenetV1', + reuse=self._reuse_weights) as scope: + with slim.arg_scope( + mobilenet_v1.mobilenet_v1_arg_scope(is_training=None)): + with (slim.arg_scope(self._conv_hyperparams_fn()) + if self._override_base_feature_extractor_hyperparams + else context_manager.IdentityContextManager()): + _, image_features = mobilenet_v1.mobilenet_v1_base( + ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple), + final_endpoint='Conv2d_13_pointwise', + min_depth=self._min_depth, + depth_multiplier=self._depth_multiplier, + use_explicit_padding=self._use_explicit_padding, + scope=scope) + with slim.arg_scope(self._conv_hyperparams_fn()): + feature_maps = feature_map_generators.multi_resolution_feature_maps( + feature_map_layout=feature_map_layout, + depth_multiplier=self._depth_multiplier, + min_depth=self._min_depth, + insert_1x1_conv=True, + image_features=image_features) + + return feature_maps.values() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1fee66c866b356753c8b9d712a705763f7a76638 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/embedded_ssd_mobilenet_v1_feature_extractor_test.py @@ -0,0 +1,129 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for embedded_ssd_mobilenet_v1_feature_extractor.""" +import numpy as np +import tensorflow as tf + +from object_detection.models import embedded_ssd_mobilenet_v1_feature_extractor +from object_detection.models import ssd_feature_extractor_test + + +class EmbeddedSSDMobileNetV1FeatureExtractorTest( + ssd_feature_extractor_test.SsdFeatureExtractorTestBase): + + def _create_feature_extractor(self, depth_multiplier, pad_to_multiple, + is_training=True): + """Constructs a new feature extractor. + + Args: + depth_multiplier: float depth multiplier for feature extractor + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + is_training: whether the network is in training mode. + + Returns: + an ssd_meta_arch.SSDFeatureExtractor object. + """ + min_depth = 32 + return (embedded_ssd_mobilenet_v1_feature_extractor. + EmbeddedSSDMobileNetV1FeatureExtractor( + is_training, depth_multiplier, min_depth, pad_to_multiple, + self.conv_hyperparams_fn, + override_base_feature_extractor_hyperparams=True)) + + def test_extract_features_returns_correct_shapes_256(self): + image_height = 256 + image_width = 256 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 16, 16, 512), (2, 8, 8, 1024), + (2, 4, 4, 512), (2, 2, 2, 256), + (2, 1, 1, 256)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_with_dynamic_inputs(self): + image_height = 256 + image_width = 256 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 16, 16, 512), (2, 8, 8, 1024), + (2, 4, 4, 512), (2, 2, 2, 256), + (2, 1, 1, 256)] + self.check_extract_features_returns_correct_shapes_with_dynamic_inputs( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_enforcing_min_depth(self): + image_height = 256 + image_width = 256 + depth_multiplier = 0.5**12 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 16, 16, 32), (2, 8, 8, 32), (2, 4, 4, 32), + (2, 2, 2, 32), (2, 1, 1, 32)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_with_pad_to_multiple_of_1( + self): + image_height = 256 + image_width = 256 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 16, 16, 512), (2, 8, 8, 1024), + (2, 4, 4, 512), (2, 2, 2, 256), + (2, 1, 1, 256)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_raises_error_with_pad_to_multiple_not_1(self): + depth_multiplier = 1.0 + pad_to_multiple = 2 + with self.assertRaises(ValueError): + _ = self._create_feature_extractor(depth_multiplier, pad_to_multiple) + + def test_extract_features_raises_error_with_invalid_image_size(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1.0 + pad_to_multiple = 1 + self.check_extract_features_raises_error_with_invalid_image_size( + image_height, image_width, depth_multiplier, pad_to_multiple) + + def test_preprocess_returns_correct_value_range(self): + image_height = 256 + image_width = 256 + depth_multiplier = 1 + 
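+    # preprocess() maps pixel values with (2.0 / 255.0) * x - 1.0, so for
+    # inputs in [0, 255] every preprocessed value should fall inside [-1, 1];
+    # the assertion below checks exactly that bound.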
pad_to_multiple = 1 + test_image = np.random.rand(4, image_height, image_width, 3) + feature_extractor = self._create_feature_extractor(depth_multiplier, + pad_to_multiple) + preprocessed_image = feature_extractor.preprocess(test_image) + self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0))) + + def test_variables_only_created_in_scope(self): + depth_multiplier = 1 + pad_to_multiple = 1 + scope_name = 'MobilenetV1' + self.check_feature_extractor_variables_under_scope( + depth_multiplier, pad_to_multiple, scope_name) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_inception_resnet_v2_feature_extractor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_inception_resnet_v2_feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..30b3dd4eccb29b60add46a0c91673c00bcbbdf01 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_inception_resnet_v2_feature_extractor.py @@ -0,0 +1,213 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Inception Resnet v2 Faster R-CNN implementation. + +See "Inception-v4, Inception-ResNet and the Impact of Residual Connections on +Learning" by Szegedy et al. (https://arxiv.org/abs/1602.07261) +as well as +"Speed/accuracy trade-offs for modern convolutional object detectors" by +Huang et al. (https://arxiv.org/abs/1611.10012) +""" + +import tensorflow as tf + +from object_detection.meta_architectures import faster_rcnn_meta_arch +from nets import inception_resnet_v2 + +slim = tf.contrib.slim + + +class FasterRCNNInceptionResnetV2FeatureExtractor( + faster_rcnn_meta_arch.FasterRCNNFeatureExtractor): + """Faster R-CNN with Inception Resnet v2 feature extractor implementation.""" + + def __init__(self, + is_training, + first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0): + """Constructor. + + Args: + is_training: See base class. + first_stage_features_stride: See base class. + batch_norm_trainable: See base class. + reuse_weights: See base class. + weight_decay: See base class. + + Raises: + ValueError: If `first_stage_features_stride` is not 8 or 16. + """ + if first_stage_features_stride != 8 and first_stage_features_stride != 16: + raise ValueError('`first_stage_features_stride` must be 8 or 16.') + super(FasterRCNNInceptionResnetV2FeatureExtractor, self).__init__( + is_training, first_stage_features_stride, batch_norm_trainable, + reuse_weights, weight_decay) + + def preprocess(self, resized_inputs): + """Faster R-CNN with Inception Resnet v2 preprocessing. + + Maps pixel values to the range [-1, 1]. + + Args: + resized_inputs: A [batch, height_in, width_in, channels] float32 tensor + representing a batch of images with values between 0 and 255.0. 
+ + Returns: + preprocessed_inputs: A [batch, height_out, width_out, channels] float32 + tensor representing a batch of images. + + """ + return (2.0 / 255.0) * resized_inputs - 1.0 + + def _extract_proposal_features(self, preprocessed_inputs, scope): + """Extracts first stage RPN features. + + Extracts features using the first half of the Inception Resnet v2 network. + We construct the network in `align_feature_maps=True` mode, which means + that all VALID paddings in the network are changed to SAME padding so that + the feature maps are aligned. + + Args: + preprocessed_inputs: A [batch, height, width, channels] float32 tensor + representing a batch of images. + scope: A scope name. + + Returns: + rpn_feature_map: A tensor with shape [batch, height, width, depth] + Raises: + InvalidArgumentError: If the spatial size of `preprocessed_inputs` + (height or width) is less than 33. + ValueError: If the created network is missing the required activation. + """ + if len(preprocessed_inputs.get_shape().as_list()) != 4: + raise ValueError('`preprocessed_inputs` must be 4 dimensional, got a ' + 'tensor of shape %s' % preprocessed_inputs.get_shape()) + + with slim.arg_scope(inception_resnet_v2.inception_resnet_v2_arg_scope( + weight_decay=self._weight_decay)): + # Forces is_training to False to disable batch norm update. + with slim.arg_scope([slim.batch_norm], + is_training=self._train_batch_norm): + with tf.variable_scope('InceptionResnetV2', + reuse=self._reuse_weights) as scope: + return inception_resnet_v2.inception_resnet_v2_base( + preprocessed_inputs, final_endpoint='PreAuxLogits', + scope=scope, output_stride=self._first_stage_features_stride, + align_feature_maps=True) + + def _extract_box_classifier_features(self, proposal_feature_maps, scope): + """Extracts second stage box classifier features. + + This function reconstructs the "second half" of the Inception ResNet v2 + network after the part defined in `_extract_proposal_features`. + + Args: + proposal_feature_maps: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, crop_height, crop_width, depth] + representing the feature map cropped to each proposal. + scope: A scope name. + + Returns: + proposal_classifier_features: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, height, width, depth] + representing box classifier features for each proposal. + """ + with tf.variable_scope('InceptionResnetV2', reuse=self._reuse_weights): + with slim.arg_scope(inception_resnet_v2.inception_resnet_v2_arg_scope( + weight_decay=self._weight_decay)): + # Forces is_training to False to disable batch norm update. 
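+        # Note: `is_training` below is driven by self._train_batch_norm rather
+        # than hard-coded to False, so batch norm statistics are expected to
+        # update only when the extractor is built with batch_norm_trainable=True
+        # (see the constructor and the meta-arch base class).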
+ with slim.arg_scope([slim.batch_norm], + is_training=self._train_batch_norm): + with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], + stride=1, padding='SAME'): + with tf.variable_scope('Mixed_7a'): + with tf.variable_scope('Branch_0'): + tower_conv = slim.conv2d(proposal_feature_maps, + 256, 1, scope='Conv2d_0a_1x1') + tower_conv_1 = slim.conv2d( + tower_conv, 384, 3, stride=2, + padding='VALID', scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_1'): + tower_conv1 = slim.conv2d( + proposal_feature_maps, 256, 1, scope='Conv2d_0a_1x1') + tower_conv1_1 = slim.conv2d( + tower_conv1, 288, 3, stride=2, + padding='VALID', scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_2'): + tower_conv2 = slim.conv2d( + proposal_feature_maps, 256, 1, scope='Conv2d_0a_1x1') + tower_conv2_1 = slim.conv2d(tower_conv2, 288, 3, + scope='Conv2d_0b_3x3') + tower_conv2_2 = slim.conv2d( + tower_conv2_1, 320, 3, stride=2, + padding='VALID', scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_3'): + tower_pool = slim.max_pool2d( + proposal_feature_maps, 3, stride=2, padding='VALID', + scope='MaxPool_1a_3x3') + net = tf.concat( + [tower_conv_1, tower_conv1_1, tower_conv2_2, tower_pool], 3) + net = slim.repeat(net, 9, inception_resnet_v2.block8, scale=0.20) + net = inception_resnet_v2.block8(net, activation_fn=None) + proposal_classifier_features = slim.conv2d( + net, 1536, 1, scope='Conv2d_7b_1x1') + return proposal_classifier_features + + def restore_from_classification_checkpoint_fn( + self, + first_stage_feature_extractor_scope, + second_stage_feature_extractor_scope): + """Returns a map of variables to load from a foreign checkpoint. + + Note that this overrides the default implementation in + faster_rcnn_meta_arch.FasterRCNNFeatureExtractor which does not work for + InceptionResnetV2 checkpoints. + + TODO(jonathanhuang,rathodv): revisit whether it's possible to force the + `Repeat` namescope as created in `_extract_box_classifier_features` to + start counting at 2 (e.g. `Repeat_2`) so that the default restore_fn can + be used. + + Args: + first_stage_feature_extractor_scope: A scope name for the first stage + feature extractor. + second_stage_feature_extractor_scope: A scope name for the second stage + feature extractor. + + Returns: + A dict mapping variable names (to load from a checkpoint) to variables in + the model graph. 
+ """ + + variables_to_restore = {} + for variable in tf.global_variables(): + if variable.op.name.startswith( + first_stage_feature_extractor_scope): + var_name = variable.op.name.replace( + first_stage_feature_extractor_scope + '/', '') + variables_to_restore[var_name] = variable + if variable.op.name.startswith( + second_stage_feature_extractor_scope): + var_name = variable.op.name.replace( + second_stage_feature_extractor_scope + + '/InceptionResnetV2/Repeat', 'InceptionResnetV2/Repeat_2') + var_name = var_name.replace( + second_stage_feature_extractor_scope + '/', '') + variables_to_restore[var_name] = variable + return variables_to_restore + diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_inception_resnet_v2_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_inception_resnet_v2_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1d9f088f3cbcb434f7305341cb32c5eb2ce35bf5 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_inception_resnet_v2_feature_extractor_test.py @@ -0,0 +1,109 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for models.faster_rcnn_inception_resnet_v2_feature_extractor.""" + +import tensorflow as tf + +from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extractor as frcnn_inc_res + + +class FasterRcnnInceptionResnetV2FeatureExtractorTest(tf.test.TestCase): + + def _build_feature_extractor(self, first_stage_features_stride): + return frcnn_inc_res.FasterRCNNInceptionResnetV2FeatureExtractor( + is_training=False, + first_stage_features_stride=first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0) + + def test_extract_proposal_features_returns_expected_size(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [1, 299, 299, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [1, 19, 19, 1088]) + + def test_extract_proposal_features_stride_eight(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=8) + preprocessed_inputs = tf.random_uniform( + [1, 224, 224, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [1, 28, 28, 1088]) + + def test_extract_proposal_features_half_size_input(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [1, 112, 112, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [1, 7, 7, 1088]) + + def test_extract_proposal_features_dies_on_invalid_stride(self): + with self.assertRaises(ValueError): + self._build_feature_extractor(first_stage_features_stride=99) + + def test_extract_proposal_features_dies_with_incorrect_rank_inputs(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [224, 224, 3], maxval=255, dtype=tf.float32) + with self.assertRaises(ValueError): + feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + + def test_extract_box_classifier_features_returns_expected_size(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + proposal_feature_maps = tf.random_uniform( + [2, 17, 17, 1088], maxval=255, dtype=tf.float32) + proposal_classifier_features = ( + feature_extractor.extract_box_classifier_features( + proposal_feature_maps, scope='TestScope')) + features_shape = tf.shape(proposal_classifier_features) + + init_op = tf.global_variables_initializer() + with 
self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [2, 8, 8, 1536]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_inception_v2_feature_extractor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_inception_v2_feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..fe0675169bfd69611851680d5da81e002fe1b959 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_inception_v2_feature_extractor.py @@ -0,0 +1,254 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Inception V2 Faster R-CNN implementation. + +See "Rethinking the Inception Architecture for Computer Vision" +https://arxiv.org/abs/1512.00567 +""" +import tensorflow as tf + +from object_detection.meta_architectures import faster_rcnn_meta_arch +from nets import inception_v2 + +slim = tf.contrib.slim + + +def _batch_norm_arg_scope(list_ops, + use_batch_norm=True, + batch_norm_decay=0.9997, + batch_norm_epsilon=0.001, + batch_norm_scale=False, + train_batch_norm=False): + """Slim arg scope for InceptionV2 batch norm.""" + if use_batch_norm: + batch_norm_params = { + 'is_training': train_batch_norm, + 'scale': batch_norm_scale, + 'decay': batch_norm_decay, + 'epsilon': batch_norm_epsilon + } + normalizer_fn = slim.batch_norm + else: + normalizer_fn = None + batch_norm_params = None + + return slim.arg_scope(list_ops, + normalizer_fn=normalizer_fn, + normalizer_params=batch_norm_params) + + +class FasterRCNNInceptionV2FeatureExtractor( + faster_rcnn_meta_arch.FasterRCNNFeatureExtractor): + """Faster R-CNN Inception V2 feature extractor implementation.""" + + def __init__(self, + is_training, + first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0, + depth_multiplier=1.0, + min_depth=16): + """Constructor. + + Args: + is_training: See base class. + first_stage_features_stride: See base class. + batch_norm_trainable: See base class. + reuse_weights: See base class. + weight_decay: See base class. + depth_multiplier: float depth multiplier for feature extractor. + min_depth: minimum feature extractor depth. + + Raises: + ValueError: If `first_stage_features_stride` is not 8 or 16. + """ + if first_stage_features_stride != 8 and first_stage_features_stride != 16: + raise ValueError('`first_stage_features_stride` must be 8 or 16.') + self._depth_multiplier = depth_multiplier + self._min_depth = min_depth + super(FasterRCNNInceptionV2FeatureExtractor, self).__init__( + is_training, first_stage_features_stride, batch_norm_trainable, + reuse_weights, weight_decay) + + def preprocess(self, resized_inputs): + """Faster R-CNN Inception V2 preprocessing. 
+ + Maps pixel values to the range [-1, 1]. + + Args: + resized_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + """ + return (2.0 / 255.0) * resized_inputs - 1.0 + + def _extract_proposal_features(self, preprocessed_inputs, scope): + """Extracts first stage RPN features. + + Args: + preprocessed_inputs: A [batch, height, width, channels] float32 tensor + representing a batch of images. + scope: A scope name. + + Returns: + rpn_feature_map: A tensor with shape [batch, height, width, depth] + activations: A dictionary mapping feature extractor tensor names to + tensors + + Raises: + InvalidArgumentError: If the spatial size of `preprocessed_inputs` + (height or width) is less than 33. + ValueError: If the created network is missing the required activation. + """ + + preprocessed_inputs.get_shape().assert_has_rank(4) + shape_assert = tf.Assert( + tf.logical_and(tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33), + tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)), + ['image size must at least be 33 in both height and width.']) + + with tf.control_dependencies([shape_assert]): + with tf.variable_scope('InceptionV2', + reuse=self._reuse_weights) as scope: + with _batch_norm_arg_scope([slim.conv2d, slim.separable_conv2d], + batch_norm_scale=True, + train_batch_norm=self._train_batch_norm): + _, activations = inception_v2.inception_v2_base( + preprocessed_inputs, + final_endpoint='Mixed_4e', + min_depth=self._min_depth, + depth_multiplier=self._depth_multiplier, + scope=scope) + + return activations['Mixed_4e'], activations + + def _extract_box_classifier_features(self, proposal_feature_maps, scope): + """Extracts second stage box classifier features. + + Args: + proposal_feature_maps: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, crop_height, crop_width, depth] + representing the feature map cropped to each proposal. + scope: A scope name (unused). + + Returns: + proposal_classifier_features: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, height, width, depth] + representing box classifier features for each proposal. 
+ """ + net = proposal_feature_maps + + depth = lambda d: max(int(d * self._depth_multiplier), self._min_depth) + trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev) + + data_format = 'NHWC' + concat_dim = 3 if data_format == 'NHWC' else 1 + + with tf.variable_scope('InceptionV2', reuse=self._reuse_weights): + with slim.arg_scope( + [slim.conv2d, slim.max_pool2d, slim.avg_pool2d], + stride=1, + padding='SAME', + data_format=data_format): + with _batch_norm_arg_scope([slim.conv2d, slim.separable_conv2d], + batch_norm_scale=True, + train_batch_norm=self._train_batch_norm): + + with tf.variable_scope('Mixed_5a'): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d( + net, depth(128), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_0 = slim.conv2d(branch_0, depth(192), [3, 3], stride=2, + scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(192), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(256), [3, 3], + scope='Conv2d_0b_3x3') + branch_1 = slim.conv2d(branch_1, depth(256), [3, 3], stride=2, + scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.max_pool2d(net, [3, 3], stride=2, + scope='MaxPool_1a_3x3') + net = tf.concat([branch_0, branch_1, branch_2], concat_dim) + + with tf.variable_scope('Mixed_5b'): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(352), [1, 1], + scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(192), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(320), [3, 3], + scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d( + net, depth(160), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(224), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(224), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(128), [1, 1], + weights_initializer=trunc_normal(0.1), + scope='Conv2d_0b_1x1') + net = tf.concat([branch_0, branch_1, branch_2, branch_3], + concat_dim) + + with tf.variable_scope('Mixed_5c'): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(352), [1, 1], + scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(192), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(320), [3, 3], + scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d( + net, depth(192), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(224), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(224), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(128), [1, 1], + weights_initializer=trunc_normal(0.1), + scope='Conv2d_0b_1x1') + proposal_classifier_features = tf.concat( + [branch_0, branch_1, branch_2, branch_3], concat_dim) + + return proposal_classifier_features diff --git 
a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_inception_v2_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_inception_v2_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..6b5bc2f9be3c569d1fc6e5400833066aa42e7559 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_inception_v2_feature_extractor_test.py @@ -0,0 +1,126 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for faster_rcnn_inception_v2_feature_extractor.""" + +import numpy as np +import tensorflow as tf + +from object_detection.models import faster_rcnn_inception_v2_feature_extractor as faster_rcnn_inception_v2 + + +class FasterRcnnInceptionV2FeatureExtractorTest(tf.test.TestCase): + + def _build_feature_extractor(self, first_stage_features_stride): + return faster_rcnn_inception_v2.FasterRCNNInceptionV2FeatureExtractor( + is_training=False, + first_stage_features_stride=first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0) + + def test_extract_proposal_features_returns_expected_size(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [4, 224, 224, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [4, 14, 14, 576]) + + def test_extract_proposal_features_stride_eight(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=8) + preprocessed_inputs = tf.random_uniform( + [4, 224, 224, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [4, 14, 14, 576]) + + def test_extract_proposal_features_half_size_input(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [1, 112, 112, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = 
sess.run(features_shape) + self.assertAllEqual(features_shape_out, [1, 7, 7, 576]) + + def test_extract_proposal_features_dies_on_invalid_stride(self): + with self.assertRaises(ValueError): + self._build_feature_extractor(first_stage_features_stride=99) + + def test_extract_proposal_features_dies_on_very_small_images(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3)) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + with self.assertRaises(tf.errors.InvalidArgumentError): + sess.run( + features_shape, + feed_dict={preprocessed_inputs: np.random.rand(4, 32, 32, 3)}) + + def test_extract_proposal_features_dies_with_incorrect_rank_inputs(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [224, 224, 3], maxval=255, dtype=tf.float32) + with self.assertRaises(ValueError): + feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + + def test_extract_box_classifier_features_returns_expected_size(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + proposal_feature_maps = tf.random_uniform( + [3, 14, 14, 576], maxval=255, dtype=tf.float32) + proposal_classifier_features = ( + feature_extractor.extract_box_classifier_features( + proposal_feature_maps, scope='TestScope')) + features_shape = tf.shape(proposal_classifier_features) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [3, 7, 7, 1024]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_mobilenet_v1_feature_extractor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_mobilenet_v1_feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..52c744b8293b114941edada92c91f9c3f5d8dcd9 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_mobilenet_v1_feature_extractor.py @@ -0,0 +1,194 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Mobilenet v1 Faster R-CNN implementation.""" +import numpy as np + +import tensorflow as tf + +from object_detection.meta_architectures import faster_rcnn_meta_arch +from object_detection.utils import shape_utils +from nets import mobilenet_v1 + +slim = tf.contrib.slim + + +def _get_mobilenet_conv_no_last_stride_defs(conv_depth_ratio_in_percentage): + if conv_depth_ratio_in_percentage not in [25, 50, 75, 100]: + raise ValueError( + 'Only the following ratio percentages are supported: 25, 50, 75, 100') + conv_depth_ratio_in_percentage = float(conv_depth_ratio_in_percentage) / 100.0 + channels = np.array([ + 32, 64, 128, 128, 256, 256, 512, 512, 512, 512, 512, 512, 1024, 1024 + ], dtype=np.float32) + channels = (channels * conv_depth_ratio_in_percentage).astype(np.int32) + return [ + mobilenet_v1.Conv(kernel=[3, 3], stride=2, depth=channels[0]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[1]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=channels[2]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[3]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=channels[4]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[5]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=2, depth=channels[6]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[7]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[8]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[9]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[10]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[11]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[12]), + mobilenet_v1.DepthSepConv(kernel=[3, 3], stride=1, depth=channels[13]) + ] + + +class FasterRCNNMobilenetV1FeatureExtractor( + faster_rcnn_meta_arch.FasterRCNNFeatureExtractor): + """Faster R-CNN Mobilenet V1 feature extractor implementation.""" + + def __init__(self, + is_training, + first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0, + depth_multiplier=1.0, + min_depth=16, + skip_last_stride=False, + conv_depth_ratio_in_percentage=100): + """Constructor. + + Args: + is_training: See base class. + first_stage_features_stride: See base class. + batch_norm_trainable: See base class. + reuse_weights: See base class. + weight_decay: See base class. + depth_multiplier: float depth multiplier for feature extractor. + min_depth: minimum feature extractor depth. + skip_last_stride: Skip the last stride if True. + conv_depth_ratio_in_percentage: Conv depth ratio in percentage. Only + applied if skip_last_stride is True. + + Raises: + ValueError: If `first_stage_features_stride` is not 8 or 16. + """ + if first_stage_features_stride != 8 and first_stage_features_stride != 16: + raise ValueError('`first_stage_features_stride` must be 8 or 16.') + self._depth_multiplier = depth_multiplier + self._min_depth = min_depth + self._skip_last_stride = skip_last_stride + self._conv_depth_ratio_in_percentage = conv_depth_ratio_in_percentage + super(FasterRCNNMobilenetV1FeatureExtractor, self).__init__( + is_training, first_stage_features_stride, batch_norm_trainable, + reuse_weights, weight_decay) + + def preprocess(self, resized_inputs): + """Faster R-CNN Mobilenet V1 preprocessing. + + Maps pixel values to the range [-1, 1]. 
+ + Args: + resized_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + """ + return (2.0 / 255.0) * resized_inputs - 1.0 + + def _extract_proposal_features(self, preprocessed_inputs, scope): + """Extracts first stage RPN features. + + Args: + preprocessed_inputs: A [batch, height, width, channels] float32 tensor + representing a batch of images. + scope: A scope name. + + Returns: + rpn_feature_map: A tensor with shape [batch, height, width, depth] + activations: A dictionary mapping feature extractor tensor names to + tensors + + Raises: + InvalidArgumentError: If the spatial size of `preprocessed_inputs` + (height or width) is less than 33. + ValueError: If the created network is missing the required activation. + """ + + preprocessed_inputs.get_shape().assert_has_rank(4) + preprocessed_inputs = shape_utils.check_min_image_dim( + min_dim=33, image_tensor=preprocessed_inputs) + + with slim.arg_scope( + mobilenet_v1.mobilenet_v1_arg_scope( + is_training=self._train_batch_norm, + weight_decay=self._weight_decay)): + with tf.variable_scope('MobilenetV1', + reuse=self._reuse_weights) as scope: + params = {} + if self._skip_last_stride: + params['conv_defs'] = _get_mobilenet_conv_no_last_stride_defs( + conv_depth_ratio_in_percentage=self. + _conv_depth_ratio_in_percentage) + _, activations = mobilenet_v1.mobilenet_v1_base( + preprocessed_inputs, + final_endpoint='Conv2d_11_pointwise', + min_depth=self._min_depth, + depth_multiplier=self._depth_multiplier, + scope=scope, + **params) + return activations['Conv2d_11_pointwise'], activations + + def _extract_box_classifier_features(self, proposal_feature_maps, scope): + """Extracts second stage box classifier features. + + Args: + proposal_feature_maps: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, crop_height, crop_width, depth] + representing the feature map cropped to each proposal. + scope: A scope name (unused). + + Returns: + proposal_classifier_features: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, height, width, depth] + representing box classifier features for each proposal. 
+ """ + net = proposal_feature_maps + + conv_depth = 1024 + if self._skip_last_stride: + conv_depth_ratio = float(self._conv_depth_ratio_in_percentage) / 100.0 + conv_depth = int(float(conv_depth) * conv_depth_ratio) + + depth = lambda d: max(int(d * 1.0), 16) + with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights): + with slim.arg_scope( + mobilenet_v1.mobilenet_v1_arg_scope( + is_training=self._train_batch_norm, + weight_decay=self._weight_decay)): + with slim.arg_scope( + [slim.conv2d, slim.separable_conv2d], padding='SAME'): + net = slim.separable_conv2d( + net, + depth(conv_depth), [3, 3], + depth_multiplier=1, + stride=2, + scope='Conv2d_12_pointwise') + return slim.separable_conv2d( + net, + depth(conv_depth), [3, 3], + depth_multiplier=1, + stride=1, + scope='Conv2d_13_pointwise') diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_mobilenet_v1_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_mobilenet_v1_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..fcefe616f6f938d11ef333559b213a68ed206ec5 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_mobilenet_v1_feature_extractor_test.py @@ -0,0 +1,126 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for faster_rcnn_mobilenet_v1_feature_extractor.""" + +import numpy as np +import tensorflow as tf + +from object_detection.models import faster_rcnn_mobilenet_v1_feature_extractor as faster_rcnn_mobilenet_v1 + + +class FasterRcnnMobilenetV1FeatureExtractorTest(tf.test.TestCase): + + def _build_feature_extractor(self, first_stage_features_stride): + return faster_rcnn_mobilenet_v1.FasterRCNNMobilenetV1FeatureExtractor( + is_training=False, + first_stage_features_stride=first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0) + + def test_extract_proposal_features_returns_expected_size(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [4, 224, 224, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [4, 14, 14, 512]) + + def test_extract_proposal_features_stride_eight(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=8) + preprocessed_inputs = tf.random_uniform( + [4, 224, 224, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [4, 14, 14, 512]) + + def test_extract_proposal_features_half_size_input(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [1, 112, 112, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [1, 7, 7, 512]) + + def test_extract_proposal_features_dies_on_invalid_stride(self): + with self.assertRaises(ValueError): + self._build_feature_extractor(first_stage_features_stride=99) + + def test_extract_proposal_features_dies_on_very_small_images(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3)) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + with self.assertRaises(tf.errors.InvalidArgumentError): + sess.run( + features_shape, + feed_dict={preprocessed_inputs: np.random.rand(4, 32, 32, 3)}) + + def test_extract_proposal_features_dies_with_incorrect_rank_inputs(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [224, 224, 3], maxval=255, 
dtype=tf.float32) + with self.assertRaises(ValueError): + feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + + def test_extract_box_classifier_features_returns_expected_size(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + proposal_feature_maps = tf.random_uniform( + [3, 14, 14, 576], maxval=255, dtype=tf.float32) + proposal_classifier_features = ( + feature_extractor.extract_box_classifier_features( + proposal_feature_maps, scope='TestScope')) + features_shape = tf.shape(proposal_classifier_features) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [3, 7, 7, 1024]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_nas_feature_extractor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_nas_feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..5fa6bf7531517b42a85521b71eb3025bd93742a9 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_nas_feature_extractor.py @@ -0,0 +1,324 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""NASNet Faster R-CNN implementation. + +Learning Transferable Architectures for Scalable Image Recognition +Barret Zoph, Vijay Vasudevan, Jonathon Shlens, Quoc V. Le +https://arxiv.org/abs/1707.07012 +""" + +import tensorflow as tf + +from object_detection.meta_architectures import faster_rcnn_meta_arch +from nets.nasnet import nasnet +from nets.nasnet import nasnet_utils + +arg_scope = tf.contrib.framework.arg_scope +slim = tf.contrib.slim + + +def nasnet_large_arg_scope_for_detection(is_batch_norm_training=False): + """Defines the default arg scope for the NASNet-A Large for object detection. + + This provides a small edit to switch batch norm training on and off. + + Args: + is_batch_norm_training: Boolean indicating whether to train with batch norm. + + Returns: + An `arg_scope` to use for the NASNet Large Model. + """ + imagenet_scope = nasnet.nasnet_large_arg_scope() + with arg_scope(imagenet_scope): + with arg_scope([slim.batch_norm], is_training=is_batch_norm_training) as sc: + return sc + + +# Note: This is largely a copy of _build_nasnet_base inside nasnet.py but +# with special edits to remove instantiation of the stem and the special +# ability to receive as input a pair of hidden states. 
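+# How the pieces below fit together (summarizing this file): the first stage
+# runs nasnet.build_nasnet_large() up to 'Cell_11' and concatenates 'Cell_10'
+# with 'Cell_11' as the RPN feature map; the second stage splits that map back
+# into (hidden_previous, hidden) and uses _build_nasnet_base() to run the
+# remaining cells (start_cell_num=12) on each cropped proposal.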
+def _build_nasnet_base(hidden_previous, + hidden, + normal_cell, + reduction_cell, + hparams, + true_cell_num, + start_cell_num): + """Constructs a NASNet image model.""" + + # Find where to place the reduction cells or stride normal cells + reduction_indices = nasnet_utils.calc_reduction_layers( + hparams.num_cells, hparams.num_reduction_layers) + + # Note: The None is prepended to match the behavior of _imagenet_stem() + cell_outputs = [None, hidden_previous, hidden] + net = hidden + + # NOTE: In the nasnet.py code, filter_scaling starts at 1.0. We instead + # start at 2.0 because 1 reduction cell has been created which would + # update the filter_scaling to 2.0. + filter_scaling = 2.0 + + # Run the cells + for cell_num in range(start_cell_num, hparams.num_cells): + stride = 1 + if hparams.skip_reduction_layer_input: + prev_layer = cell_outputs[-2] + if cell_num in reduction_indices: + filter_scaling *= hparams.filter_scaling_rate + net = reduction_cell( + net, + scope='reduction_cell_{}'.format(reduction_indices.index(cell_num)), + filter_scaling=filter_scaling, + stride=2, + prev_layer=cell_outputs[-2], + cell_num=true_cell_num) + true_cell_num += 1 + cell_outputs.append(net) + if not hparams.skip_reduction_layer_input: + prev_layer = cell_outputs[-2] + net = normal_cell( + net, + scope='cell_{}'.format(cell_num), + filter_scaling=filter_scaling, + stride=stride, + prev_layer=prev_layer, + cell_num=true_cell_num) + true_cell_num += 1 + cell_outputs.append(net) + + # Final nonlinearity. + # Note that we have dropped the final pooling, dropout and softmax layers + # from the default nasnet version. + with tf.variable_scope('final_layer'): + net = tf.nn.relu(net) + return net + + +# TODO(shlens): Only fixed_shape_resizer is currently supported for NASNet +# featurization. The reason for this is that nasnet.py only supports +# inputs with fully known shapes. We need to update nasnet.py to handle +# shapes not known at compile time. +class FasterRCNNNASFeatureExtractor( + faster_rcnn_meta_arch.FasterRCNNFeatureExtractor): + """Faster R-CNN with NASNet-A feature extractor implementation.""" + + def __init__(self, + is_training, + first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0): + """Constructor. + + Args: + is_training: See base class. + first_stage_features_stride: See base class. + batch_norm_trainable: See base class. + reuse_weights: See base class. + weight_decay: See base class. + + Raises: + ValueError: If `first_stage_features_stride` is not 16. + """ + if first_stage_features_stride != 16: + raise ValueError('`first_stage_features_stride` must be 16.') + super(FasterRCNNNASFeatureExtractor, self).__init__( + is_training, first_stage_features_stride, batch_norm_trainable, + reuse_weights, weight_decay) + + def preprocess(self, resized_inputs): + """Faster R-CNN with NAS preprocessing. + + Maps pixel values to the range [-1, 1]. + + Args: + resized_inputs: A [batch, height_in, width_in, channels] float32 tensor + representing a batch of images with values between 0 and 255.0. + + Returns: + preprocessed_inputs: A [batch, height_out, width_out, channels] float32 + tensor representing a batch of images. + + """ + return (2.0 / 255.0) * resized_inputs - 1.0 + + def _extract_proposal_features(self, preprocessed_inputs, scope): + """Extracts first stage RPN features. + + Extracts features using the first half of the NASNet network. 
+ We construct the network in `align_feature_maps=True` mode, which means + that all VALID paddings in the network are changed to SAME padding so that + the feature maps are aligned. + + Args: + preprocessed_inputs: A [batch, height, width, channels] float32 tensor + representing a batch of images. + scope: A scope name. + + Returns: + rpn_feature_map: A tensor with shape [batch, height, width, depth] + end_points: A dictionary mapping feature extractor tensor names to tensors + + Raises: + ValueError: If the created network is missing the required activation. + """ + del scope + + if len(preprocessed_inputs.get_shape().as_list()) != 4: + raise ValueError('`preprocessed_inputs` must be 4 dimensional, got a ' + 'tensor of shape %s' % preprocessed_inputs.get_shape()) + + with slim.arg_scope(nasnet_large_arg_scope_for_detection( + is_batch_norm_training=self._train_batch_norm)): + with arg_scope([slim.conv2d, + slim.batch_norm, + slim.separable_conv2d], + reuse=self._reuse_weights): + _, end_points = nasnet.build_nasnet_large( + preprocessed_inputs, num_classes=None, + is_training=self._is_training, + final_endpoint='Cell_11') + + # Note that both 'Cell_10' and 'Cell_11' have equal depth = 2016. + rpn_feature_map = tf.concat([end_points['Cell_10'], + end_points['Cell_11']], 3) + + # nasnet.py does not maintain the batch size in the first dimension. + # This work around permits us retaining the batch for below. + batch = preprocessed_inputs.get_shape().as_list()[0] + shape_without_batch = rpn_feature_map.get_shape().as_list()[1:] + rpn_feature_map_shape = [batch] + shape_without_batch + rpn_feature_map.set_shape(rpn_feature_map_shape) + + return rpn_feature_map, end_points + + def _extract_box_classifier_features(self, proposal_feature_maps, scope): + """Extracts second stage box classifier features. + + This function reconstructs the "second half" of the NASNet-A + network after the part defined in `_extract_proposal_features`. + + Args: + proposal_feature_maps: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, crop_height, crop_width, depth] + representing the feature map cropped to each proposal. + scope: A scope name. + + Returns: + proposal_classifier_features: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, height, width, depth] + representing box classifier features for each proposal. + """ + del scope + + # Note that we always feed into 2 layers of equal depth + # where the first N channels corresponds to previous hidden layer + # and the second N channels correspond to the final hidden layer. + hidden_previous, hidden = tf.split(proposal_feature_maps, 2, axis=3) + + # Note that what follows is largely a copy of build_nasnet_large() within + # nasnet.py. We are copying to minimize code pollution in slim. + + # TODO(shlens,skornblith): Determine the appropriate drop path schedule. + # For now the schedule is the default (1.0->0.7 over 250,000 train steps). + hparams = nasnet.large_imagenet_config() + if not self._is_training: + hparams.set_hparam('drop_path_keep_prob', 1.0) + + # Calculate the total number of cells in the network + # -- Add 2 for the reduction cells. + total_num_cells = hparams.num_cells + 2 + # -- And add 2 for the stem cells for ImageNet training. 
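+    # (e.g. nasnet's large_imagenet_config is expected to use num_cells=18,
+    # giving 18 + 2 + 2 = 22 total cells; this count is only used for the
+    # drop path keep_prob schedule.)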
+ total_num_cells += 2 + + normal_cell = nasnet_utils.NasNetANormalCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, + total_num_cells, hparams.total_training_steps) + reduction_cell = nasnet_utils.NasNetAReductionCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, + total_num_cells, hparams.total_training_steps) + with arg_scope([slim.dropout, nasnet_utils.drop_path], + is_training=self._is_training): + with arg_scope([slim.batch_norm], is_training=self._train_batch_norm): + with arg_scope([slim.avg_pool2d, + slim.max_pool2d, + slim.conv2d, + slim.batch_norm, + slim.separable_conv2d, + nasnet_utils.factorized_reduction, + nasnet_utils.global_avg_pool, + nasnet_utils.get_channel_index, + nasnet_utils.get_channel_dim], + data_format=hparams.data_format): + + # This corresponds to the cell number just past 'Cell_11' used by + # by _extract_proposal_features(). + start_cell_num = 12 + # Note that this number equals: + # start_cell_num + 2 stem cells + 1 reduction cell + true_cell_num = 15 + + with slim.arg_scope(nasnet.nasnet_large_arg_scope()): + net = _build_nasnet_base(hidden_previous, + hidden, + normal_cell=normal_cell, + reduction_cell=reduction_cell, + hparams=hparams, + true_cell_num=true_cell_num, + start_cell_num=start_cell_num) + + proposal_classifier_features = net + return proposal_classifier_features + + def restore_from_classification_checkpoint_fn( + self, + first_stage_feature_extractor_scope, + second_stage_feature_extractor_scope): + """Returns a map of variables to load from a foreign checkpoint. + + Note that this overrides the default implementation in + faster_rcnn_meta_arch.FasterRCNNFeatureExtractor which does not work for + NASNet-A checkpoints. + + Args: + first_stage_feature_extractor_scope: A scope name for the first stage + feature extractor. + second_stage_feature_extractor_scope: A scope name for the second stage + feature extractor. + + Returns: + A dict mapping variable names (to load from a checkpoint) to variables in + the model graph. + """ + # Note that the NAS checkpoint only contains the moving average version of + # the Variables so we need to generate an appropriate dictionary mapping. + variables_to_restore = {} + for variable in tf.global_variables(): + if variable.op.name.startswith( + first_stage_feature_extractor_scope): + var_name = variable.op.name.replace( + first_stage_feature_extractor_scope + '/', '') + var_name += '/ExponentialMovingAverage' + variables_to_restore[var_name] = variable + if variable.op.name.startswith( + second_stage_feature_extractor_scope): + var_name = variable.op.name.replace( + second_stage_feature_extractor_scope + '/', '') + var_name += '/ExponentialMovingAverage' + variables_to_restore[var_name] = variable + return variables_to_restore + diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_nas_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_nas_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..cecfc4f8d115e9f154f098bae2bf63f389b1732b --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_nas_feature_extractor_test.py @@ -0,0 +1,109 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for models.faster_rcnn_nas_feature_extractor.""" + +import tensorflow as tf + +from object_detection.models import faster_rcnn_nas_feature_extractor as frcnn_nas + + +class FasterRcnnNASFeatureExtractorTest(tf.test.TestCase): + + def _build_feature_extractor(self, first_stage_features_stride): + return frcnn_nas.FasterRCNNNASFeatureExtractor( + is_training=False, + first_stage_features_stride=first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0) + + def test_extract_proposal_features_returns_expected_size(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [1, 299, 299, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [1, 19, 19, 4032]) + + def test_extract_proposal_features_input_size_224(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [1, 224, 224, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [1, 14, 14, 4032]) + + def test_extract_proposal_features_input_size_112(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [1, 112, 112, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [1, 7, 7, 4032]) + + def test_extract_proposal_features_dies_on_invalid_stride(self): + with self.assertRaises(ValueError): + self._build_feature_extractor(first_stage_features_stride=99) + + def test_extract_proposal_features_dies_with_incorrect_rank_inputs(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [224, 224, 3], maxval=255, dtype=tf.float32) + with self.assertRaises(ValueError): + feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + + def test_extract_box_classifier_features_returns_expected_size(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + 
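+    # Any even channel depth should work here, since the extractor splits the
+    # proposal feature map into two equal halves along the channel axis.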
proposal_feature_maps = tf.random_uniform( + [2, 17, 17, 1088], maxval=255, dtype=tf.float32) + proposal_classifier_features = ( + feature_extractor.extract_box_classifier_features( + proposal_feature_maps, scope='TestScope')) + features_shape = tf.shape(proposal_classifier_features) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [2, 9, 9, 4032]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_pnas_feature_extractor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_pnas_feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..b5d0f43afd4cff71880bdedd3615e398a8ef60fb --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_pnas_feature_extractor.py @@ -0,0 +1,318 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""PNASNet Faster R-CNN implementation. + +Based on PNASNet model: https://arxiv.org/abs/1712.00559 +""" + +import tensorflow as tf + +from object_detection.meta_architectures import faster_rcnn_meta_arch +from nets.nasnet import nasnet_utils +from nets.nasnet import pnasnet + +arg_scope = tf.contrib.framework.arg_scope +slim = tf.contrib.slim + + +def pnasnet_large_arg_scope_for_detection(is_batch_norm_training=False): + """Defines the default arg scope for the PNASNet Large for object detection. + + This provides a small edit to switch batch norm training on and off. + + Args: + is_batch_norm_training: Boolean indicating whether to train with batch norm. + + Returns: + An `arg_scope` to use for the PNASNet Large Model. + """ + imagenet_scope = pnasnet.pnasnet_large_arg_scope() + with arg_scope(imagenet_scope): + with arg_scope([slim.batch_norm], is_training=is_batch_norm_training) as sc: + return sc + + +def _filter_scaling(reduction_indices, start_cell_num): + """Compute the expected filter scaling at given PNASNet cell start_cell_num. + + In the pnasnet.py code, filter_scaling starts at 1.0. We instead + adapt filter scaling to depend on the starting cell. + At first cells, before any reduction, filter_scalling is 1.0. With passing + any reduction cell, the filter_scaling is multiplied by 2. + + Args: + reduction_indices: list of int indices. + start_cell_num: int. + Returns: + filter_scaling: float. + """ + filter_scaling = 1.0 + for ind in reduction_indices: + if ind < start_cell_num: + filter_scaling *= 2.0 + return filter_scaling + + +# Note: This is largely a copy of _build_pnasnet_base inside pnasnet.py but +# with special edits to remove instantiation of the stem and the special +# ability to receive as input a pair of hidden states. 
It constructs only +# a sub-network from the original PNASNet model, starting from the +# start_cell_num cell and with modified final layer. +def _build_pnasnet_base( + hidden_previous, hidden, normal_cell, hparams, true_cell_num, + start_cell_num): + """Constructs a PNASNet image model for proposal classifier features.""" + + # Find where to place the reduction cells or stride normal cells + reduction_indices = nasnet_utils.calc_reduction_layers( + hparams.num_cells, hparams.num_reduction_layers) + filter_scaling = _filter_scaling(reduction_indices, start_cell_num) + + # Note: The None is prepended to match the behavior of _imagenet_stem() + cell_outputs = [None, hidden_previous, hidden] + net = hidden + + # Run the cells + for cell_num in range(start_cell_num, hparams.num_cells): + is_reduction = cell_num in reduction_indices + stride = 2 if is_reduction else 1 + if is_reduction: filter_scaling *= hparams.filter_scaling_rate + prev_layer = cell_outputs[-2] + net = normal_cell( + net, + scope='cell_{}'.format(cell_num), + filter_scaling=filter_scaling, + stride=stride, + prev_layer=prev_layer, + cell_num=true_cell_num) + true_cell_num += 1 + cell_outputs.append(net) + + # Final nonlinearity. + # Note that we have dropped the final pooling, dropout and softmax layers + # from the default pnasnet version. + with tf.variable_scope('final_layer'): + net = tf.nn.relu(net) + return net + + +# TODO(shlens): Only fixed_shape_resizer is currently supported for PNASNet +# featurization. The reason for this is that pnasnet.py only supports +# inputs with fully known shapes. We need to update pnasnet.py to handle +# shapes not known at compile time. +class FasterRCNNPNASFeatureExtractor( + faster_rcnn_meta_arch.FasterRCNNFeatureExtractor): + """Faster R-CNN with PNASNet feature extractor implementation.""" + + def __init__(self, + is_training, + first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0): + """Constructor. + + Args: + is_training: See base class. + first_stage_features_stride: See base class. + batch_norm_trainable: See base class. + reuse_weights: See base class. + weight_decay: See base class. + + Raises: + ValueError: If `first_stage_features_stride` is not 16. + """ + if first_stage_features_stride != 16: + raise ValueError('`first_stage_features_stride` must be 16.') + super(FasterRCNNPNASFeatureExtractor, self).__init__( + is_training, first_stage_features_stride, batch_norm_trainable, + reuse_weights, weight_decay) + + def preprocess(self, resized_inputs): + """Faster R-CNN with PNAS preprocessing. + + Maps pixel values to the range [-1, 1]. + + Args: + resized_inputs: A [batch, height_in, width_in, channels] float32 tensor + representing a batch of images with values between 0 and 255.0. + + Returns: + preprocessed_inputs: A [batch, height_out, width_out, channels] float32 + tensor representing a batch of images. + + """ + return (2.0 / 255.0) * resized_inputs - 1.0 + + def _extract_proposal_features(self, preprocessed_inputs, scope): + """Extracts first stage RPN features. + + Extracts features using the first half of the PNASNet network. + We construct the network in `align_feature_maps=True` mode, which means + that all VALID paddings in the network are changed to SAME padding so that + the feature maps are aligned. + + Args: + preprocessed_inputs: A [batch, height, width, channels] float32 tensor + representing a batch of images. + scope: A scope name. 
+ + Returns: + rpn_feature_map: A tensor with shape [batch, height, width, depth] + end_points: A dictionary mapping feature extractor tensor names to tensors + + Raises: + ValueError: If the created network is missing the required activation. + """ + del scope + + if len(preprocessed_inputs.get_shape().as_list()) != 4: + raise ValueError('`preprocessed_inputs` must be 4 dimensional, got a ' + 'tensor of shape %s' % preprocessed_inputs.get_shape()) + + with slim.arg_scope(pnasnet_large_arg_scope_for_detection( + is_batch_norm_training=self._train_batch_norm)): + with arg_scope([slim.conv2d, + slim.batch_norm, + slim.separable_conv2d], + reuse=self._reuse_weights): + _, end_points = pnasnet.build_pnasnet_large( + preprocessed_inputs, num_classes=None, + is_training=self._is_training, + final_endpoint='Cell_7') + + # Note that both 'Cell_6' and 'Cell_7' have equal depth = 2160. + # Cell_7 is the last cell before second reduction. + rpn_feature_map = tf.concat([end_points['Cell_6'], + end_points['Cell_7']], 3) + + # pnasnet.py does not maintain the batch size in the first dimension. + # This work around permits us retaining the batch for below. + batch = preprocessed_inputs.get_shape().as_list()[0] + shape_without_batch = rpn_feature_map.get_shape().as_list()[1:] + rpn_feature_map_shape = [batch] + shape_without_batch + rpn_feature_map.set_shape(rpn_feature_map_shape) + + return rpn_feature_map, end_points + + def _extract_box_classifier_features(self, proposal_feature_maps, scope): + """Extracts second stage box classifier features. + + This function reconstructs the "second half" of the PNASNet + network after the part defined in `_extract_proposal_features`. + + Args: + proposal_feature_maps: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, crop_height, crop_width, depth] + representing the feature map cropped to each proposal. + scope: A scope name. + + Returns: + proposal_classifier_features: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, height, width, depth] + representing box classifier features for each proposal. + """ + del scope + + # Number of used stem cells. + num_stem_cells = 2 + + # Note that we always feed into 2 layers of equal depth + # where the first N channels corresponds to previous hidden layer + # and the second N channels correspond to the final hidden layer. + hidden_previous, hidden = tf.split(proposal_feature_maps, 2, axis=3) + + # Note that what follows is largely a copy of build_pnasnet_large() within + # pnasnet.py. We are copying to minimize code pollution in slim. + + # TODO(shlens,skornblith): Determine the appropriate drop path schedule. + # For now the schedule is the default (1.0->0.7 over 250,000 train steps). 
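+    # Drop path regularization is only wanted during training, so the keep
+    # probability is reset to 1.0 below whenever the extractor is built for eval.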
+ hparams = pnasnet.large_imagenet_config() + if not self._is_training: + hparams.set_hparam('drop_path_keep_prob', 1.0) + + # Calculate the total number of cells in the network + total_num_cells = hparams.num_cells + num_stem_cells + + normal_cell = pnasnet.PNasNetNormalCell( + hparams.num_conv_filters, hparams.drop_path_keep_prob, + total_num_cells, hparams.total_training_steps) + with arg_scope([slim.dropout, nasnet_utils.drop_path], + is_training=self._is_training): + with arg_scope([slim.batch_norm], is_training=self._train_batch_norm): + with arg_scope([slim.avg_pool2d, + slim.max_pool2d, + slim.conv2d, + slim.batch_norm, + slim.separable_conv2d, + nasnet_utils.factorized_reduction, + nasnet_utils.global_avg_pool, + nasnet_utils.get_channel_index, + nasnet_utils.get_channel_dim], + data_format=hparams.data_format): + + # This corresponds to the cell number just past 'Cell_7' used by + # _extract_proposal_features(). + start_cell_num = 8 + true_cell_num = start_cell_num + num_stem_cells + + with slim.arg_scope(pnasnet.pnasnet_large_arg_scope()): + net = _build_pnasnet_base( + hidden_previous, + hidden, + normal_cell=normal_cell, + hparams=hparams, + true_cell_num=true_cell_num, + start_cell_num=start_cell_num) + + proposal_classifier_features = net + return proposal_classifier_features + + def restore_from_classification_checkpoint_fn( + self, + first_stage_feature_extractor_scope, + second_stage_feature_extractor_scope): + """Returns a map of variables to load from a foreign checkpoint. + + Note that this overrides the default implementation in + faster_rcnn_meta_arch.FasterRCNNFeatureExtractor which does not work for + PNASNet checkpoints. + + Args: + first_stage_feature_extractor_scope: A scope name for the first stage + feature extractor. + second_stage_feature_extractor_scope: A scope name for the second stage + feature extractor. + + Returns: + A dict mapping variable names (to load from a checkpoint) to variables in + the model graph. + """ + variables_to_restore = {} + for variable in tf.global_variables(): + if variable.op.name.startswith( + first_stage_feature_extractor_scope): + var_name = variable.op.name.replace( + first_stage_feature_extractor_scope + '/', '') + var_name += '/ExponentialMovingAverage' + variables_to_restore[var_name] = variable + if variable.op.name.startswith( + second_stage_feature_extractor_scope): + var_name = variable.op.name.replace( + second_stage_feature_extractor_scope + '/', '') + var_name += '/ExponentialMovingAverage' + variables_to_restore[var_name] = variable + return variables_to_restore diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_pnas_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_pnas_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..6bb368041ceba98a986eeeea85711c69fc94b288 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_pnas_feature_extractor_test.py @@ -0,0 +1,122 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for models.faster_rcnn_pnas_feature_extractor.""" + +import tensorflow as tf + +from object_detection.models import faster_rcnn_pnas_feature_extractor as frcnn_pnas + + +class FasterRcnnPNASFeatureExtractorTest(tf.test.TestCase): + + def _build_feature_extractor(self, first_stage_features_stride): + return frcnn_pnas.FasterRCNNPNASFeatureExtractor( + is_training=False, + first_stage_features_stride=first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0) + + def test_extract_proposal_features_returns_expected_size(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [1, 299, 299, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [1, 19, 19, 4320]) + + def test_extract_proposal_features_input_size_224(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [1, 224, 224, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [1, 14, 14, 4320]) + + def test_extract_proposal_features_input_size_112(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [1, 112, 112, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [1, 7, 7, 4320]) + + def test_extract_proposal_features_dies_on_invalid_stride(self): + with self.assertRaises(ValueError): + self._build_feature_extractor(first_stage_features_stride=99) + + def test_extract_proposal_features_dies_with_incorrect_rank_inputs(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [224, 224, 3], maxval=255, dtype=tf.float32) + with self.assertRaises(ValueError): + feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + + def test_extract_box_classifier_features_returns_expected_size(self): + feature_extractor = self._build_feature_extractor( + 
first_stage_features_stride=16) + proposal_feature_maps = tf.random_uniform( + [2, 17, 17, 1088], maxval=255, dtype=tf.float32) + proposal_classifier_features = ( + feature_extractor.extract_box_classifier_features( + proposal_feature_maps, scope='TestScope')) + features_shape = tf.shape(proposal_classifier_features) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [2, 9, 9, 4320]) + + def test_filter_scaling_computation(self): + expected_filter_scaling = { + ((4, 8), 2): 1.0, + ((4, 8), 7): 2.0, + ((4, 8), 8): 2.0, + ((4, 8), 9): 4.0 + } + for args, filter_scaling in expected_filter_scaling.items(): + reduction_indices, start_cell_num = args + self.assertAlmostEqual( + frcnn_pnas._filter_scaling(reduction_indices, start_cell_num), + filter_scaling) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_resnet_v1_feature_extractor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_resnet_v1_feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..286deae3de7f8edfb92073fff263e1f1e6369c6d --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_resnet_v1_feature_extractor.py @@ -0,0 +1,253 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Resnet V1 Faster R-CNN implementation. + +See "Deep Residual Learning for Image Recognition" by He et al., 2015. +https://arxiv.org/abs/1512.03385 + +Note: this implementation assumes that the classification checkpoint used +to finetune this model is trained using the same configuration as that of +the MSRA provided checkpoints +(see https://github.com/KaimingHe/deep-residual-networks), e.g., with +same preprocessing, batch norm scaling, etc. +""" +import tensorflow as tf + +from object_detection.meta_architectures import faster_rcnn_meta_arch +from nets import resnet_utils +from nets import resnet_v1 + +slim = tf.contrib.slim + + +class FasterRCNNResnetV1FeatureExtractor( + faster_rcnn_meta_arch.FasterRCNNFeatureExtractor): + """Faster R-CNN Resnet V1 feature extractor implementation.""" + + def __init__(self, + architecture, + resnet_model, + is_training, + first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0): + """Constructor. + + Args: + architecture: Architecture name of the Resnet V1 model. + resnet_model: Definition of the Resnet V1 model. + is_training: See base class. + first_stage_features_stride: See base class. + batch_norm_trainable: See base class. + reuse_weights: See base class. + weight_decay: See base class. + + Raises: + ValueError: If `first_stage_features_stride` is not 8 or 16. 
+ """ + if first_stage_features_stride != 8 and first_stage_features_stride != 16: + raise ValueError('`first_stage_features_stride` must be 8 or 16.') + self._architecture = architecture + self._resnet_model = resnet_model + super(FasterRCNNResnetV1FeatureExtractor, self).__init__( + is_training, first_stage_features_stride, batch_norm_trainable, + reuse_weights, weight_decay) + + def preprocess(self, resized_inputs): + """Faster R-CNN Resnet V1 preprocessing. + + VGG style channel mean subtraction as described here: + https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md + + Args: + resized_inputs: A [batch, height_in, width_in, channels] float32 tensor + representing a batch of images with values between 0 and 255.0. + + Returns: + preprocessed_inputs: A [batch, height_out, width_out, channels] float32 + tensor representing a batch of images. + + """ + channel_means = [123.68, 116.779, 103.939] + return resized_inputs - [[channel_means]] + + def _extract_proposal_features(self, preprocessed_inputs, scope): + """Extracts first stage RPN features. + + Args: + preprocessed_inputs: A [batch, height, width, channels] float32 tensor + representing a batch of images. + scope: A scope name. + + Returns: + rpn_feature_map: A tensor with shape [batch, height, width, depth] + activations: A dictionary mapping feature extractor tensor names to + tensors + + Raises: + InvalidArgumentError: If the spatial size of `preprocessed_inputs` + (height or width) is less than 33. + ValueError: If the created network is missing the required activation. + """ + if len(preprocessed_inputs.get_shape().as_list()) != 4: + raise ValueError('`preprocessed_inputs` must be 4 dimensional, got a ' + 'tensor of shape %s' % preprocessed_inputs.get_shape()) + shape_assert = tf.Assert( + tf.logical_and( + tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33), + tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)), + ['image size must at least be 33 in both height and width.']) + + with tf.control_dependencies([shape_assert]): + # Disables batchnorm for fine-tuning with smaller batch sizes. + # TODO(chensun): Figure out if it is needed when image + # batch size is bigger. + with slim.arg_scope( + resnet_utils.resnet_arg_scope( + batch_norm_epsilon=1e-5, + batch_norm_scale=True, + weight_decay=self._weight_decay)): + with tf.variable_scope( + self._architecture, reuse=self._reuse_weights) as var_scope: + _, activations = self._resnet_model( + preprocessed_inputs, + num_classes=None, + is_training=self._train_batch_norm, + global_pool=False, + output_stride=self._first_stage_features_stride, + spatial_squeeze=False, + scope=var_scope) + + handle = scope + '/%s/block3' % self._architecture + return activations[handle], activations + + def _extract_box_classifier_features(self, proposal_feature_maps, scope): + """Extracts second stage box classifier features. + + Args: + proposal_feature_maps: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, crop_height, crop_width, depth] + representing the feature map cropped to each proposal. + scope: A scope name (unused). + + Returns: + proposal_classifier_features: A 4-D float tensor with shape + [batch_size * self.max_num_proposals, height, width, depth] + representing box classifier features for each proposal. 
+ """ + with tf.variable_scope(self._architecture, reuse=self._reuse_weights): + with slim.arg_scope( + resnet_utils.resnet_arg_scope( + batch_norm_epsilon=1e-5, + batch_norm_scale=True, + weight_decay=self._weight_decay)): + with slim.arg_scope([slim.batch_norm], + is_training=self._train_batch_norm): + blocks = [ + resnet_utils.Block('block4', resnet_v1.bottleneck, [{ + 'depth': 2048, + 'depth_bottleneck': 512, + 'stride': 1 + }] * 3) + ] + proposal_classifier_features = resnet_utils.stack_blocks_dense( + proposal_feature_maps, blocks) + return proposal_classifier_features + + +class FasterRCNNResnet50FeatureExtractor(FasterRCNNResnetV1FeatureExtractor): + """Faster R-CNN Resnet 50 feature extractor implementation.""" + + def __init__(self, + is_training, + first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0): + """Constructor. + + Args: + is_training: See base class. + first_stage_features_stride: See base class. + batch_norm_trainable: See base class. + reuse_weights: See base class. + weight_decay: See base class. + + Raises: + ValueError: If `first_stage_features_stride` is not 8 or 16, + or if `architecture` is not supported. + """ + super(FasterRCNNResnet50FeatureExtractor, self).__init__( + 'resnet_v1_50', resnet_v1.resnet_v1_50, is_training, + first_stage_features_stride, batch_norm_trainable, + reuse_weights, weight_decay) + + +class FasterRCNNResnet101FeatureExtractor(FasterRCNNResnetV1FeatureExtractor): + """Faster R-CNN Resnet 101 feature extractor implementation.""" + + def __init__(self, + is_training, + first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0): + """Constructor. + + Args: + is_training: See base class. + first_stage_features_stride: See base class. + batch_norm_trainable: See base class. + reuse_weights: See base class. + weight_decay: See base class. + + Raises: + ValueError: If `first_stage_features_stride` is not 8 or 16, + or if `architecture` is not supported. + """ + super(FasterRCNNResnet101FeatureExtractor, self).__init__( + 'resnet_v1_101', resnet_v1.resnet_v1_101, is_training, + first_stage_features_stride, batch_norm_trainable, + reuse_weights, weight_decay) + + +class FasterRCNNResnet152FeatureExtractor(FasterRCNNResnetV1FeatureExtractor): + """Faster R-CNN Resnet 152 feature extractor implementation.""" + + def __init__(self, + is_training, + first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0): + """Constructor. + + Args: + is_training: See base class. + first_stage_features_stride: See base class. + batch_norm_trainable: See base class. + reuse_weights: See base class. + weight_decay: See base class. + + Raises: + ValueError: If `first_stage_features_stride` is not 8 or 16, + or if `architecture` is not supported. 
+ """ + super(FasterRCNNResnet152FeatureExtractor, self).__init__( + 'resnet_v1_152', resnet_v1.resnet_v1_152, is_training, + first_stage_features_stride, batch_norm_trainable, + reuse_weights, weight_decay) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_resnet_v1_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_resnet_v1_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..e2a336f0a8c3bc595670729bc8262ff40fbc366b --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/faster_rcnn_resnet_v1_feature_extractor_test.py @@ -0,0 +1,137 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.models.faster_rcnn_resnet_v1_feature_extractor.""" + +import numpy as np +import tensorflow as tf + +from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as faster_rcnn_resnet_v1 + + +class FasterRcnnResnetV1FeatureExtractorTest(tf.test.TestCase): + + def _build_feature_extractor(self, + first_stage_features_stride, + architecture='resnet_v1_101'): + feature_extractor_map = { + 'resnet_v1_50': + faster_rcnn_resnet_v1.FasterRCNNResnet50FeatureExtractor, + 'resnet_v1_101': + faster_rcnn_resnet_v1.FasterRCNNResnet101FeatureExtractor, + 'resnet_v1_152': + faster_rcnn_resnet_v1.FasterRCNNResnet152FeatureExtractor + } + return feature_extractor_map[architecture]( + is_training=False, + first_stage_features_stride=first_stage_features_stride, + batch_norm_trainable=False, + reuse_weights=None, + weight_decay=0.0) + + def test_extract_proposal_features_returns_expected_size(self): + for architecture in ['resnet_v1_50', 'resnet_v1_101', 'resnet_v1_152']: + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16, architecture=architecture) + preprocessed_inputs = tf.random_uniform( + [4, 224, 224, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [4, 14, 14, 1024]) + + def test_extract_proposal_features_stride_eight(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=8) + preprocessed_inputs = tf.random_uniform( + [4, 224, 224, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + 
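+      # A first stage features stride of 8 on 224x224 inputs yields 28x28
+      # block3 activations.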
self.assertAllEqual(features_shape_out, [4, 28, 28, 1024]) + + def test_extract_proposal_features_half_size_input(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [1, 112, 112, 3], maxval=255, dtype=tf.float32) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [1, 7, 7, 1024]) + + def test_extract_proposal_features_dies_on_invalid_stride(self): + with self.assertRaises(ValueError): + self._build_feature_extractor(first_stage_features_stride=99) + + def test_extract_proposal_features_dies_on_very_small_images(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3)) + rpn_feature_map, _ = feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + features_shape = tf.shape(rpn_feature_map) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + with self.assertRaises(tf.errors.InvalidArgumentError): + sess.run( + features_shape, + feed_dict={preprocessed_inputs: np.random.rand(4, 32, 32, 3)}) + + def test_extract_proposal_features_dies_with_incorrect_rank_inputs(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + preprocessed_inputs = tf.random_uniform( + [224, 224, 3], maxval=255, dtype=tf.float32) + with self.assertRaises(ValueError): + feature_extractor.extract_proposal_features( + preprocessed_inputs, scope='TestScope') + + def test_extract_box_classifier_features_returns_expected_size(self): + feature_extractor = self._build_feature_extractor( + first_stage_features_stride=16) + proposal_feature_maps = tf.random_uniform( + [3, 7, 7, 1024], maxval=255, dtype=tf.float32) + proposal_classifier_features = ( + feature_extractor.extract_box_classifier_features( + proposal_feature_maps, scope='TestScope')) + features_shape = tf.shape(proposal_classifier_features) + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + features_shape_out = sess.run(features_shape) + self.assertAllEqual(features_shape_out, [3, 7, 7, 2048]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/feature_map_generators.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/feature_map_generators.py new file mode 100644 index 0000000000000000000000000000000000000000..2c72eeb4279c7629fe6894bd6c44d9e97cafca7d --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/feature_map_generators.py @@ -0,0 +1,225 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Functions to generate a list of feature maps based on image features. + +Provides several feature map generators that can be used to build object +detection feature extractors. + +Object detection feature extractors usually are built by stacking two components +- A base feature extractor such as Inception V3 and a feature map generator. +Feature map generators build on the base feature extractors and produce a list +of final feature maps. +""" +import collections +import tensorflow as tf +from object_detection.utils import ops +slim = tf.contrib.slim + + +def get_depth_fn(depth_multiplier, min_depth): + """Builds a callable to compute depth (output channels) of conv filters. + + Args: + depth_multiplier: a multiplier for the nominal depth. + min_depth: a lower bound on the depth of filters. + + Returns: + A callable that takes in a nominal depth and returns the depth to use. + """ + def multiply_depth(depth): + new_depth = int(depth * depth_multiplier) + return max(new_depth, min_depth) + return multiply_depth + + +def multi_resolution_feature_maps(feature_map_layout, depth_multiplier, + min_depth, insert_1x1_conv, image_features): + """Generates multi resolution feature maps from input image features. + + Generates multi-scale feature maps for detection as in the SSD papers by + Liu et al: https://arxiv.org/pdf/1512.02325v2.pdf, See Sec 2.1. + + More specifically, it performs the following two tasks: + 1) If a layer name is provided in the configuration, returns that layer as a + feature map. + 2) If a layer name is left as an empty string, constructs a new feature map + based on the spatial shape and depth configuration. Note that the current + implementation only supports generating new layers using convolution of + stride 2 resulting in a spatial resolution reduction by a factor of 2. + By default convolution kernel size is set to 3, and it can be customized + by caller. + + An example of the configuration for Inception V3: + { + 'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''], + 'layer_depth': [-1, -1, -1, 512, 256, 128] + } + + Args: + feature_map_layout: Dictionary of specifications for the feature map + layouts in the following format (Inception V2/V3 respectively): + { + 'from_layer': ['Mixed_3c', 'Mixed_4c', 'Mixed_5c', '', '', ''], + 'layer_depth': [-1, -1, -1, 512, 256, 128] + } + or + { + 'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', '', ''], + 'layer_depth': [-1, -1, -1, 512, 256, 128] + } + If 'from_layer' is specified, the specified feature map is directly used + as a box predictor layer, and the layer_depth is directly infered from the + feature map (instead of using the provided 'layer_depth' parameter). In + this case, our convention is to set 'layer_depth' to -1 for clarity. + Otherwise, if 'from_layer' is an empty string, then the box predictor + layer will be built from the previous layer using convolution operations. + Note that the current implementation only supports generating new layers + using convolutions of stride 2 (resulting in a spatial resolution + reduction by a factor of 2), and will be extended to a more flexible + design. Convolution kernel size is set to 3 by default, and can be + customized by 'conv_kernel_size' parameter (similarily, 'conv_kernel_size' + should be set to -1 if 'from_layer' is specified). 
The created convolution + operation will be a normal 2D convolution by default, and a depthwise + convolution followed by 1x1 convolution if 'use_depthwise' is set to True. + depth_multiplier: Depth multiplier for convolutional layers. + min_depth: Minimum depth for convolutional layers. + insert_1x1_conv: A boolean indicating whether an additional 1x1 convolution + should be inserted before shrinking the feature map. + image_features: A dictionary of handles to activation tensors from the + base feature extractor. + + Returns: + feature_maps: an OrderedDict mapping keys (feature map names) to + tensors where each tensor has shape [batch, height_i, width_i, depth_i]. + + Raises: + ValueError: if the number entries in 'from_layer' and + 'layer_depth' do not match. + ValueError: if the generated layer does not have the same resolution + as specified. + """ + depth_fn = get_depth_fn(depth_multiplier, min_depth) + + feature_map_keys = [] + feature_maps = [] + base_from_layer = '' + use_explicit_padding = False + if 'use_explicit_padding' in feature_map_layout: + use_explicit_padding = feature_map_layout['use_explicit_padding'] + use_depthwise = False + if 'use_depthwise' in feature_map_layout: + use_depthwise = feature_map_layout['use_depthwise'] + for index, from_layer in enumerate(feature_map_layout['from_layer']): + layer_depth = feature_map_layout['layer_depth'][index] + conv_kernel_size = 3 + if 'conv_kernel_size' in feature_map_layout: + conv_kernel_size = feature_map_layout['conv_kernel_size'][index] + if from_layer: + feature_map = image_features[from_layer] + base_from_layer = from_layer + feature_map_keys.append(from_layer) + else: + pre_layer = feature_maps[-1] + intermediate_layer = pre_layer + if insert_1x1_conv: + layer_name = '{}_1_Conv2d_{}_1x1_{}'.format( + base_from_layer, index, depth_fn(layer_depth / 2)) + intermediate_layer = slim.conv2d( + pre_layer, + depth_fn(layer_depth / 2), [1, 1], + padding='SAME', + stride=1, + scope=layer_name) + layer_name = '{}_2_Conv2d_{}_{}x{}_s2_{}'.format( + base_from_layer, index, conv_kernel_size, conv_kernel_size, + depth_fn(layer_depth)) + stride = 2 + padding = 'SAME' + if use_explicit_padding: + padding = 'VALID' + intermediate_layer = ops.fixed_padding( + intermediate_layer, conv_kernel_size) + if use_depthwise: + feature_map = slim.separable_conv2d( + intermediate_layer, + None, [conv_kernel_size, conv_kernel_size], + depth_multiplier=1, + padding=padding, + stride=stride, + scope=layer_name + '_depthwise') + feature_map = slim.conv2d( + feature_map, + depth_fn(layer_depth), [1, 1], + padding='SAME', + stride=1, + scope=layer_name) + else: + feature_map = slim.conv2d( + intermediate_layer, + depth_fn(layer_depth), [conv_kernel_size, conv_kernel_size], + padding=padding, + stride=stride, + scope=layer_name) + feature_map_keys.append(layer_name) + feature_maps.append(feature_map) + return collections.OrderedDict( + [(x, y) for (x, y) in zip(feature_map_keys, feature_maps)]) + + +def fpn_top_down_feature_maps(image_features, depth, scope=None): + """Generates `top-down` feature maps for Feature Pyramid Networks. + + See https://arxiv.org/abs/1612.03144 for details. + + Args: + image_features: list of tuples of (tensor_name, image_feature_tensor). + Spatial resolutions of succesive tensors must reduce exactly by a factor + of 2. + depth: depth of output feature maps. + scope: A scope name to wrap this op under. 
+ + Returns: + feature_maps: an OrderedDict mapping keys (feature map names) to + tensors where each tensor has shape [batch, height_i, width_i, depth_i]. + """ + with tf.name_scope(scope, 'top_down'): + num_levels = len(image_features) + output_feature_maps_list = [] + output_feature_map_keys = [] + with slim.arg_scope( + [slim.conv2d], padding='SAME', stride=1): + top_down = slim.conv2d( + image_features[-1][1], + depth, [1, 1], activation_fn=None, normalizer_fn=None, + scope='projection_%d' % num_levels) + output_feature_maps_list.append(top_down) + output_feature_map_keys.append( + 'top_down_%s' % image_features[-1][0]) + + for level in reversed(range(num_levels - 1)): + top_down = ops.nearest_neighbor_upsampling(top_down, 2) + residual = slim.conv2d( + image_features[level][1], depth, [1, 1], + activation_fn=None, normalizer_fn=None, + scope='projection_%d' % (level + 1)) + top_down += residual + output_feature_maps_list.append(slim.conv2d( + top_down, + depth, [3, 3], + scope='smoothing_%d' % (level + 1))) + output_feature_map_keys.append('top_down_%s' % image_features[level][0]) + return collections.OrderedDict( + reversed(zip(output_feature_map_keys, output_feature_maps_list))) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/feature_map_generators_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/feature_map_generators_test.py new file mode 100644 index 0000000000000000000000000000000000000000..540bc4efce83a3ec50f581aee01426064a1bc065 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/feature_map_generators_test.py @@ -0,0 +1,179 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for feature map generators.""" + +import tensorflow as tf + +from object_detection.models import feature_map_generators + +INCEPTION_V2_LAYOUT = { + 'from_layer': ['Mixed_3c', 'Mixed_4c', 'Mixed_5c', '', '', ''], + 'layer_depth': [-1, -1, -1, 512, 256, 256], + 'anchor_strides': [16, 32, 64, -1, -1, -1], + 'layer_target_norm': [20.0, -1, -1, -1, -1, -1], +} + +INCEPTION_V3_LAYOUT = { + 'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''], + 'layer_depth': [-1, -1, -1, 512, 256, 128], + 'anchor_strides': [16, 32, 64, -1, -1, -1], + 'aspect_ratios': [1.0, 2.0, 1.0/2, 3.0, 1.0/3] +} + +EMBEDDED_SSD_MOBILENET_V1_LAYOUT = { + 'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '', ''], + 'layer_depth': [-1, -1, 512, 256, 256], + 'conv_kernel_size': [-1, -1, 3, 3, 2], +} + + +# TODO(rathodv): add tests with different anchor strides. 
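+# The tests below feed randomly generated stand-ins for base network
+# activations through the generators and only check the shapes of the
+# resulting feature maps.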
+class MultiResolutionFeatureMapGeneratorTest(tf.test.TestCase): + + def test_get_expected_feature_map_shapes_with_inception_v2(self): + image_features = { + 'Mixed_3c': tf.random_uniform([4, 28, 28, 256], dtype=tf.float32), + 'Mixed_4c': tf.random_uniform([4, 14, 14, 576], dtype=tf.float32), + 'Mixed_5c': tf.random_uniform([4, 7, 7, 1024], dtype=tf.float32) + } + feature_maps = feature_map_generators.multi_resolution_feature_maps( + feature_map_layout=INCEPTION_V2_LAYOUT, + depth_multiplier=1, + min_depth=32, + insert_1x1_conv=True, + image_features=image_features) + + expected_feature_map_shapes = { + 'Mixed_3c': (4, 28, 28, 256), + 'Mixed_4c': (4, 14, 14, 576), + 'Mixed_5c': (4, 7, 7, 1024), + 'Mixed_5c_2_Conv2d_3_3x3_s2_512': (4, 4, 4, 512), + 'Mixed_5c_2_Conv2d_4_3x3_s2_256': (4, 2, 2, 256), + 'Mixed_5c_2_Conv2d_5_3x3_s2_256': (4, 1, 1, 256)} + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + out_feature_maps = sess.run(feature_maps) + out_feature_map_shapes = dict( + (key, value.shape) for key, value in out_feature_maps.items()) + self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes) + + def test_get_expected_feature_map_shapes_with_inception_v3(self): + image_features = { + 'Mixed_5d': tf.random_uniform([4, 35, 35, 256], dtype=tf.float32), + 'Mixed_6e': tf.random_uniform([4, 17, 17, 576], dtype=tf.float32), + 'Mixed_7c': tf.random_uniform([4, 8, 8, 1024], dtype=tf.float32) + } + + feature_maps = feature_map_generators.multi_resolution_feature_maps( + feature_map_layout=INCEPTION_V3_LAYOUT, + depth_multiplier=1, + min_depth=32, + insert_1x1_conv=True, + image_features=image_features) + + expected_feature_map_shapes = { + 'Mixed_5d': (4, 35, 35, 256), + 'Mixed_6e': (4, 17, 17, 576), + 'Mixed_7c': (4, 8, 8, 1024), + 'Mixed_7c_2_Conv2d_3_3x3_s2_512': (4, 4, 4, 512), + 'Mixed_7c_2_Conv2d_4_3x3_s2_256': (4, 2, 2, 256), + 'Mixed_7c_2_Conv2d_5_3x3_s2_128': (4, 1, 1, 128)} + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + out_feature_maps = sess.run(feature_maps) + out_feature_map_shapes = dict( + (key, value.shape) for key, value in out_feature_maps.items()) + self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes) + + def test_get_expected_feature_map_shapes_with_embedded_ssd_mobilenet_v1( + self): + image_features = { + 'Conv2d_11_pointwise': tf.random_uniform([4, 16, 16, 512], + dtype=tf.float32), + 'Conv2d_13_pointwise': tf.random_uniform([4, 8, 8, 1024], + dtype=tf.float32), + } + + feature_maps = feature_map_generators.multi_resolution_feature_maps( + feature_map_layout=EMBEDDED_SSD_MOBILENET_V1_LAYOUT, + depth_multiplier=1, + min_depth=32, + insert_1x1_conv=True, + image_features=image_features) + + expected_feature_map_shapes = { + 'Conv2d_11_pointwise': (4, 16, 16, 512), + 'Conv2d_13_pointwise': (4, 8, 8, 1024), + 'Conv2d_13_pointwise_2_Conv2d_2_3x3_s2_512': (4, 4, 4, 512), + 'Conv2d_13_pointwise_2_Conv2d_3_3x3_s2_256': (4, 2, 2, 256), + 'Conv2d_13_pointwise_2_Conv2d_4_2x2_s2_256': (4, 1, 1, 256)} + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + out_feature_maps = sess.run(feature_maps) + out_feature_map_shapes = dict( + (key, value.shape) for key, value in out_feature_maps.items()) + self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes) + + +class FPNFeatureMapGeneratorTest(tf.test.TestCase): + + def test_get_expected_feature_map_shapes(self): + 
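+    # Levels are listed from the highest spatial resolution to the lowest; the
+    # generator starts from the last (coarsest) level and builds the top-down
+    # pathway toward the finer levels.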
image_features = [ + ('block2', tf.random_uniform([4, 8, 8, 256], dtype=tf.float32)), + ('block3', tf.random_uniform([4, 4, 4, 256], dtype=tf.float32)), + ('block4', tf.random_uniform([4, 2, 2, 256], dtype=tf.float32)), + ('block5', tf.random_uniform([4, 1, 1, 256], dtype=tf.float32)) + ] + feature_maps = feature_map_generators.fpn_top_down_feature_maps( + image_features=image_features, depth=128) + + expected_feature_map_shapes = { + 'top_down_block2': (4, 8, 8, 128), + 'top_down_block3': (4, 4, 4, 128), + 'top_down_block4': (4, 2, 2, 128), + 'top_down_block5': (4, 1, 1, 128) + } + + init_op = tf.global_variables_initializer() + with self.test_session() as sess: + sess.run(init_op) + out_feature_maps = sess.run(feature_maps) + out_feature_map_shapes = {key: value.shape + for key, value in out_feature_maps.items()} + self.assertDictEqual(out_feature_map_shapes, expected_feature_map_shapes) + + +class GetDepthFunctionTest(tf.test.TestCase): + + def test_return_min_depth_when_multiplier_is_small(self): + depth_fn = feature_map_generators.get_depth_fn(depth_multiplier=0.5, + min_depth=16) + self.assertEqual(depth_fn(16), 16) + + def test_return_correct_depth_with_multiplier(self): + depth_fn = feature_map_generators.get_depth_fn(depth_multiplier=0.5, + min_depth=16) + self.assertEqual(depth_fn(64), 32) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..899214b2c40bf1c92570cd2d8432dcdfc7b569d2 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_feature_extractor_test.py @@ -0,0 +1,110 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Base test class SSDFeatureExtractors.""" + +from abc import abstractmethod + +import itertools +import numpy as np +import tensorflow as tf + +from object_detection.utils import test_case + + +class SsdFeatureExtractorTestBase(test_case.TestCase): + + def conv_hyperparams_fn(self): + with tf.contrib.slim.arg_scope([]) as sc: + return sc + + @abstractmethod + def _create_feature_extractor(self, depth_multiplier, pad_to_multiple, + use_explicit_padding=False): + """Constructs a new feature extractor. + + Args: + depth_multiplier: float depth multiplier for feature extractor + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + use_explicit_padding: use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + Returns: + an ssd_meta_arch.SSDFeatureExtractor object. 
+ """ + pass + + def check_extract_features_returns_correct_shape( + self, batch_size, image_height, image_width, depth_multiplier, + pad_to_multiple, expected_feature_map_shapes, use_explicit_padding=False): + def graph_fn(image_tensor): + feature_extractor = self._create_feature_extractor(depth_multiplier, + pad_to_multiple, + use_explicit_padding) + feature_maps = feature_extractor.extract_features(image_tensor) + return feature_maps + + image_tensor = np.random.rand(batch_size, image_height, image_width, + 3).astype(np.float32) + feature_maps = self.execute(graph_fn, [image_tensor]) + for feature_map, expected_shape in itertools.izip( + feature_maps, expected_feature_map_shapes): + self.assertAllEqual(feature_map.shape, expected_shape) + + def check_extract_features_returns_correct_shapes_with_dynamic_inputs( + self, batch_size, image_height, image_width, depth_multiplier, + pad_to_multiple, expected_feature_map_shapes, use_explicit_padding=False): + def graph_fn(image_height, image_width): + feature_extractor = self._create_feature_extractor(depth_multiplier, + pad_to_multiple, + use_explicit_padding) + image_tensor = tf.random_uniform([batch_size, image_height, image_width, + 3], dtype=tf.float32) + feature_maps = feature_extractor.extract_features(image_tensor) + return feature_maps + + feature_maps = self.execute_cpu(graph_fn, [ + np.array(image_height, dtype=np.int32), + np.array(image_width, dtype=np.int32) + ]) + for feature_map, expected_shape in itertools.izip( + feature_maps, expected_feature_map_shapes): + self.assertAllEqual(feature_map.shape, expected_shape) + + def check_extract_features_raises_error_with_invalid_image_size( + self, image_height, image_width, depth_multiplier, pad_to_multiple): + feature_extractor = self._create_feature_extractor(depth_multiplier, + pad_to_multiple) + preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3)) + feature_maps = feature_extractor.extract_features(preprocessed_inputs) + test_preprocessed_image = np.random.rand(4, image_height, image_width, 3) + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + with self.assertRaises(tf.errors.InvalidArgumentError): + sess.run(feature_maps, + feed_dict={preprocessed_inputs: test_preprocessed_image}) + + def check_feature_extractor_variables_under_scope( + self, depth_multiplier, pad_to_multiple, scope_name): + g = tf.Graph() + with g.as_default(): + feature_extractor = self._create_feature_extractor( + depth_multiplier, pad_to_multiple) + preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3)) + feature_extractor.extract_features(preprocessed_inputs) + variables = g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + for variable in variables: + self.assertTrue(variable.name.startswith(scope_name)) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_inception_v2_feature_extractor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_inception_v2_feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..b97b0f2bc9923a4fe9207a4dba28f69db3f52101 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_inception_v2_feature_extractor.py @@ -0,0 +1,126 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""SSDFeatureExtractor for InceptionV2 features.""" +import tensorflow as tf + +from object_detection.meta_architectures import ssd_meta_arch +from object_detection.models import feature_map_generators +from object_detection.utils import ops +from object_detection.utils import shape_utils +from nets import inception_v2 + +slim = tf.contrib.slim + + +class SSDInceptionV2FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): + """SSD Feature Extractor using InceptionV2 features.""" + + def __init__(self, + is_training, + depth_multiplier, + min_depth, + pad_to_multiple, + conv_hyperparams_fn, + reuse_weights=None, + use_explicit_padding=False, + use_depthwise=False, + override_base_feature_extractor_hyperparams=False): + """InceptionV2 Feature Extractor for SSD Models. + + Args: + is_training: whether the network is in training mode. + depth_multiplier: float depth multiplier for feature extractor. + min_depth: minimum feature extractor depth. + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d + and separable_conv2d ops in the layers that are added on top of the + base feature extractor. + reuse_weights: Whether to reuse variables. Default is None. + use_explicit_padding: Whether to use explicit padding when extracting + features. Default is False. + use_depthwise: Whether to use depthwise convolutions. Default is False. + override_base_feature_extractor_hyperparams: Whether to override + hyperparameters of the base feature extractor with the one from + `conv_hyperparams_fn`. + + Raises: + ValueError: If `override_base_feature_extractor_hyperparams` is False. + """ + super(SSDInceptionV2FeatureExtractor, self).__init__( + is_training, depth_multiplier, min_depth, pad_to_multiple, + conv_hyperparams_fn, reuse_weights, use_explicit_padding, use_depthwise, + override_base_feature_extractor_hyperparams) + if not self._override_base_feature_extractor_hyperparams: + raise ValueError('SSD Inception V2 feature extractor always uses' + 'scope returned by `conv_hyperparams_fn` for both the ' + 'base feature extractor and the additional layers ' + 'added since there is no arg_scope defined for the base ' + 'feature extractor.') + + def preprocess(self, resized_inputs): + """SSD preprocessing. + + Maps pixel values to the range [-1, 1]. + + Args: + resized_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + """ + return (2.0 / 255.0) * resized_inputs - 1.0 + + def extract_features(self, preprocessed_inputs): + """Extract features from preprocessed inputs. + + Args: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. 
+ + Returns: + feature_maps: a list of tensors where the ith tensor has shape + [batch, height_i, width_i, depth_i] + """ + preprocessed_inputs = shape_utils.check_min_image_dim( + 33, preprocessed_inputs) + + feature_map_layout = { + 'from_layer': ['Mixed_4c', 'Mixed_5c', '', '', '', ''], + 'layer_depth': [-1, -1, 512, 256, 256, 128], + 'use_explicit_padding': self._use_explicit_padding, + 'use_depthwise': self._use_depthwise, + } + + with slim.arg_scope(self._conv_hyperparams_fn()): + with tf.variable_scope('InceptionV2', + reuse=self._reuse_weights) as scope: + _, image_features = inception_v2.inception_v2_base( + ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple), + final_endpoint='Mixed_5c', + min_depth=self._min_depth, + depth_multiplier=self._depth_multiplier, + scope=scope) + feature_maps = feature_map_generators.multi_resolution_feature_maps( + feature_map_layout=feature_map_layout, + depth_multiplier=self._depth_multiplier, + min_depth=self._min_depth, + insert_1x1_conv=True, + image_features=image_features) + + return feature_maps.values() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_inception_v2_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_inception_v2_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..054dcc4e4e4057113762bc2b83e6b678911ce662 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_inception_v2_feature_extractor_test.py @@ -0,0 +1,134 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.models.ssd_inception_v2_feature_extractor.""" +import numpy as np +import tensorflow as tf + +from object_detection.models import ssd_feature_extractor_test +from object_detection.models import ssd_inception_v2_feature_extractor + + +class SsdInceptionV2FeatureExtractorTest( + ssd_feature_extractor_test.SsdFeatureExtractorTestBase): + + def _create_feature_extractor(self, depth_multiplier, pad_to_multiple, + is_training=True): + """Constructs a SsdInceptionV2FeatureExtractor. + + Args: + depth_multiplier: float depth multiplier for feature extractor + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + is_training: whether the network is in training mode. + + Returns: + an ssd_inception_v2_feature_extractor.SsdInceptionV2FeatureExtractor. 
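+
+    Note: override_base_feature_extractor_hyperparams is set to True below
+    because SSDInceptionV2FeatureExtractor raises a ValueError when it is
+    left False.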
+ """ + min_depth = 32 + return ssd_inception_v2_feature_extractor.SSDInceptionV2FeatureExtractor( + is_training, depth_multiplier, min_depth, pad_to_multiple, + self.conv_hyperparams_fn, + override_base_feature_extractor_hyperparams=True) + + def test_extract_features_returns_correct_shapes_128(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 8, 8, 576), (2, 4, 4, 1024), + (2, 2, 2, 512), (2, 1, 1, 256), + (2, 1, 1, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_with_dynamic_inputs(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 8, 8, 576), (2, 4, 4, 1024), + (2, 2, 2, 512), (2, 1, 1, 256), + (2, 1, 1, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shapes_with_dynamic_inputs( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_299(self): + image_height = 299 + image_width = 299 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 19, 19, 576), (2, 10, 10, 1024), + (2, 5, 5, 512), (2, 3, 3, 256), + (2, 2, 2, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_enforcing_min_depth(self): + image_height = 299 + image_width = 299 + depth_multiplier = 0.5**12 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 19, 19, 128), (2, 10, 10, 128), + (2, 5, 5, 32), (2, 3, 3, 32), + (2, 2, 2, 32), (2, 1, 1, 32)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self): + image_height = 299 + image_width = 299 + depth_multiplier = 1.0 + pad_to_multiple = 32 + expected_feature_map_shape = [(2, 20, 20, 576), (2, 10, 10, 1024), + (2, 5, 5, 512), (2, 3, 3, 256), + (2, 2, 2, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_raises_error_with_invalid_image_size(self): + image_height = 32 + image_width = 32 + depth_multiplier = 1.0 + pad_to_multiple = 1 + self.check_extract_features_raises_error_with_invalid_image_size( + image_height, image_width, depth_multiplier, pad_to_multiple) + + def test_preprocess_returns_correct_value_range(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1 + pad_to_multiple = 1 + test_image = np.random.rand(4, image_height, image_width, 3) + feature_extractor = self._create_feature_extractor(depth_multiplier, + pad_to_multiple) + preprocessed_image = feature_extractor.preprocess(test_image) + self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0))) + + def test_variables_only_created_in_scope(self): + depth_multiplier = 1 + pad_to_multiple = 1 + scope_name = 'InceptionV2' + self.check_feature_extractor_variables_under_scope( + depth_multiplier, pad_to_multiple, scope_name) + + +if __name__ == '__main__': + tf.test.main() diff --git 
a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_inception_v3_feature_extractor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_inception_v3_feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..5d97e7b5c9c130edc8173f16a3d1deaa02ffa364 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_inception_v3_feature_extractor.py @@ -0,0 +1,126 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""SSDFeatureExtractor for InceptionV3 features.""" +import tensorflow as tf + +from object_detection.meta_architectures import ssd_meta_arch +from object_detection.models import feature_map_generators +from object_detection.utils import ops +from object_detection.utils import shape_utils +from nets import inception_v3 + +slim = tf.contrib.slim + + +class SSDInceptionV3FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): + """SSD Feature Extractor using InceptionV3 features.""" + + def __init__(self, + is_training, + depth_multiplier, + min_depth, + pad_to_multiple, + conv_hyperparams_fn, + reuse_weights=None, + use_explicit_padding=False, + use_depthwise=False, + override_base_feature_extractor_hyperparams=False): + """InceptionV3 Feature Extractor for SSD Models. + + Args: + is_training: whether the network is in training mode. + depth_multiplier: float depth multiplier for feature extractor. + min_depth: minimum feature extractor depth. + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d + and separable_conv2d ops in the layers that are added on top of the + base feature extractor. + reuse_weights: Whether to reuse variables. Default is None. + use_explicit_padding: Whether to use explicit padding when extracting + features. Default is False. + use_depthwise: Whether to use depthwise convolutions. Default is False. + override_base_feature_extractor_hyperparams: Whether to override + hyperparameters of the base feature extractor with the one from + `conv_hyperparams_fn`. + + Raises: + ValueError: If `override_base_feature_extractor_hyperparams` is False. + """ + super(SSDInceptionV3FeatureExtractor, self).__init__( + is_training, depth_multiplier, min_depth, pad_to_multiple, + conv_hyperparams_fn, reuse_weights, use_explicit_padding, use_depthwise, + override_base_feature_extractor_hyperparams) + + if not self._override_base_feature_extractor_hyperparams: + raise ValueError('SSD Inception V3 feature extractor always uses' + 'scope returned by `conv_hyperparams_fn` for both the ' + 'base feature extractor and the additional layers ' + 'added since there is no arg_scope defined for the base ' + 'feature extractor.') + + def preprocess(self, resized_inputs): + """SSD preprocessing. + + Maps pixel values to the range [-1, 1]. 
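+    For example, with the linear map (2.0 / 255.0) * x - 1.0 used below, a
+    pixel value of 0 maps to -1.0, 127.5 maps to 0.0 and 255 maps to 1.0.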
+ + Args: + resized_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + """ + return (2.0 / 255.0) * resized_inputs - 1.0 + + def extract_features(self, preprocessed_inputs): + """Extract features from preprocessed inputs. + + Args: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + feature_maps: a list of tensors where the ith tensor has shape + [batch, height_i, width_i, depth_i] + """ + preprocessed_inputs = shape_utils.check_min_image_dim( + 33, preprocessed_inputs) + + feature_map_layout = { + 'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''], + 'layer_depth': [-1, -1, -1, 512, 256, 128], + 'use_explicit_padding': self._use_explicit_padding, + 'use_depthwise': self._use_depthwise, + } + + with slim.arg_scope(self._conv_hyperparams_fn()): + with tf.variable_scope('InceptionV3', reuse=self._reuse_weights) as scope: + _, image_features = inception_v3.inception_v3_base( + ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple), + final_endpoint='Mixed_7c', + min_depth=self._min_depth, + depth_multiplier=self._depth_multiplier, + scope=scope) + feature_maps = feature_map_generators.multi_resolution_feature_maps( + feature_map_layout=feature_map_layout, + depth_multiplier=self._depth_multiplier, + min_depth=self._min_depth, + insert_1x1_conv=True, + image_features=image_features) + + return feature_maps.values() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_inception_v3_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_inception_v3_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..fe3d3520d737942efefcc9623a6f0b0c07b72f55 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_inception_v3_feature_extractor_test.py @@ -0,0 +1,134 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for object_detection.models.ssd_inception_v3_feature_extractor.""" +import numpy as np +import tensorflow as tf + +from object_detection.models import ssd_feature_extractor_test +from object_detection.models import ssd_inception_v3_feature_extractor + + +class SsdInceptionV3FeatureExtractorTest( + ssd_feature_extractor_test.SsdFeatureExtractorTestBase): + + def _create_feature_extractor(self, depth_multiplier, pad_to_multiple, + is_training=True): + """Constructs a SsdInceptionV3FeatureExtractor. + + Args: + depth_multiplier: float depth multiplier for feature extractor + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + is_training: whether the network is in training mode. 
+ + Returns: + an ssd_inception_v3_feature_extractor.SsdInceptionV3FeatureExtractor. + """ + min_depth = 32 + return ssd_inception_v3_feature_extractor.SSDInceptionV3FeatureExtractor( + is_training, depth_multiplier, min_depth, pad_to_multiple, + self.conv_hyperparams_fn, + override_base_feature_extractor_hyperparams=True) + + def test_extract_features_returns_correct_shapes_128(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 13, 13, 288), (2, 6, 6, 768), + (2, 2, 2, 2048), (2, 1, 1, 512), + (2, 1, 1, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_with_dynamic_inputs(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 13, 13, 288), (2, 6, 6, 768), + (2, 2, 2, 2048), (2, 1, 1, 512), + (2, 1, 1, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shapes_with_dynamic_inputs( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_299(self): + image_height = 299 + image_width = 299 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 35, 35, 288), (2, 17, 17, 768), + (2, 8, 8, 2048), (2, 4, 4, 512), + (2, 2, 2, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_enforcing_min_depth(self): + image_height = 299 + image_width = 299 + depth_multiplier = 0.5**12 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 35, 35, 128), (2, 17, 17, 128), + (2, 8, 8, 192), (2, 4, 4, 32), + (2, 2, 2, 32), (2, 1, 1, 32)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self): + image_height = 299 + image_width = 299 + depth_multiplier = 1.0 + pad_to_multiple = 32 + expected_feature_map_shape = [(2, 37, 37, 288), (2, 18, 18, 768), + (2, 8, 8, 2048), (2, 4, 4, 512), + (2, 2, 2, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_raises_error_with_invalid_image_size(self): + image_height = 32 + image_width = 32 + depth_multiplier = 1.0 + pad_to_multiple = 1 + self.check_extract_features_raises_error_with_invalid_image_size( + image_height, image_width, depth_multiplier, pad_to_multiple) + + def test_preprocess_returns_correct_value_range(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1 + pad_to_multiple = 1 + test_image = np.random.rand(4, image_height, image_width, 3) + feature_extractor = self._create_feature_extractor(depth_multiplier, + pad_to_multiple) + preprocessed_image = feature_extractor.preprocess(test_image) + self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0))) + + def test_variables_only_created_in_scope(self): + depth_multiplier = 1 + pad_to_multiple = 1 + scope_name = 'InceptionV3' + self.check_feature_extractor_variables_under_scope( + depth_multiplier, pad_to_multiple, scope_name) + + +if 
__name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_mobilenet_v1_feature_extractor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_mobilenet_v1_feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..aada1111ed659cbaa0180a0535ea6095d0f1d9c9 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_mobilenet_v1_feature_extractor.py @@ -0,0 +1,128 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""SSDFeatureExtractor for MobilenetV1 features.""" + +import tensorflow as tf + +from object_detection.meta_architectures import ssd_meta_arch +from object_detection.models import feature_map_generators +from object_detection.utils import context_manager +from object_detection.utils import ops +from object_detection.utils import shape_utils +from nets import mobilenet_v1 + +slim = tf.contrib.slim + + +class SSDMobileNetV1FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): + """SSD Feature Extractor using MobilenetV1 features.""" + + def __init__(self, + is_training, + depth_multiplier, + min_depth, + pad_to_multiple, + conv_hyperparams_fn, + reuse_weights=None, + use_explicit_padding=False, + use_depthwise=False, + override_base_feature_extractor_hyperparams=False): + """MobileNetV1 Feature Extractor for SSD Models. + + Args: + is_training: whether the network is in training mode. + depth_multiplier: float depth multiplier for feature extractor. + min_depth: minimum feature extractor depth. + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d + and separable_conv2d ops in the layers that are added on top of the + base feature extractor. + reuse_weights: Whether to reuse variables. Default is None. + use_explicit_padding: Use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + use_depthwise: Whether to use depthwise convolutions. Default is False. + override_base_feature_extractor_hyperparams: Whether to override + hyperparameters of the base feature extractor with the one from + `conv_hyperparams_fn`. + """ + super(SSDMobileNetV1FeatureExtractor, self).__init__( + is_training, depth_multiplier, min_depth, pad_to_multiple, + conv_hyperparams_fn, reuse_weights, use_explicit_padding, use_depthwise, + override_base_feature_extractor_hyperparams) + + def preprocess(self, resized_inputs): + """SSD preprocessing. + + Maps pixel values to the range [-1, 1]. + + Args: + resized_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. 
+ """ + return (2.0 / 255.0) * resized_inputs - 1.0 + + def extract_features(self, preprocessed_inputs): + """Extract features from preprocessed inputs. + + Args: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + feature_maps: a list of tensors where the ith tensor has shape + [batch, height_i, width_i, depth_i] + """ + preprocessed_inputs = shape_utils.check_min_image_dim( + 33, preprocessed_inputs) + + feature_map_layout = { + 'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '', + '', ''], + 'layer_depth': [-1, -1, 512, 256, 256, 128], + 'use_explicit_padding': self._use_explicit_padding, + 'use_depthwise': self._use_depthwise, + } + + with tf.variable_scope('MobilenetV1', + reuse=self._reuse_weights) as scope: + with slim.arg_scope( + mobilenet_v1.mobilenet_v1_arg_scope( + is_training=None, regularize_depthwise=True)): + with (slim.arg_scope(self._conv_hyperparams_fn()) + if self._override_base_feature_extractor_hyperparams + else context_manager.IdentityContextManager()): + _, image_features = mobilenet_v1.mobilenet_v1_base( + ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple), + final_endpoint='Conv2d_13_pointwise', + min_depth=self._min_depth, + depth_multiplier=self._depth_multiplier, + use_explicit_padding=self._use_explicit_padding, + scope=scope) + with slim.arg_scope(self._conv_hyperparams_fn()): + feature_maps = feature_map_generators.multi_resolution_feature_maps( + feature_map_layout=feature_map_layout, + depth_multiplier=self._depth_multiplier, + min_depth=self._min_depth, + insert_1x1_conv=True, + image_features=image_features) + + return feature_maps.values() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_mobilenet_v1_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_mobilenet_v1_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..d3a9542b3596712aa1d85da6b9d68416c23c3ff3 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_mobilenet_v1_feature_extractor_test.py @@ -0,0 +1,166 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for ssd_mobilenet_v1_feature_extractor.""" +import numpy as np +import tensorflow as tf + +from object_detection.models import ssd_feature_extractor_test +from object_detection.models import ssd_mobilenet_v1_feature_extractor + +slim = tf.contrib.slim + + +class SsdMobilenetV1FeatureExtractorTest( + ssd_feature_extractor_test.SsdFeatureExtractorTestBase): + + def _create_feature_extractor(self, depth_multiplier, pad_to_multiple, + is_training=True, use_explicit_padding=False): + """Constructs a new feature extractor. 
+ + Args: + depth_multiplier: float depth multiplier for feature extractor + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + is_training: whether the network is in training mode. + use_explicit_padding: Use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + Returns: + an ssd_meta_arch.SSDFeatureExtractor object. + """ + min_depth = 32 + return ssd_mobilenet_v1_feature_extractor.SSDMobileNetV1FeatureExtractor( + is_training, depth_multiplier, min_depth, pad_to_multiple, + self.conv_hyperparams_fn, + use_explicit_padding=use_explicit_padding) + + def test_extract_features_returns_correct_shapes_128(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 8, 8, 512), (2, 4, 4, 1024), + (2, 2, 2, 512), (2, 1, 1, 256), + (2, 1, 1, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape, use_explicit_padding=False) + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape, use_explicit_padding=True) + + def test_extract_features_returns_correct_shapes_299(self): + image_height = 299 + image_width = 299 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 19, 19, 512), (2, 10, 10, 1024), + (2, 5, 5, 512), (2, 3, 3, 256), + (2, 2, 2, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape, use_explicit_padding=False) + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape, use_explicit_padding=True) + + def test_extract_features_with_dynamic_image_shape(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 8, 8, 512), (2, 4, 4, 1024), + (2, 2, 2, 512), (2, 1, 1, 256), + (2, 1, 1, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shapes_with_dynamic_inputs( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape, use_explicit_padding=False) + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape, use_explicit_padding=True) + + def test_extract_features_returns_correct_shapes_enforcing_min_depth(self): + image_height = 299 + image_width = 299 + depth_multiplier = 0.5**12 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 19, 19, 32), (2, 10, 10, 32), + (2, 5, 5, 32), (2, 3, 3, 32), + (2, 2, 2, 32), (2, 1, 1, 32)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape, use_explicit_padding=False) + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape, use_explicit_padding=True) + + def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self): + image_height = 299 + image_width = 299 + depth_multiplier = 1.0 + pad_to_multiple = 32 + expected_feature_map_shape = [(2, 20, 20, 512), (2, 10, 10, 1024), + (2, 5, 5, 512), (2, 3, 3, 256), + 
(2, 2, 2, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape, use_explicit_padding=False) + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape, use_explicit_padding=True) + + def test_extract_features_raises_error_with_invalid_image_size(self): + image_height = 32 + image_width = 32 + depth_multiplier = 1.0 + pad_to_multiple = 1 + self.check_extract_features_raises_error_with_invalid_image_size( + image_height, image_width, depth_multiplier, pad_to_multiple) + + def test_preprocess_returns_correct_value_range(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1 + pad_to_multiple = 1 + test_image = np.random.rand(2, image_height, image_width, 3) + feature_extractor = self._create_feature_extractor(depth_multiplier, + pad_to_multiple) + preprocessed_image = feature_extractor.preprocess(test_image) + self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0))) + + def test_variables_only_created_in_scope(self): + depth_multiplier = 1 + pad_to_multiple = 1 + scope_name = 'MobilenetV1' + self.check_feature_extractor_variables_under_scope( + depth_multiplier, pad_to_multiple, scope_name) + + def test_has_fused_batchnorm(self): + image_height = 40 + image_width = 40 + depth_multiplier = 1 + pad_to_multiple = 1 + image_placeholder = tf.placeholder(tf.float32, + [1, image_height, image_width, 3]) + feature_extractor = self._create_feature_extractor(depth_multiplier, + pad_to_multiple) + preprocessed_image = feature_extractor.preprocess(image_placeholder) + _ = feature_extractor.extract_features(preprocessed_image) + self.assertTrue(any(op.type == 'FusedBatchNorm' + for op in tf.get_default_graph().get_operations())) + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_mobilenet_v2_feature_extractor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_mobilenet_v2_feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..014b93a8e293608fde713f556979f8ecffc9a52d --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_mobilenet_v2_feature_extractor.py @@ -0,0 +1,129 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""SSDFeatureExtractor for MobilenetV2 features.""" + +import tensorflow as tf + +from object_detection.meta_architectures import ssd_meta_arch +from object_detection.models import feature_map_generators +from object_detection.utils import context_manager +from object_detection.utils import ops +from object_detection.utils import shape_utils +from nets.mobilenet import mobilenet +from nets.mobilenet import mobilenet_v2 + +slim = tf.contrib.slim + + +class SSDMobileNetV2FeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): + """SSD Feature Extractor using MobilenetV2 features.""" + + def __init__(self, + is_training, + depth_multiplier, + min_depth, + pad_to_multiple, + conv_hyperparams_fn, + reuse_weights=None, + use_explicit_padding=False, + use_depthwise=False, + override_base_feature_extractor_hyperparams=False): + """MobileNetV2 Feature Extractor for SSD Models. + + Mobilenet v2 (experimental), designed by sandler@. More details can be found + in //knowledge/cerebra/brain/compression/mobilenet/mobilenet_experimental.py + + Args: + is_training: whether the network is in training mode. + depth_multiplier: float depth multiplier for feature extractor. + min_depth: minimum feature extractor depth. + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d + and separable_conv2d ops in the layers that are added on top of the + base feature extractor. + reuse_weights: Whether to reuse variables. Default is None. + use_explicit_padding: Whether to use explicit padding when extracting + features. Default is False. + use_depthwise: Whether to use depthwise convolutions. Default is False. + override_base_feature_extractor_hyperparams: Whether to override + hyperparameters of the base feature extractor with the one from + `conv_hyperparams_fn`. + """ + super(SSDMobileNetV2FeatureExtractor, self).__init__( + is_training, depth_multiplier, min_depth, pad_to_multiple, + conv_hyperparams_fn, reuse_weights, use_explicit_padding, use_depthwise, + override_base_feature_extractor_hyperparams) + + def preprocess(self, resized_inputs): + """SSD preprocessing. + + Maps pixel values to the range [-1, 1]. + + Args: + resized_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + """ + return (2.0 / 255.0) * resized_inputs - 1.0 + + def extract_features(self, preprocessed_inputs): + """Extract features from preprocessed inputs. + + Args: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. 
+ + Returns: + feature_maps: a list of tensors where the ith tensor has shape + [batch, height_i, width_i, depth_i] + """ + preprocessed_inputs = shape_utils.check_min_image_dim( + 33, preprocessed_inputs) + + feature_map_layout = { + 'from_layer': ['layer_15/expansion_output', 'layer_19', '', '', '', ''], + 'layer_depth': [-1, -1, 512, 256, 256, 128], + 'use_depthwise': self._use_depthwise, + 'use_explicit_padding': self._use_explicit_padding, + } + + with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope: + with slim.arg_scope( + mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \ + slim.arg_scope( + [mobilenet.depth_multiplier], min_depth=self._min_depth): + with (slim.arg_scope(self._conv_hyperparams_fn()) + if self._override_base_feature_extractor_hyperparams else + context_manager.IdentityContextManager()): + _, image_features = mobilenet_v2.mobilenet_base( + ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple), + final_endpoint='layer_19', + depth_multiplier=self._depth_multiplier, + use_explicit_padding=self._use_explicit_padding, + scope=scope) + with slim.arg_scope(self._conv_hyperparams_fn()): + feature_maps = feature_map_generators.multi_resolution_feature_maps( + feature_map_layout=feature_map_layout, + depth_multiplier=self._depth_multiplier, + min_depth=self._min_depth, + insert_1x1_conv=True, + image_features=image_features) + + return feature_maps.values() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_mobilenet_v2_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_mobilenet_v2_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0b374749f2947f848692275bd79129d9f5d4bd76 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_mobilenet_v2_feature_extractor_test.py @@ -0,0 +1,154 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for ssd_mobilenet_v2_feature_extractor.""" +import numpy as np +import tensorflow as tf + +from object_detection.models import ssd_feature_extractor_test +from object_detection.models import ssd_mobilenet_v2_feature_extractor + +slim = tf.contrib.slim + + +class SsdMobilenetV2FeatureExtractorTest( + ssd_feature_extractor_test.SsdFeatureExtractorTestBase): + + def _create_feature_extractor(self, depth_multiplier, pad_to_multiple, + use_explicit_padding=False): + """Constructs a new feature extractor. + + Args: + depth_multiplier: float depth multiplier for feature extractor + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + use_explicit_padding: use 'VALID' padding for convolutions, but prepad + inputs so that the output dimensions are the same as if 'SAME' padding + were used. + Returns: + an ssd_meta_arch.SSDFeatureExtractor object. 
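+
+    Note: the extractor is constructed below with is_training=False, passed
+    positionally as the first argument.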
+ """ + min_depth = 32 + return ssd_mobilenet_v2_feature_extractor.SSDMobileNetV2FeatureExtractor( + False, + depth_multiplier, + min_depth, + pad_to_multiple, + self.conv_hyperparams_fn, + use_explicit_padding=use_explicit_padding) + + def test_extract_features_returns_correct_shapes_128(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 8, 8, 576), (2, 4, 4, 1280), + (2, 2, 2, 512), (2, 1, 1, 256), + (2, 1, 1, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_with_dynamic_inputs(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 8, 8, 576), (2, 4, 4, 1280), + (2, 2, 2, 512), (2, 1, 1, 256), + (2, 1, 1, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shapes_with_dynamic_inputs( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_299(self): + image_height = 299 + image_width = 299 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 19, 19, 576), (2, 10, 10, 1280), + (2, 5, 5, 512), (2, 3, 3, 256), + (2, 2, 2, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_enforcing_min_depth(self): + image_height = 299 + image_width = 299 + depth_multiplier = 0.5**12 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 19, 19, 192), (2, 10, 10, 32), + (2, 5, 5, 32), (2, 3, 3, 32), + (2, 2, 2, 32), (2, 1, 1, 32)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self): + image_height = 299 + image_width = 299 + depth_multiplier = 1.0 + pad_to_multiple = 32 + expected_feature_map_shape = [(2, 20, 20, 576), (2, 10, 10, 1280), + (2, 5, 5, 512), (2, 3, 3, 256), + (2, 2, 2, 256), (2, 1, 1, 128)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_raises_error_with_invalid_image_size(self): + image_height = 32 + image_width = 32 + depth_multiplier = 1.0 + pad_to_multiple = 1 + self.check_extract_features_raises_error_with_invalid_image_size( + image_height, image_width, depth_multiplier, pad_to_multiple) + + def test_preprocess_returns_correct_value_range(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1 + pad_to_multiple = 1 + test_image = np.random.rand(4, image_height, image_width, 3) + feature_extractor = self._create_feature_extractor(depth_multiplier, + pad_to_multiple) + preprocessed_image = feature_extractor.preprocess(test_image) + self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0))) + + def test_variables_only_created_in_scope(self): + depth_multiplier = 1 + pad_to_multiple = 1 + scope_name = 'MobilenetV2' + self.check_feature_extractor_variables_under_scope( + depth_multiplier, pad_to_multiple, scope_name) + + def test_has_fused_batchnorm(self): + image_height = 40 + image_width = 40 + depth_multiplier = 1 + 
pad_to_multiple = 1 + image_placeholder = tf.placeholder(tf.float32, + [1, image_height, image_width, 3]) + feature_extractor = self._create_feature_extractor(depth_multiplier, + pad_to_multiple) + preprocessed_image = feature_extractor.preprocess(image_placeholder) + _ = feature_extractor.extract_features(preprocessed_image) + self.assertTrue(any(op.type == 'FusedBatchNorm' + for op in tf.get_default_graph().get_operations())) + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_resnet_v1_fpn_feature_extractor.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_resnet_v1_fpn_feature_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..65bda3f4fc3a33e9bf6f356438ebb35c51bbae59 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_resnet_v1_fpn_feature_extractor.py @@ -0,0 +1,290 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""SSD Feature Pyramid Network (FPN) feature extractors based on Resnet v1. + +See https://arxiv.org/abs/1708.02002 for details. +""" + +import tensorflow as tf + +from object_detection.meta_architectures import ssd_meta_arch +from object_detection.models import feature_map_generators +from object_detection.utils import context_manager +from object_detection.utils import ops +from object_detection.utils import shape_utils +from nets import resnet_v1 + +slim = tf.contrib.slim + + +class _SSDResnetV1FpnFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor): + """SSD FPN feature extractor based on Resnet v1 architecture.""" + + def __init__(self, + is_training, + depth_multiplier, + min_depth, + pad_to_multiple, + conv_hyperparams_fn, + resnet_base_fn, + resnet_scope_name, + fpn_scope_name, + reuse_weights=None, + use_explicit_padding=False, + use_depthwise=False, + override_base_feature_extractor_hyperparams=False): + """SSD FPN feature extractor based on Resnet v1 architecture. + + Args: + is_training: whether the network is in training mode. + depth_multiplier: float depth multiplier for feature extractor. + UNUSED currently. + min_depth: minimum feature extractor depth. UNUSED Currently. + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d + and separable_conv2d ops in the layers that are added on top of the + base feature extractor. + resnet_base_fn: base resnet network to use. + resnet_scope_name: scope name under which to construct resnet + fpn_scope_name: scope name under which to construct the feature pyramid + network. + reuse_weights: Whether to reuse variables. Default is None. + use_explicit_padding: Whether to use explicit padding when extracting + features. Default is False. UNUSED currently. 
+ use_depthwise: Whether to use depthwise convolutions. UNUSED currently. + override_base_feature_extractor_hyperparams: Whether to override + hyperparameters of the base feature extractor with the one from + `conv_hyperparams_fn`. + + Raises: + ValueError: On supplying invalid arguments for unused arguments. + """ + super(_SSDResnetV1FpnFeatureExtractor, self).__init__( + is_training, depth_multiplier, min_depth, pad_to_multiple, + conv_hyperparams_fn, reuse_weights, use_explicit_padding, + override_base_feature_extractor_hyperparams) + if self._depth_multiplier != 1.0: + raise ValueError('Only depth 1.0 is supported, found: {}'. + format(self._depth_multiplier)) + if self._use_explicit_padding is True: + raise ValueError('Explicit padding is not a valid option.') + self._resnet_base_fn = resnet_base_fn + self._resnet_scope_name = resnet_scope_name + self._fpn_scope_name = fpn_scope_name + + def preprocess(self, resized_inputs): + """SSD preprocessing. + + VGG style channel mean subtraction as described here: + https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-mdnge. + + Args: + resized_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + """ + channel_means = [123.68, 116.779, 103.939] + return resized_inputs - [[channel_means]] + + def _filter_features(self, image_features): + # TODO(rathodv): Change resnet endpoint to strip scope prefixes instead + # of munging the scope here. + filtered_image_features = dict({}) + for key, feature in image_features.items(): + feature_name = key.split('/')[-1] + if feature_name in ['block2', 'block3', 'block4']: + filtered_image_features[feature_name] = feature + return filtered_image_features + + def extract_features(self, preprocessed_inputs): + """Extract features from preprocessed inputs. + + Args: + preprocessed_inputs: a [batch, height, width, channels] float tensor + representing a batch of images. + + Returns: + feature_maps: a list of tensors where the ith tensor has shape + [batch, height_i, width_i, depth_i] + + Raises: + ValueError: depth multiplier is not supported. 
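+
+    Note: as constructed below, the returned list holds five depth-256 maps,
+    [top_down_block2, top_down_block3, top_down_block4, bottom_up_block5,
+    bottom_up_block6], i.e. strides 8, 16, 32, 64 and 128 relative to the
+    (padded) input, matching the shape expectations in the accompanying
+    FPN feature extractor testbase.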
+ """ + if self._depth_multiplier != 1.0: + raise ValueError('Depth multiplier not supported.') + + preprocessed_inputs = shape_utils.check_min_image_dim( + 129, preprocessed_inputs) + + with tf.variable_scope( + self._resnet_scope_name, reuse=self._reuse_weights) as scope: + with slim.arg_scope(resnet_v1.resnet_arg_scope()): + with (slim.arg_scope(self._conv_hyperparams_fn()) + if self._override_base_feature_extractor_hyperparams else + context_manager.IdentityContextManager()): + _, image_features = self._resnet_base_fn( + inputs=ops.pad_to_multiple(preprocessed_inputs, + self._pad_to_multiple), + num_classes=None, + is_training=None, + global_pool=False, + output_stride=None, + store_non_strided_activations=True, + scope=scope) + image_features = self._filter_features(image_features) + with slim.arg_scope(self._conv_hyperparams_fn()): + with tf.variable_scope(self._fpn_scope_name, + reuse=self._reuse_weights): + fpn_features = feature_map_generators.fpn_top_down_feature_maps( + [(key, image_features[key]) + for key in ['block2', 'block3', 'block4']], + depth=256) + last_feature_map = fpn_features['top_down_block4'] + coarse_features = {} + for i in range(5, 7): + last_feature_map = slim.conv2d( + last_feature_map, + num_outputs=256, + kernel_size=[3, 3], + stride=2, + padding='SAME', + scope='bottom_up_block{}'.format(i)) + coarse_features['bottom_up_block{}'.format(i)] = last_feature_map + return [fpn_features['top_down_block2'], + fpn_features['top_down_block3'], + fpn_features['top_down_block4'], + coarse_features['bottom_up_block5'], + coarse_features['bottom_up_block6']] + + +class SSDResnet50V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): + + def __init__(self, + is_training, + depth_multiplier, + min_depth, + pad_to_multiple, + conv_hyperparams_fn, + reuse_weights=None, + use_explicit_padding=False, + use_depthwise=False, + override_base_feature_extractor_hyperparams=False): + """SSD Resnet50 V1 FPN feature extractor based on Resnet v1 architecture. + + Args: + is_training: whether the network is in training mode. + depth_multiplier: float depth multiplier for feature extractor. + UNUSED currently. + min_depth: minimum feature extractor depth. UNUSED Currently. + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d + and separable_conv2d ops in the layers that are added on top of the + base feature extractor. + reuse_weights: Whether to reuse variables. Default is None. + use_explicit_padding: Whether to use explicit padding when extracting + features. Default is False. UNUSED currently. + use_depthwise: Whether to use depthwise convolutions. UNUSED currently. + override_base_feature_extractor_hyperparams: Whether to override + hyperparameters of the base feature extractor with the one from + `conv_hyperparams_fn`. 
+ """ + super(SSDResnet50V1FpnFeatureExtractor, self).__init__( + is_training, depth_multiplier, min_depth, pad_to_multiple, + conv_hyperparams_fn, resnet_v1.resnet_v1_50, 'resnet_v1_50', 'fpn', + reuse_weights, use_explicit_padding, + override_base_feature_extractor_hyperparams) + + +class SSDResnet101V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): + + def __init__(self, + is_training, + depth_multiplier, + min_depth, + pad_to_multiple, + conv_hyperparams_fn, + reuse_weights=None, + use_explicit_padding=False, + use_depthwise=False, + override_base_feature_extractor_hyperparams=False): + """SSD Resnet101 V1 FPN feature extractor based on Resnet v1 architecture. + + Args: + is_training: whether the network is in training mode. + depth_multiplier: float depth multiplier for feature extractor. + UNUSED currently. + min_depth: minimum feature extractor depth. UNUSED Currently. + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d + and separable_conv2d ops in the layers that are added on top of the + base feature extractor. + reuse_weights: Whether to reuse variables. Default is None. + use_explicit_padding: Whether to use explicit padding when extracting + features. Default is False. UNUSED currently. + use_depthwise: Whether to use depthwise convolutions. UNUSED currently. + override_base_feature_extractor_hyperparams: Whether to override + hyperparameters of the base feature extractor with the one from + `conv_hyperparams_fn`. + """ + super(SSDResnet101V1FpnFeatureExtractor, self).__init__( + is_training, depth_multiplier, min_depth, pad_to_multiple, + conv_hyperparams_fn, resnet_v1.resnet_v1_101, 'resnet_v1_101', 'fpn', + reuse_weights, use_explicit_padding, + override_base_feature_extractor_hyperparams) + + +class SSDResnet152V1FpnFeatureExtractor(_SSDResnetV1FpnFeatureExtractor): + + def __init__(self, + is_training, + depth_multiplier, + min_depth, + pad_to_multiple, + conv_hyperparams_fn, + reuse_weights=None, + use_explicit_padding=False, + use_depthwise=False, + override_base_feature_extractor_hyperparams=False): + """SSD Resnet152 V1 FPN feature extractor based on Resnet v1 architecture. + + Args: + is_training: whether the network is in training mode. + depth_multiplier: float depth multiplier for feature extractor. + UNUSED currently. + min_depth: minimum feature extractor depth. UNUSED Currently. + pad_to_multiple: the nearest multiple to zero pad the input height and + width dimensions to. + conv_hyperparams_fn: A function to construct tf slim arg_scope for conv2d + and separable_conv2d ops in the layers that are added on top of the + base feature extractor. + reuse_weights: Whether to reuse variables. Default is None. + use_explicit_padding: Whether to use explicit padding when extracting + features. Default is False. UNUSED currently. + use_depthwise: Whether to use depthwise convolutions. UNUSED currently. + override_base_feature_extractor_hyperparams: Whether to override + hyperparameters of the base feature extractor with the one from + `conv_hyperparams_fn`. 
+ """ + super(SSDResnet152V1FpnFeatureExtractor, self).__init__( + is_training, depth_multiplier, min_depth, pad_to_multiple, + conv_hyperparams_fn, resnet_v1.resnet_v1_152, 'resnet_v1_152', 'fpn', + reuse_weights, use_explicit_padding, + override_base_feature_extractor_hyperparams) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_resnet_v1_fpn_feature_extractor_test.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_resnet_v1_fpn_feature_extractor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..5f406359ac15f5b6cfaaad2cde196a7529d0404a --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_resnet_v1_fpn_feature_extractor_test.py @@ -0,0 +1,84 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for ssd resnet v1 FPN feature extractors.""" +import tensorflow as tf + +from object_detection.models import ssd_resnet_v1_fpn_feature_extractor +from object_detection.models import ssd_resnet_v1_fpn_feature_extractor_testbase + + +class SSDResnet50V1FeatureExtractorTest( + ssd_resnet_v1_fpn_feature_extractor_testbase. + SSDResnetFPNFeatureExtractorTestBase): + """SSDResnet50v1Fpn feature extractor test.""" + + def _create_feature_extractor(self, depth_multiplier, pad_to_multiple, + use_explicit_padding=False): + min_depth = 32 + is_training = True + return ssd_resnet_v1_fpn_feature_extractor.SSDResnet50V1FpnFeatureExtractor( + is_training, depth_multiplier, min_depth, pad_to_multiple, + self.conv_hyperparams_fn, use_explicit_padding=use_explicit_padding) + + def _resnet_scope_name(self): + return 'resnet_v1_50' + + +class SSDResnet101V1FeatureExtractorTest( + ssd_resnet_v1_fpn_feature_extractor_testbase. + SSDResnetFPNFeatureExtractorTestBase): + """SSDResnet101v1Fpn feature extractor test.""" + + def _create_feature_extractor(self, depth_multiplier, pad_to_multiple, + use_explicit_padding=False): + min_depth = 32 + is_training = True + return ( + ssd_resnet_v1_fpn_feature_extractor.SSDResnet101V1FpnFeatureExtractor( + is_training, + depth_multiplier, + min_depth, + pad_to_multiple, + self.conv_hyperparams_fn, + use_explicit_padding=use_explicit_padding)) + + def _resnet_scope_name(self): + return 'resnet_v1_101' + + +class SSDResnet152V1FeatureExtractorTest( + ssd_resnet_v1_fpn_feature_extractor_testbase. 
+ SSDResnetFPNFeatureExtractorTestBase): + """SSDResnet152v1Fpn feature extractor test.""" + + def _create_feature_extractor(self, depth_multiplier, pad_to_multiple, + use_explicit_padding=False): + min_depth = 32 + is_training = True + return ( + ssd_resnet_v1_fpn_feature_extractor.SSDResnet152V1FpnFeatureExtractor( + is_training, + depth_multiplier, + min_depth, + pad_to_multiple, + self.conv_hyperparams_fn, + use_explicit_padding=use_explicit_padding)) + + def _resnet_scope_name(self): + return 'resnet_v1_152' + + +if __name__ == '__main__': + tf.test.main() diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_resnet_v1_fpn_feature_extractor_testbase.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_resnet_v1_fpn_feature_extractor_testbase.py new file mode 100644 index 0000000000000000000000000000000000000000..186f2b1748b8038ebbe3236b3ba35be24b33afea --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/models/ssd_resnet_v1_fpn_feature_extractor_testbase.py @@ -0,0 +1,107 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for ssd resnet v1 FPN feature extractors.""" +import abc +import numpy as np +import tensorflow as tf + +from object_detection.models import ssd_feature_extractor_test + + +class SSDResnetFPNFeatureExtractorTestBase( + ssd_feature_extractor_test.SsdFeatureExtractorTestBase): + """Helper test class for SSD Resnet v1 FPN feature extractors.""" + + @abc.abstractmethod + def _resnet_scope_name(self): + pass + + @abc.abstractmethod + def _fpn_scope_name(self): + return 'fpn' + + def test_extract_features_returns_correct_shapes_256(self): + image_height = 256 + image_width = 256 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 32, 32, 256), (2, 16, 16, 256), + (2, 8, 8, 256), (2, 4, 4, 256), + (2, 2, 2, 256)] + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_with_dynamic_inputs(self): + image_height = 256 + image_width = 256 + depth_multiplier = 1.0 + pad_to_multiple = 1 + expected_feature_map_shape = [(2, 32, 32, 256), (2, 16, 16, 256), + (2, 8, 8, 256), (2, 4, 4, 256), + (2, 2, 2, 256)] + self.check_extract_features_returns_correct_shapes_with_dynamic_inputs( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + expected_feature_map_shape) + + def test_extract_features_returns_correct_shapes_with_pad_to_multiple(self): + image_height = 254 + image_width = 254 + depth_multiplier = 1.0 + pad_to_multiple = 32 + expected_feature_map_shape = [(2, 32, 32, 256), (2, 16, 16, 256), + (2, 8, 8, 256), (2, 4, 4, 256), + (2, 2, 2, 256)] + + self.check_extract_features_returns_correct_shape( + 2, image_height, image_width, depth_multiplier, pad_to_multiple, + 
expected_feature_map_shape) + + def test_extract_features_raises_error_with_invalid_image_size(self): + image_height = 32 + image_width = 32 + depth_multiplier = 1.0 + pad_to_multiple = 1 + self.check_extract_features_raises_error_with_invalid_image_size( + image_height, image_width, depth_multiplier, pad_to_multiple) + + def test_preprocess_returns_correct_value_range(self): + image_height = 128 + image_width = 128 + depth_multiplier = 1 + pad_to_multiple = 1 + test_image = np.random.rand(4, image_height, image_width, 3) + feature_extractor = self._create_feature_extractor(depth_multiplier, + pad_to_multiple) + preprocessed_image = feature_extractor.preprocess(test_image) + self.assertAllClose(preprocessed_image, + test_image - [[123.68, 116.779, 103.939]]) + + def test_variables_only_created_in_scope(self): + depth_multiplier = 1 + pad_to_multiple = 1 + g = tf.Graph() + with g.as_default(): + feature_extractor = self._create_feature_extractor( + depth_multiplier, pad_to_multiple) + preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3)) + feature_extractor.extract_features(preprocessed_inputs) + variables = g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + for variable in variables: + self.assertTrue( + variable.name.startswith(self._resnet_scope_name()) + or variable.name.startswith(self._fpn_scope_name())) + + diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/object_detection_tutorial.ipynb b/research/mlperf_object_detection/Mask_RCNN/object_detection/object_detection_tutorial.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b5acce97818a61fba0dfb1efbe3fe5cc4edc48ef --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/object_detection_tutorial.ipynb @@ -0,0 +1,343 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Object Detection Demo\n", + "Welcome to the object detection inference walkthrough! This notebook will walk you step by step through the process of using a pre-trained model to detect objects in an image. Make sure to follow the [installation instructions](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md) before you start." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os\n", + "import six.moves.urllib as urllib\n", + "import sys\n", + "import tarfile\n", + "import tensorflow as tf\n", + "import zipfile\n", + "\n", + "from collections import defaultdict\n", + "from io import StringIO\n", + "from matplotlib import pyplot as plt\n", + "from PIL import Image\n", + "\n", + "# This is needed since the notebook is stored in the object_detection folder.\n", + "sys.path.append(\"..\")\n", + "from object_detection.utils import ops as utils_ops\n", + "\n", + "if tf.__version__ < '1.4.0':\n", + " raise ImportError('Please upgrade your tensorflow installation to v1.4.* or later!')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Env setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is needed to display the images.\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Object detection imports\n", + "Here are the imports from the object detection module." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from utils import label_map_util\n", + "\n", + "from utils import visualization_utils as vis_util" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model preparation " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Variables\n", + "\n", + "Any model exported using the `export_inference_graph.py` tool can be loaded here simply by changing `PATH_TO_CKPT` to point to a new .pb file. \n", + "\n", + "By default we use an \"SSD with Mobilenet\" model here. See the [detection model zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md) for a list of other models that can be run out-of-the-box with varying speeds and accuracies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What model to download.\n", + "MODEL_NAME = 'ssd_mobilenet_v1_coco_2017_11_17'\n", + "MODEL_FILE = MODEL_NAME + '.tar.gz'\n", + "DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'\n", + "\n", + "# Path to frozen detection graph. This is the actual model that is used for the object detection.\n", + "PATH_TO_CKPT = MODEL_NAME + '/frozen_inference_graph.pb'\n", + "\n", + "# List of the strings that is used to add correct label for each box.\n", + "PATH_TO_LABELS = os.path.join('data', 'mscoco_label_map.pbtxt')\n", + "\n", + "NUM_CLASSES = 90" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "opener = urllib.request.URLopener()\n", + "opener.retrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)\n", + "tar_file = tarfile.open(MODEL_FILE)\n", + "for file in tar_file.getmembers():\n", + " file_name = os.path.basename(file.name)\n", + " if 'frozen_inference_graph.pb' in file_name:\n", + " tar_file.extract(file, os.getcwd())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load a (frozen) Tensorflow model into memory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "detection_graph = tf.Graph()\n", + "with detection_graph.as_default():\n", + " od_graph_def = tf.GraphDef()\n", + " with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:\n", + " serialized_graph = fid.read()\n", + " od_graph_def.ParseFromString(serialized_graph)\n", + " tf.import_graph_def(od_graph_def, name='')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading label map\n", + "Label maps map indices to category names, so that when our convolution network predicts `5`, we know that this corresponds to `airplane`. 
Here we use internal utility functions, but anything that returns a dictionary mapping integers to appropriate string labels would be fine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "label_map = label_map_util.load_labelmap(PATH_TO_LABELS)\n", + "categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)\n", + "category_index = label_map_util.create_category_index(categories)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Helper code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_image_into_numpy_array(image):\n", + " (im_width, im_height) = image.size\n", + " return np.array(image.getdata()).reshape(\n", + " (im_height, im_width, 3)).astype(np.uint8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Detection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# For the sake of simplicity we will use only 2 images:\n", + "# image1.jpg\n", + "# image2.jpg\n", + "# If you want to test the code with your images, just add path to the images to the TEST_IMAGE_PATHS.\n", + "PATH_TO_TEST_IMAGES_DIR = 'test_images'\n", + "TEST_IMAGE_PATHS = [ os.path.join(PATH_TO_TEST_IMAGES_DIR, 'image{}.jpg'.format(i)) for i in range(1, 3) ]\n", + "\n", + "# Size, in inches, of the output images.\n", + "IMAGE_SIZE = (12, 8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def run_inference_for_single_image(image, graph):\n", + " with graph.as_default():\n", + " with tf.Session() as sess:\n", + " # Get handles to input and output tensors\n", + " ops = tf.get_default_graph().get_operations()\n", + " all_tensor_names = {output.name for op in ops for output in op.outputs}\n", + " tensor_dict = {}\n", + " for key in [\n", + " 'num_detections', 'detection_boxes', 'detection_scores',\n", + " 'detection_classes', 'detection_masks'\n", + " ]:\n", + " tensor_name = key + ':0'\n", + " if tensor_name in all_tensor_names:\n", + " tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(\n", + " tensor_name)\n", + " if 'detection_masks' in tensor_dict:\n", + " # The following processing is only for single image\n", + " detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])\n", + " detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])\n", + " # Reframe is required to translate mask from box coordinates to image coordinates and fit the image size.\n", + " real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)\n", + " detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])\n", + " detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])\n", + " detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(\n", + " detection_masks, detection_boxes, image.shape[0], image.shape[1])\n", + " detection_masks_reframed = tf.cast(\n", + " tf.greater(detection_masks_reframed, 0.5), tf.uint8)\n", + " # Follow the convention by adding back the batch dimension\n", + " tensor_dict['detection_masks'] = tf.expand_dims(\n", + " detection_masks_reframed, 0)\n", + " image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')\n", + "\n", + " # Run inference\n", + " output_dict = sess.run(tensor_dict,\n", + " feed_dict={image_tensor: 
np.expand_dims(image, 0)})\n", + "\n", + " # all outputs are float32 numpy arrays, so convert types as appropriate\n", + " output_dict['num_detections'] = int(output_dict['num_detections'][0])\n", + " output_dict['detection_classes'] = output_dict[\n", + " 'detection_classes'][0].astype(np.uint8)\n", + " output_dict['detection_boxes'] = output_dict['detection_boxes'][0]\n", + " output_dict['detection_scores'] = output_dict['detection_scores'][0]\n", + " if 'detection_masks' in output_dict:\n", + " output_dict['detection_masks'] = output_dict['detection_masks'][0]\n", + " return output_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "for image_path in TEST_IMAGE_PATHS:\n", + " image = Image.open(image_path)\n", + " # the array based representation of the image will be used later in order to prepare the\n", + " # result image with boxes and labels on it.\n", + " image_np = load_image_into_numpy_array(image)\n", + " # Expand dimensions since the model expects images to have shape: [1, None, None, 3]\n", + " image_np_expanded = np.expand_dims(image_np, axis=0)\n", + " # Actual detection.\n", + " output_dict = run_inference_for_single_image(image_np, detection_graph)\n", + " # Visualization of the results of a detection.\n", + " vis_util.visualize_boxes_and_labels_on_image_array(\n", + " image_np,\n", + " output_dict['detection_boxes'],\n", + " output_dict['detection_classes'],\n", + " output_dict['detection_scores'],\n", + " category_index,\n", + " instance_masks=output_dict.get('detection_masks'),\n", + " use_normalized_coordinates=True,\n", + " line_thickness=8)\n", + " plt.figure(figsize=IMAGE_SIZE)\n", + " plt.imshow(image_np)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "version": "0.3.2" + }, + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/__init__.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/anchor_generator.proto b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/anchor_generator.proto new file mode 100644 index 0000000000000000000000000000000000000000..c47b558f0689e40b66d097994a23f08b4bde03b5 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/anchor_generator.proto @@ -0,0 +1,17 @@ +syntax = "proto2"; + +package object_detection.protos; + +import "object_detection/protos/grid_anchor_generator.proto"; +import "object_detection/protos/ssd_anchor_generator.proto"; +import "object_detection/protos/multiscale_anchor_generator.proto"; + +// Configuration proto for the anchor generator to use in the object detection +// pipeline. See core/anchor_generator.py for details. 
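The `AnchorGenerator` message defined next is a plain `oneof` wrapper around the three concrete anchor generator configs, and the generated `anchor_generator_pb2.py` added just below is what the pipeline builders actually import. As a quick illustration of how such a stanza round-trips through the generated bindings, here is a minimal sketch; it assumes the compiled `object_detection.protos` package is importable (as the checked-in `*_pb2.py` files in this change imply), and the nested `grid_anchor_generator` block is deliberately left empty for illustration.

```python
# Minimal sketch: parsing an anchor_generator stanza with the generated
# protobuf bindings added in this change. Assumes object_detection.protos
# is on PYTHONPATH.
from google.protobuf import text_format

from object_detection.protos import anchor_generator_pb2

# A pbtxt fragment of the kind used inside pipeline .config files.
# The nested grid_anchor_generator message is left empty here; a real
# config would set its scales, aspect ratios and strides.
ANCHOR_GENERATOR_PBTXT = """
grid_anchor_generator {
}
"""

anchor_generator_config = anchor_generator_pb2.AnchorGenerator()
text_format.Merge(ANCHOR_GENERATOR_PBTXT, anchor_generator_config)

# The oneof records which concrete generator was configured.
print(anchor_generator_config.WhichOneof('anchor_generator_oneof'))
# -> 'grid_anchor_generator'
```

Builders in the detection pipeline typically dispatch on exactly this oneof to decide which concrete anchor generator to construct.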
+message AnchorGenerator { + oneof anchor_generator_oneof { + GridAnchorGenerator grid_anchor_generator = 1; + SsdAnchorGenerator ssd_anchor_generator = 2; + MultiscaleAnchorGenerator multiscale_anchor_generator = 3; + } +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/anchor_generator_pb2.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/anchor_generator_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..bfb80a76b249a1f6072612d7b2e81ac80e215807 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/anchor_generator_pb2.py @@ -0,0 +1,102 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: object_detection/protos/anchor_generator.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from object_detection.protos import grid_anchor_generator_pb2 as object__detection_dot_protos_dot_grid__anchor__generator__pb2 +from object_detection.protos import ssd_anchor_generator_pb2 as object__detection_dot_protos_dot_ssd__anchor__generator__pb2 +from object_detection.protos import multiscale_anchor_generator_pb2 as object__detection_dot_protos_dot_multiscale__anchor__generator__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='object_detection/protos/anchor_generator.proto', + package='object_detection.protos', + syntax='proto2', + serialized_pb=_b('\n.object_detection/protos/anchor_generator.proto\x12\x17object_detection.protos\x1a\x33object_detection/protos/grid_anchor_generator.proto\x1a\x32object_detection/protos/ssd_anchor_generator.proto\x1a\x39object_detection/protos/multiscale_anchor_generator.proto\"\xa2\x02\n\x0f\x41nchorGenerator\x12M\n\x15grid_anchor_generator\x18\x01 \x01(\x0b\x32,.object_detection.protos.GridAnchorGeneratorH\x00\x12K\n\x14ssd_anchor_generator\x18\x02 \x01(\x0b\x32+.object_detection.protos.SsdAnchorGeneratorH\x00\x12Y\n\x1bmultiscale_anchor_generator\x18\x03 \x01(\x0b\x32\x32.object_detection.protos.MultiscaleAnchorGeneratorH\x00\x42\x18\n\x16\x61nchor_generator_oneof') + , + dependencies=[object__detection_dot_protos_dot_grid__anchor__generator__pb2.DESCRIPTOR,object__detection_dot_protos_dot_ssd__anchor__generator__pb2.DESCRIPTOR,object__detection_dot_protos_dot_multiscale__anchor__generator__pb2.DESCRIPTOR,]) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + + + +_ANCHORGENERATOR = _descriptor.Descriptor( + name='AnchorGenerator', + full_name='object_detection.protos.AnchorGenerator', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='grid_anchor_generator', full_name='object_detection.protos.AnchorGenerator.grid_anchor_generator', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ssd_anchor_generator', full_name='object_detection.protos.AnchorGenerator.ssd_anchor_generator', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + 
message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='multiscale_anchor_generator', full_name='object_detection.protos.AnchorGenerator.multiscale_anchor_generator', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + _descriptor.OneofDescriptor( + name='anchor_generator_oneof', full_name='object_detection.protos.AnchorGenerator.anchor_generator_oneof', + index=0, containing_type=None, fields=[]), + ], + serialized_start=240, + serialized_end=530, +) + +_ANCHORGENERATOR.fields_by_name['grid_anchor_generator'].message_type = object__detection_dot_protos_dot_grid__anchor__generator__pb2._GRIDANCHORGENERATOR +_ANCHORGENERATOR.fields_by_name['ssd_anchor_generator'].message_type = object__detection_dot_protos_dot_ssd__anchor__generator__pb2._SSDANCHORGENERATOR +_ANCHORGENERATOR.fields_by_name['multiscale_anchor_generator'].message_type = object__detection_dot_protos_dot_multiscale__anchor__generator__pb2._MULTISCALEANCHORGENERATOR +_ANCHORGENERATOR.oneofs_by_name['anchor_generator_oneof'].fields.append( + _ANCHORGENERATOR.fields_by_name['grid_anchor_generator']) +_ANCHORGENERATOR.fields_by_name['grid_anchor_generator'].containing_oneof = _ANCHORGENERATOR.oneofs_by_name['anchor_generator_oneof'] +_ANCHORGENERATOR.oneofs_by_name['anchor_generator_oneof'].fields.append( + _ANCHORGENERATOR.fields_by_name['ssd_anchor_generator']) +_ANCHORGENERATOR.fields_by_name['ssd_anchor_generator'].containing_oneof = _ANCHORGENERATOR.oneofs_by_name['anchor_generator_oneof'] +_ANCHORGENERATOR.oneofs_by_name['anchor_generator_oneof'].fields.append( + _ANCHORGENERATOR.fields_by_name['multiscale_anchor_generator']) +_ANCHORGENERATOR.fields_by_name['multiscale_anchor_generator'].containing_oneof = _ANCHORGENERATOR.oneofs_by_name['anchor_generator_oneof'] +DESCRIPTOR.message_types_by_name['AnchorGenerator'] = _ANCHORGENERATOR + +AnchorGenerator = _reflection.GeneratedProtocolMessageType('AnchorGenerator', (_message.Message,), dict( + DESCRIPTOR = _ANCHORGENERATOR, + __module__ = 'object_detection.protos.anchor_generator_pb2' + # @@protoc_insertion_point(class_scope:object_detection.protos.AnchorGenerator) + )) +_sym_db.RegisterMessage(AnchorGenerator) + + +# @@protoc_insertion_point(module_scope) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/argmax_matcher.proto b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/argmax_matcher.proto new file mode 100644 index 0000000000000000000000000000000000000000..947fcb983dcbe3bcb0b39b7c8bd48a50b1667edf --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/argmax_matcher.proto @@ -0,0 +1,29 @@ +syntax = "proto2"; + +package object_detection.protos; + +// Configuration proto for ArgMaxMatcher. See +// matchers/argmax_matcher.py for details. +message ArgMaxMatcher { + // Threshold for positive matches. + optional float matched_threshold = 1 [default = 0.5]; + + // Threshold for negative matches. + optional float unmatched_threshold = 2 [default = 0.5]; + + // Whether to construct ArgMaxMatcher without thresholds. 
+ optional bool ignore_thresholds = 3 [default = false]; + + // If True then negative matches are the ones below the unmatched_threshold, + // whereas ignored matches are in between the matched and umatched + // threshold. If False, then negative matches are in between the matched + // and unmatched threshold, and everything lower than unmatched is ignored. + optional bool negatives_lower_than_unmatched = 4 [default = true]; + + // Whether to ensure each row is matched to at least one column. + optional bool force_match_for_each_row = 5 [default = false]; + + // Force constructed match objects to use matrix multiplication based gather + // instead of standard tf.gather + optional bool use_matmul_gather = 6 [default = false]; +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/argmax_matcher_pb2.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/argmax_matcher_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..08209d5d0ca2785b6ea80a4ccdbd4609dec7e929 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/argmax_matcher_pb2.py @@ -0,0 +1,104 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: object_detection/protos/argmax_matcher.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='object_detection/protos/argmax_matcher.proto', + package='object_detection.protos', + syntax='proto2', + serialized_pb=_b('\n,object_detection/protos/argmax_matcher.proto\x12\x17object_detection.protos\"\xec\x01\n\rArgMaxMatcher\x12\x1e\n\x11matched_threshold\x18\x01 \x01(\x02:\x03\x30.5\x12 \n\x13unmatched_threshold\x18\x02 \x01(\x02:\x03\x30.5\x12 \n\x11ignore_thresholds\x18\x03 \x01(\x08:\x05\x66\x61lse\x12,\n\x1enegatives_lower_than_unmatched\x18\x04 \x01(\x08:\x04true\x12\'\n\x18\x66orce_match_for_each_row\x18\x05 \x01(\x08:\x05\x66\x61lse\x12 \n\x11use_matmul_gather\x18\x06 \x01(\x08:\x05\x66\x61lse') +) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + + + +_ARGMAXMATCHER = _descriptor.Descriptor( + name='ArgMaxMatcher', + full_name='object_detection.protos.ArgMaxMatcher', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='matched_threshold', full_name='object_detection.protos.ArgMaxMatcher.matched_threshold', index=0, + number=1, type=2, cpp_type=6, label=1, + has_default_value=True, default_value=float(0.5), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='unmatched_threshold', full_name='object_detection.protos.ArgMaxMatcher.unmatched_threshold', index=1, + number=2, type=2, cpp_type=6, label=1, + has_default_value=True, default_value=float(0.5), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ignore_thresholds', full_name='object_detection.protos.ArgMaxMatcher.ignore_thresholds', index=2, + number=3, type=8, cpp_type=7, label=1, + has_default_value=True, 
default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='negatives_lower_than_unmatched', full_name='object_detection.protos.ArgMaxMatcher.negatives_lower_than_unmatched', index=3, + number=4, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=True, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='force_match_for_each_row', full_name='object_detection.protos.ArgMaxMatcher.force_match_for_each_row', index=4, + number=5, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='use_matmul_gather', full_name='object_detection.protos.ArgMaxMatcher.use_matmul_gather', index=5, + number=6, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=74, + serialized_end=310, +) + +DESCRIPTOR.message_types_by_name['ArgMaxMatcher'] = _ARGMAXMATCHER + +ArgMaxMatcher = _reflection.GeneratedProtocolMessageType('ArgMaxMatcher', (_message.Message,), dict( + DESCRIPTOR = _ARGMAXMATCHER, + __module__ = 'object_detection.protos.argmax_matcher_pb2' + # @@protoc_insertion_point(class_scope:object_detection.protos.ArgMaxMatcher) + )) +_sym_db.RegisterMessage(ArgMaxMatcher) + + +# @@protoc_insertion_point(module_scope) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/bipartite_matcher.proto b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/bipartite_matcher.proto new file mode 100644 index 0000000000000000000000000000000000000000..175ecdd109653ae1c0b37a9655873b72161e963e --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/bipartite_matcher.proto @@ -0,0 +1,11 @@ +syntax = "proto2"; + +package object_detection.protos; + +// Configuration proto for bipartite matcher. See +// matchers/bipartite_matcher.py for details. +message BipartiteMatcher { + // Force constructed match objects to use matrix multiplication based gather + // instead of standard tf.gather + optional bool use_matmul_gather = 6 [default = false]; +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/bipartite_matcher_pb2.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/bipartite_matcher_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..d311068f1dc3bf26d2461898235124cde0b9f49e --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/bipartite_matcher_pb2.py @@ -0,0 +1,69 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
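The matcher protos above lean heavily on proto2 `[default = ...]` annotations, and those defaults are what the pipeline sees whenever a config omits a field. The following is a small sketch of how the defaults surface through the generated `argmax_matcher_pb2` module (again assuming the compiled protos are importable):

```python
# Minimal sketch: proto2 defaults on ArgMaxMatcher, using the generated
# bindings added in this change.
from object_detection.protos import argmax_matcher_pb2

matcher = argmax_matcher_pb2.ArgMaxMatcher()

# Unset optional fields report the defaults declared in argmax_matcher.proto.
print(matcher.matched_threshold)               # 0.5
print(matcher.unmatched_threshold)             # 0.5
print(matcher.negatives_lower_than_unmatched)  # True
print(matcher.force_match_for_each_row)        # False

# Explicitly set values are distinguishable from defaults via HasField.
print(matcher.HasField('matched_threshold'))   # False
matcher.matched_threshold = 0.7
print(matcher.HasField('matched_threshold'))   # True
```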
+# source: object_detection/protos/bipartite_matcher.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='object_detection/protos/bipartite_matcher.proto', + package='object_detection.protos', + syntax='proto2', + serialized_pb=_b('\n/object_detection/protos/bipartite_matcher.proto\x12\x17object_detection.protos\"4\n\x10\x42ipartiteMatcher\x12 \n\x11use_matmul_gather\x18\x06 \x01(\x08:\x05\x66\x61lse') +) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + + + +_BIPARTITEMATCHER = _descriptor.Descriptor( + name='BipartiteMatcher', + full_name='object_detection.protos.BipartiteMatcher', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='use_matmul_gather', full_name='object_detection.protos.BipartiteMatcher.use_matmul_gather', index=0, + number=6, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=76, + serialized_end=128, +) + +DESCRIPTOR.message_types_by_name['BipartiteMatcher'] = _BIPARTITEMATCHER + +BipartiteMatcher = _reflection.GeneratedProtocolMessageType('BipartiteMatcher', (_message.Message,), dict( + DESCRIPTOR = _BIPARTITEMATCHER, + __module__ = 'object_detection.protos.bipartite_matcher_pb2' + # @@protoc_insertion_point(class_scope:object_detection.protos.BipartiteMatcher) + )) +_sym_db.RegisterMessage(BipartiteMatcher) + + +# @@protoc_insertion_point(module_scope) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/box_coder.proto b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/box_coder.proto new file mode 100644 index 0000000000000000000000000000000000000000..79b818125a33c39022262b9fc7754f4081f6b169 --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/box_coder.proto @@ -0,0 +1,19 @@ +syntax = "proto2"; + +package object_detection.protos; + +import "object_detection/protos/faster_rcnn_box_coder.proto"; +import "object_detection/protos/keypoint_box_coder.proto"; +import "object_detection/protos/mean_stddev_box_coder.proto"; +import "object_detection/protos/square_box_coder.proto"; + +// Configuration proto for the box coder to be used in the object detection +// pipeline. See core/box_coder.py for details. 
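Like `AnchorGenerator` above, the `BoxCoder` message defined next wraps its alternatives in a `oneof`, so at most one coder can be configured at a time and selecting a new one clears the previous choice. A short sketch of that behaviour with the generated `box_coder_pb2` bindings follows; `SetInParent()` is used only to mark an otherwise-empty submessage as present, and the compiled protos are assumed to be importable as before.

```python
# Minimal sketch: oneof semantics of the BoxCoder config, using the
# generated bindings added in this change.
from object_detection.protos import box_coder_pb2

box_coder_config = box_coder_pb2.BoxCoder()

# Mark the Faster R-CNN coder as the configured branch.
box_coder_config.faster_rcnn_box_coder.SetInParent()
print(box_coder_config.WhichOneof('box_coder_oneof'))       # 'faster_rcnn_box_coder'

# Selecting a different coder silently clears the previous one.
box_coder_config.mean_stddev_box_coder.SetInParent()
print(box_coder_config.WhichOneof('box_coder_oneof'))       # 'mean_stddev_box_coder'
print(box_coder_config.HasField('faster_rcnn_box_coder'))   # False
```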
+message BoxCoder { + oneof box_coder_oneof { + FasterRcnnBoxCoder faster_rcnn_box_coder = 1; + MeanStddevBoxCoder mean_stddev_box_coder = 2; + SquareBoxCoder square_box_coder = 3; + KeypointBoxCoder keypoint_box_coder = 4; + } +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/box_coder_pb2.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/box_coder_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..3cf2a2e5496a915682a070430e746cfecee5c89f --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/box_coder_pb2.py @@ -0,0 +1,114 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: object_detection/protos/box_coder.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from object_detection.protos import faster_rcnn_box_coder_pb2 as object__detection_dot_protos_dot_faster__rcnn__box__coder__pb2 +from object_detection.protos import keypoint_box_coder_pb2 as object__detection_dot_protos_dot_keypoint__box__coder__pb2 +from object_detection.protos import mean_stddev_box_coder_pb2 as object__detection_dot_protos_dot_mean__stddev__box__coder__pb2 +from object_detection.protos import square_box_coder_pb2 as object__detection_dot_protos_dot_square__box__coder__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='object_detection/protos/box_coder.proto', + package='object_detection.protos', + syntax='proto2', + serialized_pb=_b('\n\'object_detection/protos/box_coder.proto\x12\x17object_detection.protos\x1a\x33object_detection/protos/faster_rcnn_box_coder.proto\x1a\x30object_detection/protos/keypoint_box_coder.proto\x1a\x33object_detection/protos/mean_stddev_box_coder.proto\x1a.object_detection/protos/square_box_coder.proto\"\xc7\x02\n\x08\x42oxCoder\x12L\n\x15\x66\x61ster_rcnn_box_coder\x18\x01 \x01(\x0b\x32+.object_detection.protos.FasterRcnnBoxCoderH\x00\x12L\n\x15mean_stddev_box_coder\x18\x02 \x01(\x0b\x32+.object_detection.protos.MeanStddevBoxCoderH\x00\x12\x43\n\x10square_box_coder\x18\x03 \x01(\x0b\x32\'.object_detection.protos.SquareBoxCoderH\x00\x12G\n\x12keypoint_box_coder\x18\x04 \x01(\x0b\x32).object_detection.protos.KeypointBoxCoderH\x00\x42\x11\n\x0f\x62ox_coder_oneof') + , + dependencies=[object__detection_dot_protos_dot_faster__rcnn__box__coder__pb2.DESCRIPTOR,object__detection_dot_protos_dot_keypoint__box__coder__pb2.DESCRIPTOR,object__detection_dot_protos_dot_mean__stddev__box__coder__pb2.DESCRIPTOR,object__detection_dot_protos_dot_square__box__coder__pb2.DESCRIPTOR,]) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + + + +_BOXCODER = _descriptor.Descriptor( + name='BoxCoder', + full_name='object_detection.protos.BoxCoder', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='faster_rcnn_box_coder', full_name='object_detection.protos.BoxCoder.faster_rcnn_box_coder', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + 
name='mean_stddev_box_coder', full_name='object_detection.protos.BoxCoder.mean_stddev_box_coder', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='square_box_coder', full_name='object_detection.protos.BoxCoder.square_box_coder', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='keypoint_box_coder', full_name='object_detection.protos.BoxCoder.keypoint_box_coder', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + _descriptor.OneofDescriptor( + name='box_coder_oneof', full_name='object_detection.protos.BoxCoder.box_coder_oneof', + index=0, containing_type=None, fields=[]), + ], + serialized_start=273, + serialized_end=600, +) + +_BOXCODER.fields_by_name['faster_rcnn_box_coder'].message_type = object__detection_dot_protos_dot_faster__rcnn__box__coder__pb2._FASTERRCNNBOXCODER +_BOXCODER.fields_by_name['mean_stddev_box_coder'].message_type = object__detection_dot_protos_dot_mean__stddev__box__coder__pb2._MEANSTDDEVBOXCODER +_BOXCODER.fields_by_name['square_box_coder'].message_type = object__detection_dot_protos_dot_square__box__coder__pb2._SQUAREBOXCODER +_BOXCODER.fields_by_name['keypoint_box_coder'].message_type = object__detection_dot_protos_dot_keypoint__box__coder__pb2._KEYPOINTBOXCODER +_BOXCODER.oneofs_by_name['box_coder_oneof'].fields.append( + _BOXCODER.fields_by_name['faster_rcnn_box_coder']) +_BOXCODER.fields_by_name['faster_rcnn_box_coder'].containing_oneof = _BOXCODER.oneofs_by_name['box_coder_oneof'] +_BOXCODER.oneofs_by_name['box_coder_oneof'].fields.append( + _BOXCODER.fields_by_name['mean_stddev_box_coder']) +_BOXCODER.fields_by_name['mean_stddev_box_coder'].containing_oneof = _BOXCODER.oneofs_by_name['box_coder_oneof'] +_BOXCODER.oneofs_by_name['box_coder_oneof'].fields.append( + _BOXCODER.fields_by_name['square_box_coder']) +_BOXCODER.fields_by_name['square_box_coder'].containing_oneof = _BOXCODER.oneofs_by_name['box_coder_oneof'] +_BOXCODER.oneofs_by_name['box_coder_oneof'].fields.append( + _BOXCODER.fields_by_name['keypoint_box_coder']) +_BOXCODER.fields_by_name['keypoint_box_coder'].containing_oneof = _BOXCODER.oneofs_by_name['box_coder_oneof'] +DESCRIPTOR.message_types_by_name['BoxCoder'] = _BOXCODER + +BoxCoder = _reflection.GeneratedProtocolMessageType('BoxCoder', (_message.Message,), dict( + DESCRIPTOR = _BOXCODER, + __module__ = 'object_detection.protos.box_coder_pb2' + # @@protoc_insertion_point(class_scope:object_detection.protos.BoxCoder) + )) +_sym_db.RegisterMessage(BoxCoder) + + +# @@protoc_insertion_point(module_scope) diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/box_predictor.proto b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/box_predictor.proto new file mode 100644 index 0000000000000000000000000000000000000000..f5ceae683f854ef0262d230d45c1a879684e72ab --- /dev/null +++ 
b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/box_predictor.proto @@ -0,0 +1,153 @@ +syntax = "proto2"; + +package object_detection.protos; + +import "object_detection/protos/hyperparams.proto"; + + +// Configuration proto for box predictor. See core/box_predictor.py for details. +message BoxPredictor { + oneof box_predictor_oneof { + ConvolutionalBoxPredictor convolutional_box_predictor = 1; + MaskRCNNBoxPredictor mask_rcnn_box_predictor = 2; + RfcnBoxPredictor rfcn_box_predictor = 3; + WeightSharedConvolutionalBoxPredictor weight_shared_convolutional_box_predictor = 4; + } +} + +// Configuration proto for Convolutional box predictor. +message ConvolutionalBoxPredictor { + // Hyperparameters for convolution ops used in the box predictor. + optional Hyperparams conv_hyperparams = 1; + + // Minimum feature depth prior to predicting box encodings and class + // predictions. + optional int32 min_depth = 2 [default = 0]; + + // Maximum feature depth prior to predicting box encodings and class + // predictions. If max_depth is set to 0, no additional feature map will be + // inserted before location and class predictions. + optional int32 max_depth = 3 [default = 0]; + + // Number of the additional conv layers before the predictor. + optional int32 num_layers_before_predictor = 4 [default = 0]; + + // Whether to use dropout for class prediction. + optional bool use_dropout = 5 [default = true]; + + // Keep probability for dropout + optional float dropout_keep_probability = 6 [default = 0.8]; + + // Size of final convolution kernel. If the spatial resolution of the feature + // map is smaller than the kernel size, then the kernel size is set to + // min(feature_width, feature_height). + optional int32 kernel_size = 7 [default = 1]; + + // Size of the encoding for boxes. + optional int32 box_code_size = 8 [default = 4]; + + // Whether to apply sigmoid to the output of class predictions. + // TODO(jonathanhuang): Do we need this since we have a post processing + // module.? + optional bool apply_sigmoid_to_scores = 9 [default = false]; + + optional float class_prediction_bias_init = 10 [default = 0.0]; + + // Whether to use depthwise separable convolution for box predictor layers. + optional bool use_depthwise = 11 [default = false]; +} + +// Configuration proto for weight shared convolutional box predictor. +message WeightSharedConvolutionalBoxPredictor { + // Hyperparameters for convolution ops used in the box predictor. + optional Hyperparams conv_hyperparams = 1; + + // Number of the additional conv layers before the predictor. + optional int32 num_layers_before_predictor = 4 [default = 0]; + + // Output depth for the convolution ops prior to predicting box encodings + // and class predictions. + optional int32 depth = 2 [default = 0]; + + // Size of final convolution kernel. If the spatial resolution of the feature + // map is smaller than the kernel size, then the kernel size is set to + // min(feature_width, feature_height). + optional int32 kernel_size = 7 [default = 3]; + + // Size of the encoding for boxes. + optional int32 box_code_size = 8 [default = 4]; + + // Bias initialization for class prediction. It has been show to stabilize + // training where there are large number of negative boxes. See + // https://arxiv.org/abs/1708.02002 for details. + optional float class_prediction_bias_init = 10 [default = 0.0]; + + // Whether to use dropout for class prediction. 
+ optional bool use_dropout = 11 [default = false]; + + // Keep probability for dropout + optional float dropout_keep_probability = 12 [default = 0.8]; +} + +message MaskRCNNBoxPredictor { + // Hyperparameters for fully connected ops used in the box predictor. + optional Hyperparams fc_hyperparams = 1; + + // Whether to use dropout op prior to the both box and class predictions. + optional bool use_dropout = 2 [default= false]; + + // Keep probability for dropout. This is only used if use_dropout is true. + optional float dropout_keep_probability = 3 [default = 0.5]; + + // Size of the encoding for the boxes. + optional int32 box_code_size = 4 [default = 4]; + + // Hyperparameters for convolution ops used in the box predictor. + optional Hyperparams conv_hyperparams = 5; + + // Whether to predict instance masks inside detection boxes. + optional bool predict_instance_masks = 6 [default = false]; + + // The depth for the first conv2d_transpose op applied to the + // image_features in the mask prediction branch. If set to 0, the value + // will be set automatically based on the number of channels in the image + // features and the number of classes. + optional int32 mask_prediction_conv_depth = 7 [default = 256]; + + // Whether to predict keypoints inside detection boxes. + optional bool predict_keypoints = 8 [default = false]; + + // The height and the width of the predicted mask. + optional int32 mask_height = 9 [default = 15]; + optional int32 mask_width = 10 [default = 15]; + + // The number of convolutions applied to image_features in the mask prediction + // branch. + optional int32 mask_prediction_num_conv_layers = 11 [default = 2]; + optional bool masks_are_class_agnostic = 12 [default = false]; + + // Whether to use one box for all classes rather than a different box for each + // class. + optional bool share_box_across_classes = 13 [default = false]; +} + +message RfcnBoxPredictor { + // Hyperparameters for convolution ops used in the box predictor. + optional Hyperparams conv_hyperparams = 1; + + // Bin sizes for RFCN crops. + optional int32 num_spatial_bins_height = 2 [default = 3]; + + optional int32 num_spatial_bins_width = 3 [default = 3]; + + // Target depth to reduce the input image features to. + optional int32 depth = 4 [default=1024]; + + // Size of the encoding for the boxes. + optional int32 box_code_size = 5 [default = 4]; + + // Size to resize the rfcn crops to. + optional int32 crop_height = 6 [default= 12]; + + optional int32 crop_width = 7 [default=12]; +} diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/box_predictor_pb2.py b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/box_predictor_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..8434aea1585391dc3ef5d7ee58e31e1bb6b8a35b --- /dev/null +++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/box_predictor_pb2.py @@ -0,0 +1,517 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
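For a Mask R-CNN style second stage, the fields that matter most in the `MaskRCNNBoxPredictor` message above are the mask-specific ones: `predict_instance_masks`, `mask_height`/`mask_width`, `mask_prediction_num_conv_layers`, and `mask_prediction_conv_depth` (where 0 means the depth is chosen automatically from the feature channels and class count). Below is a minimal sketch of parsing such a stanza through the generated `box_predictor_pb2` bindings; the values are illustrative, and the `fc_hyperparams`/`conv_hyperparams` blocks that a real predictor config needs are omitted for brevity.

```python
# Minimal sketch: reading a mask_rcnn_box_predictor stanza through the
# generated bindings added in this change. The fc_hyperparams /
# conv_hyperparams blocks required by a real config are omitted here.
from google.protobuf import text_format

from object_detection.protos import box_predictor_pb2

MASK_PREDICTOR_PBTXT = """
mask_rcnn_box_predictor {
  predict_instance_masks: true
  mask_height: 14
  mask_width: 14
  mask_prediction_num_conv_layers: 3
  mask_prediction_conv_depth: 0  # 0 = pick the depth automatically
}
"""

box_predictor_config = box_predictor_pb2.BoxPredictor()
text_format.Merge(MASK_PREDICTOR_PBTXT, box_predictor_config)

mask_predictor = box_predictor_config.mask_rcnn_box_predictor
print(box_predictor_config.WhichOneof('box_predictor_oneof'))   # 'mask_rcnn_box_predictor'
print(mask_predictor.predict_instance_masks)                    # True
print(mask_predictor.mask_height, mask_predictor.mask_width)    # 14 14
```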
+# source: object_detection/protos/box_predictor.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from object_detection.protos import hyperparams_pb2 as object__detection_dot_protos_dot_hyperparams__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='object_detection/protos/box_predictor.proto', + package='object_detection.protos', + syntax='proto2', + serialized_pb=_b('\n+object_detection/protos/box_predictor.proto\x12\x17object_detection.protos\x1a)object_detection/protos/hyperparams.proto\"\x90\x03\n\x0c\x42oxPredictor\x12Y\n\x1b\x63onvolutional_box_predictor\x18\x01 \x01(\x0b\x32\x32.object_detection.protos.ConvolutionalBoxPredictorH\x00\x12P\n\x17mask_rcnn_box_predictor\x18\x02 \x01(\x0b\x32-.object_detection.protos.MaskRCNNBoxPredictorH\x00\x12G\n\x12rfcn_box_predictor\x18\x03 \x01(\x0b\x32).object_detection.protos.RfcnBoxPredictorH\x00\x12s\n)weight_shared_convolutional_box_predictor\x18\x04 \x01(\x0b\x32>.object_detection.protos.WeightSharedConvolutionalBoxPredictorH\x00\x42\x15\n\x13\x62ox_predictor_oneof\"\x90\x03\n\x19\x43onvolutionalBoxPredictor\x12>\n\x10\x63onv_hyperparams\x18\x01 \x01(\x0b\x32$.object_detection.protos.Hyperparams\x12\x14\n\tmin_depth\x18\x02 \x01(\x05:\x01\x30\x12\x14\n\tmax_depth\x18\x03 \x01(\x05:\x01\x30\x12&\n\x1bnum_layers_before_predictor\x18\x04 \x01(\x05:\x01\x30\x12\x19\n\x0buse_dropout\x18\x05 \x01(\x08:\x04true\x12%\n\x18\x64ropout_keep_probability\x18\x06 \x01(\x02:\x03\x30.8\x12\x16\n\x0bkernel_size\x18\x07 \x01(\x05:\x01\x31\x12\x18\n\rbox_code_size\x18\x08 \x01(\x05:\x01\x34\x12&\n\x17\x61pply_sigmoid_to_scores\x18\t \x01(\x08:\x05\x66\x61lse\x12%\n\x1a\x63lass_prediction_bias_init\x18\n \x01(\x02:\x01\x30\x12\x1c\n\ruse_depthwise\x18\x0b \x01(\x08:\x05\x66\x61lse\"\xbd\x02\n%WeightSharedConvolutionalBoxPredictor\x12>\n\x10\x63onv_hyperparams\x18\x01 \x01(\x0b\x32$.object_detection.protos.Hyperparams\x12&\n\x1bnum_layers_before_predictor\x18\x04 \x01(\x05:\x01\x30\x12\x10\n\x05\x64\x65pth\x18\x02 \x01(\x05:\x01\x30\x12\x16\n\x0bkernel_size\x18\x07 \x01(\x05:\x01\x33\x12\x18\n\rbox_code_size\x18\x08 \x01(\x05:\x01\x34\x12%\n\x1a\x63lass_prediction_bias_init\x18\n \x01(\x02:\x01\x30\x12\x1a\n\x0buse_dropout\x18\x0b \x01(\x08:\x05\x66\x61lse\x12%\n\x18\x64ropout_keep_probability\x18\x0c \x01(\x02:\x03\x30.8\"\x92\x04\n\x14MaskRCNNBoxPredictor\x12<\n\x0e\x66\x63_hyperparams\x18\x01 \x01(\x0b\x32$.object_detection.protos.Hyperparams\x12\x1a\n\x0buse_dropout\x18\x02 \x01(\x08:\x05\x66\x61lse\x12%\n\x18\x64ropout_keep_probability\x18\x03 \x01(\x02:\x03\x30.5\x12\x18\n\rbox_code_size\x18\x04 \x01(\x05:\x01\x34\x12>\n\x10\x63onv_hyperparams\x18\x05 \x01(\x0b\x32$.object_detection.protos.Hyperparams\x12%\n\x16predict_instance_masks\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\'\n\x1amask_prediction_conv_depth\x18\x07 \x01(\x05:\x03\x32\x35\x36\x12 \n\x11predict_keypoints\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0bmask_height\x18\t \x01(\x05:\x02\x31\x35\x12\x16\n\nmask_width\x18\n \x01(\x05:\x02\x31\x35\x12*\n\x1fmask_prediction_num_conv_layers\x18\x0b \x01(\x05:\x01\x32\x12\'\n\x18masks_are_class_agnostic\x18\x0c 
\x01(\x08:\x05\x66\x61lse\x12\'\n\x18share_box_across_classes\x18\r \x01(\x08:\x05\x66\x61lse\"\xf9\x01\n\x10RfcnBoxPredictor\x12>\n\x10\x63onv_hyperparams\x18\x01 \x01(\x0b\x32$.object_detection.protos.Hyperparams\x12\"\n\x17num_spatial_bins_height\x18\x02 \x01(\x05:\x01\x33\x12!\n\x16num_spatial_bins_width\x18\x03 \x01(\x05:\x01\x33\x12\x13\n\x05\x64\x65pth\x18\x04 \x01(\x05:\x04\x31\x30\x32\x34\x12\x18\n\rbox_code_size\x18\x05 \x01(\x05:\x01\x34\x12\x17\n\x0b\x63rop_height\x18\x06 \x01(\x05:\x02\x31\x32\x12\x16\n\ncrop_width\x18\x07 \x01(\x05:\x02\x31\x32') + , + dependencies=[object__detection_dot_protos_dot_hyperparams__pb2.DESCRIPTOR,]) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + + + +_BOXPREDICTOR = _descriptor.Descriptor( + name='BoxPredictor', + full_name='object_detection.protos.BoxPredictor', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='convolutional_box_predictor', full_name='object_detection.protos.BoxPredictor.convolutional_box_predictor', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='mask_rcnn_box_predictor', full_name='object_detection.protos.BoxPredictor.mask_rcnn_box_predictor', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='rfcn_box_predictor', full_name='object_detection.protos.BoxPredictor.rfcn_box_predictor', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='weight_shared_convolutional_box_predictor', full_name='object_detection.protos.BoxPredictor.weight_shared_convolutional_box_predictor', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + _descriptor.OneofDescriptor( + name='box_predictor_oneof', full_name='object_detection.protos.BoxPredictor.box_predictor_oneof', + index=0, containing_type=None, fields=[]), + ], + serialized_start=116, + serialized_end=516, +) + + +_CONVOLUTIONALBOXPREDICTOR = _descriptor.Descriptor( + name='ConvolutionalBoxPredictor', + full_name='object_detection.protos.ConvolutionalBoxPredictor', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='conv_hyperparams', full_name='object_detection.protos.ConvolutionalBoxPredictor.conv_hyperparams', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='min_depth', full_name='object_detection.protos.ConvolutionalBoxPredictor.min_depth', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=0, + 
message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='max_depth', full_name='object_detection.protos.ConvolutionalBoxPredictor.max_depth', index=2, + number=3, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='num_layers_before_predictor', full_name='object_detection.protos.ConvolutionalBoxPredictor.num_layers_before_predictor', index=3, + number=4, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='use_dropout', full_name='object_detection.protos.ConvolutionalBoxPredictor.use_dropout', index=4, + number=5, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=True, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dropout_keep_probability', full_name='object_detection.protos.ConvolutionalBoxPredictor.dropout_keep_probability', index=5, + number=6, type=2, cpp_type=6, label=1, + has_default_value=True, default_value=float(0.8), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='kernel_size', full_name='object_detection.protos.ConvolutionalBoxPredictor.kernel_size', index=6, + number=7, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=1, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='box_code_size', full_name='object_detection.protos.ConvolutionalBoxPredictor.box_code_size', index=7, + number=8, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=4, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='apply_sigmoid_to_scores', full_name='object_detection.protos.ConvolutionalBoxPredictor.apply_sigmoid_to_scores', index=8, + number=9, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='class_prediction_bias_init', full_name='object_detection.protos.ConvolutionalBoxPredictor.class_prediction_bias_init', index=9, + number=10, type=2, cpp_type=6, label=1, + has_default_value=True, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='use_depthwise', full_name='object_detection.protos.ConvolutionalBoxPredictor.use_depthwise', index=10, + number=11, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=519, + serialized_end=919, +) + + 
+_WEIGHTSHAREDCONVOLUTIONALBOXPREDICTOR = _descriptor.Descriptor( + name='WeightSharedConvolutionalBoxPredictor', + full_name='object_detection.protos.WeightSharedConvolutionalBoxPredictor', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='conv_hyperparams', full_name='object_detection.protos.WeightSharedConvolutionalBoxPredictor.conv_hyperparams', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='num_layers_before_predictor', full_name='object_detection.protos.WeightSharedConvolutionalBoxPredictor.num_layers_before_predictor', index=1, + number=4, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='depth', full_name='object_detection.protos.WeightSharedConvolutionalBoxPredictor.depth', index=2, + number=2, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='kernel_size', full_name='object_detection.protos.WeightSharedConvolutionalBoxPredictor.kernel_size', index=3, + number=7, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=3, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='box_code_size', full_name='object_detection.protos.WeightSharedConvolutionalBoxPredictor.box_code_size', index=4, + number=8, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=4, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='class_prediction_bias_init', full_name='object_detection.protos.WeightSharedConvolutionalBoxPredictor.class_prediction_bias_init', index=5, + number=10, type=2, cpp_type=6, label=1, + has_default_value=True, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='use_dropout', full_name='object_detection.protos.WeightSharedConvolutionalBoxPredictor.use_dropout', index=6, + number=11, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dropout_keep_probability', full_name='object_detection.protos.WeightSharedConvolutionalBoxPredictor.dropout_keep_probability', index=7, + number=12, type=2, cpp_type=6, label=1, + has_default_value=True, default_value=float(0.8), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=922, + serialized_end=1239, +) + + +_MASKRCNNBOXPREDICTOR = _descriptor.Descriptor( + name='MaskRCNNBoxPredictor', + 
full_name='object_detection.protos.MaskRCNNBoxPredictor', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='fc_hyperparams', full_name='object_detection.protos.MaskRCNNBoxPredictor.fc_hyperparams', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='use_dropout', full_name='object_detection.protos.MaskRCNNBoxPredictor.use_dropout', index=1, + number=2, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dropout_keep_probability', full_name='object_detection.protos.MaskRCNNBoxPredictor.dropout_keep_probability', index=2, + number=3, type=2, cpp_type=6, label=1, + has_default_value=True, default_value=float(0.5), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='box_code_size', full_name='object_detection.protos.MaskRCNNBoxPredictor.box_code_size', index=3, + number=4, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=4, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='conv_hyperparams', full_name='object_detection.protos.MaskRCNNBoxPredictor.conv_hyperparams', index=4, + number=5, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='predict_instance_masks', full_name='object_detection.protos.MaskRCNNBoxPredictor.predict_instance_masks', index=5, + number=6, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='mask_prediction_conv_depth', full_name='object_detection.protos.MaskRCNNBoxPredictor.mask_prediction_conv_depth', index=6, + number=7, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=256, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='predict_keypoints', full_name='object_detection.protos.MaskRCNNBoxPredictor.predict_keypoints', index=7, + number=8, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='mask_height', full_name='object_detection.protos.MaskRCNNBoxPredictor.mask_height', index=8, + number=9, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=15, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='mask_width', full_name='object_detection.protos.MaskRCNNBoxPredictor.mask_width', index=9, + number=10, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=15, + message_type=None, enum_type=None, 
containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='mask_prediction_num_conv_layers', full_name='object_detection.protos.MaskRCNNBoxPredictor.mask_prediction_num_conv_layers', index=10, + number=11, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=2, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='masks_are_class_agnostic', full_name='object_detection.protos.MaskRCNNBoxPredictor.masks_are_class_agnostic', index=11, + number=12, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='share_box_across_classes', full_name='object_detection.protos.MaskRCNNBoxPredictor.share_box_across_classes', index=12, + number=13, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1242, + serialized_end=1772, +) + + +_RFCNBOXPREDICTOR = _descriptor.Descriptor( + name='RfcnBoxPredictor', + full_name='object_detection.protos.RfcnBoxPredictor', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='conv_hyperparams', full_name='object_detection.protos.RfcnBoxPredictor.conv_hyperparams', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='num_spatial_bins_height', full_name='object_detection.protos.RfcnBoxPredictor.num_spatial_bins_height', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=3, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='num_spatial_bins_width', full_name='object_detection.protos.RfcnBoxPredictor.num_spatial_bins_width', index=2, + number=3, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=3, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='depth', full_name='object_detection.protos.RfcnBoxPredictor.depth', index=3, + number=4, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=1024, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='box_code_size', full_name='object_detection.protos.RfcnBoxPredictor.box_code_size', index=4, + number=5, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=4, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='crop_height', full_name='object_detection.protos.RfcnBoxPredictor.crop_height', index=5, + number=6, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=12, + 
message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='crop_width', full_name='object_detection.protos.RfcnBoxPredictor.crop_width', index=6, + number=7, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=12, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1775, + serialized_end=2024, +) + +_BOXPREDICTOR.fields_by_name['convolutional_box_predictor'].message_type = _CONVOLUTIONALBOXPREDICTOR +_BOXPREDICTOR.fields_by_name['mask_rcnn_box_predictor'].message_type = _MASKRCNNBOXPREDICTOR +_BOXPREDICTOR.fields_by_name['rfcn_box_predictor'].message_type = _RFCNBOXPREDICTOR +_BOXPREDICTOR.fields_by_name['weight_shared_convolutional_box_predictor'].message_type = _WEIGHTSHAREDCONVOLUTIONALBOXPREDICTOR +_BOXPREDICTOR.oneofs_by_name['box_predictor_oneof'].fields.append( + _BOXPREDICTOR.fields_by_name['convolutional_box_predictor']) +_BOXPREDICTOR.fields_by_name['convolutional_box_predictor'].containing_oneof = _BOXPREDICTOR.oneofs_by_name['box_predictor_oneof'] +_BOXPREDICTOR.oneofs_by_name['box_predictor_oneof'].fields.append( + _BOXPREDICTOR.fields_by_name['mask_rcnn_box_predictor']) +_BOXPREDICTOR.fields_by_name['mask_rcnn_box_predictor'].containing_oneof = _BOXPREDICTOR.oneofs_by_name['box_predictor_oneof'] +_BOXPREDICTOR.oneofs_by_name['box_predictor_oneof'].fields.append( + _BOXPREDICTOR.fields_by_name['rfcn_box_predictor']) +_BOXPREDICTOR.fields_by_name['rfcn_box_predictor'].containing_oneof = _BOXPREDICTOR.oneofs_by_name['box_predictor_oneof'] +_BOXPREDICTOR.oneofs_by_name['box_predictor_oneof'].fields.append( + _BOXPREDICTOR.fields_by_name['weight_shared_convolutional_box_predictor']) +_BOXPREDICTOR.fields_by_name['weight_shared_convolutional_box_predictor'].containing_oneof = _BOXPREDICTOR.oneofs_by_name['box_predictor_oneof'] +_CONVOLUTIONALBOXPREDICTOR.fields_by_name['conv_hyperparams'].message_type = object__detection_dot_protos_dot_hyperparams__pb2._HYPERPARAMS +_WEIGHTSHAREDCONVOLUTIONALBOXPREDICTOR.fields_by_name['conv_hyperparams'].message_type = object__detection_dot_protos_dot_hyperparams__pb2._HYPERPARAMS +_MASKRCNNBOXPREDICTOR.fields_by_name['fc_hyperparams'].message_type = object__detection_dot_protos_dot_hyperparams__pb2._HYPERPARAMS +_MASKRCNNBOXPREDICTOR.fields_by_name['conv_hyperparams'].message_type = object__detection_dot_protos_dot_hyperparams__pb2._HYPERPARAMS +_RFCNBOXPREDICTOR.fields_by_name['conv_hyperparams'].message_type = object__detection_dot_protos_dot_hyperparams__pb2._HYPERPARAMS +DESCRIPTOR.message_types_by_name['BoxPredictor'] = _BOXPREDICTOR +DESCRIPTOR.message_types_by_name['ConvolutionalBoxPredictor'] = _CONVOLUTIONALBOXPREDICTOR +DESCRIPTOR.message_types_by_name['WeightSharedConvolutionalBoxPredictor'] = _WEIGHTSHAREDCONVOLUTIONALBOXPREDICTOR +DESCRIPTOR.message_types_by_name['MaskRCNNBoxPredictor'] = _MASKRCNNBOXPREDICTOR +DESCRIPTOR.message_types_by_name['RfcnBoxPredictor'] = _RFCNBOXPREDICTOR + +BoxPredictor = _reflection.GeneratedProtocolMessageType('BoxPredictor', (_message.Message,), dict( + DESCRIPTOR = _BOXPREDICTOR, + __module__ = 'object_detection.protos.box_predictor_pb2' + # @@protoc_insertion_point(class_scope:object_detection.protos.BoxPredictor) + )) 
+_sym_db.RegisterMessage(BoxPredictor)
+
+ConvolutionalBoxPredictor = _reflection.GeneratedProtocolMessageType('ConvolutionalBoxPredictor', (_message.Message,), dict(
+  DESCRIPTOR = _CONVOLUTIONALBOXPREDICTOR,
+  __module__ = 'object_detection.protos.box_predictor_pb2'
+  # @@protoc_insertion_point(class_scope:object_detection.protos.ConvolutionalBoxPredictor)
+  ))
+_sym_db.RegisterMessage(ConvolutionalBoxPredictor)
+
+WeightSharedConvolutionalBoxPredictor = _reflection.GeneratedProtocolMessageType('WeightSharedConvolutionalBoxPredictor', (_message.Message,), dict(
+  DESCRIPTOR = _WEIGHTSHAREDCONVOLUTIONALBOXPREDICTOR,
+  __module__ = 'object_detection.protos.box_predictor_pb2'
+  # @@protoc_insertion_point(class_scope:object_detection.protos.WeightSharedConvolutionalBoxPredictor)
+  ))
+_sym_db.RegisterMessage(WeightSharedConvolutionalBoxPredictor)
+
+MaskRCNNBoxPredictor = _reflection.GeneratedProtocolMessageType('MaskRCNNBoxPredictor', (_message.Message,), dict(
+  DESCRIPTOR = _MASKRCNNBOXPREDICTOR,
+  __module__ = 'object_detection.protos.box_predictor_pb2'
+  # @@protoc_insertion_point(class_scope:object_detection.protos.MaskRCNNBoxPredictor)
+  ))
+_sym_db.RegisterMessage(MaskRCNNBoxPredictor)
+
+RfcnBoxPredictor = _reflection.GeneratedProtocolMessageType('RfcnBoxPredictor', (_message.Message,), dict(
+  DESCRIPTOR = _RFCNBOXPREDICTOR,
+  __module__ = 'object_detection.protos.box_predictor_pb2'
+  # @@protoc_insertion_point(class_scope:object_detection.protos.RfcnBoxPredictor)
+  ))
+_sym_db.RegisterMessage(RfcnBoxPredictor)
+
+
+# @@protoc_insertion_point(module_scope)
diff --git a/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/eval.proto b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/eval.proto
new file mode 100644
index 0000000000000000000000000000000000000000..2108ddf41f0f343381d66c9fd52e74a102237c2c
--- /dev/null
+++ b/research/mlperf_object_detection/Mask_RCNN/object_detection/protos/eval.proto
@@ -0,0 +1,78 @@
+syntax = "proto2";
+
+package object_detection.protos;
+
+// Message for configuring DetectionModel evaluation jobs (eval.py).
+message EvalConfig {
+  // Number of visualization images to generate.
+  optional uint32 num_visualizations = 1 [default=10];
+
+  // Number of examples to process for evaluation.
+  optional uint32 num_examples = 2 [default=5000];
+
+  // How often to run evaluation.
+  optional uint32 eval_interval_secs = 3 [default=300];
+
+  // Maximum number of times to run evaluation. If set to 0, will run forever.
+  optional uint32 max_evals = 4 [default=0];
+
+  // Whether the TensorFlow graph used for evaluation should be saved to disk.
+  optional bool save_graph = 5 [default=false];
+
+  // Path to directory to store visualizations in. If empty, visualization
+  // images are not exported (only shown on Tensorboard).
+  optional string visualization_export_dir = 6 [default=""];
+
+  // BNS name of the TensorFlow master.
+  optional string eval_master = 7 [default=""];
+
+  // Type of metrics to use for evaluation.
+  repeated string metrics_set = 8;
+
+  // Path to export detections to COCO compatible JSON format.
+  optional string export_path = 9 [default=''];
+
+  // Option to not read groundtruth labels and only export detections to
+  // COCO-compatible JSON file.
+  optional bool ignore_groundtruth = 10 [default=false];
+
+  // Use exponential moving averages of variables for evaluation.
+  // TODO(rathodv): When this is false make sure the model is constructed
+  // without moving averages in restore_fn.
+  optional bool use_moving_averages = 11 [default=false];
+
+  // Whether to evaluate instance masks.
+  // Note that since there is no evaluation code currently for instance
+  // segmentation, this option is unused.
+  optional bool eval_instance_masks = 12 [default=false];
+
+  // Minimum score threshold for a detected object box to be visualized.
+  optional float min_score_threshold = 13 [default=0.5];
+
+  // Maximum number of detections to visualize.
+  optional int32 max_num_boxes_to_visualize = 14 [default=20];
+
+  // When drawing a single detection, each label is by default visualized as
+  //