Commit 31ca3b97 authored by Kaushik Shivakumar's avatar Kaushik Shivakumar

resolve merge conflicts

parents 3e9d886d 7fcd7cba
......@@ -269,8 +269,7 @@ class ExtractAggregatedRepresentation(object):
axis=0), [num_assignments, 1]) - tf.gather(
codebook, selected_visual_words[ind])
return ind + 1, tf.tensor_scatter_nd_add(
vlad, tf.expand_dims(selected_visual_words[ind], axis=1),
tf.cast(diff, dtype=tf.float32))
vlad, tf.expand_dims(selected_visual_words[ind], axis=1), diff)
ind_vlad = tf.constant(0, dtype=tf.int32)
keep_going = lambda j, vlad: tf.less(j, num_features)
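The hunk above accumulates per-feature residuals into the VLAD matrix with `tf.tensor_scatter_nd_add`. A minimal standalone sketch of that aggregation pattern (toy codebook, one hard assignment per feature, not the library's actual loop or shapes):
```python
import tensorflow as tf

# Toy VLAD-style residual aggregation: add each feature's residual
# (feature - assigned centroid) into the row of its visual word.
codebook = tf.constant([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]])       # [num_centroids, depth]
features = tf.constant([[0.2, 0.1], [1.3, 0.9], [1.9, 2.2]])       # [num_features, depth]
assignments = tf.constant([0, 1, 2], dtype=tf.int32)               # nearest centroid per feature

residuals = features - tf.gather(codebook, assignments)            # [num_features, depth]
vlad = tf.zeros_like(codebook)
# Scatter-add each residual into the row of its assigned visual word.
vlad = tf.tensor_scatter_nd_add(
    vlad, tf.expand_dims(assignments, axis=1), residuals)
print(vlad.numpy())
```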
......@@ -396,9 +395,7 @@ class ExtractAggregatedRepresentation(object):
visual_words = tf.reshape(
tf.where(
tf.greater(
per_centroid_norms,
tf.cast(tf.sqrt(_NORM_SQUARED_TOLERANCE), dtype=tf.float32))),
tf.greater(per_centroid_norms, tf.sqrt(_NORM_SQUARED_TOLERANCE))),
[-1])
per_centroid_normalized_vector = tf.math.l2_normalize(
......
# DELF Training Instructions
This README documents the end-to-end process for training a landmark detection
and retrieval model using the DELF library on the
[Google Landmarks Dataset v2](https://github.com/cvdfoundation/google-landmark)
(GLDv2). This can be achieved by following these steps:
1. Install the DELF Python library.
2. Download the raw images of the GLDv2 dataset.
3. Prepare the training data.
4. Run the training.
The next sections will cover each of these steps in greater detail.
## Prerequisites
Clone the [TensorFlow Model Garden](https://github.com/tensorflow/models)
repository and move into the `models/research/delf/delf/python/training` folder.
```
git clone https://github.com/tensorflow/models.git
cd models/research/delf/delf/python/training
......@@ -20,109 +24,245 @@ cd models/research/delf/delf/python/training
## Install the DELF Library
The DELF Python library can be installed by running the [`install_delf.sh`](./install_delf.sh)
script using the command:
```
bash install_delf.sh
```
The script installs both the DELF library and its dependencies in the following sequence:
* Install TensorFlow 2.2 and TensorFlow 2.2 for GPU.
* Install the [TF-Slim](https://github.com/google-research/tf-slim) library from source.
* Download [protoc](https://github.com/protocolbuffers/protobuf) and compile the DELF Protocol
Buffers.
* Install the matplotlib, numpy, scikit-image, scipy and python3-tk Python libraries.
* Install the [TensorFlow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection) from the cloned TensorFlow Model Garden repository.
* Install the DELF package.
*Please note that the current installation only works on 64-bit Linux architectures due to the
`protoc` binary downloaded by the installation script. If you wish to install the DELF library on
other architectures, please update the [`install_delf.sh`](./install_delf.sh) script by referencing
the desired `protoc` [binary release](https://github.com/protocolbuffers/protobuf/releases).*
To be able to use this code, please follow
[these instructions](../../../INSTALL_INSTRUCTIONS.md) to properly install the
DELF library.
## Download the GLDv2 Training Data
The [GLDv2](https://github.com/cvdfoundation/google-landmark) images are grouped
in 3 datasets: TRAIN, INDEX, TEST. Images in each dataset are grouped into
`*.tar` files and individually referenced in `*.csv` files containing training
metadata and licensing information. The number of `*.tar` files per dataset is
as follows:
* TRAIN: 500 files.
* INDEX: 100 files.
* TEST: 20 files.
To download the GLDv2 images, run the
[`download_dataset.sh`](./download_dataset.sh) script like in the following
example:
```
bash download_dataset.sh 500 100 20
```
The script takes the following parameters, in order:
* The number of image files from the TRAIN dataset to download (maximum 500).
* The number of image files from the INDEX dataset to download (maximum 100).
* The number of image files from the TEST dataset to download (maximum 20).
The script downloads the GLDv2 images under the following directory structure:
* gldv2_dataset/
* train/ - Contains raw images from the TRAIN dataset.
* index/ - Contains raw images from the INDEX dataset.
* test/ - Contains raw images from the TEST dataset.
Each of the three folders `gldv2_dataset/train/`, `gldv2_dataset/index/` and
`gldv2_dataset/test/` contains the following:
* The downloaded `*.tar` files.
* The corresponding MD5 checksum files, `*.txt`.
* The unpacked content of the downloaded files. (*Images are organized in
folders and subfolders based on the first, second and third character in
their file name.*)
* The CSV files containing training and licensing metadata of the downloaded
images.
*Please note that due to the large size of the GLDv2 dataset, the download can
take up to 12 hours and up to 1 TB of disk space. In order to save bandwidth and
disk space, you may want to start by downloading only the TRAIN dataset, the
only one required for the training, thus saving approximately 95 GB, the
equivalent of the INDEX and TEST datasets. To further save disk space, the
`*.tar` files can be deleted after downloading and unpacking them.*
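As noted above, unpacked images are nested in folders named after the first, second and third character of the file name. A small sketch of how a path can be resolved under that layout; the `.jpg` extension and the image id are assumptions used only for illustration:
```python
import os


def gldv2_image_path(dataset_dir, split, image_id):
  """Resolve the expected path of an unpacked GLDv2 image.

  Images live in nested folders named after the first, second and third
  character of the file name, e.g. <dataset_dir>/train/a/b/c/abc....jpg.
  The '.jpg' extension is an assumption for this sketch.
  """
  return os.path.join(dataset_dir, split,
                      image_id[0], image_id[1], image_id[2],
                      image_id + '.jpg')


# Hypothetical image id, shown only to illustrate the layout.
print(gldv2_image_path('gldv2_dataset', 'train', '6e158a47eb2ca3f6'))
# -> gldv2_dataset/train/6/e/1/6e158a47eb2ca3f6.jpg
```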
## Prepare the Data for Training
Preparing the data for training consists of creating
[TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) files from
the raw GLDv2 images grouped into TRAIN and VALIDATION splits. The training set
produced contains only the *clean* subset of the GLDv2 dataset. The
[CVPR'20 paper](https://arxiv.org/abs/2004.01804) introducing the GLDv2 dataset
contains a detailed description of the *clean* subset.
Generating the TFRecord files containing the TRAIN and VALIDATION splits of the
*clean* GLDv2 subset can be achieved by running the
[`build_image_dataset.py`](./build_image_dataset.py) script. Assuming that the
GLDv2 images have been downloaded to the `gldv2_dataset` folder, the script can
be run as follows:
```
python3 build_image_dataset.py \
--train_csv_path=gldv2_dataset/train/train.csv \
--train_clean_csv_path=gldv2_dataset/train/train_clean.csv \
--train_directory=gldv2_dataset/train/*/*/*/ \
--output_directory=gldv2_dataset/tfrecord/ \
--num_shards=128 \
--generate_train_validation_splits \
--validation_split_size=0.2
```
*Please refer to the source code of the
[`build_image_dataset.py`](./build_image_dataset.py) script for a detailed
description of its parameters.*
The TFRecord files written in the `OUTPUT_DIRECTORY` will be prefixed as
follows:
* TRAIN split: `train-*`
* VALIDATION split: `validation-*`
The same script can be used to generate TFRecord files for the TEST split for
post-training evaluation purposes. This can be achieved by adding the
parameters:
```
--test_csv_path=gldv2_dataset/train/test.csv \
--test_directory=gldv2_dataset/test/*/*/*/ \
```
In this scenario, the TFRecord files of the TEST split written in the
`OUTPUT_DIRECTORY` will be named according to the pattern `test-*`.
*Please note that due to the large size of the GLDv2 dataset, the generation of
the TFRecord files can take up to 12 hours and up to 500 GB of disk space.*
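Once the script finishes, a quick sanity check of the generated shards can be done with a few lines of TensorFlow; this sketch only assumes the `train-*` and `validation-*` prefixes described above, and counting records may take a while for full-size shards:
```python
import tensorflow as tf

# Count the TFRecord shards and records produced by build_image_dataset.py.
output_dir = 'gldv2_dataset/tfrecord'
for prefix in ('train', 'validation'):
  shards = tf.io.gfile.glob('%s/%s-*' % (output_dir, prefix))
  num_records = sum(1 for _ in tf.data.TFRecordDataset(shards))
  print('%s: %d shards, %d records' % (prefix, len(shards), num_records))
```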
## Running the Training
For the training to converge faster, it is possible to initialize the ResNet
backbone with the weights of a pretrained ImageNet model. The ImageNet
checkpoint is available at the following location:
[`http://storage.googleapis.com/delf/resnet50_imagenet_weights.tar.gz`](http://storage.googleapis.com/delf/resnet50_imagenet_weights.tar.gz).
To download and unpack it run the following commands on a Linux box:
```
curl -Os http://storage.googleapis.com/delf/resnet50_imagenet_weights.tar.gz
tar -xzvf resnet50_imagenet_weights.tar.gz
```
Assuming the TFRecord files were generated in the `gldv2_dataset/tfrecord/`
directory, running the following command should start training a model and
output the results in the `gldv2_training` directory:
```
python3 train.py \
--train_file_pattern=gldv2_dataset/tfrecord/train* \
--validation_file_pattern=gldv2_dataset/tfrecord/validation* \
--imagenet_checkpoint=resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5 \
--dataset_version=gld_v2_clean \
--logdir=gldv2_training/
```
On a multi-GPU machine, the batch size can be increased to speed up training
using the `--batch_size` parameter. On a machine with 8 Tesla P100 GPUs, you can
set the batch size to `256`:
```
--batch_size=256
```
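The training script distributes the dataset with a `tf.distribute` strategy (see the `experimental_distribute_dataset` calls later in this diff); the concrete strategy type is not shown in this excerpt, so the mirrored strategy below is an assumption used only to illustrate how a global batch size maps to per-replica batches:
```python
import tensorflow as tf

# Assumes a MirroredStrategy; the actual strategy used by train.py is not
# shown in this excerpt.
strategy = tf.distribute.MirroredStrategy()
global_batch_size = 256
per_replica_batch_size = global_batch_size // strategy.num_replicas_in_sync
print('replicas: %d, per-replica batch size: %d' %
      (strategy.num_replicas_in_sync, per_replica_batch_size))
```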
## Exporting the Trained Model
Assuming the training output (the TensorFlow checkpoint) is in the
`gldv2_training` directory, the following commands export the model.
### DELF local feature model
```
python3 model/export_model.py \
--ckpt_path=gldv2_training/delf_weights \
--export_path=gldv2_model_local \
--block3_strides
```
### Kaggle-compatible global feature model
To export a global feature model in the format required by the
[2020 Landmark Retrieval challenge](https://www.kaggle.com/c/landmark-retrieval-2020),
you can use the following command:
```
python3 model/export_global_model.py \
--ckpt_path=gldv2_training/delf_weights \
--export_path=gldv2_model_global \
--input_scales_list=0.70710677,1.0,1.4142135 \
--multi_scale_pool_type=sum \
--normalize_global_descriptor
```
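Since `export_global_model.py` saves the module with a `serving_default` signature (see the script later in this diff), the exported global-feature model can be queried roughly as in the sketch below. The zero image is a placeholder, and with `--multi_scale_pool_type=sum` the output key is `global_descriptor`:
```python
import tensorflow as tf

# Load the SavedModel exported by model/export_global_model.py.
model = tf.saved_model.load('gldv2_model_global')
serving_fn = model.signatures['serving_default']

# The exported signature expects a single uint8 image of shape [H, W, 3].
image = tf.zeros([321, 321, 3], dtype=tf.uint8)  # placeholder image
outputs = serving_fn(input_image=image)

# With --multi_scale_pool_type=sum, a pooled 1D descriptor is returned.
print(outputs['global_descriptor'].shape)
```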
## Testing the Trained Model
After the trained model has been exported, it can be used to extract DELF
features from two images of the same landmark and to match the two images based
on the extracted features, validating that they depict the same landmark.
Start by downloading the Oxford buildings dataset:
```
mkdir data && cd data
wget http://www.robots.ox.ac.uk/~vgg/data/oxbuildings/oxbuild_images.tgz
mkdir oxford5k_images oxford5k_features
tar -xvzf oxbuild_images.tgz -C oxford5k_images/
cd ../
echo data/oxford5k_images/hertford_000056.jpg >> list_images.txt
echo data/oxford5k_images/oxford_000317.jpg >> list_images.txt
```
Make a copy of the
[`delf_config_example.pbtxt`](../examples/delf_config_example.pbtxt) protobuf
file, which configures the DELF feature extraction. Update the file by making
the following changes:
* set the `model_path` attribute to the directory containing the exported
  model, `gldv2_model_local` in this example
* add the attribute `is_tf2_exported` with the value `true` at the root level
* set the `use_pca` attribute inside `delf_local_config` to `false`
The resulting file should resemble the following:
```
model_path: "gldv2_model_local"
image_scales: .25
image_scales: .3536
image_scales: .5
image_scales: .7071
image_scales: 1.0
image_scales: 1.4142
image_scales: 2.0
is_tf2_exported: true
delf_local_config {
use_pca: false
max_feature_num: 1000
score_threshold: 100.0
}
```
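For reference, `extract_features.py` parses this text-format file into a `DelfConfig` protocol buffer. A sketch of that parsing step, using the module and message names as they appear in the DELF examples (treat them as assumptions if your install differs):
```python
from google.protobuf import text_format
from delf import delf_config_pb2

# Parse the text-format DELF configuration edited above.
config = delf_config_pb2.DelfConfig()
with open('delf_config_example.pbtxt', 'r') as f:
  text_format.Parse(f.read(), config)

print(config.model_path)                 # gldv2_model_local
print(config.delf_local_config.use_pca)  # False
```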
Run the following command to extract DELF features for the images
`hertford_000056.jpg` and `oxford_000317.jpg`:
```
python3 ../examples/extract_features.py \
--config_path delf_config_example.pbtxt \
--list_images_path list_images.txt \
--output_dir data/oxford5k_features
```
Run the following command to perform feature matching between the images
`hertford_000056.jpg` and `oxford_000317.jpg`:
```
python3 ../examples/match_images.py \
--image_1_path data/oxford5k_images/hertford_000056.jpg \
--image_2_path data/oxford5k_images/oxford_000317.jpg \
--features_1_path data/oxford5k_features/hertford_000056.delf \
--features_2_path data/oxford5k_features/oxford_000317.delf \
--output_image matched_images.png
```
The generated image `matched_images.png` should look similar to this one:
![MatchedImagesDemo](./matched_images_demo.png)
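If you want to inspect the extracted `.delf` files directly, the delf package provides a `feature_io` helper; the call below follows the API used in the DELF examples and should be treated as an assumption if your version differs:
```python
from delf import feature_io

# Read back the features extracted for one of the test images.
locations, scales, descriptors, attention, orientations = feature_io.ReadFromFile(
    'data/oxford5k_features/hertford_000056.delf')

print('num features:', descriptors.shape[0])
print('descriptor dimensionality:', descriptors.shape[1])
```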
......@@ -302,6 +302,21 @@ def _write_relabeling_rules(relabeling_rules):
csv_writer.writerow([new_label, old_label])
def _shuffle_by_columns(np_array, random_state):
"""Shuffle the columns of a 2D numpy array.
Args:
np_array: array to shuffle.
random_state: numpy RandomState to be used for shuffling.
Returns:
The shuffled array.
"""
columns = np_array.shape[1]
columns_indices = np.arange(columns)
random_state.shuffle(columns_indices)
return np_array[:, columns_indices]
def _build_train_and_validation_splits(image_paths, file_ids, labels,
validation_split_size, seed):
"""Create TRAIN and VALIDATION splits containg all labels in equal proportion.
......@@ -353,19 +368,21 @@ def _build_train_and_validation_splits(image_paths, file_ids, labels,
for label, indexes in image_attrs_idx_by_label.items():
# Create the subset for the current label.
image_attrs_label = image_attrs[:, indexes]
images_per_label = image_attrs_label.shape[1]
# Shuffle the current label subset.
columns_indices = np.arange(images_per_label)
rs.shuffle(columns_indices)
image_attrs_label = image_attrs_label[:, columns_indices]
image_attrs_label = _shuffle_by_columns(image_attrs_label, rs)
# Split the current label subset into TRAIN and VALIDATION splits and add
# each split to the list of all splits.
images_per_label = image_attrs_label.shape[1]
cutoff_idx = max(1, int(validation_split_size * images_per_label))
splits[_VALIDATION_SPLIT].append(image_attrs_label[:, 0 : cutoff_idx])
splits[_TRAIN_SPLIT].append(image_attrs_label[:, cutoff_idx : ])
validation_split = np.concatenate(splits[_VALIDATION_SPLIT], axis=1)
train_split = np.concatenate(splits[_TRAIN_SPLIT], axis=1)
# Concatenate all subsets of image attributes into TRAIN and VALIDATION splits
# and reshuffle them again to ensure variance of labels across batches.
validation_split = _shuffle_by_columns(
np.concatenate(splits[_VALIDATION_SPLIT], axis=1), rs)
train_split = _shuffle_by_columns(
np.concatenate(splits[_TRAIN_SPLIT], axis=1), rs)
# Unstack the image attribute arrays in the TRAIN and VALIDATION splits and
# convert them back to lists. Convert labels back to 'int' from 'str'
......
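The hunk above performs a per-label split: each label's shuffled columns are cut at `cutoff_idx`, the first part going to VALIDATION and the rest to TRAIN. A minimal standalone sketch of that idea with NumPy (toy labels, not the script's actual attribute arrays):
```python
import numpy as np


def split_per_label(labels, validation_split_size, seed=0):
  """Toy per-label split mirroring the logic above: for each label, shuffle
  its indices and send the first `validation_split_size` fraction (at least
  one element) to VALIDATION and the rest to TRAIN."""
  rs = np.random.RandomState(seed)
  train_idx, val_idx = [], []
  for label in np.unique(labels):
    indices = np.flatnonzero(labels == label)
    rs.shuffle(indices)
    cutoff = max(1, int(validation_split_size * len(indices)))
    val_idx.extend(indices[:cutoff])
    train_idx.extend(indices[cutoff:])
  return np.array(train_idx), np.array(val_idx)


labels = np.array([0, 0, 0, 1, 1, 2, 2, 2, 2])
train_idx, val_idx = split_per_label(labels, validation_split_size=0.2)
print('train:', train_idx, 'validation:', val_idx)
```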
......@@ -29,11 +29,7 @@ import tensorflow as tf
class _GoogleLandmarksInfo(object):
"""Metadata about the Google Landmarks dataset."""
num_classes = {
'gld_v1': 14951,
'gld_v2': 203094,
'gld_v2_clean': 81313
}
num_classes = {'gld_v1': 14951, 'gld_v2': 203094, 'gld_v2_clean': 81313}
class _DataAugmentationParams(object):
......@@ -123,6 +119,8 @@ def _ParseFunction(example, name_to_features, image_size, augmentation):
# Parse to get image.
image = parsed_example['image/encoded']
image = tf.io.decode_jpeg(image)
image = NormalizeImages(
image, pixel_value_scale=128.0, pixel_value_offset=128.0)
if augmentation:
image = _ImageNetCrop(image)
else:
......@@ -130,6 +128,7 @@ def _ParseFunction(example, name_to_features, image_size, augmentation):
image.set_shape([image_size, image_size, 3])
# Parse to get label.
label = parsed_example['image/class/label']
return image, label
......@@ -162,6 +161,7 @@ def CreateDataset(file_pattern,
'image/width': tf.io.FixedLenFeature([], tf.int64, default_value=0),
'image/channels': tf.io.FixedLenFeature([], tf.int64, default_value=0),
'image/format': tf.io.FixedLenFeature([], tf.string, default_value=''),
'image/id': tf.io.FixedLenFeature([], tf.string, default_value=''),
'image/filename': tf.io.FixedLenFeature([], tf.string, default_value=''),
'image/encoded': tf.io.FixedLenFeature([], tf.string, default_value=''),
'image/class/label': tf.io.FixedLenFeature([], tf.int64, default_value=0),
......
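Given the feature spec in the hunk above and the `NormalizeImages(..., pixel_value_scale=128.0, pixel_value_offset=128.0)` call added to `_ParseFunction`, a generated shard can be parsed roughly as in this sketch. The normalization is written out as `(x - offset) / scale`, which is the assumed behavior of `NormalizeImages`, and only the two keys needed for training are read:
```python
import tensorflow as tf

feature_spec = {
    'image/encoded': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'image/class/label': tf.io.FixedLenFeature([], tf.int64, default_value=0),
}


def parse_example(serialized):
  parsed = tf.io.parse_single_example(serialized, feature_spec)
  image = tf.io.decode_jpeg(parsed['image/encoded'])
  # Same effect as the NormalizeImages call: map pixel values to roughly [-1, 1].
  image = (tf.cast(image, tf.float32) - 128.0) / 128.0
  return image, parsed['image/class/label']


shards = tf.io.gfile.glob('gldv2_dataset/tfrecord/train-*')
dataset = tf.data.TFRecordDataset(shards).map(parse_example)
for image, label in dataset.take(1):
  print(image.shape, label.numpy())
```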
......@@ -132,10 +132,12 @@ class Delf(tf.keras.Model):
self.attn_classification.trainable_weights)
def call(self, input_image, training=True):
blocks = {'block3': None}
self.backbone(input_image, intermediates_dict=blocks, training=training)
blocks = {}
features = blocks['block3']
self.backbone.build_call(
input_image, intermediates_dict=blocks, training=training)
features = blocks['block3'] # pytype: disable=key-error
_, probs, _ = self.attention(features, training=training)
return probs, features
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Export global feature tensorflow inference model.
This model includes image pyramids for multi-scale processing.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import app
from absl import flags
import tensorflow as tf
from delf.python.training.model import delf_model
from delf.python.training.model import export_model_utils
FLAGS = flags.FLAGS
flags.DEFINE_string('ckpt_path', '/tmp/delf-logdir/delf-weights',
'Path to saved checkpoint.')
flags.DEFINE_string('export_path', None, 'Path where model will be exported.')
flags.DEFINE_list(
'input_scales_list', None,
'Optional input image scales to use. If None (default), an input end-point '
'"input_scales" is added for the exported model. If not None, the '
'specified list of floats will be hard-coded as the desired input scales.')
flags.DEFINE_enum(
'multi_scale_pool_type', 'None', ['None', 'average', 'sum'],
"If 'None' (default), the model is exported with an output end-point "
"'global_descriptors', where the global descriptor for each scale is "
"returned separately. If not 'None', the global descriptor of each scale is"
' pooled and a 1D global descriptor is returned, with output end-point '
"'global_descriptor'.")
flags.DEFINE_boolean('normalize_global_descriptor', False,
'If True, L2-normalizes global descriptor.')
class _ExtractModule(tf.Module):
"""Helper module to build and save global feature model."""
def __init__(self,
multi_scale_pool_type='None',
normalize_global_descriptor=False,
input_scales_tensor=None):
"""Initialization of global feature model.
Args:
multi_scale_pool_type: Type of multi-scale pooling to perform.
normalize_global_descriptor: Whether to L2-normalize global descriptor.
input_scales_tensor: If None, the exported function to be used should be
ExtractFeatures, where an input end-point "input_scales" is added for
the exported model. If not None, the specified 1D tensor of floats will
be hard-coded as the desired input scales, in conjunction with
ExtractFeaturesFixedScales.
"""
self._multi_scale_pool_type = multi_scale_pool_type
self._normalize_global_descriptor = normalize_global_descriptor
if input_scales_tensor is None:
self._input_scales_tensor = []
else:
self._input_scales_tensor = input_scales_tensor
# Setup the DELF model for extraction.
self._model = delf_model.Delf(block3_strides=False, name='DELF')
def LoadWeights(self, checkpoint_path):
self._model.load_weights(checkpoint_path)
@tf.function(input_signature=[
tf.TensorSpec(shape=[None, None, 3], dtype=tf.uint8, name='input_image'),
tf.TensorSpec(shape=[None], dtype=tf.float32, name='input_scales'),
tf.TensorSpec(
shape=[None], dtype=tf.int32, name='input_global_scales_ind')
])
def ExtractFeatures(self, input_image, input_scales, input_global_scales_ind):
extracted_features = export_model_utils.ExtractGlobalFeatures(
input_image,
input_scales,
input_global_scales_ind,
lambda x: self._model.backbone.build_call(x, training=False),
multi_scale_pool_type=self._multi_scale_pool_type,
normalize_global_descriptor=self._normalize_global_descriptor)
named_output_tensors = {}
if self._multi_scale_pool_type == 'None':
named_output_tensors['global_descriptors'] = tf.identity(
extracted_features, name='global_descriptors')
else:
named_output_tensors['global_descriptor'] = tf.identity(
extracted_features, name='global_descriptor')
return named_output_tensors
@tf.function(input_signature=[
tf.TensorSpec(shape=[None, None, 3], dtype=tf.uint8, name='input_image')
])
def ExtractFeaturesFixedScales(self, input_image):
return self.ExtractFeatures(input_image, self._input_scales_tensor,
tf.range(tf.size(self._input_scales_tensor)))
def main(argv):
if len(argv) > 1:
raise app.UsageError('Too many command-line arguments.')
export_path = FLAGS.export_path
if os.path.exists(export_path):
raise ValueError('export_path %s already exists.' % export_path)
if FLAGS.input_scales_list is None:
input_scales_tensor = None
else:
input_scales_tensor = tf.constant(
[float(s) for s in FLAGS.input_scales_list],
dtype=tf.float32,
shape=[len(FLAGS.input_scales_list)],
name='input_scales')
module = _ExtractModule(FLAGS.multi_scale_pool_type,
FLAGS.normalize_global_descriptor,
input_scales_tensor)
# Load the weights.
checkpoint_path = FLAGS.ckpt_path
module.LoadWeights(checkpoint_path)
print('Checkpoint loaded from ', checkpoint_path)
# Save the module
if FLAGS.input_scales_list is None:
served_function = module.ExtractFeatures
else:
served_function = module.ExtractFeaturesFixedScales
tf.saved_model.save(
module, export_path, signatures={'serving_default': served_function})
if __name__ == '__main__':
app.run(main)
......@@ -42,67 +42,39 @@ flags.DEFINE_boolean('block3_strides', False,
flags.DEFINE_float('iou', 1.0, 'IOU for non-max suppression.')
def _build_tensor_info(tensor_dict):
"""Replace the dict's value by the tensor info.
Args:
tensor_dict: A dictionary contains <string, tensor>.
Returns:
dict: New dictionary contains <string, tensor_info>.
"""
return {
k: tf.compat.v1.saved_model.utils.build_tensor_info(t)
for k, t in tensor_dict.items()
}
def main(argv):
if len(argv) > 1:
raise app.UsageError('Too many command-line arguments.')
export_path = FLAGS.export_path
if os.path.exists(export_path):
raise ValueError('Export_path already exists.')
with tf.Graph().as_default() as g, tf.compat.v1.Session(graph=g) as sess:
class _ExtractModule(tf.Module):
"""Helper module to build and save DELF model."""
def __init__(self, block3_strides, iou):
"""Initialization of DELF model.
Args:
block3_strides: bool, whether to add strides to the output of block3.
iou: IOU for non-max suppression.
"""
self._stride_factor = 2.0 if block3_strides else 1.0
self._iou = iou
# Setup the DELF model for extraction.
model = delf_model.Delf(block3_strides=FLAGS.block3_strides, name='DELF')
# Initial forward pass to build model.
images = tf.zeros((1, 321, 321, 3), dtype=tf.float32)
model(images)
self._model = delf_model.Delf(
block3_strides=block3_strides, name='DELF')
stride_factor = 2.0 if FLAGS.block3_strides else 1.0
def LoadWeights(self, checkpoint_path):
self._model.load_weights(checkpoint_path)
# Setup the multiscale keypoint extraction.
input_image = tf.compat.v1.placeholder(
tf.uint8, shape=(None, None, 3), name='input_image')
input_abs_thres = tf.compat.v1.placeholder(
tf.float32, shape=(), name='input_abs_thres')
input_scales = tf.compat.v1.placeholder(
tf.float32, shape=[None], name='input_scales')
input_max_feature_num = tf.compat.v1.placeholder(
tf.int32, shape=(), name='input_max_feature_num')
@tf.function(input_signature=[
tf.TensorSpec(shape=[None, None, 3], dtype=tf.uint8, name='input_image'),
tf.TensorSpec(shape=[None], dtype=tf.float32, name='input_scales'),
tf.TensorSpec(shape=(), dtype=tf.int32, name='input_max_feature_num'),
tf.TensorSpec(shape=(), dtype=tf.float32, name='input_abs_thres')
])
def ExtractFeatures(self, input_image, input_scales, input_max_feature_num,
input_abs_thres):
extracted_features = export_model_utils.ExtractLocalFeatures(
input_image, input_scales, input_max_feature_num, input_abs_thres,
FLAGS.iou, lambda x: model(x, training=False), stride_factor)
self._iou, lambda x: self._model(x, training=False),
self._stride_factor)
# Load the weights.
checkpoint_path = FLAGS.ckpt_path
model.load_weights(checkpoint_path)
print('Checkpoint loaded from ', checkpoint_path)
named_input_tensors = {
'input_image': input_image,
'input_scales': input_scales,
'input_abs_thres': input_abs_thres,
'input_max_feature_num': input_max_feature_num,
}
# Outputs to the exported model.
named_output_tensors = {}
named_output_tensors['boxes'] = tf.identity(
extracted_features[0], name='boxes')
......@@ -112,25 +84,27 @@ def main(argv):
extracted_features[2], name='scales')
named_output_tensors['scores'] = tf.identity(
extracted_features[3], name='scores')
return named_output_tensors
def main(argv):
if len(argv) > 1:
raise app.UsageError('Too many command-line arguments.')
export_path = FLAGS.export_path
if os.path.exists(export_path):
raise ValueError(f'Export_path {export_path} already exists. Please '
'specify a different path or delete the existing one.')
module = _ExtractModule(FLAGS.block3_strides, FLAGS.iou)
# Load the weights.
checkpoint_path = FLAGS.ckpt_path
module.LoadWeights(checkpoint_path)
print('Checkpoint loaded from ', checkpoint_path)
# Export the model.
signature_def = tf.compat.v1.saved_model.signature_def_utils.build_signature_def(
inputs=_build_tensor_info(named_input_tensors),
outputs=_build_tensor_info(named_output_tensors))
print('Exporting trained model to:', export_path)
builder = tf.compat.v1.saved_model.builder.SavedModelBuilder(export_path)
init_op = None
builder.add_meta_graph_and_variables(
sess, [tf.compat.v1.saved_model.tag_constants.SERVING],
signature_def_map={
tf.compat.v1.saved_model.signature_constants
.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
signature_def
},
main_op=init_op)
builder.save()
# Save the module
tf.saved_model.save(module, export_path)
if __name__ == '__main__':
......
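Because the refactored `export_model.py` saves a `tf.Module` whose `ExtractFeatures` `tf.function` has the input signature shown above, and calls `tf.saved_model.save(module, export_path)` without an explicit signature map, the exported local-feature model can be exercised roughly as in this sketch. The image, scales and thresholds are illustrative values only:
```python
import tensorflow as tf

# Load the local-feature SavedModel produced by model/export_model.py.
model = tf.saved_model.load('gldv2_model_local')

image = tf.zeros([321, 321, 3], dtype=tf.uint8)                 # placeholder image
scales = tf.constant([0.7071, 1.0, 1.4142], dtype=tf.float32)   # illustrative scales
max_features = tf.constant(1000, dtype=tf.int32)
abs_threshold = tf.constant(100.0, dtype=tf.float32)            # matches score_threshold above

outputs = model.ExtractFeatures(image, scales, max_features, abs_threshold)
# Expected keys include 'boxes', 'scales' and 'scores' (see the script above).
print(sorted(outputs.keys()))
```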
......@@ -142,20 +142,21 @@ def ExtractLocalFeatures(image, image_scales, max_feature_num, abs_thres, iou,
keep_going = lambda j, b, f, scales, scores: tf.less(j, num_scales)
(_, output_boxes, output_features, output_scales,
output_scores) = tf.while_loop(
cond=keep_going,
body=_ProcessSingleScale,
loop_vars=[
i, output_boxes, output_features, output_scales, output_scores
],
shape_invariants=[
i.get_shape(),
tf.TensorShape([None, 4]),
tf.TensorShape([None, feature_depth]),
tf.TensorShape([None]),
tf.TensorShape([None])
],
back_prop=False)
output_scores) = tf.nest.map_structure(
tf.stop_gradient,
tf.while_loop(
cond=keep_going,
body=_ProcessSingleScale,
loop_vars=[
i, output_boxes, output_features, output_scales, output_scores
],
shape_invariants=[
i.get_shape(),
tf.TensorShape([None, 4]),
tf.TensorShape([None, feature_depth]),
tf.TensorShape([None]),
tf.TensorShape([None])
]))
feature_boxes = box_list.BoxList(output_boxes)
feature_boxes.add_field('features', output_features)
......@@ -169,3 +170,99 @@ def ExtractLocalFeatures(image, image_scales, max_feature_num, abs_thres, iou,
return final_boxes.get(), final_boxes.get_field(
'features'), final_boxes.get_field('scales'), tf.expand_dims(
final_boxes.get_field('scores'), 1)
@tf.function
def ExtractGlobalFeatures(image,
image_scales,
global_scales_ind,
model_fn,
multi_scale_pool_type='None',
normalize_global_descriptor=False):
"""Extract global features for input image.
Args:
image: image tensor of type tf.uint8 with shape [h, w, channels].
image_scales: 1D float tensor which contains float scales used for image
pyramid construction.
global_scales_ind: Feature extraction happens only for a subset of
`image_scales`, those with corresponding indices from this tensor.
model_fn: model function. Follows the signature:
* Args:
* `images`: Image tensor which is re-scaled.
* Returns:
* `global_descriptors`: Global descriptors for input images.
multi_scale_pool_type: If set, the global descriptor of each scale is pooled
and a 1D global descriptor is returned.
normalize_global_descriptor: If True, output global descriptors are
L2-normalized.
Returns:
global_descriptors: If `multi_scale_pool_type` is 'None', returns a [S, D]
float tensor. S is the number of scales, and D the global descriptor
dimensionality. Each D-dimensional entry is a global descriptor, which may
be L2-normalized depending on `normalize_global_descriptor`. If
`multi_scale_pool_type` is not 'None', returns a [D] float tensor with the
pooled global descriptor.
"""
original_image_shape_float = tf.gather(
tf.dtypes.cast(tf.shape(image), tf.float32), [0, 1])
image_tensor = gld.NormalizeImages(
image, pixel_value_offset=128.0, pixel_value_scale=128.0)
image_tensor = tf.expand_dims(image_tensor, 0, name='image/expand_dims')
def _ResizeAndExtract(scale_index):
"""Helper function to resize image then extract global feature.
Args:
scale_index: A valid index in image_scales.
Returns:
global_descriptor: [1,D] tensor denoting the extracted global descriptor.
"""
scale = tf.gather(image_scales, scale_index)
new_image_size = tf.dtypes.cast(
tf.round(original_image_shape_float * scale), tf.int32)
resized_image = tf.image.resize(image_tensor, new_image_size)
global_descriptor = model_fn(resized_image)
return global_descriptor
# First loop to find initial scale to be used.
num_scales = tf.shape(image_scales)[0]
initial_scale_index = tf.constant(-1, dtype=tf.int32)
for scale_index in tf.range(num_scales):
if tf.reduce_any(tf.equal(global_scales_ind, scale_index)):
initial_scale_index = scale_index
break
output_global = _ResizeAndExtract(initial_scale_index)
# Loop over subsequent scales.
for scale_index in tf.range(initial_scale_index + 1, num_scales):
# Allow an undefined number of global feature scales to be extracted.
tf.autograph.experimental.set_loop_options(
shape_invariants=[(output_global, tf.TensorShape([None, None]))])
if tf.reduce_any(tf.equal(global_scales_ind, scale_index)):
global_descriptor = _ResizeAndExtract(scale_index)
output_global = tf.concat([output_global, global_descriptor], 0)
normalization_axis = 1
if multi_scale_pool_type == 'average':
output_global = tf.reduce_mean(
output_global,
axis=0,
keepdims=False,
name='multi_scale_average_pooling')
normalization_axis = 0
elif multi_scale_pool_type == 'sum':
output_global = tf.reduce_sum(
output_global, axis=0, keepdims=False, name='multi_scale_sum_pooling')
normalization_axis = 0
if normalize_global_descriptor:
output_global = tf.nn.l2_normalize(
output_global, axis=normalization_axis, name='l2_normalization')
return output_global
......@@ -22,9 +22,14 @@ from __future__ import division
from __future__ import print_function
import functools
import os
import tempfile
from absl import logging
import h5py
import tensorflow as tf
layers = tf.keras.layers
......@@ -284,8 +289,8 @@ class ResNet50(tf.keras.Model):
else:
self.global_pooling = None
def call(self, inputs, training=True, intermediates_dict=None):
"""Call the ResNet50 model.
def build_call(self, inputs, training=True, intermediates_dict=None):
"""Building the ResNet50 model.
Args:
inputs: Images to compute features for.
......@@ -356,3 +361,79 @@ class ResNet50(tf.keras.Model):
return self.global_pooling(x)
else:
return x
def call(self, inputs, training=True, intermediates_dict=None):
"""Call the ResNet50 model.
Args:
inputs: Images to compute features for.
training: Whether model is in training phase.
intermediates_dict: `None` or dictionary. If not None, accumulate feature
maps from intermediate blocks into the dictionary.
Returns:
Tensor with featuremap.
"""
return self.build_call(inputs, training, intermediates_dict)
def restore_weights(self, filepath):
"""Load pretrained weights.
This function loads a .h5 file from the filepath with saved model weights
and assigns them to the model.
Args:
filepath: String, path to the .h5 file
Raises:
ValueError: if the file referenced by `filepath` does not exist.
"""
if not tf.io.gfile.exists(filepath):
raise ValueError('Unable to load weights from %s. You must provide a '
'valid file.' % (filepath))
# Create a local copy of the weights file for h5py to be able to read it.
local_filename = os.path.basename(filepath)
tmp_filename = os.path.join(tempfile.gettempdir(), local_filename)
tf.io.gfile.copy(filepath, tmp_filename, overwrite=True)
# Load the content of the weights file.
f = h5py.File(tmp_filename, mode='r')
saved_layer_names = [n.decode('utf8') for n in f.attrs['layer_names']]
try:
# Iterate through all the layers assuming the max `depth` is 2.
for layer in self.layers:
if hasattr(layer, 'layers'):
for inlayer in layer.layers:
# Make sure the weights are in the saved model, and that we are in
# the innermost layer.
if inlayer.name not in saved_layer_names:
raise ValueError('Layer %s absent from the pretrained weights. '
'Unable to load its weights.' % (inlayer.name))
if hasattr(inlayer, 'layers'):
raise ValueError('Layer %s is not a depth 2 layer. Unable to load '
'its weights.' % (inlayer.name))
# Assign the weights in the current layer.
g = f[inlayer.name]
weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
weight_values = [g[weight_name] for weight_name in weight_names]
print('Setting the weights for layer %s' % (inlayer.name))
inlayer.set_weights(weight_values)
finally:
# Clean up the temporary file.
tf.io.gfile.remove(tmp_filename)
def log_weights(self):
"""Log backbone weights."""
logging.info('Logging backbone weights')
logging.info('------------------------')
for layer in self.layers:
if hasattr(layer, 'layers'):
for inlayer in layer.layers:
logging.info('Weights for layer: %s, inlayer % s', layer.name,
inlayer.name)
weights = inlayer.get_weights()
logging.info(weights)
else:
logging.info('Layer %s does not have inner layers.',
layer.name)
......@@ -43,17 +43,20 @@ flags.DEFINE_string('train_file_pattern', '/tmp/data/train*',
'File pattern of training dataset files.')
flags.DEFINE_string('validation_file_pattern', '/tmp/data/validation*',
'File pattern of validation dataset files.')
flags.DEFINE_enum('dataset_version', 'gld_v1',
['gld_v1', 'gld_v2', 'gld_v2_clean'],
'Google Landmarks dataset version, used to determine the'
'number of classes.')
flags.DEFINE_enum(
'dataset_version', 'gld_v1', ['gld_v1', 'gld_v2', 'gld_v2_clean'],
'Google Landmarks dataset version, used to determine the '
'number of classes.')
flags.DEFINE_integer('seed', 0, 'Seed to training dataset.')
flags.DEFINE_float('initial_lr', 0.001, 'Initial learning rate.')
flags.DEFINE_float('initial_lr', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('batch_size', 32, 'Global batch size.')
flags.DEFINE_integer('max_iters', 500000, 'Maximum iterations.')
flags.DEFINE_boolean('block3_strides', False, 'Whether to use block3_strides.')
flags.DEFINE_boolean('block3_strides', True, 'Whether to use block3_strides.')
flags.DEFINE_boolean('use_augmentation', True,
'Whether to use ImageNet style augmentation.')
flags.DEFINE_string(
'imagenet_checkpoint', None,
'ImageNet checkpoint for ResNet backbone. If None, no checkpoint is used.')
def _record_accuracy(metric, logits, labels):
......@@ -64,6 +67,10 @@ def _record_accuracy(metric, logits, labels):
def _attention_summaries(scores, global_step):
"""Record statistics of the attention score."""
tf.summary.image(
'batch_attention',
scores / tf.reduce_max(scores + 1e-3),
step=global_step)
tf.summary.scalar('attention/max', tf.reduce_max(scores), step=global_step)
tf.summary.scalar('attention/min', tf.reduce_min(scores), step=global_step)
tf.summary.scalar('attention/mean', tf.reduce_mean(scores), step=global_step)
......@@ -124,7 +131,7 @@ def main(argv):
max_iters = FLAGS.max_iters
global_batch_size = FLAGS.batch_size
image_size = 321
num_eval = 1000
num_eval_batches = int(50000 / global_batch_size)
report_interval = 100
eval_interval = 1000
save_interval = 20000
......@@ -134,9 +141,10 @@ def main(argv):
clip_val = tf.constant(10.0)
if FLAGS.debug:
tf.config.run_functions_eagerly(True)
global_batch_size = 4
max_iters = 4
num_eval = 1
max_iters = 100
num_eval_batches = 1
save_interval = 1
report_interval = 1
......@@ -159,11 +167,12 @@ def main(argv):
augmentation=False,
seed=FLAGS.seed)
train_iterator = strategy.make_dataset_iterator(train_dataset)
validation_iterator = strategy.make_dataset_iterator(validation_dataset)
train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
validation_dist_dataset = strategy.experimental_distribute_dataset(
validation_dataset)
train_iterator.initialize()
validation_iterator.initialize()
train_iter = iter(train_dist_dataset)
validation_iter = iter(validation_dist_dataset)
# Create a checkpoint directory to store the checkpoints.
checkpoint_prefix = os.path.join(FLAGS.logdir, 'delf_tf2-ckpt')
......@@ -219,11 +228,14 @@ def main(argv):
labels = tf.clip_by_value(labels, 0, model.num_classes)
global_step = optimizer.iterations
tf.summary.image('batch_images', (images + 1.0) / 2.0, step=global_step)
tf.summary.scalar(
'image_range/max', tf.reduce_max(images), step=global_step)
tf.summary.scalar(
'image_range/min', tf.reduce_min(images), step=global_step)
# TODO(andrearaujo): we should try to unify the backprop into a single
# function, instead of applying once to descriptor then to attention.
def _backprop_loss(tape, loss, weights):
"""Backpropogate losses using clipped gradients.
......@@ -344,12 +356,25 @@ def main(argv):
with tf.summary.record_if(
tf.math.equal(0, optimizer.iterations % report_interval)):
# TODO(dananghel): try to load pretrained weights at backbone creation.
# Load pretrained weights for ResNet50 trained on ImageNet.
if FLAGS.imagenet_checkpoint is not None:
logging.info('Attempting to load ImageNet pretrained weights.')
input_batch = next(train_iter)
_, _ = distributed_train_step(input_batch)
model.backbone.restore_weights(FLAGS.imagenet_checkpoint)
logging.info('Done.')
else:
logging.info('Skip loading ImageNet pretrained weights.')
if FLAGS.debug:
model.backbone.log_weights()
global_step_value = optimizer.iterations.numpy()
while global_step_value < max_iters:
# input_batch : images(b, h, w, c), labels(b,).
try:
input_batch = train_iterator.get_next()
input_batch = next(train_iter)
except tf.errors.OutOfRangeError:
# Break if we run out of data in the dataset.
logging.info('Stopping training at global step %d, no more data',
......@@ -392,9 +417,9 @@ def main(argv):
# Validate once in {eval_interval*n, n \in N} steps.
if global_step_value % eval_interval == 0:
for i in range(num_eval):
for i in range(num_eval_batches):
try:
validation_batch = validation_iterator.get_next()
validation_batch = next(validation_iter)
desc_validation_result, attn_validation_result = (
distributed_validation_step(validation_batch))
except tf.errors.OutOfRangeError:
......@@ -416,13 +441,17 @@ def main(argv):
print(' : attn:', attn_validation_result.numpy())
# Save checkpoint once (each save_interval*n, n \in N) steps.
# TODO(andrearaujo): save only in one of the two ways. They are
# identical, the only difference is that the manager adds some extra
# prefixes and variables (eg, optimizer variables).
if global_step_value % save_interval == 0:
save_path = manager.save()
logging.info('Saved({global_step_value}) at %s', save_path)
logging.info('Saved (%d) at %s', global_step_value, save_path)
file_path = '%s/delf_weights' % FLAGS.logdir
model.save_weights(file_path, save_format='tf')
logging.info('Saved weights({global_step_value}) at %s', file_path)
logging.info('Saved weights (%d) at %s', global_step_value,
file_path)
# Reset metrics for next step.
desc_train_accuracy.reset_states()
......
......@@ -22,7 +22,7 @@ install_requires = [
'pandas >= 0.24.2',
'numpy >= 1.16.1',
'scipy >= 1.2.2',
'tensorflow >= 2.0.0b1',
'tensorflow >= 2.2.0',
'tf_slim >= 1.1',
'tensorflow_probability >= 0.9.0',
]
......
# Contributing to the Tensorflow Object Detection API
# Contributing to the TensorFlow Object Detection API
Patches to Tensorflow Object Detection API are welcome!
Patches to TensorFlow Object Detection API are welcome!
We require contributors to fill out either the individual or corporate
Contributor License Agreement (CLA).
......@@ -9,5 +9,5 @@ Contributor License Agreement (CLA).
* If you work for a company that wants to allow you to contribute your work, then you'll need to sign a [corporate CLA](http://code.google.com/legal/corporate-cla-v1.0.html).
Please follow the
[Tensorflow contributing guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md)
[TensorFlow contributing guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md)
when submitting pull requests.
![TensorFlow Requirement: 1.15](https://img.shields.io/badge/TensorFlow%20Requirement-1.15-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Tensorflow Object Detection API
# TensorFlow Object Detection API
[![TensorFlow 2.2](https://img.shields.io/badge/TensorFlow-2.2-FF6F00?logo=tensorflow)](https://github.com/tensorflow/tensorflow/releases/tag/v2.2.0)
[![TensorFlow 1.15](https://img.shields.io/badge/TensorFlow-1.15-FF6F00?logo=tensorflow)](https://github.com/tensorflow/tensorflow/releases/tag/v1.15.0)
[![Python 3.6](https://img.shields.io/badge/Python-3.6-3776AB)](https://www.python.org/downloads/release/python-360/)
Creating accurate machine learning models capable of localizing and identifying
multiple objects in a single image remains a core challenge in computer vision.
......@@ -11,7 +11,7 @@ models. At Google we’ve certainly found this codebase to be useful for our
computer vision needs, and we hope that you will as well. <p align="center">
<img src="g3doc/img/kites_detections_output.jpg" width=676 height=450> </p>
Contributions to the codebase are welcome and we would love to hear back from
you if you find this API useful. Finally if you use the Tensorflow Object
you if you find this API useful. Finally if you use the TensorFlow Object
Detection API for a research publication, please consider citing:
```
......@@ -26,91 +26,110 @@ Song Y, Guadarrama S, Murphy K, CVPR 2017
<img src="g3doc/img/tf-od-api-logo.png" width=140 height=195>
</p>
## Maintainers
## Support for TensorFlow 2 and 1
The TensorFlow Object Detection API supports both TensorFlow 2 (TF2) and
TensorFlow 1 (TF1). A majority of the modules in the library are both TF1 and
TF2 compatible. In cases where they are not, we provide two versions.
Name | GitHub
-------------- | ---------------------------------------------
Jonathan Huang | [jch1](https://github.com/jch1)
Vivek Rathod | [tombstone](https://github.com/tombstone)
Ronny Votel | [ronnyvotel](https://github.com/ronnyvotel)
Derek Chow | [derekjchow](https://github.com/derekjchow)
Chen Sun | [jesu9](https://github.com/jesu9)
Menglong Zhu | [dreamdragon](https://github.com/dreamdragon)
Alireza Fathi | [afathi3](https://github.com/afathi3)
Zhichao Lu | [pkulzc](https://github.com/pkulzc)
## Table of contents
Setup:
* <a href='g3doc/installation.md'>Installation</a><br>
Quick Start:
* <a href='object_detection_tutorial.ipynb'>
Quick Start: Jupyter notebook for off-the-shelf inference</a><br>
* <a href="g3doc/running_pets.md">Quick Start: Training a pet detector</a><br>
Customizing a Pipeline:
* <a href='g3doc/configuring_jobs.md'>
Configuring an object detection pipeline</a><br>
* <a href='g3doc/preparing_inputs.md'>Preparing inputs</a><br>
Running:
* <a href='g3doc/running_locally.md'>Running locally</a><br>
* <a href='g3doc/running_on_cloud.md'>Running on the cloud</a><br>
Extras:
* <a href='g3doc/detection_model_zoo.md'>Tensorflow detection model zoo</a><br>
* <a href='g3doc/exporting_models.md'>
Exporting a trained model for inference</a><br>
* <a href='g3doc/tpu_exporters.md'>
Exporting a trained model for TPU inference</a><br>
* <a href='g3doc/defining_your_own_model.md'>
Defining your own model architecture</a><br>
* <a href='g3doc/using_your_own_dataset.md'>
Bringing in your own dataset</a><br>
* <a href='g3doc/evaluation_protocols.md'>
Supported object detection evaluation protocols</a><br>
* <a href='g3doc/oid_inference_and_evaluation.md'>
Inference and evaluation on the Open Images dataset</a><br>
* <a href='g3doc/instance_segmentation.md'>
Run an instance segmentation model</a><br>
* <a href='g3doc/challenge_evaluation.md'>
Run the evaluation for the Open Images Challenge 2018/2019</a><br>
* <a href='g3doc/tpu_compatibility.md'>
TPU compatible detection pipelines</a><br>
* <a href='g3doc/running_on_mobile_tensorflowlite.md'>
Running object detection on mobile devices with TensorFlow Lite</a><br>
* <a href='g3doc/context_rcnn.md'>
Context R-CNN documentation for data preparation, training, and export</a><br>
Although we will continue to maintain the TF1 models and provide support, we
encourage users to try the Object Detection API with TF2 for the following
reasons:
## Getting Help
* We provide new architectures supported in TF2 only and we will continue to
develop in TF2 going forward.
To get help with issues you may encounter using the Tensorflow Object Detection
API, create a new question on [StackOverflow](https://stackoverflow.com/) with
the tags "tensorflow" and "object-detection".
* The popular models we ported from TF1 to TF2 achieve the same performance.
Please report bugs (actually broken code, not usage questions) to the
tensorflow/models GitHub
[issue tracker](https://github.com/tensorflow/models/issues), prefixing the
issue name with "object_detection".
* A single training and evaluation binary now supports both GPU and TPU
distribution strategies making it possible to train models with synchronous
SGD by default.
* Eager execution with new binaries makes debugging easy!
Finally, if are an existing user of the Object Detection API we have retained
the same config language you are familiar with and ensured that the
TF2 training/eval binary takes the same arguments as our TF1 binaries.
Note: The models we provide in [TF2 Zoo](g3doc/tf2_detection_zoo.md) and
[TF1 Zoo](g3doc/tf1_detection_zoo.md) are specific to the TensorFlow major
version and are not interoperable.
Please select one of the links below for TensorFlow version-specific
documentation of the Object Detection API:
Please check [FAQ](g3doc/faq.md) for frequently asked questions before reporting
an issue.
<!-- mdlint off(WHITESPACE_LINE_LENGTH) -->
### Tensorflow 2.x
* <a href='g3doc/tf2.md'>
Object Detection API TensorFlow 2</a><br>
* <a href='g3doc/tf2_detection_zoo.md'>
TensorFlow 2 Model Zoo</a><br>
## Release information
### June 17th, 2020
### Tensorflow 1.x
* <a href='g3doc/tf1.md'>
Object Detection API TensorFlow 1</a><br>
* <a href='g3doc/tf1_detection_zoo.md'>
TensorFlow 1 Model Zoo</a><br>
<!-- mdlint on -->
## What's New
### TensorFlow 2 Support
We are happy to announce that the TF OD API officially supports TF2! Our release
includes:
* New binaries for train/eval/export that are designed to run in eager mode.
* A suite of TF2 compatible (Keras-based) models; this includes migrations of
our most popular TF1.x models (e.g., SSD with MobileNet, RetinaNet,
Faster R-CNN, Mask R-CNN), as well as a few new architectures for which we
will only maintain TF2 implementations:
1. CenterNet - a simple and effective anchor-free architecture based on
the recent [Objects as Points](https://arxiv.org/abs/1904.07850) paper by
Zhou et al.
2. [EfficientDet](https://arxiv.org/abs/1911.09070) - a recent family of
SOTA models discovered with the help of Neural Architecture Search.
* COCO pre-trained weights for all of the models provided as TF2 style
object-based checkpoints.
* Access to [Distribution Strategies](https://www.tensorflow.org/guide/distributed_training)
for distributed training --- our models are designed to be trainable using sync
multi-GPU and TPU platforms.
* Colabs demo’ing eager mode training and inference.
See our release blogpost [here](https://blog.tensorflow.org/2020/07/tensorflow-2-meets-object-detection-api.html).
If you are an existing user of the TF OD API using TF 1.x, don’t worry, we’ve
got you covered.
**Thanks to contributors**: Akhil Chinnakotla, Allen Lavoie, Anirudh Vegesana,
Anjali Sridhar, Austin Myers, Dan Kondratyuk, David Ross, Derek Chow, Jaeyoun
Kim, Jing Li, Jonathan Huang, Jordi Pont-Tuset, Karmel Allison, Kathy Ruan,
Kaushik Shivakumar, Lu He, Mingxing Tan, Pengchong Jin, Ronny Votel, Sara Beery,
Sergi Caelles Prat, Shan Yang, Sudheendra Vijayanarasimhan, Tina Tian, Tomer
Kaftan, Vighnesh Birodkar, Vishnu Banna, Vivek Rathod, Yanhui Liang, Yiming Shi,
Yixin Shi, Yu-hui Chen, Zhichao Lu.
### MobileDet GPU
We have released SSDLite with MobileDet GPU backbone, which achieves 17% mAP
higher than the MobileNetV2 SSDLite (27.5 mAP vs 23.5 mAP) on a NVIDIA Jetson
Xavier at comparable latency (3.2ms vs 3.3ms).
Along with the model definition, we are also releasing model checkpoints trained
on the COCO dataset.
<b>Thanks to contributors</b>: Yongzhe Wang, Bo Chen, Hanxiao Liu, Le An
(NVIDIA), Yu-Te Cheng (NVIDIA), Oliver Knieps (NVIDIA), and Josh Park (NVIDIA).
### Context R-CNN
We have released [Context R-CNN](https://arxiv.org/abs/1912.03538), a model that
uses attention to incorporate contextual information from images (e.g. from
temporally nearby frames taken by a static camera) in order to improve accuracy.
Importantly, these contextual images need not be labeled.
* When applied to a challenging wildlife detection dataset ([Snapshot Serengeti](http://lila.science/datasets/snapshot-serengeti)),
* When applied to a challenging wildlife detection dataset
([Snapshot Serengeti](http://lila.science/datasets/snapshot-serengeti)),
Context R-CNN with context from up to a month of images outperforms a
single-frame baseline by 17.9% mAP, and outperforms S3D (a 3d convolution
based baseline) by 11.2% mAP.
......@@ -118,280 +137,48 @@ Importantly, these contextual images need not be labeled.
novel camera deployment to improve performance at that camera, boosting
model generalizability.
Read about Context R-CNN on the Google AI blog
[here](https://ai.googleblog.com/2020/06/leveraging-temporal-context-for-object.html).
We have provided code for generating data with associated context
[here](g3doc/context_rcnn.md), and a sample config for a Context R-CNN
model [here](samples/configs/context_rcnn_resnet101_snapshot_serengeti_sync.config).
[here](g3doc/context_rcnn.md), and a sample config for a Context R-CNN model
[here](samples/configs/context_rcnn_resnet101_snapshot_serengeti_sync.config).
Snapshot Serengeti-trained Faster R-CNN and Context R-CNN models can be found in
the [model zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md#snapshot-serengeti-camera-trap-trained-models).
the
[model zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf1_detection_zoo.md#snapshot-serengeti-camera-trap-trained-models).
A colab demonstrating Context R-CNN is provided
[here](colab_tutorials/context_rcnn_tutorial.ipynb).
<b>Thanks to contributors</b>: Sara Beery, Jonathan Huang, Guanhang Wu, Vivek
Rathod, Ronny Votel, Zhichao Lu, David Ross, Pietro Perona, Tanya Birch, and
the Wildlife Insights AI Team.
### May 19th, 2020
We have released [MobileDets](https://arxiv.org/abs/2004.14525), a set of
high-performance models for mobile CPUs, DSPs and EdgeTPUs.
* MobileDets outperform MobileNetV3+SSDLite by 1.7 mAP at comparable mobile
CPU inference latencies. MobileDets also outperform MobileNetV2+SSDLite by
1.9 mAP on mobile CPUs, 3.7 mAP on EdgeTPUs and 3.4 mAP on DSPs while
running equally fast. MobileDets also offer up to 2x speedup over MnasFPN on
EdgeTPUs and DSPs.
For each of the three hardware platforms we have released model definition,
model checkpoints trained on the COCO14 dataset and converted TFLite models in
fp32 and/or uint8.
<b>Thanks to contributors</b>: Yunyang Xiong, Hanxiao Liu, Suyog Gupta, Berkin
Akin, Gabriel Bender, Pieter-Jan Kindermans, Mingxing Tan, Vikas Singh, Bo Chen,
Quoc Le, Zhichao Lu.
### May 7th, 2020
We have released a mobile model with the
[MnasFPN head](https://arxiv.org/abs/1912.01106).
* MnasFPN with MobileNet-V2 backbone is the most accurate (26.6 mAP at 183ms
on Pixel 1) mobile detection model we have released to date. With
depth-multiplier, MnasFPN with MobileNet-V2 backbone is 1.8 mAP higher than
MobileNet-V3-Large with SSDLite (23.8 mAP vs 22.0 mAP) at similar latency
(120ms) on Pixel 1.
We have released model definition, model checkpoints trained on the COCO14
dataset and a converted TFLite model.
<b>Thanks to contributors</b>: Bo Chen, Golnaz Ghiasi, Hanxiao Liu, Tsung-Yi
Lin, Dmitry Kalenichenko, Hartwig Adam, Quoc Le, Zhichao Lu, Jonathan Huang, Hao
Xu.
### Nov 13th, 2019
We have released MobileNetEdgeTPU SSDLite model.
* SSDLite with MobileNetEdgeTPU backbone, which achieves 10% mAP higher than
MobileNetV2 SSDLite (24.3 mAP vs 22 mAP) on a Google Pixel4 at comparable
latency (6.6ms vs 6.8ms).
Along with the model definition, we are also releasing model checkpoints trained
on the COCO dataset.
<b>Thanks to contributors</b>: Yunyang Xiong, Bo Chen, Suyog Gupta, Hanxiao Liu,
Gabriel Bender, Mingxing Tan, Berkin Akin, Zhichao Lu, Quoc Le
### Oct 15th, 2019
We have released two MobileNet V3 SSDLite models (presented in
[Searching for MobileNetV3](https://arxiv.org/abs/1905.02244)).
* SSDLite with MobileNet-V3-Large backbone, which is 27% faster than Mobilenet
V2 SSDLite (119ms vs 162ms) on a Google Pixel phone CPU at the same mAP.
* SSDLite with MobileNet-V3-Small backbone, which is 37% faster than MnasNet
SSDLite reduced with depth-multiplier (43ms vs 68ms) at the same mAP.
Along with the model definition, we are also releasing model checkpoints trained
on the COCO dataset.
<b>Thanks to contributors</b>: Bo Chen, Zhichao Lu, Vivek Rathod, Jonathan Huang
### July 1st, 2019
We have released an updated set of utils and an updated
[tutorial](g3doc/challenge_evaluation.md) for all three tracks of the
[Open Images Challenge 2019](https://storage.googleapis.com/openimages/web/challenge2019.html)!
The Instance Segmentation metric for
[Open Images V5](https://storage.googleapis.com/openimages/web/index.html) and
[Challenge 2019](https://storage.googleapis.com/openimages/web/challenge2019.html)
is part of this release. Check out
[the metric description](https://storage.googleapis.com/openimages/web/evaluation.html#instance_segmentation_eval)
on the Open Images website.
<b>Thanks to contributors</b>: Alina Kuznetsova, Rodrigo Benenson
## Release Notes
See [notes](g3doc/release_notes.md) for all past releases.
### Feb 11, 2019
We have released detection models trained on the Open Images Dataset V4 in our
detection model zoo, including
* Faster R-CNN detector with Inception Resnet V2 feature extractor
* SSD detector with MobileNet V2 feature extractor
* SSD detector with ResNet 101 FPN feature extractor (aka RetinaNet-101)
<b>Thanks to contributors</b>: Alina Kuznetsova, Yinxiao Li
### Sep 17, 2018
We have released Faster R-CNN detectors with ResNet-50 / ResNet-101 feature
extractors trained on the
[iNaturalist Species Detection Dataset](https://github.com/visipedia/inat_comp/blob/master/2017/README.md#bounding-boxes).
The models are trained on the training split of the iNaturalist data for 4M
iterations and achieve 55% and 58% mean AP@.5 over 2854 classes, respectively.
For more details please refer to this [paper](https://arxiv.org/abs/1707.06642).
<b>Thanks to contributors</b>: Chen Sun
### July 13, 2018
There are many new updates in this release, extending the functionality and
capability of the API:
* Moving from slim-based training to
[Estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator)-based
training.
* Support for [RetinaNet](https://arxiv.org/abs/1708.02002), and a
[MobileNet](https://ai.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
adaptation of RetinaNet.
* A novel SSD-based architecture called the
[Pooling Pyramid Network](https://arxiv.org/abs/1807.03284) (PPN).
* Releasing several [TPU](https://cloud.google.com/tpu/)-compatible models.
These can be found in the `samples/configs/` directory with a comment in the
pipeline configuration files indicating TPU compatibility.
* Support for quantized training.
* Updated documentation for new binaries, Cloud training, and
[Tensorflow Lite](https://www.tensorflow.org/mobile/tflite/).
See also our
[expanded announcement blogpost](https://ai.googleblog.com/2018/07/accelerated-training-and-inference-with.html)
and accompanying tutorial at the
[TensorFlow blog](https://medium.com/tensorflow/training-and-serving-a-realtime-mobile-object-detector-in-30-minutes-with-cloud-tpus-b78971cf1193).
<b>Thanks to contributors</b>: Sara Robinson, Aakanksha Chowdhery, Derek Chow,
Pengchong Jin, Jonathan Huang, Vivek Rathod, Zhichao Lu, Ronny Votel
### June 25, 2018
Additional evaluation tools for the
[Open Images Challenge 2018](https://storage.googleapis.com/openimages/web/challenge.html)
are out. Check out our short tutorial on data preparation and running evaluation
[here](g3doc/challenge_evaluation.md)!
<b>Thanks to contributors</b>: Alina Kuznetsova
### June 5, 2018
We have released the implementation of evaluation metrics for both tracks of the
[Open Images Challenge 2018](https://storage.googleapis.com/openimages/web/challenge.html)
as a part of the Object Detection API - see the
[evaluation protocols](g3doc/evaluation_protocols.md) for more details.
Additionally, we have released a tool for hierarchical labels expansion for the
Open Images Challenge: check out
[oid_hierarchical_labels_expansion.py](dataset_tools/oid_hierarchical_labels_expansion.py).
<b>Thanks to contributors</b>: Alina Kuznetsova, Vittorio Ferrari, Jasper
Uijlings
### April 30, 2018
We have released a Faster R-CNN detector with ResNet-101 feature extractor
trained on [AVA](https://research.google.com/ava/) v2.1. Compared with other
commonly used object detectors, it changes the action classification loss
function to per-class Sigmoid loss to handle boxes with multiple labels. The
model is trained on the training split of AVA v2.1 for 1.5M iterations and
achieves a mean AP of 11.25% over 60 classes on the validation split of AVA v2.1.
For more details please refer to this [paper](https://arxiv.org/abs/1705.08421).
<b>Thanks to contributors</b>: Chen Sun, David Ross
### April 2, 2018
Supercharge your mobile phones with the next generation mobile object detector!
We are adding support for MobileNet V2 with SSDLite presented in
[MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381).
This model is 35% faster than Mobilenet V1 SSD on a Google Pixel phone CPU
(200ms vs. 270ms) at the same accuracy. Along with the model definition, we are
also releasing a model checkpoint trained on the COCO dataset.
<b>Thanks to contributors</b>: Menglong Zhu, Mark Sandler, Zhichao Lu, Vivek
Rathod, Jonathan Huang
### February 9, 2018
We now support instance segmentation! This API update adds a number of
instance segmentation models similar to those discussed in the
[Mask R-CNN paper](https://arxiv.org/abs/1703.06870). For further details refer
to [our slides](http://presentations.cocodataset.org/Places17-GMRI.pdf) from the
2017 Coco + Places Workshop. Refer to the section on
[Running an Instance Segmentation Model](g3doc/instance_segmentation.md) for
instructions on how to configure a model that predicts masks in addition to
object bounding boxes.
<b>Thanks to contributors</b>: Alireza Fathi, Zhichao Lu, Vivek Rathod, Ronny
Votel, Jonathan Huang
### November 17, 2017
As a part of the Open Images V3 release we have released:
* An implementation of the Open Images evaluation metric and the
[protocol](g3doc/evaluation_protocols.md#open-images).
* Additional tools to separate inference of detection and evaluation (see
[this tutorial](g3doc/oid_inference_and_evaluation.md)).
* A new detection model trained on the Open Images V2 data release (see
[Open Images model](g3doc/detection_model_zoo.md#open-images-models)).
See more information on the
[Open Images website](https://github.com/openimages/dataset)!
<b>Thanks to contributors</b>: Stefan Popov, Alina Kuznetsova
### November 6, 2017
We have re-released faster versions of our (pre-trained) models in the
<a href='g3doc/detection_model_zoo.md'>model zoo</a>. In addition to what was
available before, we are also adding Faster R-CNN models trained on COCO with
Inception V2 and Resnet-50 feature extractors, as well as a Faster R-CNN with
Resnet-101 model trained on the KITTI dataset.
<b>Thanks to contributors</b>: Jonathan Huang, Vivek Rathod, Derek Chow, Tal
Remez, Chen Sun.
### October 31, 2017
We have released a new state-of-the-art model for object detection using
Faster R-CNN with
[NASNet-A image featurization](https://arxiv.org/abs/1707.07012). This model
achieves an mAP of 43.1% on the COCO test-dev set, improving on the best
available model in the zoo by 6% absolute mAP.
<b>Thanks to contributors</b>: Barret Zoph, Vijay Vasudevan, Jonathon Shlens,
Quoc Le
### August 11, 2017
We have released an update to the
[Android Detect demo](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android)
which will now run models trained using the Tensorflow Object Detection API on
an Android device. By default, it currently runs a frozen SSD w/Mobilenet
detector trained on COCO, but we encourage you to try out other detection
models!
<b>Thanks to contributors</b>: Jonathan Huang, Andrew Harp
### June 15, 2017
In addition to our base Tensorflow detection model definitions, this release
includes:
* A selection of trainable detection models, including:
  * Single Shot Multibox Detector (SSD) with MobileNet,
  * SSD with Inception V2,
  * Region-Based Fully Convolutional Networks (R-FCN) with Resnet 101,
  * Faster RCNN with Resnet 101,
  * Faster RCNN with Inception Resnet v2
* Frozen weights (trained on the COCO dataset) for each of the above models to
  be used for out-of-the-box inference purposes.
* A [Jupyter notebook](colab_tutorials/object_detection_tutorial.ipynb) for
  performing out-of-the-box inference with one of our released models
* Convenient [local training](g3doc/running_locally.md) scripts as well as
  distributed training and evaluation pipelines via
  [Google Cloud](g3doc/running_on_cloud.md).
<b>Thanks to contributors</b>: Jonathan Huang, Vivek Rathod, Derek Chow, Chen
Sun, Menglong Zhu, Matthew Tang, Anoop Korattikara, Alireza Fathi, Ian Fischer,
Zbigniew Wojna, Yang Song, Sergio Guadarrama, Jasper Uijlings, Viacheslav
Kovalevskyi, Kevin Murphy
## Getting Help
To get help with issues you may encounter using the TensorFlow Object Detection
API, create a new question on [StackOverflow](https://stackoverflow.com/) with
the tags "tensorflow" and "object-detection".
Please report bugs (actually broken code, not usage questions) to the
tensorflow/models GitHub
[issue tracker](https://github.com/tensorflow/models/issues), prefixing the
issue name with "object_detection".
Please check the [FAQ](g3doc/faq.md) for frequently asked questions before
reporting an issue.
## Maintainers
* Jonathan Huang ([@GitHub jch1](https://github.com/jch1))
* Vivek Rathod ([@GitHub tombstone](https://github.com/tombstone))
* Vighnesh Birodkar ([@GitHub vighneshbirodkar](https://github.com/vighneshbirodkar))
* Austin Myers ([@GitHub austin-myers](https://github.com/austin-myers))
* Zhichao Lu ([@GitHub pkulzc](https://github.com/pkulzc))
* Ronny Votel ([@GitHub ronnyvotel](https://github.com/ronnyvotel))
* Yu-hui Chen ([@GitHub yuhuichen1015](https://github.com/yuhuichen1015))
* Derek Chow ([@GitHub derekjchow](https://github.com/derekjchow))
......@@ -17,9 +17,8 @@
"""Tests for box_predictor_builder."""
import unittest
import mock
from unittest import mock # pylint: disable=g-importing-member
import tensorflow.compat.v1 as tf
from google.protobuf import text_format
from object_detection.builders import box_predictor_builder
from object_detection.builders import hyperparams_builder
......
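For reference, the hunk above swaps the third-party `mock` package for the
standard-library `unittest.mock` module. Below is a minimal, self-contained
sketch of a test using the new import style; the patched target is arbitrary
and not taken from the repository.
```python
import os
import unittest
from unittest import mock  # standard library; no external `mock` dependency


class MockImportStyleTest(unittest.TestCase):

  def test_patching_with_unittest_mock(self):
    # The mock API is unchanged; only the import location differs.
    with mock.patch.object(os, 'getcwd', return_value='/tmp/fake') as fake_cwd:
      self.assertEqual(os.getcwd(), '/tmp/fake')
      fake_cwd.assert_called_once()


if __name__ == '__main__':
  unittest.main()
```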
......@@ -390,7 +390,7 @@ class DatasetBuilderTest(test_case.TestCase):
return iter1.get_next(), iter2.get_next()
output_dict1, output_dict2 = self.execute(graph_fn, [])
self.assertAllEqual(['0'], output_dict1[fields.InputDataFields.source_id])
self.assertAllEqual([b'0'], output_dict1[fields.InputDataFields.source_id])
self.assertEqual([b'1'], output_dict2[fields.InputDataFields.source_id])
def test_sample_one_of_n_shards(self):
......
......@@ -58,7 +58,8 @@ def build(input_reader_config):
use_display_name=input_reader_config.use_display_name,
num_additional_channels=input_reader_config.num_additional_channels,
num_keypoints=input_reader_config.num_keypoints,
expand_hierarchy_labels=input_reader_config.expand_labels_hierarchy)
expand_hierarchy_labels=input_reader_config.expand_labels_hierarchy,
load_dense_pose=input_reader_config.load_dense_pose)
return decoder
elif input_type == input_reader_pb2.InputType.Value('TF_SEQUENCE_EXAMPLE'):
decoder = tf_sequence_example_decoder.TfSequenceExampleDecoder(
......
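The hunk above threads a new `load_dense_pose` option from the input reader
config through to the TF Example decoder. A hedged sketch of setting that field
on an `InputReader` proto follows; the value is illustrative, and only fields
referenced in the hunk are used.
```python
from google.protobuf import text_format
from object_detection.protos import input_reader_pb2

# Illustrative config: `load_dense_pose` is the field the builder above now
# forwards to the decoder; all other fields keep their defaults.
input_reader_config = text_format.Parse(
    'load_dense_pose: true', input_reader_pb2.InputReader())

# The build() function shown above passes this through as
# load_dense_pose=input_reader_config.load_dense_pose.
assert input_reader_config.load_dense_pose
```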
......@@ -14,7 +14,7 @@
# ==============================================================================
"""Tests for graph_rewriter_builder."""
import unittest
import mock
from unittest import mock # pylint: disable=g-importing-member
import tensorflow.compat.v1 as tf
import tf_slim as slim
......
......@@ -16,6 +16,7 @@
"""A function to build a DetectionModel from configuration."""
import functools
import sys
from object_detection.builders import anchor_generator_builder
from object_detection.builders import box_coder_builder
from object_detection.builders import box_predictor_builder
......@@ -38,6 +39,7 @@ from object_detection.protos import losses_pb2
from object_detection.protos import model_pb2
from object_detection.utils import label_map_util
from object_detection.utils import ops
from object_detection.utils import spatial_transform_ops as spatial_ops
from object_detection.utils import tf_version
## Feature Extractors for TF
......@@ -47,16 +49,20 @@ from object_detection.utils import tf_version
# pylint: disable=g-import-not-at-top
if tf_version.is_tf2():
from object_detection.models import center_net_hourglass_feature_extractor
from object_detection.models import center_net_mobilenet_v2_feature_extractor
from object_detection.models import center_net_resnet_feature_extractor
from object_detection.models import center_net_resnet_v1_fpn_feature_extractor
from object_detection.models import faster_rcnn_inception_resnet_v2_keras_feature_extractor as frcnn_inc_res_keras
from object_detection.models import faster_rcnn_resnet_keras_feature_extractor as frcnn_resnet_keras
from object_detection.models import ssd_resnet_v1_fpn_keras_feature_extractor as ssd_resnet_v1_fpn_keras
from object_detection.models import faster_rcnn_resnet_v1_fpn_keras_feature_extractor as frcnn_resnet_fpn_keras
from object_detection.models.ssd_mobilenet_v1_fpn_keras_feature_extractor import SSDMobileNetV1FpnKerasFeatureExtractor
from object_detection.models.ssd_mobilenet_v1_keras_feature_extractor import SSDMobileNetV1KerasFeatureExtractor
from object_detection.models.ssd_mobilenet_v2_fpn_keras_feature_extractor import SSDMobileNetV2FpnKerasFeatureExtractor
from object_detection.models.ssd_mobilenet_v2_keras_feature_extractor import SSDMobileNetV2KerasFeatureExtractor
from object_detection.predictors import rfcn_keras_box_predictor
if sys.version_info[0] >= 3:
from object_detection.models import ssd_efficientnet_bifpn_feature_extractor as ssd_efficientnet_bifpn
if tf_version.is_tf1():
from object_detection.models import faster_rcnn_inception_resnet_v2_feature_extractor as frcnn_inc_res
......@@ -98,6 +104,22 @@ if tf_version.is_tf2():
ssd_resnet_v1_fpn_keras.SSDResNet101V1FpnKerasFeatureExtractor,
'ssd_resnet152_v1_fpn_keras':
ssd_resnet_v1_fpn_keras.SSDResNet152V1FpnKerasFeatureExtractor,
'ssd_efficientnet-b0_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB0BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b1_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB1BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b2_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB2BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b3_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB3BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b4_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB4BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b5_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB5BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b6_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB6BiFPNKerasFeatureExtractor,
'ssd_efficientnet-b7_bifpn_keras':
ssd_efficientnet_bifpn.SSDEfficientNetB7BiFPNKerasFeatureExtractor,
}
FASTER_RCNN_KERAS_FEATURE_EXTRACTOR_CLASS_MAP = {
......@@ -109,16 +131,29 @@ if tf_version.is_tf2():
frcnn_resnet_keras.FasterRCNNResnet152KerasFeatureExtractor,
'faster_rcnn_inception_resnet_v2_keras':
frcnn_inc_res_keras.FasterRCNNInceptionResnetV2KerasFeatureExtractor,
'faster_rcnn_resnet50_fpn_keras':
frcnn_resnet_fpn_keras.FasterRCNNResnet50FpnKerasFeatureExtractor,
'faster_rcnn_resnet101_fpn_keras':
frcnn_resnet_fpn_keras.FasterRCNNResnet101FpnKerasFeatureExtractor,
'faster_rcnn_resnet152_fpn_keras':
frcnn_resnet_fpn_keras.FasterRCNNResnet152FpnKerasFeatureExtractor,
}
CENTER_NET_EXTRACTOR_FUNCTION_MAP = {
'resnet_v2_50': center_net_resnet_feature_extractor.resnet_v2_50,
'resnet_v2_101': center_net_resnet_feature_extractor.resnet_v2_101,
'resnet_v1_18_fpn':
center_net_resnet_v1_fpn_feature_extractor.resnet_v1_18_fpn,
'resnet_v1_34_fpn':
center_net_resnet_v1_fpn_feature_extractor.resnet_v1_34_fpn,
'resnet_v1_50_fpn':
center_net_resnet_v1_fpn_feature_extractor.resnet_v1_50_fpn,
'resnet_v1_101_fpn':
center_net_resnet_v1_fpn_feature_extractor.resnet_v1_101_fpn,
'hourglass_104': center_net_hourglass_feature_extractor.hourglass_104,
'hourglass_104':
center_net_hourglass_feature_extractor.hourglass_104,
'mobilenet_v2':
center_net_mobilenet_v2_feature_extractor.mobilenet_v2,
}
FEATURE_EXTRACTOR_MAPS = [
......@@ -303,6 +338,14 @@ def _build_ssd_feature_extractor(feature_extractor_config,
feature_extractor_config.fpn.additional_layer_depth,
})
if feature_extractor_config.HasField('bifpn'):
kwargs.update({
'bifpn_min_level': feature_extractor_config.bifpn.min_level,
'bifpn_max_level': feature_extractor_config.bifpn.max_level,
'bifpn_num_iterations': feature_extractor_config.bifpn.num_iterations,
'bifpn_num_filters': feature_extractor_config.bifpn.num_filters,
'bifpn_combine_method': feature_extractor_config.bifpn.combine_method,
})
return feature_extractor_class(**kwargs)
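Together with the `ssd_efficientnet-bX_bifpn_keras` entries registered above,
the new `bifpn` branch suggests roughly how an EfficientNet-BiFPN extractor
could be selected in a pipeline config. This is a hedged sketch, not a complete
config: the numeric values are illustrative assumptions, and only field names
visible in the diff are used.
```python
from google.protobuf import text_format
from object_detection.protos import model_pb2

# Partial SSD config exercising the bifpn fields mapped into `kwargs` above.
# A real pipeline config would also need num_classes, box_predictor,
# anchor_generator, and the other SSD fields.
ssd_config_text = """
ssd {
  feature_extractor {
    type: 'ssd_efficientnet-b0_bifpn_keras'
    bifpn {
      min_level: 3
      max_level: 7
      num_iterations: 3
      num_filters: 64
    }
  }
}
"""
model_proto = text_format.Parse(ssd_config_text, model_pb2.DetectionModel())
# model_builder.build(model_proto, is_training=True) would then resolve the
# extractor class from the SSD Keras feature-extractor map and pass the bifpn
# values through _build_ssd_feature_extractor as shown above.
```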
......@@ -614,8 +657,9 @@ def _build_faster_rcnn_model(frcnn_config, is_training, add_summaries):
second_stage_localization_loss_weight)
crop_and_resize_fn = (
ops.matmul_crop_and_resize if frcnn_config.use_matmul_crop_and_resize
else ops.native_crop_and_resize)
spatial_ops.multilevel_matmul_crop_and_resize
if frcnn_config.use_matmul_crop_and_resize
else spatial_ops.multilevel_native_crop_and_resize)
clip_anchors_to_image = (
frcnn_config.clip_anchors_to_image)
......@@ -836,6 +880,22 @@ def mask_proto_to_params(mask_config):
heatmap_bias_init=mask_config.heatmap_bias_init)
def densepose_proto_to_params(densepose_config):
"""Converts CenterNet.DensePoseEstimation proto to parameter namedtuple."""
classification_loss, localization_loss, _, _, _, _, _ = (
losses_builder.build(densepose_config.loss))
return center_net_meta_arch.DensePoseParams(
class_id=densepose_config.class_id,
classification_loss=classification_loss,
localization_loss=localization_loss,
part_loss_weight=densepose_config.part_loss_weight,
coordinate_loss_weight=densepose_config.coordinate_loss_weight,
num_parts=densepose_config.num_parts,
task_loss_weight=densepose_config.task_loss_weight,
upsample_to_input_res=densepose_config.upsample_to_input_res,
heatmap_bias_init=densepose_config.heatmap_bias_init)
def _build_center_net_model(center_net_config, is_training, add_summaries):
"""Build a CenterNet detection model.
......@@ -888,6 +948,11 @@ def _build_center_net_model(center_net_config, is_training, add_summaries):
if center_net_config.HasField('mask_estimation_task'):
mask_params = mask_proto_to_params(center_net_config.mask_estimation_task)
densepose_params = None
if center_net_config.HasField('densepose_estimation_task'):
densepose_params = densepose_proto_to_params(
center_net_config.densepose_estimation_task)
return center_net_meta_arch.CenterNetMetaArch(
is_training=is_training,
add_summaries=add_summaries,
......@@ -897,7 +962,8 @@ def _build_center_net_model(center_net_config, is_training, add_summaries):
object_center_params=object_center_params,
object_detection_params=object_detection_params,
keypoint_params_dict=keypoint_params_dict,
mask_params=mask_params)
mask_params=mask_params,
densepose_params=densepose_params)
def _build_center_net_feature_extractor(
......
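The new DensePose plumbing above implies a `densepose_estimation_task` block
inside the CenterNet model config. A hedged sketch using only the fields read
by `densepose_proto_to_params` follows; the values are illustrative, and a real
config would also set the feature extractor, the loss, and the other CenterNet
tasks.
```python
from google.protobuf import text_format
from object_detection.protos import model_pb2

# Partial CenterNet config; field names mirror densepose_proto_to_params above.
center_net_text = """
center_net {
  densepose_estimation_task {
    class_id: 0
    num_parts: 24
    part_loss_weight: 1.0
    coordinate_loss_weight: 0.1
    task_loss_weight: 0.5
    upsample_to_input_res: true
    heatmap_bias_init: -2.19
  }
}
"""
model_proto = text_format.Parse(center_net_text, model_pb2.DetectionModel())
# _build_center_net_model() checks HasField('densepose_estimation_task') and
# converts this block into a DensePoseParams namedtuple, which is then passed
# to CenterNetMetaArch as densepose_params.
```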
......@@ -39,6 +39,9 @@ class ModelBuilderTest(test_case.TestCase, parameterized.TestCase):
def ssd_feature_extractors(self):
raise NotImplementedError
def get_override_base_feature_extractor_hyperparams(self, extractor_type):
raise NotImplementedError
def faster_rcnn_feature_extractors(self):
raise NotImplementedError
......@@ -70,7 +73,6 @@ class ModelBuilderTest(test_case.TestCase, parameterized.TestCase):
}
}
}
override_base_feature_extractor_hyperparams: true
}
box_coder {
faster_rcnn_box_coder {
......@@ -205,6 +207,8 @@ class ModelBuilderTest(test_case.TestCase, parameterized.TestCase):
for extractor_type, extractor_class in self.ssd_feature_extractors().items(
):
model_proto.ssd.feature_extractor.type = extractor_type
model_proto.ssd.feature_extractor.override_base_feature_extractor_hyperparams = (
self.get_override_base_feature_extractor_hyperparams(extractor_type))
model = model_builder.build(model_proto, is_training=True)
self.assertIsInstance(model, ssd_meta_arch.SSDMetaArch)
self.assertIsInstance(model._feature_extractor, extractor_class)
......