"git@developer.sourcefind.cn:zhaoyu6/sglang.git" did not exist on "f1cf6eefbec615bfec1c026f29c0f5bb06f00ba6"
Commit c8354cb4 authored by Kaushik Shivakumar's avatar Kaushik Shivakumar
Browse files

remove wrong files

parent 9efe44f1
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""A Beam job to generate embedding data for camera trap images.
This tool runs inference with an exported Object Detection model in
`saved_model` format and produces raw embeddings for camera trap data. These
embeddings contain an object-centric feature embedding from Faster R-CNN, the
datetime that the image was taken (normalized in a specific way), and the
position of the object of interest. By default, only the highest-scoring object
embedding is included.
Steps to generate an embedding dataset:
1. Use object_detection/export_inference_graph.py to get a Faster R-CNN
`saved_model` for inference. The input node must accept a tf.Example proto.
2. Run this tool with the `saved_model` from step 1 and a TFRecord of tf.Example
protos containing images for inference.
Example Usage:
--------------
python tensorflow_models/object_detection/export_inference_graph.py \
--alsologtostderr \
--input_type tf_example \
--pipeline_config_path path/to/faster_rcnn_model.config \
--trained_checkpoint_prefix path/to/model.ckpt \
--output_directory path/to/exported_model_directory \
--additional_output_tensor_names detection_features
python generate_embedding_data.py \
--alsologtostderr \
--embedding_input_tfrecord path/to/input_tfrecords* \
--embedding_output_tfrecord path/to/output_tfrecords \
--embedding_model_dir path/to/exported_model_directory/saved_model
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import datetime
import os
import threading
import numpy as np
import six
import tensorflow.compat.v1 as tf
try:
import apache_beam as beam # pylint:disable=g-import-not-at-top
except ModuleNotFoundError:
pass
class GenerateEmbeddingDataFn(beam.DoFn):
"""Generates embedding data for camera trap images.
This Beam DoFn performs inference with an object detection `saved_model` and
produces contextual embedding vectors.
"""
session_lock = threading.Lock()
def __init__(self, model_dir, top_k_embedding_count,
bottom_k_embedding_count):
"""Initialization function.
Args:
model_dir: A directory containing the saved model.
top_k_embedding_count: The number of high-confidence embeddings to store.
bottom_k_embedding_count: The number of low-confidence embeddings to store.
"""
self._model_dir = model_dir
self._session = None
self._num_examples_processed = beam.metrics.Metrics.counter(
'embedding_data_generation', 'num_tf_examples_processed')
self._top_k_embedding_count = top_k_embedding_count
self._bottom_k_embedding_count = bottom_k_embedding_count
def start_bundle(self):
self._load_inference_model()
def _load_inference_model(self):
# Because initialization of the tf.Session is expensive we share
# one instance across all threads in the worker. This is possible since
# tf.Session.run() is thread safe.
with self.session_lock:
if self._session is None:
graph = tf.Graph()
self._session = tf.Session(graph=graph)
with graph.as_default():
meta_graph = tf.saved_model.loader.load(
self._session, [tf.saved_model.tag_constants.SERVING],
self._model_dir)
signature = meta_graph.signature_def['serving_default']
input_tensor_name = signature.inputs['input_tensor'].name
detection_features_name = signature.outputs['detection_features'].name
detection_boxes_name = signature.outputs['detection_boxes'].name
num_detections_name = signature.outputs['num_detections'].name
self._input = graph.get_tensor_by_name(input_tensor_name)
self._embedding_node = graph.get_tensor_by_name(detection_features_name)
self._box_node = graph.get_tensor_by_name(detection_boxes_name)
self._scores_node = graph.get_tensor_by_name(
signature.outputs['detection_scores'].name)
self._num_detections = graph.get_tensor_by_name(num_detections_name)
tf.logging.info(signature.outputs['detection_features'].name)
tf.logging.info(signature.outputs['detection_boxes'].name)
tf.logging.info(signature.outputs['num_detections'].name)
def process(self, tfrecord_entry):
return self._run_inference_and_generate_embedding(tfrecord_entry)
def _run_inference_and_generate_embedding(self, tfrecord_entry):
input_example = tf.train.Example.FromString(tfrecord_entry)
# Convert date_captured datetime string to unix time integer and store
def get_date_captured(example):
date_captured = datetime.datetime.strptime(
six.ensure_str(
example.features.feature[
'image/date_captured'].bytes_list.value[0]),
'%Y-%m-%d %H:%M:%S')
return date_captured
try:
date_captured = get_date_captured(input_example)
except Exception: # pylint: disable=broad-except
# we require date_captured to be available for all images
return []
def embed_date_captured(date_captured):
"""Encodes the datetime of the image."""
embedded_date_captured = []
month_max = 12.0
day_max = 31.0
hour_max = 24.0
minute_max = 60.0
min_year = 1990.0
max_year = 2030.0
year = (date_captured.year-min_year)/float(max_year-min_year)
embedded_date_captured.append(year)
month = (date_captured.month-1)/month_max
embedded_date_captured.append(month)
day = (date_captured.day-1)/day_max
embedded_date_captured.append(day)
hour = date_captured.hour/hour_max
embedded_date_captured.append(hour)
minute = date_captured.minute/minute_max
embedded_date_captured.append(minute)
return np.asarray(embedded_date_captured)
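# Worked example (hypothetical input): an image captured at
# '2020-07-01 12:30:00' embeds as
# [(2020-1990)/40, (7-1)/12, (1-1)/31, 12/24, 30/60] = [0.75, 0.5, 0.0, 0.5, 0.5].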
def embed_position_and_size(box):
"""Encodes the bounding box of the object of interest."""
ymin = box[0]
xmin = box[1]
ymax = box[2]
xmax = box[3]
w = xmax - xmin
h = ymax - ymin
x = xmin + w / 2.0
y = ymin + h / 2.0
return np.asarray([x, y, w, h])
unix_time = (
(date_captured - datetime.datetime.fromtimestamp(0)).total_seconds())
example = tf.train.Example()
example.features.feature['image/unix_time'].float_list.value.extend(
[unix_time])
(detection_features, detection_boxes, num_detections,
detection_scores) = self._session.run(
[
self._embedding_node, self._box_node, self._num_detections[0],
self._scores_node
],
feed_dict={self._input: [tfrecord_entry]})
num_detections = int(num_detections)
embed_all = []
score_all = []
detection_features = np.asarray(detection_features)
def get_bb_embedding(detection_features, detection_boxes, detection_scores,
index):
embedding = detection_features[0][index]
pooled_embedding = np.mean(np.mean(embedding, axis=1), axis=0)
box = detection_boxes[0][index]
position_embedding = embed_position_and_size(box)
score = detection_scores[0][index]
return np.concatenate((pooled_embedding, position_embedding)), score
temporal_embedding = embed_date_captured(date_captured)
embedding_count = 0
for index in range(min(num_detections, self._top_k_embedding_count)):
bb_embedding, score = get_bb_embedding(
detection_features, detection_boxes, detection_scores, index)
embed_all.extend(bb_embedding)
embed_all.extend(temporal_embedding)
score_all.append(score)
embedding_count += 1
for index in range(
max(0, num_detections - 1),
max(-1, num_detections - 1 - self._bottom_k_embedding_count), -1):
bb_embedding, score = get_bb_embedding(
detection_features, detection_boxes, detection_scores, index)
embed_all.extend(bb_embedding)
embed_all.extend(temporal_embedding)
score_all.append(score)
embedding_count += 1
if embedding_count == 0:
bb_embedding, score = get_bb_embedding(
detection_features, detection_boxes, detection_scores, 0)
embed_all.extend(bb_embedding)
embed_all.extend(temporal_embedding)
score_all.append(score)
# Takes max in case embedding_count is 0.
embedding_length = len(embed_all) // max(1, embedding_count)
embed_all = np.asarray(embed_all)
example.features.feature['image/embedding'].float_list.value.extend(
embed_all)
example.features.feature['image/embedding_score'].float_list.value.extend(
score_all)
example.features.feature['image/embedding_length'].int64_list.value.append(
embedding_length)
example.features.feature['image/embedding_count'].int64_list.value.append(
embedding_count)
# Add other essential example attributes
example.features.feature['image/encoded'].bytes_list.value.extend(
input_example.features.feature['image/encoded'].bytes_list.value)
example.features.feature['image/height'].int64_list.value.extend(
input_example.features.feature['image/height'].int64_list.value)
example.features.feature['image/width'].int64_list.value.extend(
input_example.features.feature['image/width'].int64_list.value)
example.features.feature['image/source_id'].bytes_list.value.extend(
input_example.features.feature['image/source_id'].bytes_list.value)
example.features.feature['image/location'].bytes_list.value.extend(
input_example.features.feature['image/location'].bytes_list.value)
example.features.feature['image/date_captured'].bytes_list.value.extend(
input_example.features.feature['image/date_captured'].bytes_list.value)
example.features.feature['image/class/text'].bytes_list.value.extend(
input_example.features.feature['image/class/text'].bytes_list.value)
example.features.feature['image/class/label'].int64_list.value.extend(
input_example.features.feature['image/class/label'].int64_list.value)
example.features.feature['image/seq_id'].bytes_list.value.extend(
input_example.features.feature['image/seq_id'].bytes_list.value)
example.features.feature['image/seq_num_frames'].int64_list.value.extend(
input_example.features.feature['image/seq_num_frames'].int64_list.value)
example.features.feature['image/seq_frame_num'].int64_list.value.extend(
input_example.features.feature['image/seq_frame_num'].int64_list.value)
example.features.feature['image/object/bbox/ymax'].float_list.value.extend(
input_example.features.feature[
'image/object/bbox/ymax'].float_list.value)
example.features.feature['image/object/bbox/ymin'].float_list.value.extend(
input_example.features.feature[
'image/object/bbox/ymin'].float_list.value)
example.features.feature['image/object/bbox/xmax'].float_list.value.extend(
input_example.features.feature[
'image/object/bbox/xmax'].float_list.value)
example.features.feature['image/object/bbox/xmin'].float_list.value.extend(
input_example.features.feature[
'image/object/bbox/xmin'].float_list.value)
example.features.feature[
'image/object/class/score'].float_list.value.extend(
input_example.features.feature[
'image/object/class/score'].float_list.value)
example.features.feature[
'image/object/class/label'].int64_list.value.extend(
input_example.features.feature[
'image/object/class/label'].int64_list.value)
example.features.feature[
'image/object/class/text'].bytes_list.value.extend(
input_example.features.feature[
'image/object/class/text'].bytes_list.value)
self._num_examples_processed.inc(1)
return [example]
def construct_pipeline(pipeline, input_tfrecord, output_tfrecord, model_dir,
top_k_embedding_count, bottom_k_embedding_count,
num_shards):
"""Returns a beam pipeline to run object detection inference.
Args:
pipeline: Initialized beam pipeline.
input_tfrecord: A TFRecord of tf.train.Example protos containing images.
output_tfrecord: A TFRecord of tf.train.Example protos that contain images
in the input TFRecord and the detections from the model.
model_dir: Path to `saved_model` to use for inference.
top_k_embedding_count: The number of high-confidence embeddings to store.
bottom_k_embedding_count: The number of low-confidence embeddings to store.
num_shards: The number of output shards.
"""
input_collection = (
pipeline | 'ReadInputTFRecord' >> beam.io.tfrecordio.ReadFromTFRecord(
input_tfrecord,
coder=beam.coders.BytesCoder()))
output_collection = input_collection | 'ExtractEmbedding' >> beam.ParDo(
GenerateEmbeddingDataFn(model_dir, top_k_embedding_count,
bottom_k_embedding_count))
output_collection = output_collection | 'Reshuffle' >> beam.Reshuffle()
_ = output_collection | 'WritetoDisk' >> beam.io.tfrecordio.WriteToTFRecord(
output_tfrecord,
num_shards=num_shards,
coder=beam.coders.ProtoCoder(tf.train.Example))
def parse_args(argv):
"""Command-line argument parser.
Args:
argv: command line arguments
Returns:
beam_args: Arguments for the beam pipeline.
pipeline_args: Arguments for the pipeline options, such as runner type.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
'--embedding_input_tfrecord',
dest='embedding_input_tfrecord',
required=True,
help='TFRecord containing images in tf.Example format for object '
'detection.')
parser.add_argument(
'--embedding_output_tfrecord',
dest='embedding_output_tfrecord',
required=True,
help='TFRecord containing embeddings in tf.Example format.')
parser.add_argument(
'--embedding_model_dir',
dest='embedding_model_dir',
required=True,
help='Path to directory containing an object detection SavedModel with '
'detection_box_classifier_features in the output.')
parser.add_argument(
'--top_k_embedding_count',
dest='top_k_embedding_count',
default=1,
help='The number of top k embeddings to add to the memory bank.')
parser.add_argument(
'--bottom_k_embedding_count',
dest='bottom_k_embedding_count',
default=0,
help='The number of bottom k embeddings to add to the memory bank.')
parser.add_argument(
'--num_shards',
dest='num_shards',
default=0,
help='Number of output shards.')
beam_args, pipeline_args = parser.parse_known_args(argv)
return beam_args, pipeline_args
def main(argv=None, save_main_session=True):
"""Runs the Beam pipeline that performs inference.
Args:
argv: Command line arguments.
save_main_session: Whether to save the main session.
"""
args, pipeline_args = parse_args(argv)
pipeline_options = beam.options.pipeline_options.PipelineOptions(
pipeline_args)
pipeline_options.view_as(
beam.options.pipeline_options.SetupOptions).save_main_session = (
save_main_session)
dirname = os.path.dirname(args.embedding_output_tfrecord)
tf.io.gfile.makedirs(dirname)
p = beam.Pipeline(options=pipeline_options)
construct_pipeline(
p,
args.embedding_input_tfrecord,
args.embedding_output_tfrecord,
args.embedding_model_dir,
args.top_k_embedding_count,
args.bottom_k_embedding_count,
args.num_shards)
p.run()
if __name__ == '__main__':
main()
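# Hedged usage sketch (not part of the original tool): each output tf.Example
# stores a flat `image/embedding` float list together with
# `image/embedding_count` and `image/embedding_length`, so the per-detection
# vectors can be recovered with a reshape. Field names match the writer code
# above; the file path below is illustrative only.
#
#   import numpy as np
#   import tensorflow.compat.v1 as tf
#
#   for record in tf.python_io.tf_record_iterator(
#       'path/to/output_tfrecords-00000-of-00010'):
#     example = tf.train.Example.FromString(record)
#     flat = np.asarray(
#         example.features.feature['image/embedding'].float_list.value)
#     count = example.features.feature[
#         'image/embedding_count'].int64_list.value[0]
#     length = example.features.feature[
#         'image/embedding_length'].int64_list.value[0]
#     embeddings = flat.reshape([count, length])  # one row per stored detection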
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""A Beam job to generate embedding data for camera trap images.
This tool runs inference with an exported Object Detection model in
`saved_model` format and produces raw embeddings for camera trap data. These
embeddings contain an object-centric feature embedding from Faster R-CNN, the
datetime that the image was taken (normalized in a specific way), and the
position of the object of interest. By default, only the highest-scoring object
embedding is included.
Steps to generate an embedding dataset:
1. Use object_detection/export_inference_graph.py to get a Faster R-CNN
`saved_model` for inference. The input node must accept a tf.Example proto.
2. Run this tool with the `saved_model` from step 1 and a TFRecord of tf.Example
protos containing images for inference.
Example Usage:
--------------
python tensorflow_models/object_detection/export_inference_graph.py \
--alsologtostderr \
--input_type tf_example \
--pipeline_config_path path/to/faster_rcnn_model.config \
--trained_checkpoint_prefix path/to/model.ckpt \
--output_directory path/to/exported_model_directory \
--additional_output_tensor_names detection_features
python generate_embedding_data.py \
--alsologtostderr \
--embedding_input_tfrecord path/to/input_tfrecords* \
--embedding_output_tfrecord path/to/output_tfrecords \
--embedding_model_dir path/to/exported_model_directory/saved_model
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import datetime
import os
import threading
import numpy as np
import six
import tensorflow.compat.v1 as tf
try:
import apache_beam as beam # pylint:disable=g-import-not-at-top
except ModuleNotFoundError:
pass
def _load_inference_model(args):
# Because initialization of the tf.Session is expensive we share
# one instance across all threads in the worker. This is possible since
# tf.Session.run() is thread safe.
print(args)
args = vars(args)
session_lock = threading.Lock()
session = None
with session_lock:
if session is None:
graph = tf.Graph()
session = tf.Session(graph=graph)
with graph.as_default():
meta_graph = tf.saved_model.loader.load(
session, [tf.saved_model.tag_constants.SERVING],
args['embedding_model_dir'])
signature = meta_graph.signature_def['serving_default']
print(signature.inputs)
print(type(signature.inputs))
input_tensor_name = signature.inputs['input_tensor'].name
print(input_tensor_name)
_input = graph.get_tensor_by_name(input_tensor_name)
print(_input.shape)
detection_features_name = signature.outputs['detection_features'].name
detection_boxes_name = signature.outputs['detection_boxes'].name
num_detections_name = signature.outputs['num_detections'].name
# Look up the output tensors locally; this standalone debugging variant has no
# class instance, so there is no `self` to attach them to.
embedding_node = graph.get_tensor_by_name(detection_features_name)
box_node = graph.get_tensor_by_name(detection_boxes_name)
scores_node = graph.get_tensor_by_name(
signature.outputs['detection_scores'].name)
num_detections = graph.get_tensor_by_name(num_detections_name)
tf.logging.info(signature.outputs['detection_features'].name)
tf.logging.info(signature.outputs['detection_boxes'].name)
tf.logging.info(signature.outputs['num_detections'].name)
print("Hello")
def parse_args(argv):
"""Command-line argument parser.
Args:
argv: command line arguments
Returns:
beam_args: Arguments for the beam pipeline.
pipeline_args: Arguments for the pipeline options, such as runner type.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
'--embedding_input_tfrecord',
dest='embedding_input_tfrecord',
required=True,
help='TFRecord containing images in tf.Example format for object '
'detection.')
parser.add_argument(
'--embedding_output_tfrecord',
dest='embedding_output_tfrecord',
required=True,
help='TFRecord containing embeddings in tf.Example format.')
parser.add_argument(
'--embedding_model_dir',
dest='embedding_model_dir',
required=True,
help='Path to directory containing an object detection SavedModel with '
'detection_box_classifier_features in the output.')
parser.add_argument(
'--top_k_embedding_count',
dest='top_k_embedding_count',
default=1,
help='The number of top k embeddings to add to the memory bank.')
parser.add_argument(
'--bottom_k_embedding_count',
dest='bottom_k_embedding_count',
default=0,
help='The number of bottom k embeddings to add to the memory bank.')
parser.add_argument(
'--num_shards',
dest='num_shards',
default=0,
help='Number of output shards.')
beam_args, pipeline_args = parser.parse_known_args(argv)
return beam_args, pipeline_args
def main(argv=None, save_main_session=True):
"""Runs the Beam pipeline that performs inference.
Args:
argv: Command line arguments.
save_main_session: Whether to save the main session.
"""
args, pipeline_args = parse_args(argv)
_load_inference_model(args)
if __name__ == '__main__':
main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library functions for ContextRCNN."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow.compat.v1 as tf
import tf_slim as slim
# The negative value used in padding the invalid weights.
_NEGATIVE_PADDING_VALUE = -100000
def filter_weight_value(weights, values, valid_mask):
"""Filters weights and values based on valid_mask.
_NEGATIVE_PADDING_VALUE will be added to invalid elements in the weights to
avoid their contribution in softmax. 0 will be set for the invalid elements in
the values.
Args:
weights: A float Tensor of shape [batch_size, input_size, context_size].
values: A float Tensor of shape [batch_size, context_size,
projected_dimension].
valid_mask: A boolean Tensor of shape [batch_size, context_size]. True means
valid and False means invalid.
Returns:
weights: A float Tensor of shape [batch_size, input_size, context_size].
values: A float Tensor of shape [batch_size, context_size,
projected_dimension].
Raises:
ValueError: If the shapes of the inputs don't match.
"""
w_batch_size, _, w_context_size = weights.shape
v_batch_size, v_context_size, _ = values.shape
m_batch_size, m_context_size = valid_mask.shape
if w_batch_size != v_batch_size or v_batch_size != m_batch_size:
raise ValueError("Please make sure the first dimension of the input"
" tensors are the same.")
if w_context_size != v_context_size:
raise ValueError("Please make sure the third dimension of weights matches"
" the second dimension of values.")
if w_context_size != m_context_size:
raise ValueError("Please make sure the third dimension of the weights"
" matches the second dimension of the valid_mask.")
valid_mask = valid_mask[..., tf.newaxis]
# Force the invalid weights to be very negative so it won't contribute to
# the softmax.
weights += tf.transpose(
tf.cast(tf.math.logical_not(valid_mask), weights.dtype) *
_NEGATIVE_PADDING_VALUE,
perm=[0, 2, 1])
# Force the invalid values to be 0.
values *= tf.cast(valid_mask, values.dtype)
return weights, values
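# Example (mirrors the unit test below): for weights of shape [2, 3, 2] filled
# with 4, values of shape [2, 2, 4] filled with 1, and
# valid_mask = [[True, True], [True, False]], the invalid second context entry
# of the second batch element has _NEGATIVE_PADDING_VALUE added to its weights
# and its values zeroed out.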
def compute_valid_mask(num_valid_elements, num_elements):
"""Computes mask of valid entries within padded context feature.
Args:
num_valid_elements: An int32 Tensor of shape [batch_size].
num_elements: An int32 Tensor.
Returns:
A boolean Tensor of the shape [batch_size, num_elements]. True means
valid and False means invalid.
"""
batch_size = num_valid_elements.shape[0]
element_idxs = tf.range(num_elements, dtype=tf.int32)
batch_element_idxs = tf.tile(element_idxs[tf.newaxis, ...], [batch_size, 1])
num_valid_elements = num_valid_elements[..., tf.newaxis]
valid_mask = tf.less(batch_element_idxs, num_valid_elements)
return valid_mask
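# Example (mirrors the unit test below): with num_valid_elements = [1, 2] and
# num_elements = 3, the returned mask is
# [[True, False, False], [True, True, False]].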
def project_features(features, projection_dimension, is_training, normalize):
"""Projects features to another feature space.
Args:
features: A float Tensor of shape [batch_size, features_size,
num_features].
projection_dimension: An int32 Tensor.
is_training: A boolean Tensor (affecting batch normalization).
normalize: A boolean Tensor. If true, the output features will be l2
normalized on the last dimension.
Returns:
A float Tensor of shape [batch, features_size, projection_dimension].
"""
# TODO(guanhangwu) Figure out a better way of specifying the batch norm
# params.
batch_norm_params = {
"is_training": is_training,
"decay": 0.97,
"epsilon": 0.001,
"center": True,
"scale": True
}
batch_size, _, num_features = features.shape
features = tf.reshape(features, [-1, num_features])
projected_features = slim.fully_connected(
features,
num_outputs=projection_dimension,
activation_fn=tf.nn.relu6,
normalizer_fn=slim.batch_norm,
normalizer_params=batch_norm_params)
projected_features = tf.reshape(projected_features,
[batch_size, -1, projection_dimension])
if normalize:
projected_features = tf.math.l2_normalize(projected_features, axis=-1)
return projected_features
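# Example (shapes as exercised by the unit test below): features of shape
# [2, 3, 4] projected with projection_dimension = 2 yield a Tensor of shape
# [2, 3, 2].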
def attention_block(input_features, context_features, bottleneck_dimension,
output_dimension, attention_temperature, valid_mask,
is_training):
"""Generic attention block.
Args:
input_features: A float Tensor of shape [batch_size, input_size,
num_input_features].
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
bottleneck_dimension: An int32 Tensor representing the bottleneck dimension
for intermediate projections.
output_dimension: An int32 Tensor representing the last dimension of the
output feature.
attention_temperature: A float Tensor. It controls the temperature of the
softmax used for the weights calculation. The formula for the calculation is:
weights = exp(weights / temperature) / sum(exp(weights / temperature))
valid_mask: A boolean Tensor of shape [batch_size, context_size].
is_training: A boolean Tensor (affecting batch normalization).
Returns:
A float Tensor of shape [batch_size, input_size, output_dimension].
"""
with tf.variable_scope("AttentionBlock"):
queries = project_features(
input_features, bottleneck_dimension, is_training, normalize=True)
keys = project_features(
context_features, bottleneck_dimension, is_training, normalize=True)
values = project_features(
context_features, bottleneck_dimension, is_training, normalize=True)
weights = tf.matmul(queries, keys, transpose_b=True)
weights, values = filter_weight_value(weights, values, valid_mask)
weights = tf.nn.softmax(weights / attention_temperature)
features = tf.matmul(weights, values)
output_features = project_features(
features, output_dimension, is_training, normalize=False)
return output_features
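# Example (shapes as exercised by the unit test below): input_features of shape
# [2, 3, 4] attended over context_features of shape [2, 2, 3] with a boolean
# valid_mask of shape [2, 2] produce output_features of shape
# [2, 3, output_dimension].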
def compute_box_context_attention(box_features, context_features,
valid_context_size, bottleneck_dimension,
attention_temperature, is_training):
"""Computes the attention feature from the context given a batch of box.
Args:
box_features: A float Tensor of shape [batch_size, max_num_proposals,
height, width, channels]. It is pooled features from first stage
proposals.
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
valid_context_size: An int32 Tensor of shape [batch_size].
bottleneck_dimension: An int32 Tensor representing the bottleneck dimension
for intermediate projections.
attention_temperature: A float Tensor. It controls the temperature of the
softmax used for the weights calculation. The formula for the calculation is:
weights = exp(weights / temperature) / sum(exp(weights / temperature))
is_training: A boolean Tensor (affecting batch normalization).
Returns:
A float Tensor of shape [batch_size, max_num_proposals, 1, 1, channels].
"""
_, context_size, _ = context_features.shape
valid_mask = compute_valid_mask(valid_context_size, context_size)
channels = box_features.shape[-1]
# Average pools over height and width dimension so that the shape of
# box_features becomes [batch_size, max_num_proposals, channels].
box_features = tf.reduce_mean(box_features, [2, 3])
output_features = attention_block(box_features, context_features,
bottleneck_dimension, channels.value,
attention_temperature, valid_mask,
is_training)
# Expands the dimension back to match with the original feature map.
output_features = output_features[:, :, tf.newaxis, tf.newaxis, :]
return output_features
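# Minimal usage sketch (values taken from the unit test below); all tensors
# here are placeholders for real pooled proposal features and per-image
# context features:
#
#   box_features = tf.ones([2, 3, 4, 4, 4], tf.float32)
#   context_features = tf.ones([2, 5, 6], tf.float32)
#   valid_context_size = tf.constant((2, 3), tf.int32)
#   attention = compute_box_context_attention(
#       box_features, context_features, valid_context_size,
#       bottleneck_dimension=10, attention_temperature=1, is_training=False)
#   # `attention` has shape [2, 3, 1, 1, 4], matching box_features' channels.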
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for context_rcnn_lib."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
from absl.testing import parameterized
import tensorflow.compat.v1 as tf
from object_detection.meta_architectures import context_rcnn_lib
from object_detection.utils import test_case
from object_detection.utils import tf_version
_NEGATIVE_PADDING_VALUE = -100000
@unittest.skipIf(tf_version.is_tf2(), 'Skipping TF1.X only test.')
class ContextRcnnLibTest(parameterized.TestCase, test_case.TestCase,
tf.test.TestCase):
"""Tests for the functions in context_rcnn_lib."""
def test_compute_valid_mask(self):
num_elements = tf.constant(3, tf.int32)
num_valid_elementss = tf.constant((1, 2), tf.int32)
valid_mask = context_rcnn_lib.compute_valid_mask(num_valid_elementss,
num_elements)
expected_valid_mask = tf.constant([[1, 0, 0], [1, 1, 0]], tf.float32)
self.assertAllEqual(valid_mask, expected_valid_mask)
def test_filter_weight_value(self):
weights = tf.ones((2, 3, 2), tf.float32) * 4
values = tf.ones((2, 2, 4), tf.float32)
valid_mask = tf.constant([[True, True], [True, False]], tf.bool)
filtered_weights, filtered_values = context_rcnn_lib.filter_weight_value(
weights, values, valid_mask)
expected_weights = tf.constant([[[4, 4], [4, 4], [4, 4]],
[[4, _NEGATIVE_PADDING_VALUE + 4],
[4, _NEGATIVE_PADDING_VALUE + 4],
[4, _NEGATIVE_PADDING_VALUE + 4]]])
expected_values = tf.constant([[[1, 1, 1, 1], [1, 1, 1, 1]],
[[1, 1, 1, 1], [0, 0, 0, 0]]])
self.assertAllEqual(filtered_weights, expected_weights)
self.assertAllEqual(filtered_values, expected_values)
# Changes the valid_mask so the results will be different.
valid_mask = tf.constant([[True, True], [False, False]], tf.bool)
filtered_weights, filtered_values = context_rcnn_lib.filter_weight_value(
weights, values, valid_mask)
expected_weights = tf.constant(
[[[4, 4], [4, 4], [4, 4]],
[[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4],
[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4],
[_NEGATIVE_PADDING_VALUE + 4, _NEGATIVE_PADDING_VALUE + 4]]])
expected_values = tf.constant([[[1, 1, 1, 1], [1, 1, 1, 1]],
[[0, 0, 0, 0], [0, 0, 0, 0]]])
self.assertAllEqual(filtered_weights, expected_weights)
self.assertAllEqual(filtered_values, expected_values)
@parameterized.parameters((2, True, True), (2, False, True),
(10, True, False), (10, False, False))
def test_project_features(self, projection_dimension, is_training, normalize):
features = tf.ones([2, 3, 4], tf.float32)
projected_features = context_rcnn_lib.project_features(
features,
projection_dimension,
is_training=is_training,
normalize=normalize)
# Makes sure the shape is correct.
self.assertAllEqual(projected_features.shape, [2, 3, projection_dimension])
@parameterized.parameters(
(2, 10, 1),
(3, 10, 2),
(4, 20, 3),
(5, 20, 4),
(7, 20, 5),
)
def test_attention_block(self, bottleneck_dimension, output_dimension,
attention_temperature):
input_features = tf.ones([2, 3, 4], tf.float32)
context_features = tf.ones([2, 2, 3], tf.float32)
valid_mask = tf.constant([[True, True], [False, False]], tf.bool)
is_training = False
output_features = context_rcnn_lib.attention_block(
input_features, context_features, bottleneck_dimension,
output_dimension, attention_temperature, valid_mask, is_training)
# Makes sure the shape is correct.
self.assertAllEqual(output_features.shape, [2, 3, output_dimension])
@parameterized.parameters(True, False)
def test_compute_box_context_attention(self, is_training):
box_features = tf.ones([2, 3, 4, 4, 4], tf.float32)
context_features = tf.ones([2, 5, 6], tf.float32)
valid_context_size = tf.constant((2, 3), tf.int32)
bottleneck_dimension = 10
attention_temperature = 1
attention_features = context_rcnn_lib.compute_box_context_attention(
box_features, context_features, valid_context_size,
bottleneck_dimension, attention_temperature, is_training)
# Makes sure the shape is correct.
self.assertAllEqual(attention_features.shape, [2, 3, 1, 1, 4])
if __name__ == '__main__':
tf.test.main()
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Context R-CNN meta-architecture definition.
This adds the ability to use attention over contextual features within the
Faster R-CNN object detection framework to improve object detection performance.
See https://arxiv.org/abs/1912.03538 for more information.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
from object_detection.core import standard_fields as fields
from object_detection.meta_architectures import context_rcnn_lib
from object_detection.meta_architectures import faster_rcnn_meta_arch
class ContextRCNNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
"""Context R-CNN Meta-architecture definition."""
def __init__(self,
is_training,
num_classes,
image_resizer_fn,
feature_extractor,
number_of_stages,
first_stage_anchor_generator,
first_stage_target_assigner,
first_stage_atrous_rate,
first_stage_box_predictor_arg_scope_fn,
first_stage_box_predictor_kernel_size,
first_stage_box_predictor_depth,
first_stage_minibatch_size,
first_stage_sampler,
first_stage_non_max_suppression_fn,
first_stage_max_proposals,
first_stage_localization_loss_weight,
first_stage_objectness_loss_weight,
crop_and_resize_fn,
initial_crop_size,
maxpool_kernel_size,
maxpool_stride,
second_stage_target_assigner,
second_stage_mask_rcnn_box_predictor,
second_stage_batch_size,
second_stage_sampler,
second_stage_non_max_suppression_fn,
second_stage_score_conversion_fn,
second_stage_localization_loss_weight,
second_stage_classification_loss_weight,
second_stage_classification_loss,
second_stage_mask_prediction_loss_weight=1.0,
hard_example_miner=None,
parallel_iterations=16,
add_summaries=True,
clip_anchors_to_image=False,
use_static_shapes=False,
resize_masks=True,
freeze_batchnorm=False,
return_raw_detections_during_predict=False,
output_final_box_features=False,
attention_bottleneck_dimension=None,
attention_temperature=None):
"""ContextRCNNMetaArch Constructor.
Args:
is_training: A boolean indicating whether the training version of the
computation graph should be constructed.
num_classes: Number of classes. Note that num_classes *does not*
include the background category, so if groundtruth labels take values
in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
assigned classification targets can range from {0,... K}).
image_resizer_fn: A callable for image resizing. This callable
takes a rank-3 image tensor of shape [height, width, channels]
(corresponding to a single image), an optional rank-3 instance mask
tensor of shape [num_masks, height, width] and returns a resized rank-3
image tensor, a resized mask tensor if one was provided in the input. In
addition this callable must also return a 1-D tensor of the form
[height, width, channels] containing the size of the true image, as the
image resizer can perform zero padding. See protos/image_resizer.proto.
feature_extractor: A FasterRCNNFeatureExtractor object.
number_of_stages: An integer taking values in {1, 2, 3}. If
1, the function will construct only the Region Proposal Network (RPN)
part of the model. If 2, the function will perform box refinement and
other auxiliary predictions all in the second stage. If 3, it will
extract features from refined boxes and perform the auxiliary
predictions on the non-maximum suppressed refined boxes.
If is_training is true and the value of number_of_stages is 3, it is
reduced to 2 since all the model heads are trained in parallel in second
stage during training.
first_stage_anchor_generator: An anchor_generator.AnchorGenerator object
(note that currently we only support
grid_anchor_generator.GridAnchorGenerator objects)
first_stage_target_assigner: Target assigner to use for first stage of
Faster R-CNN (RPN).
first_stage_atrous_rate: A single integer indicating the atrous rate for
the single convolution op which is applied to the `rpn_features_to_crop`
tensor to obtain a tensor to be used for box prediction. Some feature
extractors optionally allow for producing feature maps computed at
denser resolutions. The atrous rate is used to compensate for the
denser feature maps by using an effectively larger receptive field.
(This should typically be set to 1).
first_stage_box_predictor_arg_scope_fn: Either a
Keras layer hyperparams object or a function to construct tf-slim
arg_scope for conv2d, separable_conv2d and fully_connected ops. Used
for the RPN box predictor. If it is a keras hyperparams object the
RPN box predictor will be a Keras model. If it is a function to
construct an arg scope it will be a tf-slim box predictor.
first_stage_box_predictor_kernel_size: Kernel size to use for the
convolution op just prior to RPN box predictions.
first_stage_box_predictor_depth: Output depth for the convolution op
just prior to RPN box predictions.
first_stage_minibatch_size: The "batch size" to use for computing the
objectness and location loss of the region proposal network. This
"batch size" refers to the number of anchors selected as contributing
to the loss function for any given image within the image batch and is
only called "batch_size" due to terminology from the Faster R-CNN paper.
first_stage_sampler: Sampler to use for first stage loss (RPN loss).
first_stage_non_max_suppression_fn: batch_multiclass_non_max_suppression
callable that takes `boxes`, `scores` and optional `clip_window`(with
all other inputs already set) and returns a dictionary containing
tensors with keys: `detection_boxes`, `detection_scores`,
`detection_classes`, `num_detections`. This is used to perform non max
suppression on the boxes predicted by the Region Proposal Network
(RPN).
See `post_processing.batch_multiclass_non_max_suppression` for the type
and shape of these tensors.
first_stage_max_proposals: Maximum number of boxes to retain after
performing Non-Max Suppression (NMS) on the boxes predicted by the
Region Proposal Network (RPN).
first_stage_localization_loss_weight: A float
first_stage_objectness_loss_weight: A float
crop_and_resize_fn: A differentiable resampler to use for cropping RPN
proposal features.
initial_crop_size: A single integer indicating the output size
(width and height are set to be the same) of the initial bilinear
interpolation based cropping during ROI pooling.
maxpool_kernel_size: A single integer indicating the kernel size of the
max pool op on the cropped feature map during ROI pooling.
maxpool_stride: A single integer indicating the stride of the max pool
op on the cropped feature map during ROI pooling.
second_stage_target_assigner: Target assigner to use for second stage of
Faster R-CNN. If the model is configured with multiple prediction heads,
this target assigner is used to generate targets for all heads (with the
correct `unmatched_class_label`).
second_stage_mask_rcnn_box_predictor: Mask R-CNN box predictor to use for
the second stage.
second_stage_batch_size: The batch size used for computing the
classification and refined location loss of the box classifier. This
"batch size" refers to the number of proposals selected as contributing
to the loss function for any given image within the image batch and is
only called "batch_size" due to terminology from the Faster R-CNN paper.
second_stage_sampler: Sampler to use for second stage loss (box
classifier loss).
second_stage_non_max_suppression_fn: batch_multiclass_non_max_suppression
callable that takes `boxes`, `scores`, optional `clip_window` and
optional (kwarg) `mask` inputs (with all other inputs already set)
and returns a dictionary containing tensors with keys:
`detection_boxes`, `detection_scores`, `detection_classes`,
`num_detections`, and (optionally) `detection_masks`. See
`post_processing.batch_multiclass_non_max_suppression` for the type and
shape of these tensors.
second_stage_score_conversion_fn: Callable elementwise nonlinearity
(that takes tensors as inputs and returns tensors). This is usually
used to convert logits to probabilities.
second_stage_localization_loss_weight: A float indicating the scale factor
for second stage localization loss.
second_stage_classification_loss_weight: A float indicating the scale
factor for second stage classification loss.
second_stage_classification_loss: Classification loss used by the second
stage classifier. Either losses.WeightedSigmoidClassificationLoss or
losses.WeightedSoftmaxClassificationLoss.
second_stage_mask_prediction_loss_weight: A float indicating the scale
factor for second stage mask prediction loss. This is applicable only if
second stage box predictor is configured to predict masks.
hard_example_miner: A losses.HardExampleMiner object (can be None).
parallel_iterations: (Optional) The number of iterations allowed to run
in parallel for calls to tf.map_fn.
add_summaries: boolean (default: True) controlling whether summary ops
should be added to tensorflow graph.
clip_anchors_to_image: Normally, anchors generated for a given image size
are pruned during training if they lie outside the image window. This
option clips the anchors to be within the image instead of pruning.
use_static_shapes: If True, uses implementation of ops with static shape
guarantees.
resize_masks: Indicates whether the masks present in the groundtruth
should be resized in the model with `image_resizer_fn`.
freeze_batchnorm: Whether to freeze batch norm parameters in the first
stage box predictor during training or not. When training with a small
batch size (e.g. 1), it is desirable to freeze batch norm update and
use pretrained batch norm params.
return_raw_detections_during_predict: Whether to return raw detection
boxes in the predict() method. These are decoded boxes that have not
been through postprocessing (i.e. NMS). Default False.
output_final_box_features: Whether to output final box features. If true,
it crops the feature map based on the final box prediction and returns
in the dict as detection_features.
attention_bottleneck_dimension: A single integer. The bottleneck feature
dimension of the attention block.
attention_temperature: A single float. The attention temperature.
Raises:
ValueError: If `second_stage_batch_size` > `first_stage_max_proposals` at
training time.
ValueError: If first_stage_anchor_generator is not of type
grid_anchor_generator.GridAnchorGenerator.
"""
super(ContextRCNNMetaArch, self).__init__(
is_training,
num_classes,
image_resizer_fn,
feature_extractor,
number_of_stages,
first_stage_anchor_generator,
first_stage_target_assigner,
first_stage_atrous_rate,
first_stage_box_predictor_arg_scope_fn,
first_stage_box_predictor_kernel_size,
first_stage_box_predictor_depth,
first_stage_minibatch_size,
first_stage_sampler,
first_stage_non_max_suppression_fn,
first_stage_max_proposals,
first_stage_localization_loss_weight,
first_stage_objectness_loss_weight,
crop_and_resize_fn,
initial_crop_size,
maxpool_kernel_size,
maxpool_stride,
second_stage_target_assigner,
second_stage_mask_rcnn_box_predictor,
second_stage_batch_size,
second_stage_sampler,
second_stage_non_max_suppression_fn,
second_stage_score_conversion_fn,
second_stage_localization_loss_weight,
second_stage_classification_loss_weight,
second_stage_classification_loss,
second_stage_mask_prediction_loss_weight=(
second_stage_mask_prediction_loss_weight),
hard_example_miner=hard_example_miner,
parallel_iterations=parallel_iterations,
add_summaries=add_summaries,
clip_anchors_to_image=clip_anchors_to_image,
use_static_shapes=use_static_shapes,
resize_masks=resize_masks,
freeze_batchnorm=freeze_batchnorm,
return_raw_detections_during_predict=(
return_raw_detections_during_predict),
output_final_box_features=output_final_box_features)
self._context_feature_extract_fn = functools.partial(
context_rcnn_lib.compute_box_context_attention,
bottleneck_dimension=attention_bottleneck_dimension,
attention_temperature=attention_temperature,
is_training=is_training)
@staticmethod
def get_side_inputs(features):
"""Overrides the get_side_inputs function in the base class.
This function returns context_features and valid_context_size, which will be
used in the _compute_second_stage_input_feature_maps function.
Args:
features: A dictionary of tensors.
Returns:
A dictionary of tensors containing context_features and valid_context_size.
Raises:
ValueError: If context_features or valid_context_size is not in the
features.
"""
if (fields.InputDataFields.context_features not in features or
fields.InputDataFields.valid_context_size not in features):
raise ValueError(
"Please make sure context_features and valid_context_size are in the "
"features")
return {
fields.InputDataFields.context_features:
features[fields.InputDataFields.context_features],
fields.InputDataFields.valid_context_size:
features[fields.InputDataFields.valid_context_size]
}
def _compute_second_stage_input_feature_maps(self, features_to_crop,
proposal_boxes_normalized,
context_features,
valid_context_size):
"""Crops to a set of proposals from the feature map for a batch of images.
This function overrides the one in the FasterRCNNMetaArch. Aside from
cropping and resizing the feature maps, which is done in the parent class,
it adds context attention features to the box features.
Args:
features_to_crop: A float32 Tensor with shape [batch_size, height, width,
depth]
proposal_boxes_normalized: A float32 Tensor with shape [batch_size,
num_proposals, box_code_size] containing proposal boxes in normalized
coordinates.
context_features: A float Tensor of shape [batch_size, context_size,
num_context_features].
valid_context_size: An int32 Tensor of shape [batch_size].
Returns:
A float32 Tensor with shape [K, new_height, new_width, depth].
"""
box_features = self._crop_and_resize_fn(
features_to_crop, proposal_boxes_normalized,
[self._initial_crop_size, self._initial_crop_size])
attention_features = self._context_feature_extract_fn(
box_features=box_features,
context_features=context_features,
valid_context_size=valid_context_size)
# Adds the attention features to the box features.
box_features += attention_features
flattened_feature_maps = self._flatten_first_two_dimensions(box_features)
return self._maxpool_layer(flattened_feature_maps)
import abc
import collections
import functools
import numpy as np
import tensorflow.compat.v1 as tf
import tensorflow.compat.v2 as tf2
from object_detection.core import box_list
from object_detection.core import box_list_ops
from object_detection.core import keypoint_ops
from object_detection.core import model
from object_detection.core import standard_fields as fields
from object_detection.core import target_assigner
from object_detection.utils import ops
from object_detection.utils import shape_utils
from object_detection.models import faster_rcnn_resnet_keras_feature_extractor
from object_detection.meta_architectures import detr_transformer
from object_detection.matchers import hungarian_matcher
class DETRMetaArch(model.DetectionModel):
def __init__(self):
# Number of learned object queries and the transformer hidden dimension.
self.num_queries = 100
self.hidden_dimension = 100
# Reuses the Faster R-CNN ResNet-50 Keras feature extractor as the backbone.
self.feature_extractor = faster_rcnn_resnet_keras_feature_extractor.FasterRCNNResnet50KerasFeatureExtractor(is_training=False)
self.first_stage = self.feature_extractor.get_proposal_feature_extractor_model()
self.target_assigner = target_assigner.create_target_assigner('DETR', 'detection')
self.transformer = detr_transformer.Transformer()
self.ffn = self.feature_extractor.get_box_classifier_feature_extractor_model()
# Prediction heads for box coordinates and class logits.
self.bboxes = tf.keras.layers.Dense(4)
self.cls = tf.keras.layers.Dense(2)
# Learned object queries (random-normal initialization assumed here).
self.queries = tf.Variable(tf.random.normal([self.num_queries, self.hidden_dimension]))
def predict(self, preprocessed_inputs, true_image_shapes, **side_inputs):
# Extract backbone features and flatten the spatial dimensions into a sequence.
x = self.first_stage(preprocessed_inputs)
x = tf.reshape(x, [x.shape[0], x.shape[1] * x.shape[2], x.shape[3]])
# Run the transformer over the feature sequence and the tiled object queries.
x = self.transformer([x, tf.repeat(tf.expand_dims(self.queries, 0), x.shape[0], axis=0)])
x = self.ffn(x)
# Per-query box coordinate and class logit predictions.
return self.bboxes(x), self.cls(x)
# Placeholder loss; superseded by the full implementation further below.
def loss(self, prediction_dict, true_image_shapes, scope=None):
return 1
def preprocess(self, inputs):
"""Feature-extractor specific preprocessing.
See base class.
For Faster R-CNN, we perform image resizing in the base class --- each
class subclassing FasterRCNNMetaArch is responsible for any additional
preprocessing (e.g., scaling pixel values to be in [-1, 1]).
Args:
inputs: a [batch, height_in, width_in, channels] float tensor representing
a batch of images with values between 0 and 255.0.
Returns:
preprocessed_inputs: a [batch, height_out, width_out, channels] float
tensor representing a batch of images.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
Raises:
ValueError: if inputs tensor does not have type tf.float32
"""
with tf.name_scope('Preprocessor'):
(resized_inputs,
true_image_shapes) = shape_utils.resize_images_and_return_shapes(
inputs, self._image_resizer_fn)
return (self.feature_extractor.preprocess(resized_inputs),
true_image_shapes)
def restore_from_objects(self, fine_tune_checkpoint_type='detection'):
raise NotImplementedError("Model restoration not implemented yet.")
def restore_map(self,
fine_tune_checkpoint_type='detection',
load_all_detection_checkpoint_vars=False):
raise NotImplementedError("Model restoration not implemented yet.")
def loss(self, prediction_dict, true_image_shapes, scope=None):
"""Compute scalar loss tensors given prediction tensors.
If number_of_stages=1, only RPN related losses are computed (i.e.,
`rpn_localization_loss` and `rpn_objectness_loss`). Otherwise all
losses are computed.
Args:
prediction_dict: a dictionary holding prediction tensors (see the
documentation for the predict method. If number_of_stages=1, we
expect prediction_dict to contain `rpn_box_encodings`,
`rpn_objectness_predictions_with_background`, `rpn_features_to_crop`,
`image_shape`, and `anchors` fields. Otherwise we expect
prediction_dict to additionally contain `refined_box_encodings`,
`class_predictions_with_background`, `num_proposals`, and
`proposal_boxes` fields.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
scope: Optional scope name.
Returns:
a dictionary mapping loss keys (`first_stage_localization_loss`,
`first_stage_objectness_loss`, 'second_stage_localization_loss',
'second_stage_classification_loss') to scalar tensors representing
corresponding loss values.
"""
with tf.name_scope(scope, 'Loss', prediction_dict.values()):
(groundtruth_boxlists, groundtruth_classes_with_background_list,
groundtruth_masks_list, groundtruth_weights_list
) = self._format_groundtruth_data(
self._image_batch_shape_2d(prediction_dict['image_shape']))
loss_dict = self._loss_box_classifier(
prediction_dict['refined_box_encodings'],
prediction_dict['class_predictions_with_background'],
prediction_dict['proposal_boxes'],
prediction_dict['num_proposals'], groundtruth_boxlists,
groundtruth_classes_with_background_list,
groundtruth_weights_list, prediction_dict['image_shape'],
prediction_dict.get('mask_predictions'), groundtruth_masks_list,
prediction_dict.get(
fields.DetectionResultFields.detection_boxes),
prediction_dict.get(
fields.DetectionResultFields.num_detections))
return loss_dict
def _loss_box_classifier(self,
refined_box_encodings,
class_predictions_with_background,
proposal_boxes,
num_proposals,
groundtruth_boxlists,
groundtruth_classes_with_background_list,
groundtruth_weights_list,
image_shape,
prediction_masks=None,
groundtruth_masks_list=None,
detection_boxes=None,
num_detections=None):
"""Computes scalar box classifier loss tensors.
Uses self._detector_target_assigner to obtain regression and classification
targets for the second stage box classifier, optionally performs
hard mining, and returns losses. All losses are computed independently
for each image and then averaged across the batch.
Please note that for boxes and masks with multiple labels, the box
regression and mask prediction losses are only computed for one label.
This function assumes that the proposal boxes in the "padded" regions are
actually zero (and thus should not be matched to).
Args:
refined_box_encodings: a 3-D tensor with shape
[total_num_proposals, num_classes, box_coder.code_size] representing
predicted (final) refined box encodings. If using a shared box across
classes this will instead have shape
[total_num_proposals, 1, box_coder.code_size].
class_predictions_with_background: a 2-D tensor with shape
[total_num_proposals, num_classes + 1] containing class
predictions (logits) for each of the anchors. Note that this tensor
*includes* background class predictions (at class index 0).
proposal_boxes: [batch_size, self.max_num_proposals, 4] representing
decoded proposal bounding boxes.
num_proposals: A Tensor of type `int32`. A 1-D tensor of shape [batch]
representing the number of proposals predicted for each image in
the batch.
groundtruth_boxlists: a list of BoxLists containing coordinates of the
groundtruth boxes.
groundtruth_classes_with_background_list: a list of 2-D one-hot
(or k-hot) tensors of shape [num_boxes, num_classes + 1] containing the
class targets with the 0th index assumed to map to the background class.
groundtruth_weights_list: A list of 1-D tf.float32 tensors of shape
[num_boxes] containing weights for groundtruth boxes.
image_shape: a 1-D tensor of shape [4] representing the image shape.
prediction_masks: an optional 4-D tensor with shape [total_num_proposals,
num_classes, mask_height, mask_width] containing the instance masks for
each box.
groundtruth_masks_list: an optional list of 3-D tensors of shape
[num_boxes, image_height, image_width] containing the instance masks for
each of the boxes.
detection_boxes: 3-D float tensor of shape [batch,
max_total_detections, 4] containing post-processed detection boxes in
normalized co-ordinates.
num_detections: 1-D int32 tensor of shape [batch] containing number of
valid detections in `detection_boxes`.
Returns:
a dictionary mapping loss keys ('second_stage_localization_loss',
'second_stage_classification_loss') to scalar tensors representing
corresponding loss values.
Raises:
ValueError: if `predict_instance_masks` in
second_stage_mask_rcnn_box_predictor is True and
`groundtruth_masks_list` is not provided.
"""
with tf.name_scope('BoxClassifierLoss'):
paddings_indicator = self._padded_batched_proposals_indicator(
num_proposals, proposal_boxes.shape[1])
proposal_boxlists = [
box_list.BoxList(proposal_boxes_single_image)
for proposal_boxes_single_image in tf.unstack(proposal_boxes)]
batch_size = len(proposal_boxlists)
num_proposals_or_one = tf.cast(tf.expand_dims(
tf.maximum(num_proposals, tf.ones_like(num_proposals)), 1),
dtype=tf.float32)
normalizer = tf.tile(num_proposals_or_one,
[1, self.max_num_proposals]) * batch_size
(batch_cls_targets_with_background, batch_cls_weights, batch_reg_targets,
batch_reg_weights, _) = target_assigner.batch_assign_targets(
target_assigner=self._detector_target_assigner,
anchors_batch=proposal_boxlists,
gt_box_batch=groundtruth_boxlists,
gt_class_targets_batch=groundtruth_classes_with_background_list,
unmatched_class_label=tf.constant(
[1] + self._num_classes * [0], dtype=tf.float32),
gt_weights_batch=groundtruth_weights_list)
class_predictions_with_background = tf.reshape(
class_predictions_with_background,
[batch_size, self.max_num_proposals, -1])
flat_cls_targets_with_background = tf.reshape(
batch_cls_targets_with_background,
[batch_size * self.max_num_proposals, -1])
one_hot_flat_cls_targets_with_background = tf.argmax(
flat_cls_targets_with_background, axis=1)
one_hot_flat_cls_targets_with_background = tf.one_hot(
one_hot_flat_cls_targets_with_background,
flat_cls_targets_with_background.get_shape()[1])
# If using a shared box across classes use directly
if refined_box_encodings.shape[1] == 1:
reshaped_refined_box_encodings = tf.reshape(
refined_box_encodings,
[batch_size, self.max_num_proposals, self._box_coder.code_size])
# For anchors with multiple labels, picks refined_location_encodings
# for just one class to avoid over-counting for regression loss and
# (optionally) mask loss.
else:
reshaped_refined_box_encodings = (
self._get_refined_encodings_for_postitive_class(
refined_box_encodings,
one_hot_flat_cls_targets_with_background, batch_size))
losses_mask = None
if self.groundtruth_has_field(fields.InputDataFields.is_annotated):
losses_mask = tf.stack(self.groundtruth_lists(
fields.InputDataFields.is_annotated))
second_stage_loc_losses = self._second_stage_localization_loss(
reshaped_refined_box_encodings,
batch_reg_targets,
weights=batch_reg_weights,
losses_mask=losses_mask) / normalizer
second_stage_cls_losses = ops.reduce_sum_trailing_dimensions(
self._second_stage_classification_loss(
class_predictions_with_background,
batch_cls_targets_with_background,
weights=batch_cls_weights,
losses_mask=losses_mask),
ndims=2) / normalizer
second_stage_loc_loss = tf.reduce_sum(
second_stage_loc_losses * tf.cast(paddings_indicator,
dtype=tf.float32))
second_stage_cls_loss = tf.reduce_sum(
second_stage_cls_losses * tf.cast(paddings_indicator,
dtype=tf.float32))
if self._hard_example_miner:
(second_stage_loc_loss, second_stage_cls_loss
) = self._unpad_proposals_and_apply_hard_mining(
proposal_boxlists, second_stage_loc_losses,
second_stage_cls_losses, num_proposals)
localization_loss = tf.multiply(self._second_stage_loc_loss_weight,
second_stage_loc_loss,
name='localization_loss')
classification_loss = tf.multiply(self._second_stage_cls_loss_weight,
second_stage_cls_loss,
name='classification_loss')
loss_dict = {'Loss/BoxClassifierLoss/localization_loss':
localization_loss,
'Loss/BoxClassifierLoss/classification_loss':
classification_loss}
return loss_dict
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for object_detection.meta_architectures.rfcn_meta_arch."""
import tensorflow.compat.v1 as tf
from object_detection.meta_architectures import faster_rcnn_meta_arch_test_lib
from object_detection.meta_architectures import rfcn_meta_arch
class RFCNMetaArchTest(
faster_rcnn_meta_arch_test_lib.FasterRCNNMetaArchTestBase):
def _get_second_stage_box_predictor_text_proto(
self, share_box_across_classes=False):
del share_box_across_classes
box_predictor_text_proto = """
rfcn_box_predictor {
conv_hyperparams {
op: CONV
activation: NONE
regularizer {
l2_regularizer {
weight: 0.0005
}
}
initializer {
variance_scaling_initializer {
factor: 1.0
uniform: true
mode: FAN_AVG
}
}
}
}
"""
return box_predictor_text_proto
def _get_model(self, box_predictor, **common_kwargs):
return rfcn_meta_arch.RFCNMetaArch(
second_stage_rfcn_box_predictor=box_predictor, **common_kwargs)
def _get_box_classifier_features_shape(self,
image_size,
batch_size,
max_num_proposals,
initial_crop_size,
maxpool_stride,
num_features):
return (batch_size, image_size, image_size, num_features)
if __name__ == '__main__':
tf.test.main()