Commit 0d8e49ec authored by Yinxiao Li, committed by dreamdragon

PiperOrigin-RevId: 206648257

parent d7676c1c
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# For training on Imagenet Video with LSTM Mobilenet V1
[object_detection.protos.lstm_model] {
train_unroll_length: 4
eval_unroll_length: 4
}
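# Note: the train/eval unroll lengths above are typically kept consistent with
# the video_length in the tf_record_video_input_reader sections below (all 4
# in this config).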
model {
ssd {
num_classes: 30
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 5
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
}
}
image_resizer {
fixed_shape_resizer {
height: 256
width: 256
}
}
box_predictor {
convolutional_box_predictor {
min_depth: 0
max_depth: 0
num_layers_before_predictor: 3
use_dropout: false
dropout_keep_probability: 0.8
kernel_size: 3
box_code_size: 4
apply_sigmoid_to_scores: false
use_depthwise: true
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
}
feature_extractor {
type: 'lstm_mobilenet_v1'
min_depth: 16
depth_multiplier: 1.0
use_depthwise: true
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
loss {
classification_loss {
weighted_sigmoid {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 0
}
classification_weight: 1.0
localization_weight: 4.0
}
normalize_loss_by_num_matches: true
post_processing {
batch_non_max_suppression {
score_threshold: -20.0
iou_threshold: 0.5
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
}
}
train_config: {
batch_size: 8
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
ssd_random_crop {
}
}
optimizer {
use_moving_average: false
rms_prop_optimizer: {
learning_rate: {
exponential_decay_learning_rate {
initial_learning_rate: 0.002
decay_steps: 200000
decay_factor: 0.95
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
from_detection_checkpoint: true
gradient_clipping_by_norm: 10.0
batch_queue_capacity: 12
prefetch_queue_capacity: 4
fine_tune_checkpoint: "/path/to/checkpoint/"
fine_tune_checkpoint_type: "detection"
}
train_input_reader: {
shuffle_buffer_size: 32
queue_capacity: 12
prefetch_size: 12
min_after_dequeue: 4
label_map_path: "path/to/label_map"
external_input_reader {
[lstm_object_detection.input_readers.GoogleInputReader.google_input_reader] {
tf_record_video_input_reader: {
input_path: "your/cns/path"
data_type: TF_SEQUENCE_EXAMPLE
video_length: 4
}
}
}
}
eval_config: {
metrics_set: "coco_evaluation_last_frame"
use_moving_averages: true
min_score_threshold: 0.5
max_num_boxes_to_visualize: 300
visualize_groundtruth_boxes: true
groundtruth_box_visualization_color: "red"
}
eval_input_reader: {
label_map_path: "path/to/label_map"
external_input_reader {
[lstm_object_detection.input_readers.GoogleInputReader.google_input_reader] {
tf_record_video_input_reader: {
input_path: "your/cns/path"
data_type: TF_SEQUENCE_EXAMPLE
video_length: 4
}
}
}
shuffle: true
num_readers: 1
}
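# Note: fine_tune_checkpoint, label_map_path, and input_path above are
# placeholders and must be replaced with real paths before training or
# evaluation.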
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Evaluation executable for detection models.
This executable is used to evaluate DetectionModels. Example usage:
./eval \
--logtostderr \
--checkpoint_dir=path/to/checkpoint_dir \
--eval_dir=path/to/eval_dir \
--pipeline_config_path=pipeline_config.pbtxt
"""
import functools
import os
import tensorflow as tf
from google.protobuf import text_format
from google3.pyglib import app
from google3.pyglib import flags
from lstm_object_detection import evaluator
from lstm_object_detection import model_builder
from lstm_object_detection import seq_dataset_builder
from lstm_object_detection.utils import config_util
from google3.third_party.tensorflow_models.object_detection.utils import label_map_util
tf.logging.set_verbosity(tf.logging.INFO)
flags = tf.app.flags
flags.DEFINE_boolean('eval_training_data', False,
'If training data should be evaluated for this job.')
flags.DEFINE_string('checkpoint_dir', '',
'Directory containing checkpoints to evaluate, typically '
'set to `train_dir` used in the training job.')
flags.DEFINE_string('eval_dir', '', 'Directory to write eval summaries to.')
flags.DEFINE_string('pipeline_config_path', '',
'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
'file. If provided, other configs are ignored')
flags.DEFINE_boolean('run_once', False, 'Option to only run a single pass of '
'evaluation. Overrides the `max_evals` parameter in the '
'provided config.')
FLAGS = flags.FLAGS
def main(unused_argv):
assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
assert FLAGS.eval_dir, '`eval_dir` is missing.'
if FLAGS.pipeline_config_path:
configs = config_util.get_configs_from_pipeline_file(
FLAGS.pipeline_config_path)
else:
configs = config_util.get_configs_from_multiple_files(
model_config_path=FLAGS.model_config_path,
eval_config_path=FLAGS.eval_config_path,
eval_input_config_path=FLAGS.input_config_path)
pipeline_proto = config_util.create_pipeline_proto_from_configs(configs)
config_text = text_format.MessageToString(pipeline_proto)
tf.gfile.MakeDirs(FLAGS.eval_dir)
with tf.gfile.Open(os.path.join(FLAGS.eval_dir, 'pipeline.config'),
'wb') as f:
f.write(config_text)
model_config = configs['model']
lstm_config = configs['lstm_model']
eval_config = configs['eval_config']
input_config = configs['eval_input_config']
if FLAGS.eval_training_data:
input_config.external_input_reader.CopyFrom(
configs['train_input_config'].external_input_reader)
lstm_config.eval_unroll_length = lstm_config.train_unroll_length
model_fn = functools.partial(
model_builder.build,
model_config=model_config,
lstm_config=lstm_config,
is_training=False)
def get_next(config, model_config, lstm_config, unroll_length):
return seq_dataset_builder.build(config, model_config, lstm_config,
unroll_length)
create_input_dict_fn = functools.partial(get_next, input_config, model_config,
lstm_config,
lstm_config.eval_unroll_length)
label_map = label_map_util.load_labelmap(input_config.label_map_path)
max_num_classes = max([item.id for item in label_map.item])
categories = label_map_util.convert_label_map_to_categories(
label_map, max_num_classes)
if FLAGS.run_once:
eval_config.max_evals = 1
evaluator.evaluate(create_input_dict_fn, model_fn, eval_config, categories,
FLAGS.checkpoint_dir, FLAGS.eval_dir)
if __name__ == '__main__':
app.run()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Detection model evaluator.
This file provides a generic evaluation method that can be used to evaluate a
DetectionModel.
"""
import logging
import tensorflow as tf
from lstm_object_detection.metrics import coco_evaluation_all_frames
from google3.third_party.tensorflow_models.object_detection import eval_util
from google3.third_party.tensorflow_models.object_detection.core import prefetcher
from google3.third_party.tensorflow_models.object_detection.core import standard_fields as fields
from google3.third_party.tensorflow_models.object_detection.metrics import coco_evaluation
from google3.third_party.tensorflow_models.object_detection.utils import object_detection_evaluation
# A dictionary of metric names to classes that implement the metric. The classes
# in the dictionary must implement the
# utils.object_detection_evaluation.DetectionEvaluator interface.
EVAL_METRICS_CLASS_DICT = {
'pascal_voc_detection_metrics':
object_detection_evaluation.PascalDetectionEvaluator,
'weighted_pascal_voc_detection_metrics':
object_detection_evaluation.WeightedPascalDetectionEvaluator,
'pascal_voc_instance_segmentation_metrics':
object_detection_evaluation.PascalInstanceSegmentationEvaluator,
'weighted_pascal_voc_instance_segmentation_metrics':
object_detection_evaluation.WeightedPascalInstanceSegmentationEvaluator,
'open_images_detection_metrics':
object_detection_evaluation.OpenImagesDetectionEvaluator,
'coco_detection_metrics':
coco_evaluation.CocoDetectionEvaluator,
'coco_mask_metrics':
coco_evaluation.CocoMaskEvaluator,
'coco_evaluation_all_frames':
coco_evaluation_all_frames.CocoEvaluationAllFrames,
}
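# 'coco_evaluation_all_frames' is the video-specific evaluator defined in this
# package (metrics/coco_evaluation_all_frames.py); the other entries come from
# the core object_detection metrics.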
EVAL_DEFAULT_METRIC = 'pascal_voc_detection_metrics'
def _create_detection_op(model, input_dict, batch):
"""Create detection ops.
Args:
model: model to perform predictions with.
input_dict: A dict holding input data.
batch: batch size for evaluation.
Returns:
Detection tensor ops.
"""
video_tensor = tf.stack(list(input_dict[fields.InputDataFields.image]))
preprocessed_video, true_image_shapes = model.preprocess(
tf.to_float(video_tensor))
if batch is not None:
prediction_dict = model.predict(preprocessed_video, true_image_shapes,
batch)
else:
prediction_dict = model.predict(preprocessed_video, true_image_shapes)
return model.postprocess(prediction_dict, true_image_shapes)
def _extract_prediction_tensors(model,
create_input_dict_fn,
ignore_groundtruth=False):
"""Restores the model in a tensorflow session.
Args:
model: model to perform predictions with.
create_input_dict_fn: function to create input tensor dictionaries.
ignore_groundtruth: whether groundtruth should be ignored.
Returns:
ret: A list of tensor dictionaries, one per frame, holding detections and
groundtruth for evaluation.
"""
input_dict = create_input_dict_fn()
batch = None
if 'batch' in input_dict:
batch = input_dict.pop('batch')
else:
prefetch_queue = prefetcher.prefetch(input_dict, capacity=500)
input_dict = prefetch_queue.dequeue()
# consistent format for images and videos
for key, value in input_dict.iteritems():
input_dict[key] = (value,)
detections = _create_detection_op(model, input_dict, batch)
# Print out analysis of the model.
tf.contrib.tfprof.model_analyzer.print_model_analysis(
tf.get_default_graph(),
tfprof_options=tf.contrib.tfprof.model_analyzer.
TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
tf.contrib.tfprof.model_analyzer.print_model_analysis(
tf.get_default_graph(),
tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
num_frames = len(input_dict[fields.InputDataFields.image])
ret = []
for i in range(num_frames):
original_image = tf.expand_dims(input_dict[fields.InputDataFields.image][i],
0)
groundtruth = None
if not ignore_groundtruth:
groundtruth = {
fields.InputDataFields.groundtruth_boxes:
input_dict[fields.InputDataFields.groundtruth_boxes][i],
fields.InputDataFields.groundtruth_classes:
input_dict[fields.InputDataFields.groundtruth_classes][i],
}
optional_keys = (
fields.InputDataFields.groundtruth_area,
fields.InputDataFields.groundtruth_is_crowd,
fields.InputDataFields.groundtruth_difficult,
fields.InputDataFields.groundtruth_group_of,
)
for opt_key in optional_keys:
if opt_key in input_dict:
groundtruth[opt_key] = input_dict[opt_key][i]
if fields.DetectionResultFields.detection_masks in detections:
groundtruth[fields.InputDataFields.groundtruth_instance_masks] = (
input_dict[fields.InputDataFields.groundtruth_instance_masks][i])
detections_frame = {
key: tf.expand_dims(value[i], 0)
for key, value in detections.iteritems()
}
source_id = (
batch.key[0] if batch is not None else
input_dict[fields.InputDataFields.source_id][i])
ret.append(
eval_util.result_dict_for_single_example(
original_image,
source_id,
detections_frame,
groundtruth,
class_agnostic=(fields.DetectionResultFields.detection_classes
not in detections),
scale_to_absolute=True))
return ret
def get_evaluators(eval_config, categories):
"""Returns the evaluator class according to eval_config, valid for categories.
Args:
eval_config: evaluation configurations.
categories: a list of categories to evaluate.
Returns:
A list of instances of DetectionEvaluator.
Raises:
ValueError: if metric is not in the metric class dictionary.
"""
eval_metric_fn_keys = eval_config.metrics_set
if not eval_metric_fn_keys:
eval_metric_fn_keys = [EVAL_DEFAULT_METRIC]
evaluators_list = []
for eval_metric_fn_key in eval_metric_fn_keys:
if eval_metric_fn_key not in EVAL_METRICS_CLASS_DICT:
raise ValueError('Metric not found: {}'.format(eval_metric_fn_key))
else:
evaluators_list.append(
EVAL_METRICS_CLASS_DICT[eval_metric_fn_key](categories=categories))
return evaluators_list
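# Illustrative example: if eval_config.metrics_set contains
# 'coco_evaluation_all_frames', get_evaluators returns
# [CocoEvaluationAllFrames(categories=categories)].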
def evaluate(create_input_dict_fn,
create_model_fn,
eval_config,
categories,
checkpoint_dir,
eval_dir,
graph_hook_fn=None):
"""Evaluation function for detection models.
Args:
create_input_dict_fn: a function to create a tensor input dictionary.
create_model_fn: a function that creates a DetectionModel.
eval_config: an eval_pb2.EvalConfig protobuf.
categories: a list of category dictionaries. Each dict in the list should
have an integer 'id' field and string 'name' field.
checkpoint_dir: directory to load the checkpoints to evaluate from.
eval_dir: directory to write evaluation metrics summary to.
graph_hook_fn: Optional function that is called after the training graph is
completely built. This is helpful to perform additional changes to the
training graph such as optimizing batchnorm. The function should modify
the default graph.
Returns:
metrics: A dictionary containing metric names and values from the latest
run.
"""
model = create_model_fn()
if eval_config.ignore_groundtruth and not eval_config.export_path:
logging.fatal('If ignore_groundtruth=True then an export_path is '
'required. Aborting!!!')
tensor_dicts = _extract_prediction_tensors(
model=model,
create_input_dict_fn=create_input_dict_fn,
ignore_groundtruth=eval_config.ignore_groundtruth)
def _process_batch(tensor_dicts,
sess,
batch_index,
counters,
losses_dict=None):
"""Evaluates tensors in tensor_dicts, visualizing the first K examples.
This function calls sess.run on tensor_dicts, evaluating the original_image
tensor only on the first K examples and visualizing detections overlaid
on this original_image.
Args:
tensor_dicts: a dictionary of tensors
sess: tensorflow session
batch_index: the index of the batch amongst all batches in the run.
counters: a dictionary holding 'success' and 'skipped' fields which can
be updated to keep track of number of successful and failed runs,
respectively. If these fields are not updated, then the success/skipped
counter values shown at the end of evaluation will be incorrect.
losses_dict: Optional dictionary of scalar loss tensors. Necessary only
for matching the function signature in third_party eval_util.py.
Returns:
result_dict: a dictionary of numpy arrays
result_losses_dict: a dictionary of scalar losses. This is empty if input
losses_dict is None. Necessary only for matching the function signature in
third_party eval_util.py.
"""
if batch_index % 10 == 0:
logging.info('Running eval ops batch %d', batch_index)
if not losses_dict:
losses_dict = {}
try:
result_dicts, result_losses_dict = sess.run([tensor_dicts, losses_dict])
counters['success'] += 1
except tf.errors.InvalidArgumentError:
logging.info('Skipping image')
counters['skipped'] += 1
return {}
num_images = len(tensor_dicts)
for i in range(num_images):
result_dict = result_dicts[i]
global_step = tf.train.global_step(sess, tf.train.get_global_step())
tag = 'image-%d' % (batch_index * num_images + i)
if batch_index < eval_config.num_visualizations / num_images:
eval_util.visualize_detection_results(
result_dict,
tag,
global_step,
categories=categories,
summary_dir=eval_dir,
export_dir=eval_config.visualization_export_dir,
show_groundtruth=eval_config.visualize_groundtruth_boxes,
groundtruth_box_visualization_color=eval_config.
groundtruth_box_visualization_color,
min_score_thresh=eval_config.min_score_threshold,
max_num_predictions=eval_config.max_num_boxes_to_visualize,
skip_scores=eval_config.skip_scores,
skip_labels=eval_config.skip_labels,
keep_image_id_for_visualization_export=eval_config.
keep_image_id_for_visualization_export)
if num_images > 1:
return result_dicts, result_losses_dict
else:
return result_dicts[0], result_losses_dict
variables_to_restore = tf.global_variables()
global_step = tf.train.get_or_create_global_step()
variables_to_restore.append(global_step)
if graph_hook_fn:
graph_hook_fn()
if eval_config.use_moving_averages:
variable_averages = tf.train.ExponentialMovingAverage(0.0)
variables_to_restore = variable_averages.variables_to_restore()
for key in variables_to_restore.keys():
if 'moving_mean' in key:
variables_to_restore[key.replace(
'moving_mean', 'moving_mean/ExponentialMovingAverage')] = (
variables_to_restore[key])
del variables_to_restore[key]
if 'moving_variance' in key:
variables_to_restore[key.replace(
'moving_variance', 'moving_variance/ExponentialMovingAverage')] = (
variables_to_restore[key])
del variables_to_restore[key]
saver = tf.train.Saver(variables_to_restore)
def _restore_latest_checkpoint(sess):
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
saver.restore(sess, latest_checkpoint)
metrics = eval_util.repeated_checkpoint_run(
tensor_dict=tensor_dicts,
summary_dir=eval_dir,
evaluators=get_evaluators(eval_config, categories),
batch_processor=_process_batch,
checkpoint_dirs=[checkpoint_dir],
variables_to_restore=None,
restore_fn=_restore_latest_checkpoint,
num_batches=eval_config.num_examples,
eval_interval_secs=eval_config.eval_interval_secs,
max_number_of_evaluations=(1 if eval_config.ignore_groundtruth else
eval_config.max_evals
if eval_config.max_evals else None),
master=eval_config.eval_master,
save_graph=eval_config.save_graph,
save_graph_dir=(eval_dir if eval_config.save_graph else ''))
return metrics
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BottleneckConvLSTMCell implementation."""
import google3
import tensorflow.google as tf
import google3.learning.brain.contrib.slim as slim
from tensorflow.contrib.framework.python.ops import variables
_batch_norm = tf.contrib.layers.batch_norm
class BottleneckConvLSTMCell(tf.contrib.rnn.RNNCell):
"""Basic LSTM recurrent network cell using separable convolutions.
The implementation is based on: http://arxiv.org/abs/1409.2329.
We add forget_bias (default: 1) to the biases of the forget gate in order to
reduce the scale of forgetting at the beginning of training.
This LSTM first projects inputs to the size of the output before doing gate
computations. This saves params unless the input is less than a third of the
state size channel-wise.
"""
def __init__(self,
filter_size,
output_size,
num_units,
forget_bias=1.0,
activation=tf.tanh,
flattened_state=False,
visualize_gates=True):
"""Initializes the basic LSTM cell.
Args:
filter_size: collection, conv filter size
output_size: collection, the width/height dimensions of the cell/output
num_units: int, The number of channels in the LSTM cell.
forget_bias: float, The bias added to forget gates (see above).
activation: Activation function of the inner states.
flattened_state: if True, the state tensor will be flattened and stored
as a 2-D tensor. Used for exporting the model to tfmini.
visualize_gates: if True, add histogram summaries of all gates
and outputs to tensorboard
"""
self._filter_size = list(filter_size)
self._output_size = list(output_size)
self._num_units = num_units
self._forget_bias = forget_bias
self._activation = activation
self._viz_gates = visualize_gates
self._flattened_state = flattened_state
self._param_count = self._num_units
for dim in self._output_size:
self._param_count *= dim
@property
def state_size(self):
return tf.contrib.rnn.LSTMStateTuple(self._output_size + [self._num_units],
self._output_size + [self._num_units])
@property
def state_size_flat(self):
return tf.contrib.rnn.LSTMStateTuple([self._param_count],
[self._param_count])
@property
def output_size(self):
return self._output_size + [self._num_units]
def __call__(self, inputs, state, scope=None):
"""Long short-term memory cell (LSTM) with bottlenecking.
Args:
inputs: Input tensor at the current timestep.
state: Tuple of tensors, the state and output at the previous timestep.
scope: Optional scope.
Returns:
A tuple where the first element is the LSTM output and the second is
a LSTMStateTuple of the state at the current timestep.
"""
scope = scope or 'conv_lstm_cell'
with tf.variable_scope(scope):
c, h = state
# unflatten state if necessary
if self._flattened_state:
c = tf.reshape(c, [-1] + self.output_size)
h = tf.reshape(h, [-1] + self.output_size)
# summary of input passed into cell
if self._viz_gates:
slim.summaries.add_histogram_summary(inputs, 'cell_input')
bottleneck = tf.contrib.layers.separable_conv2d(
tf.concat([inputs, h], 3),
self._num_units,
self._filter_size,
depth_multiplier=1,
activation_fn=self._activation,
normalizer_fn=None,
scope='bottleneck')
if self._viz_gates:
slim.summaries.add_histogram_summary(bottleneck, 'bottleneck')
concat = tf.contrib.layers.separable_conv2d(
bottleneck,
4 * self._num_units,
self._filter_size,
depth_multiplier=1,
activation_fn=None,
normalizer_fn=None,
scope='gates')
i, j, f, o = tf.split(concat, 4, 3)
new_c = (
c * tf.sigmoid(f + self._forget_bias) +
tf.sigmoid(i) * self._activation(j))
new_h = self._activation(new_c) * tf.sigmoid(o)
# summary of cell output and new state
if self._viz_gates:
slim.summaries.add_histogram_summary(new_h, 'cell_output')
slim.summaries.add_histogram_summary(new_c, 'cell_state')
# reflatten state to store it
if self._flattened_state:
new_c = tf.reshape(new_c, [-1, self._param_count])
new_h = tf.reshape(new_h, [-1, self._param_count])
return new_h, tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
def init_state(self, state_name, batch_size, dtype, learned_state=False):
"""Creates an initial state compatible with this cell.
Args:
state_name: name of the state tensor
batch_size: model batch size
dtype: dtype for the tensor values i.e. tf.float32
learned_state: whether the initial state should be learnable. If false,
the initial state is set to all 0's
Returns:
The created initial state.
"""
state_size = (
self.state_size_flat if self._flattened_state else self.state_size)
# List of two zero tensors or learnable variable tensors, depending on
# whether learned_state is true.
ret_flat = [(variables.model_variable(
state_name + str(i),
shape=s,
dtype=dtype,
initializer=tf.truncated_normal_initializer(stddev=0.03))
if learned_state else tf.zeros(
[batch_size] + s, dtype=dtype, name=state_name))
for i, s in enumerate(state_size)]
# duplicates initial state across the batch axis if it's learned
if learned_state:
ret_flat = [
tf.stack([tensor
for i in range(int(batch_size))])
for tensor in ret_flat
]
for s, r in zip(state_size, ret_flat):
r.set_shape([None] + s)
return tf.nest.pack_sequence_as(structure=[1, 1], flat_sequence=ret_flat)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""LSTM Meta-architecture definition.
General tensorflow implementation of convolutional Multibox/SSD detection
models with LSTM states, for use on video data.
See https://arxiv.org/abs/1711.06368 for details.
"""
import re
import tensorflow as tf
from google3.third_party.tensorflow_models.object_detection.core import box_list_ops
from google3.third_party.tensorflow_models.object_detection.core import standard_fields as fields
from google3.third_party.tensorflow_models.object_detection.meta_architectures import ssd_meta_arch
from google3.third_party.tensorflow_models.object_detection.utils import ops
from google3.third_party.tensorflow_models.object_detection.utils import shape_utils
slim = tf.contrib.slim
class LSTMMetaArch(ssd_meta_arch.SSDMetaArch):
"""LSTM Meta-architecture definition."""
def __init__(self,
is_training,
anchor_generator,
box_predictor,
box_coder,
feature_extractor,
matcher,
region_similarity_calculator,
encode_background_as_zeros,
negative_class_weight,
image_resizer_fn,
non_max_suppression_fn,
score_conversion_fn,
classification_loss,
localization_loss,
classification_loss_weight,
localization_loss_weight,
normalize_loss_by_num_matches,
hard_example_miner,
unroll_length,
add_summaries=True):
super(LSTMMetaArch, self).__init__(
is_training, anchor_generator, box_predictor, box_coder,
feature_extractor, matcher, region_similarity_calculator,
encode_background_as_zeros, negative_class_weight, image_resizer_fn,
non_max_suppression_fn, score_conversion_fn, classification_loss,
localization_loss, classification_loss_weight, localization_loss_weight,
normalize_loss_by_num_matches, hard_example_miner, add_summaries)
self._unroll_length = unroll_length
@property
def unroll_length(self):
return self._unroll_length
@unroll_length.setter
def unroll_length(self, unroll_length):
self._unroll_length = unroll_length
def predict(self, preprocessed_inputs, true_image_shapes, states=None,
state_name='lstm_state', feature_scope=None):
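# This override threads LSTM states through the feature extractor and adds
# 'states_and_outputs' (and, when states are provided, 'step') to the
# standard SSD prediction dict.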
with tf.variable_scope(self._extract_features_scope,
values=[preprocessed_inputs], reuse=tf.AUTO_REUSE):
feature_maps = self._feature_extractor.extract_features(
preprocessed_inputs, states, state_name,
unroll_length=self._unroll_length, scope=feature_scope)
feature_map_spatial_dims = self._get_feature_map_spatial_dims(feature_maps)
image_shape = shape_utils.combined_static_and_dynamic_shape(
preprocessed_inputs)
self._batch_size = preprocessed_inputs.shape[0].value / self._unroll_length
self._states = states
self._anchors = box_list_ops.concatenate(
self._anchor_generator.generate(
feature_map_spatial_dims,
im_height=image_shape[1],
im_width=image_shape[2]))
prediction_dict = self._box_predictor.predict(
feature_maps, self._anchor_generator.num_anchors_per_location())
# Multiscale_anchor_generator currently has a different dim compared to
# ssd_anchor_generator. Current fix is to check the dim of the box_encodings
# tensor. If dim is not 3 (multiscale_anchor_generator), squeeze the 3rd dim.
# TODO(yinxiao): Remove this check once the anchor generator has unified
# dimension.
if len(prediction_dict['box_encodings'][0].get_shape().as_list()) == 3:
box_encodings = tf.concat(prediction_dict['box_encodings'], axis=1)
else:
box_encodings = tf.squeeze(
tf.concat(prediction_dict['box_encodings'], axis=1), axis=2)
class_predictions_with_background = tf.concat(
prediction_dict['class_predictions_with_background'], axis=1)
predictions_dict = {
'preprocessed_inputs': preprocessed_inputs,
'box_encodings': box_encodings,
'class_predictions_with_background': class_predictions_with_background,
'feature_maps': feature_maps,
'anchors': self._anchors.get(),
'states_and_outputs': self._feature_extractor.states_and_outputs,
}
# In cases such as exporting the model, the states are always zero. Thus, the
# step should be ignored.
if states is not None:
predictions_dict['step'] = self._feature_extractor.step
return predictions_dict
def loss(self, prediction_dict, true_image_shapes, scope=None):
"""Computes scalar loss tensors with respect to provided groundtruth.
Calling this function requires that groundtruth tensors have been
provided via the provide_groundtruth function.
Args:
prediction_dict: a dictionary holding prediction tensors with
1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
box_code_dimension] containing predicted boxes.
2) class_predictions_with_background: 3-D float tensor of shape
[batch_size, num_anchors, num_classes+1] containing class predictions
(logits) for each of the anchors. Note that this tensor *includes*
background class predictions.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
scope: Optional scope name.
Returns:
a dictionary mapping loss keys (`localization_loss` and
`classification_loss`) to scalar tensors representing corresponding loss
values.
"""
with tf.name_scope(scope, 'Loss', prediction_dict.values()):
keypoints = None
if self.groundtruth_has_field(fields.BoxListFields.keypoints):
keypoints = self.groundtruth_lists(fields.BoxListFields.keypoints)
weights = None
if self.groundtruth_has_field(fields.BoxListFields.weights):
weights = self.groundtruth_lists(fields.BoxListFields.weights)
(batch_cls_targets, batch_cls_weights, batch_reg_targets,
batch_reg_weights, match_list) = self._assign_targets(
self.groundtruth_lists(fields.BoxListFields.boxes),
self.groundtruth_lists(fields.BoxListFields.classes),
keypoints, weights)
if self._add_summaries:
self._summarize_target_assignment(
self.groundtruth_lists(fields.BoxListFields.boxes), match_list)
location_losses = self._localization_loss(
prediction_dict['box_encodings'],
batch_reg_targets,
ignore_nan_targets=True,
weights=batch_reg_weights)
cls_losses = ops.reduce_sum_trailing_dimensions(
self._classification_loss(
prediction_dict['class_predictions_with_background'],
batch_cls_targets,
weights=batch_cls_weights),
ndims=2)
if self._hard_example_miner:
(loc_loss_list, cls_loss_list) = self._apply_hard_mining(
location_losses, cls_losses, prediction_dict, match_list)
localization_loss = tf.reduce_sum(tf.stack(loc_loss_list))
classification_loss = tf.reduce_sum(tf.stack(cls_loss_list))
if self._add_summaries:
self._hard_example_miner.summarize()
else:
if self._add_summaries:
class_ids = tf.argmax(batch_cls_targets, axis=2)
flattened_class_ids = tf.reshape(class_ids, [-1])
flattened_classification_losses = tf.reshape(cls_losses, [-1])
self._summarize_anchor_classification_loss(
flattened_class_ids, flattened_classification_losses)
localization_loss = tf.reduce_sum(location_losses)
classification_loss = tf.reduce_sum(cls_losses)
# Optionally normalize by number of positive matches
normalizer = tf.constant(1.0, dtype=tf.float32)
if self._normalize_loss_by_num_matches:
normalizer = tf.maximum(tf.to_float(tf.reduce_sum(batch_reg_weights)),
1.0)
with tf.name_scope('localization_loss'):
localization_loss_normalizer = normalizer
if self._normalize_loc_loss_by_codesize:
localization_loss_normalizer *= self._box_coder.code_size
localization_loss = ((self._localization_loss_weight / (
localization_loss_normalizer)) * localization_loss)
with tf.name_scope('classification_loss'):
classification_loss = ((self._classification_loss_weight / normalizer) *
classification_loss)
loss_dict = {
'localization_loss': localization_loss,
'classification_loss': classification_loss
}
return loss_dict
def restore_map(self, fine_tune_checkpoint_type='lstm'):
"""Returns a map of variables to load from a foreign checkpoint.
See parent class for details.
Args:
fine_tune_checkpoint_type: the type of checkpoint to restore from, either
an SSD/LSTM detection checkpoint (with compatible variable names) or a
classification checkpoint for initialization prior to training.
Available options: `classification`, `detection`, and `lstm`.
Returns:
A dict mapping variable names (to load from a checkpoint) to variables in
the model graph.
Raises:
ValueError: if fine_tune_checkpoint_type is not among
`classification`/`detection`/`lstm`.
"""
if fine_tune_checkpoint_type not in [
'classification', 'detection', 'lstm'
]:
raise ValueError('Not supported fine_tune_checkpoint_type: {}'.format(
fine_tune_checkpoint_type))
variables_to_restore = {}
for variable in tf.global_variables():
var_name = variable.op.name
if 'global_step' in var_name:
continue
# Remove FeatureExtractor prefix for classification checkpoints.
if fine_tune_checkpoint_type == 'classification':
var_name = (
re.split('^' + self._extract_features_scope + '/', var_name)[-1])
# When loading from single frame detection checkpoints, we need to
# remap FeatureMaps variable names.
if ('FeatureMaps' in var_name and
fine_tune_checkpoint_type == 'detection'):
var_name = var_name.replace('FeatureMaps',
self.get_base_network_scope())
variables_to_restore[var_name] = variable
return variables_to_restore
def get_base_network_scope(self):
"""Returns the variable scope of the base network.
Returns:
The variable scope of the feature extractor base network, e.g. MobilenetV1
"""
return self._feature_extractor.get_base_network_scope()
class LSTMFeatureExtractor(ssd_meta_arch.SSDFeatureExtractor):
"""LSTM Meta-architecture Feature Extractor definition."""
@property
def depth_multipliers(self):
return self._depth_multipliers
@depth_multipliers.setter
def depth_multipliers(self, depth_multipliers):
self._depth_multipliers = depth_multipliers
@property
def lstm_state_depth(self):
return self._lstm_state_depth
@lstm_state_depth.setter
def lstm_state_depth(self, lstm_state_depth):
self._lstm_state_depth = lstm_state_depth
@property
def states_and_outputs(self):
"""LSTM states and outputs.
This variable includes both LSTM states {C_t} and outputs {h_t}.
Returns:
states_and_outputs: A list of 4-D float tensors, including the lstm state
and output at each timestep.
"""
return self._states_out
@property
def step(self):
return self._step
def preprocess(self, resized_inputs):
"""SSD preprocessing.
Maps pixel values to the range [-1, 1].
Args:
resized_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
Returns:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
"""
return (2.0 / 255.0) * resized_inputs - 1.0
def get_base_network_scope(self):
"""Returns the variable scope of the base network.
Returns:
The variable scope of the base network, e.g. MobilenetV1
"""
return self._base_network_scope
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Custom RNN decoder."""
from tensorflow.python.ops import variable_scope
def rnn_decoder(decoder_inputs,
initial_state,
cell,
loop_function=None,
scope=None):
"""RNN decoder for the sequence-to-sequence model.
This decoder returns a list of all states, rather than only the final state.
Args:
decoder_inputs: A list of 4D Tensors with shape [batch_size x input_size].
initial_state: 2D Tensor with shape [batch_size x cell.state_size].
cell: rnn_cell.RNNCell defining the cell function and size.
loop_function: If not None, this function will be applied to the i-th output
in order to generate the i+1-st input, and decoder_inputs will be ignored,
except for the first element ("GO" symbol). This can be used for decoding,
but also for training to emulate http://arxiv.org/abs/1506.03099.
Signature -- loop_function(prev, i) = next
* prev is a 2D Tensor of shape [batch_size x output_size],
* i is an integer, the step number (when advanced control is needed),
* next is a 2D Tensor of shape [batch_size x input_size].
scope: VariableScope for the created subgraph; defaults to "rnn_decoder".
Returns:
A tuple of the form (outputs, state), where:
outputs: A list of the same length as decoder_inputs of 4D Tensors with
shape [batch_size x output_size] containing generated outputs.
state: A list of the same length as decoder_inputs of the state of each
cell at each time-step. It is a 2D Tensor of shape
[batch_size x cell.state_size].
"""
with variable_scope.variable_scope(scope or 'rnn_decoder'):
state = initial_state
outputs = []
states = []
prev = None
for i, decoder_input in enumerate(decoder_inputs):
if loop_function is not None and prev is not None:
with variable_scope.variable_scope('loop_function', reuse=True):
decoder_input = loop_function(prev, i)
if i > 0:
variable_scope.get_variable_scope().reuse_variables()
output, state = cell(decoder_input, state)
outputs.append(output)
states.append(state)
if loop_function is not None:
prev = output
return outputs, states
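# Illustrative usage sketch (hypothetical names): unroll a convolutional LSTM
# cell over a short frame sequence.
#   inputs = [frame_0, frame_1, frame_2, frame_3]   # each [batch, H, W, C]
#   init_state = cell.init_state('lstm_state', batch_size, tf.float32)
#   outputs, states = rnn_decoder(inputs, init_state, cell)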
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Class for evaluating video object detections with COCO metrics."""
import tensorflow as tf
from google3.third_party.tensorflow_models.object_detection.core import standard_fields
from google3.third_party.tensorflow_models.object_detection.metrics import coco_evaluation
from google3.third_party.tensorflow_models.object_detection.metrics import coco_tools
class CocoEvaluationAllFrames(coco_evaluation.CocoDetectionEvaluator):
"""Class to evaluate COCO detection metrics for frame sequences.
The class overrides two functions: add_single_ground_truth_image_info and
add_single_detected_image_info.
For video object detection evaluation, this class iterates through the
entire groundtruth_dict so that every frame unrolled in one LSTM training
sample is considered. Both groundtruth and detection results for all frames
are therefore added to the evaluation. This is used when all frames are
labeled in the video object detection training job.
"""
def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
"""Add groundtruth results of all frames to the eval pipeline.
This method overrides the function defined in the base class.
Args:
image_id: A unique string/integer identifier for the image.
groundtruth_dict: A list of dictionaries, each containing -
InputDataFields.groundtruth_boxes: float32 numpy array of shape
[num_boxes, 4] containing `num_boxes` groundtruth boxes of the format
[ymin, xmin, ymax, xmax] in absolute image coordinates.
InputDataFields.groundtruth_classes: integer numpy array of shape
[num_boxes] containing 1-indexed groundtruth classes for the boxes.
InputDataFields.groundtruth_is_crowd (optional): integer numpy array of
shape [num_boxes] containing iscrowd flag for groundtruth boxes.
"""
for idx, gt in enumerate(groundtruth_dict):
if not gt:
continue
image_frame_id = '{}_{}'.format(image_id, idx)
if image_frame_id in self._image_ids:
tf.logging.warning(
'Ignoring ground truth with image id %s since it was '
'previously added', image_frame_id)
continue
self._groundtruth_list.extend(
coco_tools.ExportSingleImageGroundtruthToCoco(
image_id=image_frame_id,
next_annotation_id=self._annotation_id,
category_id_set=self._category_id_set,
groundtruth_boxes=gt[
standard_fields.InputDataFields.groundtruth_boxes],
groundtruth_classes=gt[
standard_fields.InputDataFields.groundtruth_classes]))
self._annotation_id += (
gt[standard_fields.InputDataFields.groundtruth_boxes].shape[0])
# Boolean to indicate whether a detection has been added for this image.
self._image_ids[image_frame_id] = False
def add_single_detected_image_info(self, image_id, detections_dict):
"""Add detection results of all frames to the eval pipeline.
This method overrides the function defined in the base class.
Args:
image_id: A unique string/integer identifier for the image.
detections_dict: A list of dictionaries, each containing -
DetectionResultFields.detection_boxes: float32 numpy array of shape
[num_boxes, 4] containing `num_boxes` detection boxes of the format
[ymin, xmin, ymax, xmax] in absolute image coordinates.
DetectionResultFields.detection_scores: float32 numpy array of shape
[num_boxes] containing detection scores for the boxes.
DetectionResultFields.detection_classes: integer numpy array of shape
[num_boxes] containing 1-indexed detection classes for the boxes.
Raises:
ValueError: If groundtruth for the image_id is not available.
"""
for idx, det in enumerate(detections_dict):
if not det:
continue
image_frame_id = '{}_{}'.format(image_id, idx)
if image_frame_id not in self._image_ids:
raise ValueError(
'Missing groundtruth for image-frame id: {}'.format(image_frame_id))
if self._image_ids[image_frame_id]:
tf.logging.warning(
'Ignoring detection with image id %s since it was '
'previously added', image_frame_id)
continue
self._detection_boxes_list.extend(
coco_tools.ExportSingleImageDetectionBoxesToCoco(
image_id=image_frame_id,
category_id_set=self._category_id_set,
detection_boxes=det[
standard_fields.DetectionResultFields.detection_boxes],
detection_scores=det[
standard_fields.DetectionResultFields.detection_scores],
detection_classes=det[
standard_fields.DetectionResultFields.detection_classes]))
self._image_ids[image_frame_id] = True
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for video_object_detection.metrics.coco_video_evaluation."""
import numpy as np
import tensorflow as tf
from lstm_object_detection.metrics import coco_evaluation_all_frames
from google3.third_party.tensorflow_models.object_detection.core import standard_fields
class CocoEvaluationAllFramesTest(tf.test.TestCase):
def testGroundtruthAndDetectionsDisagreeOnAllFrames(self):
"""Tests that mAP is calculated on several different frame results."""
category_list = [{'id': 0, 'name': 'dog'}, {'id': 1, 'name': 'cat'}]
video_evaluator = coco_evaluation_all_frames.CocoEvaluationAllFrames(
category_list)
video_evaluator.add_single_ground_truth_image_info(
image_id='image1',
groundtruth_dict=[{
standard_fields.InputDataFields.groundtruth_boxes:
np.array([[50., 50., 200., 200.]]),
standard_fields.InputDataFields.groundtruth_classes:
np.array([1])
}, {
standard_fields.InputDataFields.groundtruth_boxes:
np.array([[50., 50., 100., 100.]]),
standard_fields.InputDataFields.groundtruth_classes:
np.array([1])
}])
video_evaluator.add_single_detected_image_info(
image_id='image1',
# The detection disagrees with the groundtruth on every frame except the last.
detections_dict=[{
standard_fields.DetectionResultFields.detection_boxes:
np.array([[100., 100., 200., 200.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1])
}, {
standard_fields.DetectionResultFields.detection_boxes:
np.array([[50., 50., 100., 100.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1])
}])
metrics = video_evaluator.evaluate()
self.assertNotEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
def testGroundtruthAndDetections(self):
"""Tests that mAP is calculated correctly on GT and Detections."""
category_list = [{'id': 0, 'name': 'dog'}, {'id': 1, 'name': 'cat'}]
video_evaluator = coco_evaluation_all_frames.CocoEvaluationAllFrames(
category_list)
video_evaluator.add_single_ground_truth_image_info(
image_id='image1',
groundtruth_dict=[{
standard_fields.InputDataFields.groundtruth_boxes:
np.array([[100., 100., 200., 200.]]),
standard_fields.InputDataFields.groundtruth_classes:
np.array([1])
}])
video_evaluator.add_single_ground_truth_image_info(
image_id='image2',
groundtruth_dict=[{
standard_fields.InputDataFields.groundtruth_boxes:
np.array([[50., 50., 100., 100.]]),
standard_fields.InputDataFields.groundtruth_classes:
np.array([1])
}])
video_evaluator.add_single_ground_truth_image_info(
image_id='image3',
groundtruth_dict=[{
standard_fields.InputDataFields.groundtruth_boxes:
np.array([[50., 100., 100., 120.]]),
standard_fields.InputDataFields.groundtruth_classes:
np.array([1])
}])
video_evaluator.add_single_detected_image_info(
image_id='image1',
detections_dict=[{
standard_fields.DetectionResultFields.detection_boxes:
np.array([[100., 100., 200., 200.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1])
}])
video_evaluator.add_single_detected_image_info(
image_id='image2',
detections_dict=[{
standard_fields.DetectionResultFields.detection_boxes:
np.array([[50., 50., 100., 100.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1])
}])
video_evaluator.add_single_detected_image_info(
image_id='image3',
detections_dict=[{
standard_fields.DetectionResultFields.detection_boxes:
np.array([[50., 100., 100., 120.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1])
}])
metrics = video_evaluator.evaluate()
self.assertAlmostEqual(metrics['DetectionBoxes_Precision/mAP'], 1.0)
def testMissingDetectionResults(self):
"""Tests if groundtrue is missing, raises ValueError."""
category_list = [{'id': 0, 'name': 'dog'}]
video_evaluator = coco_evaluation_all_frames.CocoEvaluationAllFrames(
category_list)
video_evaluator.add_single_ground_truth_image_info(
image_id='image1',
groundtruth_dict=[{
standard_fields.InputDataFields.groundtruth_boxes:
np.array([[100., 100., 200., 200.]]),
standard_fields.InputDataFields.groundtruth_classes:
np.array([1])
}])
with self.assertRaisesRegexp(ValueError,
r'Missing groundtruth for image-frame id:.*'):
video_evaluator.add_single_detected_image_info(
image_id='image3',
detections_dict=[{
standard_fields.DetectionResultFields.detection_boxes:
np.array([[100., 100., 200., 200.]]),
standard_fields.DetectionResultFields.detection_scores:
np.array([.8]),
standard_fields.DetectionResultFields.detection_classes:
np.array([1])
}])
if __name__ == '__main__':
tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A function to build a DetectionModel from configuration."""
from lstm_object_detection.lstm import lstm_meta_arch
from lstm_object_detection.models.lstm_ssd_mobilenet_v1_feature_extractor import LSTMMobileNetV1FeatureExtractor
from google3.third_party.tensorflow_models.object_detection.builders import anchor_generator_builder
from google3.third_party.tensorflow_models.object_detection.builders import box_coder_builder
from google3.third_party.tensorflow_models.object_detection.builders import box_predictor_builder
from google3.third_party.tensorflow_models.object_detection.builders import hyperparams_builder
from google3.third_party.tensorflow_models.object_detection.builders import image_resizer_builder
from google3.third_party.tensorflow_models.object_detection.builders import losses_builder
from google3.third_party.tensorflow_models.object_detection.builders import matcher_builder
from google3.third_party.tensorflow_models.object_detection.builders import model_builder
from google3.third_party.tensorflow_models.object_detection.builders import post_processing_builder
from google3.third_party.tensorflow_models.object_detection.builders import region_similarity_calculator_builder as sim_calc
model_builder.SSD_FEATURE_EXTRACTOR_CLASS_MAP.update({
'lstm_mobilenet_v1': LSTMMobileNetV1FeatureExtractor,
})
SSD_FEATURE_EXTRACTOR_CLASS_MAP = model_builder.SSD_FEATURE_EXTRACTOR_CLASS_MAP
def build(model_config, lstm_config, is_training):
"""Builds a DetectionModel based on the model config.
Args:
model_config: A model.proto object containing the config for the desired
DetectionModel.
lstm_config: LstmModel config proto that specifies LSTM train/eval configs.
is_training: True if this model is being built for training purposes.
Returns:
DetectionModel based on the config.
Raises:
ValueError: On invalid meta architecture or model.
"""
return _build_lstm_model(model_config.ssd, lstm_config, is_training)
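# Illustrative example: eval.py in this change builds the model roughly as:
#   configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
#   model = build(configs['model'], configs['lstm_model'], is_training=False)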
def _build_lstm_feature_extractor(feature_extractor_config,
is_training,
lstm_state_depth,
reuse_weights=None):
"""Builds a ssd_meta_arch.SSDFeatureExtractor based on config.
Args:
feature_extractor_config: A SSDFeatureExtractor proto config from ssd.proto.
is_training: True if this feature extractor is being built for training.
lstm_state_depth: An integer of the depth of the lstm state.
reuse_weights: If the feature extractor should reuse weights.
Returns:
ssd_meta_arch.SSDFeatureExtractor based on config.
Raises:
ValueError: On invalid feature extractor type.
"""
feature_type = feature_extractor_config.type
depth_multiplier = feature_extractor_config.depth_multiplier
min_depth = feature_extractor_config.min_depth
pad_to_multiple = feature_extractor_config.pad_to_multiple
use_explicit_padding = feature_extractor_config.use_explicit_padding
use_depthwise = feature_extractor_config.use_depthwise
conv_hyperparams = hyperparams_builder.build(
feature_extractor_config.conv_hyperparams, is_training)
override_base_feature_extractor_hyperparams = (
feature_extractor_config.override_base_feature_extractor_hyperparams)
if feature_type not in SSD_FEATURE_EXTRACTOR_CLASS_MAP:
raise ValueError('Unknown ssd feature_extractor: {}'.format(feature_type))
feature_extractor_class = SSD_FEATURE_EXTRACTOR_CLASS_MAP[feature_type]
return feature_extractor_class(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, reuse_weights, use_explicit_padding, use_depthwise,
override_base_feature_extractor_hyperparams, lstm_state_depth)
def _build_lstm_model(ssd_config, lstm_config, is_training):
"""Builds an LSTM detection model based on the model config.
Args:
ssd_config: A ssd.proto object containing the config for the desired
LSTMMetaArch.
lstm_config: LstmModel config proto that specifies LSTM train/eval configs.
is_training: True if this model is being built for training purposes.
Returns:
LSTMMetaArch based on the config.
Raises:
ValueError: If ssd_config.type is not recognized (i.e. not registered in
model_class_map), or if lstm_config.interleave_strategy is not recognized.
ValueError: If unroll_length is not specified in the config file.
"""
feature_extractor = _build_lstm_feature_extractor(
ssd_config.feature_extractor, is_training, lstm_config.lstm_state_depth)
box_coder = box_coder_builder.build(ssd_config.box_coder)
matcher = matcher_builder.build(ssd_config.matcher)
region_similarity_calculator = sim_calc.build(
ssd_config.similarity_calculator)
num_classes = ssd_config.num_classes
ssd_box_predictor = box_predictor_builder.build(hyperparams_builder.build,
ssd_config.box_predictor,
is_training, num_classes)
anchor_generator = anchor_generator_builder.build(ssd_config.anchor_generator)
image_resizer_fn = image_resizer_builder.build(ssd_config.image_resizer)
non_max_suppression_fn, score_conversion_fn = post_processing_builder.build(
ssd_config.post_processing)
(classification_loss, localization_loss, classification_weight,
localization_weight, miner, _) = losses_builder.build(ssd_config.loss)
normalize_loss_by_num_matches = ssd_config.normalize_loss_by_num_matches
encode_background_as_zeros = ssd_config.encode_background_as_zeros
negative_class_weight = ssd_config.negative_class_weight
# Extra configs for lstm unroll length.
unroll_length = None
if 'lstm' in ssd_config.feature_extractor.type:
if is_training:
unroll_length = lstm_config.train_unroll_length
else:
unroll_length = lstm_config.eval_unroll_length
if unroll_length is None:
raise ValueError('No unroll length found in the config file')
lstm_model = lstm_meta_arch.LSTMMetaArch(
is_training, anchor_generator, ssd_box_predictor, box_coder,
feature_extractor, matcher, region_similarity_calculator,
encode_background_as_zeros, negative_class_weight, image_resizer_fn,
non_max_suppression_fn, score_conversion_fn, classification_loss,
localization_loss, classification_weight, localization_weight,
normalize_loss_by_num_matches, miner, unroll_length)
return lstm_model
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for video_object_detection.tensorflow.model_builder."""
import tensorflow as tf
from google.protobuf import text_format
from lstm_object_detection import model_builder
from lstm_object_detection.lstm import lstm_meta_arch
from lstm_object_detection.protos import pipeline_pb2 as internal_pipeline_pb2
from google3.third_party.tensorflow_models.object_detection.protos import pipeline_pb2
class ModelBuilderTest(tf.test.TestCase):
def create_model(self, model_config, lstm_config):
"""Builds a DetectionModel based on the model config.
Args:
model_config: A model.proto object containing the config for the desired
DetectionModel.
lstm_config: LstmModel config proto that specifies LSTM train/eval
configs.
Returns:
DetectionModel based on the config.
"""
return model_builder.build(model_config, lstm_config, is_training=True)
def get_model_configs_from_proto(self):
"""Creates a model text proto for testing.
Returns:
A dictionary of model configs.
"""
model_text_proto = """
[object_detection.protos.lstm_model] {
train_unroll_length: 4
eval_unroll_length: 4
}
model {
ssd {
feature_extractor {
type: 'lstm_mobilenet_v1'
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
}
negative_class_weight: 2.0
box_coder {
faster_rcnn_box_coder {
}
}
matcher {
argmax_matcher {
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
aspect_ratios: 1.0
}
}
image_resizer {
fixed_shape_resizer {
height: 320
width: 320
}
}
box_predictor {
convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
}
}
initializer {
truncated_normal_initializer {
}
}
}
}
}
normalize_loc_loss_by_codesize: true
loss {
classification_loss {
weighted_softmax {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
}
}
}"""
pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
text_format.Merge(model_text_proto, pipeline_config)
configs = {}
configs['model'] = pipeline_config.model
configs['lstm_model'] = pipeline_config.Extensions[
internal_pipeline_pb2.lstm_model]
return configs
def test_model_creation_from_valid_configs(self):
configs = self.get_model_configs_from_proto()
# Test model properties.
self.assertEqual(configs['model'].ssd.negative_class_weight, 2.0)
self.assertTrue(configs['model'].ssd.normalize_loc_loss_by_codesize)
self.assertEqual(configs['model'].ssd.feature_extractor.type,
'lstm_mobilenet_v1')
model = self.create_model(configs['model'], configs['lstm_model'])
    # Test architecture type.
self.assertIsInstance(model, lstm_meta_arch.LSTMMetaArch)
# Test LSTM unroll length.
self.assertEqual(model.unroll_length, 4)
def test_model_creation_from_invalid_configs(self):
configs = self.get_model_configs_from_proto()
# Test model build failure with wrong input configs.
with self.assertRaises(AttributeError):
_ = self.create_model(configs['model'], configs['model'])
# Test model builder failure with missing configs.
with self.assertRaises(TypeError):
# pylint: disable=no-value-for-parameter
_ = self.create_model(configs['lstm_model'])
if __name__ == '__main__':
tf.test.main()
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""LSTMFeatureExtractor for MobilenetV1 features."""
import tensorflow as tf
from tensorflow.python.framework import ops as tf_ops
from lstm_object_detection.lstm import lstm_cells
from lstm_object_detection.lstm import lstm_meta_arch
from lstm_object_detection.lstm import rnn_decoder
from google3.third_party.tensorflow_models.object_detection.models import feature_map_generators
from google3.third_party.tensorflow_models.object_detection.utils import context_manager
from google3.third_party.tensorflow_models.object_detection.utils import ops
from google3.third_party.tensorflow_models.object_detection.utils import shape_utils
from nets import mobilenet_v1
slim = tf.contrib.slim
class LSTMMobileNetV1FeatureExtractor(lstm_meta_arch.LSTMFeatureExtractor):
"""LSTM Feature Extractor using MobilenetV1 features."""
def __init__(self,
is_training,
depth_multiplier,
min_depth,
pad_to_multiple,
conv_hyperparams,
reuse_weights=None,
use_explicit_padding=False,
use_depthwise=True,
override_base_feature_extractor_hyperparams=False,
lstm_state_depth=256):
"""Initializes instance of MobileNetV1 Feature Extractor for LSTM Models.
Args:
is_training: A boolean whether the network is in training mode.
depth_multiplier: A float depth multiplier for feature extractor.
min_depth: A number representing minimum feature extractor depth.
pad_to_multiple: The nearest multiple to zero pad the input height and
width dimensions to.
conv_hyperparams: A function to construct tf slim arg_scope for conv2d
and separable_conv2d ops in the layers that are added on top of the
base feature extractor.
reuse_weights: Whether to reuse variables. Default is None.
use_explicit_padding: Whether to use explicit padding when extracting
features. Default is False.
use_depthwise: Whether to use depthwise convolutions. Default is True.
override_base_feature_extractor_hyperparams: Whether to override
hyperparameters of the base feature extractor with the one from
`conv_hyperparams_fn`.
      lstm_state_depth: An integer for the depth of the lstm state.
"""
super(LSTMMobileNetV1FeatureExtractor, self).__init__(
is_training, depth_multiplier, min_depth, pad_to_multiple,
conv_hyperparams, reuse_weights, use_explicit_padding, use_depthwise,
override_base_feature_extractor_hyperparams)
self._feature_map_layout = {
'from_layer': ['Conv2d_13_pointwise_lstm', '', '', '', ''],
'layer_depth': [-1, 512, 256, 256, 128],
'use_explicit_padding': self._use_explicit_padding,
'use_depthwise': self._use_depthwise,
}
self._base_network_scope = 'MobilenetV1'
self._lstm_state_depth = lstm_state_depth
def extract_features(self,
preprocessed_inputs,
state_saver=None,
state_name='lstm_state',
unroll_length=5,
scope=None):
"""Extracts features from preprocessed inputs.
The features include the base network features, lstm features and SSD
    features, organized in the following name scopes:
<parent scope>/MobilenetV1/...
<parent scope>/LSTM/...
<parent scope>/FeatureMaps/...
Args:
preprocessed_inputs: A [batch, height, width, channels] float tensor
representing a batch of consecutive frames from video clips.
state_saver: A state saver object with methods `state` and `save_state`.
state_name: A python string for the name to use with the state_saver.
unroll_length: The number of steps to unroll the lstm.
scope: The scope for the base network of the feature extractor.
Returns:
A list of tensors where the ith tensor has shape [batch, height_i,
width_i, depth_i]
"""
preprocessed_inputs = shape_utils.check_min_image_dim(
33, preprocessed_inputs)
with slim.arg_scope(
mobilenet_v1.mobilenet_v1_arg_scope(is_training=self._is_training)):
with (slim.arg_scope(self._conv_hyperparams_fn())
if self._override_base_feature_extractor_hyperparams else
context_manager.IdentityContextManager()):
with slim.arg_scope([slim.batch_norm], fused=False):
# Base network.
with tf.variable_scope(
scope, self._base_network_scope,
reuse=self._reuse_weights) as scope:
net, image_features = mobilenet_v1.mobilenet_v1_base(
ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
final_endpoint='Conv2d_13_pointwise',
min_depth=self._min_depth,
depth_multiplier=self._depth_multiplier,
scope=scope)
with slim.arg_scope(self._conv_hyperparams_fn()):
with slim.arg_scope(
[slim.batch_norm], fused=False, is_training=self._is_training):
# ConvLSTM layers.
with tf.variable_scope('LSTM', reuse=self._reuse_weights) as lstm_scope:
lstm_cell = lstm_cells.BottleneckConvLSTMCell(
filter_size=(3, 3),
output_size=(net.shape[1].value, net.shape[2].value),
num_units=max(self._min_depth, self._lstm_state_depth),
activation=tf.nn.relu6,
visualize_gates=True)
net_seq = list(tf.split(net, unroll_length))
if state_saver is None:
init_state = lstm_cell.init_state(
state_name, net.shape[0].value / unroll_length, tf.float32)
else:
c = state_saver.state('%s_c' % state_name)
h = state_saver.state('%s_h' % state_name)
init_state = (c, h)
          # Identities added for inputting state tensors externally.
c_ident = tf.identity(init_state[0], name='lstm_state_in_c')
h_ident = tf.identity(init_state[1], name='lstm_state_in_h')
init_state = (c_ident, h_ident)
net_seq, states_out = rnn_decoder.rnn_decoder(
net_seq, init_state, lstm_cell, scope=lstm_scope)
batcher_ops = None
self._states_out = states_out
if state_saver is not None:
self._step = state_saver.state('%s_step' % state_name)
batcher_ops = [
state_saver.save_state('%s_c' % state_name, states_out[-1][0]),
state_saver.save_state('%s_h' % state_name, states_out[-1][1]),
state_saver.save_state('%s_step' % state_name, self._step - 1)
]
with tf_ops.control_dependencies(batcher_ops):
image_features['Conv2d_13_pointwise_lstm'] = tf.concat(net_seq, 0)
# Identities added for reading output states, to be reused externally.
tf.identity(states_out[-1][0], name='lstm_state_out_c')
tf.identity(states_out[-1][1], name='lstm_state_out_h')
# SSD layers.
with tf.variable_scope('FeatureMaps', reuse=self._reuse_weights):
feature_maps = feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=self._feature_map_layout,
depth_multiplier=(self._depth_multiplier),
min_depth=self._min_depth,
insert_1x1_conv=True,
image_features=image_features)
return feature_maps.values()
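# Example usage (an illustrative sketch): running the extractor on a single
# clip of five consecutive frames. The input batch dimension must equal
# num_clips * unroll_length, since the tensor is split into `unroll_length`
# LSTM steps along axis 0. `conv_hyperparams_fn` stands in for an arg_scope
# building function (e.g. from hyperparams_builder); the shapes are assumed
# values for illustration only.
#
#   extractor = LSTMMobileNetV1FeatureExtractor(
#       is_training=True,
#       depth_multiplier=1.0,
#       min_depth=16,
#       pad_to_multiple=1,
#       conv_hyperparams=conv_hyperparams_fn)
#   frames = tf.random_uniform([5, 256, 256, 3])  # One clip of 5 frames.
#   feature_maps = extractor.extract_features(
#       extractor.preprocess(frames), unroll_length=5)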
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for models.lstm_ssd_mobilenet_v1_feature_extractor."""
import numpy as np
import tensorflow as tf
from lstm_object_detection.models import lstm_ssd_mobilenet_v1_feature_extractor as feature_extractor
from google3.third_party.tensorflow_models.object_detection.models import ssd_feature_extractor_test
slim = tf.contrib.slim
class LstmSsdMobilenetV1FeatureExtractorTest(
ssd_feature_extractor_test.SsdFeatureExtractorTestBase):
def _create_feature_extractor(self,
depth_multiplier=1.0,
pad_to_multiple=1,
is_training=True,
use_explicit_padding=False):
"""Constructs a new feature extractor.
Args:
depth_multiplier: A float depth multiplier for feature extractor.
pad_to_multiple: The nearest multiple to zero pad the input height and
width dimensions to.
is_training: A boolean whether the network is in training mode.
use_explicit_padding: A boolean whether to use explicit padding.
Returns:
      An LSTMMobileNetV1FeatureExtractor object.
"""
min_depth = 32
extractor = (
        feature_extractor.LSTMMobileNetV1FeatureExtractor(
is_training,
depth_multiplier,
min_depth,
pad_to_multiple,
self.conv_hyperparams_fn,
use_explicit_padding=use_explicit_padding))
extractor.lstm_state_depth = int(256 * depth_multiplier)
return extractor
def test_extract_features_returns_correct_shapes_256(self):
image_height = 256
image_width = 256
depth_multiplier = 1.0
pad_to_multiple = 1
batch_size = 5
    expected_feature_map_shape = [(batch_size, 8, 8, 256),
                                  (batch_size, 4, 4, 512),
                                  (batch_size, 2, 2, 256),
                                  (batch_size, 1, 1, 256)]
self.check_extract_features_returns_correct_shape(
batch_size,
image_height,
image_width,
depth_multiplier,
pad_to_multiple,
expected_feature_map_shape,
use_explicit_padding=False)
self.check_extract_features_returns_correct_shape(
batch_size,
image_height,
image_width,
depth_multiplier,
pad_to_multiple,
expected_feature_map_shape,
use_explicit_padding=True)
def test_preprocess_returns_correct_value_range(self):
test_image = np.random.rand(5, 128, 128, 3)
feature_extractor = self._create_feature_extractor()
preprocessed_image = feature_extractor.preprocess(test_image)
self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))
def test_variables_only_created_in_scope(self):
scope_name = 'MobilenetV1'
g = tf.Graph()
with g.as_default():
preprocessed_inputs = tf.placeholder(tf.float32, (5, 256, 256, 3))
feature_extractor = self._create_feature_extractor()
feature_extractor.extract_features(preprocessed_inputs)
variables = g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
find_scope = False
for variable in variables:
if scope_name in variable.name:
find_scope = True
break
self.assertTrue(find_scope)
def test_lstm_non_zero_state(self):
init_state = {
'lstm_state_c': tf.zeros([8, 8, 256]),
'lstm_state_h': tf.zeros([8, 8, 256]),
'lstm_state_step': tf.zeros([1])
}
seq = {'test': tf.random_uniform([3, 1, 1, 1])}
stateful_reader = tf.contrib.training.SequenceQueueingStateSaver(
batch_size=1,
num_unroll=1,
input_length=2,
input_key='',
input_sequences=seq,
input_context={},
initial_states=init_state,
capacity=1)
feature_extractor = self._create_feature_extractor()
image = tf.random_uniform([5, 256, 256, 3])
with tf.variable_scope('zero_state'):
feature_map = feature_extractor.extract_features(
image, stateful_reader.next_batch)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
sess.run([stateful_reader.prefetch_op])
_ = sess.run([feature_map])
# Update states with the next batch.
state = sess.run(stateful_reader.next_batch.state('lstm_state_c'))
# State should no longer be zero after update.
self.assertTrue(state.any())
if __name__ == '__main__':
tf.test.main()
syntax = "proto2";
package lstm_object_detection.input_readers;
import "third_party/tensorflow_models/object_detection/protos/input_reader.proto";
message GoogleInputReader {
extend object_detection.protos.ExternalInputReader {
optional GoogleInputReader google_input_reader = 444;
}
oneof input_reader {
TFRecordVideoInputReader tf_record_video_input_reader = 1;
}
}
message TFRecordVideoInputReader {
// Path(s) to tfrecords of input data.
repeated string input_path = 1;
enum DataType {
UNSPECIFIED = 0;
ANNOTATED_IMAGE = 1;
TF_EXAMPLE = 2;
TF_SEQUENCE_EXAMPLE = 3;
}
optional DataType data_type = 2 [default=TF_SEQUENCE_EXAMPLE];
  // Length of the video sequence. All input video sequences should have the
  // same length in frames, e.g. 5 frames.
optional int32 video_length = 3;
}
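// Example usage (an illustrative sketch): in a pipeline config, this reader is
// selected through the external_input_reader extension declared above. The
// extension key is derived from the package and message names in this file;
// the path and video_length are placeholders.
//
//   train_input_reader: {
//     external_input_reader {
//       [lstm_object_detection.input_readers.GoogleInputReader.google_input_reader] {
//         tf_record_video_input_reader: {
//           input_path: "/path/to/sequence_example/tfrecords"
//           data_type: TF_SEQUENCE_EXAMPLE
//           video_length: 4
//         }
//       }
//     }
//   }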
syntax = "proto2";
package object_detection.protos;
import "third_party/tensorflow_models/object_detection/protos/pipeline.proto";
extend TrainEvalPipelineConfig {
optional LstmModel lstm_model = 205743444;
}
// Message for extra fields needed for configuring LSTM model.
message LstmModel {
// Unroll length for training LSTMs.
optional int32 train_unroll_length = 1;
// Unroll length for evaluating LSTMs.
optional int32 eval_unroll_length = 2;
// Depth of the lstm feature map.
optional int32 lstm_state_depth = 3 [default = 256];
}
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""tf.data.Dataset builder.
Creates data sources for DetectionModels from an InputReader config. See
input_reader.proto for options.
Note: If users wish to also use their own InputReaders with the Object
Detection configuration framework, they should define their own builder
function that wraps the build function.
"""
import tensorflow as tf
import tensorflow.google as google_tf
from google3.learning.brain.contrib.slim.data import parallel_reader
from tensorflow.contrib.training.python.training import sequence_queueing_state_saver as sqss
from lstm_object_detection import tf_sequence_example_decoder
from lstm_object_detection.protos import input_reader_google_pb2
from google3.third_party.tensorflow_models.object_detection.core import preprocessor
from google3.third_party.tensorflow_models.object_detection.core import preprocessor_cache
from google3.third_party.tensorflow_models.object_detection.core import standard_fields as fields
from google3.third_party.tensorflow_models.object_detection.protos import input_reader_pb2
from google3.third_party.tensorflow_models.object_detection.utils import ops as util_ops
# TODO(yinxiao): Make the following variable configurable in the proto.
# Padding size for the labeled objects in each frame. Here we assume each
# frame has a total number of objects less than _PADDING_SIZE.
_PADDING_SIZE = 30
def _build_training_batch_dict(batch_sequences_with_states, unroll_length,
batch_size):
"""Builds training batch samples.
Args:
batch_sequences_with_states: A batch_sequences_with_states object.
unroll_length: Unrolled length for LSTM training.
batch_size: Batch size for queue outputs.
Returns:
    A dictionary of unrolled image, groundtruth box and groundtruth class
    tensors, plus the original batch_sequences_with_states object under the
    key 'batch'.
"""
seq_tensors_dict = {
fields.InputDataFields.image: [],
fields.InputDataFields.groundtruth_boxes: [],
fields.InputDataFields.groundtruth_classes: [],
'batch': batch_sequences_with_states,
}
for i in range(unroll_length):
for j in range(batch_size):
filtered_dict = util_ops.filter_groundtruth_with_nan_box_coordinates({
fields.InputDataFields.groundtruth_boxes: (
batch_sequences_with_states.sequences['groundtruth_boxes'][j][i]),
fields.InputDataFields.groundtruth_classes: (
batch_sequences_with_states.sequences['groundtruth_classes'][j][i]
),
})
filtered_dict = util_ops.retain_groundtruth_with_positive_classes(
filtered_dict)
seq_tensors_dict[fields.InputDataFields.image].append(
batch_sequences_with_states.sequences['image'][j][i])
seq_tensors_dict[fields.InputDataFields.groundtruth_boxes].append(
filtered_dict[fields.InputDataFields.groundtruth_boxes])
seq_tensors_dict[fields.InputDataFields.groundtruth_classes].append(
filtered_dict[fields.InputDataFields.groundtruth_classes])
seq_tensors_dict[fields.InputDataFields.image] = tuple(
seq_tensors_dict[fields.InputDataFields.image])
seq_tensors_dict[fields.InputDataFields.groundtruth_boxes] = tuple(
seq_tensors_dict[fields.InputDataFields.groundtruth_boxes])
seq_tensors_dict[fields.InputDataFields.groundtruth_classes] = tuple(
seq_tensors_dict[fields.InputDataFields.groundtruth_classes])
return seq_tensors_dict
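# Shape sketch (assumed unroll_length=U and batch_size=B for illustration): the
# returned dictionary has the form
#   {
#     fields.InputDataFields.image: tuple of U * B image tensors,
#     fields.InputDataFields.groundtruth_boxes: tuple of U * B box tensors,
#     fields.InputDataFields.groundtruth_classes: tuple of U * B class tensors,
#     'batch': the batch_sequences_with_states object passed in,
#   }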
def build(input_reader_config,
model_config,
lstm_config,
unroll_length,
data_augmentation_options=None,
batch_size=1):
"""Builds a tensor dictionary based on the InputReader config.
Args:
input_reader_config: An input_reader_builder.InputReader object.
model_config: A model.proto object containing the config for the desired
DetectionModel.
lstm_config: LSTM specific configs.
unroll_length: Unrolled length for LSTM training.
data_augmentation_options: A list of tuples, where each tuple contains a
data augmentation function and a dictionary containing arguments and their
values (see preprocessor.py).
batch_size: Batch size for queue outputs.
Returns:
A dictionary of tensors based on items in the input_reader_config.
Raises:
ValueError: On invalid input reader proto.
ValueError: If no input paths are specified.
"""
if not isinstance(input_reader_config, input_reader_pb2.InputReader):
raise ValueError('input_reader_config not of type '
'input_reader_pb2.InputReader.')
external_reader_config = input_reader_config.external_input_reader
google_input_reader_config = external_reader_config.Extensions[
input_reader_google_pb2.GoogleInputReader.google_input_reader]
input_reader_type = google_input_reader_config.WhichOneof('input_reader')
if input_reader_type == 'tf_record_video_input_reader':
config = google_input_reader_config.tf_record_video_input_reader
reader_type_class = tf.TFRecordReader
else:
raise ValueError(
'Unsupported reader in input_reader_config: %s' % input_reader_type)
if not config.input_path:
raise ValueError('At least one input path must be specified in '
'`input_reader_config`.')
key, value = parallel_reader.parallel_read(
config.input_path[:], # Convert `RepeatedScalarContainer` to list.
reader_class=reader_type_class,
num_epochs=(input_reader_config.num_epochs
if input_reader_config.num_epochs else None),
num_readers=input_reader_config.num_readers,
shuffle=input_reader_config.shuffle,
dtypes=[tf.string, tf.string],
capacity=input_reader_config.queue_capacity,
min_after_dequeue=input_reader_config.min_after_dequeue)
# TODO(yinxiao): Add loading instance mask option.
decoder = tf_sequence_example_decoder.TfSequenceExampleDecoder()
keys_to_decode = [
fields.InputDataFields.image, fields.InputDataFields.groundtruth_boxes,
fields.InputDataFields.groundtruth_classes
]
tensor_dict = decoder.decode(value, items=keys_to_decode)
tensor_dict['image'].set_shape([None, None, None, 3])
tensor_dict['groundtruth_boxes'].set_shape([None, None, 4])
height = model_config.ssd.image_resizer.fixed_shape_resizer.height
width = model_config.ssd.image_resizer.fixed_shape_resizer.width
# If data augmentation is specified in the config file, the preprocessor
# will be called here to augment the data as specified. Most common
# augmentations include horizontal flip and cropping.
if data_augmentation_options:
images_pre = tf.split(tensor_dict['image'], config.video_length, axis=0)
bboxes_pre = tf.split(
tensor_dict['groundtruth_boxes'], config.video_length, axis=0)
labels_pre = tf.split(
tensor_dict['groundtruth_classes'], config.video_length, axis=0)
images_proc, bboxes_proc, labels_proc = [], [], []
cache = preprocessor_cache.PreprocessorCache()
for i, _ in enumerate(images_pre):
image_dict = {
fields.InputDataFields.image:
images_pre[i],
fields.InputDataFields.groundtruth_boxes:
tf.squeeze(bboxes_pre[i], axis=0),
fields.InputDataFields.groundtruth_classes:
tf.squeeze(labels_pre[i], axis=0),
}
image_dict = preprocessor.preprocess(
image_dict,
data_augmentation_options,
func_arg_map=preprocessor.get_default_func_arg_map(),
preprocess_vars_cache=cache)
# Pads detection count to _PADDING_SIZE.
image_dict[fields.InputDataFields.groundtruth_boxes] = tf.pad(
image_dict[fields.InputDataFields.groundtruth_boxes],
[[0, _PADDING_SIZE], [0, 0]])
image_dict[fields.InputDataFields.groundtruth_boxes] = tf.slice(
image_dict[fields.InputDataFields.groundtruth_boxes], [0, 0],
[_PADDING_SIZE, -1])
image_dict[fields.InputDataFields.groundtruth_classes] = tf.pad(
image_dict[fields.InputDataFields.groundtruth_classes],
[[0, _PADDING_SIZE]])
image_dict[fields.InputDataFields.groundtruth_classes] = tf.slice(
image_dict[fields.InputDataFields.groundtruth_classes], [0],
[_PADDING_SIZE])
images_proc.append(image_dict[fields.InputDataFields.image])
bboxes_proc.append(image_dict[fields.InputDataFields.groundtruth_boxes])
labels_proc.append(image_dict[fields.InputDataFields.groundtruth_classes])
tensor_dict['image'] = tf.concat(images_proc, axis=0)
tensor_dict['groundtruth_boxes'] = tf.stack(bboxes_proc, axis=0)
tensor_dict['groundtruth_classes'] = tf.stack(labels_proc, axis=0)
else:
# Pads detection count to _PADDING_SIZE per frame.
tensor_dict['groundtruth_boxes'] = tf.pad(
tensor_dict['groundtruth_boxes'], [[0, 0], [0, _PADDING_SIZE], [0, 0]])
tensor_dict['groundtruth_boxes'] = tf.slice(
tensor_dict['groundtruth_boxes'], [0, 0, 0], [-1, _PADDING_SIZE, -1])
tensor_dict['groundtruth_classes'] = tf.pad(
tensor_dict['groundtruth_classes'], [[0, 0], [0, _PADDING_SIZE]])
tensor_dict['groundtruth_classes'] = tf.slice(
tensor_dict['groundtruth_classes'], [0, 0], [-1, _PADDING_SIZE])
tensor_dict['image'], _ = preprocessor.resize_image(
tensor_dict['image'], new_height=height, new_width=width)
num_steps = config.video_length / unroll_length
init_states = {
'lstm_state_c':
tf.zeros([height / 32, width / 32, lstm_config.lstm_state_depth]),
'lstm_state_h':
tf.zeros([height / 32, width / 32, lstm_config.lstm_state_depth]),
'lstm_state_step':
tf.constant(num_steps, shape=[]),
}
batch = sqss.batch_sequences_with_states(
input_key=key,
input_sequences=tensor_dict,
input_context={},
input_length=None,
initial_states=init_states,
num_unroll=unroll_length,
batch_size=batch_size,
num_threads=batch_size,
make_keys_unique=True,
capacity=batch_size * batch_size)
return _build_training_batch_dict(batch, unroll_length, batch_size)
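# Example usage (an illustrative sketch): assembling the training input
# pipeline from a parsed TrainEvalPipelineConfig. `preprocessor_builder` is the
# standard Object Detection builder for data augmentation steps; the config
# objects below are assumed to come from the same parsed pipeline as in
# model_builder.
#
#   from google3.third_party.tensorflow_models.object_detection.builders import preprocessor_builder
#
#   train_config = pipeline_config.train_config
#   train_input_config = pipeline_config.train_input_reader
#   lstm_config = pipeline_config.Extensions[internal_pipeline_pb2.lstm_model]
#   data_augmentation_options = [
#       preprocessor_builder.build(step)
#       for step in train_config.data_augmentation_options]
#   tensor_dict = build(
#       input_reader_config=train_input_config,
#       model_config=pipeline_config.model,
#       lstm_config=lstm_config,
#       unroll_length=lstm_config.train_unroll_length,
#       data_augmentation_options=data_augmentation_options,
#       batch_size=train_config.batch_size)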