ModelZoo / ResNet50_tensorflow · Commits

Commit 0d8e49ec
Authored Jul 30, 2018 by Yinxiao Li; committed Oct 24, 2018 by dreamdragon
PiperOrigin-RevId: 206648257
Parent: d7676c1c
Changes: 27

Showing 20 changed files with 2455 additions and 0 deletions (+2455 -0)
research/lstm_object_detection/__init__.py  (+0 / -0)
research/lstm_object_detection/configs/lstm_ssd_mobilenet_v1_imagenet.config  (+232 / -0)
research/lstm_object_detection/eval.py  (+110 / -0)
research/lstm_object_detection/evaluator.py  (+337 / -0)
research/lstm_object_detection/lstm/__init__.py  (+0 / -0)
research/lstm_object_detection/lstm/lstm_cells.py  (+184 / -0)
research/lstm_object_detection/lstm/lstm_meta_arch.py  (+325 / -0)
research/lstm_object_detection/lstm/rnn_decoder.py  (+66 / -0)
research/lstm_object_detection/metrics/__init__.py  (+0 / -0)
research/lstm_object_detection/metrics/coco_evaluation_all_frames.py  (+124 / -0)
research/lstm_object_detection/metrics/coco_evaluation_all_frames_test.py  (+156 / -0)
research/lstm_object_detection/model_builder.py  (+151 / -0)
research/lstm_object_detection/model_builder_test.py  (+158 / -0)
research/lstm_object_detection/models/__init__.py  (+0 / -0)
research/lstm_object_detection/models/lstm_ssd_mobilenet_v1_feature_extractor.py  (+178 / -0)
research/lstm_object_detection/models/lstm_ssd_mobilenet_v1_feature_extractor_test.py  (+139 / -0)
research/lstm_object_detection/protos/__init__.py  (+0 / -0)
research/lstm_object_detection/protos/input_reader_google.proto  (+33 / -0)
research/lstm_object_detection/protos/pipeline.proto  (+21 / -0)
research/lstm_object_detection/seq_dataset_builder.py  (+241 / -0)

research/lstm_object_detection/__init__.py (new empty file, mode 100644)

research/lstm_object_detection/configs/lstm_ssd_mobilenet_v1_imagenet.config (new file, mode 100644)

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# For training on Imagenet Video with LSTM Mobilenet V1
[object_detection.protos.lstm_model] {
  train_unroll_length: 4
  eval_unroll_length: 4
}

model {
  ssd {
    num_classes: 30
    box_coder {
      faster_rcnn_box_coder {
        y_scale: 10.0
        x_scale: 10.0
        height_scale: 5.0
        width_scale: 5.0
      }
    }
    matcher {
      argmax_matcher {
        matched_threshold: 0.5
        unmatched_threshold: 0.5
        ignore_thresholds: false
        negatives_lower_than_unmatched: true
        force_match_for_each_row: true
      }
    }
    similarity_calculator {
      iou_similarity {
      }
    }
    anchor_generator {
      ssd_anchor_generator {
        num_layers: 5
        min_scale: 0.2
        max_scale: 0.95
        aspect_ratios: 1.0
        aspect_ratios: 2.0
        aspect_ratios: 0.5
        aspect_ratios: 3.0
        aspect_ratios: 0.3333
      }
    }
    image_resizer {
      fixed_shape_resizer {
        height: 256
        width: 256
      }
    }
    box_predictor {
      convolutional_box_predictor {
        min_depth: 0
        max_depth: 0
        num_layers_before_predictor: 3
        use_dropout: false
        dropout_keep_probability: 0.8
        kernel_size: 3
        box_code_size: 4
        apply_sigmoid_to_scores: false
        use_depthwise: true
        conv_hyperparams {
          activation: RELU_6,
          regularizer {
            l2_regularizer {
              weight: 0.00004
            }
          }
          initializer {
            truncated_normal_initializer {
              stddev: 0.03
              mean: 0.0
            }
          }
          batch_norm {
            train: true,
            scale: true,
            center: true,
            decay: 0.9997,
            epsilon: 0.001,
          }
        }
      }
    }
    feature_extractor {
      type: 'lstm_mobilenet_v1'
      min_depth: 16
      depth_multiplier: 1.0
      use_depthwise: true
      conv_hyperparams {
        activation: RELU_6,
        regularizer {
          l2_regularizer {
            weight: 0.00004
          }
        }
        initializer {
          truncated_normal_initializer {
            stddev: 0.03
            mean: 0.0
          }
        }
        batch_norm {
          train: true,
          scale: true,
          center: true,
          decay: 0.9997,
          epsilon: 0.001,
        }
      }
    }
    loss {
      classification_loss {
        weighted_sigmoid {
        }
      }
      localization_loss {
        weighted_smooth_l1 {
        }
      }
      hard_example_miner {
        num_hard_examples: 3000
        iou_threshold: 0.99
        loss_type: CLASSIFICATION
        max_negatives_per_positive: 3
        min_negatives_per_image: 0
      }
      classification_weight: 1.0
      localization_weight: 4.0
    }
    normalize_loss_by_num_matches: true
    post_processing {
      batch_non_max_suppression {
        score_threshold: -20.0
        iou_threshold: 0.5
        max_detections_per_class: 100
        max_total_detections: 100
      }
      score_converter: SIGMOID
    }
  }
}

train_config: {
  batch_size: 8
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
  data_augmentation_options {
    ssd_random_crop {
    }
  }
  optimizer {
    use_moving_average: false
    rms_prop_optimizer: {
      learning_rate: {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.002
          decay_steps: 200000
          decay_factor: 0.95
        }
      }
      momentum_optimizer_value: 0.9
      decay: 0.9
      epsilon: 1.0
    }
  }
  from_detection_checkpoint: true
  gradient_clipping_by_norm: 10.0
  batch_queue_capacity: 12
  prefetch_queue_capacity: 4
  fine_tune_checkpoint: "/path/to/checkpoint/"
  fine_tune_checkpoint_type: "detection"
}

train_input_reader: {
  shuffle_buffer_size: 32
  queue_capacity: 12
  prefetch_size: 12
  min_after_dequeue: 4
  label_map_path: "path/to/label_map"
  external_input_reader {
    [lstm_object_detection.input_readers.GoogleInputReader.google_input_reader] {
      tf_record_video_input_reader: {
        input_path: "your/cns/path"
        data_type: TF_SEQUENCE_EXAMPLE
        video_length: 4
      }
    }
  }
}

eval_config: {
  metrics_set: "coco_evaluation_last_frame"
  use_moving_averages: true
  min_score_threshold: 0.5
  max_num_boxes_to_visualize: 300
  visualize_groundtruth_boxes: true
  groundtruth_box_visualization_color: "red"
}

eval_input_reader: {
  label_map_path: "path/to/label_map"
  external_input_reader {
    [lstm_object_detection.input_readers.GoogleInputReader.google_input_reader] {
      tf_record_video_input_reader: {
        input_path: "your/cns/path"
        data_type: TF_SEQUENCE_EXAMPLE
        video_length: 4
      }
    }
  }
  shuffle: true
  num_readers: 1
}
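
The placeholder paths in the config ("/path/to/checkpoint/", "path/to/label_map", "your/cns/path") are meant to be replaced before use. The file is not read directly by the binaries; it is parsed into protos by the config_util helper that eval.py below relies on. The following is a minimal illustrative sketch, not part of the commit, and it assumes lstm_object_detection.utils.config_util exposes the same get_configs_from_pipeline_file interface as the upstream Object Detection API (which is how eval.py uses it):

from lstm_object_detection.utils import config_util

# Parse the pipeline file into a dict of config protos (path is a placeholder).
configs = config_util.get_configs_from_pipeline_file(
    'configs/lstm_ssd_mobilenet_v1_imagenet.config')
model_config = configs['model']        # the model { ssd { ... } } block
lstm_config = configs['lstm_model']    # train/eval unroll lengths (4 above)
eval_config = configs['eval_config']   # metrics_set, visualization options

print(lstm_config.train_unroll_length, model_config.ssd.num_classes)  # 4, 30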

research/lstm_object_detection/eval.py (new file, mode 100644)

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Evaluation executable for detection models.

This executable is used to evaluate DetectionModels. Example usage:
    ./eval \
        --logtostderr \
        --checkpoint_dir=path/to/checkpoint_dir \
        --eval_dir=path/to/eval_dir \
        --pipeline_config_path=pipeline_config.pbtxt
"""

import functools
import os
import tensorflow as tf
from google.protobuf import text_format
from google3.pyglib import app
from google3.pyglib import flags
from lstm_object_detection import evaluator
from lstm_object_detection import model_builder
from lstm_object_detection import seq_dataset_builder
from lstm_object_detection.utils import config_util
from google3.third_party.tensorflow_models.object_detection.utils import label_map_util

tf.logging.set_verbosity(tf.logging.INFO)
flags = tf.app.flags
flags.DEFINE_boolean('eval_training_data', False,
                     'If training data should be evaluated for this job.')
flags.DEFINE_string('checkpoint_dir', '',
                    'Directory containing checkpoints to evaluate, typically '
                    'set to `train_dir` used in the training job.')
flags.DEFINE_string('eval_dir', '', 'Directory to write eval summaries to.')
flags.DEFINE_string('pipeline_config_path', '',
                    'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
                    'file. If provided, other configs are ignored')
flags.DEFINE_boolean('run_once', False, 'Option to only run a single pass of '
                     'evaluation. Overrides the `max_evals` parameter in the '
                     'provided config.')
FLAGS = flags.FLAGS


def main(unused_argv):
  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)

  pipeline_proto = config_util.create_pipeline_proto_from_configs(configs)
  config_text = text_format.MessageToString(pipeline_proto)
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  with tf.gfile.Open(os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                     'wb') as f:
    f.write(config_text)

  model_config = configs['model']
  lstm_config = configs['lstm_model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']

  if FLAGS.eval_training_data:
    input_config.external_input_reader.CopyFrom(
        configs['train_input_config'].external_input_reader)
    lstm_config.eval_unroll_length = lstm_config.train_unroll_length

  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      lstm_config=lstm_config,
      is_training=False)

  def get_next(config, model_config, lstm_config, unroll_length):
    return seq_dataset_builder.build(config, model_config, lstm_config,
                                     unroll_length)

  create_input_dict_fn = functools.partial(get_next, input_config,
                                           model_config, lstm_config,
                                           lstm_config.eval_unroll_length)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  evaluator.evaluate(create_input_dict_fn, model_fn, eval_config, categories,
                     FLAGS.checkpoint_dir, FLAGS.eval_dir)


if __name__ == '__main__':
  app.run()

research/lstm_object_detection/evaluator.py (new file, mode 100644)

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Detection model evaluator.
This file provides a generic evaluation method that can be used to evaluate a
DetectionModel.
"""
import
logging
import
tensorflow
as
tf
from
lstm_object_detection.metrics
import
coco_evaluation_all_frames
from
google3.third_party.tensorflow_models.object_detection
import
eval_util
from
google3.third_party.tensorflow_models.object_detection.core
import
prefetcher
from
google3.third_party.tensorflow_models.object_detection.core
import
standard_fields
as
fields
from
google3.third_party.tensorflow_models.object_detection.metrics
import
coco_evaluation
from
google3.third_party.tensorflow_models.object_detection.utils
import
object_detection_evaluation
# A dictionary of metric names to classes that implement the metric. The classes
# in the dictionary must implement
# utils.object_detection_evaluation.DetectionEvaluator interface.
EVAL_METRICS_CLASS_DICT
=
{
'pascal_voc_detection_metrics'
:
object_detection_evaluation
.
PascalDetectionEvaluator
,
'weighted_pascal_voc_detection_metrics'
:
object_detection_evaluation
.
WeightedPascalDetectionEvaluator
,
'pascal_voc_instance_segmentation_metrics'
:
object_detection_evaluation
.
PascalInstanceSegmentationEvaluator
,
'weighted_pascal_voc_instance_segmentation_metrics'
:
object_detection_evaluation
.
WeightedPascalInstanceSegmentationEvaluator
,
'open_images_detection_metrics'
:
object_detection_evaluation
.
OpenImagesDetectionEvaluator
,
'coco_detection_metrics'
:
coco_evaluation
.
CocoDetectionEvaluator
,
'coco_mask_metrics'
:
coco_evaluation
.
CocoMaskEvaluator
,
'coco_evaluation_all_frames'
:
coco_evaluation_all_frames
.
CocoEvaluationAllFrames
,
}
EVAL_DEFAULT_METRIC
=
'pascal_voc_detection_metrics'
def
_create_detection_op
(
model
,
input_dict
,
batch
):
"""Create detection ops.
Args:
model: model to perform predictions with.
input_dict: A dict holds input data.
batch: batch size for evaluation.
Returns:
Detection tensor ops.
"""
video_tensor
=
tf
.
stack
(
list
(
input_dict
[
fields
.
InputDataFields
.
image
]))
preprocessed_video
,
true_image_shapes
=
model
.
preprocess
(
tf
.
to_float
(
video_tensor
))
if
batch
is
not
None
:
prediction_dict
=
model
.
predict
(
preprocessed_video
,
true_image_shapes
,
batch
)
else
:
prediction_dict
=
model
.
predict
(
preprocessed_video
,
true_image_shapes
)
return
model
.
postprocess
(
prediction_dict
,
true_image_shapes
)
def
_extract_prediction_tensors
(
model
,
create_input_dict_fn
,
ignore_groundtruth
=
False
):
"""Restores the model in a tensorflow session.
Args:
model: model to perform predictions with.
create_input_dict_fn: function to create input tensor dictionaries.
ignore_groundtruth: whether groundtruth should be ignored.
Returns:
tensor_dict: A tensor dictionary with evaluations.
"""
input_dict
=
create_input_dict_fn
()
batch
=
None
if
'batch'
in
input_dict
:
batch
=
input_dict
.
pop
(
'batch'
)
else
:
prefetch_queue
=
prefetcher
.
prefetch
(
input_dict
,
capacity
=
500
)
input_dict
=
prefetch_queue
.
dequeue
()
# consistent format for images and videos
for
key
,
value
in
input_dict
.
iteritems
():
input_dict
[
key
]
=
(
value
,)
detections
=
_create_detection_op
(
model
,
input_dict
,
batch
)
# Print out anaylsis of the model.
tf
.
contrib
.
tfprof
.
model_analyzer
.
print_model_analysis
(
tf
.
get_default_graph
(),
tfprof_options
=
tf
.
contrib
.
tfprof
.
model_analyzer
.
TRAINABLE_VARS_PARAMS_STAT_OPTIONS
)
tf
.
contrib
.
tfprof
.
model_analyzer
.
print_model_analysis
(
tf
.
get_default_graph
(),
tfprof_options
=
tf
.
contrib
.
tfprof
.
model_analyzer
.
FLOAT_OPS_OPTIONS
)
num_frames
=
len
(
input_dict
[
fields
.
InputDataFields
.
image
])
ret
=
[]
for
i
in
range
(
num_frames
):
original_image
=
tf
.
expand_dims
(
input_dict
[
fields
.
InputDataFields
.
image
][
i
],
0
)
groundtruth
=
None
if
not
ignore_groundtruth
:
groundtruth
=
{
fields
.
InputDataFields
.
groundtruth_boxes
:
input_dict
[
fields
.
InputDataFields
.
groundtruth_boxes
][
i
],
fields
.
InputDataFields
.
groundtruth_classes
:
input_dict
[
fields
.
InputDataFields
.
groundtruth_classes
][
i
],
}
optional_keys
=
(
fields
.
InputDataFields
.
groundtruth_area
,
fields
.
InputDataFields
.
groundtruth_is_crowd
,
fields
.
InputDataFields
.
groundtruth_difficult
,
fields
.
InputDataFields
.
groundtruth_group_of
,
)
for
opt_key
in
optional_keys
:
if
opt_key
in
input_dict
:
groundtruth
[
opt_key
]
=
input_dict
[
opt_key
][
i
]
if
fields
.
DetectionResultFields
.
detection_masks
in
detections
:
groundtruth
[
fields
.
InputDataFields
.
groundtruth_instance_masks
]
=
(
input_dict
[
fields
.
InputDataFields
.
groundtruth_instance_masks
][
i
])
detections_frame
=
{
key
:
tf
.
expand_dims
(
value
[
i
],
0
)
for
key
,
value
in
detections
.
iteritems
()
}
source_id
=
(
batch
.
key
[
0
]
if
batch
is
not
None
else
input_dict
[
fields
.
InputDataFields
.
source_id
][
i
])
ret
.
append
(
eval_util
.
result_dict_for_single_example
(
original_image
,
source_id
,
detections_frame
,
groundtruth
,
class_agnostic
=
(
fields
.
DetectionResultFields
.
detection_classes
not
in
detections
),
scale_to_absolute
=
True
))
return
ret
def
get_evaluators
(
eval_config
,
categories
):
"""Returns the evaluator class according to eval_config, valid for categories.
Args:
eval_config: evaluation configurations.
categories: a list of categories to evaluate.
Returns:
An list of instances of DetectionEvaluator.
Raises:
ValueError: if metric is not in the metric class dictionary.
"""
eval_metric_fn_keys
=
eval_config
.
metrics_set
if
not
eval_metric_fn_keys
:
eval_metric_fn_keys
=
[
EVAL_DEFAULT_METRIC
]
evaluators_list
=
[]
for
eval_metric_fn_key
in
eval_metric_fn_keys
:
if
eval_metric_fn_key
not
in
EVAL_METRICS_CLASS_DICT
:
raise
ValueError
(
'Metric not found: {}'
.
format
(
eval_metric_fn_key
))
else
:
evaluators_list
.
append
(
EVAL_METRICS_CLASS_DICT
[
eval_metric_fn_key
](
categories
=
categories
))
return
evaluators_list
def
evaluate
(
create_input_dict_fn
,
create_model_fn
,
eval_config
,
categories
,
checkpoint_dir
,
eval_dir
,
graph_hook_fn
=
None
):
"""Evaluation function for detection models.
Args:
create_input_dict_fn: a function to create a tensor input dictionary.
create_model_fn: a function that creates a DetectionModel.
eval_config: a eval_pb2.EvalConfig protobuf.
categories: a list of category dictionaries. Each dict in the list should
have an integer 'id' field and string 'name' field.
checkpoint_dir: directory to load the checkpoints to evaluate from.
eval_dir: directory to write evaluation metrics summary to.
graph_hook_fn: Optional function that is called after the training graph is
completely built. This is helpful to perform additional changes to the
training graph such as optimizing batchnorm. The function should modify
the default graph.
Returns:
metrics: A dictionary containing metric names and values from the latest
run.
"""
model
=
create_model_fn
()
if
eval_config
.
ignore_groundtruth
and
not
eval_config
.
export_path
:
logging
.
fatal
(
'If ignore_groundtruth=True then an export_path is '
'required. Aborting!!!'
)
tensor_dicts
=
_extract_prediction_tensors
(
model
=
model
,
create_input_dict_fn
=
create_input_dict_fn
,
ignore_groundtruth
=
eval_config
.
ignore_groundtruth
)
def
_process_batch
(
tensor_dicts
,
sess
,
batch_index
,
counters
,
losses_dict
=
None
):
"""Evaluates tensors in tensor_dicts, visualizing the first K examples.
This function calls sess.run on tensor_dicts, evaluating the original_image
tensor only on the first K examples and visualizing detections overlaid
on this original_image.
Args:
tensor_dicts: a dictionary of tensors
sess: tensorflow session
batch_index: the index of the batch amongst all batches in the run.
counters: a dictionary holding 'success' and 'skipped' fields which can
be updated to keep track of number of successful and failed runs,
respectively. If these fields are not updated, then the success/skipped
counter values shown at the end of evaluation will be incorrect.
losses_dict: Optional dictonary of scalar loss tensors. Necessary only
for matching function signiture in third_party eval_util.py.
Returns:
result_dict: a dictionary of numpy arrays
result_losses_dict: a dictionary of scalar losses. This is empty if input
losses_dict is None. Necessary only for matching function signiture in
third_party eval_util.py.
"""
if
batch_index
%
10
==
0
:
logging
.
info
(
'Running eval ops batch %d'
,
batch_index
)
if
not
losses_dict
:
losses_dict
=
{}
try
:
result_dicts
,
result_losses_dict
=
sess
.
run
([
tensor_dicts
,
losses_dict
])
counters
[
'success'
]
+=
1
except
tf
.
errors
.
InvalidArgumentError
:
logging
.
info
(
'Skipping image'
)
counters
[
'skipped'
]
+=
1
return
{}
num_images
=
len
(
tensor_dicts
)
for
i
in
range
(
num_images
):
result_dict
=
result_dicts
[
i
]
global_step
=
tf
.
train
.
global_step
(
sess
,
tf
.
train
.
get_global_step
())
tag
=
'image-%d'
%
(
batch_index
*
num_images
+
i
)
if
batch_index
<
eval_config
.
num_visualizations
/
num_images
:
eval_util
.
visualize_detection_results
(
result_dict
,
tag
,
global_step
,
categories
=
categories
,
summary_dir
=
eval_dir
,
export_dir
=
eval_config
.
visualization_export_dir
,
show_groundtruth
=
eval_config
.
visualize_groundtruth_boxes
,
groundtruth_box_visualization_color
=
eval_config
.
groundtruth_box_visualization_color
,
min_score_thresh
=
eval_config
.
min_score_threshold
,
max_num_predictions
=
eval_config
.
max_num_boxes_to_visualize
,
skip_scores
=
eval_config
.
skip_scores
,
skip_labels
=
eval_config
.
skip_labels
,
keep_image_id_for_visualization_export
=
eval_config
.
keep_image_id_for_visualization_export
)
if
num_images
>
1
:
return
result_dicts
,
result_losses_dict
else
:
return
result_dicts
[
0
],
result_losses_dict
variables_to_restore
=
tf
.
global_variables
()
global_step
=
tf
.
train
.
get_or_create_global_step
()
variables_to_restore
.
append
(
global_step
)
if
graph_hook_fn
:
graph_hook_fn
()
if
eval_config
.
use_moving_averages
:
variable_averages
=
tf
.
train
.
ExponentialMovingAverage
(
0.0
)
variables_to_restore
=
variable_averages
.
variables_to_restore
()
for
key
in
variables_to_restore
.
keys
():
if
'moving_mean'
in
key
:
variables_to_restore
[
key
.
replace
(
'moving_mean'
,
'moving_mean/ExponentialMovingAverage'
)]
=
(
variables_to_restore
[
key
])
del
variables_to_restore
[
key
]
if
'moving_variance'
in
key
:
variables_to_restore
[
key
.
replace
(
'moving_variance'
,
'moving_variance/ExponentialMovingAverage'
)]
=
(
variables_to_restore
[
key
])
del
variables_to_restore
[
key
]
saver
=
tf
.
train
.
Saver
(
variables_to_restore
)
def
_restore_latest_checkpoint
(
sess
):
latest_checkpoint
=
tf
.
train
.
latest_checkpoint
(
checkpoint_dir
)
saver
.
restore
(
sess
,
latest_checkpoint
)
metrics
=
eval_util
.
repeated_checkpoint_run
(
tensor_dict
=
tensor_dicts
,
summary_dir
=
eval_dir
,
evaluators
=
get_evaluators
(
eval_config
,
categories
),
batch_processor
=
_process_batch
,
checkpoint_dirs
=
[
checkpoint_dir
],
variables_to_restore
=
None
,
restore_fn
=
_restore_latest_checkpoint
,
num_batches
=
eval_config
.
num_examples
,
eval_interval_secs
=
eval_config
.
eval_interval_secs
,
max_number_of_evaluations
=
(
1
if
eval_config
.
ignore_groundtruth
else
eval_config
.
max_evals
if
eval_config
.
max_evals
else
None
),
master
=
eval_config
.
eval_master
,
save_graph
=
eval_config
.
save_graph
,
save_graph_dir
=
(
eval_dir
if
eval_config
.
save_graph
else
''
))
return
metrics

research/lstm_object_detection/lstm/__init__.py (new empty file, mode 100644)

research/lstm_object_detection/lstm/lstm_cells.py (new file, mode 100644)

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""BottleneckConvLSTMCell implementation."""
import
google3
import
tensorflow.google
as
tf
import
google3.learning.brain.contrib.slim
as
slim
from
tensorflow.contrib.framework.python.ops
import
variables
_batch_norm
=
tf
.
contrib
.
layers
.
batch_norm
class
BottleneckConvLSTMCell
(
tf
.
contrib
.
rnn
.
RNNCell
):
"""Basic LSTM recurrent network cell using separable convolutions.
The implementation is based on: http://arxiv.org/abs/1409.2329.
We add forget_bias (default: 1) to the biases of the forget gate in order to
reduce the scale of forgetting in the beginning of the training.
This LSTM first projects inputs to the size of the output before doing gate
computations. This saves params unless the input is less than a third of the
state size channel-wise.
"""
def
__init__
(
self
,
filter_size
,
output_size
,
num_units
,
forget_bias
=
1.0
,
activation
=
tf
.
tanh
,
flattened_state
=
False
,
visualize_gates
=
True
):
"""Initializes the basic LSTM cell.
Args:
filter_size: collection, conv filter size
output_size: collection, the width/height dimensions of the cell/output
num_units: int, The number of channels in the LSTM cell.
forget_bias: float, The bias added to forget gates (see above).
activation: Activation function of the inner states.
flattened_state: if True, state tensor will be flattened and stored as
a 2-d tensor. Use for exporting the model to tfmini
visualize_gates: if True, add histogram summaries of all gates
and outputs to tensorboard
"""
self
.
_filter_size
=
list
(
filter_size
)
self
.
_output_size
=
list
(
output_size
)
self
.
_num_units
=
num_units
self
.
_forget_bias
=
forget_bias
self
.
_activation
=
activation
self
.
_viz_gates
=
visualize_gates
self
.
_flattened_state
=
flattened_state
self
.
_param_count
=
self
.
_num_units
for
dim
in
self
.
_output_size
:
self
.
_param_count
*=
dim
@
property
def
state_size
(
self
):
return
tf
.
contrib
.
rnn
.
LSTMStateTuple
(
self
.
_output_size
+
[
self
.
_num_units
],
self
.
_output_size
+
[
self
.
_num_units
])
@
property
def
state_size_flat
(
self
):
return
tf
.
contrib
.
rnn
.
LSTMStateTuple
([
self
.
_param_count
],
[
self
.
_param_count
])
@
property
def
output_size
(
self
):
return
self
.
_output_size
+
[
self
.
_num_units
]
def
__call__
(
self
,
inputs
,
state
,
scope
=
None
):
"""Long short-term memory cell (LSTM) with bottlenecking.
Args:
inputs: Input tensor at the current timestep.
state: Tuple of tensors, the state and output at the previous timestep.
scope: Optional scope.
Returns:
A tuple where the first element is the LSTM output and the second is
a LSTMStateTuple of the state at the current timestep.
"""
scope
=
scope
or
'conv_lstm_cell'
with
tf
.
variable_scope
(
scope
):
c
,
h
=
state
# unflatten state if neccesary
if
self
.
_flattened_state
:
c
=
tf
.
reshape
(
c
,
[
-
1
]
+
self
.
output_size
)
h
=
tf
.
reshape
(
h
,
[
-
1
]
+
self
.
output_size
)
# summary of input passed into cell
if
self
.
_viz_gates
:
slim
.
summaries
.
add_histogram_summary
(
inputs
,
'cell_input'
)
bottleneck
=
tf
.
contrib
.
layers
.
separable_conv2d
(
tf
.
concat
([
inputs
,
h
],
3
),
self
.
_num_units
,
self
.
_filter_size
,
depth_multiplier
=
1
,
activation_fn
=
self
.
_activation
,
normalizer_fn
=
None
,
scope
=
'bottleneck'
)
if
self
.
_viz_gates
:
slim
.
summaries
.
add_histogram_summary
(
bottleneck
,
'bottleneck'
)
concat
=
tf
.
contrib
.
layers
.
separable_conv2d
(
bottleneck
,
4
*
self
.
_num_units
,
self
.
_filter_size
,
depth_multiplier
=
1
,
activation_fn
=
None
,
normalizer_fn
=
None
,
scope
=
'gates'
)
i
,
j
,
f
,
o
=
tf
.
split
(
concat
,
4
,
3
)
new_c
=
(
c
*
tf
.
sigmoid
(
f
+
self
.
_forget_bias
)
+
tf
.
sigmoid
(
i
)
*
self
.
_activation
(
j
))
new_h
=
self
.
_activation
(
new_c
)
*
tf
.
sigmoid
(
o
)
# summary of cell output and new state
if
self
.
_viz_gates
:
slim
.
summaries
.
add_histogram_summary
(
new_h
,
'cell_output'
)
slim
.
summaries
.
add_histogram_summary
(
new_c
,
'cell_state'
)
# reflatten state to store it
if
self
.
_flattened_state
:
new_c
=
tf
.
reshape
(
new_c
,
[
-
1
,
self
.
_param_count
])
new_h
=
tf
.
reshape
(
new_h
,
[
-
1
,
self
.
_param_count
])
return
new_h
,
tf
.
contrib
.
rnn
.
LSTMStateTuple
(
new_c
,
new_h
if
self
.
_flattened_state
else
new_h
)
def
init_state
(
self
,
state_name
,
batch_size
,
dtype
,
learned_state
=
False
):
"""Creates an initial state compatible with this cell.
Args:
state_name: name of the state tensor
batch_size: model batch size
dtype: dtype for the tensor values i.e. tf.float32
learned_state: whether the initial state should be learnable. If false,
the initial state is set to all 0's
Returns:
The created initial state.
"""
state_size
=
(
self
.
state_size_flat
if
self
.
_flattened_state
else
self
.
state_size
)
# list of 2 zero tensors or variables tensors, depending on if
# learned_state is true
ret_flat
=
[(
variables
.
model_variable
(
state_name
+
str
(
i
),
shape
=
s
,
dtype
=
dtype
,
initializer
=
tf
.
truncated_normal_initializer
(
stddev
=
0.03
))
if
learned_state
else
tf
.
zeros
(
[
batch_size
]
+
s
,
dtype
=
dtype
,
name
=
state_name
))
for
i
,
s
in
enumerate
(
state_size
)]
# duplicates initial state across the batch axis if it's learned
if
learned_state
:
ret_flat
=
[
tf
.
stack
([
tensor
for
i
in
range
(
int
(
batch_size
))])
for
tensor
in
ret_flat
]
for
s
,
r
in
zip
(
state_size
,
ret_flat
):
r
.
set_shape
([
None
]
+
s
)
return
tf
.
nest
.
pack_sequence_as
(
structure
=
[
1
,
1
],
flat_sequence
=
ret_flat
)
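
To make the shapes concrete, here is a small illustrative sketch (not part of the commit) that drives BottleneckConvLSTMCell on a dummy feature map; the sizes are arbitrary and it assumes a TF 1.x environment where tf.contrib is available, as in the module above:

import tensorflow as tf
from lstm_object_detection.lstm import lstm_cells

batch_size, height, width, depth = 2, 8, 8, 32
cell = lstm_cells.BottleneckConvLSTMCell(
    filter_size=(3, 3),            # conv filter size for bottleneck and gates
    output_size=(height, width),   # spatial dims of the cell state/output
    num_units=64,                  # channels in the LSTM state
    visualize_gates=False)

inputs = tf.zeros([batch_size, height, width, depth])
state = cell.init_state('lstm_state', batch_size, tf.float32)
output, new_state = cell(inputs, state)  # output shape: [2, 8, 8, 64]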

research/lstm_object_detection/lstm/lstm_meta_arch.py (new file, mode 100644)

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""LSTM Meta-architecture definition.
General tensorflow implementation of convolutional Multibox/SSD detection
models with LSTM states, for use on video data.
See https://arxiv.org/abs/1711.06368 for details.
"""
import
re
import
tensorflow
as
tf
from
google3.third_party.tensorflow_models.object_detection.core
import
box_list_ops
from
google3.third_party.tensorflow_models.object_detection.core
import
standard_fields
as
fields
from
google3.third_party.tensorflow_models.object_detection.meta_architectures
import
ssd_meta_arch
from
google3.third_party.tensorflow_models.object_detection.utils
import
ops
from
google3.third_party.tensorflow_models.object_detection.utils
import
shape_utils
slim
=
tf
.
contrib
.
slim
class
LSTMMetaArch
(
ssd_meta_arch
.
SSDMetaArch
):
"""LSTM Meta-architecture definition."""
def
__init__
(
self
,
is_training
,
anchor_generator
,
box_predictor
,
box_coder
,
feature_extractor
,
matcher
,
region_similarity_calculator
,
encode_background_as_zeros
,
negative_class_weight
,
image_resizer_fn
,
non_max_suppression_fn
,
score_conversion_fn
,
classification_loss
,
localization_loss
,
classification_loss_weight
,
localization_loss_weight
,
normalize_loss_by_num_matches
,
hard_example_miner
,
unroll_length
,
add_summaries
=
True
):
super
(
LSTMMetaArch
,
self
).
__init__
(
is_training
,
anchor_generator
,
box_predictor
,
box_coder
,
feature_extractor
,
matcher
,
region_similarity_calculator
,
encode_background_as_zeros
,
negative_class_weight
,
image_resizer_fn
,
non_max_suppression_fn
,
score_conversion_fn
,
classification_loss
,
localization_loss
,
classification_loss_weight
,
localization_loss_weight
,
normalize_loss_by_num_matches
,
hard_example_miner
,
add_summaries
)
self
.
_unroll_length
=
unroll_length
@
property
def
unroll_length
(
self
):
return
self
.
_unroll_length
@
unroll_length
.
setter
def
unroll_length
(
self
,
unroll_length
):
self
.
_unroll_length
=
unroll_length
def
predict
(
self
,
preprocessed_inputs
,
true_image_shapes
,
states
=
None
,
state_name
=
'lstm_state'
,
feature_scope
=
None
):
with
tf
.
variable_scope
(
self
.
_extract_features_scope
,
values
=
[
preprocessed_inputs
],
reuse
=
tf
.
AUTO_REUSE
):
feature_maps
=
self
.
_feature_extractor
.
extract_features
(
preprocessed_inputs
,
states
,
state_name
,
unroll_length
=
self
.
_unroll_length
,
scope
=
feature_scope
)
feature_map_spatial_dims
=
self
.
_get_feature_map_spatial_dims
(
feature_maps
)
image_shape
=
shape_utils
.
combined_static_and_dynamic_shape
(
preprocessed_inputs
)
self
.
_batch_size
=
preprocessed_inputs
.
shape
[
0
].
value
/
self
.
_unroll_length
self
.
_states
=
states
self
.
_anchors
=
box_list_ops
.
concatenate
(
self
.
_anchor_generator
.
generate
(
feature_map_spatial_dims
,
im_height
=
image_shape
[
1
],
im_width
=
image_shape
[
2
]))
prediction_dict
=
self
.
_box_predictor
.
predict
(
feature_maps
,
self
.
_anchor_generator
.
num_anchors_per_location
())
# Multiscale_anchor_generator currently has a different dim compared to
# ssd_anchor_generator. Current fix is to check the dim of the box_encodings
# tensor. If dim is not 3(multiscale_anchor_generator), squeeze the 3rd dim.
# TODO(yinxiao): Remove this check once the anchor generator has unified
# dimension.
if
len
(
prediction_dict
[
'box_encodings'
][
0
].
get_shape
().
as_list
())
==
3
:
box_encodings
=
tf
.
concat
(
prediction_dict
[
'box_encodings'
],
axis
=
1
)
else
:
box_encodings
=
tf
.
squeeze
(
tf
.
concat
(
prediction_dict
[
'box_encodings'
],
axis
=
1
),
axis
=
2
)
class_predictions_with_background
=
tf
.
concat
(
prediction_dict
[
'class_predictions_with_background'
],
axis
=
1
)
predictions_dict
=
{
'preprocessed_inputs'
:
preprocessed_inputs
,
'box_encodings'
:
box_encodings
,
'class_predictions_with_background'
:
class_predictions_with_background
,
'feature_maps'
:
feature_maps
,
'anchors'
:
self
.
_anchors
.
get
(),
'states_and_outputs'
:
self
.
_feature_extractor
.
states_and_outputs
,
}
# In cases such as exporting the model, the states is always zero. Thus the
# step should be ignored.
if
states
is
not
None
:
predictions_dict
[
'step'
]
=
self
.
_feature_extractor
.
step
return
predictions_dict
def
loss
(
self
,
prediction_dict
,
true_image_shapes
,
scope
=
None
):
"""Computes scalar loss tensors with respect to provided groundtruth.
Calling this function requires that groundtruth tensors have been
provided via the provide_groundtruth function.
Args:
prediction_dict: a dictionary holding prediction tensors with
1) box_encodings: 3-D float tensor of shape [batch_size, num_anchors,
box_code_dimension] containing predicted boxes.
2) class_predictions_with_background: 3-D float tensor of shape
[batch_size, num_anchors, num_classes+1] containing class predictions
(logits) for each of the anchors. Note that this tensor *includes*
background class predictions.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is
of the form [height, width, channels] indicating the shapes
of true images in the resized images, as resized images can be padded
with zeros.
scope: Optional scope name.
Returns:
a dictionary mapping loss keys (`localization_loss` and
`classification_loss`) to scalar tensors representing corresponding loss
values.
"""
with
tf
.
name_scope
(
scope
,
'Loss'
,
prediction_dict
.
values
()):
keypoints
=
None
if
self
.
groundtruth_has_field
(
fields
.
BoxListFields
.
keypoints
):
keypoints
=
self
.
groundtruth_lists
(
fields
.
BoxListFields
.
keypoints
)
weights
=
None
if
self
.
groundtruth_has_field
(
fields
.
BoxListFields
.
weights
):
weights
=
self
.
groundtruth_lists
(
fields
.
BoxListFields
.
weights
)
(
batch_cls_targets
,
batch_cls_weights
,
batch_reg_targets
,
batch_reg_weights
,
match_list
)
=
self
.
_assign_targets
(
self
.
groundtruth_lists
(
fields
.
BoxListFields
.
boxes
),
self
.
groundtruth_lists
(
fields
.
BoxListFields
.
classes
),
keypoints
,
weights
)
if
self
.
_add_summaries
:
self
.
_summarize_target_assignment
(
self
.
groundtruth_lists
(
fields
.
BoxListFields
.
boxes
),
match_list
)
location_losses
=
self
.
_localization_loss
(
prediction_dict
[
'box_encodings'
],
batch_reg_targets
,
ignore_nan_targets
=
True
,
weights
=
batch_reg_weights
)
cls_losses
=
ops
.
reduce_sum_trailing_dimensions
(
self
.
_classification_loss
(
prediction_dict
[
'class_predictions_with_background'
],
batch_cls_targets
,
weights
=
batch_cls_weights
),
ndims
=
2
)
if
self
.
_hard_example_miner
:
(
loc_loss_list
,
cls_loss_list
)
=
self
.
_apply_hard_mining
(
location_losses
,
cls_losses
,
prediction_dict
,
match_list
)
localization_loss
=
tf
.
reduce_sum
(
tf
.
stack
(
loc_loss_list
))
classification_loss
=
tf
.
reduce_sum
(
tf
.
stack
(
cls_loss_list
))
if
self
.
_add_summaries
:
self
.
_hard_example_miner
.
summarize
()
else
:
if
self
.
_add_summaries
:
class_ids
=
tf
.
argmax
(
batch_cls_targets
,
axis
=
2
)
flattened_class_ids
=
tf
.
reshape
(
class_ids
,
[
-
1
])
flattened_classification_losses
=
tf
.
reshape
(
cls_losses
,
[
-
1
])
self
.
_summarize_anchor_classification_loss
(
flattened_class_ids
,
flattened_classification_losses
)
localization_loss
=
tf
.
reduce_sum
(
location_losses
)
classification_loss
=
tf
.
reduce_sum
(
cls_losses
)
# Optionally normalize by number of positive matches
normalizer
=
tf
.
constant
(
1.0
,
dtype
=
tf
.
float32
)
if
self
.
_normalize_loss_by_num_matches
:
normalizer
=
tf
.
maximum
(
tf
.
to_float
(
tf
.
reduce_sum
(
batch_reg_weights
)),
1.0
)
with
tf
.
name_scope
(
'localization_loss'
):
localization_loss_normalizer
=
normalizer
if
self
.
_normalize_loc_loss_by_codesize
:
localization_loss_normalizer
*=
self
.
_box_coder
.
code_size
localization_loss
=
((
self
.
_localization_loss_weight
/
(
localization_loss_normalizer
))
*
localization_loss
)
with
tf
.
name_scope
(
'classification_loss'
):
classification_loss
=
((
self
.
_classification_loss_weight
/
normalizer
)
*
classification_loss
)
loss_dict
=
{
'localization_loss'
:
localization_loss
,
'classification_loss'
:
classification_loss
}
return
loss_dict
def
restore_map
(
self
,
fine_tune_checkpoint_type
=
'lstm'
):
"""Returns a map of variables to load from a foreign checkpoint.
See parent class for details.
Args:
fine_tune_checkpoint_type: the type of checkpoint to restore from, either
SSD/LSTM detection checkpoint (with compatible variable names)
classification checkpoint for initialization prior to training.
Available options: `classification`, `detection`, `interleaved`,
and `lstm`.
Returns:
A dict mapping variable names (to load from a checkpoint) to variables in
the model graph.
Raises:
ValueError: if fine_tune_checkpoint_type is not among
`classification`/`detection`/`interleaved`/`lstm`.
"""
if
fine_tune_checkpoint_type
not
in
[
'classification'
,
'detection'
,
'lstm'
]:
raise
ValueError
(
'Not supported fine_tune_checkpoint_type: {}'
.
format
(
fine_tune_checkpoint_type
))
variables_to_restore
=
{}
for
variable
in
tf
.
global_variables
():
var_name
=
variable
.
op
.
name
if
'global_step'
in
var_name
:
continue
# Remove FeatureExtractor prefix for classification checkpoints.
if
fine_tune_checkpoint_type
==
'classification'
:
var_name
=
(
re
.
split
(
'^'
+
self
.
_extract_features_scope
+
'/'
,
var_name
)[
-
1
])
# When loading from single frame detection checkpoints, we need to
# remap FeatureMaps variable names.
if
(
'FeatureMaps'
in
var_name
and
fine_tune_checkpoint_type
==
'detection'
):
var_name
=
var_name
.
replace
(
'FeatureMaps'
,
self
.
get_base_network_scope
())
variables_to_restore
[
var_name
]
=
variable
return
variables_to_restore
def
get_base_network_scope
(
self
):
"""Returns the variable scope of the base network.
Returns:
The variable scope of the feature extractor base network, e.g. MobilenetV1
"""
return
self
.
_feature_extractor
.
get_base_network_scope
()
class
LSTMFeatureExtractor
(
ssd_meta_arch
.
SSDFeatureExtractor
):
"""LSTM Meta-architecture Feature Extractor definition."""
@
property
def
depth_multipliers
(
self
):
return
self
.
_depth_multipliers
@
depth_multipliers
.
setter
def
depth_multipliers
(
self
,
depth_multipliers
):
self
.
_depth_multipliers
=
depth_multipliers
@
property
def
lstm_state_depth
(
self
):
return
self
.
_lstm_state_depth
@
lstm_state_depth
.
setter
def
lstm_state_depth
(
self
,
lstm_state_depth
):
self
.
_lstm_state_depth
=
lstm_state_depth
@
property
def
states_and_outputs
(
self
):
"""LSTM states and outputs.
This variable includes both LSTM states {C_t} and outputs {h_t}.
Returns:
states_and_outputs: A list of 4-D float tensors, including the lstm state
and output at each timestep.
"""
return
self
.
_states_out
@
property
def
step
(
self
):
return
self
.
_step
def
preprocess
(
self
,
resized_inputs
):
"""SSD preprocessing.
Maps pixel values to the range [-1, 1].
Args:
resized_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
Returns:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
"""
return
(
2.0
/
255.0
)
*
resized_inputs
-
1.0
def
get_base_network_scope
(
self
):
"""Returns the variable scope of the base network.
Returns:
The variable scope of the base network, e.g. MobilenetV1
"""
return
self
.
_base_network_scope

research/lstm_object_detection/lstm/rnn_decoder.py (new file, mode 100644)

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Custom RNN decoder."""
from
tensorflow.python.ops
import
variable_scope
def
rnn_decoder
(
decoder_inputs
,
initial_state
,
cell
,
loop_function
=
None
,
scope
=
None
):
"""RNN decoder for the sequence-to-sequence model.
This decoder returns a list of all states, rather than only the final state.
Args:
decoder_inputs: A list of 4D Tensors with shape [batch_size x input_size].
initial_state: 2D Tensor with shape [batch_size x cell.state_size].
cell: rnn_cell.RNNCell defining the cell function and size.
loop_function: If not None, this function will be applied to the i-th output
in order to generate the i+1-st input, and decoder_inputs will be ignored,
except for the first element ("GO" symbol). This can be used for decoding,
but also for training to emulate http://arxiv.org/abs/1506.03099.
Signature -- loop_function(prev, i) = next
* prev is a 2D Tensor of shape [batch_size x output_size],
* i is an integer, the step number (when advanced control is needed),
* next is a 2D Tensor of shape [batch_size x input_size].
scope: VariableScope for the created subgraph; defaults to "rnn_decoder".
Returns:
A tuple of the form (outputs, state), where:
outputs: A list of the same length as decoder_inputs of 4D Tensors with
shape [batch_size x output_size] containing generated outputs.
state: A list of the same length as decoder_inputs of the state of each
cell at each time-step. It is a 2D Tensor of shape
[batch_size x cell.state_size].
"""
with
variable_scope
.
variable_scope
(
scope
or
'rnn_decoder'
):
state
=
initial_state
outputs
=
[]
states
=
[]
prev
=
None
for
i
,
decoder_input
in
enumerate
(
decoder_inputs
):
if
loop_function
is
not
None
and
prev
is
not
None
:
with
variable_scope
.
variable_scope
(
'loop_function'
,
reuse
=
True
):
decoder_input
=
loop_function
(
prev
,
i
)
if
i
>
0
:
variable_scope
.
get_variable_scope
().
reuse_variables
()
output
,
state
=
cell
(
decoder_input
,
state
)
outputs
.
append
(
output
)
states
.
append
(
state
)
if
loop_function
is
not
None
:
prev
=
output
return
outputs
,
states
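
As an illustration of how this decoder pairs with the cell defined in lstm_cells.py, here is a short sketch (not part of the commit; shapes are arbitrary placeholders) that unrolls the cell over a four-frame sequence, matching the unroll length used in the config above:

import tensorflow as tf
from lstm_object_detection.lstm import lstm_cells, rnn_decoder

unroll_length, batch_size = 4, 2
cell = lstm_cells.BottleneckConvLSTMCell(
    filter_size=(3, 3), output_size=(8, 8), num_units=64,
    visualize_gates=False)

# One 4-D feature tensor per timestep (e.g. per video frame).
frames = [tf.zeros([batch_size, 8, 8, 32]) for _ in range(unroll_length)]
init_state = cell.init_state('lstm_state', batch_size, tf.float32)

outputs, states = rnn_decoder.rnn_decoder(frames, init_state, cell)
# `outputs` and `states` each contain one entry per input frame.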

research/lstm_object_detection/metrics/__init__.py (new empty file, mode 100644)

research/lstm_object_detection/metrics/coco_evaluation_all_frames.py (new file, mode 100644)

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Class for evaluating video object detections with COCO metrics."""
import
tensorflow
as
tf
from
google3.third_party.tensorflow_models.object_detection.core
import
standard_fields
from
google3.third_party.tensorflow_models.object_detection.metrics
import
coco_evaluation
from
google3.third_party.tensorflow_models.object_detection.metrics
import
coco_tools
class
CocoEvaluationAllFrames
(
coco_evaluation
.
CocoDetectionEvaluator
):
"""Class to evaluate COCO detection metrics for frame sequences.
The class overrides two functions: add_single_ground_truth_image_info and
add_single_detected_image_info.
For the evaluation of sequence video detection, by iterating through the
entire groundtruth_dict, all the frames in the unrolled frames in one LSTM
training sample are considered. Therefore, both groundtruth and detection
results of all frames are added for the evaluation. This is used when all the
frames are labeled in the video object detection training job.
"""
def
add_single_ground_truth_image_info
(
self
,
image_id
,
groundtruth_dict
):
"""Add groundtruth results of all frames to the eval pipeline.
This method overrides the function defined in the base class.
Args:
image_id: A unique string/integer identifier for the image.
groundtruth_dict: A list of dictionary containing -
InputDataFields.groundtruth_boxes: float32 numpy array of shape
[num_boxes, 4] containing `num_boxes` groundtruth boxes of the format
[ymin, xmin, ymax, xmax] in absolute image coordinates.
InputDataFields.groundtruth_classes: integer numpy array of shape
[num_boxes] containing 1-indexed groundtruth classes for the boxes.
InputDataFields.groundtruth_is_crowd (optional): integer numpy array of
shape [num_boxes] containing iscrowd flag for groundtruth boxes.
"""
for
idx
,
gt
in
enumerate
(
groundtruth_dict
):
if
not
gt
:
continue
image_frame_id
=
'{}_{}'
.
format
(
image_id
,
idx
)
if
image_frame_id
in
self
.
_image_ids
:
tf
.
logging
.
warning
(
'Ignoring ground truth with image id %s since it was '
'previously added'
,
image_frame_id
)
continue
self
.
_groundtruth_list
.
extend
(
coco_tools
.
ExportSingleImageGroundtruthToCoco
(
image_id
=
image_frame_id
,
next_annotation_id
=
self
.
_annotation_id
,
category_id_set
=
self
.
_category_id_set
,
groundtruth_boxes
=
gt
[
standard_fields
.
InputDataFields
.
groundtruth_boxes
],
groundtruth_classes
=
gt
[
standard_fields
.
InputDataFields
.
groundtruth_classes
]))
self
.
_annotation_id
+=
(
gt
[
standard_fields
.
InputDataFields
.
groundtruth_boxes
].
shape
[
0
])
# Boolean to indicate whether a detection has been added for this image.
self
.
_image_ids
[
image_frame_id
]
=
False
def
add_single_detected_image_info
(
self
,
image_id
,
detections_dict
):
"""Add detection results of all frames to the eval pipeline.
This method overrides the function defined in the base class.
Args:
image_id: A unique string/integer identifier for the image.
detections_dict: A list of dictionary containing -
DetectionResultFields.detection_boxes: float32 numpy array of shape
[num_boxes, 4] containing `num_boxes` detection boxes of the format
[ymin, xmin, ymax, xmax] in absolute image coordinates.
DetectionResultFields.detection_scores: float32 numpy array of shape
[num_boxes] containing detection scores for the boxes.
DetectionResultFields.detection_classes: integer numpy array of shape
[num_boxes] containing 1-indexed detection classes for the boxes.
Raises:
ValueError: If groundtruth for the image_id is not available.
"""
for
idx
,
det
in
enumerate
(
detections_dict
):
if
not
det
:
continue
image_frame_id
=
'{}_{}'
.
format
(
image_id
,
idx
)
if
image_frame_id
not
in
self
.
_image_ids
:
raise
ValueError
(
'Missing groundtruth for image-frame id: {}'
.
format
(
image_frame_id
))
if
self
.
_image_ids
[
image_frame_id
]:
tf
.
logging
.
warning
(
'Ignoring detection with image id %s since it was '
'previously added'
,
image_frame_id
)
continue
self
.
_detection_boxes_list
.
extend
(
coco_tools
.
ExportSingleImageDetectionBoxesToCoco
(
image_id
=
image_frame_id
,
category_id_set
=
self
.
_category_id_set
,
detection_boxes
=
det
[
standard_fields
.
DetectionResultFields
.
detection_boxes
],
detection_scores
=
det
[
standard_fields
.
DetectionResultFields
.
detection_scores
],
detection_classes
=
det
[
standard_fields
.
DetectionResultFields
.
detection_classes
]))
self
.
_image_ids
[
image_frame_id
]
=
True

research/lstm_object_detection/metrics/coco_evaluation_all_frames_test.py (new file, mode 100644)

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for video_object_detection.metrics.coco_video_evaluation."""
import
numpy
as
np
import
tensorflow
as
tf
from
lstm_object_detection.metrics
import
coco_evaluation_all_frames
from
google3.third_party.tensorflow_models.object_detection.core
import
standard_fields
class
CocoEvaluationAllFramesTest
(
tf
.
test
.
TestCase
):
def
testGroundtruthAndDetectionsDisagreeOnAllFrames
(
self
):
"""Tests that mAP is calculated on several different frame results."""
category_list
=
[{
'id'
:
0
,
'name'
:
'dog'
},
{
'id'
:
1
,
'name'
:
'cat'
}]
video_evaluator
=
coco_evaluation_all_frames
.
CocoEvaluationAllFrames
(
category_list
)
video_evaluator
.
add_single_ground_truth_image_info
(
image_id
=
'image1'
,
groundtruth_dict
=
[{
standard_fields
.
InputDataFields
.
groundtruth_boxes
:
np
.
array
([[
50.
,
50.
,
200.
,
200.
]]),
standard_fields
.
InputDataFields
.
groundtruth_classes
:
np
.
array
([
1
])
},
{
standard_fields
.
InputDataFields
.
groundtruth_boxes
:
np
.
array
([[
50.
,
50.
,
100.
,
100.
]]),
standard_fields
.
InputDataFields
.
groundtruth_classes
:
np
.
array
([
1
])
}])
video_evaluator
.
add_single_detected_image_info
(
image_id
=
'image1'
,
# A different groundtruth box on the frame other than the last one.
detections_dict
=
[{
standard_fields
.
DetectionResultFields
.
detection_boxes
:
np
.
array
([[
100.
,
100.
,
200.
,
200.
]]),
standard_fields
.
DetectionResultFields
.
detection_scores
:
np
.
array
([.
8
]),
standard_fields
.
DetectionResultFields
.
detection_classes
:
np
.
array
([
1
])
},
{
standard_fields
.
DetectionResultFields
.
detection_boxes
:
np
.
array
([[
50.
,
50.
,
100.
,
100.
]]),
standard_fields
.
DetectionResultFields
.
detection_scores
:
np
.
array
([.
8
]),
standard_fields
.
DetectionResultFields
.
detection_classes
:
np
.
array
([
1
])
}])
metrics
=
video_evaluator
.
evaluate
()
self
.
assertNotEqual
(
metrics
[
'DetectionBoxes_Precision/mAP'
],
1.0
)
def
testGroundtruthAndDetections
(
self
):
"""Tests that mAP is calculated correctly on GT and Detections."""
category_list
=
[{
'id'
:
0
,
'name'
:
'dog'
},
{
'id'
:
1
,
'name'
:
'cat'
}]
video_evaluator
=
coco_evaluation_all_frames
.
CocoEvaluationAllFrames
(
category_list
)
video_evaluator
.
add_single_ground_truth_image_info
(
image_id
=
'image1'
,
groundtruth_dict
=
[{
standard_fields
.
InputDataFields
.
groundtruth_boxes
:
np
.
array
([[
100.
,
100.
,
200.
,
200.
]]),
standard_fields
.
InputDataFields
.
groundtruth_classes
:
np
.
array
([
1
])
}])
video_evaluator
.
add_single_ground_truth_image_info
(
image_id
=
'image2'
,
groundtruth_dict
=
[{
standard_fields
.
InputDataFields
.
groundtruth_boxes
:
np
.
array
([[
50.
,
50.
,
100.
,
100.
]]),
standard_fields
.
InputDataFields
.
groundtruth_classes
:
np
.
array
([
1
])
}])
video_evaluator
.
add_single_ground_truth_image_info
(
image_id
=
'image3'
,
groundtruth_dict
=
[{
standard_fields
.
InputDataFields
.
groundtruth_boxes
:
np
.
array
([[
50.
,
100.
,
100.
,
120.
]]),
standard_fields
.
InputDataFields
.
groundtruth_classes
:
np
.
array
([
1
])
}])
video_evaluator
.
add_single_detected_image_info
(
image_id
=
'image1'
,
detections_dict
=
[{
standard_fields
.
DetectionResultFields
.
detection_boxes
:
np
.
array
([[
100.
,
100.
,
200.
,
200.
]]),
standard_fields
.
DetectionResultFields
.
detection_scores
:
np
.
array
([.
8
]),
standard_fields
.
DetectionResultFields
.
detection_classes
:
np
.
array
([
1
])
}])
video_evaluator
.
add_single_detected_image_info
(
image_id
=
'image2'
,
detections_dict
=
[{
standard_fields
.
DetectionResultFields
.
detection_boxes
:
np
.
array
([[
50.
,
50.
,
100.
,
100.
]]),
standard_fields
.
DetectionResultFields
.
detection_scores
:
np
.
array
([.
8
]),
standard_fields
.
DetectionResultFields
.
detection_classes
:
np
.
array
([
1
])
}])
video_evaluator
.
add_single_detected_image_info
(
image_id
=
'image3'
,
detections_dict
=
[{
standard_fields
.
DetectionResultFields
.
detection_boxes
:
np
.
array
([[
50.
,
100.
,
100.
,
120.
]]),
standard_fields
.
DetectionResultFields
.
detection_scores
:
np
.
array
([.
8
]),
standard_fields
.
DetectionResultFields
.
detection_classes
:
np
.
array
([
1
])
}])
metrics
=
video_evaluator
.
evaluate
()
self
.
assertAlmostEqual
(
metrics
[
'DetectionBoxes_Precision/mAP'
],
1.0
)
def
testMissingDetectionResults
(
self
):
"""Tests if groundtrue is missing, raises ValueError."""
category_list
=
[{
'id'
:
0
,
'name'
:
'dog'
}]
video_evaluator
=
coco_evaluation_all_frames
.
CocoEvaluationAllFrames
(
category_list
)
video_evaluator
.
add_single_ground_truth_image_info
(
image_id
=
'image1'
,
groundtruth_dict
=
[{
standard_fields
.
InputDataFields
.
groundtruth_boxes
:
np
.
array
([[
100.
,
100.
,
200.
,
200.
]]),
standard_fields
.
InputDataFields
.
groundtruth_classes
:
np
.
array
([
1
])
}])
with
self
.
assertRaisesRegexp
(
ValueError
,
r
'Missing groundtruth for image-frame id:.*'
):
video_evaluator
.
add_single_detected_image_info
(
image_id
=
'image3'
,
detections_dict
=
[{
standard_fields
.
DetectionResultFields
.
detection_boxes
:
np
.
array
([[
100.
,
100.
,
200.
,
200.
]]),
standard_fields
.
DetectionResultFields
.
detection_scores
:
np
.
array
([.
8
]),
standard_fields
.
DetectionResultFields
.
detection_classes
:
np
.
array
([
1
])
}])
if
__name__
==
'__main__'
:
tf
.
test
.
main
()

research/lstm_object_detection/model_builder.py (new file, mode 100644)

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A function to build a DetectionModel from configuration."""

from lstm_object_detection.lstm import lstm_meta_arch
from lstm_object_detection.models.lstm_ssd_mobilenet_v1_feature_extractor import LSTMMobileNetV1FeatureExtractor
from google3.third_party.tensorflow_models.object_detection.builders import anchor_generator_builder
from google3.third_party.tensorflow_models.object_detection.builders import box_coder_builder
from google3.third_party.tensorflow_models.object_detection.builders import box_predictor_builder
from google3.third_party.tensorflow_models.object_detection.builders import hyperparams_builder
from google3.third_party.tensorflow_models.object_detection.builders import image_resizer_builder
from google3.third_party.tensorflow_models.object_detection.builders import losses_builder
from google3.third_party.tensorflow_models.object_detection.builders import matcher_builder
from google3.third_party.tensorflow_models.object_detection.builders import model_builder
from google3.third_party.tensorflow_models.object_detection.builders import post_processing_builder
from google3.third_party.tensorflow_models.object_detection.builders import region_similarity_calculator_builder as sim_calc

model_builder.SSD_FEATURE_EXTRACTOR_CLASS_MAP.update({
    'lstm_mobilenet_v1': LSTMMobileNetV1FeatureExtractor,
})

SSD_FEATURE_EXTRACTOR_CLASS_MAP = model_builder.SSD_FEATURE_EXTRACTOR_CLASS_MAP


def build(model_config, lstm_config, is_training):
  """Builds a DetectionModel based on the model config.

  Args:
    model_config: A model.proto object containing the config for the desired
      DetectionModel.
    lstm_config: LstmModel config proto that specifies LSTM train/eval configs.
    is_training: True if this model is being built for training purposes.

  Returns:
    DetectionModel based on the config.

  Raises:
    ValueError: On invalid meta architecture or model.
  """
  return _build_lstm_model(model_config.ssd, lstm_config, is_training)


def _build_lstm_feature_extractor(feature_extractor_config,
                                  is_training,
                                  lstm_state_depth,
                                  reuse_weights=None):
  """Builds a ssd_meta_arch.SSDFeatureExtractor based on config.

  Args:
    feature_extractor_config: A SSDFeatureExtractor proto config from ssd.proto.
    is_training: True if this feature extractor is being built for training.
    lstm_state_depth: An integer of the depth of the lstm state.
    reuse_weights: If the feature extractor should reuse weights.

  Returns:
    ssd_meta_arch.SSDFeatureExtractor based on config.

  Raises:
    ValueError: On invalid feature extractor type.
  """
  feature_type = feature_extractor_config.type
  depth_multiplier = feature_extractor_config.depth_multiplier
  min_depth = feature_extractor_config.min_depth
  pad_to_multiple = feature_extractor_config.pad_to_multiple
  use_explicit_padding = feature_extractor_config.use_explicit_padding
  use_depthwise = feature_extractor_config.use_depthwise
  conv_hyperparams = hyperparams_builder.build(
      feature_extractor_config.conv_hyperparams, is_training)
  override_base_feature_extractor_hyperparams = (
      feature_extractor_config.override_base_feature_extractor_hyperparams)

  if feature_type not in SSD_FEATURE_EXTRACTOR_CLASS_MAP:
    raise ValueError('Unknown ssd feature_extractor: {}'.format(feature_type))

  feature_extractor_class = SSD_FEATURE_EXTRACTOR_CLASS_MAP[feature_type]
  return feature_extractor_class(
      is_training, depth_multiplier, min_depth, pad_to_multiple,
      conv_hyperparams, reuse_weights, use_explicit_padding, use_depthwise,
      override_base_feature_extractor_hyperparams, lstm_state_depth)


def _build_lstm_model(ssd_config, lstm_config, is_training):
  """Builds an LSTM detection model based on the model config.

  Args:
    ssd_config: A ssd.proto object containing the config for the desired
      LSTMMetaArch.
    lstm_config: LstmModel config proto that specifies LSTM train/eval configs.
    is_training: True if this model is being built for training purposes.

  Returns:
    LSTMMetaArch based on the config.

  Raises:
    ValueError: If ssd_config.type is not recognized (i.e. not registered in
      model_class_map), or if lstm_config.interleave_strategy is not recognized.
    ValueError: If unroll_length is not specified in the config file.
  """
  feature_extractor = _build_lstm_feature_extractor(
      ssd_config.feature_extractor, is_training, lstm_config.lstm_state_depth)

  box_coder = box_coder_builder.build(ssd_config.box_coder)
  matcher = matcher_builder.build(ssd_config.matcher)
  region_similarity_calculator = sim_calc.build(
      ssd_config.similarity_calculator)

  num_classes = ssd_config.num_classes
  ssd_box_predictor = box_predictor_builder.build(
      hyperparams_builder.build, ssd_config.box_predictor, is_training,
      num_classes)
  anchor_generator = anchor_generator_builder.build(
      ssd_config.anchor_generator)
  image_resizer_fn = image_resizer_builder.build(ssd_config.image_resizer)
  non_max_suppression_fn, score_conversion_fn = post_processing_builder.build(
      ssd_config.post_processing)
  (classification_loss, localization_loss, classification_weight,
   localization_weight, miner, _) = losses_builder.build(ssd_config.loss)

  normalize_loss_by_num_matches = ssd_config.normalize_loss_by_num_matches
  encode_background_as_zeros = ssd_config.encode_background_as_zeros
  negative_class_weight = ssd_config.negative_class_weight

  # Extra configs for lstm unroll length.
  unroll_length = None
  if 'lstm' in ssd_config.feature_extractor.type:
    if is_training:
      unroll_length = lstm_config.train_unroll_length
    else:
      unroll_length = lstm_config.eval_unroll_length
  if unroll_length is None:
    raise ValueError('No unroll length found in the config file')

  lstm_model = lstm_meta_arch.LSTMMetaArch(
      is_training, anchor_generator, ssd_box_predictor, box_coder,
      feature_extractor, matcher, region_similarity_calculator,
      encode_background_as_zeros, negative_class_weight, image_resizer_fn,
      non_max_suppression_fn, score_conversion_fn, classification_loss,
      localization_loss, classification_weight, localization_weight,
      normalize_loss_by_num_matches, miner, unroll_length)
  return lstm_model
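For reference, a minimal usage sketch of this builder (not part of the file): it assumes a text-format TrainEvalPipelineConfig on disk that carries the lstm_model extension, and uses the external object_detection import layout; the config path is a placeholder.

# Hypothetical usage sketch.
import tensorflow as tf
from google.protobuf import text_format
from lstm_object_detection import model_builder as lstm_model_builder
from lstm_object_detection.protos import pipeline_pb2 as internal_pipeline_pb2
from object_detection.protos import pipeline_pb2


def build_lstm_detection_model(config_path, is_training):
  # Parse the pipeline config from a text-format file.
  pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
  with tf.gfile.GFile(config_path, 'r') as f:
    text_format.Merge(f.read(), pipeline_config)
  # LSTM-specific settings (unroll lengths, state depth) live in the
  # lstm_model extension defined in protos/pipeline.proto.
  lstm_config = pipeline_config.Extensions[internal_pipeline_pb2.lstm_model]
  return lstm_model_builder.build(pipeline_config.model, lstm_config,
                                  is_training)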
research/lstm_object_detection/model_builder_test.py
0 → 100644
View file @
0d8e49ec
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for video_object_detection.tensorflow.model_builder."""

import tensorflow as tf

from google.protobuf import text_format
from lstm_object_detection import model_builder
from lstm_object_detection.lstm import lstm_meta_arch
from lstm_object_detection.protos import pipeline_pb2 as internal_pipeline_pb2
from google3.third_party.tensorflow_models.object_detection.protos import pipeline_pb2


class ModelBuilderTest(tf.test.TestCase):

  def create_model(self, model_config, lstm_config):
    """Builds a DetectionModel based on the model config.

    Args:
      model_config: A model.proto object containing the config for the desired
        DetectionModel.
      lstm_config: LstmModel config proto that specifies LSTM train/eval
        configs.

    Returns:
      DetectionModel based on the config.
    """
    return model_builder.build(model_config, lstm_config, is_training=True)

  def get_model_configs_from_proto(self):
    """Creates a model text proto for testing.

    Returns:
      A dictionary of model configs.
    """
    model_text_proto = """
    [object_detection.protos.lstm_model] {
      train_unroll_length: 4
      eval_unroll_length: 4
    }
    model {
      ssd {
        feature_extractor {
          type: 'lstm_mobilenet_v1'
          conv_hyperparams {
            regularizer {
              l2_regularizer {
              }
            }
            initializer {
              truncated_normal_initializer {
              }
            }
          }
        }
        negative_class_weight: 2.0
        box_coder {
          faster_rcnn_box_coder {
          }
        }
        matcher {
          argmax_matcher {
          }
        }
        similarity_calculator {
          iou_similarity {
          }
        }
        anchor_generator {
          ssd_anchor_generator {
            aspect_ratios: 1.0
          }
        }
        image_resizer {
          fixed_shape_resizer {
            height: 320
            width: 320
          }
        }
        box_predictor {
          convolutional_box_predictor {
            conv_hyperparams {
              regularizer {
                l2_regularizer {
                }
              }
              initializer {
                truncated_normal_initializer {
                }
              }
            }
          }
        }
        normalize_loc_loss_by_codesize: true
        loss {
          classification_loss {
            weighted_softmax {
            }
          }
          localization_loss {
            weighted_smooth_l1 {
            }
          }
        }
      }
    }"""

    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    text_format.Merge(model_text_proto, pipeline_config)

    configs = {}
    configs['model'] = pipeline_config.model
    configs['lstm_model'] = pipeline_config.Extensions[
        internal_pipeline_pb2.lstm_model]

    return configs

  def test_model_creation_from_valid_configs(self):
    configs = self.get_model_configs_from_proto()
    # Test model properties.
    self.assertEqual(configs['model'].ssd.negative_class_weight, 2.0)
    self.assertTrue(configs['model'].ssd.normalize_loc_loss_by_codesize)
    self.assertEqual(configs['model'].ssd.feature_extractor.type,
                     'lstm_mobilenet_v1')

    model = self.create_model(configs['model'], configs['lstm_model'])
    # Test architecture type.
    self.assertIsInstance(model, lstm_meta_arch.LSTMMetaArch)
    # Test LSTM unroll length.
    self.assertEqual(model.unroll_length, 4)

  def test_model_creation_from_invalid_configs(self):
    configs = self.get_model_configs_from_proto()
    # Test model build failure with wrong input configs.
    with self.assertRaises(AttributeError):
      _ = self.create_model(configs['model'], configs['model'])

    # Test model builder failure with missing configs.
    with self.assertRaises(TypeError):
      # pylint: disable=no-value-for-parameter
      _ = self.create_model(configs['lstm_model'])


if __name__ == '__main__':
  tf.test.main()
research/lstm_object_detection/models/__init__.py
0 → 100644
View file @
0d8e49ec
research/lstm_object_detection/models/lstm_ssd_mobilenet_v1_feature_extractor.py
0 → 100644
View file @
0d8e49ec
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""LSTMFeatureExtractor for MobilenetV1 features."""

import tensorflow as tf

from tensorflow.python.framework import ops as tf_ops
from lstm_object_detection.lstm import lstm_cells
from lstm_object_detection.lstm import lstm_meta_arch
from lstm_object_detection.lstm import rnn_decoder
from google3.third_party.tensorflow_models.object_detection.models import feature_map_generators
from google3.third_party.tensorflow_models.object_detection.utils import context_manager
from google3.third_party.tensorflow_models.object_detection.utils import ops
from google3.third_party.tensorflow_models.object_detection.utils import shape_utils
from nets import mobilenet_v1

slim = tf.contrib.slim


class LSTMMobileNetV1FeatureExtractor(lstm_meta_arch.LSTMFeatureExtractor):
  """LSTM Feature Extractor using MobilenetV1 features."""

  def __init__(self,
               is_training,
               depth_multiplier,
               min_depth,
               pad_to_multiple,
               conv_hyperparams,
               reuse_weights=None,
               use_explicit_padding=False,
               use_depthwise=True,
               override_base_feature_extractor_hyperparams=False,
               lstm_state_depth=256):
    """Initializes instance of MobileNetV1 Feature Extractor for LSTM Models.

    Args:
      is_training: A boolean whether the network is in training mode.
      depth_multiplier: A float depth multiplier for feature extractor.
      min_depth: A number representing minimum feature extractor depth.
      pad_to_multiple: The nearest multiple to zero pad the input height and
        width dimensions to.
      conv_hyperparams: A function to construct tf slim arg_scope for conv2d
        and separable_conv2d ops in the layers that are added on top of the
        base feature extractor.
      reuse_weights: Whether to reuse variables. Default is None.
      use_explicit_padding: Whether to use explicit padding when extracting
        features. Default is False.
      use_depthwise: Whether to use depthwise convolutions. Default is True.
      override_base_feature_extractor_hyperparams: Whether to override
        hyperparameters of the base feature extractor with the one from
        `conv_hyperparams_fn`.
      lstm_state_depth: An integer of the depth of the lstm state.
    """
    super(LSTMMobileNetV1FeatureExtractor, self).__init__(
        is_training, depth_multiplier, min_depth, pad_to_multiple,
        conv_hyperparams, reuse_weights, use_explicit_padding, use_depthwise,
        override_base_feature_extractor_hyperparams)
    self._feature_map_layout = {
        'from_layer': ['Conv2d_13_pointwise_lstm', '', '', '', ''],
        'layer_depth': [-1, 512, 256, 256, 128],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }
    self._base_network_scope = 'MobilenetV1'
    self._lstm_state_depth = lstm_state_depth

  def extract_features(self,
                       preprocessed_inputs,
                       state_saver=None,
                       state_name='lstm_state',
                       unroll_length=5,
                       scope=None):
    """Extracts features from preprocessed inputs.

    The features include the base network features, lstm features and SSD
    features, organized in the following name scope:

    <parent scope>/MobilenetV1/...
    <parent scope>/LSTM/...
    <parent scope>/FeatureMaps/...

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float tensor
        representing a batch of consecutive frames from video clips.
      state_saver: A state saver object with methods `state` and `save_state`.
      state_name: A python string for the name to use with the state_saver.
      unroll_length: The number of steps to unroll the lstm.
      scope: The scope for the base network of the feature extractor.

    Returns:
      A list of tensors where the ith tensor has shape [batch, height_i,
      width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(is_training=self._is_training)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        with slim.arg_scope([slim.batch_norm], fused=False):
          # Base network.
          with tf.variable_scope(
              scope, self._base_network_scope,
              reuse=self._reuse_weights) as scope:
            net, image_features = mobilenet_v1.mobilenet_v1_base(
                ops.pad_to_multiple(preprocessed_inputs,
                                    self._pad_to_multiple),
                final_endpoint='Conv2d_13_pointwise',
                min_depth=self._min_depth,
                depth_multiplier=self._depth_multiplier,
                scope=scope)

    with slim.arg_scope(self._conv_hyperparams_fn()):
      with slim.arg_scope(
          [slim.batch_norm], fused=False, is_training=self._is_training):
        # ConvLSTM layers.
        with tf.variable_scope('LSTM', reuse=self._reuse_weights) as lstm_scope:
          lstm_cell = lstm_cells.BottleneckConvLSTMCell(
              filter_size=(3, 3),
              output_size=(net.shape[1].value, net.shape[2].value),
              num_units=max(self._min_depth, self._lstm_state_depth),
              activation=tf.nn.relu6,
              visualize_gates=True)

          net_seq = list(tf.split(net, unroll_length))
          if state_saver is None:
            init_state = lstm_cell.init_state(
                state_name, net.shape[0].value / unroll_length, tf.float32)
          else:
            c = state_saver.state('%s_c' % state_name)
            h = state_saver.state('%s_h' % state_name)
            init_state = (c, h)

          # Identities added for inputting state tensors externally.
          c_ident = tf.identity(init_state[0], name='lstm_state_in_c')
          h_ident = tf.identity(init_state[1], name='lstm_state_in_h')
          init_state = (c_ident, h_ident)

          net_seq, states_out = rnn_decoder.rnn_decoder(
              net_seq, init_state, lstm_cell, scope=lstm_scope)
          batcher_ops = None
          self._states_out = states_out
          if state_saver is not None:
            self._step = state_saver.state('%s_step' % state_name)
            batcher_ops = [
                state_saver.save_state('%s_c' % state_name,
                                       states_out[-1][0]),
                state_saver.save_state('%s_h' % state_name,
                                       states_out[-1][1]),
                state_saver.save_state('%s_step' % state_name, self._step - 1)
            ]
          with tf_ops.control_dependencies(batcher_ops):
            image_features['Conv2d_13_pointwise_lstm'] = tf.concat(net_seq, 0)

          # Identities added for reading output states, to be reused
          # externally.
          tf.identity(states_out[-1][0], name='lstm_state_out_c')
          tf.identity(states_out[-1][1], name='lstm_state_out_h')

        # SSD layers.
        with tf.variable_scope('FeatureMaps', reuse=self._reuse_weights):
          feature_maps = feature_map_generators.multi_resolution_feature_maps(
              feature_map_layout=self._feature_map_layout,
              depth_multiplier=self._depth_multiplier,
              min_depth=self._min_depth,
              insert_1x1_conv=True,
              image_features=image_features)

    return feature_maps.values()
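A minimal sketch of driving this extractor directly (not part of the file): extract_features expects unroll_length consecutive frames stacked along the batch dimension, and the constructor needs a conv-hyperparams arg_scope function such as the one produced by hyperparams_builder.build; my_conv_hyperparams_fn below is a placeholder for such a function.

# Hypothetical usage sketch.
import tensorflow as tf
from lstm_object_detection.models import lstm_ssd_mobilenet_v1_feature_extractor

unroll_length = 5
# One clip of 5 consecutive 256x256 frames, stacked on the batch axis.
frames = tf.placeholder(tf.float32, [unroll_length, 256, 256, 3])

extractor = lstm_ssd_mobilenet_v1_feature_extractor.LSTMMobileNetV1FeatureExtractor(
    is_training=False,
    depth_multiplier=1.0,
    min_depth=32,
    pad_to_multiple=1,
    conv_hyperparams=my_conv_hyperparams_fn,  # Placeholder arg_scope function.
    lstm_state_depth=256)

preprocessed = extractor.preprocess(frames)
# Without a state_saver, the LSTM starts from the cell's default initial state.
feature_maps = extractor.extract_features(
    preprocessed, state_saver=None, unroll_length=unroll_length)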
research/lstm_object_detection/models/lstm_ssd_mobilenet_v1_feature_extractor_test.py
0 → 100644
View file @
0d8e49ec
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for models.lstm_ssd_mobilenet_v1_feature_extractor."""

import numpy as np
import tensorflow as tf

from lstm_object_detection.models import lstm_ssd_mobilenet_v1_feature_extractor as feature_extactor
from google3.third_party.tensorflow_models.object_detection.models import ssd_feature_extractor_test

slim = tf.contrib.slim


class LstmSsdMobilenetV1FeatureExtractorTest(
    ssd_feature_extractor_test.SsdFeatureExtractorTestBase):

  def _create_feature_extractor(self,
                                depth_multiplier=1.0,
                                pad_to_multiple=1,
                                is_training=True,
                                use_explicit_padding=False):
    """Constructs a new feature extractor.

    Args:
      depth_multiplier: A float depth multiplier for feature extractor.
      pad_to_multiple: The nearest multiple to zero pad the input height and
        width dimensions to.
      is_training: A boolean whether the network is in training mode.
      use_explicit_padding: A boolean whether to use explicit padding.

    Returns:
      An lstm_ssd_meta_arch.LSTMMobileNetV1FeatureExtractor object.
    """
    min_depth = 32
    extractor = (
        feature_extactor.LSTMMobileNetV1FeatureExtractor(
            is_training,
            depth_multiplier,
            min_depth,
            pad_to_multiple,
            self.conv_hyperparams_fn,
            use_explicit_padding=use_explicit_padding))
    extractor.lstm_state_depth = int(256 * depth_multiplier)
    return extractor

  def test_extract_features_returns_correct_shapes_256(self):
    image_height = 256
    image_width = 256
    depth_multiplier = 1.0
    pad_to_multiple = 1
    batch_size = 5
    expected_feature_map_shape = [(batch_size, 8, 8, 256),
                                  (batch_size, 4, 4, 512),
                                  (batch_size, 2, 2, 256),
                                  (batch_size, 1, 1, 256)]
    self.check_extract_features_returns_correct_shape(
        batch_size,
        image_height,
        image_width,
        depth_multiplier,
        pad_to_multiple,
        expected_feature_map_shape,
        use_explicit_padding=False)
    self.check_extract_features_returns_correct_shape(
        batch_size,
        image_height,
        image_width,
        depth_multiplier,
        pad_to_multiple,
        expected_feature_map_shape,
        use_explicit_padding=True)

  def test_preprocess_returns_correct_value_range(self):
    test_image = np.random.rand(5, 128, 128, 3)
    feature_extractor = self._create_feature_extractor()
    preprocessed_image = feature_extractor.preprocess(test_image)
    self.assertTrue(np.all(np.less_equal(np.abs(preprocessed_image), 1.0)))

  def test_variables_only_created_in_scope(self):
    scope_name = 'MobilenetV1'
    g = tf.Graph()
    with g.as_default():
      preprocessed_inputs = tf.placeholder(tf.float32, (5, 256, 256, 3))
      feature_extractor = self._create_feature_extractor()
      feature_extractor.extract_features(preprocessed_inputs)
      variables = g.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
      find_scope = False
      for variable in variables:
        if scope_name in variable.name:
          find_scope = True
          break
      self.assertTrue(find_scope)

  def test_lstm_non_zero_state(self):
    init_state = {
        'lstm_state_c': tf.zeros([8, 8, 256]),
        'lstm_state_h': tf.zeros([8, 8, 256]),
        'lstm_state_step': tf.zeros([1])
    }
    seq = {'test': tf.random_uniform([3, 1, 1, 1])}
    stateful_reader = tf.contrib.training.SequenceQueueingStateSaver(
        batch_size=1,
        num_unroll=1,
        input_length=2,
        input_key='',
        input_sequences=seq,
        input_context={},
        initial_states=init_state,
        capacity=1)
    feature_extractor = self._create_feature_extractor()
    image = tf.random_uniform([5, 256, 256, 3])
    with tf.variable_scope('zero_state'):
      feature_map = feature_extractor.extract_features(
          image, stateful_reader.next_batch)
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      sess.run([stateful_reader.prefetch_op])
      _ = sess.run([feature_map])
      # Update states with the next batch.
      state = sess.run(stateful_reader.next_batch.state('lstm_state_c'))
      # State should no longer be zero after update.
      self.assertTrue(state.any())


if __name__ == '__main__':
  tf.test.main()
research/lstm_object_detection/protos/__init__.py
0 → 100644
View file @
0d8e49ec
research/lstm_object_detection/protos/input_reader_google.proto
0 → 100644
View file @
0d8e49ec
syntax = "proto2";

package lstm_object_detection.input_readers;

import "third_party/tensorflow_models/object_detection/protos/input_reader.proto";

message GoogleInputReader {
  extend object_detection.protos.ExternalInputReader {
    optional GoogleInputReader google_input_reader = 444;
  }

  oneof input_reader {
    TFRecordVideoInputReader tf_record_video_input_reader = 1;
  }
}

message TFRecordVideoInputReader {
  // Path(s) to tfrecords of input data.
  repeated string input_path = 1;

  enum DataType {
    UNSPECIFIED = 0;
    ANNOTATED_IMAGE = 1;
    TF_EXAMPLE = 2;
    TF_SEQUENCE_EXAMPLE = 3;
  }
  optional DataType data_type = 2 [default = TF_SEQUENCE_EXAMPLE];

  // Length of the video sequence. All input video sequences should have the
  // same length in frames, e.g. 5 frames.
  optional int32 video_length = 3;
}
research/lstm_object_detection/protos/pipeline.proto
0 → 100644
View file @
0d8e49ec
syntax = "proto2";

package object_detection.protos;

import "third_party/tensorflow_models/object_detection/protos/pipeline.proto";

extend TrainEvalPipelineConfig {
  optional LstmModel lstm_model = 205743444;
}

// Message for extra fields needed for configuring LSTM model.
message LstmModel {
  // Unroll length for training LSTMs.
  optional int32 train_unroll_length = 1;

  // Unroll length for evaluating LSTMs.
  optional int32 eval_unroll_length = 2;

  // Depth of the lstm feature map.
  optional int32 lstm_state_depth = 3 [default = 256];
}
research/lstm_object_detection/seq_dataset_builder.py
0 → 100644
View file @
0d8e49ec
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""tf.data.Dataset builder.

Creates data sources for DetectionModels from an InputReader config. See
input_reader.proto for options.

Note: If users wish to also use their own InputReaders with the Object
Detection configuration framework, they should define their own builder
function that wraps the build function.
"""

import tensorflow as tf
import tensorflow.google as google_tf
from google3.learning.brain.contrib.slim.data import parallel_reader
from tensorflow.contrib.training.python.training import sequence_queueing_state_saver as sqss
from lstm_object_detection import tf_sequence_example_decoder
from lstm_object_detection.protos import input_reader_google_pb2
from google3.third_party.tensorflow_models.object_detection.core import preprocessor
from google3.third_party.tensorflow_models.object_detection.core import preprocessor_cache
from google3.third_party.tensorflow_models.object_detection.core import standard_fields as fields
from google3.third_party.tensorflow_models.object_detection.protos import input_reader_pb2
from google3.third_party.tensorflow_models.object_detection.utils import ops as util_ops

# TODO(yinxiao): Make the following variable into configurable proto.
# Padding size for the labeled objects in each frame. Here we assume each
# frame has a total number of objects less than _PADDING_SIZE.
_PADDING_SIZE = 30


def _build_training_batch_dict(batch_sequences_with_states, unroll_length,
                               batch_size):
  """Builds training batch samples.

  Args:
    batch_sequences_with_states: A batch_sequences_with_states object.
    unroll_length: Unrolled length for LSTM training.
    batch_size: Batch size for queue outputs.

  Returns:
    A dictionary of tensors based on items in input_reader_config.
  """
  seq_tensors_dict = {
      fields.InputDataFields.image: [],
      fields.InputDataFields.groundtruth_boxes: [],
      fields.InputDataFields.groundtruth_classes: [],
      'batch': batch_sequences_with_states,
  }
  for i in range(unroll_length):
    for j in range(batch_size):
      filtered_dict = util_ops.filter_groundtruth_with_nan_box_coordinates({
          fields.InputDataFields.groundtruth_boxes: (
              batch_sequences_with_states.sequences['groundtruth_boxes'][j][i]),
          fields.InputDataFields.groundtruth_classes: (
              batch_sequences_with_states.sequences['groundtruth_classes'][j][i]
          ),
      })
      filtered_dict = util_ops.retain_groundtruth_with_positive_classes(
          filtered_dict)
      seq_tensors_dict[fields.InputDataFields.image].append(
          batch_sequences_with_states.sequences['image'][j][i])
      seq_tensors_dict[fields.InputDataFields.groundtruth_boxes].append(
          filtered_dict[fields.InputDataFields.groundtruth_boxes])
      seq_tensors_dict[fields.InputDataFields.groundtruth_classes].append(
          filtered_dict[fields.InputDataFields.groundtruth_classes])
  seq_tensors_dict[fields.InputDataFields.image] = tuple(
      seq_tensors_dict[fields.InputDataFields.image])
  seq_tensors_dict[fields.InputDataFields.groundtruth_boxes] = tuple(
      seq_tensors_dict[fields.InputDataFields.groundtruth_boxes])
  seq_tensors_dict[fields.InputDataFields.groundtruth_classes] = tuple(
      seq_tensors_dict[fields.InputDataFields.groundtruth_classes])

  return seq_tensors_dict


def build(input_reader_config,
          model_config,
          lstm_config,
          unroll_length,
          data_augmentation_options=None,
          batch_size=1):
  """Builds a tensor dictionary based on the InputReader config.

  Args:
    input_reader_config: An input_reader_builder.InputReader object.
    model_config: A model.proto object containing the config for the desired
      DetectionModel.
    lstm_config: LSTM specific configs.
    unroll_length: Unrolled length for LSTM training.
    data_augmentation_options: A list of tuples, where each tuple contains a
      data augmentation function and a dictionary containing arguments and
      their values (see preprocessor.py).
    batch_size: Batch size for queue outputs.

  Returns:
    A dictionary of tensors based on items in the input_reader_config.

  Raises:
    ValueError: On invalid input reader proto.
    ValueError: If no input paths are specified.
  """
  if not isinstance(input_reader_config, input_reader_pb2.InputReader):
    raise ValueError('input_reader_config not of type '
                     'input_reader_pb2.InputReader.')

  external_reader_config = input_reader_config.external_input_reader
  google_input_reader_config = external_reader_config.Extensions[
      input_reader_google_pb2.GoogleInputReader.google_input_reader]
  input_reader_type = google_input_reader_config.WhichOneof('input_reader')

  if input_reader_type == 'tf_record_video_input_reader':
    config = google_input_reader_config.tf_record_video_input_reader
    reader_type_class = tf.TFRecordReader
  else:
    raise ValueError('Unsupported reader in input_reader_config: %s' %
                     input_reader_type)

  if not config.input_path:
    raise ValueError('At least one input path must be specified in '
                     '`input_reader_config`.')
  key, value = parallel_reader.parallel_read(
      config.input_path[:],  # Convert `RepeatedScalarContainer` to list.
      reader_class=reader_type_class,
      num_epochs=(input_reader_config.num_epochs
                  if input_reader_config.num_epochs else None),
      num_readers=input_reader_config.num_readers,
      shuffle=input_reader_config.shuffle,
      dtypes=[tf.string, tf.string],
      capacity=input_reader_config.queue_capacity,
      min_after_dequeue=input_reader_config.min_after_dequeue)

  # TODO(yinxiao): Add loading instance mask option.
  decoder = tf_sequence_example_decoder.TfSequenceExampleDecoder()

  keys_to_decode = [
      fields.InputDataFields.image, fields.InputDataFields.groundtruth_boxes,
      fields.InputDataFields.groundtruth_classes
  ]
  tensor_dict = decoder.decode(value, items=keys_to_decode)

  tensor_dict['image'].set_shape([None, None, None, 3])
  tensor_dict['groundtruth_boxes'].set_shape([None, None, 4])

  height = model_config.ssd.image_resizer.fixed_shape_resizer.height
  width = model_config.ssd.image_resizer.fixed_shape_resizer.width

  # If data augmentation is specified in the config file, the preprocessor
  # will be called here to augment the data as specified. Most common
  # augmentations include horizontal flip and cropping.
  if data_augmentation_options:
    images_pre = tf.split(tensor_dict['image'], config.video_length, axis=0)
    bboxes_pre = tf.split(
        tensor_dict['groundtruth_boxes'], config.video_length, axis=0)
    labels_pre = tf.split(
        tensor_dict['groundtruth_classes'], config.video_length, axis=0)
    images_proc, bboxes_proc, labels_proc = [], [], []
    cache = preprocessor_cache.PreprocessorCache()

    for i, _ in enumerate(images_pre):
      image_dict = {
          fields.InputDataFields.image:
              images_pre[i],
          fields.InputDataFields.groundtruth_boxes:
              tf.squeeze(bboxes_pre[i], axis=0),
          fields.InputDataFields.groundtruth_classes:
              tf.squeeze(labels_pre[i], axis=0),
      }
      image_dict = preprocessor.preprocess(
          image_dict,
          data_augmentation_options,
          func_arg_map=preprocessor.get_default_func_arg_map(),
          preprocess_vars_cache=cache)
      # Pads detection count to _PADDING_SIZE.
      image_dict[fields.InputDataFields.groundtruth_boxes] = tf.pad(
          image_dict[fields.InputDataFields.groundtruth_boxes],
          [[0, _PADDING_SIZE], [0, 0]])
      image_dict[fields.InputDataFields.groundtruth_boxes] = tf.slice(
          image_dict[fields.InputDataFields.groundtruth_boxes], [0, 0],
          [_PADDING_SIZE, -1])
      image_dict[fields.InputDataFields.groundtruth_classes] = tf.pad(
          image_dict[fields.InputDataFields.groundtruth_classes],
          [[0, _PADDING_SIZE]])
      image_dict[fields.InputDataFields.groundtruth_classes] = tf.slice(
          image_dict[fields.InputDataFields.groundtruth_classes], [0],
          [_PADDING_SIZE])
      images_proc.append(image_dict[fields.InputDataFields.image])
      bboxes_proc.append(image_dict[fields.InputDataFields.groundtruth_boxes])
      labels_proc.append(image_dict[fields.InputDataFields.groundtruth_classes])
    tensor_dict['image'] = tf.concat(images_proc, axis=0)
    tensor_dict['groundtruth_boxes'] = tf.stack(bboxes_proc, axis=0)
    tensor_dict['groundtruth_classes'] = tf.stack(labels_proc, axis=0)
  else:
    # Pads detection count to _PADDING_SIZE per frame.
    tensor_dict['groundtruth_boxes'] = tf.pad(
        tensor_dict['groundtruth_boxes'], [[0, 0], [0, _PADDING_SIZE], [0, 0]])
    tensor_dict['groundtruth_boxes'] = tf.slice(
        tensor_dict['groundtruth_boxes'], [0, 0, 0], [-1, _PADDING_SIZE, -1])
    tensor_dict['groundtruth_classes'] = tf.pad(
        tensor_dict['groundtruth_classes'], [[0, 0], [0, _PADDING_SIZE]])
    tensor_dict['groundtruth_classes'] = tf.slice(
        tensor_dict['groundtruth_classes'], [0, 0], [-1, _PADDING_SIZE])

  tensor_dict['image'], _ = preprocessor.resize_image(
      tensor_dict['image'], new_height=height, new_width=width)

  num_steps = config.video_length / unroll_length

  init_states = {
      'lstm_state_c':
          tf.zeros([height / 32, width / 32, lstm_config.lstm_state_depth]),
      'lstm_state_h':
          tf.zeros([height / 32, width / 32, lstm_config.lstm_state_depth]),
      'lstm_state_step':
          tf.constant(num_steps, shape=[]),
  }

  batch = sqss.batch_sequences_with_states(
      input_key=key,
      input_sequences=tensor_dict,
      input_context={},
      input_length=None,
      initial_states=init_states,
      num_unroll=unroll_length,
      batch_size=batch_size,
      num_threads=batch_size,
      make_keys_unique=True,
      capacity=batch_size * batch_size)

  return _build_training_batch_dict(batch, unroll_length, batch_size)
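A rough sketch of how this builder might be driven from a training script (not part of the file): it assumes an InputReader proto whose external_input_reader carries the tf_record_video_input_reader extension from protos/input_reader_google.proto, and a model/lstm_model config pair parsed as in model_builder.py above. The record path and field values are illustrative, and the import paths follow the external object_detection layout.

# Hypothetical usage sketch.
from google.protobuf import text_format
from lstm_object_detection import seq_dataset_builder
from object_detection.protos import input_reader_pb2

_INPUT_READER_TEXT = """
  shuffle: true
  num_readers: 4
  external_input_reader {
    [lstm_object_detection.input_readers.GoogleInputReader.google_input_reader] {
      tf_record_video_input_reader {
        input_path: '/path/to/sequence_examples.tfrecord'
        data_type: TF_SEQUENCE_EXAMPLE
        video_length: 4
      }
    }
  }
"""


def build_training_inputs(model_config, lstm_config, batch_size=8):
  # model_config / lstm_config: the parsed model and lstm_model protos from
  # the pipeline config (see model_builder.py above).
  input_reader_config = input_reader_pb2.InputReader()
  text_format.Merge(_INPUT_READER_TEXT, input_reader_config)
  return seq_dataset_builder.build(
      input_reader_config,
      model_config,
      lstm_config,
      unroll_length=lstm_config.train_unroll_length,
      batch_size=batch_size)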