Commit e00e0e13 authored by dreamdragon's avatar dreamdragon

Merge remote-tracking branch 'upstream/master'

parents b915db4e 402b561b
...@@ -12,7 +12,7 @@ import "object_detection/protos/post_processing.proto";
import "object_detection/protos/region_similarity_calculator.proto";
// Configuration for Single Shot Detection (SSD) models.
// Next id: 22
message Ssd {
// Number of classes to predict.
...@@ -92,11 +92,17 @@ message Ssd {
// Minimum number of effective negative samples.
// Only applies if use_expected_classification_loss_under_sampling is true.
optional float min_num_negative_samples = 19 [default=0];
// Desired number of effective negative samples per positive sample.
// Only applies if use_expected_classification_loss_under_sampling is true.
optional float desired_negative_sampling_ratio = 20 [default=3];
// Whether to add an implicit background class to one-hot encodings of
// groundtruth labels. Set to false if using groundtruth labels with an
// explicit background class, using multiclass scores, or if training a single
// class model.
optional bool add_background_class = 21 [default = true];
}
......
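As a rough illustration of the new `add_background_class` option (this sketch is not part of the diff, and the helper name below is hypothetical), the flag simply decides whether a background slot is reserved at index 0 of the one-hot groundtruth encoding:

```python
import numpy as np

def one_hot_targets(labels, num_classes, add_background_class=True):
  """Hypothetical helper: one-hot encode labels, optionally reserving slot 0 for background."""
  num_class_slots = num_classes + 1 if add_background_class else num_classes
  offset = 1 if add_background_class else 0
  targets = np.zeros((len(labels), num_class_slots), dtype=np.float32)
  for row, label in enumerate(labels):
    targets[row, label + offset] = 1.0
  return targets

# Default: an implicit background class, so a 3-class problem uses 4 slots.
print(one_hot_targets([0, 2], num_classes=3).shape)                              # (2, 4)
# With add_background_class: false (explicit background labels, multiclass
# scores, or a single-class model), only num_classes slots are produced.
print(one_hot_targets([0, 2], num_classes=3, add_background_class=False).shape)  # (2, 3)
```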
...@@ -6,7 +6,7 @@ import "object_detection/protos/optimizer.proto";
import "object_detection/protos/preprocessor.proto";
// Message for configuring DetectionModel training jobs (train.py).
// Next id: 28
message TrainConfig {
// Effective batch size to use for training.
// For TPU (or sync SGD jobs), the batch size per core (or GPU) is going to be
...@@ -115,4 +115,7 @@ message TrainConfig {
// Whether to use bfloat16 for training.
optional bool use_bfloat16 = 26 [default=false];
// Whether to summarize gradients.
optional bool summarize_gradients = 27 [default=false];
}
trainingInput:
  runtimeVersion: "1.9"
  scaleTier: CUSTOM
  masterType: standard_gpu
  workerCount: 5
......
# Quantized trained SSD with Mobilenet v2 on Open Images v4.
# Non-face boxes are dropped during training and non-face groundtruth boxes are
# ignored when evaluating.
#
# Users should configure the fine_tune_checkpoint field in the train config as
# well as the label_map_path and input_path fields in the train_input_reader and
# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that
# should be configured.
model {
ssd {
num_classes: 1
image_resizer {
fixed_shape_resizer {
height: 320
width: 320
}
}
feature_extractor {
type: "ssd_mobilenet_v2"
depth_multiplier: 1.0
min_depth: 16
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 4.0e-05
}
}
initializer {
truncated_normal_initializer {
mean: 0.0
stddev: 0.03
}
}
activation: RELU_6
batch_norm {
decay: 0.9997
center: true
scale: true
epsilon: 0.001
train: true
}
}
pad_to_multiple: 32
use_explicit_padding: true
}
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
box_predictor {
convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 4.0e-05
}
}
initializer {
truncated_normal_initializer {
mean: 0.0
stddev: 0.03
}
}
activation: RELU_6
batch_norm {
decay: 0.9997
center: true
scale: true
epsilon: 0.001
train: true
}
}
min_depth: 0
max_depth: 0
num_layers_before_predictor: 0
use_dropout: false
kernel_size: 3
box_code_size: 4
apply_sigmoid_to_scores: false
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
height_stride: 16
height_stride: 32
height_stride: 64
height_stride: 128
height_stride: 256
height_stride: 512
width_stride: 16
width_stride: 32
width_stride: 64
width_stride: 128
width_stride: 256
width_stride: 512
}
}
post_processing {
batch_non_max_suppression {
score_threshold: 1.0e-08
iou_threshold: 0.5
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
normalize_loss_by_num_matches: true
loss {
localization_loss {
weighted_smooth_l1 {
}
}
classification_loss {
weighted_sigmoid {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 10
}
classification_weight: 1.0
localization_weight: 1.0
}
}
}
train_config {
batch_size: 32
data_augmentation_options {
random_horizontal_flip {
keypoint_flip_permutation: 1
keypoint_flip_permutation: 0
keypoint_flip_permutation: 2
keypoint_flip_permutation: 3
keypoint_flip_permutation: 5
keypoint_flip_permutation: 4
}
}
data_augmentation_options {
ssd_random_crop_fixed_aspect_ratio {
}
}
optimizer {
rms_prop_optimizer {
learning_rate {
exponential_decay_learning_rate {
initial_learning_rate: 0.004
decay_steps: 800720
decay_factor: 0.95
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
fine_tune_checkpoint: ""
}
train_input_reader {
label_map_path: "PATH_TO_BE_CONFIGURED/face_label_map.pbtxt"
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/face_train.record-?????-of-00100"
}
}
eval_config {
metrics_set: "coco_detection_metrics"
use_moving_averages: true
}
eval_input_reader {
label_map_path: "PATH_TO_BE_CONFIGURED/face_label_map.pbtxt"
shuffle: false
num_readers: 1
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/face_val.record-?????-of-00010"
}
}
graph_rewriter {
quantization {
delay: 500000
weight_bits: 8
activation_bits: 8
}
}
# Quantized trained SSD with Mobilenet v2 on MSCOCO Dataset.
# Users should configure the fine_tune_checkpoint field in the train config as
# well as the label_map_path and input_path fields in the train_input_reader and
# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that
# should be configured.
model {
ssd {
num_classes: 90
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
}
}
image_resizer {
fixed_shape_resizer {
height: 300
width: 300
}
}
box_predictor {
convolutional_box_predictor {
min_depth: 0
max_depth: 0
num_layers_before_predictor: 0
use_dropout: false
dropout_keep_probability: 0.8
kernel_size: 1
box_code_size: 4
apply_sigmoid_to_scores: false
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
}
feature_extractor {
type: 'ssd_mobilenet_v2'
min_depth: 16
depth_multiplier: 1.0
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
loss {
classification_loss {
weighted_sigmoid {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 3
}
classification_weight: 1.0
localization_weight: 1.0
}
normalize_loss_by_num_matches: true
post_processing {
batch_non_max_suppression {
score_threshold: 1e-8
iou_threshold: 0.6
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
}
}
train_config: {
batch_size: 24
optimizer {
rms_prop_optimizer: {
learning_rate: {
exponential_decay_learning_rate {
initial_learning_rate: 0.004
decay_steps: 800720
decay_factor: 0.95
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
fine_tune_checkpoint: "PATH_TO_BE_CONFIGURED/model.ckpt"
fine_tune_checkpoint_type: "detection"
# Note: The line below limits the training process to 200K steps, which we
# empirically found to be sufficient to train this dataset. This
# effectively bypasses the learning rate schedule (the learning rate will
# never decay). Remove the line below to train indefinitely.
num_steps: 200000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
ssd_random_crop {
}
}
}
train_input_reader: {
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/mscoco_train.record-?????-of-00100"
}
label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt"
}
eval_config: {
num_examples: 8000
# Note: The below line limits the evaluation process to 10 evaluations.
# Remove the below line to evaluate indefinitely.
max_evals: 10
}
eval_input_reader: {
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/mscoco_val.record-?????-of-00010"
}
label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt"
shuffle: false
num_readers: 1
}
graph_rewriter {
quantization {
delay: 48000
weight_bits: 8
activation_bits: 8
}
}
\ No newline at end of file
...@@ -76,12 +76,14 @@ def get_spatial_image_size(image_resizer_config):
raise ValueError("Unknown image resizer type.")
def get_configs_from_pipeline_file(pipeline_config_path, config_override=None):
"""Reads config from a file containing pipeline_pb2.TrainEvalPipelineConfig.
Args:
pipeline_config_path: Path to pipeline_pb2.TrainEvalPipelineConfig text
proto.
config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
override pipeline_config_path.
Returns:
Dictionary of configuration objects. Keys are `model`, `train_config`,
...@@ -92,6 +94,8 @@ def get_configs_from_pipeline_file(pipeline_config_path):
with tf.gfile.GFile(pipeline_config_path, "r") as f:
proto_str = f.read()
text_format.Merge(proto_str, pipeline_config)
if config_override:
text_format.Merge(config_override, pipeline_config)
return create_configs_from_pipeline_proto(pipeline_config)
...@@ -430,7 +434,7 @@ def merge_external_params_with_configs(configs, hparams=None, kwargs_dict=None):
final learning rates.
In this case key can be one of the following formats:
1. legacy update: single string that indicates the attribute to be
updated. E.g. 'label_map_path', 'eval_input_path', 'shuffle'.
Note that when updating fields (e.g. eval_input_path, eval_shuffle) in
eval_input_configs, the override will only be applied when
eval_input_configs has exactly 1 element.
......
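For reference, a minimal sketch of the new `config_override` argument (the path and override values below are placeholders, not taken from this commit): any `TrainEvalPipelineConfig` text proto passed as `config_override` is merged on top of the file contents, so individual fields can be changed without editing the config file.

```python
from object_detection.utils import config_util

# Placeholder path for illustration only.
pipeline_config_path = "path/to/pipeline.config"

# Override a single field on top of whatever the file specifies.
config_override = """
train_config {
  batch_size: 8
}
"""

configs = config_util.get_configs_from_pipeline_file(
    pipeline_config_path, config_override=config_override)
print(configs["train_config"].batch_size)  # 8
```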
...@@ -633,11 +633,37 @@ class ObjectDetectionEvaluation(object):
nms_max_output_boxes=10000,
use_weighted_mean_ap=False,
label_id_offset=0,
group_of_weight=0.0,
per_image_eval_class=per_image_evaluation.PerImageEvaluation):
"""Constructor.
Args:
num_groundtruth_classes: Number of ground-truth classes.
matching_iou_threshold: IOU threshold used for matching detected boxes
to ground-truth boxes.
nms_iou_threshold: IOU threshold used for non-maximum suppression.
nms_max_output_boxes: Maximum number of boxes returned by non-maximum
suppression.
use_weighted_mean_ap: (optional) boolean which determines if the mean
average precision is computed directly from the scores and tp_fp_labels
of all classes.
label_id_offset: The label id offset.
group_of_weight: Weight of group-of boxes. If set to 0, detections of the
correct class within a group-of box are ignored. If weight is > 0, then
if at least one detection falls within a group-of box with
matching_iou_threshold, weight group_of_weight is added to true
positives. Consequently, if no detection falls within a group-of box,
weight group_of_weight is added to false negatives.
per_image_eval_class: The class that contains functions for computing
per image metrics.
Raises:
ValueError: if num_groundtruth_classes is smaller than 1.
"""
if num_groundtruth_classes < 1:
raise ValueError('Need at least 1 groundtruth class for evaluation.')
self.per_image_eval = per_image_eval_class(
num_groundtruth_classes=num_groundtruth_classes,
matching_iou_threshold=matching_iou_threshold,
nms_iou_threshold=nms_iou_threshold,
...@@ -659,14 +685,16 @@ class ObjectDetectionEvaluation(object):
self._initialize_detections()
def _initialize_detections(self):
"""Initializes internal data structures."""
self.detection_keys = set()
self.scores_per_class = [[] for _ in range(self.num_class)]
self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)]
self.num_images_correctly_detected_per_class = np.zeros(self.num_class)
self.average_precision_per_class = np.empty(self.num_class, dtype=float)
self.average_precision_per_class.fill(np.nan)
self.precisions_per_class = [np.nan] * self.num_class
self.recalls_per_class = [np.nan] * self.num_class
self.corloc_per_class = np.ones(self.num_class, dtype=float)
def clear_detections(self):
...@@ -867,8 +895,8 @@ class ObjectDetectionEvaluation(object):
logging.info(scores)
precision, recall = metrics.compute_precision_recall(
scores, tp_fp_labels, self.num_gt_instances_per_class[class_index])
self.precisions_per_class[class_index] = precision
self.recalls_per_class[class_index] = recall
average_precision = metrics.compute_average_precision(precision, recall)
self.average_precision_per_class[class_index] = average_precision
......
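A small usage sketch of the new `per_image_eval_class` hook (the subclass below is hypothetical; module paths follow the `object_detection.utils` layout assumed by this diff):

```python
from object_detection.utils import object_detection_evaluation
from object_detection.utils import per_image_evaluation

class MyPerImageEvaluation(per_image_evaluation.PerImageEvaluation):
  """Hypothetical subclass that could customize per-image metric computation."""
  pass

evaluator = object_detection_evaluation.ObjectDetectionEvaluation(
    num_groundtruth_classes=3,
    per_image_eval_class=MyPerImageEvaluation)

# precisions_per_class / recalls_per_class are now pre-sized and indexed by
# class, so classes without detections keep a NaN placeholder.
print(len(evaluator.precisions_per_class))  # 3
```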
...@@ -872,7 +872,8 @@ def merge_boxes_with_multiple_labels(boxes,
merged_box_indices)
def nearest_neighbor_upsampling(input_tensor, scale=None, height_scale=None,
width_scale=None):
"""Nearest neighbor upsampling implementation.
Nearest neighbor upsampling function that maps input tensor with shape
...@@ -883,19 +884,33 @@ def nearest_neighbor_upsampling(input_tensor, scale):
Args:
input_tensor: A float32 tensor of size [batch, height_in, width_in,
channels].
scale: An integer multiple to scale resolution of input data in both height
and width dimensions.
height_scale: An integer multiple to scale the height of input image. This
option when provided overrides `scale` option.
width_scale: An integer multiple to scale the width of input image. This
option when provided overrides `scale` option.
Returns:
data_up: A float32 tensor of size
[batch, height_in*scale, width_in*scale, channels].
Raises:
ValueError: If both scale and height_scale or if both scale and width_scale
are None.
"""
if not scale and (height_scale is None or width_scale is None):
raise ValueError('Provide either `scale` or `height_scale` and'
' `width_scale`.')
with tf.name_scope('nearest_neighbor_upsampling'):
h_scale = scale if height_scale is None else height_scale
w_scale = scale if width_scale is None else width_scale
(batch_size, height, width,
channels) = shape_utils.combined_static_and_dynamic_shape(input_tensor)
output_tensor = tf.reshape(
input_tensor, [batch_size, height, 1, width, 1, channels]) * tf.ones(
[1, 1, h_scale, 1, w_scale, 1], dtype=input_tensor.dtype)
return tf.reshape(output_tensor,
[batch_size, height * h_scale, width * w_scale, channels])
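The reshape-and-broadcast trick used above can be sanity-checked outside TensorFlow; the following NumPy sketch (not part of the diff) mirrors it with separate height and width scales:

```python
import numpy as np

def np_nearest_neighbor_upsampling(x, height_scale, width_scale):
  """NumPy mirror of the reshape/broadcast trick used in nearest_neighbor_upsampling."""
  batch, height, width, channels = x.shape
  # Insert singleton axes, broadcast against ones, then collapse back.
  up = np.reshape(x, [batch, height, 1, width, 1, channels]) * np.ones(
      [1, 1, height_scale, 1, width_scale, 1], dtype=x.dtype)
  return np.reshape(up, [batch, height * height_scale, width * width_scale, channels])

x = np.arange(4, dtype=np.float32).reshape([1, 2, 2, 1])
print(np_nearest_neighbor_upsampling(x, height_scale=2, width_scale=3).shape)  # (1, 4, 6, 1)
```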
def matmul_gather_on_zeroth_axis(params, indices, scope=None):
...@@ -1072,29 +1087,35 @@ def native_crop_and_resize(image, boxes, crop_size, scope=None):
return tf.reshape(cropped_regions, final_shape)
def expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses, unmatched_cls_losses,
desired_negative_sampling_ratio, min_num_negative_samples):
"""Computes classification loss by background/foreground weighting.
The weighting is such that the effective background/foreground weight ratio
is the desired_negative_sampling_ratio. If p_i is the foreground probability
of anchor a_i, L(a_i) is the anchor's loss, N is the number of anchors, M
is the sum of foreground probabilities across anchors, and K is the desired
ratio between the number of negative and positive samples, then the total loss
L is calculated as:
beta = K*M/(N-M)
L = sum_{i=1}^N [p_i * L_p(a_i) + beta * (1 - p_i) * L_n(a_i)]
where L_p(a_i) is the loss against target assuming the anchor was matched,
otherwise zero, and L_n(a_i) is the loss against the background target
assuming the anchor was unmatched, otherwise zero.
Args:
batch_cls_targets: A tensor with shape [batch_size, num_anchors, num_classes
+ 1], where 0'th index is the background class, containing the class
distribution for the target assigned to a given anchor.
cls_losses: Float tensor of shape [batch_size, num_anchors] representing
anchorwise classification losses.
unmatched_cls_losses: loss for each anchor against the unmatched class
target.
desired_negative_sampling_ratio: The desired background/foreground weight
ratio.
min_num_negative_samples: Minimum number of effective negative samples.
Used only when there are no positive examples.
Returns:
...@@ -1103,36 +1124,44 @@ def expected_classification_loss_under_sampling(
num_anchors = tf.cast(tf.shape(batch_cls_targets)[1], tf.float32)
# find the p_i
foreground_probabilities = 1 - batch_cls_targets[:, :, 0]
foreground_sum = tf.reduce_sum(foreground_probabilities, axis=-1)
# for each anchor, expected_j is the expected number of positive anchors
# given that this anchor was sampled as negative.
tiled_foreground_sum = tf.tile(
tf.reshape(foreground_sum, [-1, 1]),
[1, tf.cast(num_anchors, tf.int32)])
expected_j = tiled_foreground_sum - foreground_probabilities
k = desired_negative_sampling_ratio
# compute beta
expected_negatives = tf.to_float(num_anchors) - expected_j
desired_negatives = k * expected_j
desired_negatives = tf.where(
tf.greater(desired_negatives, expected_negatives), expected_negatives,
desired_negatives)
# probability that an anchor is sampled for the loss computation given that it
# is negative.
beta = desired_negatives / expected_negatives
# where the foreground sum is zero, use a minimum negative weight.
min_negative_weight = 1.0 * min_num_negative_samples / num_anchors
beta = tf.where(
tf.equal(tiled_foreground_sum, 0),
min_negative_weight * tf.ones_like(beta), beta)
foreground_weights = foreground_probabilities
background_weights = (1 - foreground_weights) * beta
weighted_foreground_losses = foreground_weights * cls_losses
weighted_background_losses = background_weights * unmatched_cls_losses
cls_losses = tf.reduce_sum(
weighted_foreground_losses, axis=-1) + tf.reduce_sum(
weighted_background_losses, axis=-1)
return cls_losses
def foreground_probabilities_from_targets(batch_cls_targets):
foreground_probabilities = 1 - batch_cls_targets[:, :, 0]
return foreground_probabilities
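To make the weighting above concrete, here is a NumPy walk-through (not part of the diff) of the hard-label case used in the tests below, with N = 2 anchors, K = 2 and one matched anchor per image:

```python
import numpy as np

batch_cls_targets = np.array(
    [[[1., 0, 0], [0, 1., 0]], [[1., 0, 0], [0, 1., 0]]], dtype=np.float32)
cls_losses = np.array([[1, 2], [3, 4]], dtype=np.float32)
unmatched_cls_losses = np.array([[10, 20], [30, 40]], dtype=np.float32)
k, min_num_negative_samples = 2.0, 1.0

num_anchors = batch_cls_targets.shape[1]
p = 1 - batch_cls_targets[:, :, 0]                # foreground probability per anchor
foreground_sum = p.sum(axis=-1, keepdims=True)    # M per image
expected_j = foreground_sum - p                   # expected positives, excluding anchor i
expected_negatives = num_anchors - expected_j
desired_negatives = np.minimum(k * expected_j, expected_negatives)
beta = np.where(foreground_sum == 0,
                min_num_negative_samples / num_anchors,
                desired_negatives / expected_negatives)
loss = (p * cls_losses + (1 - p) * beta * unmatched_cls_losses).sum(axis=-1)
print(loss)  # [12. 34.], i.e. [2 + 10, 4 + 30], matching the hard-label test below
```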
...@@ -1222,7 +1222,7 @@ class MergeBoxesWithMultipleLabelsTest(tf.test.TestCase):
class NearestNeighborUpsamplingTest(test_case.TestCase):
def test_upsampling_with_single_scale(self):
def graph_fn(inputs):
custom_op_output = ops.nearest_neighbor_upsampling(inputs, scale=2)
...@@ -1236,6 +1236,22 @@ class NearestNeighborUpsamplingTest(test_case.TestCase):
[[2], [2], [3], [3]]]]
self.assertAllClose(custom_op_output, expected_output)
def test_upsampling_with_separate_height_width_scales(self):
def graph_fn(inputs):
custom_op_output = ops.nearest_neighbor_upsampling(inputs,
height_scale=2,
width_scale=3)
return custom_op_output
inputs = np.reshape(np.arange(4).astype(np.float32), [1, 2, 2, 1])
custom_op_output = self.execute(graph_fn, [inputs])
expected_output = [[[[0], [0], [0], [1], [1], [1]],
[[0], [0], [0], [1], [1], [1]],
[[2], [2], [2], [3], [3], [3]],
[[2], [2], [2], [3], [3], [3]]]]
self.assertAllClose(custom_op_output, expected_output)
class MatmulGatherOnZerothAxis(test_case.TestCase):
...@@ -1454,78 +1470,182 @@ class OpsTestExpectedClassificationLoss(test_case.TestCase):
def testExpectedClassificationLossUnderSamplingWithHardLabels(self):
def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples):
return ops.expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples)
batch_cls_targets = np.array(
[[[1., 0, 0], [0, 1., 0]], [[1., 0, 0], [0, 1., 0]]], dtype=np.float32)
cls_losses = np.array([[1, 2], [3, 4]], dtype=np.float32)
unmatched_cls_losses = np.array([[10, 20], [30, 40]], dtype=np.float32)
negative_to_positive_ratio = np.array([2], dtype=np.float32)
min_num_negative_samples = np.array([1], dtype=np.float32)
classification_loss = self.execute(graph_fn, [
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples
])
# expected_foreground_sum = [1,1]
# expected_expected_j = [[1, 0], [1, 0]]
# expected_expected_negatives = [[1, 2], [1, 2]]
# expected_desired_negatives = [[2, 0], [2, 0]]
# expected_beta = [[1, 0], [1, 0]]
# expected_foreground_weights = [[0, 1], [0, 1]]
# expected_background_weights = [[1, 0], [1, 0]]
# expected_weighted_foreground_losses = [[0, 2], [0, 4]]
# expected_weighted_background_losses = [[10, 0], [30, 0]]
# expected_classification_loss_under_sampling = [12, 34]
expected_classification_loss_under_sampling = [2 + 10, 4 + 30]
self.assertAllClose(expected_classification_loss_under_sampling,
classification_loss)
def testExpectedClassificationLossUnderSamplingWithHardLabelsMoreNegatives(
self):
def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples):
return ops.expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples)
batch_cls_targets = np.array(
[[[1., 0, 0], [0, 1., 0], [1., 0, 0], [1., 0, 0], [1., 0, 0]]],
dtype=np.float32)
cls_losses = np.array([[1, 2, 3, 4, 5]], dtype=np.float32)
unmatched_cls_losses = np.array([[10, 20, 30, 40, 50]], dtype=np.float32)
negative_to_positive_ratio = np.array([2], dtype=np.float32)
min_num_negative_samples = np.array([1], dtype=np.float32)
classification_loss = self.execute(graph_fn, [
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples
])
# expected_foreground_sum = [1]
# expected_expected_j = [[1, 0, 1, 1, 1]]
# expected_expected_negatives = [[4, 5, 4, 4, 4]]
# expected_desired_negatives = [[2, 0, 2, 2, 2]]
# expected_beta = [[.5, 0, .5, .5, .5]]
# expected_foreground_weights = [[0, 1, 0, 0, 0]]
# expected_background_weights = [[.5, 0, .5, .5, .5]]
# expected_weighted_foreground_losses = [[0, 2, 0, 0, 0]]
# expected_weighted_background_losses = [[10*.5, 0, 30*.5, 40*.5, 50*.5]]
# expected_classification_loss_under_sampling = [5+2+15+20+25]
expected_classification_loss_under_sampling = [5 + 2 + 15 + 20 + 25]
self.assertAllClose(expected_classification_loss_under_sampling,
classification_loss)
def testExpectedClassificationLossUnderSamplingWithAllNegative(self):
def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses):
return ops.expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples)
batch_cls_targets = np.array(
[[[1, 0, 0], [1, 0, 0]], [[1, 0, 0], [1, 0, 0]]], dtype=np.float32)
cls_losses = np.array([[1, 2], [3, 4]], dtype=np.float32)
unmatched_cls_losses = np.array([[10, 20], [30, 40]], dtype=np.float32)
negative_to_positive_ratio = np.array([2], dtype=np.float32)
min_num_negative_samples = np.array([1], dtype=np.float32)
classification_loss = self.execute(
graph_fn, [batch_cls_targets, cls_losses, unmatched_cls_losses])
# expected_foreground_sum = [0,0]
# expected_expected_j = [[0, 0], [0, 0]]
# expected_expected_negatives = [[2, 2], [2, 2]]
# expected_desired_negatives = [[0, 0], [0, 0]]
# expected_beta = [[0, 0],[0, 0]]
# expected_foreground_weights = [[0, 0], [0, 0]]
# expected_background_weights = [[.5, .5], [.5, .5]]
# expected_weighted_foreground_losses = [[0, 0], [0, 0]]
# expected_weighted_background_losses = [[5, 10], [15, 20]]
# expected_classification_loss_under_sampling = [15, 35]
expected_classification_loss_under_sampling = [
10 * .5 + 20 * .5, 30 * .5 + 40 * .5
]
self.assertAllClose(expected_classification_loss_under_sampling,
classification_loss)
def testExpectedClassificationLossUnderSamplingWithAllPositive(self):
def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses):
return ops.expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples)
batch_cls_targets = np.array(
[[[0, 1., 0], [0, 1., 0]], [[0, 1, 0], [0, 0, 1]]], dtype=np.float32)
cls_losses = np.array([[1, 2], [3, 4]], dtype=np.float32)
unmatched_cls_losses = np.array([[10, 20], [30, 40]], dtype=np.float32)
negative_to_positive_ratio = np.array([2], dtype=np.float32)
min_num_negative_samples = np.array([1], dtype=np.float32)
classification_loss = self.execute(
graph_fn, [batch_cls_targets, cls_losses, unmatched_cls_losses])
# expected_foreground_sum = [2,2]
# expected_expected_j = [[1, 1], [1, 1]]
# expected_expected_negatives = [[1, 1], [1, 1]]
# expected_desired_negatives = [[1, 1], [1, 1]]
# expected_beta = [[1, 1],[1, 1]]
# expected_foreground_weights = [[1, 1], [1, 1]]
# expected_background_weights = [[0, 0], [0, 0]]
# expected_weighted_foreground_losses = [[1, 2], [3, 4]]
# expected_weighted_background_losses = [[0, 0], [0, 0]]
# expected_classification_loss_under_sampling = [3, 7]
expected_classification_loss_under_sampling = [1 + 2, 3 + 4]
self.assertAllClose(expected_classification_loss_under_sampling,
classification_loss)
def testExpectedClassificationLossUnderSamplingWithSoftLabels(self):
def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples):
return ops.expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples)
batch_cls_targets = np.array([[[.75, .25, 0], [0.25, .75, 0], [.75, .25, 0],
[0.25, .75, 0], [1., 0, 0]]],
dtype=np.float32)
cls_losses = np.array([[1, 2, 3, 4, 5]], dtype=np.float32)
unmatched_cls_losses = np.array([[10, 20, 30, 40, 50]], dtype=np.float32)
negative_to_positive_ratio = np.array([2], dtype=np.float32)
min_num_negative_samples = np.array([1], dtype=np.float32)
classification_loss = self.execute(graph_fn, [
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples
])
# expected_foreground_sum = [2]
# expected_expected_j = [[1.75, 1.25, 1.75, 1.25, 2]]
# expected_expected_negatives = [[3.25, 3.75, 3.25, 3.75, 3]]
# expected_desired_negatives = [[3.25, 2.5, 3.25, 2.5, 3]]
# expected_beta = [[1, 2/3, 1, 2/3, 1]]
# expected_foreground_weights = [[0.25, .75, .25, .75, 0]]
# expected_background_weights = [[[.75, 1/6., .75, 1/6., 1]]]
# expected_weighted_foreground_losses = [[.25*1, .75*2, .25*3, .75*4, 0*5]]
# expected_weighted_background_losses = [[
# .75*10, 1/6.*20, .75*30, 1/6.*40, 1*50]]
# expected_classification_loss_under_sampling = sum([
# .25*1, .75*2, .25*3, .75*4, 0, .75*10, 1/6.*20, .75*30,
# 1/6.*40, 1*50])
expected_classification_loss_under_sampling = [
sum([
.25 * 1, .75 * 2, .25 * 3, .75 * 4, 0, .75 * 10, 1 / 6. * 20,
.75 * 30, 1 / 6. * 40, 1 * 50
])
]
self.assertAllClose(expected_classification_loss_under_sampling,
classification_loss)
......
...@@ -45,8 +45,10 @@ class MockBoxCoder(box_coder.BoxCoder):
class MockBoxPredictor(box_predictor.BoxPredictor):
"""Simple box predictor that ignores inputs and outputs all zeros."""
def __init__(self, is_training, num_classes, add_background_class=True,
predict_mask=False):
super(MockBoxPredictor, self).__init__(is_training, num_classes)
self._add_background_class = add_background_class
self._predict_mask = predict_mask
def _predict(self, image_features, num_predictions_per_location):
...@@ -57,10 +59,13 @@ class MockBoxPredictor(box_predictor.BoxPredictor):
num_anchors = (combined_feature_shape[1] * combined_feature_shape[2])
code_size = 4
zero = tf.reduce_sum(0 * image_feature)
num_class_slots = self.num_classes
if self._add_background_class:
num_class_slots = num_class_slots + 1
box_encodings = zero + tf.zeros(
(batch_size, num_anchors, 1, code_size), dtype=tf.float32)
class_predictions_with_background = zero + tf.zeros(
(batch_size, num_anchors, num_class_slots), dtype=tf.float32)
masks = zero + tf.zeros(
(batch_size, num_anchors, self.num_classes, DEFAULT_MASK_SIZE,
DEFAULT_MASK_SIZE),
...@@ -80,9 +85,11 @@ class MockBoxPredictor(box_predictor.BoxPredictor):
class MockKerasBoxPredictor(box_predictor.KerasBoxPredictor):
"""Simple box predictor that ignores inputs and outputs all zeros."""
def __init__(self, is_training, num_classes, add_background_class=True,
predict_mask=False):
super(MockKerasBoxPredictor, self).__init__(
is_training, num_classes, False, False)
self._add_background_class = add_background_class
self._predict_mask = predict_mask
def _predict(self, image_features, **kwargs):
...@@ -93,10 +100,13 @@ class MockKerasBoxPredictor(box_predictor.KerasBoxPredictor):
num_anchors = (combined_feature_shape[1] * combined_feature_shape[2])
code_size = 4
zero = tf.reduce_sum(0 * image_feature)
num_class_slots = self.num_classes
if self._add_background_class:
num_class_slots = num_class_slots + 1
box_encodings = zero + tf.zeros(
(batch_size, num_anchors, 1, code_size), dtype=tf.float32)
class_predictions_with_background = zero + tf.zeros(
(batch_size, num_anchors, num_class_slots), dtype=tf.float32)
masks = zero + tf.zeros(
(batch_size, num_anchors, self.num_classes, DEFAULT_MASK_SIZE,
DEFAULT_MASK_SIZE),
......
package(default_visibility = ["//visibility:public"])
# struct2depth
This is a method for unsupervised learning of depth and egomotion from monocular video. It achieves new state-of-the-art results on both tasks by explicitly modeling 3D object motion, performing on-line refinement, and improving quality for moving objects with novel loss formulations. It will appear in the following paper:
**V. Casser, S. Pirk, R. Mahjourian, A. Angelova, Depth Prediction Without the Sensors: Leveraging Structure for Unsupervised Learning from Monocular Videos, AAAI Conference on Artificial Intelligence, 2019**
https://arxiv.org/pdf/1811.06152.pdf
This code is implemented and supported by Vincent Casser (git username: VincentCa) and Anelia Angelova (git username: AneliaAngelova). Please contact anelia@google.com for questions.
Project website: https://sites.google.com/view/struct2depth.
## Quick start: Running training
Before running training, run the gen_data_* script for the respective dataset (KITTI or Cityscapes) to generate the data in the appropriate format. It is assumed that motion masks have already been generated and stored as images.
Models are trained starting from an ImageNet-pretrained model.
```shell
ckpt_dir="your/checkpoint/folder"
data_dir="KITTI_SEQ2_LR/" # Set for KITTI
data_dir="CITYSCAPES_SEQ2_LR/" # Set for Cityscapes
imagenet_ckpt="resnet_pretrained/model.ckpt"
python train.py \
--logtostderr \
--checkpoint_dir $ckpt_dir \
--data_dir $data_dir \
--architecture resnet \
--imagenet_ckpt $imagenet_ckpt \
--imagenet_norm true \
--joint_encoder false
```
## Running depth/egomotion inference on an image folder
The KITTI model is trained on the raw image data (resized to 416 x 128), with inputs standardized before they are fed to the network; Cityscapes images are additionally cropped using the cropping parameters (192, 1856, 256, 768). If a different crop is used, additional training is likely necessary, so please follow the inference example below when using one of the models. The right choice of model can depend on several factors. For example, if a checkpoint is to be used for odometry, note that using segmentation masks can be advantageous for improved odometry with motion models (set *use_masks=true* at inference). On the other hand, all models can be used for single-frame depth estimation without any additional information.
```shell
input_dir="your/image/folder"
output_dir="your/output/folder"
model_checkpoint="your/model/checkpoint"
python inference.py \
--logtostderr \
--file_extension png \
--depth \
--egomotion true \
--input_dir $input_dir \
--output_dir $output_dir \
--model_ckpt $model_checkpoint
```
Note that the egomotion prediction expects the files in the input directory to form a consecutive sequence, and that sorting the filenames alphabetically must put them in the correct temporal order.
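For example, the zero-padded frame numbers written by the data generation scripts in this repository keep alphabetical and temporal order consistent, whereas unpadded numbers would not (illustration only):

```python
padded = ['0000000009.png', '0000000010.png', '0000000011.png']
unpadded = ['9.png', '10.png', '11.png']
print(sorted(padded))    # temporal order preserved
print(sorted(unpadded))  # ['10.png', '11.png', '9.png'] -- wrong order
```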
One can also run inference on KITTI by providing
```shell
--input_list_file ~/kitti-raw-uncompressed/test_files_eigen.txt
```
and on Cityscapes by passing
```shell
--input_list_file CITYSCAPES_FULL/test_files_cityscapes.txt
```
instead of *input_dir*.
Alternatively, inference can also be run on pre-processed images.
## Running on-line refinement
On-line refinement is executed on top of an existing inference folder, so make sure to run regular inference first. Then you can run the on-line fusion procedure as follows:
```shell
prediction_dir="some/prediction/dir"
model_ckpt="checkpoints/checkpoints_baseline/model-199160"
handle_motion="false"
size_constraint_weight="0" # This must be zero when not handling motion.
# If running on KITTI, set as follows:
data_dir="KITTI_SEQ2_LR_EIGEN/"
triplet_list_file="$data_dir/test_files_eigen_triplets.txt"
triplet_list_file_remains="$data_dir/test_files_eigen_triplets_remains.txt"
ft_name="kitti"
# If running on Cityscapes, set as follows:
data_dir="CITYSCAPES_SEQ2_LR_TEST/" # Set for Cityscapes
triplet_list_file="/CITYSCAPES_SEQ2_LR_TEST/test_files_cityscapes_triplets.txt"
triplet_list_file_remains="CITYSCAPES_SEQ2_LR_TEST/test_files_cityscapes_triplets_remains.txt"
ft_name="cityscapes"
python optimize.py \
--logtostderr \
--output_dir $prediction_dir \
--data_dir $data_dir \
--triplet_list_file $triplet_list_file \
--triplet_list_file_remains $triplet_list_file_remains \
--ft_name $ft_name \
--model_ckpt $model_ckpt \
--file_extension png \
--handle_motion $handle_motion \
--size_constraint_weight $size_constraint_weight
```
## Running evaluation
```shell
prediction_dir="some/prediction/dir"
# Use these settings for KITTI:
eval_list_file="KITTI_FULL/kitti-raw-uncompressed/test_files_eigen.txt"
eval_crop="garg"
eval_mode="kitti"
# Use these settings for Cityscapes:
eval_list_file="CITYSCAPES_FULL/test_files_cityscapes.txt"
eval_crop="none"
eval_mode="cityscapes"
python evaluate.py \
--logtostderr \
--prediction_dir $prediction_dir \
--eval_list_file $eval_list_file \
--eval_crop $eval_crop \
--eval_mode $eval_mode
```
## Credits
This code is implemented and supported by Vincent Casser and Anelia Angelova and can be found at
https://sites.google.com/view/struct2depth.
The core implementation is derived from
[vid2depth](https://github.com/tensorflow/models/tree/master/research/vid2depth)
by [Reza Mahjourian](mailto:rezama@google.com), which in turn is based on
[SfMLearner](https://github.com/tinghuiz/SfMLearner)
by [Tinghui Zhou](https://github.com/tinghuiz).
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common utilities for data pre-processing, e.g. matching moving object across frames."""
import numpy as np
def compute_overlap(mask1, mask2):
  """Returns the IoU (intersection over union) of two boolean masks."""
  # Use IoU here. Cast to float so the division is not truncated under Python 2.
  return float(np.sum(mask1 & mask2)) / np.sum(mask1 | mask2)
def align(seg_img1, seg_img2, seg_img3, threshold_same=0.3):
  """Aligns object ids across three consecutive segmentation maps: an id from
  seg_img1 is kept only if it can be chained, with IoU above threshold_same,
  through seg_img2 and seg_img3; matched segments are relabeled with the id
  from the first frame and everything else is zeroed."""
res_img1 = np.zeros_like(seg_img1)
res_img2 = np.zeros_like(seg_img2)
res_img3 = np.zeros_like(seg_img3)
remaining_objects2 = list(np.unique(seg_img2.flatten()))
remaining_objects3 = list(np.unique(seg_img3.flatten()))
for seg_id in np.unique(seg_img1):
# See if we can find correspondences to seg_id in seg_img2.
max_overlap2 = float('-inf')
max_segid2 = -1
for seg_id2 in remaining_objects2:
overlap = compute_overlap(seg_img1==seg_id, seg_img2==seg_id2)
if overlap>max_overlap2:
max_overlap2 = overlap
max_segid2 = seg_id2
if max_overlap2 > threshold_same:
max_overlap3 = float('-inf')
max_segid3 = -1
for seg_id3 in remaining_objects3:
overlap = compute_overlap(seg_img2==max_segid2, seg_img3==seg_id3)
if overlap>max_overlap3:
max_overlap3 = overlap
max_segid3 = seg_id3
if max_overlap3 > threshold_same:
res_img1[seg_img1==seg_id] = seg_id
res_img2[seg_img2==max_segid2] = seg_id
res_img3[seg_img3==max_segid3] = seg_id
remaining_objects2.remove(max_segid2)
remaining_objects3.remove(max_segid3)
return res_img1, res_img2, res_img3
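if __name__ == '__main__':
  # Tiny self-check with hypothetical toy arrays (not part of the original
  # module): objects that overlap sufficiently across all three frames keep
  # their id from the first frame; anything that cannot be chained through
  # all three frames is zeroed out.
  _a = np.array([[1, 1], [0, 2]])
  _b = np.array([[1, 1], [0, 2]])
  _c = np.array([[1, 0], [0, 2]])
  print(align(_a, _b, _c))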
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
""" Offline data generation for the Cityscapes dataset."""
import os
from absl import app
from absl import flags
from absl import logging
import numpy as np
import cv2
import glob
import alignment
from alignment import compute_overlap
from alignment import align
SKIP = 2
WIDTH = 416
HEIGHT = 128
SUB_FOLDER = 'train'
INPUT_DIR = '/usr/local/google/home/anelia/struct2depth/CITYSCAPES_FULL/'
OUTPUT_DIR = '/usr/local/google/home/anelia/struct2depth/CITYSCAPES_Processed/'
def crop(img, segimg, fx, fy, cx, cy):
# Perform center cropping, preserving 50% vertically.
middle_perc = 0.50
left = 1 - middle_perc
half = left / 2
a = img[int(img.shape[0]*(half)):int(img.shape[0]*(1-half)), :]
aseg = segimg[int(segimg.shape[0]*(half)):int(segimg.shape[0]*(1-half)), :]
cy /= (1 / middle_perc)
# Resize to match target height while preserving aspect ratio.
wdt = int((float(HEIGHT)*a.shape[1]/a.shape[0]))
x_scaling = float(wdt)/a.shape[1]
y_scaling = float(HEIGHT)/a.shape[0]
b = cv2.resize(a, (wdt, HEIGHT))
bseg = cv2.resize(aseg, (wdt, HEIGHT))
# Adjust intrinsics.
fx*=x_scaling
fy*=y_scaling
cx*=x_scaling
cy*=y_scaling
# Perform center cropping horizontally.
remain = b.shape[1] - WIDTH
cx /= (b.shape[1] / WIDTH)
c = b[:, int(remain/2):b.shape[1]-int(remain/2)]
cseg = bseg[:, int(remain/2):b.shape[1]-int(remain/2)]
return c, cseg, fx, fy, cx, cy
def run_all():
  dir_name = INPUT_DIR + '/leftImg8bit_sequence/' + SUB_FOLDER + '/*'
  print('Processing directory', dir_name)
  for location in glob.glob(dir_name):
location_name = os.path.basename(location)
print('Processing location', location_name)
files = sorted(glob.glob(location + '/*.png'))
files = [file for file in files if '-seg.png' not in file]
# Break down into sequences
sequences = {}
seq_nr = 0
last_seq = ''
last_imgnr = -1
for i in range(len(files)):
seq = os.path.basename(files[i]).split('_')[1]
nr = int(os.path.basename(files[i]).split('_')[2])
if seq!=last_seq or last_imgnr+1!=nr:
seq_nr+=1
last_imgnr = nr
last_seq = seq
if not seq_nr in sequences:
sequences[seq_nr] = []
sequences[seq_nr].append(files[i])
for (k,v) in sequences.items():
print('Processing sequence', k, 'with', len(v), 'elements...')
output_dir = OUTPUT_DIR + '/' + location_name + '_' + str(k)
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
files = sorted(v)
triplet = []
seg_triplet = []
ct = 1
# Find applicable intrinsics.
for j in range(len(files)):
osegname = os.path.basename(files[j]).split('_')[1]
oimgnr = os.path.basename(files[j]).split('_')[2]
applicable_intrinsics = INPUT_DIR + '/camera/' + SUB_FOLDER + '/' + location_name + '/' + location_name + '_' + osegname + '_' + oimgnr + '_camera.json'
# Get the intrinsics for one of the file of the sequence.
if os.path.isfile(applicable_intrinsics):
f = open(applicable_intrinsics, 'r')
lines = f.readlines()
f.close()
lines = [line.rstrip() for line in lines]
fx = float(lines[11].split(': ')[1].replace(',', ''))
fy = float(lines[12].split(': ')[1].replace(',', ''))
cx = float(lines[13].split(': ')[1].replace(',', ''))
cy = float(lines[14].split(': ')[1].replace(',', ''))
for j in range(0, len(files), SKIP):
img = cv2.imread(files[j])
segimg = cv2.imread(files[j].replace('.png', '-seg.png'))
smallimg, segimg, fx_this, fy_this, cx_this, cy_this = crop(img, segimg, fx, fy, cx, cy)
triplet.append(smallimg)
seg_triplet.append(segimg)
if len(triplet)==3:
cmb = np.hstack(triplet)
align1, align2, align3 = align(seg_triplet[0], seg_triplet[1], seg_triplet[2])
cmb_seg = np.hstack([align1, align2, align3])
cv2.imwrite(os.path.join(output_dir, str(ct).zfill(10) + '.png'), cmb)
cv2.imwrite(os.path.join(output_dir, str(ct).zfill(10) + '-fseg.png'), cmb_seg)
f = open(os.path.join(output_dir, str(ct).zfill(10) + '_cam.txt'), 'w')
f.write(str(fx_this) + ',0.0,' + str(cx_this) + ',0.0,' + str(fy_this) + ',' + str(cy_this) + ',0.0,0.0,1.0')
f.close()
del triplet[0]
del seg_triplet[0]
ct+=1
# Create file list for training. Be careful as it collects and includes all files recursively.
fn = open(OUTPUT_DIR + '/' + SUB_FOLDER + '.txt', 'w')
for f in glob.glob(OUTPUT_DIR + '/*/*.png'):
if '-seg.png' in f or '-fseg.png' in f:
continue
folder_name = f.split('/')[-2]
img_name = f.split('/')[-1].replace('.png', '')
fn.write(folder_name + ' ' + img_name + '\n')
fn.close()
def main(_):
run_all()
if __name__ == '__main__':
app.run(main)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
""" Offline data generation for the KITTI dataset."""
import os
from absl import app
from absl import flags
from absl import logging
import numpy as np
import cv2
import glob
import alignment
from alignment import compute_overlap
from alignment import align
SEQ_LENGTH = 3
WIDTH = 416
HEIGHT = 128
STEPSIZE = 1
INPUT_DIR = '/usr/local/google/home/anelia/struct2depth/KITTI_FULL/kitti-raw-uncompressed'
OUTPUT_DIR = '/usr/local/google/home/anelia/struct2depth/KITTI_procesed/'
def get_line(file, start):
file = open(file, 'r')
lines = file.readlines()
lines = [line.rstrip() for line in lines]
ret = None
for line in lines:
nline = line.split(': ')
if nline[0]==start:
ret = nline[1].split(' ')
ret = np.array([float(r) for r in ret], dtype=float)
ret = ret.reshape((3,4))[0:3, 0:3]
break
file.close()
return ret
def crop(img, segimg, fx, fy, cx, cy):
# Perform center cropping, preserving 50% vertically.
middle_perc = 0.50
left = 1-middle_perc
half = left/2
a = img[int(img.shape[0]*(half)):int(img.shape[0]*(1-half)), :]
aseg = segimg[int(segimg.shape[0]*(half)):int(segimg.shape[0]*(1-half)), :]
cy /= (1/middle_perc)
# Resize to match target height while preserving aspect ratio.
wdt = int((128*a.shape[1]/a.shape[0]))
x_scaling = float(wdt)/a.shape[1]
y_scaling = 128.0/a.shape[0]
b = cv2.resize(a, (wdt, 128))
bseg = cv2.resize(aseg, (wdt, 128))
# Adjust intrinsics.
fx*=x_scaling
fy*=y_scaling
cx*=x_scaling
cy*=y_scaling
# Perform center cropping horizontally.
remain = b.shape[1] - 416
cx /= (b.shape[1]/416)
c = b[:, int(remain/2):b.shape[1]-int(remain/2)]
cseg = bseg[:, int(remain/2):b.shape[1]-int(remain/2)]
return c, cseg, fx, fy, cx, cy
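# crop() above keeps the middle 50% of the rows, resizes to height 128 while
# preserving the aspect ratio, center-crops to width 416 and rescales the
# intrinsics to the new frame. A minimal, illustrative sketch (not part of the
# original pipeline); the image size and intrinsics below are made-up values:
def _example_crop():
  """Runs crop() on a synthetic 1024x2048 frame and returns the new geometry."""
  fake_img = np.zeros((1024, 2048, 3), dtype=np.uint8)
  fake_seg = np.zeros((1024, 2048, 3), dtype=np.uint8)
  cropped, _, fx, fy, cx, cy = crop(fake_img, fake_seg,
                                    fx=2262.5, fy=2262.5, cx=1024.0, cy=512.0)
  # cropped.shape == (128, 416, 3); fx, fy, cx, cy now refer to that frame.
  return cropped.shape, (fx, fy, cx, cy)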
def run_all():
global OUTPUT_DIR  # OUTPUT_DIR is reassigned below, so declare it global.
ct = 0
if not OUTPUT_DIR.endswith('/'):
OUTPUT_DIR = OUTPUT_DIR + '/'
for d in glob.glob(INPUT_DIR + '/*/'):
date = d.split('/')[-2]
file_calibration = d + 'calib_cam_to_cam.txt'
calib_raw = [get_line(file_calibration, 'P_rect_02'), get_line(file_calibration, 'P_rect_03')]
for d2 in glob.glob(d + '*/'):
seqname = d2.split('/')[-2]
print('Processing sequence', seqname)
for subfolder in ['image_02/data', 'image_03/data']:
ct = 1
seqname = d2.split('/')[-2] + subfolder.replace('image', '').replace('/data', '')
if not os.path.exists(OUTPUT_DIR + seqname):
os.mkdir(OUTPUT_DIR + seqname)
calib_camera = calib_raw[0] if subfolder=='image_02/data' else calib_raw[1]
folder = d2 + subfolder
files = glob.glob(folder + '/*.png')
files = [file for file in files if not 'disp' in file and not 'flip' in file and not 'seg' in file]
files = sorted(files)
for i in range(SEQ_LENGTH, len(files)+1, STEPSIZE):
imgnum = str(ct).zfill(10)
if os.path.exists(OUTPUT_DIR + seqname + '/' + imgnum + '.png'):
ct+=1
continue
big_img = np.zeros(shape=(HEIGHT, WIDTH*SEQ_LENGTH, 3))
wct = 0
for j in range(i-SEQ_LENGTH, i): # Collect frames for this sample.
img = cv2.imread(files[j])
ORIGINAL_HEIGHT, ORIGINAL_WIDTH, _ = img.shape
zoom_x = WIDTH/ORIGINAL_WIDTH
zoom_y = HEIGHT/ORIGINAL_HEIGHT
# Adjust intrinsics.
calib_current = calib_camera.copy()
calib_current[0, 0] *= zoom_x
calib_current[0, 2] *= zoom_x
calib_current[1, 1] *= zoom_y
calib_current[1, 2] *= zoom_y
calib_representation = ','.join([str(c) for c in calib_current.flatten()])
img = cv2.resize(img, (WIDTH, HEIGHT))
big_img[:,wct*WIDTH:(wct+1)*WIDTH] = img
wct+=1
cv2.imwrite(OUTPUT_DIR + seqname + '/' + imgnum + '.png', big_img)
f = open(OUTPUT_DIR + seqname + '/' + imgnum + '_cam.txt', 'w')
f.write(calib_representation)
f.close()
ct+=1
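# Each <image_id>_cam.txt written above stores the adjusted 3x3 intrinsic
# matrix of the resized frame as nine comma-separated values in row-major
# order. A minimal, illustrative sketch (not part of the original pipeline)
# of reading one back; the helper name is an editor assumption:
def _example_read_cam_file(cam_path):
  """Parses a *_cam.txt file into a 3x3 numpy intrinsics matrix."""
  with open(cam_path, 'r') as f:
    values = [float(v) for v in f.read().strip().split(',')]
  return np.array(values, dtype=np.float32).reshape((3, 3))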
def main(_):
run_all()
if __name__ == '__main__':
app.run(main)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs struct2depth at inference. Produces depth estimates, ego-motion and object motion."""
# Example usage:
#
# python inference.py \
# --input_dir ~/struct2depth/kitti-raw-uncompressed/ \
# --output_dir ~/struct2depth/output \
# --model_ckpt ~/struct2depth/model/model-199160 \
# --file_extension png \
# --depth \
# --egomotion true
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import app
from absl import flags
from absl import logging
#import matplotlib.pyplot as plt
import model
import numpy as np
import fnmatch
from functools import reduce  # reduce is a builtin only on Python 2; used in mask_image_stack.
import tensorflow as tf
import nets
import util
gfile = tf.gfile
# CMAP = 'plasma'
INFERENCE_MODE_SINGLE = 'single' # Take plain single-frame input.
INFERENCE_MODE_TRIPLETS = 'triplets' # Take image triplets as input.
# For KITTI, we just resize input images and do not perform cropping. For
# Cityscapes, the car hood and additional image content have been cropped in
# order to fit the aspect ratio and to remove static content from the images.
# The same cropping has to be applied at inference time.
INFERENCE_CROP_NONE = 'none'
INFERENCE_CROP_CITYSCAPES = 'cityscapes'
flags.DEFINE_string('output_dir', None, 'Directory to store predictions.')
flags.DEFINE_string('file_extension', 'png', 'Image data file extension of '
'files provided with input_dir. Also determines the output '
'file format of depth prediction images.')
flags.DEFINE_bool('depth', True, 'Determines if the depth prediction network '
'should be executed and its predictions be saved.')
flags.DEFINE_bool('egomotion', False, 'Determines if the egomotion prediction '
'network should be executed and its predictions be saved. If '
'inference is run in single inference mode, it is assumed '
'that files in the same directory belong in the same '
'sequence, and sorting them alphabetically establishes the '
'right temporal order.')
flags.DEFINE_string('model_ckpt', None, 'Model checkpoint to evaluate.')
flags.DEFINE_string('input_dir', None, 'Directory containing image files to '
'evaluate. This crawls recursively for images in the '
'directory, mirroring relative subdirectory structures '
'into the output directory.')
flags.DEFINE_string('input_list_file', None, 'Text file containing paths to '
'image files to process. Paths should be relative with '
'respect to the list file location. Relative path '
'structures will be mirrored in the output directory.')
flags.DEFINE_integer('batch_size', 1, 'The size of a sample batch')
flags.DEFINE_integer('img_height', 128, 'Input frame height.')
flags.DEFINE_integer('img_width', 416, 'Input frame width.')
flags.DEFINE_integer('seq_length', 3, 'Number of frames in sequence.')
flags.DEFINE_enum('architecture', nets.RESNET, nets.ARCHITECTURES,
'Defines the architecture to use for the depth prediction '
'network. Defaults to ResNet-based encoder and accompanying '
'decoder.')
flags.DEFINE_boolean('imagenet_norm', True, 'Whether to normalize the input '
'images channel-wise so that they match the distribution '
'most ImageNet-models were trained on.')
flags.DEFINE_bool('use_skip', True, 'Whether to use skip connections in the '
'encoder-decoder architecture.')
flags.DEFINE_bool('joint_encoder', False, 'Whether to share parameters '
'between the depth and egomotion networks by using a joint '
'encoder architecture. The egomotion network is then '
'operating only on the hidden representation provided by the '
'joint encoder.')
flags.DEFINE_bool('shuffle', False, 'Whether to shuffle the order in which '
'images are processed.')
flags.DEFINE_bool('flip', False, 'Whether images should be flipped as well as '
'resulting predictions (for test-time augmentation). This '
'currently applies to the depth network only.')
flags.DEFINE_enum('inference_mode', INFERENCE_MODE_SINGLE,
[INFERENCE_MODE_SINGLE,
INFERENCE_MODE_TRIPLETS],
'Whether to use triplet mode for inference, which accepts '
'triplets instead of single frames.')
flags.DEFINE_enum('inference_crop', INFERENCE_CROP_NONE,
[INFERENCE_CROP_NONE,
INFERENCE_CROP_CITYSCAPES],
'Whether to apply a Cityscapes-specific crop on the input '
'images first before running inference.')
flags.DEFINE_bool('use_masks', False, 'Whether to mask out potentially '
'moving objects when feeding image input to the egomotion '
'network. This might improve odometry results when using '
'a motion model. For this, pre-computed segmentation '
'masks have to be available for every image, with the '
'background being zero.')
FLAGS = flags.FLAGS
flags.mark_flag_as_required('output_dir')
flags.mark_flag_as_required('model_ckpt')
def _run_inference(output_dir=None,
file_extension='png',
depth=True,
egomotion=False,
model_ckpt=None,
input_dir=None,
input_list_file=None,
batch_size=1,
img_height=128,
img_width=416,
seq_length=3,
architecture=nets.RESNET,
imagenet_norm=True,
use_skip=True,
joint_encoder=True,
shuffle=False,
flip_for_depth=False,
inference_mode=INFERENCE_MODE_SINGLE,
inference_crop=INFERENCE_CROP_NONE,
use_masks=False):
"""Runs inference. Refer to flags in inference.py for details."""
inference_model = model.Model(is_training=False,
batch_size=batch_size,
img_height=img_height,
img_width=img_width,
seq_length=seq_length,
architecture=architecture,
imagenet_norm=imagenet_norm,
use_skip=use_skip,
joint_encoder=joint_encoder)
vars_to_restore = util.get_vars_to_save_and_restore(model_ckpt)
saver = tf.train.Saver(vars_to_restore)
sv = tf.train.Supervisor(logdir='/tmp/', saver=None)
with sv.managed_session() as sess:
saver.restore(sess, model_ckpt)
if not gfile.Exists(output_dir):
gfile.MakeDirs(output_dir)
logging.info('Predictions will be saved in %s.', output_dir)
# Collect all images to run inference on.
im_files, basepath_in = collect_input_images(input_dir, input_list_file,
file_extension)
if shuffle:
logging.info('Shuffling data...')
np.random.shuffle(im_files)
logging.info('Running inference on %d files.', len(im_files))
# Create missing output folders and pre-compute target directories.
output_dirs = create_output_dirs(im_files, basepath_in, output_dir)
# Run depth prediction network.
if depth:
im_batch = []
for i in range(len(im_files)):
if i % 100 == 0:
logging.info('%s of %s files processed.', i, len(im_files))
# Read image and run inference.
if inference_mode == INFERENCE_MODE_SINGLE:
if inference_crop == INFERENCE_CROP_NONE:
im = util.load_image(im_files[i], resize=(img_width, img_height))
elif inference_crop == INFERENCE_CROP_CITYSCAPES:
im = util.crop_cityscapes(util.load_image(im_files[i]),
resize=(img_width, img_height))
elif inference_mode == INFERENCE_MODE_TRIPLETS:
im = util.load_image(im_files[i], resize=(img_width * 3, img_height))
im = im[:, img_width:img_width*2]
if flip_for_depth:
im = np.flip(im, axis=1)
im_batch.append(im)
if len(im_batch) == batch_size or i == len(im_files) - 1:
# Call inference on batch.
num_real = len(im_batch)  # The last batch may contain fewer real images.
for _ in range(batch_size - len(im_batch)): # Fill up batch.
im_batch.append(np.zeros(shape=(img_height, img_width, 3),
dtype=np.float32))
im_batch = np.stack(im_batch, axis=0)
est_depth = inference_model.inference_depth(im_batch, sess)
if flip_for_depth:
est_depth = np.flip(est_depth, axis=2)
im_batch = np.flip(im_batch, axis=2)
for j in range(num_real):  # Only save outputs for real (non-padded) images.
color_map = util.normalize_depth_for_display(
np.squeeze(est_depth[j]))
visualization = np.concatenate((im_batch[j], color_map), axis=0)
# Save raw prediction and color visualization. Extract filename
# without extension from full path: e.g. path/to/input_dir/folder1/
# file1.png -> file1
k = i - num_real + 1 + j
filename_root = os.path.splitext(os.path.basename(im_files[k]))[0]
pref = '_flip' if flip_for_depth else ''
output_raw = os.path.join(
output_dirs[k], filename_root + pref + '.npy')
output_vis = os.path.join(
output_dirs[k], filename_root + pref + '.png')
with gfile.Open(output_raw, 'wb') as f:
np.save(f, est_depth[j])
util.save_image(output_vis, visualization, file_extension)
im_batch = []
# Run egomotion network.
if egomotion:
if inference_mode == INFERENCE_MODE_SINGLE:
# Run regular egomotion inference loop.
input_image_seq = []
input_seg_seq = []
current_sequence_dir = None
current_output_handle = None
for i in range(len(im_files)):
sequence_dir = os.path.dirname(im_files[i])
if sequence_dir != current_sequence_dir:
# Assume start of a new sequence, since this image lies in a
# different directory than the previous ones.
# Clear egomotion input buffer.
output_filepath = os.path.join(output_dirs[i], 'egomotion.txt')
if current_output_handle is not None:
current_output_handle.close()
current_sequence_dir = sequence_dir
logging.info('Writing egomotion sequence to %s.', output_filepath)
current_output_handle = gfile.Open(output_filepath, 'w')
input_image_seq = []
input_seg_seq = []
im = util.load_image(im_files[i], resize=(img_width, img_height))
input_image_seq.append(im)
if use_masks:
im_seg_path = im_files[i].replace('.%s' % file_extension,
'-seg.%s' % file_extension)
if not gfile.Exists(im_seg_path):
raise ValueError('No segmentation mask %s has been found for '
'image %s. If none are available, disable '
'use_masks.' % (im_seg_path, im_files[i]))
input_seg_seq.append(util.load_image(im_seg_path,
resize=(img_width, img_height),
interpolation='nn'))
if len(input_image_seq) < seq_length: # Buffer not filled yet.
continue
if len(input_image_seq) > seq_length: # Remove oldest entry.
del input_image_seq[0]
if use_masks:
del input_seg_seq[0]
input_image_stack = np.concatenate(input_image_seq, axis=2)
input_image_stack = np.expand_dims(input_image_stack, axis=0)
if use_masks:
input_image_stack = mask_image_stack(input_image_stack,
input_seg_seq)
est_egomotion = np.squeeze(inference_model.inference_egomotion(
input_image_stack, sess))
egomotion_str = []
for j in range(seq_length - 1):
egomotion_str.append(','.join([str(d) for d in est_egomotion[j]]))
current_output_handle.write(
str(i) + ' ' + ' '.join(egomotion_str) + '\n')
if current_output_handle is not None:
current_output_handle.close()
elif inference_mode == INFERENCE_MODE_TRIPLETS:
written_before = []
for i in range(len(im_files)):
im = util.load_image(im_files[i], resize=(img_width * 3, img_height))
input_image_stack = np.concatenate(
[im[:, :img_width], im[:, img_width:img_width*2],
im[:, img_width*2:]], axis=2)
input_image_stack = np.expand_dims(input_image_stack, axis=0)
if use_masks:
im_seg_path = im_files[i].replace('.%s' % file_extension,
'-seg.%s' % file_extension)
if not gfile.Exists(im_seg_path):
raise ValueError('No segmentation mask %s has been found for '
'image %s. If none are available, disable '
'use_masks.' % (im_seg_path, im_files[i]))
seg = util.load_image(im_seg_path,
resize=(img_width * 3, img_height),
interpolation='nn')
input_seg_seq = [seg[:, :img_width], seg[:, img_width:img_width*2],
seg[:, img_width*2:]]
input_image_stack = mask_image_stack(input_image_stack,
input_seg_seq)
est_egomotion = inference_model.inference_egomotion(
input_image_stack, sess)
est_egomotion = np.squeeze(est_egomotion)
egomotion_1_2 = ','.join([str(d) for d in est_egomotion[0]])
egomotion_2_3 = ','.join([str(d) for d in est_egomotion[1]])
output_filepath = os.path.join(output_dirs[i], 'egomotion.txt')
file_mode = 'w' if output_filepath not in written_before else 'a'
with gfile.Open(output_filepath, file_mode) as current_output_handle:
current_output_handle.write(str(i) + ' ' + egomotion_1_2 + ' ' +
egomotion_2_3 + '\n')
written_before.append(output_filepath)
logging.info('Done.')
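# Each line of the egomotion.txt files written above has the form
# "<frame_index> <tx,ty,tz,rx,ry,rz> ...", with one comma-separated 6-vector
# per frame pair of the sequence. A minimal, illustrative sketch (not part of
# the original pipeline) of parsing such a file; the helper name and return
# format are editor assumptions:
def _example_read_egomotion_file(path):
  """Returns (frame_index, array of shape [seq_length - 1, 6]) tuples."""
  results = []
  with gfile.Open(path, 'r') as f:
    for line in f:
      parts = line.strip().split(' ')
      vectors = np.array(
          [[float(v) for v in p.split(',')] for p in parts[1:]])
      results.append((int(parts[0]), vectors))
  return results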
def mask_image_stack(input_image_stack, input_seg_seq):
"""Masks out moving image contents by using the segmentation masks provided.
This can lead to better odometry accuracy for motion models, but is optional
to use. Is only called if use_masks is enabled.
Args:
input_image_stack: The input image stack of shape (1, H, W, seq_length).
input_seg_seq: List of segmentation masks with seq_length elements of shape
(H, W, C) for some number of channels C.
Returns:
Input image stack with detections provided by segmentation mask removed.
"""
background = [mask == 0 for mask in input_seg_seq]
background = reduce(lambda m1, m2: m1 & m2, background)
# If masks are RGB, assume all channels to be the same. Reduce to the first.
if background.ndim == 3 and background.shape[2] > 1:
background = np.expand_dims(background[:, :, 0], axis=2)
elif background.ndim == 2: # Expand to (H, W, 1).
background = np.expand_dims(background, axis=2)
# background is now of shape (H, W, 1).
background_stack = np.tile(background, [1, 1, input_image_stack.shape[3]])
return np.multiply(input_image_stack, background_stack)
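# A minimal, illustrative sketch (not part of the original pipeline, using
# synthetic data) of what mask_image_stack() does: every pixel that any of the
# per-frame masks labels as a moving object is zeroed out in all frames.
def _example_mask_image_stack():
  """Masks a random 3-frame stack with one fake object box per frame."""
  h, w, seq_length = 128, 416, 3
  image_stack = np.random.rand(1, h, w, seq_length * 3).astype(np.float32)
  seg_seq = []
  for _ in range(seq_length):
    seg = np.zeros((h, w, 3), dtype=np.uint8)  # 0 denotes background.
    seg[40:80, 100:200, :] = 1                 # Fake moving-object instance.
    seg_seq.append(seg)
  masked = mask_image_stack(image_stack, seg_seq)
  return masked.shape  # (1, 128, 416, 9); the box region is zero everywhere.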
def collect_input_images(input_dir, input_list_file, file_extension):
"""Collects all input images that are to be processed."""
if input_dir is not None:
im_files = _recursive_glob(input_dir, '*.' + file_extension)
basepath_in = os.path.normpath(input_dir)
elif input_list_file is not None:
im_files = util.read_text_lines(input_list_file)
basepath_in = os.path.dirname(input_list_file)
im_files = [os.path.join(basepath_in, f) for f in im_files]
im_files = [f for f in im_files if 'disp' not in f and '-seg' not in f and
'-fseg' not in f and '-flip' not in f]
return sorted(im_files), basepath_in
def create_output_dirs(im_files, basepath_in, output_dir):
"""Creates required directories, and returns output dir for each file."""
output_dirs = []
for i in range(len(im_files)):
relative_folder_in = os.path.relpath(
os.path.dirname(im_files[i]), basepath_in)
absolute_folder_out = os.path.join(output_dir, relative_folder_in)
if not gfile.IsDirectory(absolute_folder_out):
gfile.MakeDirs(absolute_folder_out)
output_dirs.append(absolute_folder_out)
return output_dirs
def _recursive_glob(treeroot, pattern):
results = []
for base, _, files in os.walk(treeroot):
files = fnmatch.filter(files, pattern)
results.extend(os.path.join(base, f) for f in files)
return results
def main(_):
if (FLAGS.input_dir is None) == (FLAGS.input_list_file is None):
raise ValueError('Exactly one of either input_dir or input_list_file has '
'to be provided.')
if not FLAGS.depth and not FLAGS.egomotion:
raise ValueError('At least one of the depth and egomotion network has to '
'be called for inference.')
if (FLAGS.inference_mode == INFERENCE_MODE_TRIPLETS and
FLAGS.seq_length != 3):
raise ValueError('For sequence lengths other than three, single inference '
'mode has to be used.')
_run_inference(output_dir=FLAGS.output_dir,
file_extension=FLAGS.file_extension,
depth=FLAGS.depth,
egomotion=FLAGS.egomotion,
model_ckpt=FLAGS.model_ckpt,
input_dir=FLAGS.input_dir,
input_list_file=FLAGS.input_list_file,
batch_size=FLAGS.batch_size,
img_height=FLAGS.img_height,
img_width=FLAGS.img_width,
seq_length=FLAGS.seq_length,
architecture=FLAGS.architecture,
imagenet_norm=FLAGS.imagenet_norm,
use_skip=FLAGS.use_skip,
joint_encoder=FLAGS.joint_encoder,
shuffle=FLAGS.shuffle,
flip_for_depth=FLAGS.flip,
inference_mode=FLAGS.inference_mode,
inference_crop=FLAGS.inference_crop,
use_masks=FLAGS.use_masks)
if __name__ == '__main__':
app.run(main)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Build model for inference or training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import logging
import numpy as np
import tensorflow as tf
import nets
import project
import reader
import util
gfile = tf.gfile
slim = tf.contrib.slim
NUM_SCALES = 4
class Model(object):
"""Model code based on SfMLearner."""
def __init__(self,
data_dir=None,
file_extension='png',
is_training=True,
learning_rate=0.0002,
beta1=0.9,
reconstr_weight=0.85,
smooth_weight=0.05,
ssim_weight=0.15,
icp_weight=0.0,
batch_size=4,
img_height=128,
img_width=416,
seq_length=3,
architecture=nets.RESNET,
imagenet_norm=True,
weight_reg=0.05,
exhaustive_mode=False,
random_scale_crop=False,
flipping_mode=reader.FLIP_RANDOM,
random_color=True,
depth_upsampling=True,
depth_normalization=True,
compute_minimum_loss=True,
use_skip=True,
joint_encoder=True,
build_sum=True,
shuffle=True,
input_file='train',
handle_motion=False,
equal_weighting=False,
size_constraint_weight=0.0,
train_global_scale_var=True):
self.data_dir = data_dir
self.file_extension = file_extension
self.is_training = is_training
self.learning_rate = learning_rate
self.reconstr_weight = reconstr_weight
self.smooth_weight = smooth_weight
self.ssim_weight = ssim_weight
self.icp_weight = icp_weight
self.beta1 = beta1
self.batch_size = batch_size
self.img_height = img_height
self.img_width = img_width
self.seq_length = seq_length
self.architecture = architecture
self.imagenet_norm = imagenet_norm
self.weight_reg = weight_reg
self.exhaustive_mode = exhaustive_mode
self.random_scale_crop = random_scale_crop
self.flipping_mode = flipping_mode
self.random_color = random_color
self.depth_upsampling = depth_upsampling
self.depth_normalization = depth_normalization
self.compute_minimum_loss = compute_minimum_loss
self.use_skip = use_skip
self.joint_encoder = joint_encoder
self.build_sum = build_sum
self.shuffle = shuffle
self.input_file = input_file
self.handle_motion = handle_motion
self.equal_weighting = equal_weighting
self.size_constraint_weight = size_constraint_weight
self.train_global_scale_var = train_global_scale_var
logging.info('data_dir: %s', data_dir)
logging.info('file_extension: %s', file_extension)
logging.info('is_training: %s', is_training)
logging.info('learning_rate: %s', learning_rate)
logging.info('reconstr_weight: %s', reconstr_weight)
logging.info('smooth_weight: %s', smooth_weight)
logging.info('ssim_weight: %s', ssim_weight)
logging.info('icp_weight: %s', icp_weight)
logging.info('size_constraint_weight: %s', size_constraint_weight)
logging.info('beta1: %s', beta1)
logging.info('batch_size: %s', batch_size)
logging.info('img_height: %s', img_height)
logging.info('img_width: %s', img_width)
logging.info('seq_length: %s', seq_length)
logging.info('architecture: %s', architecture)
logging.info('imagenet_norm: %s', imagenet_norm)
logging.info('weight_reg: %s', weight_reg)
logging.info('exhaustive_mode: %s', exhaustive_mode)
logging.info('random_scale_crop: %s', random_scale_crop)
logging.info('flipping_mode: %s', flipping_mode)
logging.info('random_color: %s', random_color)
logging.info('depth_upsampling: %s', depth_upsampling)
logging.info('depth_normalization: %s', depth_normalization)
logging.info('compute_minimum_loss: %s', compute_minimum_loss)
logging.info('use_skip: %s', use_skip)
logging.info('joint_encoder: %s', joint_encoder)
logging.info('build_sum: %s', build_sum)
logging.info('shuffle: %s', shuffle)
logging.info('input_file: %s', input_file)
logging.info('handle_motion: %s', handle_motion)
logging.info('equal_weighting: %s', equal_weighting)
logging.info('train_global_scale_var: %s', train_global_scale_var)
if self.size_constraint_weight > 0 or not is_training:
self.global_scale_var = tf.Variable(
0.1, name='global_scale_var',
trainable=self.is_training and train_global_scale_var,
dtype=tf.float32,
constraint=lambda x: tf.clip_by_value(x, 0, np.infty))
if self.is_training:
self.reader = reader.DataReader(self.data_dir, self.batch_size,
self.img_height, self.img_width,
self.seq_length, NUM_SCALES,
self.file_extension,
self.random_scale_crop,
self.flipping_mode,
self.random_color,
self.imagenet_norm,
self.shuffle,
self.input_file)
self.build_train_graph()
else:
self.build_depth_test_graph()
self.build_egomotion_test_graph()
if self.handle_motion:
self.build_objectmotion_test_graph()
# At this point, the model is ready. Print some info on model params.
util.count_parameters()
def build_train_graph(self):
self.build_inference_for_training()
self.build_loss()
self.build_train_op()
if self.build_sum:
self.build_summaries()
def build_inference_for_training(self):
"""Invokes depth and ego-motion networks and computes clouds if needed."""
(self.image_stack, self.image_stack_norm, self.seg_stack,
self.intrinsic_mat, self.intrinsic_mat_inv) = self.reader.read_data()
with tf.variable_scope('depth_prediction'):
# Organized by ...[i][scale]. Note that the order is flipped in
# variables in build_loss() below.
self.disp = {}
self.depth = {}
self.depth_upsampled = {}
self.inf_loss = 0.0
# Organized by [i].
disp_bottlenecks = [None] * self.seq_length
if self.icp_weight > 0:
self.cloud = {}
for i in range(self.seq_length):
image = self.image_stack_norm[:, :, :, 3 * i:3 * (i + 1)]
multiscale_disps_i, disp_bottlenecks[i] = nets.disp_net(
self.architecture, image, self.use_skip,
self.weight_reg, True)
multiscale_depths_i = [1.0 / d for d in multiscale_disps_i]
self.disp[i] = multiscale_disps_i
self.depth[i] = multiscale_depths_i
if self.depth_upsampling:
self.depth_upsampled[i] = []
# Upsample low-resolution depth maps using differentiable bilinear
# interpolation.
for s in range(len(multiscale_depths_i)):
self.depth_upsampled[i].append(tf.image.resize_bilinear(
multiscale_depths_i[s], [self.img_height, self.img_width],
align_corners=True))
if self.icp_weight > 0:
multiscale_clouds_i = [
project.get_cloud(d,
self.intrinsic_mat_inv[:, s, :, :],
name='cloud%d_%d' % (s, i))
for (s, d) in enumerate(multiscale_depths_i)
]
self.cloud[i] = multiscale_clouds_i
# Reuse the same depth graph for all images.
tf.get_variable_scope().reuse_variables()
if self.handle_motion:
# Define egomotion network. This network can see the whole scene except
# for any moving objects as indicated by the provided segmentation masks.
# To avoid the network getting clues about motion by tracking those masks,
# we define the segmentation mask as the temporal union of all masks.
with tf.variable_scope('egomotion_prediction'):
base_input = self.image_stack_norm # (B, H, W, 9)
seg_input = self.seg_stack # (B, H, W, 9)
ref_zero = tf.constant(0, dtype=tf.uint8)
# Motion model is currently defined for three-frame sequences.
object_mask1 = tf.equal(seg_input[:, :, :, 0], ref_zero)
object_mask2 = tf.equal(seg_input[:, :, :, 3], ref_zero)
object_mask3 = tf.equal(seg_input[:, :, :, 6], ref_zero)
mask_complete = tf.expand_dims(tf.logical_and( # (B, H, W, 1)
tf.logical_and(object_mask1, object_mask2), object_mask3), axis=3)
mask_complete = tf.tile(mask_complete, (1, 1, 1, 9)) # (B, H, W, 9)
# Now mask out base_input.
self.mask_complete = tf.to_float(mask_complete)
self.base_input_masked = base_input * self.mask_complete
self.egomotion = nets.egomotion_net(
image_stack=self.base_input_masked,
disp_bottleneck_stack=None,
joint_encoder=False,
seq_length=self.seq_length,
weight_reg=self.weight_reg)
# Define object motion network for refinement. This network only sees
# one object at a time over the whole sequence, and tries to estimate its
# motion. The sequence of images are the respective warped frames.
# For each scale, contains batch_size elements of shape (N, 2, 6).
self.object_transforms = {}
# For each scale, contains batch_size elements of shape (N, H, W, 9).
self.object_masks = {}
self.object_masks_warped = {}
# For each scale, contains batch_size elements of size N.
self.object_ids = {}
self.egomotions_seq = {}
self.warped_seq = {}
self.inputs_objectmotion_net = {}
with tf.variable_scope('objectmotion_prediction'):
# First, warp raw images according to overall egomotion.
for s in range(NUM_SCALES):
self.warped_seq[s] = []
self.egomotions_seq[s] = []
for source_index in range(self.seq_length):
egomotion_mat_i_1 = project.get_transform_mat(
self.egomotion, source_index, 1)
warped_image_i_1, _ = (
project.inverse_warp(
self.image_stack[
:, :, :, source_index*3:(source_index+1)*3],
self.depth_upsampled[1][s],
egomotion_mat_i_1,
self.intrinsic_mat[:, 0, :, :],
self.intrinsic_mat_inv[:, 0, :, :]))
self.warped_seq[s].append(warped_image_i_1)
self.egomotions_seq[s].append(egomotion_mat_i_1)
# Second, for every object in the segmentation mask, take its mask and
# warp it according to the egomotion estimate. Then put a threshold to
# binarize the warped result. Use this mask to mask out background and
# other objects, and pass the filtered image to the object motion
# network.
self.object_transforms[s] = []
self.object_masks[s] = []
self.object_ids[s] = []
self.object_masks_warped[s] = []
self.inputs_objectmotion_net[s] = {}
for i in range(self.batch_size):
seg_sequence = self.seg_stack[i] # (H, W, 9=3*3)
object_ids = tf.unique(tf.reshape(seg_sequence, [-1]))[0]
self.object_ids[s].append(object_ids)
color_stack = []
mask_stack = []
mask_stack_warped = []
for j in range(self.seq_length):
current_image = self.warped_seq[s][j][i] # (H, W, 3)
current_seg = seg_sequence[:, :, j * 3:(j+1) * 3] # (H, W, 3)
def process_obj_mask_warp(obj_id):
"""Performs warping of the individual object masks."""
obj_mask = tf.to_float(tf.equal(current_seg, obj_id))
# Warp obj_mask according to overall egomotion.
obj_mask_warped, _ = (
project.inverse_warp(
tf.expand_dims(obj_mask, axis=0),
# Middle frame, highest scale, batch element i:
tf.expand_dims(self.depth_upsampled[1][s][i], axis=0),
# Matrix for warping j into middle frame, batch elem. i:
tf.expand_dims(self.egomotions_seq[s][j][i], axis=0),
tf.expand_dims(self.intrinsic_mat[i, 0, :, :], axis=0),
tf.expand_dims(self.intrinsic_mat_inv[i, 0, :, :],
axis=0)))
obj_mask_warped = tf.squeeze(obj_mask_warped)
obj_mask_binarized = tf.greater( # Threshold to binarize mask.
obj_mask_warped, tf.constant(0.5))
return tf.to_float(obj_mask_binarized)
def process_obj_mask(obj_id):
"""Returns the individual object masks separately."""
return tf.to_float(tf.equal(current_seg, obj_id))
object_masks = tf.map_fn( # (N, H, W, 3)
process_obj_mask, object_ids, dtype=tf.float32)
if self.size_constraint_weight > 0:
# The object segmentation masks are all in object_masks.
# We need to measure the height of every of them, and get the
# approximate distance.
# self.depth_upsampled of shape (seq_length, scale, B, H, W).
depth_pred = self.depth_upsampled[j][s][i] # (H, W)
def get_losses(obj_mask):
"""Get motion constraint loss."""
# Find height of segment.
coords = tf.where(tf.greater( # Shape (num_true, 2=yx)
obj_mask[:, :, 0], tf.constant(0.5, dtype=tf.float32)))
y_max = tf.reduce_max(coords[:, 0])
y_min = tf.reduce_min(coords[:, 0])
seg_height = y_max - y_min
f_y = self.intrinsic_mat[i, 0, 1, 1]
approx_depth = ((f_y * self.global_scale_var) /
tf.to_float(seg_height))
reference_pred = tf.boolean_mask(
depth_pred, tf.greater(
tf.reshape(obj_mask[:, :, 0],
(self.img_height, self.img_width, 1)),
tf.constant(0.5, dtype=tf.float32)))
# Establish loss on approx_depth, a scalar, and
# reference_pred, our dense prediction. Normalize both to
# prevent degenerative depth shrinking.
global_mean_depth_pred = tf.reduce_mean(depth_pred)
reference_pred /= global_mean_depth_pred
approx_depth /= global_mean_depth_pred
spatial_err = tf.abs(reference_pred - approx_depth)
mean_spatial_err = tf.reduce_mean(spatial_err)
return mean_spatial_err
losses = tf.map_fn(
get_losses, object_masks, dtype=tf.float32)
self.inf_loss += tf.reduce_mean(losses)
object_masks_warped = tf.map_fn( # (N, H, W, 3)
process_obj_mask_warp, object_ids, dtype=tf.float32)
filtered_images = tf.map_fn(
lambda mask: current_image * mask, object_masks_warped,
dtype=tf.float32) # (N, H, W, 3)
color_stack.append(filtered_images)
mask_stack.append(object_masks)
mask_stack_warped.append(object_masks_warped)
# For this batch-element, if there are N moving objects,
# color_stack, mask_stack and mask_stack_warped contain both
# seq_length elements of shape (N, H, W, 3).
# We can now concatenate them on the last axis, creating a tensor of
# (N, H, W, 3*3 = 9), and, assuming N does not get too large so that
# we have enough memory, pass them in a single batch to the object
# motion network.
mask_stack = tf.concat(mask_stack, axis=3) # (N, H, W, 9)
mask_stack_warped = tf.concat(mask_stack_warped, axis=3)
color_stack = tf.concat(color_stack, axis=3) # (N, H, W, 9)
all_transforms = nets.objectmotion_net(
# We cut the gradient flow here as the object motion gradient
# should have no say in how the egomotion network behaves.
# One could try just stopping the gradient for egomotion, but
# not for the depth prediction network.
image_stack=tf.stop_gradient(color_stack),
disp_bottleneck_stack=None,
joint_encoder=False, # Joint encoder not supported.
seq_length=self.seq_length,
weight_reg=self.weight_reg)
# all_transforms of shape (N, 2, 6).
self.object_transforms[s].append(all_transforms)
self.object_masks[s].append(mask_stack)
self.object_masks_warped[s].append(mask_stack_warped)
self.inputs_objectmotion_net[s][i] = color_stack
tf.get_variable_scope().reuse_variables()
else:
# Don't handle motion, classic model formulation.
with tf.name_scope('egomotion_prediction'):
if self.joint_encoder:
# Re-arrange disp_bottleneck_stack to be of shape
# [B, h_hid, w_hid, c_hid * seq_length]. Currently, it is a list with
# seq_length elements, each of dimension [B, h_hid, w_hid, c_hid].
disp_bottleneck_stack = tf.concat(disp_bottlenecks, axis=3)
else:
disp_bottleneck_stack = None
self.egomotion = nets.egomotion_net(
image_stack=self.image_stack_norm,
disp_bottleneck_stack=disp_bottleneck_stack,
joint_encoder=self.joint_encoder,
seq_length=self.seq_length,
weight_reg=self.weight_reg)
def build_loss(self):
"""Adds ops for computing loss."""
with tf.name_scope('compute_loss'):
self.reconstr_loss = 0
self.smooth_loss = 0
self.ssim_loss = 0
self.icp_transform_loss = 0
self.icp_residual_loss = 0
# self.images is organized by ...[scale][B, h, w, seq_len * 3].
self.images = [None for _ in range(NUM_SCALES)]
# Following nested lists are organized by ...[scale][source-target].
self.warped_image = [{} for _ in range(NUM_SCALES)]
self.warp_mask = [{} for _ in range(NUM_SCALES)]
self.warp_error = [{} for _ in range(NUM_SCALES)]
self.ssim_error = [{} for _ in range(NUM_SCALES)]
self.icp_transform = [{} for _ in range(NUM_SCALES)]
self.icp_residual = [{} for _ in range(NUM_SCALES)]
self.middle_frame_index = util.get_seq_middle(self.seq_length)
# Compute losses at each scale.
for s in range(NUM_SCALES):
# Scale image stack.
if s == 0: # Just as a precaution. TF often has interpolation bugs.
self.images[s] = self.image_stack
else:
height_s = int(self.img_height / (2**s))
width_s = int(self.img_width / (2**s))
self.images[s] = tf.image.resize_bilinear(
self.image_stack, [height_s, width_s], align_corners=True)
# Smoothness.
if self.smooth_weight > 0:
for i in range(self.seq_length):
# When computing minimum loss, use the depth map from the middle
# frame only.
if not self.compute_minimum_loss or i == self.middle_frame_index:
disp_smoothing = self.disp[i][s]
if self.depth_normalization:
# Perform depth normalization, dividing by the mean.
mean_disp = tf.reduce_mean(disp_smoothing, axis=[1, 2, 3],
keep_dims=True)
disp_input = disp_smoothing / mean_disp
else:
disp_input = disp_smoothing
scaling_f = (1.0 if self.equal_weighting else 1.0 / (2**s))
self.smooth_loss += scaling_f * self.depth_smoothness(
disp_input, self.images[s][:, :, :, 3 * i:3 * (i + 1)])
self.debug_all_warped_image_batches = []
for i in range(self.seq_length):
for j in range(self.seq_length):
if i == j:
continue
# When computing minimum loss, only consider the middle frame as
# target.
if self.compute_minimum_loss and j != self.middle_frame_index:
continue
# We only consider adjacent frames, unless either
# compute_minimum_loss is on (where the middle frame is matched with
# all other frames) or exhaustive_mode is on (where all frames are
# matched with each other).
if (not self.compute_minimum_loss and not self.exhaustive_mode and
abs(i - j) != 1):
continue
selected_scale = 0 if self.depth_upsampling else s
source = self.images[selected_scale][:, :, :, 3 * i:3 * (i + 1)]
target = self.images[selected_scale][:, :, :, 3 * j:3 * (j + 1)]
if self.depth_upsampling:
target_depth = self.depth_upsampled[j][s]
else:
target_depth = self.depth[j][s]
key = '%d-%d' % (i, j)
if self.handle_motion:
# self.seg_stack of shape (B, H, W, 9).
# target_depth corresponds to middle frame, of shape (B, H, W, 1).
# Now incorporate the other warping results, performed according
# to the object motion network's predictions.
# self.object_masks batch_size elements of (N, H, W, 9).
# self.object_masks_warped batch_size elements of (N, H, W, 9).
# self.object_transforms batch_size elements of (N, 2, 6).
self.all_batches = []
for batch_s in range(self.batch_size):
# To warp i into j, first take the base warping (this is the
# full image i warped into j using only the egomotion estimate).
base_warping = self.warped_seq[s][i][batch_s]
transform_matrices_thisbatch = tf.map_fn(
lambda transform: project.get_transform_mat(
tf.expand_dims(transform, axis=0), i, j)[0],
self.object_transforms[0][batch_s])
def inverse_warp_wrapper(matrix):
"""Wrapper for inverse warping method."""
warp_image, _ = (
project.inverse_warp(
tf.expand_dims(base_warping, axis=0),
tf.expand_dims(target_depth[batch_s], axis=0),
tf.expand_dims(matrix, axis=0),
tf.expand_dims(self.intrinsic_mat[
batch_s, selected_scale, :, :], axis=0),
tf.expand_dims(self.intrinsic_mat_inv[
batch_s, selected_scale, :, :], axis=0)))
return warp_image
warped_images_thisbatch = tf.map_fn(
inverse_warp_wrapper, transform_matrices_thisbatch,
dtype=tf.float32)
warped_images_thisbatch = warped_images_thisbatch[:, 0, :, :, :]
# warped_images_thisbatch is now of shape (N, H, W, 9).
# Combine warped frames into a single one, using the object
# masks. Result should be (1, 128, 416, 3).
# Essentially, we here want to sum them all up, filtered by the
# respective object masks.
mask_base_valid_source = tf.equal(
self.seg_stack[batch_s, :, :, i*3:(i+1)*3],
tf.constant(0, dtype=tf.uint8))
mask_base_valid_target = tf.equal(
self.seg_stack[batch_s, :, :, j*3:(j+1)*3],
tf.constant(0, dtype=tf.uint8))
mask_valid = tf.logical_and(
mask_base_valid_source, mask_base_valid_target)
self.base_warping = base_warping * tf.to_float(mask_valid)
background = tf.expand_dims(self.base_warping, axis=0)
def construct_const_filter_tensor(obj_id):
return tf.fill(
dims=[self.img_height, self.img_width, 3],
value=tf.sign(obj_id)) * tf.to_float(
tf.equal(self.seg_stack[batch_s, :, :, 3:6],
tf.cast(obj_id, dtype=tf.uint8)))
filter_tensor = tf.map_fn(
construct_const_filter_tensor,
tf.to_float(self.object_ids[s][batch_s]))
filter_tensor = tf.stack(filter_tensor, axis=0)
objects_to_add = tf.reduce_sum(
tf.multiply(warped_images_thisbatch, filter_tensor),
axis=0, keepdims=True)
combined = background + objects_to_add
self.all_batches.append(combined)
# Now of shape (B, 128, 416, 3).
self.warped_image[s][key] = tf.concat(self.all_batches, axis=0)
else:
# Don't handle motion, classic model formulation.
egomotion_mat_i_j = project.get_transform_mat(
self.egomotion, i, j)
# Inverse warp the source image to the target image frame for
# photometric consistency loss.
self.warped_image[s][key], self.warp_mask[s][key] = (
project.inverse_warp(
source,
target_depth,
egomotion_mat_i_j,
self.intrinsic_mat[:, selected_scale, :, :],
self.intrinsic_mat_inv[:, selected_scale, :, :]))
# Reconstruction loss.
self.warp_error[s][key] = tf.abs(self.warped_image[s][key] - target)
if not self.compute_minimum_loss:
self.reconstr_loss += tf.reduce_mean(
self.warp_error[s][key] * self.warp_mask[s][key])
# SSIM.
if self.ssim_weight > 0:
self.ssim_error[s][key] = self.ssim(self.warped_image[s][key],
target)
# TODO(rezama): This should be min_pool2d().
if not self.compute_minimum_loss:
ssim_mask = slim.avg_pool2d(self.warp_mask[s][key], 3, 1,
'VALID')
self.ssim_loss += tf.reduce_mean(
self.ssim_error[s][key] * ssim_mask)
# If the minimum loss should be computed, the loss calculation has been
# postponed until here.
if self.compute_minimum_loss:
for frame_index in range(self.middle_frame_index):
key1 = '%d-%d' % (frame_index, self.middle_frame_index)
key2 = '%d-%d' % (self.seq_length - frame_index - 1,
self.middle_frame_index)
logging.info('computing min error between %s and %s', key1, key2)
min_error = tf.minimum(self.warp_error[s][key1],
self.warp_error[s][key2])
self.reconstr_loss += tf.reduce_mean(min_error)
if self.ssim_weight > 0: # Also compute the minimum SSIM loss.
min_error_ssim = tf.minimum(self.ssim_error[s][key1],
self.ssim_error[s][key2])
self.ssim_loss += tf.reduce_mean(min_error_ssim)
# Build the total loss as composed of L1 reconstruction, SSIM, smoothing
# and object size constraint loss as appropriate.
self.reconstr_loss *= self.reconstr_weight
self.total_loss = self.reconstr_loss
if self.smooth_weight > 0:
self.smooth_loss *= self.smooth_weight
self.total_loss += self.smooth_loss
if self.ssim_weight > 0:
self.ssim_loss *= self.ssim_weight
self.total_loss += self.ssim_loss
if self.size_constraint_weight > 0:
self.inf_loss *= self.size_constraint_weight
self.total_loss += self.inf_loss
def gradient_x(self, img):
return img[:, :, :-1, :] - img[:, :, 1:, :]
def gradient_y(self, img):
return img[:, :-1, :, :] - img[:, 1:, :, :]
def depth_smoothness(self, depth, img):
"""Computes image-aware depth smoothness loss."""
depth_dx = self.gradient_x(depth)
depth_dy = self.gradient_y(depth)
image_dx = self.gradient_x(img)
image_dy = self.gradient_y(img)
weights_x = tf.exp(-tf.reduce_mean(tf.abs(image_dx), 3, keepdims=True))
weights_y = tf.exp(-tf.reduce_mean(tf.abs(image_dy), 3, keepdims=True))
smoothness_x = depth_dx * weights_x
smoothness_y = depth_dy * weights_y
return tf.reduce_mean(abs(smoothness_x)) + tf.reduce_mean(abs(smoothness_y))
def ssim(self, x, y):
"""Computes a differentiable structured image similarity measure."""
c1 = 0.01**2 # As defined in SSIM to stabilize div. by small denominator.
c2 = 0.03**2
mu_x = slim.avg_pool2d(x, 3, 1, 'VALID')
mu_y = slim.avg_pool2d(y, 3, 1, 'VALID')
sigma_x = slim.avg_pool2d(x**2, 3, 1, 'VALID') - mu_x**2
sigma_y = slim.avg_pool2d(y**2, 3, 1, 'VALID') - mu_y**2
sigma_xy = slim.avg_pool2d(x * y, 3, 1, 'VALID') - mu_x * mu_y
ssim_n = (2 * mu_x * mu_y + c1) * (2 * sigma_xy + c2)
ssim_d = (mu_x**2 + mu_y**2 + c1) * (sigma_x + sigma_y + c2)
ssim = ssim_n / ssim_d
return tf.clip_by_value((1 - ssim) / 2, 0, 1)
def build_train_op(self):
with tf.name_scope('train_op'):
optim = tf.train.AdamOptimizer(self.learning_rate, self.beta1)
self.train_op = slim.learning.create_train_op(self.total_loss, optim)
self.global_step = tf.Variable(0, name='global_step', trainable=False)
self.incr_global_step = tf.assign(
self.global_step, self.global_step + 1)
def build_summaries(self):
"""Adds scalar and image summaries for TensorBoard."""
tf.summary.scalar('total_loss', self.total_loss)
tf.summary.scalar('reconstr_loss', self.reconstr_loss)
if self.smooth_weight > 0:
tf.summary.scalar('smooth_loss', self.smooth_loss)
if self.ssim_weight > 0:
tf.summary.scalar('ssim_loss', self.ssim_loss)
if self.icp_weight > 0:
tf.summary.scalar('icp_transform_loss', self.icp_transform_loss)
tf.summary.scalar('icp_residual_loss', self.icp_residual_loss)
if self.size_constraint_weight > 0:
tf.summary.scalar('inf_loss', self.inf_loss)
tf.summary.histogram('global_scale_var', self.global_scale_var)
if self.handle_motion:
for s in range(NUM_SCALES):
for batch_s in range(self.batch_size):
whole_strip = tf.concat([self.warped_seq[s][0][batch_s],
self.warped_seq[s][1][batch_s],
self.warped_seq[s][2][batch_s]], axis=1)
tf.summary.image('base_warp_batch%s_scale%s' % (batch_s, s),
tf.expand_dims(whole_strip, axis=0))
whole_strip_input = tf.concat(
[self.inputs_objectmotion_net[s][batch_s][:, :, :, 0:3],
self.inputs_objectmotion_net[s][batch_s][:, :, :, 3:6],
self.inputs_objectmotion_net[s][batch_s][:, :, :, 6:9]], axis=2)
tf.summary.image('input_objectmotion_batch%s_scale%s' % (batch_s, s),
whole_strip_input) # (B, H, 3*W, 3)
for batch_s in range(self.batch_size):
whole_strip = tf.concat([self.base_input_masked[batch_s, :, :, 0:3],
self.base_input_masked[batch_s, :, :, 3:6],
self.base_input_masked[batch_s, :, :, 6:9]],
axis=1)
tf.summary.image('input_egomotion_batch%s' % batch_s,
tf.expand_dims(whole_strip, axis=0))
# Show transform predictions (of all objects).
for batch_s in range(self.batch_size):
for i in range(self.seq_length - 1):
# self.object_transforms contains batch_size elements of (N, 2, 6).
tf.summary.histogram('batch%d_tx%d' % (batch_s, i),
self.object_transforms[0][batch_s][:, i, 0])
tf.summary.histogram('batch%d_ty%d' % (batch_s, i),
self.object_transforms[0][batch_s][:, i, 1])
tf.summary.histogram('batch%d_tz%d' % (batch_s, i),
self.object_transforms[0][batch_s][:, i, 2])
tf.summary.histogram('batch%d_rx%d' % (batch_s, i),
self.object_transforms[0][batch_s][:, i, 3])
tf.summary.histogram('batch%d_ry%d' % (batch_s, i),
self.object_transforms[0][batch_s][:, i, 4])
tf.summary.histogram('batch%d_rz%d' % (batch_s, i),
self.object_transforms[0][batch_s][:, i, 5])
for i in range(self.seq_length - 1):
tf.summary.histogram('tx%d' % i, self.egomotion[:, i, 0])
tf.summary.histogram('ty%d' % i, self.egomotion[:, i, 1])
tf.summary.histogram('tz%d' % i, self.egomotion[:, i, 2])
tf.summary.histogram('rx%d' % i, self.egomotion[:, i, 3])
tf.summary.histogram('ry%d' % i, self.egomotion[:, i, 4])
tf.summary.histogram('rz%d' % i, self.egomotion[:, i, 5])
for s in range(NUM_SCALES):
for i in range(self.seq_length):
tf.summary.image('scale%d_image%d' % (s, i),
self.images[s][:, :, :, 3 * i:3 * (i + 1)])
if i in self.depth:
tf.summary.histogram('scale%d_depth%d' % (s, i), self.depth[i][s])
tf.summary.histogram('scale%d_disp%d' % (s, i), self.disp[i][s])
tf.summary.image('scale%d_disparity%d' % (s, i), self.disp[i][s])
for key in self.warped_image[s]:
tf.summary.image('scale%d_warped_image%s' % (s, key),
self.warped_image[s][key])
tf.summary.image('scale%d_warp_error%s' % (s, key),
self.warp_error[s][key])
if self.ssim_weight > 0:
tf.summary.image('scale%d_ssim_error%s' % (s, key),
self.ssim_error[s][key])
if self.icp_weight > 0:
tf.summary.image('scale%d_icp_residual%s' % (s, key),
self.icp_residual[s][key])
transform = self.icp_transform[s][key]
tf.summary.histogram('scale%d_icp_tx%s' % (s, key), transform[:, 0])
tf.summary.histogram('scale%d_icp_ty%s' % (s, key), transform[:, 1])
tf.summary.histogram('scale%d_icp_tz%s' % (s, key), transform[:, 2])
tf.summary.histogram('scale%d_icp_rx%s' % (s, key), transform[:, 3])
tf.summary.histogram('scale%d_icp_ry%s' % (s, key), transform[:, 4])
tf.summary.histogram('scale%d_icp_rz%s' % (s, key), transform[:, 5])
def build_depth_test_graph(self):
"""Builds depth model reading from placeholders."""
with tf.variable_scope('depth_prediction'):
input_image = tf.placeholder(
tf.float32, [self.batch_size, self.img_height, self.img_width, 3],
name='raw_input')
if self.imagenet_norm:
input_image = (input_image - reader.IMAGENET_MEAN) / reader.IMAGENET_SD
est_disp, _ = nets.disp_net(architecture=self.architecture,
image=input_image,
use_skip=self.use_skip,
weight_reg=self.weight_reg,
is_training=True)
est_depth = 1.0 / est_disp[0]
self.input_image = input_image
self.est_depth = est_depth
def build_egomotion_test_graph(self):
"""Builds egomotion model reading from placeholders."""
input_image_stack = tf.placeholder(
tf.float32,
[1, self.img_height, self.img_width, self.seq_length * 3],
name='raw_input')
input_bottleneck_stack = None
if self.imagenet_norm:
im_mean = tf.tile(
tf.constant(reader.IMAGENET_MEAN), multiples=[self.seq_length])
im_sd = tf.tile(
tf.constant(reader.IMAGENET_SD), multiples=[self.seq_length])
input_image_stack = (input_image_stack - im_mean) / im_sd
if self.joint_encoder:
# Pre-compute embeddings here.
with tf.variable_scope('depth_prediction', reuse=True):
input_bottleneck_stack = []
encoder_selected = nets.encoder(self.architecture)
for i in range(self.seq_length):
input_image = input_image_stack[:, :, :, i * 3:(i + 1) * 3]
tf.get_variable_scope().reuse_variables()
embedding, _ = encoder_selected(
target_image=input_image,
weight_reg=self.weight_reg,
is_training=True)
input_bottleneck_stack.append(embedding)
input_bottleneck_stack = tf.concat(input_bottleneck_stack, axis=3)
with tf.variable_scope('egomotion_prediction'):
est_egomotion = nets.egomotion_net(
image_stack=input_image_stack,
disp_bottleneck_stack=input_bottleneck_stack,
joint_encoder=self.joint_encoder,
seq_length=self.seq_length,
weight_reg=self.weight_reg)
self.input_image_stack = input_image_stack
self.est_egomotion = est_egomotion
def build_objectmotion_test_graph(self):
"""Builds egomotion model reading from placeholders."""
input_image_stack_om = tf.placeholder(
tf.float32,
[1, self.img_height, self.img_width, self.seq_length * 3],
name='raw_input')
if self.imagenet_norm:
im_mean = tf.tile(
tf.constant(reader.IMAGENET_MEAN), multiples=[self.seq_length])
im_sd = tf.tile(
tf.constant(reader.IMAGENET_SD), multiples=[self.seq_length])
input_image_stack_om = (input_image_stack_om - im_mean) / im_sd
with tf.variable_scope('objectmotion_prediction'):
est_objectmotion = nets.objectmotion_net(
image_stack=input_image_stack_om,
disp_bottleneck_stack=None,
joint_encoder=self.joint_encoder,
seq_length=self.seq_length,
weight_reg=self.weight_reg)
self.input_image_stack_om = input_image_stack_om
self.est_objectmotion = est_objectmotion
def inference_depth(self, inputs, sess):
return sess.run(self.est_depth, feed_dict={self.input_image: inputs})
def inference_egomotion(self, inputs, sess):
return sess.run(
self.est_egomotion, feed_dict={self.input_image_stack: inputs})
def inference_objectmotion(self, inputs, sess):
return sess.run(
self.est_objectmotion, feed_dict={self.input_image_stack_om: inputs})
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Depth and Ego-Motion networks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
slim = tf.contrib.slim
SIMPLE = 'simple'
RESNET = 'resnet'
ARCHITECTURES = [SIMPLE, RESNET]
SCALE_TRANSLATION = 0.001
SCALE_ROTATION = 0.01
# Disparity (inverse depth) values range from 0.01 to 10. Note that effectively,
# this is undone if depth normalization is used, which scales the values to
# have a mean of 1.
DISP_SCALING = 10
MIN_DISP = 0.01
WEIGHT_DECAY_KEY = 'WEIGHT_DECAY'
EGOMOTION_VEC_SIZE = 6
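# Given the constants above, the disparity heads below (a sigmoid scaled by
# DISP_SCALING and shifted by MIN_DISP) can only output values in
# (MIN_DISP, DISP_SCALING + MIN_DISP), which in turn bounds the predicted
# depth 1 / disparity. A minimal, illustrative sketch (not part of the
# original code) of that bookkeeping:
def _example_disparity_range():
  """Returns ((min_disp, max_disp), (min_depth, max_depth))."""
  min_disp = MIN_DISP                  # sigmoid output approaches 0.
  max_disp = DISP_SCALING + MIN_DISP   # sigmoid output approaches 1.
  return (min_disp, max_disp), (1.0 / max_disp, 1.0 / min_disp)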
def egomotion_net(image_stack, disp_bottleneck_stack, joint_encoder, seq_length,
weight_reg):
"""Predict ego-motion vectors from a stack of frames or embeddings.
Args:
image_stack: Input tensor with shape [B, h, w, seq_length * 3] in order.
disp_bottleneck_stack: Input tensor with shape [B, h_hidden, w_hidden,
seq_length * c_hidden] in order.
joint_encoder: Determines if the same encoder is used for computing the
bottleneck layer of both the egomotion and the depth prediction
network. If enabled, disp_bottleneck_stack is used as input, and the
encoding steps are skipped. If disabled, a separate encoder is defined
on image_stack.
seq_length: The sequence length used.
weight_reg: The amount of weight regularization.
Returns:
Egomotion vectors with shape [B, seq_length - 1, 6].
"""
num_egomotion_vecs = seq_length - 1
with tf.variable_scope('pose_exp_net') as sc:
end_points_collection = sc.original_name_scope + '_end_points'
with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
normalizer_fn=None,
weights_regularizer=slim.l2_regularizer(weight_reg),
normalizer_params=None,
activation_fn=tf.nn.relu,
outputs_collections=end_points_collection):
if not joint_encoder:
# Define separate encoder. If sharing, we can skip the encoding step,
# as the bottleneck layer will already be passed as input.
cnv1 = slim.conv2d(image_stack, 16, [7, 7], stride=2, scope='cnv1')
cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')
with tf.variable_scope('pose'):
inputs = disp_bottleneck_stack if joint_encoder else cnv5
cnv6 = slim.conv2d(inputs, 256, [3, 3], stride=2, scope='cnv6')
cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
pred_channels = EGOMOTION_VEC_SIZE * num_egomotion_vecs
egomotion_pred = slim.conv2d(cnv7, pred_channels, [1, 1], scope='pred',
stride=1, normalizer_fn=None,
activation_fn=None)
egomotion_avg = tf.reduce_mean(egomotion_pred, [1, 2])
egomotion_res = tf.reshape(
egomotion_avg, [-1, num_egomotion_vecs, EGOMOTION_VEC_SIZE])
# Tinghui found that scaling by a small constant facilitates training.
# Scale the translation (first three) and rotation (last three) components
# of each 6-vector separately.
egomotion_scaled = tf.concat([egomotion_res[:, :, 0:3] * SCALE_TRANSLATION,
egomotion_res[:, :, 3:6] * SCALE_ROTATION],
axis=2)
return egomotion_scaled
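# A minimal, illustrative sketch (not part of the original code; assumes TF1
# graph mode) of the shapes egomotion_net() expects and produces for a
# three-frame sequence: a [B, H, W, 9] image stack in, a [B, 2, 6] tensor of
# egomotion vectors (translation components first, then rotation) out.
def _example_egomotion_net_shapes():
  """Builds egomotion_net on a dummy stack and returns its static shape."""
  graph = tf.Graph()
  with graph.as_default():
    dummy_stack = tf.zeros([1, 128, 416, 3 * 3], dtype=tf.float32)
    with tf.variable_scope('example_egomotion'):
      egomotion = egomotion_net(image_stack=dummy_stack,
                                disp_bottleneck_stack=None,
                                joint_encoder=False,
                                seq_length=3,
                                weight_reg=0.05)
  return egomotion.get_shape().as_list()  # [1, 2, 6]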
def objectmotion_net(image_stack, disp_bottleneck_stack, joint_encoder,
seq_length, weight_reg):
"""Predict object-motion vectors from a stack of frames or embeddings.
Args:
image_stack: Input tensor with shape [B, h, w, seq_length * 3] in order.
disp_bottleneck_stack: Input tensor with shape [B, h_hidden, w_hidden,
seq_length * c_hidden] in order.
joint_encoder: Determines if the same encoder is used for computing the
bottleneck layer of both the egomotion and the depth prediction
network. If enabled, disp_bottleneck_stack is used as input, and the
encoding steps are skipped. If disabled, a separate encoder is defined
on image_stack.
seq_length: The sequence length used.
weight_reg: The amount of weight regularization.
Returns:
Egomotion vectors with shape [B, seq_length - 1, 6].
"""
num_egomotion_vecs = seq_length - 1
with tf.variable_scope('pose_exp_net') as sc:
end_points_collection = sc.original_name_scope + '_end_points'
with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
normalizer_fn=None,
weights_regularizer=slim.l2_regularizer(weight_reg),
normalizer_params=None,
activation_fn=tf.nn.relu,
outputs_collections=end_points_collection):
if not joint_encoder:
# Define separate encoder. If sharing, we can skip the encoding step,
# as the bottleneck layer will already be passed as input.
cnv1 = slim.conv2d(image_stack, 16, [7, 7], stride=2, scope='cnv1')
cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')
with tf.variable_scope('pose'):
inputs = disp_bottleneck_stack if joint_encoder else cnv5
cnv6 = slim.conv2d(inputs, 256, [3, 3], stride=2, scope='cnv6')
cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
pred_channels = EGOMOTION_VEC_SIZE * num_egomotion_vecs
egomotion_pred = slim.conv2d(cnv7, pred_channels, [1, 1], scope='pred',
stride=1, normalizer_fn=None,
activation_fn=None)
egomotion_avg = tf.reduce_mean(egomotion_pred, [1, 2])
egomotion_res = tf.reshape(
egomotion_avg, [-1, num_egomotion_vecs, EGOMOTION_VEC_SIZE])
# Tinghui found that scaling by a small constant facilitates training.
# Scale the translation (first three) and rotation (last three) components
# of each 6-vector separately.
egomotion_scaled = tf.concat([egomotion_res[:, :, 0:3] * SCALE_TRANSLATION,
egomotion_res[:, :, 3:6] * SCALE_ROTATION],
axis=2)
return egomotion_scaled
def disp_net(architecture, image, use_skip, weight_reg, is_training):
"""Defines an encoder-decoder architecture for depth prediction."""
if architecture not in ARCHITECTURES:
raise ValueError('Unknown architecture.')
encoder_selected = encoder(architecture)
decoder_selected = decoder(architecture)
# Encode image.
bottleneck, skip_connections = encoder_selected(image, weight_reg,
is_training)
# Decode to depth.
multiscale_disps_i = decoder_selected(target_image=image,
bottleneck=bottleneck,
weight_reg=weight_reg,
use_skip=use_skip,
skip_connections=skip_connections)
return multiscale_disps_i, bottleneck
def encoder(architecture):
return encoder_resnet if architecture == RESNET else encoder_simple
def decoder(architecture):
return decoder_resnet if architecture == RESNET else decoder_simple
def encoder_simple(target_image, weight_reg, is_training):
"""Defines the old encoding architecture."""
del is_training
with slim.arg_scope([slim.conv2d],
normalizer_fn=None,
normalizer_params=None,
weights_regularizer=slim.l2_regularizer(weight_reg),
activation_fn=tf.nn.relu):
# Define (joint) encoder.
cnv1 = slim.conv2d(target_image, 32, [7, 7], stride=2, scope='cnv1')
cnv1b = slim.conv2d(cnv1, 32, [7, 7], stride=1, scope='cnv1b')
cnv2 = slim.conv2d(cnv1b, 64, [5, 5], stride=2, scope='cnv2')
cnv2b = slim.conv2d(cnv2, 64, [5, 5], stride=1, scope='cnv2b')
cnv3 = slim.conv2d(cnv2b, 128, [3, 3], stride=2, scope='cnv3')
cnv3b = slim.conv2d(cnv3, 128, [3, 3], stride=1, scope='cnv3b')
cnv4 = slim.conv2d(cnv3b, 256, [3, 3], stride=2, scope='cnv4')
cnv4b = slim.conv2d(cnv4, 256, [3, 3], stride=1, scope='cnv4b')
cnv5 = slim.conv2d(cnv4b, 512, [3, 3], stride=2, scope='cnv5')
cnv5b = slim.conv2d(cnv5, 512, [3, 3], stride=1, scope='cnv5b')
cnv6 = slim.conv2d(cnv5b, 512, [3, 3], stride=2, scope='cnv6')
cnv6b = slim.conv2d(cnv6, 512, [3, 3], stride=1, scope='cnv6b')
cnv7 = slim.conv2d(cnv6b, 512, [3, 3], stride=2, scope='cnv7')
cnv7b = slim.conv2d(cnv7, 512, [3, 3], stride=1, scope='cnv7b')
return cnv7b, (cnv6b, cnv5b, cnv4b, cnv3b, cnv2b, cnv1b)
def decoder_simple(target_image, bottleneck, weight_reg, use_skip,
skip_connections):
"""Defines the old depth decoder architecture."""
h = target_image.get_shape()[1].value
w = target_image.get_shape()[2].value
(cnv6b, cnv5b, cnv4b, cnv3b, cnv2b, cnv1b) = skip_connections
with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
normalizer_fn=None,
normalizer_params=None,
weights_regularizer=slim.l2_regularizer(weight_reg),
activation_fn=tf.nn.relu):
up7 = slim.conv2d_transpose(bottleneck, 512, [3, 3], stride=2,
scope='upcnv7')
up7 = _resize_like(up7, cnv6b)
if use_skip:
i7_in = tf.concat([up7, cnv6b], axis=3)
else:
i7_in = up7
icnv7 = slim.conv2d(i7_in, 512, [3, 3], stride=1, scope='icnv7')
up6 = slim.conv2d_transpose(icnv7, 512, [3, 3], stride=2, scope='upcnv6')
up6 = _resize_like(up6, cnv5b)
if use_skip:
i6_in = tf.concat([up6, cnv5b], axis=3)
else:
i6_in = up6
icnv6 = slim.conv2d(i6_in, 512, [3, 3], stride=1, scope='icnv6')
up5 = slim.conv2d_transpose(icnv6, 256, [3, 3], stride=2, scope='upcnv5')
up5 = _resize_like(up5, cnv4b)
if use_skip:
i5_in = tf.concat([up5, cnv4b], axis=3)
else:
i5_in = up5
icnv5 = slim.conv2d(i5_in, 256, [3, 3], stride=1, scope='icnv5')
up4 = slim.conv2d_transpose(icnv5, 128, [3, 3], stride=2, scope='upcnv4')
up4 = _resize_like(up4, cnv3b)
if use_skip:
i4_in = tf.concat([up4, cnv3b], axis=3)
else:
i4_in = up4
icnv4 = slim.conv2d(i4_in, 128, [3, 3], stride=1, scope='icnv4')
disp4 = (slim.conv2d(icnv4, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
normalizer_fn=None, scope='disp4')
* DISP_SCALING + MIN_DISP)
disp4_up = tf.image.resize_bilinear(disp4, [int(h / 4), int(w / 4)],
align_corners=True)
up3 = slim.conv2d_transpose(icnv4, 64, [3, 3], stride=2, scope='upcnv3')
up3 = _resize_like(up3, cnv2b)
if use_skip:
i3_in = tf.concat([up3, cnv2b, disp4_up], axis=3)
else:
i3_in = tf.concat([up3, disp4_up], axis=3)
icnv3 = slim.conv2d(i3_in, 64, [3, 3], stride=1, scope='icnv3')
disp3 = (slim.conv2d(icnv3, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
normalizer_fn=None, scope='disp3')
* DISP_SCALING + MIN_DISP)
disp3_up = tf.image.resize_bilinear(disp3, [int(h / 2), int(w / 2)],
align_corners=True)
up2 = slim.conv2d_transpose(icnv3, 32, [3, 3], stride=2, scope='upcnv2')
up2 = _resize_like(up2, cnv1b)
if use_skip:
i2_in = tf.concat([up2, cnv1b, disp3_up], axis=3)
else:
i2_in = tf.concat([up2, disp3_up], axis=3)
icnv2 = slim.conv2d(i2_in, 32, [3, 3], stride=1, scope='icnv2')
disp2 = (slim.conv2d(icnv2, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
normalizer_fn=None, scope='disp2')
* DISP_SCALING + MIN_DISP)
disp2_up = tf.image.resize_bilinear(disp2, [h, w], align_corners=True)
up1 = slim.conv2d_transpose(icnv2, 16, [3, 3], stride=2, scope='upcnv1')
i1_in = tf.concat([up1, disp2_up], axis=3)
icnv1 = slim.conv2d(i1_in, 16, [3, 3], stride=1, scope='icnv1')
disp1 = (slim.conv2d(icnv1, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
normalizer_fn=None, scope='disp1')
* DISP_SCALING + MIN_DISP)
return [disp1, disp2, disp3, disp4]
def encoder_resnet(target_image, weight_reg, is_training):
"""Defines a ResNet18-based encoding architecture.
This implementation follows Juyong Kim's ResNet-18 implementation on GitHub:
https://github.com/dalgu90/resnet-18-tensorflow
Args:
target_image: Input tensor with shape [B, h, w, 3] to encode.
weight_reg: Parameter ignored.
is_training: Whether the model is being trained or not.
Returns:
Tuple of tensors, with the first being the bottleneck layer as tensor of
size [B, h_hid, w_hid, c_hid], and others being intermediate layers
for building skip-connections.
"""
del weight_reg
encoder_filters = [64, 64, 128, 256, 512]
stride = 2
# conv1
with tf.variable_scope('conv1'):
x = _conv(target_image, 7, encoder_filters[0], stride)
x = _bn(x, is_train=is_training)
econv1 = _relu(x)
x = tf.nn.max_pool(econv1, [1, 3, 3, 1], [1, 2, 2, 1], 'SAME')
# conv2_x
x = _residual_block(x, is_training, name='conv2_1')
econv2 = _residual_block(x, is_training, name='conv2_2')
# conv3_x
x = _residual_block_first(econv2, is_training, encoder_filters[2], stride,
name='conv3_1')
econv3 = _residual_block(x, is_training, name='conv3_2')
# conv4_x
x = _residual_block_first(econv3, is_training, encoder_filters[3], stride,
name='conv4_1')
econv4 = _residual_block(x, is_training, name='conv4_2')
# conv5_x
x = _residual_block_first(econv4, is_training, encoder_filters[4], stride,
name='conv5_1')
econv5 = _residual_block(x, is_training, name='conv5_2')
return econv5, (econv4, econv3, econv2, econv1)
def decoder_resnet(target_image, bottleneck, weight_reg, use_skip,
skip_connections):
"""Defines the depth decoder architecture.
Args:
target_image: The original encoder input tensor with shape [B, h, w, 3].
Just the shape information is used here.
bottleneck: Bottleneck layer to be decoded.
weight_reg: The amount of weight regularization.
use_skip: Whether the passed skip connections econv1, econv2, econv3 and
econv4 should be used.
skip_connections: Tensors for building skip-connections.
Returns:
Disparities at 4 different scales.
"""
(econv4, econv3, econv2, econv1) = skip_connections
decoder_filters = [16, 32, 64, 128, 256]
default_pad = tf.constant([[0, 0], [1, 1], [1, 1], [0, 0]])
reg = slim.l2_regularizer(weight_reg) if weight_reg > 0.0 else None
with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
normalizer_fn=None,
normalizer_params=None,
activation_fn=tf.nn.relu,
weights_regularizer=reg):
upconv5 = slim.conv2d_transpose(bottleneck, decoder_filters[4], [3, 3],
stride=2, scope='upconv5')
upconv5 = _resize_like(upconv5, econv4)
if use_skip:
i5_in = tf.concat([upconv5, econv4], axis=3)
else:
i5_in = upconv5
i5_in = tf.pad(i5_in, default_pad, mode='REFLECT')
iconv5 = slim.conv2d(i5_in, decoder_filters[4], [3, 3], stride=1,
scope='iconv5', padding='VALID')
upconv4 = slim.conv2d_transpose(iconv5, decoder_filters[3], [3, 3],
stride=2, scope='upconv4')
upconv4 = _resize_like(upconv4, econv3)
if use_skip:
i4_in = tf.concat([upconv4, econv3], axis=3)
else:
i4_in = upconv4
i4_in = tf.pad(i4_in, default_pad, mode='REFLECT')
iconv4 = slim.conv2d(i4_in, decoder_filters[3], [3, 3], stride=1,
scope='iconv4', padding='VALID')
disp4_input = tf.pad(iconv4, default_pad, mode='REFLECT')
disp4 = (slim.conv2d(disp4_input, 1, [3, 3], stride=1,
activation_fn=tf.sigmoid, normalizer_fn=None,
scope='disp4', padding='VALID')
* DISP_SCALING + MIN_DISP)
upconv3 = slim.conv2d_transpose(iconv4, decoder_filters[2], [3, 3],
stride=2, scope='upconv3')
upconv3 = _resize_like(upconv3, econv2)
if use_skip:
i3_in = tf.concat([upconv3, econv2], axis=3)
else:
i3_in = upconv3
i3_in = tf.pad(i3_in, default_pad, mode='REFLECT')
iconv3 = slim.conv2d(i3_in, decoder_filters[2], [3, 3], stride=1,
scope='iconv3', padding='VALID')
disp3_input = tf.pad(iconv3, default_pad, mode='REFLECT')
disp3 = (slim.conv2d(disp3_input, 1, [3, 3], stride=1,
activation_fn=tf.sigmoid, normalizer_fn=None,
scope='disp3', padding='VALID')
* DISP_SCALING + MIN_DISP)
upconv2 = slim.conv2d_transpose(iconv3, decoder_filters[1], [3, 3],
stride=2, scope='upconv2')
upconv2 = _resize_like(upconv2, econv1)
if use_skip:
i2_in = tf.concat([upconv2, econv1], axis=3)
else:
i2_in = upconv2
i2_in = tf.pad(i2_in, default_pad, mode='REFLECT')
iconv2 = slim.conv2d(i2_in, decoder_filters[1], [3, 3], stride=1,
scope='iconv2', padding='VALID')
disp2_input = tf.pad(iconv2, default_pad, mode='REFLECT')
disp2 = (slim.conv2d(disp2_input, 1, [3, 3], stride=1,
activation_fn=tf.sigmoid, normalizer_fn=None,
scope='disp2', padding='VALID')
* DISP_SCALING + MIN_DISP)
upconv1 = slim.conv2d_transpose(iconv2, decoder_filters[0], [3, 3],
stride=2, scope='upconv1')
upconv1 = _resize_like(upconv1, target_image)
upconv1 = tf.pad(upconv1, default_pad, mode='REFLECT')
iconv1 = slim.conv2d(upconv1, decoder_filters[0], [3, 3], stride=1,
scope='iconv1', padding='VALID')
disp1_input = tf.pad(iconv1, default_pad, mode='REFLECT')
disp1 = (slim.conv2d(disp1_input, 1, [3, 3], stride=1,
activation_fn=tf.sigmoid, normalizer_fn=None,
scope='disp1', padding='VALID')
* DISP_SCALING + MIN_DISP)
return [disp1, disp2, disp3, disp4]
def _residual_block_first(x, is_training, out_channel, strides, name='unit'):
"""Helper function for defining ResNet architecture."""
in_channel = x.get_shape().as_list()[-1]
with tf.variable_scope(name):
# Shortcut connection
if in_channel == out_channel:
if strides == 1:
shortcut = tf.identity(x)
else:
shortcut = tf.nn.max_pool(x, [1, strides, strides, 1],
[1, strides, strides, 1], 'VALID')
else:
shortcut = _conv(x, 1, out_channel, strides, name='shortcut')
# Residual
x = _conv(x, 3, out_channel, strides, name='conv_1')
x = _bn(x, is_train=is_training, name='bn_1')
x = _relu(x, name='relu_1')
x = _conv(x, 3, out_channel, 1, name='conv_2')
x = _bn(x, is_train=is_training, name='bn_2')
# Merge
x = x + shortcut
x = _relu(x, name='relu_2')
return x
def _residual_block(x, is_training, input_q=None, output_q=None, name='unit'):
"""Helper function for defining ResNet architecture."""
num_channel = x.get_shape().as_list()[-1]
with tf.variable_scope(name):
shortcut = x # Shortcut connection
# Residual
x = _conv(x, 3, num_channel, 1, input_q=input_q, output_q=output_q,
name='conv_1')
x = _bn(x, is_train=is_training, name='bn_1')
x = _relu(x, name='relu_1')
x = _conv(x, 3, num_channel, 1, input_q=output_q, output_q=output_q,
name='conv_2')
x = _bn(x, is_train=is_training, name='bn_2')
# Merge
x = x + shortcut
x = _relu(x, name='relu_2')
return x
def _conv(x, filter_size, out_channel, stride, pad='SAME', input_q=None,
output_q=None, name='conv'):
"""Helper function for defining ResNet architecture."""
if (input_q is None) ^ (output_q is None):
raise ValueError('Input/Output splits are not correctly given.')
in_shape = x.get_shape()
with tf.variable_scope(name):
# Main operation: conv2d
with tf.device('/CPU:0'):
kernel = tf.get_variable(
'kernel', [filter_size, filter_size, in_shape[3], out_channel],
tf.float32, initializer=tf.random_normal_initializer(
stddev=np.sqrt(2.0/filter_size/filter_size/out_channel)))
if kernel not in tf.get_collection(WEIGHT_DECAY_KEY):
tf.add_to_collection(WEIGHT_DECAY_KEY, kernel)
conv = tf.nn.conv2d(x, kernel, [1, stride, stride, 1], pad)
return conv
def _bn(x, is_train, name='bn'):
"""Helper function for defining ResNet architecture."""
bn = tf.layers.batch_normalization(x, training=is_train, name=name)
return bn
def _relu(x, name=None, leakness=0.0):
"""Helper function for defining ResNet architecture."""
if leakness > 0.0:
name = 'lrelu' if name is None else name
return tf.maximum(x, x * leakness, name=name)
else:
name = 'relu' if name is None else name
return tf.nn.relu(x, name=name)
def _resize_like(inputs, ref):
i_h, i_w = inputs.get_shape()[1], inputs.get_shape()[2]
r_h, r_w = ref.get_shape()[1], ref.get_shape()[2]
if i_h == r_h and i_w == r_w:
return inputs
else:
# TODO(casser): Other interpolation methods could be explored here.
return tf.image.resize_bilinear(inputs, [r_h.value, r_w.value],
align_corners=True)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Applies online refinement while running inference.
Instructions: Run static inference before calling this script, and make sure
output_dir points to the same folder where the static inference results were
saved.
For example usage, please refer to the README.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import datetime
import os
import random
from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
import model
import nets
import reader
import util
gfile = tf.gfile
SAVE_EVERY = 1 # Defines the interval that predictions should be saved at.
SAVE_PREVIEWS = True # If set, also save image previews of depth predictions.
FIXED_SEED = 8964 # Fixed seed for repeatability.
flags.DEFINE_string('output_dir', None, 'Directory to store predictions. '
'Assumes that regular inference has been executed before '
'and results were stored in this folder.')
flags.DEFINE_string('data_dir', None, 'Folder pointing to preprocessed '
'triplets to fine-tune on.')
flags.DEFINE_string('triplet_list_file', None, 'Text file containing paths to '
'image files to process. Paths should be relative with '
'respect to the list file location. Every line should be '
'of the form [input_folder_name] [input_frame_num] '
'[output_path], where [output_path] is optional to specify '
'a different path to store the prediction.')
flags.DEFINE_string('triplet_list_file_remains', None, 'Optional text file '
'containing relative paths to image files which should not '
'be fine-tuned, e.g. because of missing adjacent frames. '
'For all files listed, the static prediction will be '
'copied instead. File can be empty. If not, every line '
'should be of the form [input_folder_name] '
'[input_frame_num] [output_path], where [output_path] is '
'optional to specify a different path to take and store '
'the unrefined prediction from/to.')
flags.DEFINE_string('model_ckpt', None, 'Model checkpoint to optimize.')
flags.DEFINE_string('ft_name', '', 'Optional prefix for temporary files.')
flags.DEFINE_string('file_extension', 'png', 'Image data file extension.')
flags.DEFINE_float('learning_rate', 0.0001, 'Adam learning rate.')
flags.DEFINE_float('beta1', 0.9, 'Adam momentum.')
flags.DEFINE_float('reconstr_weight', 0.85, 'Frame reconstruction loss weight.')
flags.DEFINE_float('ssim_weight', 0.15, 'SSIM loss weight.')
flags.DEFINE_float('smooth_weight', 0.01, 'Smoothness loss weight.')
flags.DEFINE_float('icp_weight', 0.0, 'ICP loss weight.')
flags.DEFINE_float('size_constraint_weight', 0.0005, 'Weight of the object '
'size constraint loss. Use only with motion handling.')
flags.DEFINE_integer('batch_size', 1, 'The size of a sample batch')
flags.DEFINE_integer('img_height', 128, 'Input frame height.')
flags.DEFINE_integer('img_width', 416, 'Input frame width.')
flags.DEFINE_integer('seq_length', 3, 'Number of frames in sequence.')
flags.DEFINE_enum('architecture', nets.RESNET, nets.ARCHITECTURES,
'Defines the architecture to use for the depth prediction '
'network. Defaults to ResNet-based encoder and accompanying '
'decoder.')
flags.DEFINE_boolean('imagenet_norm', True, 'Whether to normalize the input '
'images channel-wise so that they match the distribution '
'most ImageNet-models were trained on.')
flags.DEFINE_float('weight_reg', 0.05, 'The amount of weight regularization to '
'apply. This has no effect on the ResNet-based encoder '
'architecture.')
flags.DEFINE_boolean('exhaustive_mode', False, 'Whether to exhaustively warp '
'from any frame to any other instead of just considering '
'adjacent frames. Where necessary, multiple egomotion '
'estimates will be applied. Does not have an effect if '
'compute_minimum_loss is enabled.')
flags.DEFINE_boolean('random_scale_crop', False, 'Whether to apply random '
'image scaling and center cropping during training.')
flags.DEFINE_bool('depth_upsampling', True, 'Whether to apply depth '
'upsampling of lower-scale representations before warping to '
'compute reconstruction loss on full-resolution image.')
flags.DEFINE_bool('depth_normalization', True, 'Whether to apply depth '
'normalization, that is, normalizing inverse depth '
'prediction maps by their mean to avoid degeneration towards '
'small values.')
flags.DEFINE_bool('compute_minimum_loss', True, 'Whether to take the '
'element-wise minimum of the reconstruction/SSIM error in '
'order to avoid overly penalizing dis-occlusion effects.')
flags.DEFINE_bool('use_skip', True, 'Whether to use skip connections in the '
'encoder-decoder architecture.')
flags.DEFINE_bool('joint_encoder', False, 'Whether to share parameters '
'between the depth and egomotion networks by using a joint '
'encoder architecture. The egomotion network is then '
'operating only on the hidden representation provided by the '
'joint encoder.')
flags.DEFINE_float('egomotion_threshold', 0.01, 'Minimum egomotion magnitude '
'to apply finetuning. If lower, just forwards the ordinary '
'prediction.')
flags.DEFINE_integer('num_steps', 20, 'Number of optimization steps to run.')
flags.DEFINE_boolean('handle_motion', True, 'Whether the checkpoint was '
'trained with motion handling.')
flags.DEFINE_bool('flip', False, 'Whether images should be flipped as well as '
'resulting predictions (for test-time augmentation). This '
'currently applies to the depth network only.')
FLAGS = flags.FLAGS
flags.mark_flag_as_required('output_dir')
flags.mark_flag_as_required('data_dir')
flags.mark_flag_as_required('model_ckpt')
flags.mark_flag_as_required('triplet_list_file')
def main(_):
"""Runs fine-tuning and inference.
There are three categories of images.
1) Images for which both the previous and next frames are available and that
are not filtered out by the heuristic. For these, the fine-tuned
predictions are used.
2) Images for which both adjacent frames are available but that were filtered
out by the heuristic. For these, the ordinary prediction is used instead.
3) Images with at least one missing adjacent frame. For these, the ordinary
prediction is used as indicated by triplet_list_file_remains (if provided).
They are also not part of the generated inference list in the first place.
Raises:
ValueError: Invalid parameters have been passed.
"""
if FLAGS.handle_motion and FLAGS.joint_encoder:
raise ValueError('Using a joint encoder is currently not supported when '
'modeling object motion.')
if FLAGS.handle_motion and FLAGS.seq_length != 3:
raise ValueError('The current motion model implementation only supports '
'using a sequence length of three.')
if FLAGS.handle_motion and not FLAGS.compute_minimum_loss:
raise ValueError('Computing the minimum photometric loss is required when '
'enabling object motion handling.')
if FLAGS.size_constraint_weight > 0 and not FLAGS.handle_motion:
raise ValueError('To enforce object size constraints, enable motion '
'handling.')
if FLAGS.icp_weight > 0.0:
raise ValueError('ICP is currently not supported.')
if FLAGS.compute_minimum_loss and FLAGS.seq_length % 2 != 1:
raise ValueError('Compute minimum loss requires using an odd number of '
'images in a sequence.')
if FLAGS.compute_minimum_loss and FLAGS.exhaustive_mode:
raise ValueError('Exhaustive mode has no effect when compute_minimum_loss '
'is enabled.')
if FLAGS.img_width % (2 ** 5) != 0 or FLAGS.img_height % (2 ** 5) != 0:
logging.warn('Image size is not divisible by 2^5. For the architecture '
'employed, this could cause resizing artefacts at the lower '
'scales.')
if FLAGS.output_dir.endswith('/'):
FLAGS.output_dir = FLAGS.output_dir[:-1]
# Create the file list used for fine-tuning and save it to unique_file.
unique_file_name = (str(datetime.datetime.now().date()) + '_' +
str(datetime.datetime.now().time()).replace(':', '_'))
unique_file = os.path.join(FLAGS.data_dir, unique_file_name + '.txt')
with gfile.FastGFile(FLAGS.triplet_list_file, 'r') as f:
files_to_process = f.readlines()
files_to_process = [line.rstrip() for line in files_to_process]
files_to_process = [line for line in files_to_process if len(line)]
logging.info('Creating unique file list %s with %s entries.', unique_file,
len(files_to_process))
with gfile.FastGFile(unique_file, 'w') as f_out:
fetches_network = FLAGS.num_steps * FLAGS.batch_size
fetches_saves = FLAGS.batch_size * int(np.floor(FLAGS.num_steps/SAVE_EVERY))
repetitions = fetches_network + 3 * fetches_saves
for i in range(len(files_to_process)):
for _ in range(repetitions):
f_out.write(files_to_process[i] + '\n')
# Read remaining files.
remaining = []
if (FLAGS.triplet_list_file_remains and
gfile.Exists(FLAGS.triplet_list_file_remains)):
with gfile.FastGFile(FLAGS.triplet_list_file_remains, 'r') as f:
remaining = f.readlines()
remaining = [line.rstrip() for line in remaining]
remaining = [line for line in remaining if len(line)]
logging.info('Running fine-tuning on %s files, %s files are remaining.',
len(files_to_process), len(remaining))
# Run fine-tuning process and save predictions in id-folders.
tf.set_random_seed(FIXED_SEED)
np.random.seed(FIXED_SEED)
random.seed(FIXED_SEED)
flipping_mode = reader.FLIP_ALWAYS if FLAGS.flip else reader.FLIP_NONE
train_model = model.Model(data_dir=FLAGS.data_dir,
file_extension=FLAGS.file_extension,
is_training=True,
learning_rate=FLAGS.learning_rate,
beta1=FLAGS.beta1,
reconstr_weight=FLAGS.reconstr_weight,
smooth_weight=FLAGS.smooth_weight,
ssim_weight=FLAGS.ssim_weight,
icp_weight=FLAGS.icp_weight,
batch_size=FLAGS.batch_size,
img_height=FLAGS.img_height,
img_width=FLAGS.img_width,
seq_length=FLAGS.seq_length,
architecture=FLAGS.architecture,
imagenet_norm=FLAGS.imagenet_norm,
weight_reg=FLAGS.weight_reg,
exhaustive_mode=FLAGS.exhaustive_mode,
random_scale_crop=FLAGS.random_scale_crop,
flipping_mode=flipping_mode,
random_color=False,
depth_upsampling=FLAGS.depth_upsampling,
depth_normalization=FLAGS.depth_normalization,
compute_minimum_loss=FLAGS.compute_minimum_loss,
use_skip=FLAGS.use_skip,
joint_encoder=FLAGS.joint_encoder,
build_sum=False,
shuffle=False,
input_file=unique_file_name,
handle_motion=FLAGS.handle_motion,
size_constraint_weight=FLAGS.size_constraint_weight,
train_global_scale_var=False)
failed_heuristic_ids = finetune_inference(train_model, FLAGS.model_ckpt,
FLAGS.output_dir + '_ft')
logging.info('Fine-tuning completed, %s files were filtered out by '
'heuristic.', len(failed_heuristic_ids))
for failed_id in failed_heuristic_ids:
failed_entry = files_to_process[failed_id]
remaining.append(failed_entry)
logging.info('In total, %s images were fine-tuned, while %s were not.',
len(files_to_process)-len(failed_heuristic_ids), len(remaining))
# Copy all results to have the same structural output as running ordinary
# inference.
for i in range(len(files_to_process)):
if files_to_process[i] not in remaining: # Use fine-tuned result.
elements = files_to_process[i].split(' ')
source_file = os.path.join(FLAGS.output_dir + '_ft', FLAGS.ft_name +
'id_' + str(i),
str(FLAGS.num_steps).zfill(10) +
('_flip' if FLAGS.flip else ''))
if len(elements) == 2: # No differing mapping defined.
target_dir = os.path.join(FLAGS.output_dir + '_ft', elements[0])
target_file = os.path.join(
target_dir, elements[1] + ('_flip' if FLAGS.flip else ''))
else: # Other mapping for file defined, copy to this location instead.
target_dir = os.path.join(
FLAGS.output_dir + '_ft', os.path.dirname(elements[2]))
target_file = os.path.join(
target_dir,
os.path.basename(elements[2]) + ('_flip' if FLAGS.flip else ''))
if not gfile.Exists(target_dir):
gfile.MakeDirs(target_dir)
logging.info('Copy refined result %s to %s.', source_file, target_file)
gfile.Copy(source_file + '.npy', target_file + '.npy', overwrite=True)
gfile.Copy(source_file + '.txt', target_file + '.txt', overwrite=True)
gfile.Copy(source_file + '.%s' % FLAGS.file_extension,
target_file + '.%s' % FLAGS.file_extension, overwrite=True)
for j in range(len(remaining)):
elements = remaining[j].split(' ')
if len(elements) == 2: # No differing mapping defined.
target_dir = os.path.join(FLAGS.output_dir + '_ft', elements[0])
target_file = os.path.join(
target_dir, elements[1] + ('_flip' if FLAGS.flip else ''))
else: # Other mapping for file defined, copy to this location instead.
target_dir = os.path.join(
FLAGS.output_dir + '_ft', os.path.dirname(elements[2]))
target_file = os.path.join(
target_dir,
os.path.basename(elements[2]) + ('_flip' if FLAGS.flip else ''))
if not gfile.Exists(target_dir):
gfile.MakeDirs(target_dir)
source_file = target_file.replace('_ft', '')
logging.info('Copy unrefined result %s to %s.', source_file, target_file)
gfile.Copy(source_file + '.npy', target_file + '.npy', overwrite=True)
gfile.Copy(source_file + '.%s' % FLAGS.file_extension,
target_file + '.%s' % FLAGS.file_extension, overwrite=True)
logging.info('Done, predictions saved in %s.', FLAGS.output_dir + '_ft')
def finetune_inference(train_model, model_ckpt, output_dir):
"""Train model."""
vars_to_restore = None
if model_ckpt is not None:
vars_to_restore = util.get_vars_to_save_and_restore(model_ckpt)
ckpt_path = model_ckpt
pretrain_restorer = tf.train.Saver(vars_to_restore)
sv = tf.train.Supervisor(logdir=None, save_summaries_secs=0, saver=None,
summary_op=None)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
img_nr = 0
failed_heuristic = []
with sv.managed_session(config=config) as sess:
# TODO(casser): Caching the weights would be better to avoid I/O bottleneck.
while True: # Loop terminates when all examples have been processed.
if model_ckpt is not None:
logging.info('Restored weights from %s', ckpt_path)
pretrain_restorer.restore(sess, ckpt_path)
logging.info('Running fine-tuning, image %s...', img_nr)
img_pred_folder = os.path.join(
output_dir, FLAGS.ft_name + 'id_' + str(img_nr))
if not gfile.Exists(img_pred_folder):
gfile.MakeDirs(img_pred_folder)
step = 1
# Run fine-tuning.
while step <= FLAGS.num_steps:
logging.info('Running step %s of %s.', step, FLAGS.num_steps)
fetches = {
'train': train_model.train_op,
'global_step': train_model.global_step,
'incr_global_step': train_model.incr_global_step
}
_ = sess.run(fetches)
if step % SAVE_EVERY == 0:
# Get latest prediction for middle frame, highest scale.
pred = train_model.depth[1][0].eval(session=sess)
if FLAGS.flip:
pred = np.flip(pred, axis=2)
input_img = train_model.image_stack.eval(session=sess)
input_img_prev = input_img[0, :, :, 0:3]
input_img_center = input_img[0, :, :, 3:6]
input_img_next = input_img[0, :, :, 6:]
img_pred_file = os.path.join(
img_pred_folder,
str(step).zfill(10) + ('_flip' if FLAGS.flip else '') + '.npy')
motion = np.squeeze(train_model.egomotion.eval(session=sess))
# motion of shape (seq_length - 1, 6).
motion = np.mean(motion, axis=0) # Average egomotion across frames.
if SAVE_PREVIEWS or step == FLAGS.num_steps:
# Also save preview of depth map.
color_map = util.normalize_depth_for_display(
np.squeeze(pred[0, :, :]))
visualization = np.concatenate(
(input_img_prev, input_img_center, input_img_next, color_map))
motion_s = [str(m) for m in motion]
s_rep = ','.join(motion_s)
with gfile.Open(img_pred_file.replace('.npy', '.txt'), 'w') as f:
f.write(s_rep)
util.save_image(
img_pred_file.replace('.npy', '.%s' % FLAGS.file_extension),
visualization, FLAGS.file_extension)
with gfile.Open(img_pred_file, 'wb') as f:
np.save(f, pred)
# Apply heuristic to not finetune if egomotion magnitude is too low.
ego_magnitude = np.linalg.norm(motion[:3], ord=2)
heuristic = ego_magnitude >= FLAGS.egomotion_threshold
if not heuristic and step == FLAGS.num_steps:
failed_heuristic.append(img_nr)
step += 1
img_nr += 1
return failed_heuristic
if __name__ == '__main__':
app.run(main)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Geometry utilities for projecting frames based on depth and motion.
Modified from Spatial Transformer Networks:
https://github.com/tensorflow/models/blob/master/transformer/spatial_transformer.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import logging
import numpy as np
import tensorflow as tf
def inverse_warp(img, depth, egomotion_mat, intrinsic_mat,
intrinsic_mat_inv):
"""Inverse warp a source image to the target image plane.
Args:
img: The source image (to sample pixels from) -- [B, H, W, 3].
depth: Depth map of the target image -- [B, H, W].
egomotion_mat: Matrix defining egomotion transform -- [B, 4, 4].
intrinsic_mat: Camera intrinsic matrix -- [B, 3, 3].
intrinsic_mat_inv: Inverse of the intrinsic matrix -- [B, 3, 3].
Returns:
Projected source image resampled to the target frame, and a float32 mask
marking which sampled locations were valid (in bounds).
"""
dims = tf.shape(img)
batch_size, img_height, img_width = dims[0], dims[1], dims[2]
depth = tf.reshape(depth, [batch_size, 1, img_height * img_width])
grid = _meshgrid_abs(img_height, img_width)
grid = tf.tile(tf.expand_dims(grid, 0), [batch_size, 1, 1])
cam_coords = _pixel2cam(depth, grid, intrinsic_mat_inv)
ones = tf.ones([batch_size, 1, img_height * img_width])
cam_coords_hom = tf.concat([cam_coords, ones], axis=1)
# Get projection matrix for target camera frame to source pixel frame
hom_filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
hom_filler = tf.tile(hom_filler, [batch_size, 1, 1])
intrinsic_mat_hom = tf.concat(
[intrinsic_mat, tf.zeros([batch_size, 3, 1])], axis=2)
intrinsic_mat_hom = tf.concat([intrinsic_mat_hom, hom_filler], axis=1)
proj_target_cam_to_source_pixel = tf.matmul(intrinsic_mat_hom, egomotion_mat)
source_pixel_coords = _cam2pixel(cam_coords_hom,
proj_target_cam_to_source_pixel)
source_pixel_coords = tf.reshape(source_pixel_coords,
[batch_size, 2, img_height, img_width])
source_pixel_coords = tf.transpose(source_pixel_coords, perm=[0, 2, 3, 1])
projected_img, mask = _spatial_transformer(img, source_pixel_coords)
return projected_img, mask
def get_transform_mat(egomotion_vecs, i, j):
"""Returns a transform matrix defining the transform from frame i to j."""
egomotion_transforms = []
batchsize = tf.shape(egomotion_vecs)[0]
if i == j:
return tf.tile(tf.expand_dims(tf.eye(4, 4), axis=0), [batchsize, 1, 1])
for k in range(min(i, j), max(i, j)):
transform_matrix = _egomotion_vec2mat(egomotion_vecs[:, k, :], batchsize)
if i > j: # Going back in sequence, need to invert egomotion.
egomotion_transforms.insert(0, tf.linalg.inv(transform_matrix))
else: # Going forward in sequence
egomotion_transforms.append(transform_matrix)
# Multiply all matrices.
egomotion_mat = egomotion_transforms[0]
for idx in range(1, len(egomotion_transforms)):
egomotion_mat = tf.matmul(egomotion_mat, egomotion_transforms[idx])
return egomotion_mat
def _pixel2cam(depth, pixel_coords, intrinsic_mat_inv):
"""Transform coordinates in the pixel frame to the camera frame."""
cam_coords = tf.matmul(intrinsic_mat_inv, pixel_coords) * depth
return cam_coords
def _cam2pixel(cam_coords, proj_c2p):
"""Transform coordinates in the camera frame to the pixel frame."""
pcoords = tf.matmul(proj_c2p, cam_coords)
x = tf.slice(pcoords, [0, 0, 0], [-1, 1, -1])
y = tf.slice(pcoords, [0, 1, 0], [-1, 1, -1])
z = tf.slice(pcoords, [0, 2, 0], [-1, 1, -1])
# A small epsilon avoids division by zero; it is untested whether this is
# strictly necessary.
x_norm = x / (z + 1e-10)
y_norm = y / (z + 1e-10)
pixel_coords = tf.concat([x_norm, y_norm], axis=1)
return pixel_coords
def _meshgrid_abs(height, width):
"""Meshgrid in the absolute coordinates."""
x_t = tf.matmul(
tf.ones(shape=tf.stack([height, 1])),
tf.transpose(tf.expand_dims(tf.linspace(-1.0, 1.0, width), 1), [1, 0]))
y_t = tf.matmul(
tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1),
tf.ones(shape=tf.stack([1, width])))
x_t = (x_t + 1.0) * 0.5 * tf.cast(width - 1, tf.float32)
y_t = (y_t + 1.0) * 0.5 * tf.cast(height - 1, tf.float32)
x_t_flat = tf.reshape(x_t, (1, -1))
y_t_flat = tf.reshape(y_t, (1, -1))
ones = tf.ones_like(x_t_flat)
grid = tf.concat([x_t_flat, y_t_flat, ones], axis=0)
return grid
def _euler2mat(z, y, x):
"""Converts euler angles to rotation matrix.
From:
https://github.com/pulkitag/pycaffe-utils/blob/master/rot_utils.py#L174
TODO: Remove the dimension for 'N' (deprecated for converting all source
poses altogether).
Args:
z: rotation angle along z axis (in radians) -- size = [B, n]
y: rotation angle along y axis (in radians) -- size = [B, n]
x: rotation angle along x axis (in radians) -- size = [B, n]
Returns:
Rotation matrix corresponding to the euler angles, with shape [B, n, 3, 3].
"""
batch_size = tf.shape(z)[0]
n = 1
z = tf.clip_by_value(z, -np.pi, np.pi)
y = tf.clip_by_value(y, -np.pi, np.pi)
x = tf.clip_by_value(x, -np.pi, np.pi)
# Expand to B x N x 1 x 1
z = tf.expand_dims(tf.expand_dims(z, -1), -1)
y = tf.expand_dims(tf.expand_dims(y, -1), -1)
x = tf.expand_dims(tf.expand_dims(x, -1), -1)
zeros = tf.zeros([batch_size, n, 1, 1])
ones = tf.ones([batch_size, n, 1, 1])
cosz = tf.cos(z)
sinz = tf.sin(z)
rotz_1 = tf.concat([cosz, -sinz, zeros], axis=3)
rotz_2 = tf.concat([sinz, cosz, zeros], axis=3)
rotz_3 = tf.concat([zeros, zeros, ones], axis=3)
zmat = tf.concat([rotz_1, rotz_2, rotz_3], axis=2)
cosy = tf.cos(y)
siny = tf.sin(y)
roty_1 = tf.concat([cosy, zeros, siny], axis=3)
roty_2 = tf.concat([zeros, ones, zeros], axis=3)
roty_3 = tf.concat([-siny, zeros, cosy], axis=3)
ymat = tf.concat([roty_1, roty_2, roty_3], axis=2)
cosx = tf.cos(x)
sinx = tf.sin(x)
rotx_1 = tf.concat([ones, zeros, zeros], axis=3)
rotx_2 = tf.concat([zeros, cosx, -sinx], axis=3)
rotx_3 = tf.concat([zeros, sinx, cosx], axis=3)
xmat = tf.concat([rotx_1, rotx_2, rotx_3], axis=2)
return tf.matmul(tf.matmul(xmat, ymat), zmat)
def _egomotion_vec2mat(vec, batch_size):
"""Converts 6DoF transform vector to transformation matrix.
Args:
vec: 6DoF parameters [tx, ty, tz, rx, ry, rz] -- [B, 6].
batch_size: Batch size.
Returns:
A transformation matrix -- [B, 4, 4].
"""
translation = tf.slice(vec, [0, 0], [-1, 3])
translation = tf.expand_dims(translation, -1)
rx = tf.slice(vec, [0, 3], [-1, 1])
ry = tf.slice(vec, [0, 4], [-1, 1])
rz = tf.slice(vec, [0, 5], [-1, 1])
rot_mat = _euler2mat(rz, ry, rx)
rot_mat = tf.squeeze(rot_mat, axis=[1])
filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
filler = tf.tile(filler, [batch_size, 1, 1])
transform_mat = tf.concat([rot_mat, translation], axis=2)
transform_mat = tf.concat([transform_mat, filler], axis=1)
return transform_mat
def _bilinear_sampler(im, x, y, name='bilinear_sampler'):
"""Perform bilinear sampling on im given list of x, y coordinates.
Implements the differentiable sampling mechanism with bilinear kernel
in https://arxiv.org/abs/1506.02025.
x,y are tensors specifying normalized coordinates [-1, 1] to be sampled on im.
For example, (-1, -1) in (x, y) corresponds to pixel location (0, 0) in im,
and (1, 1) in (x, y) corresponds to the bottom right pixel in im.
Args:
im: Batch of images with shape [B, h, w, channels].
x: Tensor of normalized x coordinates in [-1, 1], with shape [B, h, w, 1].
y: Tensor of normalized y coordinates in [-1, 1], with shape [B, h, w, 1].
name: Name scope for ops.
Returns:
Sampled image with shape [B, h, w, channels].
Principled mask with shape [B, h, w, 1], dtype:float32. A value of 1.0
in the mask indicates that the corresponding coordinate in the sampled
image is valid.
"""
with tf.variable_scope(name):
x = tf.reshape(x, [-1])
y = tf.reshape(y, [-1])
# Constants.
batch_size = tf.shape(im)[0]
_, height, width, channels = im.get_shape().as_list()
x = tf.to_float(x)
y = tf.to_float(y)
height_f = tf.cast(height, 'float32')
width_f = tf.cast(width, 'float32')
zero = tf.constant(0, dtype=tf.int32)
max_y = tf.cast(tf.shape(im)[1] - 1, 'int32')
max_x = tf.cast(tf.shape(im)[2] - 1, 'int32')
# Scale indices from [-1, 1] to [0, width - 1] or [0, height - 1].
x = (x + 1.0) * (width_f - 1.0) / 2.0
y = (y + 1.0) * (height_f - 1.0) / 2.0
# Compute the coordinates of the 4 pixels to sample from.
x0 = tf.cast(tf.floor(x), 'int32')
x1 = x0 + 1
y0 = tf.cast(tf.floor(y), 'int32')
y1 = y0 + 1
mask = tf.logical_and(
tf.logical_and(x0 >= zero, x1 <= max_x),
tf.logical_and(y0 >= zero, y1 <= max_y))
mask = tf.to_float(mask)
x0 = tf.clip_by_value(x0, zero, max_x)
x1 = tf.clip_by_value(x1, zero, max_x)
y0 = tf.clip_by_value(y0, zero, max_y)
y1 = tf.clip_by_value(y1, zero, max_y)
dim2 = width
dim1 = width * height
# Create base index.
base = tf.range(batch_size) * dim1
base = tf.reshape(base, [-1, 1])
base = tf.tile(base, [1, height * width])
base = tf.reshape(base, [-1])
base_y0 = base + y0 * dim2
base_y1 = base + y1 * dim2
idx_a = base_y0 + x0
idx_b = base_y1 + x0
idx_c = base_y0 + x1
idx_d = base_y1 + x1
# Use indices to lookup pixels in the flat image and restore channels dim.
im_flat = tf.reshape(im, tf.stack([-1, channels]))
im_flat = tf.to_float(im_flat)
pixel_a = tf.gather(im_flat, idx_a)
pixel_b = tf.gather(im_flat, idx_b)
pixel_c = tf.gather(im_flat, idx_c)
pixel_d = tf.gather(im_flat, idx_d)
x1_f = tf.to_float(x1)
y1_f = tf.to_float(y1)
# And finally calculate interpolated values.
wa = tf.expand_dims(((x1_f - x) * (y1_f - y)), 1)
wb = tf.expand_dims((x1_f - x) * (1.0 - (y1_f - y)), 1)
wc = tf.expand_dims(((1.0 - (x1_f - x)) * (y1_f - y)), 1)
wd = tf.expand_dims(((1.0 - (x1_f - x)) * (1.0 - (y1_f - y))), 1)
output = tf.add_n([wa * pixel_a, wb * pixel_b, wc * pixel_c, wd * pixel_d])
output = tf.reshape(output, tf.stack([batch_size, height, width, channels]))
mask = tf.reshape(mask, tf.stack([batch_size, height, width, 1]))
return output, mask
def _spatial_transformer(img, coords):
"""A wrapper over binlinear_sampler(), taking absolute coords as input."""
img_height = tf.cast(tf.shape(img)[1], tf.float32)
img_width = tf.cast(tf.shape(img)[2], tf.float32)
px = coords[:, :, :, :1]
py = coords[:, :, :, 1:]
# Normalize coordinates to [-1, 1] to send to _bilinear_sampler.
px = px / (img_width - 1) * 2.0 - 1.0
py = py / (img_height - 1) * 2.0 - 1.0
output_img, mask = _bilinear_sampler(img, px, py)
return output_img, mask
def get_cloud(depth, intrinsics_inv, name=None):
"""Convert depth map to 3D point cloud."""
with tf.name_scope(name):
dims = depth.shape.as_list()
batch_size, img_height, img_width = dims[0], dims[1], dims[2]
depth = tf.reshape(depth, [batch_size, 1, img_height * img_width])
grid = _meshgrid_abs(img_height, img_width)
grid = tf.tile(tf.expand_dims(grid, 0), [batch_size, 1, 1])
cam_coords = _pixel2cam(depth, grid, intrinsics_inv)
cam_coords = tf.transpose(cam_coords, [0, 2, 1])
cam_coords = tf.reshape(cam_coords, [batch_size, img_height, img_width, 3])
logging.info('depth -> cloud: %s', cam_coords)
return cam_coords