Commit e00e0e13 authored by dreamdragon's avatar dreamdragon

Merge remote-tracking branch 'upstream/master'

parents b915db4e 402b561b
...@@ -12,7 +12,7 @@ import "object_detection/protos/post_processing.proto";
import "object_detection/protos/region_similarity_calculator.proto";
// Configuration for Single Shot Detection (SSD) models.
// Next id: 22
message Ssd {
// Number of classes to predict.
...@@ -92,11 +92,17 @@ message Ssd {
// Minimum number of effective negative samples.
// Only applies if use_expected_classification_loss_under_sampling is true.
optional float min_num_negative_samples = 19 [default=0];
// Desired number of effective negative samples per positive sample.
// Only applies if use_expected_classification_loss_under_sampling is true.
optional float desired_negative_sampling_ratio = 20 [default=3];
// Whether to add an implicit background class to one-hot encodings of
// groundtruth labels. Set to false if using groundtruth labels with an
// explicit background class, using multiclass scores, or if training a single
// class model.
optional bool add_background_class = 21 [default = true];
}
......
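As a rough illustration of the new `add_background_class` option (this sketch is not part of the diff, and the helper name below is hypothetical), the flag simply decides whether a background slot is reserved at index 0 of the one-hot groundtruth encoding:

```python
import numpy as np

def one_hot_targets(labels, num_classes, add_background_class=True):
  """Hypothetical helper: one-hot encode labels, optionally reserving slot 0 for background."""
  num_class_slots = num_classes + 1 if add_background_class else num_classes
  offset = 1 if add_background_class else 0
  targets = np.zeros((len(labels), num_class_slots), dtype=np.float32)
  for row, label in enumerate(labels):
    targets[row, label + offset] = 1.0
  return targets

# Default: an implicit background class, so a 3-class problem uses 4 slots.
print(one_hot_targets([0, 2], num_classes=3).shape)                              # (2, 4)
# With add_background_class: false (explicit background labels, multiclass
# scores, or a single-class model), only num_classes slots are produced.
print(one_hot_targets([0, 2], num_classes=3, add_background_class=False).shape)  # (2, 3)
```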
...@@ -6,7 +6,7 @@ import "object_detection/protos/optimizer.proto";
import "object_detection/protos/preprocessor.proto";
// Message for configuring DetectionModel training jobs (train.py).
// Next id: 28
message TrainConfig {
// Effective batch size to use for training.
// For TPU (or sync SGD jobs), the batch size per core (or GPU) is going to be
...@@ -115,4 +115,7 @@ message TrainConfig {
// Whether to use bfloat16 for training.
optional bool use_bfloat16 = 26 [default=false];
// Whether to summarize gradients.
optional bool summarize_gradients = 27 [default=false];
}
trainingInput:
  runtimeVersion: "1.9"
  scaleTier: CUSTOM
  masterType: standard_gpu
  workerCount: 5
......
# Quantized trained SSD with Mobilenet v2 on Open Images v4.
# Non-face boxes are dropped during training and non-face groundtruth boxes are
# ignored when evaluating.
#
# Users should configure the fine_tune_checkpoint field in the train config as
# well as the label_map_path and input_path fields in the train_input_reader and
# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that
# should be configured.
model {
ssd {
num_classes: 1
image_resizer {
fixed_shape_resizer {
height: 320
width: 320
}
}
feature_extractor {
type: "ssd_mobilenet_v2"
depth_multiplier: 1.0
min_depth: 16
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 4.0e-05
}
}
initializer {
truncated_normal_initializer {
mean: 0.0
stddev: 0.03
}
}
activation: RELU_6
batch_norm {
decay: 0.9997
center: true
scale: true
epsilon: 0.001
train: true
}
}
pad_to_multiple: 32
use_explicit_padding: true
}
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
box_predictor {
convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 4.0e-05
}
}
initializer {
truncated_normal_initializer {
mean: 0.0
stddev: 0.03
}
}
activation: RELU_6
batch_norm {
decay: 0.9997
center: true
scale: true
epsilon: 0.001
train: true
}
}
min_depth: 0
max_depth: 0
num_layers_before_predictor: 0
use_dropout: false
kernel_size: 3
box_code_size: 4
apply_sigmoid_to_scores: false
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
height_stride: 16
height_stride: 32
height_stride: 64
height_stride: 128
height_stride: 256
height_stride: 512
width_stride: 16
width_stride: 32
width_stride: 64
width_stride: 128
width_stride: 256
width_stride: 512
}
}
post_processing {
batch_non_max_suppression {
score_threshold: 1.0e-08
iou_threshold: 0.5
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
normalize_loss_by_num_matches: true
loss {
localization_loss {
weighted_smooth_l1 {
}
}
classification_loss {
weighted_sigmoid {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 10
}
classification_weight: 1.0
localization_weight: 1.0
}
}
}
train_config {
batch_size: 32
data_augmentation_options {
random_horizontal_flip {
keypoint_flip_permutation: 1
keypoint_flip_permutation: 0
keypoint_flip_permutation: 2
keypoint_flip_permutation: 3
keypoint_flip_permutation: 5
keypoint_flip_permutation: 4
}
}
data_augmentation_options {
ssd_random_crop_fixed_aspect_ratio {
}
}
optimizer {
rms_prop_optimizer {
learning_rate {
exponential_decay_learning_rate {
initial_learning_rate: 0.004
decay_steps: 800720
decay_factor: 0.95
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
fine_tune_checkpoint: ""
}
train_input_reader {
label_map_path: "PATH_TO_BE_CONFIGURED/face_label_map.pbtxt"
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/face_train.record-?????-of-00100"
}
}
eval_config {
metrics_set: "coco_detection_metrics"
use_moving_averages: true
}
eval_input_reader {
label_map_path: "PATH_TO_BE_CONFIGURED/face_label_map.pbtxt"
shuffle: false
num_readers: 1
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/face_val.record-?????-of-00010"
}
}
graph_rewriter {
quantization {
delay: 500000
weight_bits: 8
activation_bits: 8
}
}
# Quantized trained SSD with Mobilenet v2 on MSCOCO Dataset.
# Users should configure the fine_tune_checkpoint field in the train config as
# well as the label_map_path and input_path fields in the train_input_reader and
# eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that
# should be configured.
model {
ssd {
num_classes: 90
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
}
}
similarity_calculator {
iou_similarity {
}
}
anchor_generator {
ssd_anchor_generator {
num_layers: 6
min_scale: 0.2
max_scale: 0.95
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
aspect_ratios: 3.0
aspect_ratios: 0.3333
}
}
image_resizer {
fixed_shape_resizer {
height: 300
width: 300
}
}
box_predictor {
convolutional_box_predictor {
min_depth: 0
max_depth: 0
num_layers_before_predictor: 0
use_dropout: false
dropout_keep_probability: 0.8
kernel_size: 1
box_code_size: 4
apply_sigmoid_to_scores: false
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
}
feature_extractor {
type: 'ssd_mobilenet_v2'
min_depth: 16
depth_multiplier: 1.0
conv_hyperparams {
activation: RELU_6,
regularizer {
l2_regularizer {
weight: 0.00004
}
}
initializer {
truncated_normal_initializer {
stddev: 0.03
mean: 0.0
}
}
batch_norm {
train: true,
scale: true,
center: true,
decay: 0.9997,
epsilon: 0.001,
}
}
}
loss {
classification_loss {
weighted_sigmoid {
}
}
localization_loss {
weighted_smooth_l1 {
}
}
hard_example_miner {
num_hard_examples: 3000
iou_threshold: 0.99
loss_type: CLASSIFICATION
max_negatives_per_positive: 3
min_negatives_per_image: 3
}
classification_weight: 1.0
localization_weight: 1.0
}
normalize_loss_by_num_matches: true
post_processing {
batch_non_max_suppression {
score_threshold: 1e-8
iou_threshold: 0.6
max_detections_per_class: 100
max_total_detections: 100
}
score_converter: SIGMOID
}
}
}
train_config: {
batch_size: 24
optimizer {
rms_prop_optimizer: {
learning_rate: {
exponential_decay_learning_rate {
initial_learning_rate: 0.004
decay_steps: 800720
decay_factor: 0.95
}
}
momentum_optimizer_value: 0.9
decay: 0.9
epsilon: 1.0
}
}
fine_tune_checkpoint: "PATH_TO_BE_CONFIGURED/model.ckpt"
fine_tune_checkpoint_type: "detection"
# Note: The line below limits the training process to 200K steps, which we
# empirically found to be sufficient to train this dataset. This
# effectively bypasses the learning rate schedule (the learning rate will
# never decay). Remove the line below to train indefinitely.
num_steps: 200000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
ssd_random_crop {
}
}
}
train_input_reader: {
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/mscoco_train.record-?????-of-00100"
}
label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt"
}
eval_config: {
num_examples: 8000
# Note: The below line limits the evaluation process to 10 evaluations.
# Remove the below line to evaluate indefinitely.
max_evals: 10
}
eval_input_reader: {
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/mscoco_val.record-?????-of-00010"
}
label_map_path: "PATH_TO_BE_CONFIGURED/mscoco_label_map.pbtxt"
shuffle: false
num_readers: 1
}
graph_rewriter {
quantization {
delay: 48000
weight_bits: 8
activation_bits: 8
}
}
\ No newline at end of file
...@@ -76,12 +76,14 @@ def get_spatial_image_size(image_resizer_config):
raise ValueError("Unknown image resizer type.")
def get_configs_from_pipeline_file(pipeline_config_path, config_override=None):
"""Reads config from a file containing pipeline_pb2.TrainEvalPipelineConfig.
Args:
pipeline_config_path: Path to pipeline_pb2.TrainEvalPipelineConfig text
proto.
config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
override pipeline_config_path.
Returns:
Dictionary of configuration objects. Keys are `model`, `train_config`,
...@@ -92,6 +94,8 @@ def get_configs_from_pipeline_file(pipeline_config_path):
with tf.gfile.GFile(pipeline_config_path, "r") as f:
proto_str = f.read()
text_format.Merge(proto_str, pipeline_config)
if config_override:
text_format.Merge(config_override, pipeline_config)
return create_configs_from_pipeline_proto(pipeline_config)
...@@ -430,7 +434,7 @@ def merge_external_params_with_configs(configs, hparams=None, kwargs_dict=None):
final learning rates.
In this case key can be one of the following formats:
1. legacy update: single string that indicates the attribute to be
updated. E.g. 'label_map_path', 'eval_input_path', 'shuffle'.
Note that when updating fields (e.g. eval_input_path, eval_shuffle) in
eval_input_configs, the override will only be applied when
eval_input_configs has exactly 1 element.
......
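For reference, a minimal sketch of the new `config_override` argument (the path and override values below are placeholders, not taken from this commit): any `TrainEvalPipelineConfig` text proto passed as `config_override` is merged on top of the file contents, so individual fields can be changed without editing the config file.

```python
from object_detection.utils import config_util

# Placeholder path for illustration only.
pipeline_config_path = "path/to/pipeline.config"

# Override a single field on top of whatever the file specifies.
config_override = """
train_config {
  batch_size: 8
}
"""

configs = config_util.get_configs_from_pipeline_file(
    pipeline_config_path, config_override=config_override)
print(configs["train_config"].batch_size)  # 8
```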
...@@ -633,11 +633,37 @@ class ObjectDetectionEvaluation(object):
nms_max_output_boxes=10000,
use_weighted_mean_ap=False,
label_id_offset=0,
group_of_weight=0.0,
per_image_eval_class=per_image_evaluation.PerImageEvaluation):
"""Constructor.
Args:
num_groundtruth_classes: Number of ground-truth classes.
matching_iou_threshold: IOU threshold used for matching detected boxes
to ground-truth boxes.
nms_iou_threshold: IOU threshold used for non-maximum suppression.
nms_max_output_boxes: Maximum number of boxes returned by non-maximum
suppression.
use_weighted_mean_ap: (optional) boolean which determines if the mean
average precision is computed directly from the scores and tp_fp_labels
of all classes.
label_id_offset: The label id offset.
group_of_weight: Weight of group-of boxes. If set to 0, detections of the
correct class within a group-of box are ignored. If weight is > 0, then
if at least one detection falls within a group-of box with
matching_iou_threshold, weight group_of_weight is added to true
positives. Consequently, if no detection falls within a group-of box,
weight group_of_weight is added to false negatives.
per_image_eval_class: The class that contains functions for computing
per image metrics.
Raises:
ValueError: if num_groundtruth_classes is smaller than 1.
"""
if num_groundtruth_classes < 1:
raise ValueError('Need at least 1 groundtruth class for evaluation.')
self.per_image_eval = per_image_eval_class(
num_groundtruth_classes=num_groundtruth_classes,
matching_iou_threshold=matching_iou_threshold,
nms_iou_threshold=nms_iou_threshold,
...@@ -659,14 +685,16 @@ class ObjectDetectionEvaluation(object):
self._initialize_detections()
def _initialize_detections(self):
"""Initializes internal data structures."""
self.detection_keys = set()
self.scores_per_class = [[] for _ in range(self.num_class)]
self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)]
self.num_images_correctly_detected_per_class = np.zeros(self.num_class)
self.average_precision_per_class = np.empty(self.num_class, dtype=float)
self.average_precision_per_class.fill(np.nan)
self.precisions_per_class = [np.nan] * self.num_class
self.recalls_per_class = [np.nan] * self.num_class
self.corloc_per_class = np.ones(self.num_class, dtype=float)
def clear_detections(self):
...@@ -867,8 +895,8 @@ class ObjectDetectionEvaluation(object):
logging.info(scores)
precision, recall = metrics.compute_precision_recall(
scores, tp_fp_labels, self.num_gt_instances_per_class[class_index])
self.precisions_per_class[class_index] = precision
self.recalls_per_class[class_index] = recall
average_precision = metrics.compute_average_precision(precision, recall)
self.average_precision_per_class[class_index] = average_precision
......
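A small usage sketch of the new `per_image_eval_class` hook (the subclass below is hypothetical; module paths follow the `object_detection.utils` layout assumed by this diff):

```python
from object_detection.utils import object_detection_evaluation
from object_detection.utils import per_image_evaluation

class MyPerImageEvaluation(per_image_evaluation.PerImageEvaluation):
  """Hypothetical subclass that could customize per-image metric computation."""
  pass

evaluator = object_detection_evaluation.ObjectDetectionEvaluation(
    num_groundtruth_classes=3,
    per_image_eval_class=MyPerImageEvaluation)

# precisions_per_class / recalls_per_class are now pre-sized and indexed by
# class, so classes without detections keep a NaN placeholder.
print(len(evaluator.precisions_per_class))  # 3
```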
...@@ -872,7 +872,8 @@ def merge_boxes_with_multiple_labels(boxes,
merged_box_indices)
def nearest_neighbor_upsampling(input_tensor, scale=None, height_scale=None,
width_scale=None):
"""Nearest neighbor upsampling implementation.
Nearest neighbor upsampling function that maps input tensor with shape
...@@ -883,19 +884,33 @@ def nearest_neighbor_upsampling(input_tensor, scale):
Args:
input_tensor: A float32 tensor of size [batch, height_in, width_in,
channels].
scale: An integer multiple to scale resolution of input data in both height
and width dimensions.
height_scale: An integer multiple to scale the height of input image. This
option when provided overrides `scale` option.
width_scale: An integer multiple to scale the width of input image. This
option when provided overrides `scale` option.
Returns:
data_up: A float32 tensor of size
[batch, height_in*scale, width_in*scale, channels].
Raises:
ValueError: If both scale and height_scale or if both scale and width_scale
are None.
"""
if not scale and (height_scale is None or width_scale is None):
raise ValueError('Provide either `scale` or `height_scale` and'
' `width_scale`.')
with tf.name_scope('nearest_neighbor_upsampling'):
h_scale = scale if height_scale is None else height_scale
w_scale = scale if width_scale is None else width_scale
(batch_size, height, width,
channels) = shape_utils.combined_static_and_dynamic_shape(input_tensor)
output_tensor = tf.reshape(
input_tensor, [batch_size, height, 1, width, 1, channels]) * tf.ones(
[1, 1, h_scale, 1, w_scale, 1], dtype=input_tensor.dtype)
return tf.reshape(output_tensor,
[batch_size, height * h_scale, width * w_scale, channels])
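The reshape-and-broadcast trick used above can be sanity-checked outside TensorFlow; the following NumPy sketch (not part of the diff) mirrors it with separate height and width scales:

```python
import numpy as np

def np_nearest_neighbor_upsampling(x, height_scale, width_scale):
  """NumPy mirror of the reshape/broadcast trick used in nearest_neighbor_upsampling."""
  batch, height, width, channels = x.shape
  # Insert singleton axes, broadcast against ones, then collapse back.
  up = np.reshape(x, [batch, height, 1, width, 1, channels]) * np.ones(
      [1, 1, height_scale, 1, width_scale, 1], dtype=x.dtype)
  return np.reshape(up, [batch, height * height_scale, width * width_scale, channels])

x = np.arange(4, dtype=np.float32).reshape([1, 2, 2, 1])
print(np_nearest_neighbor_upsampling(x, height_scale=2, width_scale=3).shape)  # (1, 4, 6, 1)
```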
def matmul_gather_on_zeroth_axis(params, indices, scope=None):
...@@ -1072,29 +1087,35 @@ def native_crop_and_resize(image, boxes, crop_size, scope=None):
return tf.reshape(cropped_regions, final_shape)
def expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses, unmatched_cls_losses,
desired_negative_sampling_ratio, min_num_negative_samples):
"""Computes classification loss by background/foreground weighting.
The weighting is such that the effective background/foreground weight ratio
is the desired_negative_sampling_ratio. If p_i is the foreground probability
of anchor a_i, L(a_i) is the anchor's loss, N is the number of anchors, M
is the sum of foreground probabilities across anchors, and K is the desired
ratio between the number of negative and positive samples, then the total loss
L is calculated as:
beta = K*M/(N-M)
L = sum_{i=1}^N [p_i * L_p(a_i) + beta * (1 - p_i) * L_n(a_i)]
where L_p(a_i) is the loss against target assuming the anchor was matched,
otherwise zero, and L_n(a_i) is the loss against the background target
assuming the anchor was unmatched, otherwise zero.
Args:
batch_cls_targets: A tensor with shape [batch_size, num_anchors, num_classes
+ 1], where 0'th index is the background class, containing the class
distribution for the target assigned to a given anchor.
cls_losses: Float tensor of shape [batch_size, num_anchors] representing
anchorwise classification losses.
unmatched_cls_losses: loss for each anchor against the unmatched class
target.
desired_negative_sampling_ratio: The desired background/foreground weight
ratio.
min_num_negative_samples: Minimum number of effective negative samples.
Used only when there are no positive examples.
Returns:
...@@ -1103,36 +1124,44 @@ def expected_classification_loss_under_sampling(
num_anchors = tf.cast(tf.shape(batch_cls_targets)[1], tf.float32)
# find the p_i
foreground_probabilities = 1 - batch_cls_targets[:, :, 0]
foreground_sum = tf.reduce_sum(foreground_probabilities, axis=-1)
# for each anchor, expected_j is the expected number of positive anchors
# given that this anchor was sampled as negative.
tiled_foreground_sum = tf.tile(
tf.reshape(foreground_sum, [-1, 1]),
[1, tf.cast(num_anchors, tf.int32)])
expected_j = tiled_foreground_sum - foreground_probabilities
k = desired_negative_sampling_ratio
# compute beta
expected_negatives = tf.to_float(num_anchors) - expected_j
desired_negatives = k * expected_j
desired_negatives = tf.where(
tf.greater(desired_negatives, expected_negatives), expected_negatives,
desired_negatives)
# probability that an anchor is sampled for the loss computation given that it
# is negative.
beta = desired_negatives / expected_negatives
# where the foreground sum is zero, use a minimum negative weight.
min_negative_weight = 1.0 * min_num_negative_samples / num_anchors
beta = tf.where(
tf.equal(tiled_foreground_sum, 0),
min_negative_weight * tf.ones_like(beta), beta)
foreground_weights = foreground_probabilities
background_weights = (1 - foreground_weights) * beta
weighted_foreground_losses = foreground_weights * cls_losses
weighted_background_losses = background_weights * unmatched_cls_losses
cls_losses = tf.reduce_sum(
weighted_foreground_losses, axis=-1) + tf.reduce_sum(
weighted_background_losses, axis=-1)
return cls_losses
def foreground_probabilities_from_targets(batch_cls_targets):
foreground_probabilities = 1 - batch_cls_targets[:, :, 0]
return foreground_probabilities
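To make the weighting above concrete, here is a NumPy walk-through (not part of the diff) of the hard-label case used in the tests below, with N = 2 anchors, K = 2 and one matched anchor per image:

```python
import numpy as np

batch_cls_targets = np.array(
    [[[1., 0, 0], [0, 1., 0]], [[1., 0, 0], [0, 1., 0]]], dtype=np.float32)
cls_losses = np.array([[1, 2], [3, 4]], dtype=np.float32)
unmatched_cls_losses = np.array([[10, 20], [30, 40]], dtype=np.float32)
k, min_num_negative_samples = 2.0, 1.0

num_anchors = batch_cls_targets.shape[1]
p = 1 - batch_cls_targets[:, :, 0]                # foreground probability per anchor
foreground_sum = p.sum(axis=-1, keepdims=True)    # M per image
expected_j = foreground_sum - p                   # expected positives, excluding anchor i
expected_negatives = num_anchors - expected_j
desired_negatives = np.minimum(k * expected_j, expected_negatives)
beta = np.where(foreground_sum == 0,
                min_num_negative_samples / num_anchors,
                desired_negatives / expected_negatives)
loss = (p * cls_losses + (1 - p) * beta * unmatched_cls_losses).sum(axis=-1)
print(loss)  # [12. 34.], i.e. [2 + 10, 4 + 30], matching the hard-label test below
```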
...@@ -1222,7 +1222,7 @@ class MergeBoxesWithMultipleLabelsTest(tf.test.TestCase):
class NearestNeighborUpsamplingTest(test_case.TestCase):
def test_upsampling_with_single_scale(self):
def graph_fn(inputs):
custom_op_output = ops.nearest_neighbor_upsampling(inputs, scale=2)
...@@ -1236,6 +1236,22 @@ class NearestNeighborUpsamplingTest(test_case.TestCase):
[[2], [2], [3], [3]]]]
self.assertAllClose(custom_op_output, expected_output)
def test_upsampling_with_separate_height_width_scales(self):
def graph_fn(inputs):
custom_op_output = ops.nearest_neighbor_upsampling(inputs,
height_scale=2,
width_scale=3)
return custom_op_output
inputs = np.reshape(np.arange(4).astype(np.float32), [1, 2, 2, 1])
custom_op_output = self.execute(graph_fn, [inputs])
expected_output = [[[[0], [0], [0], [1], [1], [1]],
[[0], [0], [0], [1], [1], [1]],
[[2], [2], [2], [3], [3], [3]],
[[2], [2], [2], [3], [3], [3]]]]
self.assertAllClose(custom_op_output, expected_output)
class MatmulGatherOnZerothAxis(test_case.TestCase):
...@@ -1454,78 +1470,182 @@ class OpsTestExpectedClassificationLoss(test_case.TestCase):
def testExpectedClassificationLossUnderSamplingWithHardLabels(self):
def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples):
return ops.expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples)
batch_cls_targets = np.array(
[[[1., 0, 0], [0, 1., 0]], [[1., 0, 0], [0, 1., 0]]], dtype=np.float32)
cls_losses = np.array([[1, 2], [3, 4]], dtype=np.float32)
unmatched_cls_losses = np.array([[10, 20], [30, 40]], dtype=np.float32)
negative_to_positive_ratio = np.array([2], dtype=np.float32)
min_num_negative_samples = np.array([1], dtype=np.float32)
classification_loss = self.execute(graph_fn, [
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples
])
# expected_foreground_sum = [1,1]
# expected_expected_j = [[1, 0], [1, 0]]
# expected_expected_negatives = [[1, 2], [1, 2]]
# expected_desired_negatives = [[2, 0], [2, 0]]
# expected_beta = [[1, 0], [1, 0]]
# expected_foreground_weights = [[0, 1], [0, 1]]
# expected_background_weights = [[1, 0], [1, 0]]
# expected_weighted_foreground_losses = [[0, 2], [0, 4]]
# expected_weighted_background_losses = [[10, 0], [30, 0]]
# expected_classification_loss_under_sampling = [12, 34]
expected_classification_loss_under_sampling = [2 + 10, 4 + 30]
self.assertAllClose(expected_classification_loss_under_sampling,
classification_loss)
def testExpectedClassificationLossUnderSamplingWithHardLabelsMoreNegatives(
self):
def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples):
return ops.expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples)
batch_cls_targets = np.array(
[[[1., 0, 0], [0, 1., 0], [1., 0, 0], [1., 0, 0], [1., 0, 0]]],
dtype=np.float32)
cls_losses = np.array([[1, 2, 3, 4, 5]], dtype=np.float32)
unmatched_cls_losses = np.array([[10, 20, 30, 40, 50]], dtype=np.float32)
negative_to_positive_ratio = np.array([2], dtype=np.float32)
min_num_negative_samples = np.array([1], dtype=np.float32)
classification_loss = self.execute(graph_fn, [
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples
])
# expected_foreground_sum = [1]
# expected_expected_j = [[1, 0, 1, 1, 1]]
# expected_expected_negatives = [[4, 5, 4, 4, 4]]
# expected_desired_negatives = [[2, 0, 2, 2, 2]]
# expected_beta = [[.5, 0, .5, .5, .5]]
# expected_foreground_weights = [[0, 1, 0, 0, 0]]
# expected_background_weights = [[.5, 0, .5, .5, .5]]
# expected_weighted_foreground_losses = [[0, 2, 0, 0, 0]]
# expected_weighted_background_losses = [[10*.5, 0, 30*.5, 40*.5, 50*.5]]
# expected_classification_loss_under_sampling = [5+2+15+20+25]
expected_classification_loss_under_sampling = [5 + 2 + 15 + 20 + 25]
self.assertAllClose(expected_classification_loss_under_sampling,
classification_loss)
def testExpectedClassificationLossUnderSamplingWithAllNegative(self):
def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses):
return ops.expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples)
batch_cls_targets = np.array(
[[[1, 0, 0], [1, 0, 0]], [[1, 0, 0], [1, 0, 0]]], dtype=np.float32)
cls_losses = np.array([[1, 2], [3, 4]], dtype=np.float32)
unmatched_cls_losses = np.array([[10, 20], [30, 40]], dtype=np.float32)
negative_to_positive_ratio = np.array([2], dtype=np.float32)
min_num_negative_samples = np.array([1], dtype=np.float32)
classification_loss = self.execute(
graph_fn, [batch_cls_targets, cls_losses, unmatched_cls_losses])
# expected_foreground_sum = [0,0]
# expected_expected_j = [[0, 0], [0, 0]]
# expected_expected_negatives = [[2, 2], [2, 2]]
# expected_desired_negatives = [[0, 0], [0, 0]]
# expected_beta = [[0, 0],[0, 0]]
# expected_foreground_weights = [[0, 0], [0, 0]]
# expected_background_weights = [[.5, .5], [.5, .5]]
# expected_weighted_foreground_losses = [[0, 0], [0, 0]]
# expected_weighted_background_losses = [[5, 10], [15, 20]]
# expected_classification_loss_under_sampling = [15, 35]
expected_classification_loss_under_sampling = [
10 * .5 + 20 * .5, 30 * .5 + 40 * .5
]
self.assertAllClose(expected_classification_loss_under_sampling,
classification_loss)
def testExpectedClassificationLossUnderSamplingWithAllPositive(self):
def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses):
return ops.expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples)
batch_cls_targets = np.array(
[[[0, 1., 0], [0, 1., 0]], [[0, 1, 0], [0, 0, 1]]], dtype=np.float32)
cls_losses = np.array([[1, 2], [3, 4]], dtype=np.float32)
unmatched_cls_losses = np.array([[10, 20], [30, 40]], dtype=np.float32)
negative_to_positive_ratio = np.array([2], dtype=np.float32)
min_num_negative_samples = np.array([1], dtype=np.float32)
classification_loss = self.execute(
graph_fn, [batch_cls_targets, cls_losses, unmatched_cls_losses])
# expected_foreground_sum = [2,2]
# expected_expected_j = [[1, 1], [1, 1]]
# expected_expected_negatives = [[1, 1], [1, 1]]
# expected_desired_negatives = [[1, 1], [1, 1]]
# expected_beta = [[1, 1],[1, 1]]
# expected_foreground_weights = [[1, 1], [1, 1]]
# expected_background_weights = [[0, 0], [0, 0]]
# expected_weighted_foreground_losses = [[1, 2], [3, 4]]
# expected_weighted_background_losses = [[0, 0], [0, 0]]
# expected_classification_loss_under_sampling = [3, 7]
expected_classification_loss_under_sampling = [1 + 2, 3 + 4]
self.assertAllClose(expected_classification_loss_under_sampling,
classification_loss)
def testExpectedClassificationLossUnderSamplingWithSoftLabels(self):
def graph_fn(batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples):
return ops.expected_classification_loss_under_sampling(
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples)
batch_cls_targets = np.array([[[.75, .25, 0], [0.25, .75, 0], [.75, .25, 0],
[0.25, .75, 0], [1., 0, 0]]],
dtype=np.float32)
cls_losses = np.array([[1, 2, 3, 4, 5]], dtype=np.float32)
unmatched_cls_losses = np.array([[10, 20, 30, 40, 50]], dtype=np.float32)
negative_to_positive_ratio = np.array([2], dtype=np.float32)
min_num_negative_samples = np.array([1], dtype=np.float32)
classification_loss = self.execute(graph_fn, [
batch_cls_targets, cls_losses, unmatched_cls_losses,
negative_to_positive_ratio, min_num_negative_samples
])
# expected_foreground_sum = [2]
# expected_expected_j = [[1.75, 1.25, 1.75, 1.25, 2]]
# expected_expected_negatives = [[3.25, 3.75, 3.25, 3.75, 3]]
# expected_desired_negatives = [[3.25, 2.5, 3.25, 2.5, 3]]
# expected_beta = [[1, 2/3, 1, 2/3, 1]]
# expected_foreground_weights = [[0.25, .75, .25, .75, 0]]
# expected_background_weights = [[[.75, 1/6., .75, 1/6., 1]]]
# expected_weighted_foreground_losses = [[.25*1, .75*2, .25*3, .75*4, 0*5]]
# expected_weighted_background_losses = [[
# .75*10, 1/6.*20, .75*30, 1/6.*40, 1*50]]
# expected_classification_loss_under_sampling = sum([
# .25*1, .75*2, .25*3, .75*4, 0, .75*10, 1/6.*20, .75*30,
# 1/6.*40, 1*50])
expected_classification_loss_under_sampling = [
sum([
.25 * 1, .75 * 2, .25 * 3, .75 * 4, 0, .75 * 10, 1 / 6. * 20,
.75 * 30, 1 / 6. * 40, 1 * 50
])
]
self.assertAllClose(expected_classification_loss_under_sampling,
classification_loss)
......
...@@ -45,8 +45,10 @@ class MockBoxCoder(box_coder.BoxCoder):
class MockBoxPredictor(box_predictor.BoxPredictor):
"""Simple box predictor that ignores inputs and outputs all zeros."""
def __init__(self, is_training, num_classes, add_background_class=True,
predict_mask=False):
super(MockBoxPredictor, self).__init__(is_training, num_classes)
self._add_background_class = add_background_class
self._predict_mask = predict_mask
def _predict(self, image_features, num_predictions_per_location):
...@@ -57,10 +59,13 @@ class MockBoxPredictor(box_predictor.BoxPredictor):
num_anchors = (combined_feature_shape[1] * combined_feature_shape[2])
code_size = 4
zero = tf.reduce_sum(0 * image_feature)
num_class_slots = self.num_classes
if self._add_background_class:
num_class_slots = num_class_slots + 1
box_encodings = zero + tf.zeros(
(batch_size, num_anchors, 1, code_size), dtype=tf.float32)
class_predictions_with_background = zero + tf.zeros(
(batch_size, num_anchors, num_class_slots), dtype=tf.float32)
masks = zero + tf.zeros(
(batch_size, num_anchors, self.num_classes, DEFAULT_MASK_SIZE,
DEFAULT_MASK_SIZE),
...@@ -80,9 +85,11 @@ class MockBoxPredictor(box_predictor.BoxPredictor):
class MockKerasBoxPredictor(box_predictor.KerasBoxPredictor):
"""Simple box predictor that ignores inputs and outputs all zeros."""
def __init__(self, is_training, num_classes, add_background_class=True,
predict_mask=False):
super(MockKerasBoxPredictor, self).__init__(
is_training, num_classes, False, False)
self._add_background_class = add_background_class
self._predict_mask = predict_mask
def _predict(self, image_features, **kwargs):
...@@ -93,10 +100,13 @@ class MockKerasBoxPredictor(box_predictor.KerasBoxPredictor):
num_anchors = (combined_feature_shape[1] * combined_feature_shape[2])
code_size = 4
zero = tf.reduce_sum(0 * image_feature)
num_class_slots = self.num_classes
if self._add_background_class:
num_class_slots = num_class_slots + 1
box_encodings = zero + tf.zeros(
(batch_size, num_anchors, 1, code_size), dtype=tf.float32)
class_predictions_with_background = zero + tf.zeros(
(batch_size, num_anchors, num_class_slots), dtype=tf.float32)
masks = zero + tf.zeros(
(batch_size, num_anchors, self.num_classes, DEFAULT_MASK_SIZE,
DEFAULT_MASK_SIZE),
......
package(default_visibility = ["//visibility:public"])
# struct2depth
This is a method for unsupervised learning of depth and egomotion from monocular video. It achieves new state-of-the-art results on both tasks by explicitly modeling 3D object motion, performing on-line refinement, and improving quality for moving objects with novel loss formulations. It will appear in the following paper:
**V. Casser, S. Pirk, R. Mahjourian, A. Angelova, Depth Prediction Without the Sensors: Leveraging Structure for Unsupervised Learning from Monocular Videos, AAAI Conference on Artificial Intelligence, 2019**
https://arxiv.org/pdf/1811.06152.pdf
This code is implemented and supported by Vincent Casser (git username: VincentCa) and Anelia Angelova (git username: AneliaAngelova). Please contact anelia@google.com for questions.
Project website: https://sites.google.com/view/struct2depth.
## Quick start: Running training
Before running training, run the gen_data_* script for the respective dataset (KITTI or Cityscapes) to generate the data in the appropriate format. It is assumed that motion masks have already been generated and stored as images.
Models are trained starting from an ImageNet-pretrained model.
```shell
ckpt_dir="your/checkpoint/folder"
data_dir="KITTI_SEQ2_LR/" # Set for KITTI
data_dir="CITYSCAPES_SEQ2_LR/" # Set for Cityscapes
imagenet_ckpt="resnet_pretrained/model.ckpt"
python train.py \
--logtostderr \
--checkpoint_dir $ckpt_dir \
--data_dir $data_dir \
--architecture resnet \
--imagenet_ckpt $imagenet_ckpt \
--imagenet_norm true \
--joint_encoder false
```
## Running depth/egomotion inference on an image folder
The KITTI model is trained on the raw image data (resized to 416 x 128), with inputs standardized before they are fed to the network; Cityscapes images are additionally cropped using the cropping parameters (192, 1856, 256, 768). If a different crop is used, additional training is likely necessary, so please follow the inference example below when using one of the models. The right choice of model can depend on several factors. For example, if a checkpoint is to be used for odometry, note that using segmentation masks can be advantageous for improved odometry with motion models (set *use_masks=true* at inference). On the other hand, all models can be used for single-frame depth estimation without any additional information.
```shell
input_dir="your/image/folder"
output_dir="your/output/folder"
model_checkpoint="your/model/checkpoint"
python inference.py \
--logtostderr \
--file_extension png \
--depth \
--egomotion true \
--input_dir $input_dir \
--output_dir $output_dir \
--model_ckpt $model_checkpoint
```
Note that the egomotion prediction expects the files in the input directory to form a consecutive sequence, and that sorting the filenames alphabetically must put them in the correct temporal order.
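For example, the zero-padded frame numbers written by the data generation scripts in this repository keep alphabetical and temporal order consistent, whereas unpadded numbers would not (illustration only):

```python
padded = ['0000000009.png', '0000000010.png', '0000000011.png']
unpadded = ['9.png', '10.png', '11.png']
print(sorted(padded))    # temporal order preserved
print(sorted(unpadded))  # ['10.png', '11.png', '9.png'] -- wrong order
```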
One can also run inference on KITTI by providing
```shell
--input_list_file ~/kitti-raw-uncompressed/test_files_eigen.txt
```
and on Cityscapes by passing
```shell
--input_list_file CITYSCAPES_FULL/test_files_cityscapes.txt
```
instead of *input_dir*.
Alternatively, inference can also be run on pre-processed images.
## Running on-line refinement
On-line refinement is executed on top of an existing inference folder, so make sure to run regular inference first. Then you can run the on-line fusion procedure as follows:
```shell
prediction_dir="some/prediction/dir"
model_ckpt="checkpoints/checkpoints_baseline/model-199160"
handle_motion="false"
size_constraint_weight="0" # This must be zero when not handling motion.
# If running on KITTI, set as follows:
data_dir="KITTI_SEQ2_LR_EIGEN/"
triplet_list_file="$data_dir/test_files_eigen_triplets.txt"
triplet_list_file_remains="$data_dir/test_files_eigen_triplets_remains.txt"
ft_name="kitti"
# If running on Cityscapes, set as follows:
data_dir="CITYSCAPES_SEQ2_LR_TEST/" # Set for Cityscapes
triplet_list_file="/CITYSCAPES_SEQ2_LR_TEST/test_files_cityscapes_triplets.txt"
triplet_list_file_remains="CITYSCAPES_SEQ2_LR_TEST/test_files_cityscapes_triplets_remains.txt"
ft_name="cityscapes"
python optimize.py \
--logtostderr \
--output_dir $prediction_dir \
--data_dir $data_dir \
--triplet_list_file $triplet_list_file \
--triplet_list_file_remains $triplet_list_file_remains \
--ft_name $ft_name \
--model_ckpt $model_ckpt \
--file_extension png \
--handle_motion $handle_motion \
--size_constraint_weight $size_constraint_weight
```
## Running evaluation
```shell
prediction_dir="some/prediction/dir"
# Use these settings for KITTI:
eval_list_file="KITTI_FULL/kitti-raw-uncompressed/test_files_eigen.txt"
eval_crop="garg"
eval_mode="kitti"
# Use these settings for Cityscapes:
eval_list_file="CITYSCAPES_FULL/test_files_cityscapes.txt"
eval_crop="none"
eval_mode="cityscapes"
python evaluate.py \
--logtostderr \
--prediction_dir $prediction_dir \
--eval_list_file $eval_list_file \
--eval_crop $eval_crop \
--eval_mode $eval_mode
```
## Credits
This code is implemented and supported by Vincent Casser and Anelia Angelova and can be found at
https://sites.google.com/view/struct2depth.
The core implementation is derived from
[vid2depth](https://github.com/tensorflow/models/tree/master/research/vid2depth)
by [Reza Mahjourian](mailto:rezama@google.com), which in turn is based on
[SfMLearner](https://github.com/tinghuiz/SfMLearner)
by [Tinghui Zhou](https://github.com/tinghuiz).
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common utilities for data pre-processing, e.g. matching moving object across frames."""
import numpy as np
def compute_overlap(mask1, mask2):
  """Returns the IoU (intersection over union) of two boolean masks."""
  # Use IoU here. Cast to float so the division is not truncated under Python 2.
  return float(np.sum(mask1 & mask2)) / np.sum(mask1 | mask2)
def align(seg_img1, seg_img2, seg_img3, threshold_same=0.3):
  """Aligns object ids across three consecutive segmentation maps: an id from
  seg_img1 is kept only if it can be chained, with IoU above threshold_same,
  through seg_img2 and seg_img3; matched segments are relabeled with the id
  from the first frame and everything else is zeroed."""
res_img1 = np.zeros_like(seg_img1)
res_img2 = np.zeros_like(seg_img2)
res_img3 = np.zeros_like(seg_img3)
remaining_objects2 = list(np.unique(seg_img2.flatten()))
remaining_objects3 = list(np.unique(seg_img3.flatten()))
for seg_id in np.unique(seg_img1):
# See if we can find correspondences to seg_id in seg_img2.
max_overlap2 = float('-inf')
max_segid2 = -1
for seg_id2 in remaining_objects2:
overlap = compute_overlap(seg_img1==seg_id, seg_img2==seg_id2)
if overlap>max_overlap2:
max_overlap2 = overlap
max_segid2 = seg_id2
if max_overlap2 > threshold_same:
max_overlap3 = float('-inf')
max_segid3 = -1
for seg_id3 in remaining_objects3:
overlap = compute_overlap(seg_img2==max_segid2, seg_img3==seg_id3)
if overlap>max_overlap3:
max_overlap3 = overlap
max_segid3 = seg_id3
if max_overlap3 > threshold_same:
res_img1[seg_img1==seg_id] = seg_id
res_img2[seg_img2==max_segid2] = seg_id
res_img3[seg_img3==max_segid3] = seg_id
remaining_objects2.remove(max_segid2)
remaining_objects3.remove(max_segid3)
return res_img1, res_img2, res_img3
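if __name__ == '__main__':
  # Tiny self-check with hypothetical toy arrays (not part of the original
  # module): objects that overlap sufficiently across all three frames keep
  # their id from the first frame; anything that cannot be chained through
  # all three frames is zeroed out.
  _a = np.array([[1, 1], [0, 2]])
  _b = np.array([[1, 1], [0, 2]])
  _c = np.array([[1, 0], [0, 2]])
  print(align(_a, _b, _c))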
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
""" Offline data generation for the Cityscapes dataset."""
import os
from absl import app
from absl import flags
from absl import logging
import numpy as np
import cv2
import glob
import alignment
from alignment import compute_overlap
from alignment import align
SKIP = 2
WIDTH = 416
HEIGHT = 128
SUB_FOLDER = 'train'
INPUT_DIR = '/usr/local/google/home/anelia/struct2depth/CITYSCAPES_FULL/'
OUTPUT_DIR = '/usr/local/google/home/anelia/struct2depth/CITYSCAPES_Processed/'
def crop(img, segimg, fx, fy, cx, cy):
# Perform center cropping, preserving 50% vertically.
middle_perc = 0.50
left = 1 - middle_perc
half = left / 2
a = img[int(img.shape[0]*(half)):int(img.shape[0]*(1-half)), :]
aseg = segimg[int(segimg.shape[0]*(half)):int(segimg.shape[0]*(1-half)), :]
cy /= (1 / middle_perc)
# Resize to match target height while preserving aspect ratio.
wdt = int((float(HEIGHT)*a.shape[1]/a.shape[0]))
x_scaling = float(wdt)/a.shape[1]
y_scaling = float(HEIGHT)/a.shape[0]
b = cv2.resize(a, (wdt, HEIGHT))
bseg = cv2.resize(aseg, (wdt, HEIGHT))
# Adjust intrinsics.
fx*=x_scaling
fy*=y_scaling
cx*=x_scaling
cy*=y_scaling
# Perform center cropping horizontally.
remain = b.shape[1] - WIDTH
cx /= (b.shape[1] / WIDTH)
c = b[:, int(remain/2):b.shape[1]-int(remain/2)]
cseg = bseg[:, int(remain/2):b.shape[1]-int(remain/2)]
return c, cseg, fx, fy, cx, cy
def run_all():
  dir_name = INPUT_DIR + '/leftImg8bit_sequence/' + SUB_FOLDER + '/*'
  print('Processing directory', dir_name)
  for location in glob.glob(dir_name):
location_name = os.path.basename(location)
print('Processing location', location_name)
files = sorted(glob.glob(location + '/*.png'))
files = [file for file in files if '-seg.png' not in file]
# Break down into sequences
sequences = {}
seq_nr = 0
last_seq = ''
last_imgnr = -1
for i in range(len(files)):
seq = os.path.basename(files[i]).split('_')[1]
nr = int(os.path.basename(files[i]).split('_')[2])
if seq!=last_seq or last_imgnr+1!=nr:
seq_nr+=1
last_imgnr = nr
last_seq = seq
if not seq_nr in sequences:
sequences[seq_nr] = []
sequences[seq_nr].append(files[i])
for (k,v) in sequences.items():
print('Processing sequence', k, 'with', len(v), 'elements...')
output_dir = OUTPUT_DIR + '/' + location_name + '_' + str(k)
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
files = sorted(v)
triplet = []
seg_triplet = []
ct = 1
# Find applicable intrinsics.
for j in range(len(files)):
osegname = os.path.basename(files[j]).split('_')[1]
oimgnr = os.path.basename(files[j]).split('_')[2]
applicable_intrinsics = INPUT_DIR + '/camera/' + SUB_FOLDER + '/' + location_name + '/' + location_name + '_' + osegname + '_' + oimgnr + '_camera.json'
# Get the intrinsics for one of the file of the sequence.
if os.path.isfile(applicable_intrinsics):
f = open(applicable_intrinsics, 'r')
lines = f.readlines()
f.close()
lines = [line.rstrip() for line in lines]
fx = float(lines[11].split(': ')[1].replace(',', ''))
fy = float(lines[12].split(': ')[1].replace(',', ''))
cx = float(lines[13].split(': ')[1].replace(',', ''))
cy = float(lines[14].split(': ')[1].replace(',', ''))
for j in range(0, len(files), SKIP):
img = cv2.imread(files[j])
segimg = cv2.imread(files[j].replace('.png', '-seg.png'))
smallimg, segimg, fx_this, fy_this, cx_this, cy_this = crop(img, segimg, fx, fy, cx, cy)
triplet.append(smallimg)
seg_triplet.append(segimg)
if len(triplet)==3:
cmb = np.hstack(triplet)
align1, align2, align3 = align(seg_triplet[0], seg_triplet[1], seg_triplet[2])
cmb_seg = np.hstack([align1, align2, align3])
cv2.imwrite(os.path.join(output_dir, str(ct).zfill(10) + '.png'), cmb)
cv2.imwrite(os.path.join(output_dir, str(ct).zfill(10) + '-fseg.png'), cmb_seg)
f = open(os.path.join(output_dir, str(ct).zfill(10) + '_cam.txt'), 'w')
f.write(str(fx_this) + ',0.0,' + str(cx_this) + ',0.0,' + str(fy_this) + ',' + str(cy_this) + ',0.0,0.0,1.0')
f.close()
del triplet[0]
del seg_triplet[0]
ct+=1
# Create file list for training. Be careful as it collects and includes all files recursively.
fn = open(OUTPUT_DIR + '/' + SUB_FOLDER + '.txt', 'w')
for f in glob.glob(OUTPUT_DIR + '/*/*.png'):
if '-seg.png' in f or '-fseg.png' in f:
continue
folder_name = f.split('/')[-2]
img_name = f.split('/')[-1].replace('.png', '')
fn.write(folder_name + ' ' + img_name + '\n')
fn.close()
def main(_):
run_all()
if __name__ == '__main__':
app.run(main)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
""" Offline data generation for the KITTI dataset."""
import os
from absl import app
from absl import flags
from absl import logging
import numpy as np
import cv2
import glob
import alignment
from alignment import compute_overlap
from alignment import align
SEQ_LENGTH = 3
WIDTH = 416
HEIGHT = 128
STEPSIZE = 1
INPUT_DIR = '/usr/local/google/home/anelia/struct2depth/KITTI_FULL/kitti-raw-uncompressed'
OUTPUT_DIR = '/usr/local/google/home/anelia/struct2depth/KITTI_procesed/'
def get_line(file, start):
file = open(file, 'r')
lines = file.readlines()
lines = [line.rstrip() for line in lines]
ret = None
for line in lines:
nline = line.split(': ')
if nline[0]==start:
ret = nline[1].split(' ')
ret = np.array([float(r) for r in ret], dtype=float)
ret = ret.reshape((3,4))[0:3, 0:3]
break
file.close()
return ret
def crop(img, segimg, fx, fy, cx, cy):
# Perform center cropping, preserving 50% vertically.
middle_perc = 0.50
left = 1-middle_perc
half = left/2
a = img[int(img.shape[0]*(half)):int(img.shape[0]*(1-half)), :]
aseg = segimg[int(segimg.shape[0]*(half)):int(segimg.shape[0]*(1-half)), :]
cy /= (1/middle_perc)
# Resize to match target height while preserving aspect ratio.
wdt = int((128*a.shape[1]/a.shape[0]))
x_scaling = float(wdt)/a.shape[1]
y_scaling = 128.0/a.shape[0]
b = cv2.resize(a, (wdt, 128))
bseg = cv2.resize(aseg, (wdt, 128))
# Adjust intrinsics.
fx*=x_scaling
fy*=y_scaling
cx*=x_scaling
cy*=y_scaling
# Perform center cropping horizontally.
remain = b.shape[1] - 416
cx /= (b.shape[1]/416)
c = b[:, int(remain/2):b.shape[1]-int(remain/2)]
cseg = bseg[:, int(remain/2):b.shape[1]-int(remain/2)]
return c, cseg, fx, fy, cx, cy
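# crop() above keeps the middle 50% of the rows, resizes to height 128 while
# preserving the aspect ratio, center-crops to width 416 and rescales the
# intrinsics to the new frame. A minimal, illustrative sketch (not part of the
# original pipeline); the image size and intrinsics below are made-up values:
def _example_crop():
  """Runs crop() on a synthetic 1024x2048 frame and returns the new geometry."""
  fake_img = np.zeros((1024, 2048, 3), dtype=np.uint8)
  fake_seg = np.zeros((1024, 2048, 3), dtype=np.uint8)
  cropped, _, fx, fy, cx, cy = crop(fake_img, fake_seg,
                                    fx=2262.5, fy=2262.5, cx=1024.0, cy=512.0)
  # cropped.shape == (128, 416, 3); fx, fy, cx, cy now refer to that frame.
  return cropped.shape, (fx, fy, cx, cy)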
def run_all():
global OUTPUT_DIR  # OUTPUT_DIR is reassigned below, so declare it global.
ct = 0
if not OUTPUT_DIR.endswith('/'):
OUTPUT_DIR = OUTPUT_DIR + '/'
for d in glob.glob(INPUT_DIR + '/*/'):
date = d.split('/')[-2]
file_calibration = d + 'calib_cam_to_cam.txt'
calib_raw = [get_line(file_calibration, 'P_rect_02'), get_line(file_calibration, 'P_rect_03')]
for d2 in glob.glob(d + '*/'):
seqname = d2.split('/')[-2]
print('Processing sequence', seqname)
for subfolder in ['image_02/data', 'image_03/data']:
ct = 1
seqname = d2.split('/')[-2] + subfolder.replace('image', '').replace('/data', '')
if not os.path.exists(OUTPUT_DIR + seqname):
os.mkdir(OUTPUT_DIR + seqname)
calib_camera = calib_raw[0] if subfolder=='image_02/data' else calib_raw[1]
folder = d2 + subfolder
files = glob.glob(folder + '/*.png')
files = [file for file in files if not 'disp' in file and not 'flip' in file and not 'seg' in file]
files = sorted(files)
for i in range(SEQ_LENGTH, len(files)+1, STEPSIZE):
imgnum = str(ct).zfill(10)
if os.path.exists(OUTPUT_DIR + seqname + '/' + imgnum + '.png'):
ct+=1
continue
big_img = np.zeros(shape=(HEIGHT, WIDTH*SEQ_LENGTH, 3))
wct = 0
for j in range(i-SEQ_LENGTH, i): # Collect frames for this sample.
img = cv2.imread(files[j])
ORIGINAL_HEIGHT, ORIGINAL_WIDTH, _ = img.shape
zoom_x = WIDTH/ORIGINAL_WIDTH
zoom_y = HEIGHT/ORIGINAL_HEIGHT
# Adjust intrinsics.
calib_current = calib_camera.copy()
calib_current[0, 0] *= zoom_x
calib_current[0, 2] *= zoom_x
calib_current[1, 1] *= zoom_y
calib_current[1, 2] *= zoom_y
calib_representation = ','.join([str(c) for c in calib_current.flatten()])
img = cv2.resize(img, (WIDTH, HEIGHT))
big_img[:,wct*WIDTH:(wct+1)*WIDTH] = img
wct+=1
cv2.imwrite(OUTPUT_DIR + seqname + '/' + imgnum + '.png', big_img)
f = open(OUTPUT_DIR + seqname + '/' + imgnum + '_cam.txt', 'w')
f.write(calib_representation)
f.close()
ct+=1
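# Each <image_id>_cam.txt written above stores the adjusted 3x3 intrinsic
# matrix of the resized frame as nine comma-separated values in row-major
# order. A minimal, illustrative sketch (not part of the original pipeline)
# of reading one back; the helper name is an editor assumption:
def _example_read_cam_file(cam_path):
  """Parses a *_cam.txt file into a 3x3 numpy intrinsics matrix."""
  with open(cam_path, 'r') as f:
    values = [float(v) for v in f.read().strip().split(',')]
  return np.array(values, dtype=np.float32).reshape((3, 3))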
def main(_):
run_all()
if __name__ == '__main__':
app.run(main)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs struct2depth at inference. Produces depth estimates, ego-motion and object motion."""
# Example usage:
#
# python inference.py \
# --input_dir ~/struct2depth/kitti-raw-uncompressed/ \
# --output_dir ~/struct2depth/output \
# --model_ckpt ~/struct2depth/model/model-199160 \
# --file_extension png \
# --depth \
# --egomotion true
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import app
from absl import flags
from absl import logging
#import matplotlib.pyplot as plt
import model
import numpy as np
import fnmatch
from functools import reduce  # reduce is a builtin only on Python 2; used in mask_image_stack.
import tensorflow as tf
import nets
import util
gfile = tf.gfile
# CMAP = 'plasma'
INFERENCE_MODE_SINGLE = 'single' # Take plain single-frame input.
INFERENCE_MODE_TRIPLETS = 'triplets' # Take image triplets as input.
# For KITTI, we just resize input images and do not perform cropping. For
# Cityscapes, the car hood and additional image content have been cropped in
# order to fit the aspect ratio and to remove static content from the images.
# The same cropping has to be applied at inference time.
INFERENCE_CROP_NONE = 'none'
INFERENCE_CROP_CITYSCAPES = 'cityscapes'
flags.DEFINE_string('output_dir', None, 'Directory to store predictions.')
flags.DEFINE_string('file_extension', 'png', 'Image data file extension of '
'files provided with input_dir. Also determines the output '
'file format of depth prediction images.')
flags.DEFINE_bool('depth', True, 'Determines if the depth prediction network '
'should be executed and its predictions be saved.')
flags.DEFINE_bool('egomotion', False, 'Determines if the egomotion prediction '
'network should be executed and its predictions be saved. If '
'inference is run in single inference mode, it is assumed '
'that files in the same directory belong in the same '
'sequence, and sorting them alphabetically establishes the '
'right temporal order.')
flags.DEFINE_string('model_ckpt', None, 'Model checkpoint to evaluate.')
flags.DEFINE_string('input_dir', None, 'Directory containing image files to '
'evaluate. This crawls recursively for images in the '
'directory, mirroring relative subdirectory structures '
'into the output directory.')
flags.DEFINE_string('input_list_file', None, 'Text file containing paths to '
'image files to process. Paths should be relative with '
'respect to the list file location. Relative path '
'structures will be mirrored in the output directory.')
flags.DEFINE_integer('batch_size', 1, 'The size of a sample batch')
flags.DEFINE_integer('img_height', 128, 'Input frame height.')
flags.DEFINE_integer('img_width', 416, 'Input frame width.')
flags.DEFINE_integer('seq_length', 3, 'Number of frames in sequence.')
flags.DEFINE_enum('architecture', nets.RESNET, nets.ARCHITECTURES,
'Defines the architecture to use for the depth prediction '
'network. Defaults to ResNet-based encoder and accompanying '
'decoder.')
flags.DEFINE_boolean('imagenet_norm', True, 'Whether to normalize the input '
'images channel-wise so that they match the distribution '
'most ImageNet-models were trained on.')
flags.DEFINE_bool('use_skip', True, 'Whether to use skip connections in the '
'encoder-decoder architecture.')
flags.DEFINE_bool('joint_encoder', False, 'Whether to share parameters '
'between the depth and egomotion networks by using a joint '
'encoder architecture. The egomotion network is then '
'operating only on the hidden representation provided by the '
'joint encoder.')
flags.DEFINE_bool('shuffle', False, 'Whether to shuffle the order in which '
'images are processed.')
flags.DEFINE_bool('flip', False, 'Whether images should be flipped as well as '
'resulting predictions (for test-time augmentation). This '
'currently applies to the depth network only.')
flags.DEFINE_enum('inference_mode', INFERENCE_MODE_SINGLE,
[INFERENCE_MODE_SINGLE,
INFERENCE_MODE_TRIPLETS],
'Whether to use triplet mode for inference, which accepts '
'triplets instead of single frames.')
flags.DEFINE_enum('inference_crop', INFERENCE_CROP_NONE,
[INFERENCE_CROP_NONE,
INFERENCE_CROP_CITYSCAPES],
'Whether to apply a Cityscapes-specific crop on the input '
'images first before running inference.')
flags.DEFINE_bool('use_masks', False, 'Whether to mask out potentially '
'moving objects when feeding image input to the egomotion '
'network. This might improve odometry results when using '
'a motion model. For this, pre-computed segmentation '
'masks have to be available for every image, with the '
'background being zero.')
FLAGS = flags.FLAGS
flags.mark_flag_as_required('output_dir')
flags.mark_flag_as_required('model_ckpt')
def _run_inference(output_dir=None,
file_extension='png',
depth=True,
egomotion=False,
model_ckpt=None,
input_dir=None,
input_list_file=None,
batch_size=1,
img_height=128,
img_width=416,
seq_length=3,
architecture=nets.RESNET,
imagenet_norm=True,
use_skip=True,
joint_encoder=True,
shuffle=False,
flip_for_depth=False,
inference_mode=INFERENCE_MODE_SINGLE,
inference_crop=INFERENCE_CROP_NONE,
use_masks=False):
"""Runs inference. Refer to flags in inference.py for details."""
inference_model = model.Model(is_training=False,
batch_size=batch_size,
img_height=img_height,
img_width=img_width,
seq_length=seq_length,
architecture=architecture,
imagenet_norm=imagenet_norm,
use_skip=use_skip,
joint_encoder=joint_encoder)
vars_to_restore = util.get_vars_to_save_and_restore(model_ckpt)
saver = tf.train.Saver(vars_to_restore)
sv = tf.train.Supervisor(logdir='/tmp/', saver=None)
with sv.managed_session() as sess:
saver.restore(sess, model_ckpt)
if not gfile.Exists(output_dir):
gfile.MakeDirs(output_dir)
logging.info('Predictions will be saved in %s.', output_dir)
# Collect all images to run inference on.
im_files, basepath_in = collect_input_images(input_dir, input_list_file,
file_extension)
if shuffle:
logging.info('Shuffling data...')
np.random.shuffle(im_files)
logging.info('Running inference on %d files.', len(im_files))
# Create missing output folders and pre-compute target directories.
output_dirs = create_output_dirs(im_files, basepath_in, output_dir)
# Run depth prediction network.
if depth:
im_batch = []
for i in range(len(im_files)):
if i % 100 == 0:
logging.info('%s of %s files processed.', i, len(im_files))
# Read image and run inference.
if inference_mode == INFERENCE_MODE_SINGLE:
if inference_crop == INFERENCE_CROP_NONE:
im = util.load_image(im_files[i], resize=(img_width, img_height))
elif inference_crop == INFERENCE_CROP_CITYSCAPES:
im = util.crop_cityscapes(util.load_image(im_files[i]),
resize=(img_width, img_height))
elif inference_mode == INFERENCE_MODE_TRIPLETS:
im = util.load_image(im_files[i], resize=(img_width * 3, img_height))
im = im[:, img_width:img_width*2]
if flip_for_depth:
im = np.flip(im, axis=1)
im_batch.append(im)
if len(im_batch) == batch_size or i == len(im_files) - 1:
# Call inference on batch.
num_real = len(im_batch)  # The last batch may contain fewer real images.
for _ in range(batch_size - len(im_batch)): # Fill up batch.
im_batch.append(np.zeros(shape=(img_height, img_width, 3),
dtype=np.float32))
im_batch = np.stack(im_batch, axis=0)
est_depth = inference_model.inference_depth(im_batch, sess)
if flip_for_depth:
est_depth = np.flip(est_depth, axis=2)
im_batch = np.flip(im_batch, axis=2)
for j in range(num_real):  # Only save outputs for real (non-padded) images.
color_map = util.normalize_depth_for_display(
np.squeeze(est_depth[j]))
visualization = np.concatenate((im_batch[j], color_map), axis=0)
# Save raw prediction and color visualization. Extract filename
# without extension from full path: e.g. path/to/input_dir/folder1/
# file1.png -> file1
k = i - num_real + 1 + j
filename_root = os.path.splitext(os.path.basename(im_files[k]))[0]
pref = '_flip' if flip_for_depth else ''
output_raw = os.path.join(
output_dirs[k], filename_root + pref + '.npy')
output_vis = os.path.join(
output_dirs[k], filename_root + pref + '.png')
with gfile.Open(output_raw, 'wb') as f:
np.save(f, est_depth[j])
util.save_image(output_vis, visualization, file_extension)
im_batch = []
# Run egomotion network.
if egomotion:
if inference_mode == INFERENCE_MODE_SINGLE:
# Run regular egomotion inference loop.
input_image_seq = []
input_seg_seq = []
current_sequence_dir = None
current_output_handle = None
for i in range(len(im_files)):
sequence_dir = os.path.dirname(im_files[i])
if sequence_dir != current_sequence_dir:
# Assume start of a new sequence, since this image lies in a
# different directory than the previous ones.
# Clear egomotion input buffer.
output_filepath = os.path.join(output_dirs[i], 'egomotion.txt')
if current_output_handle is not None:
current_output_handle.close()
current_sequence_dir = sequence_dir
logging.info('Writing egomotion sequence to %s.', output_filepath)
current_output_handle = gfile.Open(output_filepath, 'w')
input_image_seq = []
input_seg_seq = []
im = util.load_image(im_files[i], resize=(img_width, img_height))
input_image_seq.append(im)
if use_masks:
im_seg_path = im_files[i].replace('.%s' % file_extension,
'-seg.%s' % file_extension)
if not gfile.Exists(im_seg_path):
raise ValueError('No segmentation mask %s has been found for '
'image %s. If none are available, disable '
'use_masks.' % (im_seg_path, im_files[i]))
input_seg_seq.append(util.load_image(im_seg_path,
resize=(img_width, img_height),
interpolation='nn'))
if len(input_image_seq) < seq_length: # Buffer not filled yet.
continue
if len(input_image_seq) > seq_length: # Remove oldest entry.
del input_image_seq[0]
if use_masks:
del input_seg_seq[0]
input_image_stack = np.concatenate(input_image_seq, axis=2)
input_image_stack = np.expand_dims(input_image_stack, axis=0)
if use_masks:
input_image_stack = mask_image_stack(input_image_stack,
input_seg_seq)
est_egomotion = np.squeeze(inference_model.inference_egomotion(
input_image_stack, sess))
egomotion_str = []
for j in range(seq_length - 1):
egomotion_str.append(','.join([str(d) for d in est_egomotion[j]]))
current_output_handle.write(
str(i) + ' ' + ' '.join(egomotion_str) + '\n')
if current_output_handle is not None:
current_output_handle.close()
elif inference_mode == INFERENCE_MODE_TRIPLETS:
written_before = []
for i in range(len(im_files)):
im = util.load_image(im_files[i], resize=(img_width * 3, img_height))
input_image_stack = np.concatenate(
[im[:, :img_width], im[:, img_width:img_width*2],
im[:, img_width*2:]], axis=2)
input_image_stack = np.expand_dims(input_image_stack, axis=0)
if use_masks:
im_seg_path = im_files[i].replace('.%s' % file_extension,
'-seg.%s' % file_extension)
if not gfile.Exists(im_seg_path):
raise ValueError('No segmentation mask %s has been found for '
'image %s. If none are available, disable '
'use_masks.' % (im_seg_path, im_files[i]))
seg = util.load_image(im_seg_path,
resize=(img_width * 3, img_height),
interpolation='nn')
input_seg_seq = [seg[:, :img_width], seg[:, img_width:img_width*2],
seg[:, img_width*2:]]
input_image_stack = mask_image_stack(input_image_stack,
input_seg_seq)
est_egomotion = inference_model.inference_egomotion(
input_image_stack, sess)
est_egomotion = np.squeeze(est_egomotion)
egomotion_1_2 = ','.join([str(d) for d in est_egomotion[0]])
egomotion_2_3 = ','.join([str(d) for d in est_egomotion[1]])
output_filepath = os.path.join(output_dirs[i], 'egomotion.txt')
file_mode = 'w' if output_filepath not in written_before else 'a'
with gfile.Open(output_filepath, file_mode) as current_output_handle:
current_output_handle.write(str(i) + ' ' + egomotion_1_2 + ' ' +
egomotion_2_3 + '\n')
written_before.append(output_filepath)
logging.info('Done.')
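# Each line of the egomotion.txt files written above has the form
# "<frame_index> <tx,ty,tz,rx,ry,rz> ...", with one comma-separated 6-vector
# per frame pair of the sequence. A minimal, illustrative sketch (not part of
# the original pipeline) of parsing such a file; the helper name and return
# format are editor assumptions:
def _example_read_egomotion_file(path):
  """Returns (frame_index, array of shape [seq_length - 1, 6]) tuples."""
  results = []
  with gfile.Open(path, 'r') as f:
    for line in f:
      parts = line.strip().split(' ')
      vectors = np.array(
          [[float(v) for v in p.split(',')] for p in parts[1:]])
      results.append((int(parts[0]), vectors))
  return results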
def mask_image_stack(input_image_stack, input_seg_seq):
"""Masks out moving image contents by using the segmentation masks provided.
This can lead to better odometry accuracy for motion models, but is optional
to use. Is only called if use_masks is enabled.
Args:
input_image_stack: The input image stack of shape (1, H, W, seq_length).
input_seg_seq: List of segmentation masks with seq_length elements of shape
(H, W, C) for some number of channels C.
Returns:
Input image stack with detections provided by segmentation mask removed.
"""
background = [mask == 0 for mask in input_seg_seq]
background = reduce(lambda m1, m2: m1 & m2, background)
# If masks are RGB, assume all channels to be the same. Reduce to the first.
if background.ndim == 3 and background.shape[2] > 1:
background = np.expand_dims(background[:, :, 0], axis=2)
elif background.ndim == 2: # Expand to (H, W, 1).
background = np.expand_dims(background, axis=2)
# background is now of shape (H, W, 1).
background_stack = np.tile(background, [1, 1, input_image_stack.shape[3]])
return np.multiply(input_image_stack, background_stack)
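# A minimal, illustrative sketch (not part of the original pipeline, using
# synthetic data) of what mask_image_stack() does: every pixel that any of the
# per-frame masks labels as a moving object is zeroed out in all frames.
def _example_mask_image_stack():
  """Masks a random 3-frame stack with one fake object box per frame."""
  h, w, seq_length = 128, 416, 3
  image_stack = np.random.rand(1, h, w, seq_length * 3).astype(np.float32)
  seg_seq = []
  for _ in range(seq_length):
    seg = np.zeros((h, w, 3), dtype=np.uint8)  # 0 denotes background.
    seg[40:80, 100:200, :] = 1                 # Fake moving-object instance.
    seg_seq.append(seg)
  masked = mask_image_stack(image_stack, seg_seq)
  return masked.shape  # (1, 128, 416, 9); the box region is zero everywhere.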
def collect_input_images(input_dir, input_list_file, file_extension):
"""Collects all input images that are to be processed."""
if input_dir is not None:
im_files = _recursive_glob(input_dir, '*.' + file_extension)
basepath_in = os.path.normpath(input_dir)
elif input_list_file is not None:
im_files = util.read_text_lines(input_list_file)
basepath_in = os.path.dirname(input_list_file)
im_files = [os.path.join(basepath_in, f) for f in im_files]
im_files = [f for f in im_files if 'disp' not in f and '-seg' not in f and
'-fseg' not in f and '-flip' not in f]
return sorted(im_files), basepath_in
def create_output_dirs(im_files, basepath_in, output_dir):
"""Creates required directories, and returns output dir for each file."""
output_dirs = []
for i in range(len(im_files)):
relative_folder_in = os.path.relpath(
os.path.dirname(im_files[i]), basepath_in)
absolute_folder_out = os.path.join(output_dir, relative_folder_in)
if not gfile.IsDirectory(absolute_folder_out):
gfile.MakeDirs(absolute_folder_out)
output_dirs.append(absolute_folder_out)
return output_dirs
def _recursive_glob(treeroot, pattern):
results = []
for base, _, files in os.walk(treeroot):
files = fnmatch.filter(files, pattern)
results.extend(os.path.join(base, f) for f in files)
return results
def main(_):
if (FLAGS.input_dir is None) == (FLAGS.input_list_file is None):
raise ValueError('Exactly one of either input_dir or input_list_file has '
'to be provided.')
if not FLAGS.depth and not FLAGS.egomotion:
raise ValueError('At least one of the depth and egomotion network has to '
'be called for inference.')
if (FLAGS.inference_mode == INFERENCE_MODE_TRIPLETS and
FLAGS.seq_length != 3):
raise ValueError('For sequence lengths other than three, single inference '
'mode has to be used.')
_run_inference(output_dir=FLAGS.output_dir,
file_extension=FLAGS.file_extension,
depth=FLAGS.depth,
egomotion=FLAGS.egomotion,
model_ckpt=FLAGS.model_ckpt,
input_dir=FLAGS.input_dir,
input_list_file=FLAGS.input_list_file,
batch_size=FLAGS.batch_size,
img_height=FLAGS.img_height,
img_width=FLAGS.img_width,
seq_length=FLAGS.seq_length,
architecture=FLAGS.architecture,
imagenet_norm=FLAGS.imagenet_norm,
use_skip=FLAGS.use_skip,
joint_encoder=FLAGS.joint_encoder,
shuffle=FLAGS.shuffle,
flip_for_depth=FLAGS.flip,
inference_mode=FLAGS.inference_mode,
inference_crop=FLAGS.inference_crop,
use_masks=FLAGS.use_masks)
if __name__ == '__main__':
app.run(main)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Build model for inference or training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import logging
import numpy as np
import tensorflow as tf
import nets
import project
import reader
import util
gfile = tf.gfile
slim = tf.contrib.slim
NUM_SCALES = 4
class Model(object):
"""Model code based on SfMLearner."""
def __init__(self,
data_dir=None,
file_extension='png',
is_training=True,
learning_rate=0.0002,
beta1=0.9,
reconstr_weight=0.85,
smooth_weight=0.05,
ssim_weight=0.15,
icp_weight=0.0,
batch_size=4,
img_height=128,
img_width=416,
seq_length=3,
architecture=nets.RESNET,
imagenet_norm=True,
weight_reg=0.05,
exhaustive_mode=False,
random_scale_crop=False,
flipping_mode=reader.FLIP_RANDOM,
random_color=True,
depth_upsampling=True,
depth_normalization=True,
compute_minimum_loss=True,
use_skip=True,
joint_encoder=True,
build_sum=True,
shuffle=True,
input_file='train',
handle_motion=False,
equal_weighting=False,
size_constraint_weight=0.0,
train_global_scale_var=True):
self.data_dir = data_dir
self.file_extension = file_extension
self.is_training = is_training
self.learning_rate = learning_rate
self.reconstr_weight = reconstr_weight
self.smooth_weight = smooth_weight
self.ssim_weight = ssim_weight
self.icp_weight = icp_weight
self.beta1 = beta1
self.batch_size = batch_size
self.img_height = img_height
self.img_width = img_width
self.seq_length = seq_length
self.architecture = architecture
self.imagenet_norm = imagenet_norm
self.weight_reg = weight_reg
self.exhaustive_mode = exhaustive_mode
self.random_scale_crop = random_scale_crop
self.flipping_mode = flipping_mode
self.random_color = random_color
self.depth_upsampling = depth_upsampling
self.depth_normalization = depth_normalization
self.compute_minimum_loss = compute_minimum_loss
self.use_skip = use_skip
self.joint_encoder = joint_encoder
self.build_sum = build_sum
self.shuffle = shuffle
self.input_file = input_file
self.handle_motion = handle_motion
self.equal_weighting = equal_weighting
self.size_constraint_weight = size_constraint_weight
self.train_global_scale_var = train_global_scale_var
logging.info('data_dir: %s', data_dir)
logging.info('file_extension: %s', file_extension)
logging.info('is_training: %s', is_training)
logging.info('learning_rate: %s', learning_rate)
logging.info('reconstr_weight: %s', reconstr_weight)
logging.info('smooth_weight: %s', smooth_weight)
logging.info('ssim_weight: %s', ssim_weight)
logging.info('icp_weight: %s', icp_weight)
logging.info('size_constraint_weight: %s', size_constraint_weight)
logging.info('beta1: %s', beta1)
logging.info('batch_size: %s', batch_size)
logging.info('img_height: %s', img_height)
logging.info('img_width: %s', img_width)
logging.info('seq_length: %s', seq_length)
logging.info('architecture: %s', architecture)
logging.info('imagenet_norm: %s', imagenet_norm)
logging.info('weight_reg: %s', weight_reg)
logging.info('exhaustive_mode: %s', exhaustive_mode)
logging.info('random_scale_crop: %s', random_scale_crop)
logging.info('flipping_mode: %s', flipping_mode)
logging.info('random_color: %s', random_color)
logging.info('depth_upsampling: %s', depth_upsampling)
logging.info('depth_normalization: %s', depth_normalization)
logging.info('compute_minimum_loss: %s', compute_minimum_loss)
logging.info('use_skip: %s', use_skip)
logging.info('joint_encoder: %s', joint_encoder)
logging.info('build_sum: %s', build_sum)
logging.info('shuffle: %s', shuffle)
logging.info('input_file: %s', input_file)
logging.info('handle_motion: %s', handle_motion)
logging.info('equal_weighting: %s', equal_weighting)
logging.info('train_global_scale_var: %s', train_global_scale_var)
if self.size_constraint_weight > 0 or not is_training:
self.global_scale_var = tf.Variable(
0.1, name='global_scale_var',
trainable=self.is_training and train_global_scale_var,
dtype=tf.float32,
constraint=lambda x: tf.clip_by_value(x, 0, np.infty))
if self.is_training:
self.reader = reader.DataReader(self.data_dir, self.batch_size,
self.img_height, self.img_width,
self.seq_length, NUM_SCALES,
self.file_extension,
self.random_scale_crop,
self.flipping_mode,
self.random_color,
self.imagenet_norm,
self.shuffle,
self.input_file)
self.build_train_graph()
else:
self.build_depth_test_graph()
self.build_egomotion_test_graph()
if self.handle_motion:
self.build_objectmotion_test_graph()
# At this point, the model is ready. Print some info on model params.
util.count_parameters()
def build_train_graph(self):
self.build_inference_for_training()
self.build_loss()
self.build_train_op()
if self.build_sum:
self.build_summaries()
def build_inference_for_training(self):
"""Invokes depth and ego-motion networks and computes clouds if needed."""
(self.image_stack, self.image_stack_norm, self.seg_stack,
self.intrinsic_mat, self.intrinsic_mat_inv) = self.reader.read_data()
with tf.variable_scope('depth_prediction'):
# Organized by ...[i][scale]. Note that the order is flipped in
# variables in build_loss() below.
self.disp = {}
self.depth = {}
self.depth_upsampled = {}
self.inf_loss = 0.0
# Organized by [i].
disp_bottlenecks = [None] * self.seq_length
if self.icp_weight > 0:
self.cloud = {}
for i in range(self.seq_length):
image = self.image_stack_norm[:, :, :, 3 * i:3 * (i + 1)]
multiscale_disps_i, disp_bottlenecks[i] = nets.disp_net(
self.architecture, image, self.use_skip,
self.weight_reg, True)
multiscale_depths_i = [1.0 / d for d in multiscale_disps_i]
self.disp[i] = multiscale_disps_i
self.depth[i] = multiscale_depths_i
if self.depth_upsampling:
self.depth_upsampled[i] = []
# Upsample low-resolution depth maps using differentiable bilinear
# interpolation.
for s in range(len(multiscale_depths_i)):
self.depth_upsampled[i].append(tf.image.resize_bilinear(
multiscale_depths_i[s], [self.img_height, self.img_width],
align_corners=True))
if self.icp_weight > 0:
multiscale_clouds_i = [
project.get_cloud(d,
self.intrinsic_mat_inv[:, s, :, :],
name='cloud%d_%d' % (s, i))
for (s, d) in enumerate(multiscale_depths_i)
]
self.cloud[i] = multiscale_clouds_i
# Reuse the same depth graph for all images.
tf.get_variable_scope().reuse_variables()
if self.handle_motion:
# Define egomotion network. This network can see the whole scene except
# for any moving objects as indicated by the provided segmentation masks.
# To avoid the network getting clues about motion by tracking those masks,
# we define the segmentation mask as the temporal union of all masks.
with tf.variable_scope('egomotion_prediction'):
base_input = self.image_stack_norm # (B, H, W, 9)
seg_input = self.seg_stack # (B, H, W, 9)
ref_zero = tf.constant(0, dtype=tf.uint8)
# Motion model is currently defined for three-frame sequences.
object_mask1 = tf.equal(seg_input[:, :, :, 0], ref_zero)
object_mask2 = tf.equal(seg_input[:, :, :, 3], ref_zero)
object_mask3 = tf.equal(seg_input[:, :, :, 6], ref_zero)
mask_complete = tf.expand_dims(tf.logical_and( # (B, H, W, 1)
tf.logical_and(object_mask1, object_mask2), object_mask3), axis=3)
mask_complete = tf.tile(mask_complete, (1, 1, 1, 9)) # (B, H, W, 9)
# Now mask out base_input.
self.mask_complete = tf.to_float(mask_complete)
self.base_input_masked = base_input * self.mask_complete
self.egomotion = nets.egomotion_net(
image_stack=self.base_input_masked,
disp_bottleneck_stack=None,
joint_encoder=False,
seq_length=self.seq_length,
weight_reg=self.weight_reg)
# Define object motion network for refinement. This network only sees
# one object at a time over the whole sequence, and tries to estimate its
# motion. The sequence of images are the respective warped frames.
# For each scale, contains batch_size elements of shape (N, 2, 6).
self.object_transforms = {}
# For each scale, contains batch_size elements of shape (N, H, W, 9).
self.object_masks = {}
self.object_masks_warped = {}
# For each scale, contains batch_size elements of size N.
self.object_ids = {}
self.egomotions_seq = {}
self.warped_seq = {}
self.inputs_objectmotion_net = {}
with tf.variable_scope('objectmotion_prediction'):
# First, warp raw images according to overall egomotion.
for s in range(NUM_SCALES):
self.warped_seq[s] = []
self.egomotions_seq[s] = []
for source_index in range(self.seq_length):
egomotion_mat_i_1 = project.get_transform_mat(
self.egomotion, source_index, 1)
warped_image_i_1, _ = (
project.inverse_warp(
self.image_stack[
:, :, :, source_index*3:(source_index+1)*3],
self.depth_upsampled[1][s],
egomotion_mat_i_1,
self.intrinsic_mat[:, 0, :, :],
self.intrinsic_mat_inv[:, 0, :, :]))
self.warped_seq[s].append(warped_image_i_1)
self.egomotions_seq[s].append(egomotion_mat_i_1)
# Second, for every object in the segmentation mask, take its mask and
# warp it according to the egomotion estimate. Then put a threshold to
# binarize the warped result. Use this mask to mask out background and
# other objects, and pass the filtered image to the object motion
# network.
self.object_transforms[s] = []
self.object_masks[s] = []
self.object_ids[s] = []
self.object_masks_warped[s] = []
self.inputs_objectmotion_net[s] = {}
for i in range(self.batch_size):
seg_sequence = self.seg_stack[i] # (H, W, 9=3*3)
object_ids = tf.unique(tf.reshape(seg_sequence, [-1]))[0]
self.object_ids[s].append(object_ids)
color_stack = []
mask_stack = []
mask_stack_warped = []
for j in range(self.seq_length):
current_image = self.warped_seq[s][j][i] # (H, W, 3)
current_seg = seg_sequence[:, :, j * 3:(j+1) * 3] # (H, W, 3)
def process_obj_mask_warp(obj_id):
"""Performs warping of the individual object masks."""
obj_mask = tf.to_float(tf.equal(current_seg, obj_id))
# Warp obj_mask according to overall egomotion.
obj_mask_warped, _ = (
project.inverse_warp(
tf.expand_dims(obj_mask, axis=0),
# Middle frame, highest scale, batch element i:
tf.expand_dims(self.depth_upsampled[1][s][i], axis=0),
# Matrix for warping j into middle frame, batch elem. i:
tf.expand_dims(self.egomotions_seq[s][j][i], axis=0),
tf.expand_dims(self.intrinsic_mat[i, 0, :, :], axis=0),
tf.expand_dims(self.intrinsic_mat_inv[i, 0, :, :],
axis=0)))
obj_mask_warped = tf.squeeze(obj_mask_warped)
obj_mask_binarized = tf.greater( # Threshold to binarize mask.
obj_mask_warped, tf.constant(0.5))
return tf.to_float(obj_mask_binarized)
def process_obj_mask(obj_id):
"""Returns the individual object masks separately."""
return tf.to_float(tf.equal(current_seg, obj_id))
object_masks = tf.map_fn( # (N, H, W, 3)
process_obj_mask, object_ids, dtype=tf.float32)
if self.size_constraint_weight > 0:
# The object segmentation masks are all in object_masks.
# We need to measure the height of every of them, and get the
# approximate distance.
# self.depth_upsampled of shape (seq_length, scale, B, H, W).
depth_pred = self.depth_upsampled[j][s][i] # (H, W)
def get_losses(obj_mask):
"""Get motion constraint loss."""
# Find height of segment.
coords = tf.where(tf.greater( # Shape (num_true, 2=yx)
obj_mask[:, :, 0], tf.constant(0.5, dtype=tf.float32)))
y_max = tf.reduce_max(coords[:, 0])
y_min = tf.reduce_min(coords[:, 0])
seg_height = y_max - y_min
f_y = self.intrinsic_mat[i, 0, 1, 1]
approx_depth = ((f_y * self.global_scale_var) /
tf.to_float(seg_height))
reference_pred = tf.boolean_mask(
depth_pred, tf.greater(
tf.reshape(obj_mask[:, :, 0],
(self.img_height, self.img_width, 1)),
tf.constant(0.5, dtype=tf.float32)))
# Establish loss on approx_depth, a scalar, and
# reference_pred, our dense prediction. Normalize both to
# prevent degenerative depth shrinking.
global_mean_depth_pred = tf.reduce_mean(depth_pred)
reference_pred /= global_mean_depth_pred
approx_depth /= global_mean_depth_pred
spatial_err = tf.abs(reference_pred - approx_depth)
mean_spatial_err = tf.reduce_mean(spatial_err)
return mean_spatial_err
losses = tf.map_fn(
get_losses, object_masks, dtype=tf.float32)
self.inf_loss += tf.reduce_mean(losses)
object_masks_warped = tf.map_fn( # (N, H, W, 3)
process_obj_mask_warp, object_ids, dtype=tf.float32)
filtered_images = tf.map_fn(
lambda mask: current_image * mask, object_masks_warped,
dtype=tf.float32) # (N, H, W, 3)
color_stack.append(filtered_images)
mask_stack.append(object_masks)
mask_stack_warped.append(object_masks_warped)
# For this batch-element, if there are N moving objects,
# color_stack, mask_stack and mask_stack_warped contain both
# seq_length elements of shape (N, H, W, 3).
# We can now concatenate them on the last axis, creating a tensor of
# (N, H, W, 3*3 = 9), and, assuming N does not get too large so that
# we have enough memory, pass them in a single batch to the object
# motion network.
mask_stack = tf.concat(mask_stack, axis=3) # (N, H, W, 9)
mask_stack_warped = tf.concat(mask_stack_warped, axis=3)
color_stack = tf.concat(color_stack, axis=3) # (N, H, W, 9)
all_transforms = nets.objectmotion_net(
# We cut the gradient flow here as the object motion gradient
# should have no say in how the egomotion network behaves.
# One could try just stopping the gradient for egomotion, but
# not for the depth prediction network.
image_stack=tf.stop_gradient(color_stack),
disp_bottleneck_stack=None,
joint_encoder=False, # Joint encoder not supported.
seq_length=self.seq_length,
weight_reg=self.weight_reg)
# all_transforms of shape (N, 2, 6).
self.object_transforms[s].append(all_transforms)
self.object_masks[s].append(mask_stack)
self.object_masks_warped[s].append(mask_stack_warped)
self.inputs_objectmotion_net[s][i] = color_stack
tf.get_variable_scope().reuse_variables()
else:
# Don't handle motion, classic model formulation.
with tf.name_scope('egomotion_prediction'):
if self.joint_encoder:
# Re-arrange disp_bottleneck_stack to be of shape
# [B, h_hid, w_hid, c_hid * seq_length]. Currently, it is a list with
# seq_length elements, each of dimension [B, h_hid, w_hid, c_hid].
disp_bottleneck_stack = tf.concat(disp_bottlenecks, axis=3)
else:
disp_bottleneck_stack = None
self.egomotion = nets.egomotion_net(
image_stack=self.image_stack_norm,
disp_bottleneck_stack=disp_bottleneck_stack,
joint_encoder=self.joint_encoder,
seq_length=self.seq_length,
weight_reg=self.weight_reg)
def build_loss(self):
"""Adds ops for computing loss."""
with tf.name_scope('compute_loss'):
self.reconstr_loss = 0
self.smooth_loss = 0
self.ssim_loss = 0
self.icp_transform_loss = 0
self.icp_residual_loss = 0
# self.images is organized by ...[scale][B, h, w, seq_len * 3].
self.images = [None for _ in range(NUM_SCALES)]
# Following nested lists are organized by ...[scale][source-target].
self.warped_image = [{} for _ in range(NUM_SCALES)]
self.warp_mask = [{} for _ in range(NUM_SCALES)]
self.warp_error = [{} for _ in range(NUM_SCALES)]
self.ssim_error = [{} for _ in range(NUM_SCALES)]
self.icp_transform = [{} for _ in range(NUM_SCALES)]
self.icp_residual = [{} for _ in range(NUM_SCALES)]
self.middle_frame_index = util.get_seq_middle(self.seq_length)
# Compute losses at each scale.
for s in range(NUM_SCALES):
# Scale image stack.
if s == 0: # Just as a precaution. TF often has interpolation bugs.
self.images[s] = self.image_stack
else:
height_s = int(self.img_height / (2**s))
width_s = int(self.img_width / (2**s))
self.images[s] = tf.image.resize_bilinear(
self.image_stack, [height_s, width_s], align_corners=True)
# Smoothness.
if self.smooth_weight > 0:
for i in range(self.seq_length):
# When computing minimum loss, use the depth map from the middle
# frame only.
if not self.compute_minimum_loss or i == self.middle_frame_index:
disp_smoothing = self.disp[i][s]
if self.depth_normalization:
# Perform depth normalization, dividing by the mean.
mean_disp = tf.reduce_mean(disp_smoothing, axis=[1, 2, 3],
keep_dims=True)
disp_input = disp_smoothing / mean_disp
else:
disp_input = disp_smoothing
scaling_f = (1.0 if self.equal_weighting else 1.0 / (2**s))
self.smooth_loss += scaling_f * self.depth_smoothness(
disp_input, self.images[s][:, :, :, 3 * i:3 * (i + 1)])
self.debug_all_warped_image_batches = []
for i in range(self.seq_length):
for j in range(self.seq_length):
if i == j:
continue
# When computing minimum loss, only consider the middle frame as
# target.
if self.compute_minimum_loss and j != self.middle_frame_index:
continue
# We only consider adjacent frames, unless either
# compute_minimum_loss is on (where the middle frame is matched with
# all other frames) or exhaustive_mode is on (where all frames are
# matched with each other).
if (not self.compute_minimum_loss and not self.exhaustive_mode and
abs(i - j) != 1):
continue
selected_scale = 0 if self.depth_upsampling else s
source = self.images[selected_scale][:, :, :, 3 * i:3 * (i + 1)]
target = self.images[selected_scale][:, :, :, 3 * j:3 * (j + 1)]
if self.depth_upsampling:
target_depth = self.depth_upsampled[j][s]
else:
target_depth = self.depth[j][s]
key = '%d-%d' % (i, j)
if self.handle_motion:
# self.seg_stack of shape (B, H, W, 9).
# target_depth corresponds to middle frame, of shape (B, H, W, 1).
# Now incorporate the other warping results, performed according
# to the object motion network's predictions.
# self.object_masks batch_size elements of (N, H, W, 9).
# self.object_masks_warped batch_size elements of (N, H, W, 9).
# self.object_transforms batch_size elements of (N, 2, 6).
self.all_batches = []
for batch_s in range(self.batch_size):
# To warp i into j, first take the base warping (this is the
# full image i warped into j using only the egomotion estimate).
base_warping = self.warped_seq[s][i][batch_s]
transform_matrices_thisbatch = tf.map_fn(
lambda transform: project.get_transform_mat(
tf.expand_dims(transform, axis=0), i, j)[0],
self.object_transforms[0][batch_s])
def inverse_warp_wrapper(matrix):
"""Wrapper for inverse warping method."""
warp_image, _ = (
project.inverse_warp(
tf.expand_dims(base_warping, axis=0),
tf.expand_dims(target_depth[batch_s], axis=0),
tf.expand_dims(matrix, axis=0),
tf.expand_dims(self.intrinsic_mat[
batch_s, selected_scale, :, :], axis=0),
tf.expand_dims(self.intrinsic_mat_inv[
batch_s, selected_scale, :, :], axis=0)))
return warp_image
warped_images_thisbatch = tf.map_fn(
inverse_warp_wrapper, transform_matrices_thisbatch,
dtype=tf.float32)
warped_images_thisbatch = warped_images_thisbatch[:, 0, :, :, :]
# warped_images_thisbatch is now of shape (N, H, W, 9).
# Combine warped frames into a single one, using the object
# masks. Result should be (1, 128, 416, 3).
# Essentially, we here want to sum them all up, filtered by the
# respective object masks.
mask_base_valid_source = tf.equal(
self.seg_stack[batch_s, :, :, i*3:(i+1)*3],
tf.constant(0, dtype=tf.uint8))
mask_base_valid_target = tf.equal(
self.seg_stack[batch_s, :, :, j*3:(j+1)*3],
tf.constant(0, dtype=tf.uint8))
mask_valid = tf.logical_and(
mask_base_valid_source, mask_base_valid_target)
self.base_warping = base_warping * tf.to_float(mask_valid)
background = tf.expand_dims(self.base_warping, axis=0)
def construct_const_filter_tensor(obj_id):
return tf.fill(
dims=[self.img_height, self.img_width, 3],
value=tf.sign(obj_id)) * tf.to_float(
tf.equal(self.seg_stack[batch_s, :, :, 3:6],
tf.cast(obj_id, dtype=tf.uint8)))
filter_tensor = tf.map_fn(
construct_const_filter_tensor,
tf.to_float(self.object_ids[s][batch_s]))
filter_tensor = tf.stack(filter_tensor, axis=0)
objects_to_add = tf.reduce_sum(
tf.multiply(warped_images_thisbatch, filter_tensor),
axis=0, keepdims=True)
combined = background + objects_to_add
self.all_batches.append(combined)
# Now of shape (B, 128, 416, 3).
self.warped_image[s][key] = tf.concat(self.all_batches, axis=0)
else:
# Don't handle motion, classic model formulation.
egomotion_mat_i_j = project.get_transform_mat(
self.egomotion, i, j)
# Inverse warp the source image to the target image frame for
# photometric consistency loss.
self.warped_image[s][key], self.warp_mask[s][key] = (
project.inverse_warp(
source,
target_depth,
egomotion_mat_i_j,
self.intrinsic_mat[:, selected_scale, :, :],
self.intrinsic_mat_inv[:, selected_scale, :, :]))
# Reconstruction loss.
self.warp_error[s][key] = tf.abs(self.warped_image[s][key] - target)
if not self.compute_minimum_loss:
self.reconstr_loss += tf.reduce_mean(
self.warp_error[s][key] * self.warp_mask[s][key])
# SSIM.
if self.ssim_weight > 0:
self.ssim_error[s][key] = self.ssim(self.warped_image[s][key],
target)
# TODO(rezama): This should be min_pool2d().
if not self.compute_minimum_loss:
ssim_mask = slim.avg_pool2d(self.warp_mask[s][key], 3, 1,
'VALID')
self.ssim_loss += tf.reduce_mean(
self.ssim_error[s][key] * ssim_mask)
# If the minimum loss should be computed, the loss calculation has been
# postponed until here.
if self.compute_minimum_loss:
for frame_index in range(self.middle_frame_index):
key1 = '%d-%d' % (frame_index, self.middle_frame_index)
key2 = '%d-%d' % (self.seq_length - frame_index - 1,
self.middle_frame_index)
logging.info('computing min error between %s and %s', key1, key2)
min_error = tf.minimum(self.warp_error[s][key1],
self.warp_error[s][key2])
self.reconstr_loss += tf.reduce_mean(min_error)
if self.ssim_weight > 0: # Also compute the minimum SSIM loss.
min_error_ssim = tf.minimum(self.ssim_error[s][key1],
self.ssim_error[s][key2])
self.ssim_loss += tf.reduce_mean(min_error_ssim)
# Build the total loss as composed of L1 reconstruction, SSIM, smoothing
# and object size constraint loss as appropriate.
self.reconstr_loss *= self.reconstr_weight
self.total_loss = self.reconstr_loss
if self.smooth_weight > 0:
self.smooth_loss *= self.smooth_weight
self.total_loss += self.smooth_loss
if self.ssim_weight > 0:
self.ssim_loss *= self.ssim_weight
self.total_loss += self.ssim_loss
if self.size_constraint_weight > 0:
self.inf_loss *= self.size_constraint_weight
self.total_loss += self.inf_loss
def gradient_x(self, img):
return img[:, :, :-1, :] - img[:, :, 1:, :]
def gradient_y(self, img):
return img[:, :-1, :, :] - img[:, 1:, :, :]
def depth_smoothness(self, depth, img):
"""Computes image-aware depth smoothness loss."""
depth_dx = self.gradient_x(depth)
depth_dy = self.gradient_y(depth)
image_dx = self.gradient_x(img)
image_dy = self.gradient_y(img)
weights_x = tf.exp(-tf.reduce_mean(tf.abs(image_dx), 3, keepdims=True))
weights_y = tf.exp(-tf.reduce_mean(tf.abs(image_dy), 3, keepdims=True))
smoothness_x = depth_dx * weights_x
smoothness_y = depth_dy * weights_y
return tf.reduce_mean(abs(smoothness_x)) + tf.reduce_mean(abs(smoothness_y))
def ssim(self, x, y):
"""Computes a differentiable structured image similarity measure."""
c1 = 0.01**2 # As defined in SSIM to stabilize div. by small denominator.
c2 = 0.03**2
mu_x = slim.avg_pool2d(x, 3, 1, 'VALID')
mu_y = slim.avg_pool2d(y, 3, 1, 'VALID')
sigma_x = slim.avg_pool2d(x**2, 3, 1, 'VALID') - mu_x**2
sigma_y = slim.avg_pool2d(y**2, 3, 1, 'VALID') - mu_y**2
sigma_xy = slim.avg_pool2d(x * y, 3, 1, 'VALID') - mu_x * mu_y
ssim_n = (2 * mu_x * mu_y + c1) * (2 * sigma_xy + c2)
ssim_d = (mu_x**2 + mu_y**2 + c1) * (sigma_x + sigma_y + c2)
ssim = ssim_n / ssim_d
return tf.clip_by_value((1 - ssim) / 2, 0, 1)
def build_train_op(self):
with tf.name_scope('train_op'):
optim = tf.train.AdamOptimizer(self.learning_rate, self.beta1)
self.train_op = slim.learning.create_train_op(self.total_loss, optim)
self.global_step = tf.Variable(0, name='global_step', trainable=False)
self.incr_global_step = tf.assign(
self.global_step, self.global_step + 1)
def build_summaries(self):
"""Adds scalar and image summaries for TensorBoard."""
tf.summary.scalar('total_loss', self.total_loss)
tf.summary.scalar('reconstr_loss', self.reconstr_loss)
if self.smooth_weight > 0:
tf.summary.scalar('smooth_loss', self.smooth_loss)
if self.ssim_weight > 0:
tf.summary.scalar('ssim_loss', self.ssim_loss)
if self.icp_weight > 0:
tf.summary.scalar('icp_transform_loss', self.icp_transform_loss)
tf.summary.scalar('icp_residual_loss', self.icp_residual_loss)
if self.size_constraint_weight > 0:
tf.summary.scalar('inf_loss', self.inf_loss)
tf.summary.histogram('global_scale_var', self.global_scale_var)
if self.handle_motion:
for s in range(NUM_SCALES):
for batch_s in range(self.batch_size):
whole_strip = tf.concat([self.warped_seq[s][0][batch_s],
self.warped_seq[s][1][batch_s],
self.warped_seq[s][2][batch_s]], axis=1)
tf.summary.image('base_warp_batch%s_scale%s' % (batch_s, s),
tf.expand_dims(whole_strip, axis=0))
whole_strip_input = tf.concat(
[self.inputs_objectmotion_net[s][batch_s][:, :, :, 0:3],
self.inputs_objectmotion_net[s][batch_s][:, :, :, 3:6],
self.inputs_objectmotion_net[s][batch_s][:, :, :, 6:9]], axis=2)
tf.summary.image('input_objectmotion_batch%s_scale%s' % (batch_s, s),
whole_strip_input) # (B, H, 3*W, 3)
for batch_s in range(self.batch_size):
whole_strip = tf.concat([self.base_input_masked[batch_s, :, :, 0:3],
self.base_input_masked[batch_s, :, :, 3:6],
self.base_input_masked[batch_s, :, :, 6:9]],
axis=1)
tf.summary.image('input_egomotion_batch%s' % batch_s,
tf.expand_dims(whole_strip, axis=0))
# Show transform predictions (of all objects).
for batch_s in range(self.batch_size):
for i in range(self.seq_length - 1):
# self.object_transforms contains batch_size elements of (N, 2, 6).
tf.summary.histogram('batch%d_tx%d' % (batch_s, i),
self.object_transforms[0][batch_s][:, i, 0])
tf.summary.histogram('batch%d_ty%d' % (batch_s, i),
self.object_transforms[0][batch_s][:, i, 1])
tf.summary.histogram('batch%d_tz%d' % (batch_s, i),
self.object_transforms[0][batch_s][:, i, 2])
tf.summary.histogram('batch%d_rx%d' % (batch_s, i),
self.object_transforms[0][batch_s][:, i, 3])
tf.summary.histogram('batch%d_ry%d' % (batch_s, i),
self.object_transforms[0][batch_s][:, i, 4])
tf.summary.histogram('batch%d_rz%d' % (batch_s, i),
self.object_transforms[0][batch_s][:, i, 5])
for i in range(self.seq_length - 1):
tf.summary.histogram('tx%d' % i, self.egomotion[:, i, 0])
tf.summary.histogram('ty%d' % i, self.egomotion[:, i, 1])
tf.summary.histogram('tz%d' % i, self.egomotion[:, i, 2])
tf.summary.histogram('rx%d' % i, self.egomotion[:, i, 3])
tf.summary.histogram('ry%d' % i, self.egomotion[:, i, 4])
tf.summary.histogram('rz%d' % i, self.egomotion[:, i, 5])
for s in range(NUM_SCALES):
for i in range(self.seq_length):
tf.summary.image('scale%d_image%d' % (s, i),
self.images[s][:, :, :, 3 * i:3 * (i + 1)])
if i in self.depth:
tf.summary.histogram('scale%d_depth%d' % (s, i), self.depth[i][s])
tf.summary.histogram('scale%d_disp%d' % (s, i), self.disp[i][s])
tf.summary.image('scale%d_disparity%d' % (s, i), self.disp[i][s])
for key in self.warped_image[s]:
tf.summary.image('scale%d_warped_image%s' % (s, key),
self.warped_image[s][key])
tf.summary.image('scale%d_warp_error%s' % (s, key),
self.warp_error[s][key])
if self.ssim_weight > 0:
tf.summary.image('scale%d_ssim_error%s' % (s, key),
self.ssim_error[s][key])
if self.icp_weight > 0:
tf.summary.image('scale%d_icp_residual%s' % (s, key),
self.icp_residual[s][key])
transform = self.icp_transform[s][key]
tf.summary.histogram('scale%d_icp_tx%s' % (s, key), transform[:, 0])
tf.summary.histogram('scale%d_icp_ty%s' % (s, key), transform[:, 1])
tf.summary.histogram('scale%d_icp_tz%s' % (s, key), transform[:, 2])
tf.summary.histogram('scale%d_icp_rx%s' % (s, key), transform[:, 3])
tf.summary.histogram('scale%d_icp_ry%s' % (s, key), transform[:, 4])
tf.summary.histogram('scale%d_icp_rz%s' % (s, key), transform[:, 5])
def build_depth_test_graph(self):
"""Builds depth model reading from placeholders."""
with tf.variable_scope('depth_prediction'):
input_image = tf.placeholder(
tf.float32, [self.batch_size, self.img_height, self.img_width, 3],
name='raw_input')
if self.imagenet_norm:
input_image = (input_image - reader.IMAGENET_MEAN) / reader.IMAGENET_SD
est_disp, _ = nets.disp_net(architecture=self.architecture,
image=input_image,
use_skip=self.use_skip,
weight_reg=self.weight_reg,
is_training=True)
est_depth = 1.0 / est_disp[0]
self.input_image = input_image
self.est_depth = est_depth
def build_egomotion_test_graph(self):
"""Builds egomotion model reading from placeholders."""
input_image_stack = tf.placeholder(
tf.float32,
[1, self.img_height, self.img_width, self.seq_length * 3],
name='raw_input')
input_bottleneck_stack = None
if self.imagenet_norm:
im_mean = tf.tile(
tf.constant(reader.IMAGENET_MEAN), multiples=[self.seq_length])
im_sd = tf.tile(
tf.constant(reader.IMAGENET_SD), multiples=[self.seq_length])
input_image_stack = (input_image_stack - im_mean) / im_sd
if self.joint_encoder:
# Pre-compute embeddings here.
with tf.variable_scope('depth_prediction', reuse=True):
input_bottleneck_stack = []
encoder_selected = nets.encoder(self.architecture)
for i in range(self.seq_length):
input_image = input_image_stack[:, :, :, i * 3:(i + 1) * 3]
tf.get_variable_scope().reuse_variables()
embedding, _ = encoder_selected(
target_image=input_image,
weight_reg=self.weight_reg,
is_training=True)
input_bottleneck_stack.append(embedding)
input_bottleneck_stack = tf.concat(input_bottleneck_stack, axis=3)
with tf.variable_scope('egomotion_prediction'):
est_egomotion = nets.egomotion_net(
image_stack=input_image_stack,
disp_bottleneck_stack=input_bottleneck_stack,
joint_encoder=self.joint_encoder,
seq_length=self.seq_length,
weight_reg=self.weight_reg)
self.input_image_stack = input_image_stack
self.est_egomotion = est_egomotion
def build_objectmotion_test_graph(self):
"""Builds egomotion model reading from placeholders."""
input_image_stack_om = tf.placeholder(
tf.float32,
[1, self.img_height, self.img_width, self.seq_length * 3],
name='raw_input')
if self.imagenet_norm:
im_mean = tf.tile(
tf.constant(reader.IMAGENET_MEAN), multiples=[self.seq_length])
im_sd = tf.tile(
tf.constant(reader.IMAGENET_SD), multiples=[self.seq_length])
input_image_stack_om = (input_image_stack_om - im_mean) / im_sd
with tf.variable_scope('objectmotion_prediction'):
est_objectmotion = nets.objectmotion_net(
image_stack=input_image_stack_om,
disp_bottleneck_stack=None,
joint_encoder=self.joint_encoder,
seq_length=self.seq_length,
weight_reg=self.weight_reg)
self.input_image_stack_om = input_image_stack_om
self.est_objectmotion = est_objectmotion
def inference_depth(self, inputs, sess):
return sess.run(self.est_depth, feed_dict={self.input_image: inputs})
def inference_egomotion(self, inputs, sess):
return sess.run(
self.est_egomotion, feed_dict={self.input_image_stack: inputs})
def inference_objectmotion(self, inputs, sess):
return sess.run(
self.est_objectmotion, feed_dict={self.input_image_stack_om: inputs})
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Depth and Ego-Motion networks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
slim = tf.contrib.slim
SIMPLE = 'simple'
RESNET = 'resnet'
ARCHITECTURES = [SIMPLE, RESNET]
SCALE_TRANSLATION = 0.001
SCALE_ROTATION = 0.01
# Disparity (inverse depth) values range from 0.01 to 10. Note that effectively,
# this is undone if depth normalization is used, which scales the values to
# have a mean of 1.
DISP_SCALING = 10
MIN_DISP = 0.01
WEIGHT_DECAY_KEY = 'WEIGHT_DECAY'
EGOMOTION_VEC_SIZE = 6
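# Given the constants above, the disparity heads below (a sigmoid scaled by
# DISP_SCALING and shifted by MIN_DISP) can only output values in
# (MIN_DISP, DISP_SCALING + MIN_DISP), which in turn bounds the predicted
# depth 1 / disparity. A minimal, illustrative sketch (not part of the
# original code) of that bookkeeping:
def _example_disparity_range():
  """Returns ((min_disp, max_disp), (min_depth, max_depth))."""
  min_disp = MIN_DISP                  # sigmoid output approaches 0.
  max_disp = DISP_SCALING + MIN_DISP   # sigmoid output approaches 1.
  return (min_disp, max_disp), (1.0 / max_disp, 1.0 / min_disp)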
def egomotion_net(image_stack, disp_bottleneck_stack, joint_encoder, seq_length,
weight_reg):
"""Predict ego-motion vectors from a stack of frames or embeddings.
Args:
image_stack: Input tensor with shape [B, h, w, seq_length * 3] in order.
disp_bottleneck_stack: Input tensor with shape [B, h_hidden, w_hidden,
seq_length * c_hidden] in order.
joint_encoder: Determines if the same encoder is used for computing the
bottleneck layer of both the egomotion and the depth prediction
network. If enabled, disp_bottleneck_stack is used as input, and the
encoding steps are skipped. If disabled, a separate encoder is defined
on image_stack.
seq_length: The sequence length used.
weight_reg: The amount of weight regularization.
Returns:
Egomotion vectors with shape [B, seq_length - 1, 6].
"""
num_egomotion_vecs = seq_length - 1
with tf.variable_scope('pose_exp_net') as sc:
end_points_collection = sc.original_name_scope + '_end_points'
with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
normalizer_fn=None,
weights_regularizer=slim.l2_regularizer(weight_reg),
normalizer_params=None,
activation_fn=tf.nn.relu,
outputs_collections=end_points_collection):
if not joint_encoder:
# Define separate encoder. If sharing, we can skip the encoding step,
# as the bottleneck layer will already be passed as input.
cnv1 = slim.conv2d(image_stack, 16, [7, 7], stride=2, scope='cnv1')
cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')
with tf.variable_scope('pose'):
inputs = disp_bottleneck_stack if joint_encoder else cnv5
cnv6 = slim.conv2d(inputs, 256, [3, 3], stride=2, scope='cnv6')
cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
pred_channels = EGOMOTION_VEC_SIZE * num_egomotion_vecs
egomotion_pred = slim.conv2d(cnv7, pred_channels, [1, 1], scope='pred',
stride=1, normalizer_fn=None,
activation_fn=None)
egomotion_avg = tf.reduce_mean(egomotion_pred, [1, 2])
egomotion_res = tf.reshape(
egomotion_avg, [-1, num_egomotion_vecs, EGOMOTION_VEC_SIZE])
# Tinghui found that scaling by a small constant facilitates training.
# Scale the translation (first three) and rotation (last three) components
# of each 6-vector separately.
egomotion_scaled = tf.concat([egomotion_res[:, :, 0:3] * SCALE_TRANSLATION,
egomotion_res[:, :, 3:6] * SCALE_ROTATION],
axis=2)
return egomotion_scaled
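# A minimal, illustrative sketch (not part of the original code; assumes TF1
# graph mode) of the shapes egomotion_net() expects and produces for a
# three-frame sequence: a [B, H, W, 9] image stack in, a [B, 2, 6] tensor of
# egomotion vectors (translation components first, then rotation) out.
def _example_egomotion_net_shapes():
  """Builds egomotion_net on a dummy stack and returns its static shape."""
  graph = tf.Graph()
  with graph.as_default():
    dummy_stack = tf.zeros([1, 128, 416, 3 * 3], dtype=tf.float32)
    with tf.variable_scope('example_egomotion'):
      egomotion = egomotion_net(image_stack=dummy_stack,
                                disp_bottleneck_stack=None,
                                joint_encoder=False,
                                seq_length=3,
                                weight_reg=0.05)
  return egomotion.get_shape().as_list()  # [1, 2, 6]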
def objectmotion_net(image_stack, disp_bottleneck_stack, joint_encoder,
seq_length, weight_reg):
"""Predict object-motion vectors from a stack of frames or embeddings.
Args:
image_stack: Input tensor with shape [B, h, w, seq_length * 3] in order.
disp_bottleneck_stack: Input tensor with shape [B, h_hidden, w_hidden,
seq_length * c_hidden] in order.
joint_encoder: Determines if the same encoder is used for computing the
bottleneck layer of both the egomotion and the depth prediction
network. If enabled, disp_bottleneck_stack is used as input, and the
encoding steps are skipped. If disabled, a separate encoder is defined
on image_stack.
seq_length: The sequence length used.
weight_reg: The amount of weight regularization.
Returns:
Egomotion vectors with shape [B, seq_length - 1, 6].
"""
num_egomotion_vecs = seq_length - 1
with tf.variable_scope('pose_exp_net') as sc:
end_points_collection = sc.original_name_scope + '_end_points'
with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
normalizer_fn=None,
weights_regularizer=slim.l2_regularizer(weight_reg),
normalizer_params=None,
activation_fn=tf.nn.relu,
outputs_collections=end_points_collection):
if not joint_encoder:
# Define separate encoder. If sharing, we can skip the encoding step,
# as the bottleneck layer will already be passed as input.
cnv1 = slim.conv2d(image_stack, 16, [7, 7], stride=2, scope='cnv1')
cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')
with tf.variable_scope('pose'):
inputs = disp_bottleneck_stack if joint_encoder else cnv5
cnv6 = slim.conv2d(inputs, 256, [3, 3], stride=2, scope='cnv6')
cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
pred_channels = EGOMOTION_VEC_SIZE * num_egomotion_vecs
egomotion_pred = slim.conv2d(cnv7, pred_channels, [1, 1], scope='pred',
stride=1, normalizer_fn=None,
activation_fn=None)
egomotion_avg = tf.reduce_mean(egomotion_pred, [1, 2])
egomotion_res = tf.reshape(
egomotion_avg, [-1, num_egomotion_vecs, EGOMOTION_VEC_SIZE])
# Tinghui found that scaling by a small constant facilitates training.
# Scale the translation (first three) and rotation (last three) components
# of each 6-vector separately.
egomotion_scaled = tf.concat([egomotion_res[:, :, 0:3] * SCALE_TRANSLATION,
egomotion_res[:, :, 3:6] * SCALE_ROTATION],
axis=2)
return egomotion_scaled
def disp_net(architecture, image, use_skip, weight_reg, is_training):
"""Defines an encoder-decoder architecture for depth prediction."""
if architecture not in ARCHITECTURES:
raise ValueError('Unknown architecture.')
encoder_selected = encoder(architecture)
decoder_selected = decoder(architecture)
# Encode image.
bottleneck, skip_connections = encoder_selected(image, weight_reg,
is_training)
# Decode to depth.
multiscale_disps_i = decoder_selected(target_image=image,
bottleneck=bottleneck,
weight_reg=weight_reg,
use_skip=use_skip,
skip_connections=skip_connections)
return multiscale_disps_i, bottleneck
def encoder(architecture):
return encoder_resnet if architecture == RESNET else encoder_simple
def decoder(architecture):
return decoder_resnet if architecture == RESNET else decoder_simple
def encoder_simple(target_image, weight_reg, is_training):
"""Defines the old encoding architecture."""
del is_training
with slim.arg_scope([slim.conv2d],
normalizer_fn=None,
normalizer_params=None,
weights_regularizer=slim.l2_regularizer(weight_reg),
activation_fn=tf.nn.relu):
# Define (joint) encoder.
cnv1 = slim.conv2d(target_image, 32, [7, 7], stride=2, scope='cnv1')
cnv1b = slim.conv2d(cnv1, 32, [7, 7], stride=1, scope='cnv1b')
cnv2 = slim.conv2d(cnv1b, 64, [5, 5], stride=2, scope='cnv2')
cnv2b = slim.conv2d(cnv2, 64, [5, 5], stride=1, scope='cnv2b')
cnv3 = slim.conv2d(cnv2b, 128, [3, 3], stride=2, scope='cnv3')
cnv3b = slim.conv2d(cnv3, 128, [3, 3], stride=1, scope='cnv3b')
cnv4 = slim.conv2d(cnv3b, 256, [3, 3], stride=2, scope='cnv4')
cnv4b = slim.conv2d(cnv4, 256, [3, 3], stride=1, scope='cnv4b')
cnv5 = slim.conv2d(cnv4b, 512, [3, 3], stride=2, scope='cnv5')
cnv5b = slim.conv2d(cnv5, 512, [3, 3], stride=1, scope='cnv5b')
cnv6 = slim.conv2d(cnv5b, 512, [3, 3], stride=2, scope='cnv6')
cnv6b = slim.conv2d(cnv6, 512, [3, 3], stride=1, scope='cnv6b')
cnv7 = slim.conv2d(cnv6b, 512, [3, 3], stride=2, scope='cnv7')
cnv7b = slim.conv2d(cnv7, 512, [3, 3], stride=1, scope='cnv7b')
return cnv7b, (cnv6b, cnv5b, cnv4b, cnv3b, cnv2b, cnv1b)
def decoder_simple(target_image, bottleneck, weight_reg, use_skip,
skip_connections):
"""Defines the old depth decoder architecture."""
h = target_image.get_shape()[1].value
w = target_image.get_shape()[2].value
(cnv6b, cnv5b, cnv4b, cnv3b, cnv2b, cnv1b) = skip_connections
with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
normalizer_fn=None,
normalizer_params=None,
weights_regularizer=slim.l2_regularizer(weight_reg),
activation_fn=tf.nn.relu):
up7 = slim.conv2d_transpose(bottleneck, 512, [3, 3], stride=2,
scope='upcnv7')
up7 = _resize_like(up7, cnv6b)
if use_skip:
i7_in = tf.concat([up7, cnv6b], axis=3)
else:
i7_in = up7
icnv7 = slim.conv2d(i7_in, 512, [3, 3], stride=1, scope='icnv7')
up6 = slim.conv2d_transpose(icnv7, 512, [3, 3], stride=2, scope='upcnv6')
up6 = _resize_like(up6, cnv5b)
if use_skip:
i6_in = tf.concat([up6, cnv5b], axis=3)
else:
i6_in = up6
icnv6 = slim.conv2d(i6_in, 512, [3, 3], stride=1, scope='icnv6')
up5 = slim.conv2d_transpose(icnv6, 256, [3, 3], stride=2, scope='upcnv5')
up5 = _resize_like(up5, cnv4b)
if use_skip:
i5_in = tf.concat([up5, cnv4b], axis=3)
else:
i5_in = up5
icnv5 = slim.conv2d(i5_in, 256, [3, 3], stride=1, scope='icnv5')
up4 = slim.conv2d_transpose(icnv5, 128, [3, 3], stride=2, scope='upcnv4')
up4 = _resize_like(up4, cnv3b)
if use_skip:
i4_in = tf.concat([up4, cnv3b], axis=3)
else:
i4_in = up4
icnv4 = slim.conv2d(i4_in, 128, [3, 3], stride=1, scope='icnv4')
disp4 = (slim.conv2d(icnv4, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
normalizer_fn=None, scope='disp4')
* DISP_SCALING + MIN_DISP)
disp4_up = tf.image.resize_bilinear(disp4, [int(h / 4), int(w / 4)],
align_corners=True)
up3 = slim.conv2d_transpose(icnv4, 64, [3, 3], stride=2, scope='upcnv3')
up3 = _resize_like(up3, cnv2b)
if use_skip:
i3_in = tf.concat([up3, cnv2b, disp4_up], axis=3)
else:
i3_in = tf.concat([up3, disp4_up], axis=3)
icnv3 = slim.conv2d(i3_in, 64, [3, 3], stride=1, scope='icnv3')
disp3 = (slim.conv2d(icnv3, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
normalizer_fn=None, scope='disp3')
* DISP_SCALING + MIN_DISP)
disp3_up = tf.image.resize_bilinear(disp3, [int(h / 2), int(w / 2)],
align_corners=True)
up2 = slim.conv2d_transpose(icnv3, 32, [3, 3], stride=2, scope='upcnv2')
up2 = _resize_like(up2, cnv1b)
if use_skip:
i2_in = tf.concat([up2, cnv1b, disp3_up], axis=3)
else:
i2_in = tf.concat([up2, disp3_up], axis=3)
icnv2 = slim.conv2d(i2_in, 32, [3, 3], stride=1, scope='icnv2')
disp2 = (slim.conv2d(icnv2, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
normalizer_fn=None, scope='disp2')
* DISP_SCALING + MIN_DISP)
disp2_up = tf.image.resize_bilinear(disp2, [h, w], align_corners=True)
up1 = slim.conv2d_transpose(icnv2, 16, [3, 3], stride=2, scope='upcnv1')
i1_in = tf.concat([up1, disp2_up], axis=3)
icnv1 = slim.conv2d(i1_in, 16, [3, 3], stride=1, scope='icnv1')
disp1 = (slim.conv2d(icnv1, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
normalizer_fn=None, scope='disp1')
* DISP_SCALING + MIN_DISP)
return [disp1, disp2, disp3, disp4]
def encoder_resnet(target_image, weight_reg, is_training):
"""Defines a ResNet18-based encoding architecture.
This implementation follows Juyong Kim's ResNet-18 implementation on GitHub:
https://github.com/dalgu90/resnet-18-tensorflow
Args:
target_image: Input tensor with shape [B, h, w, 3] to encode.
weight_reg: Parameter ignored.
is_training: Whether the model is being trained or not.
Returns:
Tuple of tensors, with the first being the bottleneck layer as tensor of
size [B, h_hid, w_hid, c_hid], and others being intermediate layers
for building skip-connections.
"""
del weight_reg
encoder_filters = [64, 64, 128, 256, 512]
stride = 2
# conv1
with tf.variable_scope('conv1'):
x = _conv(target_image, 7, encoder_filters[0], stride)
x = _bn(x, is_train=is_training)
econv1 = _relu(x)
x = tf.nn.max_pool(econv1, [1, 3, 3, 1], [1, 2, 2, 1], 'SAME')
# conv2_x
x = _residual_block(x, is_training, name='conv2_1')
econv2 = _residual_block(x, is_training, name='conv2_2')
# conv3_x
x = _residual_block_first(econv2, is_training, encoder_filters[2], stride,
name='conv3_1')
econv3 = _residual_block(x, is_training, name='conv3_2')
# conv4_x
x = _residual_block_first(econv3, is_training, encoder_filters[3], stride,
name='conv4_1')
econv4 = _residual_block(x, is_training, name='conv4_2')
# conv5_x
x = _residual_block_first(econv4, is_training, encoder_filters[4], stride,
name='conv5_1')
econv5 = _residual_block(x, is_training, name='conv5_2')
return econv5, (econv4, econv3, econv2, econv1)
def decoder_resnet(target_image, bottleneck, weight_reg, use_skip,
skip_connections):
"""Defines the depth decoder architecture.
Args:
target_image: The original encoder input tensor with shape [B, h, w, 3].
Just the shape information is used here.
bottleneck: Bottleneck layer to be decoded.
weight_reg: The amount of weight regularization.
use_skip: Whether the passed skip connections econv1, econv2, econv3 and
econv4 should be used.
skip_connections: Tensors for building skip-connections.
Returns:
Disparities at 4 different scales.
"""
(econv4, econv3, econv2, econv1) = skip_connections
decoder_filters = [16, 32, 64, 128, 256]
default_pad = tf.constant([[0, 0], [1, 1], [1, 1], [0, 0]])
reg = slim.l2_regularizer(weight_reg) if weight_reg > 0.0 else None
with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
normalizer_fn=None,
normalizer_params=None,
activation_fn=tf.nn.relu,
weights_regularizer=reg):
upconv5 = slim.conv2d_transpose(bottleneck, decoder_filters[4], [3, 3],
stride=2, scope='upconv5')
upconv5 = _resize_like(upconv5, econv4)
if use_skip:
i5_in = tf.concat([upconv5, econv4], axis=3)
else:
i5_in = upconv5
i5_in = tf.pad(i5_in, default_pad, mode='REFLECT')
iconv5 = slim.conv2d(i5_in, decoder_filters[4], [3, 3], stride=1,
scope='iconv5', padding='VALID')
upconv4 = slim.conv2d_transpose(iconv5, decoder_filters[3], [3, 3],
stride=2, scope='upconv4')
upconv4 = _resize_like(upconv4, econv3)
if use_skip:
i4_in = tf.concat([upconv4, econv3], axis=3)
else:
i4_in = upconv4
i4_in = tf.pad(i4_in, default_pad, mode='REFLECT')
iconv4 = slim.conv2d(i4_in, decoder_filters[3], [3, 3], stride=1,
scope='iconv4', padding='VALID')
disp4_input = tf.pad(iconv4, default_pad, mode='REFLECT')
disp4 = (slim.conv2d(disp4_input, 1, [3, 3], stride=1,
activation_fn=tf.sigmoid, normalizer_fn=None,
scope='disp4', padding='VALID')
* DISP_SCALING + MIN_DISP)
upconv3 = slim.conv2d_transpose(iconv4, decoder_filters[2], [3, 3],
stride=2, scope='upconv3')
upconv3 = _resize_like(upconv3, econv2)
if use_skip:
i3_in = tf.concat([upconv3, econv2], axis=3)
else:
i3_in = upconv3
i3_in = tf.pad(i3_in, default_pad, mode='REFLECT')
iconv3 = slim.conv2d(i3_in, decoder_filters[2], [3, 3], stride=1,
scope='iconv3', padding='VALID')
disp3_input = tf.pad(iconv3, default_pad, mode='REFLECT')
disp3 = (slim.conv2d(disp3_input, 1, [3, 3], stride=1,
activation_fn=tf.sigmoid, normalizer_fn=None,
scope='disp3', padding='VALID')
* DISP_SCALING + MIN_DISP)
upconv2 = slim.conv2d_transpose(iconv3, decoder_filters[1], [3, 3],
stride=2, scope='upconv2')
upconv2 = _resize_like(upconv2, econv1)
if use_skip:
i2_in = tf.concat([upconv2, econv1], axis=3)
else:
i2_in = upconv2
i2_in = tf.pad(i2_in, default_pad, mode='REFLECT')
iconv2 = slim.conv2d(i2_in, decoder_filters[1], [3, 3], stride=1,
scope='iconv2', padding='VALID')
disp2_input = tf.pad(iconv2, default_pad, mode='REFLECT')
disp2 = (slim.conv2d(disp2_input, 1, [3, 3], stride=1,
activation_fn=tf.sigmoid, normalizer_fn=None,
scope='disp2', padding='VALID')
* DISP_SCALING + MIN_DISP)
upconv1 = slim.conv2d_transpose(iconv2, decoder_filters[0], [3, 3],
stride=2, scope='upconv1')
upconv1 = _resize_like(upconv1, target_image)
upconv1 = tf.pad(upconv1, default_pad, mode='REFLECT')
iconv1 = slim.conv2d(upconv1, decoder_filters[0], [3, 3], stride=1,
scope='iconv1', padding='VALID')
disp1_input = tf.pad(iconv1, default_pad, mode='REFLECT')
disp1 = (slim.conv2d(disp1_input, 1, [3, 3], stride=1,
activation_fn=tf.sigmoid, normalizer_fn=None,
scope='disp1', padding='VALID')
* DISP_SCALING + MIN_DISP)
return [disp1, disp2, disp3, disp4]
def _residual_block_first(x, is_training, out_channel, strides, name='unit'):
"""Helper function for defining ResNet architecture."""
in_channel = x.get_shape().as_list()[-1]
with tf.variable_scope(name):
# Shortcut connection
if in_channel == out_channel:
if strides == 1:
shortcut = tf.identity(x)
else:
shortcut = tf.nn.max_pool(x, [1, strides, strides, 1],
[1, strides, strides, 1], 'VALID')
else:
shortcut = _conv(x, 1, out_channel, strides, name='shortcut')
# Residual
x = _conv(x, 3, out_channel, strides, name='conv_1')
x = _bn(x, is_train=is_training, name='bn_1')
x = _relu(x, name='relu_1')
x = _conv(x, 3, out_channel, 1, name='conv_2')
x = _bn(x, is_train=is_training, name='bn_2')
# Merge
x = x + shortcut
x = _relu(x, name='relu_2')
return x
def _residual_block(x, is_training, input_q=None, output_q=None, name='unit'):
"""Helper function for defining ResNet architecture."""
num_channel = x.get_shape().as_list()[-1]
with tf.variable_scope(name):
shortcut = x # Shortcut connection
# Residual
x = _conv(x, 3, num_channel, 1, input_q=input_q, output_q=output_q,
name='conv_1')
x = _bn(x, is_train=is_training, name='bn_1')
x = _relu(x, name='relu_1')
x = _conv(x, 3, num_channel, 1, input_q=output_q, output_q=output_q,
name='conv_2')
x = _bn(x, is_train=is_training, name='bn_2')
# Merge
x = x + shortcut
x = _relu(x, name='relu_2')
return x
def _conv(x, filter_size, out_channel, stride, pad='SAME', input_q=None,
output_q=None, name='conv'):
"""Helper function for defining ResNet architecture."""
if (input_q is None) ^ (output_q is None):
raise ValueError('Input/Output splits are not correctly given.')
in_shape = x.get_shape()
with tf.variable_scope(name):
# Main operation: conv2d
with tf.device('/CPU:0'):
kernel = tf.get_variable(
'kernel', [filter_size, filter_size, in_shape[3], out_channel],
tf.float32, initializer=tf.random_normal_initializer(
stddev=np.sqrt(2.0/filter_size/filter_size/out_channel)))
if kernel not in tf.get_collection(WEIGHT_DECAY_KEY):
tf.add_to_collection(WEIGHT_DECAY_KEY, kernel)
conv = tf.nn.conv2d(x, kernel, [1, stride, stride, 1], pad)
return conv
def _bn(x, is_train, name='bn'):
"""Helper function for defining ResNet architecture."""
bn = tf.layers.batch_normalization(x, training=is_train, name=name)
return bn
def _relu(x, name=None, leakness=0.0):
"""Helper function for defining ResNet architecture."""
if leakness > 0.0:
name = 'lrelu' if name is None else name
return tf.maximum(x, x * leakness, name=name)
else:
name = 'relu' if name is None else name
return tf.nn.relu(x, name=name)
def _resize_like(inputs, ref):
i_h, i_w = inputs.get_shape()[1], inputs.get_shape()[2]
r_h, r_w = ref.get_shape()[1], ref.get_shape()[2]
if i_h == r_h and i_w == r_w:
return inputs
else:
# TODO(casser): Other interpolation methods could be explored here.
return tf.image.resize_bilinear(inputs, [r_h.value, r_w.value],
align_corners=True)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Applies online refinement while running inference.
Instructions: Run static inference before calling this script, and make sure
output_dir points to the same folder where the static inference results were
saved.
For example usage, please refer to the README.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import datetime
import os
import random
from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
import model
import nets
import reader
import util
gfile = tf.gfile
SAVE_EVERY = 1 # Defines the interval that predictions should be saved at.
SAVE_PREVIEWS = True # If set, also save image previews of depth predictions.
FIXED_SEED = 8964 # Fixed seed for repeatability.
flags.DEFINE_string('output_dir', None, 'Directory to store predictions. '
'Assumes that regular inference has been executed before '
'and results were stored in this folder.')
flags.DEFINE_string('data_dir', None, 'Folder pointing to preprocessed '
'triplets to fine-tune on.')
flags.DEFINE_string('triplet_list_file', None, 'Text file containing paths to '
'image files to process. Paths should be relative with '
'respect to the list file location. Every line should be '
'of the form [input_folder_name] [input_frame_num] '
'[output_path], where [output_path] is optional to specify '
'a different path to store the prediction.')
flags.DEFINE_string('triplet_list_file_remains', None, 'Optional text file '
'containing relative paths to image files which should not '
'be fine-tuned, e.g. because of missing adjacent frames. '
'For all files listed, the static prediction will be '
'copied instead. File can be empty. If not, every line '
'should be of the form [input_folder_name] '
'[input_frame_num] [output_path], where [output_path] is '
'optional to specify a different path to take and store '
'the unrefined prediction from/to.')
flags.DEFINE_string('model_ckpt', None, 'Model checkpoint to optimize.')
flags.DEFINE_string('ft_name', '', 'Optional prefix for temporary files.')
flags.DEFINE_string('file_extension', 'png', 'Image data file extension.')
flags.DEFINE_float('learning_rate', 0.0001, 'Adam learning rate.')
flags.DEFINE_float('beta1', 0.9, 'Adam momentum.')
flags.DEFINE_float('reconstr_weight', 0.85, 'Frame reconstruction loss weight.')
flags.DEFINE_float('ssim_weight', 0.15, 'SSIM loss weight.')
flags.DEFINE_float('smooth_weight', 0.01, 'Smoothness loss weight.')
flags.DEFINE_float('icp_weight', 0.0, 'ICP loss weight.')
flags.DEFINE_float('size_constraint_weight', 0.0005, 'Weight of the object '
'size constraint loss. Use only with motion handling.')
flags.DEFINE_integer('batch_size', 1, 'The size of a sample batch')
flags.DEFINE_integer('img_height', 128, 'Input frame height.')
flags.DEFINE_integer('img_width', 416, 'Input frame width.')
flags.DEFINE_integer('seq_length', 3, 'Number of frames in sequence.')
flags.DEFINE_enum('architecture', nets.RESNET, nets.ARCHITECTURES,
'Defines the architecture to use for the depth prediction '
'network. Defaults to ResNet-based encoder and accompanying '
'decoder.')
flags.DEFINE_boolean('imagenet_norm', True, 'Whether to normalize the input '
'images channel-wise so that they match the distribution '
'most ImageNet-models were trained on.')
flags.DEFINE_float('weight_reg', 0.05, 'The amount of weight regularization to '
'apply. This has no effect on the ResNet-based encoder '
'architecture.')
flags.DEFINE_boolean('exhaustive_mode', False, 'Whether to exhaustively warp '
'from any frame to any other instead of just considering '
'adjacent frames. Where necessary, multiple egomotion '
'estimates will be applied. Does not have an effect if '
'compute_minimum_loss is enabled.')
flags.DEFINE_boolean('random_scale_crop', False, 'Whether to apply random '
'image scaling and center cropping during training.')
flags.DEFINE_bool('depth_upsampling', True, 'Whether to apply depth '
'upsampling of lower-scale representations before warping to '
'compute reconstruction loss on full-resolution image.')
flags.DEFINE_bool('depth_normalization', True, 'Whether to apply depth '
'normalization, that is, normalizing inverse depth '
'prediction maps by their mean to avoid degeneration towards '
'small values.')
flags.DEFINE_bool('compute_minimum_loss', True, 'Whether to take the '
'element-wise minimum of the reconstruction/SSIM error in '
'order to avoid overly penalizing dis-occlusion effects.')
flags.DEFINE_bool('use_skip', True, 'Whether to use skip connections in the '
'encoder-decoder architecture.')
flags.DEFINE_bool('joint_encoder', False, 'Whether to share parameters '
'between the depth and egomotion networks by using a joint '
'encoder architecture. The egomotion network is then '
'operating only on the hidden representation provided by the '
'joint encoder.')
flags.DEFINE_float('egomotion_threshold', 0.01, 'Minimum egomotion magnitude '
'to apply finetuning. If lower, just forwards the ordinary '
'prediction.')
flags.DEFINE_integer('num_steps', 20, 'Number of optimization steps to run.')
flags.DEFINE_boolean('handle_motion', True, 'Whether the checkpoint was '
'trained with motion handling.')
flags.DEFINE_bool('flip', False, 'Whether images should be flipped as well as '
'resulting predictions (for test-time augmentation). This '
'currently applies to the depth network only.')
FLAGS = flags.FLAGS
flags.mark_flag_as_required('output_dir')
flags.mark_flag_as_required('data_dir')
flags.mark_flag_as_required('model_ckpt')
flags.mark_flag_as_required('triplet_list_file')
def main(_):
"""Runs fine-tuning and inference.
There are three categories of images.
1) Images for which both the previous and next frames are available and that
are not filtered out by the heuristic. For these, the fine-tuned
predictions are used.
2) Images for which both adjacent frames are available but that were filtered
out by the heuristic. For these, the ordinary prediction is used instead.
3) Images with at least one missing adjacent frame. For these, the ordinary
prediction is used as indicated by triplet_list_file_remains (if provided).
They are also not part of the generated inference list in the first place.
Raises:
ValueError: Invalid parameters have been passed.
"""
if FLAGS.handle_motion and FLAGS.joint_encoder:
raise ValueError('Using a joint encoder is currently not supported when '
'modeling object motion.')
if FLAGS.handle_motion and FLAGS.seq_length != 3:
raise ValueError('The current motion model implementation only supports '
'using a sequence length of three.')
if FLAGS.handle_motion and not FLAGS.compute_minimum_loss:
raise ValueError('Computing the minimum photometric loss is required when '
'enabling object motion handling.')
if FLAGS.size_constraint_weight > 0 and not FLAGS.handle_motion:
raise ValueError('To enforce object size constraints, enable motion '
'handling.')
if FLAGS.icp_weight > 0.0:
raise ValueError('ICP is currently not supported.')
if FLAGS.compute_minimum_loss and FLAGS.seq_length % 2 != 1:
raise ValueError('Compute minimum loss requires using an odd number of '
'images in a sequence.')
if FLAGS.compute_minimum_loss and FLAGS.exhaustive_mode:
raise ValueError('Exhaustive mode has no effect when compute_minimum_loss '
'is enabled.')
if FLAGS.img_width % (2 ** 5) != 0 or FLAGS.img_height % (2 ** 5) != 0:
logging.warn('Image size is not divisible by 2^5. For the architecture '
'employed, this could cause resizing artefacts at the lower '
'scales.')
if FLAGS.output_dir.endswith('/'):
FLAGS.output_dir = FLAGS.output_dir[:-1]
# Create the file list used for fine-tuning and save it to unique_file.
unique_file_name = (str(datetime.datetime.now().date()) + '_' +
str(datetime.datetime.now().time()).replace(':', '_'))
unique_file = os.path.join(FLAGS.data_dir, unique_file_name + '.txt')
with gfile.FastGFile(FLAGS.triplet_list_file, 'r') as f:
files_to_process = f.readlines()
files_to_process = [line.rstrip() for line in files_to_process]
files_to_process = [line for line in files_to_process if len(line)]
logging.info('Creating unique file list %s with %s entries.', unique_file,
len(files_to_process))
with gfile.FastGFile(unique_file, 'w') as f_out:
fetches_network = FLAGS.num_steps * FLAGS.batch_size
fetches_saves = FLAGS.batch_size * int(np.floor(FLAGS.num_steps/SAVE_EVERY))
repetitions = fetches_network + 3 * fetches_saves
for i in range(len(files_to_process)):
for _ in range(repetitions):
f_out.write(files_to_process[i] + '\n')
# Read remaining files.
remaining = []
if (FLAGS.triplet_list_file_remains and
gfile.Exists(FLAGS.triplet_list_file_remains)):
with gfile.FastGFile(FLAGS.triplet_list_file_remains, 'r') as f:
remaining = f.readlines()
remaining = [line.rstrip() for line in remaining]
remaining = [line for line in remaining if len(line)]
logging.info('Running fine-tuning on %s files, %s files are remaining.',
len(files_to_process), len(remaining))
# Run fine-tuning process and save predictions in id-folders.
tf.set_random_seed(FIXED_SEED)
np.random.seed(FIXED_SEED)
random.seed(FIXED_SEED)
flipping_mode = reader.FLIP_ALWAYS if FLAGS.flip else reader.FLIP_NONE
train_model = model.Model(data_dir=FLAGS.data_dir,
file_extension=FLAGS.file_extension,
is_training=True,
learning_rate=FLAGS.learning_rate,
beta1=FLAGS.beta1,
reconstr_weight=FLAGS.reconstr_weight,
smooth_weight=FLAGS.smooth_weight,
ssim_weight=FLAGS.ssim_weight,
icp_weight=FLAGS.icp_weight,
batch_size=FLAGS.batch_size,
img_height=FLAGS.img_height,
img_width=FLAGS.img_width,
seq_length=FLAGS.seq_length,
architecture=FLAGS.architecture,
imagenet_norm=FLAGS.imagenet_norm,
weight_reg=FLAGS.weight_reg,
exhaustive_mode=FLAGS.exhaustive_mode,
random_scale_crop=FLAGS.random_scale_crop,
flipping_mode=flipping_mode,
random_color=False,
depth_upsampling=FLAGS.depth_upsampling,
depth_normalization=FLAGS.depth_normalization,
compute_minimum_loss=FLAGS.compute_minimum_loss,
use_skip=FLAGS.use_skip,
joint_encoder=FLAGS.joint_encoder,
build_sum=False,
shuffle=False,
input_file=unique_file_name,
handle_motion=FLAGS.handle_motion,
size_constraint_weight=FLAGS.size_constraint_weight,
train_global_scale_var=False)
failed_heuristic_ids = finetune_inference(train_model, FLAGS.model_ckpt,
FLAGS.output_dir + '_ft')
logging.info('Fine-tuning completed, %s files were filtered out by '
'heuristic.', len(failed_heuristic_ids))
for failed_id in failed_heuristic_ids:
failed_entry = files_to_process[failed_id]
remaining.append(failed_entry)
logging.info('In total, %s images were fine-tuned, while %s were not.',
len(files_to_process)-len(failed_heuristic_ids), len(remaining))
# Copy all results to have the same structural output as running ordinary
# inference.
for i in range(len(files_to_process)):
if files_to_process[i] not in remaining: # Use fine-tuned result.
elements = files_to_process[i].split(' ')
source_file = os.path.join(FLAGS.output_dir + '_ft', FLAGS.ft_name +
'id_' + str(i),
str(FLAGS.num_steps).zfill(10) +
('_flip' if FLAGS.flip else ''))
if len(elements) == 2: # No differing mapping defined.
target_dir = os.path.join(FLAGS.output_dir + '_ft', elements[0])
target_file = os.path.join(
target_dir, elements[1] + ('_flip' if FLAGS.flip else ''))
else: # Other mapping for file defined, copy to this location instead.
target_dir = os.path.join(
FLAGS.output_dir + '_ft', os.path.dirname(elements[2]))
target_file = os.path.join(
target_dir,
os.path.basename(elements[2]) + ('_flip' if FLAGS.flip else ''))
if not gfile.Exists(target_dir):
gfile.MakeDirs(target_dir)
logging.info('Copy refined result %s to %s.', source_file, target_file)
gfile.Copy(source_file + '.npy', target_file + '.npy', overwrite=True)
gfile.Copy(source_file + '.txt', target_file + '.txt', overwrite=True)
gfile.Copy(source_file + '.%s' % FLAGS.file_extension,
target_file + '.%s' % FLAGS.file_extension, overwrite=True)
for j in range(len(remaining)):
elements = remaining[j].split(' ')
if len(elements) == 2: # No differing mapping defined.
target_dir = os.path.join(FLAGS.output_dir + '_ft', elements[0])
target_file = os.path.join(
target_dir, elements[1] + ('_flip' if FLAGS.flip else ''))
else: # Other mapping for file defined, copy to this location instead.
target_dir = os.path.join(
FLAGS.output_dir + '_ft', os.path.dirname(elements[2]))
target_file = os.path.join(
target_dir,
os.path.basename(elements[2]) + ('_flip' if FLAGS.flip else ''))
if not gfile.Exists(target_dir):
gfile.MakeDirs(target_dir)
source_file = target_file.replace('_ft', '')
logging.info('Copy unrefined result %s to %s.', source_file, target_file)
gfile.Copy(source_file + '.npy', target_file + '.npy', overwrite=True)
gfile.Copy(source_file + '.%s' % FLAGS.file_extension,
target_file + '.%s' % FLAGS.file_extension, overwrite=True)
logging.info('Done, predictions saved in %s.', FLAGS.output_dir + '_ft')
def finetune_inference(train_model, model_ckpt, output_dir):
"""Train model."""
vars_to_restore = None
if model_ckpt is not None:
vars_to_restore = util.get_vars_to_save_and_restore(model_ckpt)
ckpt_path = model_ckpt
pretrain_restorer = tf.train.Saver(vars_to_restore)
sv = tf.train.Supervisor(logdir=None, save_summaries_secs=0, saver=None,
summary_op=None)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
img_nr = 0
failed_heuristic = []
with sv.managed_session(config=config) as sess:
# TODO(casser): Caching the weights would be better to avoid I/O bottleneck.
while True: # Loop terminates when all examples have been processed.
if model_ckpt is not None:
logging.info('Restored weights from %s', ckpt_path)
pretrain_restorer.restore(sess, ckpt_path)
logging.info('Running fine-tuning, image %s...', img_nr)
img_pred_folder = os.path.join(
output_dir, FLAGS.ft_name + 'id_' + str(img_nr))
if not gfile.Exists(img_pred_folder):
gfile.MakeDirs(img_pred_folder)
step = 1
# Run fine-tuning.
while step <= FLAGS.num_steps:
logging.info('Running step %s of %s.', step, FLAGS.num_steps)
fetches = {
'train': train_model.train_op,
'global_step': train_model.global_step,
'incr_global_step': train_model.incr_global_step
}
_ = sess.run(fetches)
if step % SAVE_EVERY == 0:
# Get latest prediction for middle frame, highest scale.
pred = train_model.depth[1][0].eval(session=sess)
if FLAGS.flip:
pred = np.flip(pred, axis=2)
input_img = train_model.image_stack.eval(session=sess)
input_img_prev = input_img[0, :, :, 0:3]
input_img_center = input_img[0, :, :, 3:6]
input_img_next = input_img[0, :, :, 6:]
img_pred_file = os.path.join(
img_pred_folder,
str(step).zfill(10) + ('_flip' if FLAGS.flip else '') + '.npy')
motion = np.squeeze(train_model.egomotion.eval(session=sess))
# motion of shape (seq_length - 1, 6).
motion = np.mean(motion, axis=0) # Average egomotion across frames.
if SAVE_PREVIEWS or step == FLAGS.num_steps:
# Also save preview of depth map.
color_map = util.normalize_depth_for_display(
np.squeeze(pred[0, :, :]))
visualization = np.concatenate(
(input_img_prev, input_img_center, input_img_next, color_map))
motion_s = [str(m) for m in motion]
s_rep = ','.join(motion_s)
with gfile.Open(img_pred_file.replace('.npy', '.txt'), 'w') as f:
f.write(s_rep)
util.save_image(
img_pred_file.replace('.npy', '.%s' % FLAGS.file_extension),
visualization, FLAGS.file_extension)
with gfile.Open(img_pred_file, 'wb') as f:
np.save(f, pred)
# Apply heuristic to not finetune if egomotion magnitude is too low.
ego_magnitude = np.linalg.norm(motion[:3], ord=2)
heuristic = ego_magnitude >= FLAGS.egomotion_threshold
if not heuristic and step == FLAGS.num_steps:
failed_heuristic.append(img_nr)
step += 1
img_nr += 1
return failed_heuristic
if __name__ == '__main__':
app.run(main)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Geometry utilities for projecting frames based on depth and motion.
Modified from Spatial Transformer Networks:
https://github.com/tensorflow/models/blob/master/transformer/spatial_transformer.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import logging
import numpy as np
import tensorflow as tf
def inverse_warp(img, depth, egomotion_mat, intrinsic_mat,
intrinsic_mat_inv):
"""Inverse warp a source image to the target image plane.
Args:
img: The source image (to sample pixels from) -- [B, H, W, 3].
depth: Depth map of the target image -- [B, H, W].
egomotion_mat: Matrix defining egomotion transform -- [B, 4, 4].
intrinsic_mat: Camera intrinsic matrix -- [B, 3, 3].
intrinsic_mat_inv: Inverse of the intrinsic matrix -- [B, 3, 3].
Returns:
Projected source image resampled to the target frame, and a float32 mask
marking which sampled locations were valid (in bounds).
"""
dims = tf.shape(img)
batch_size, img_height, img_width = dims[0], dims[1], dims[2]
depth = tf.reshape(depth, [batch_size, 1, img_height * img_width])
grid = _meshgrid_abs(img_height, img_width)
grid = tf.tile(tf.expand_dims(grid, 0), [batch_size, 1, 1])
cam_coords = _pixel2cam(depth, grid, intrinsic_mat_inv)
ones = tf.ones([batch_size, 1, img_height * img_width])
cam_coords_hom = tf.concat([cam_coords, ones], axis=1)
# Get projection matrix for target camera frame to source pixel frame
hom_filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
hom_filler = tf.tile(hom_filler, [batch_size, 1, 1])
intrinsic_mat_hom = tf.concat(
[intrinsic_mat, tf.zeros([batch_size, 3, 1])], axis=2)
intrinsic_mat_hom = tf.concat([intrinsic_mat_hom, hom_filler], axis=1)
proj_target_cam_to_source_pixel = tf.matmul(intrinsic_mat_hom, egomotion_mat)
source_pixel_coords = _cam2pixel(cam_coords_hom,
proj_target_cam_to_source_pixel)
source_pixel_coords = tf.reshape(source_pixel_coords,
[batch_size, 2, img_height, img_width])
source_pixel_coords = tf.transpose(source_pixel_coords, perm=[0, 2, 3, 1])
projected_img, mask = _spatial_transformer(img, source_pixel_coords)
return projected_img, mask
def get_transform_mat(egomotion_vecs, i, j):
"""Returns a transform matrix defining the transform from frame i to j."""
egomotion_transforms = []
batchsize = tf.shape(egomotion_vecs)[0]
if i == j:
return tf.tile(tf.expand_dims(tf.eye(4, 4), axis=0), [batchsize, 1, 1])
for k in range(min(i, j), max(i, j)):
transform_matrix = _egomotion_vec2mat(egomotion_vecs[:, k, :], batchsize)
if i > j: # Going back in sequence, need to invert egomotion.
egomotion_transforms.insert(0, tf.linalg.inv(transform_matrix))
else: # Going forward in sequence
egomotion_transforms.append(transform_matrix)
# Multiply all matrices.
egomotion_mat = egomotion_transforms[0]
for idx in range(1, len(egomotion_transforms)):
egomotion_mat = tf.matmul(egomotion_mat, egomotion_transforms[idx])
return egomotion_mat
def _pixel2cam(depth, pixel_coords, intrinsic_mat_inv):
"""Transform coordinates in the pixel frame to the camera frame."""
cam_coords = tf.matmul(intrinsic_mat_inv, pixel_coords) * depth
return cam_coords
def _cam2pixel(cam_coords, proj_c2p):
"""Transform coordinates in the camera frame to the pixel frame."""
pcoords = tf.matmul(proj_c2p, cam_coords)
x = tf.slice(pcoords, [0, 0, 0], [-1, 1, -1])
y = tf.slice(pcoords, [0, 1, 0], [-1, 1, -1])
z = tf.slice(pcoords, [0, 2, 0], [-1, 1, -1])
# A small epsilon avoids division by zero; it is untested whether this is
# strictly necessary.
x_norm = x / (z + 1e-10)
y_norm = y / (z + 1e-10)
pixel_coords = tf.concat([x_norm, y_norm], axis=1)
return pixel_coords
def _meshgrid_abs(height, width):
"""Meshgrid in the absolute coordinates."""
x_t = tf.matmul(
tf.ones(shape=tf.stack([height, 1])),
tf.transpose(tf.expand_dims(tf.linspace(-1.0, 1.0, width), 1), [1, 0]))
y_t = tf.matmul(
tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1),
tf.ones(shape=tf.stack([1, width])))
x_t = (x_t + 1.0) * 0.5 * tf.cast(width - 1, tf.float32)
y_t = (y_t + 1.0) * 0.5 * tf.cast(height - 1, tf.float32)
x_t_flat = tf.reshape(x_t, (1, -1))
y_t_flat = tf.reshape(y_t, (1, -1))
ones = tf.ones_like(x_t_flat)
grid = tf.concat([x_t_flat, y_t_flat, ones], axis=0)
return grid
def _euler2mat(z, y, x):
"""Converts euler angles to rotation matrix.
From:
https://github.com/pulkitag/pycaffe-utils/blob/master/rot_utils.py#L174
TODO: Remove the dimension for 'N' (deprecated for converting all source
poses altogether).
Args:
z: rotation angle along z axis (in radians) -- size = [B, n]
y: rotation angle along y axis (in radians) -- size = [B, n]
x: rotation angle along x axis (in radians) -- size = [B, n]
Returns:
Rotation matrix corresponding to the euler angles, with shape [B, n, 3, 3].
"""
batch_size = tf.shape(z)[0]
n = 1
z = tf.clip_by_value(z, -np.pi, np.pi)
y = tf.clip_by_value(y, -np.pi, np.pi)
x = tf.clip_by_value(x, -np.pi, np.pi)
# Expand to B x N x 1 x 1
z = tf.expand_dims(tf.expand_dims(z, -1), -1)
y = tf.expand_dims(tf.expand_dims(y, -1), -1)
x = tf.expand_dims(tf.expand_dims(x, -1), -1)
zeros = tf.zeros([batch_size, n, 1, 1])
ones = tf.ones([batch_size, n, 1, 1])
cosz = tf.cos(z)
sinz = tf.sin(z)
rotz_1 = tf.concat([cosz, -sinz, zeros], axis=3)
rotz_2 = tf.concat([sinz, cosz, zeros], axis=3)
rotz_3 = tf.concat([zeros, zeros, ones], axis=3)
zmat = tf.concat([rotz_1, rotz_2, rotz_3], axis=2)
cosy = tf.cos(y)
siny = tf.sin(y)
roty_1 = tf.concat([cosy, zeros, siny], axis=3)
roty_2 = tf.concat([zeros, ones, zeros], axis=3)
roty_3 = tf.concat([-siny, zeros, cosy], axis=3)
ymat = tf.concat([roty_1, roty_2, roty_3], axis=2)
cosx = tf.cos(x)
sinx = tf.sin(x)
rotx_1 = tf.concat([ones, zeros, zeros], axis=3)
rotx_2 = tf.concat([zeros, cosx, -sinx], axis=3)
rotx_3 = tf.concat([zeros, sinx, cosx], axis=3)
xmat = tf.concat([rotx_1, rotx_2, rotx_3], axis=2)
return tf.matmul(tf.matmul(xmat, ymat), zmat)
def _egomotion_vec2mat(vec, batch_size):
"""Converts 6DoF transform vector to transformation matrix.
Args:
vec: 6DoF parameters [tx, ty, tz, rx, ry, rz] -- [B, 6].
batch_size: Batch size.
Returns:
A transformation matrix -- [B, 4, 4].
"""
translation = tf.slice(vec, [0, 0], [-1, 3])
translation = tf.expand_dims(translation, -1)
rx = tf.slice(vec, [0, 3], [-1, 1])
ry = tf.slice(vec, [0, 4], [-1, 1])
rz = tf.slice(vec, [0, 5], [-1, 1])
rot_mat = _euler2mat(rz, ry, rx)
rot_mat = tf.squeeze(rot_mat, axis=[1])
filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
filler = tf.tile(filler, [batch_size, 1, 1])
transform_mat = tf.concat([rot_mat, translation], axis=2)
transform_mat = tf.concat([transform_mat, filler], axis=1)
return transform_mat
def _bilinear_sampler(im, x, y, name='bilinear_sampler'):
"""Perform bilinear sampling on im given list of x, y coordinates.
Implements the differentiable sampling mechanism with bilinear kernel
in https://arxiv.org/abs/1506.02025.
x,y are tensors specifying normalized coordinates [-1, 1] to be sampled on im.
For example, (-1, -1) in (x, y) corresponds to pixel location (0, 0) in im,
and (1, 1) in (x, y) corresponds to the bottom right pixel in im.
Args:
im: Batch of images with shape [B, h, w, channels].
x: Tensor of normalized x coordinates in [-1, 1], with shape [B, h, w, 1].
y: Tensor of normalized y coordinates in [-1, 1], with shape [B, h, w, 1].
name: Name scope for ops.
Returns:
Sampled image with shape [B, h, w, channels].
Principled mask with shape [B, h, w, 1], dtype:float32. A value of 1.0
in the mask indicates that the corresponding coordinate in the sampled
image is valid.
"""
with tf.variable_scope(name):
x = tf.reshape(x, [-1])
y = tf.reshape(y, [-1])
# Constants.
batch_size = tf.shape(im)[0]
_, height, width, channels = im.get_shape().as_list()
x = tf.to_float(x)
y = tf.to_float(y)
height_f = tf.cast(height, 'float32')
width_f = tf.cast(width, 'float32')
zero = tf.constant(0, dtype=tf.int32)
max_y = tf.cast(tf.shape(im)[1] - 1, 'int32')
max_x = tf.cast(tf.shape(im)[2] - 1, 'int32')
# Scale indices from [-1, 1] to [0, width - 1] or [0, height - 1].
x = (x + 1.0) * (width_f - 1.0) / 2.0
y = (y + 1.0) * (height_f - 1.0) / 2.0
# Compute the coordinates of the 4 pixels to sample from.
x0 = tf.cast(tf.floor(x), 'int32')
x1 = x0 + 1
y0 = tf.cast(tf.floor(y), 'int32')
y1 = y0 + 1
mask = tf.logical_and(
tf.logical_and(x0 >= zero, x1 <= max_x),
tf.logical_and(y0 >= zero, y1 <= max_y))
mask = tf.to_float(mask)
x0 = tf.clip_by_value(x0, zero, max_x)
x1 = tf.clip_by_value(x1, zero, max_x)
y0 = tf.clip_by_value(y0, zero, max_y)
y1 = tf.clip_by_value(y1, zero, max_y)
dim2 = width
dim1 = width * height
# Create base index.
base = tf.range(batch_size) * dim1
base = tf.reshape(base, [-1, 1])
base = tf.tile(base, [1, height * width])
base = tf.reshape(base, [-1])
base_y0 = base + y0 * dim2
base_y1 = base + y1 * dim2
idx_a = base_y0 + x0
idx_b = base_y1 + x0
idx_c = base_y0 + x1
idx_d = base_y1 + x1
# Use indices to lookup pixels in the flat image and restore channels dim.
im_flat = tf.reshape(im, tf.stack([-1, channels]))
im_flat = tf.to_float(im_flat)
pixel_a = tf.gather(im_flat, idx_a)
pixel_b = tf.gather(im_flat, idx_b)
pixel_c = tf.gather(im_flat, idx_c)
pixel_d = tf.gather(im_flat, idx_d)
x1_f = tf.to_float(x1)
y1_f = tf.to_float(y1)
# And finally calculate interpolated values.
wa = tf.expand_dims(((x1_f - x) * (y1_f - y)), 1)
wb = tf.expand_dims((x1_f - x) * (1.0 - (y1_f - y)), 1)
wc = tf.expand_dims(((1.0 - (x1_f - x)) * (y1_f - y)), 1)
wd = tf.expand_dims(((1.0 - (x1_f - x)) * (1.0 - (y1_f - y))), 1)
output = tf.add_n([wa * pixel_a, wb * pixel_b, wc * pixel_c, wd * pixel_d])
output = tf.reshape(output, tf.stack([batch_size, height, width, channels]))
mask = tf.reshape(mask, tf.stack([batch_size, height, width, 1]))
return output, mask
def _spatial_transformer(img, coords):
"""A wrapper over binlinear_sampler(), taking absolute coords as input."""
img_height = tf.cast(tf.shape(img)[1], tf.float32)
img_width = tf.cast(tf.shape(img)[2], tf.float32)
px = coords[:, :, :, :1]
py = coords[:, :, :, 1:]
# Normalize coordinates to [-1, 1] to send to _bilinear_sampler.
px = px / (img_width - 1) * 2.0 - 1.0
py = py / (img_height - 1) * 2.0 - 1.0
output_img, mask = _bilinear_sampler(img, px, py)
return output_img, mask
def get_cloud(depth, intrinsics_inv, name=None):
"""Convert depth map to 3D point cloud."""
with tf.name_scope(name):
dims = depth.shape.as_list()
batch_size, img_height, img_width = dims[0], dims[1], dims[2]
depth = tf.reshape(depth, [batch_size, 1, img_height * img_width])
grid = _meshgrid_abs(img_height, img_width)
grid = tf.tile(tf.expand_dims(grid, 0), [batch_size, 1, 1])
cam_coords = _pixel2cam(depth, grid, intrinsics_inv)
cam_coords = tf.transpose(cam_coords, [0, 2, 1])
cam_coords = tf.reshape(cam_coords, [batch_size, img_height, img_width, 3])
logging.info('depth -> cloud: %s', cam_coords)
return cam_coords