move data creation tools into a subfolder.

6f4dff46 · Vivek Rathod · 11e9c7ad · 6f4dff46 · 6f4dff46 · 6f4dff46
Commit 6f4dff46 authored Nov 07, 2017 by Vivek Rathod
8 changed files
--- a/research/object_detection/BUILD
+++ b/research/object_detection/BUILD
@@ -134,42 +134,3 @@ py_binary(
        "//tensorflow_models/object_detection/protos:pipeline_py_pb2",
    ],
 )
-py_binary(
-    name = "create_pascal_tf_record",
-    srcs = [
-        "create_pascal_tf_record.py",
-    ],
-    deps = [
-        "//third_party/py/PIL:pil",
-        "//third_party/py/lxml",
-        "//tensorflow",
-        "//tensorflow_models/object_detection/utils:dataset_util",
-        "//tensorflow_models/object_detection/utils:label_map_util",
-    ],
-)
-py_test(
-    name = "create_pascal_tf_record_test",
-    srcs = [
-        "create_pascal_tf_record_test.py",
-    ],
-    deps = [
-        ":create_pascal_tf_record",
-        "//tensorflow",
-    ],
-)
-py_binary(
-    name = "create_pet_tf_record",
-    srcs = [
-        "create_pet_tf_record.py",
-    ],
-    deps = [
-        "//third_party/py/PIL:pil",
-        "//third_party/py/lxml",
-        "//tensorflow",
-        "//tensorflow_models/object_detection/utils:dataset_util",
-        "//tensorflow_models/object_detection/utils:label_map_util",
-    ],
-)
--- a/research/object_detection/dataset_tools/BUILD
+++ b/research/object_detection/dataset_tools/BUILD
+# Tensorflow Object Detection API: dataset creation tools.
+# Targets in this package convert raw detection datasets (KITTI, PASCAL VOC,
+# Oxford-IIIT Pet, Open Images) into TFRecord files.
+package(
+    default_visibility = ["//visibility:public"],
+)
+licenses(["notice"])
+# Apache 2.0
+# Converter for the raw KITTI detection dataset.
+# NOTE(review): create_kitti_tf_record.py imports numpy and PIL but not lxml;
+# confirm that the deps below match what the script actually needs.
+py_binary(
+    name = "create_kitti_tf_record",
+    srcs = [
+        "create_kitti_tf_record.py",
+    ],
+    deps = [
+        "//third_party/py/PIL:pil",
+        "//third_party/py/lxml",
+        "//tensorflow",
+        "//tensorflow_models/object_detection/utils:dataset_util",
+        "//tensorflow_models/object_detection/utils:label_map_util",
+        "//tensorflow_models/object_detection/utils:np_box_ops",
+    ],
+)
+# Unit test for the KITTI converter.
+py_test(
+    name = "create_kitti_tf_record_test",
+    srcs = [
+        "create_kitti_tf_record_test.py",
+    ],
+    deps = [
+        ":create_kitti_tf_record",
+        "//tensorflow",
+    ],
+)
+# Converter for the PASCAL VOC dataset (moved here from the parent package).
+py_binary(
+    name = "create_pascal_tf_record",
+    srcs = [
+        "create_pascal_tf_record.py",
+    ],
+    deps = [
+        "//third_party/py/PIL:pil",
+        "//third_party/py/lxml",
+        "//tensorflow",
+        "//tensorflow_models/object_detection/utils:dataset_util",
+        "//tensorflow_models/object_detection/utils:label_map_util",
+    ],
+)
+# Unit test for the PASCAL VOC converter.
+py_test(
+    name = "create_pascal_tf_record_test",
+    srcs = [
+        "create_pascal_tf_record_test.py",
+    ],
+    deps = [
+        ":create_pascal_tf_record",
+        "//tensorflow",
+    ],
+)
+# Converter for the Oxford-IIIT Pet dataset (moved here from the parent
+# package).
+# NOTE(review): the same commit adds `import numpy as np` to
+# create_pet_tf_record.py; confirm whether a numpy dep is required here.
+py_binary(
+    name = "create_pet_tf_record",
+    srcs = [
+        "create_pet_tf_record.py",
+    ],
+    deps = [
+        "//third_party/py/PIL:pil",
+        "//third_party/py/lxml",
+        "//tensorflow",
+        "//tensorflow_models/object_detection/utils:dataset_util",
+        "//tensorflow_models/object_detection/utils:label_map_util",
+    ],
+)
+# Library with helpers for creating Open Images TFRecords.
+py_library(
+    name = "oid_tfrecord_creation",
+    srcs = ["oid_tfrecord_creation.py"],
+    deps = [
+        "//tensorflow",
+        "//tensorflow_models/object_detection/core:standard_fields",
+        "//tensorflow_models/object_detection/utils:dataset_util",
+    ],
+)
--- a/research/object_detection/dataset_tools/create_kitti_tf_record.py
+++ b/research/object_detection/dataset_tools/create_kitti_tf_record.py
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Convert raw KITTI detection dataset to TFRecord for object_detection.
+Converts KITTI detection dataset to TFRecords with a standard format allowing
+  to use this dataset to train object detectors. The raw dataset can be
+  downloaded from:
+  http://kitti.is.tue.mpg.de/kitti/data_object_image_2.zip.
+  http://kitti.is.tue.mpg.de/kitti/data_object_label_2.zip
+  Permission can be requested at the main website.
+  KITTI detection dataset contains 7481 training images. Using this code with
+  the default settings will set aside the first 500 images as a validation set.
+  This can be altered using the flags, see details below.
+Example usage:
+    python create_kitti_tf_record.py \
+        --data_dir=/home/user/kitti \
+        --output_path=/home/user/kitti.record
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import hashlib
+import io
+import os
+import numpy as np
+import PIL.Image as pil
+import tensorflow as tf
+from object_detection.utils import dataset_util
+from object_detection.utils import label_map_util
+from object_detection.utils.np_box_ops import iou
+# Command-line flags. main() forwards each of them unchanged to
+# convert_kitti_to_tfrecords. Adjacent string literals are concatenated
+# verbatim, so every fragment of a help text ends with an explicit space.
+tf.app.flags.DEFINE_string('data_dir', '', 'Location of root directory for the '
+                           'data. Folder structure is assumed to be: '
+                           '<data_dir>/training/label_2 (annotations) and '
+                           '<data_dir>/data_object_image_2/training/image_2 '
+                           '(images).')
+tf.app.flags.DEFINE_string('output_path', '', 'Path to which TFRecord files '
+                           'will be written. The TFRecord with the training '
+                           'set will be located at: '
+                           '<output_path>_train.tfrecord. '
+                           'And the TFRecord with the validation set will be '
+                           'located at: <output_path>_val.tfrecord')
+tf.app.flags.DEFINE_list('classes_to_use', ['car', 'pedestrian', 'dontcare'],
+                         'Which classes of bounding boxes to use. Adding the '
+                         'dontcare class will remove all bboxs in the dontcare '
+                         'regions.')
+tf.app.flags.DEFINE_string('label_map_path', 'data/kitti_label_map.pbtxt',
+                           'Path to label map proto.')
+# The default is an integer literal (it used to be the string '500').
+tf.app.flags.DEFINE_integer('validation_set_size', 500, 'Number of images to '
+                            'be used as a validation set.')
+FLAGS = tf.app.flags.FLAGS
+def convert_kitti_to_tfrecords(data_dir, output_path, classes_to_use,
+                               label_map_path, validation_set_size):
+  """Convert the KITTI detection dataset to TFRecords.
+  Images are visited in sorted file-name order. An image whose numeric file
+  name is smaller than `validation_set_size` is written to the validation
+  record; every other image is written to the training record.
+  Args:
+    data_dir: The full path to the unzipped folder containing the unzipped data
+      from data_object_image_2 and data_object_label_2.zip.
+      Folder structure is assumed to be: data_dir/training/label_2 (annotations)
+      and data_dir/data_object_image_2/training/image_2 (images).
+    output_path: The path to which TFRecord files will be written. The TFRecord
+      with the training set will be located at: <output_path>_train.tfrecord
+      And the TFRecord with the validation set will be located at:
+      <output_path>_val.tfrecord
+    classes_to_use: List of strings naming the classes for which data should be
+      converted. Use the same names as presented in the KITTI README file.
+      Adding dontcare class will remove all other bounding boxes that overlap
+      with areas marked as dontcare regions.
+    label_map_path: Path to label map proto
+    validation_set_size: How many images should be left as the validation set.
+      (First `validation_set_size` examples are selected to be in the
+      validation set).
+  """
+  label_map_dict = label_map_util.get_label_map_dict(label_map_path)
+  annotation_dir = os.path.join(data_dir, 'training', 'label_2')
+  image_dir = os.path.join(data_dir, 'data_object_image_2', 'training',
+                           'image_2')
+  train_writer = tf.python_io.TFRecordWriter('%s_train.tfrecord' % output_path)
+  val_writer = tf.python_io.TFRecordWriter('%s_val.tfrecord' % output_path)
+  # Close both writers even when converting one of the images fails.
+  try:
+    for img_name in sorted(tf.gfile.ListDirectory(image_dir)):
+      # The image file name (without extension) is the frame number; the
+      # annotation file uses the same number zero-padded to six digits.
+      img_num = int(img_name.split('.')[0])
+      is_validation_img = img_num < validation_set_size
+      img_anno = read_annotation_file(
+          os.path.join(annotation_dir, str(img_num).zfill(6) + '.txt'))
+      image_path = os.path.join(image_dir, img_name)
+      # Filter all bounding boxes of this frame that are of a legal class, and
+      # don't overlap with a dontcare region.
+      # TODO(talremez) filter out targets that are truncated or heavily occluded.
+      annotation_for_image = filter_annotations(img_anno, classes_to_use)
+      example = prepare_example(image_path, annotation_for_image,
+                                label_map_dict)
+      writer = val_writer if is_validation_img else train_writer
+      writer.write(example.SerializeToString())
+  finally:
+    train_writer.close()
+    val_writer.close()
+def prepare_example(image_path, annotations, label_map_dict):
+  """Converts a dictionary with annotations for an image to tf.Example proto.
+  The 2D box coordinates are divided by the image width/height, so the stored
+  bbox features are normalized.
+  Args:
+    image_path: The complete path to image.
+    annotations: A dictionary mapping annotation field names (e.g. 'type',
+      '2d_bbox_left', 'truncated') to per-object sequences, one entry for each
+      object that appears in the image. The 2D bbox fields must support
+      element-wise division (numpy arrays).
+    label_map_dict: A map from string label names to integer ids.
+  Returns:
+    example: The converted tf.Example.
+  """
+  with tf.gfile.GFile(image_path, 'rb') as fid:
+    encoded_png = fid.read()
+  key = hashlib.sha256(encoded_png).hexdigest()
+  # Only the dimensions are needed here. PIL exposes them as (width, height)
+  # once the file is opened, so the pixels are not converted to an array.
+  width, height = pil.open(io.BytesIO(encoded_png)).size
+  xmin_norm = annotations['2d_bbox_left'] / float(width)
+  ymin_norm = annotations['2d_bbox_top'] / float(height)
+  xmax_norm = annotations['2d_bbox_right'] / float(width)
+  ymax_norm = annotations['2d_bbox_bottom'] / float(height)
+  # No object is flagged as difficult: one zero per object.
+  difficult_obj = [0]*len(xmin_norm)
+  example = tf.train.Example(features=tf.train.Features(feature={
+      'image/height': dataset_util.int64_feature(height),
+      'image/width': dataset_util.int64_feature(width),
+      'image/filename': dataset_util.bytes_feature(image_path.encode('utf8')),
+      'image/source_id': dataset_util.bytes_feature(image_path.encode('utf8')),
+      'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
+      'image/encoded': dataset_util.bytes_feature(encoded_png),
+      'image/format': dataset_util.bytes_feature('png'.encode('utf8')),
+      'image/object/bbox/xmin': dataset_util.float_list_feature(xmin_norm),
+      'image/object/bbox/xmax': dataset_util.float_list_feature(xmax_norm),
+      'image/object/bbox/ymin': dataset_util.float_list_feature(ymin_norm),
+      'image/object/bbox/ymax': dataset_util.float_list_feature(ymax_norm),
+      'image/object/class/text': dataset_util.bytes_list_feature(
+          [x.encode('utf8') for x in annotations['type']]),
+      'image/object/class/label': dataset_util.int64_list_feature(
+          [label_map_dict[x] for x in annotations['type']]),
+      'image/object/difficult': dataset_util.int64_list_feature(difficult_obj),
+      'image/object/truncated': dataset_util.float_list_feature(
+          annotations['truncated']),
+      'image/object/alpha': dataset_util.float_list_feature(
+          annotations['alpha']),
+      'image/object/3d_bbox/height': dataset_util.float_list_feature(
+          annotations['3d_bbox_height']),
+      'image/object/3d_bbox/width': dataset_util.float_list_feature(
+          annotations['3d_bbox_width']),
+      'image/object/3d_bbox/length': dataset_util.float_list_feature(
+          annotations['3d_bbox_length']),
+      'image/object/3d_bbox/x': dataset_util.float_list_feature(
+          annotations['3d_bbox_x']),
+      'image/object/3d_bbox/y': dataset_util.float_list_feature(
+          annotations['3d_bbox_y']),
+      'image/object/3d_bbox/z': dataset_util.float_list_feature(
+          annotations['3d_bbox_z']),
+      'image/object/3d_bbox/rot_y': dataset_util.float_list_feature(
+          annotations['3d_bbox_rot_y']),
+  }))
+  return example
+def filter_annotations(img_all_annotations, used_classes):
+  """Keeps only annotations of the used classes, outside dontcare regions.
+  First drops every object whose class is not in `used_classes`. If
+  `used_classes` contains 'dontcare', every remaining box that has a non-zero
+  IoU with a dontcare box is dropped as well (this includes the dontcare boxes
+  themselves).
+  Args:
+    img_all_annotations: A dictionary mapping annotation field names to numpy
+      arrays with one entry per object, as returned by read_annotation_file.
+    used_classes: A list of strings listing the classes we want to keep. If
+      the list contains "dontcare", all bounding boxes overlapping with
+      dontcare regions will also be filtered out.
+  Returns:
+    img_filtered_annotations: A dictionary with the same keys as
+      `img_all_annotations`, holding only the objects that passed the
+      filtering.
+  """
+  # Positions of the objects whose class is one of the requested ones.
+  kept_indices = []
+  for idx, class_name in enumerate(img_all_annotations['type']):
+    if class_name in used_classes:
+      kept_indices.append(idx)
+  img_filtered_annotations = {}
+  for field in img_all_annotations.keys():
+    img_filtered_annotations[field] = img_all_annotations[field][kept_indices]
+  if 'dontcare' not in used_classes:
+    return img_filtered_annotations
+  dont_care_indices = [
+      idx for idx, class_name in enumerate(img_filtered_annotations['type'])
+      if class_name == 'dontcare'
+  ]
+  # bounding box format [y_min, x_min, y_max, x_max]
+  box_fields = ('2d_bbox_top', '2d_bbox_left',
+                '2d_bbox_bottom', '2d_bbox_right')
+  all_boxes = np.stack([img_filtered_annotations[name] for name in box_fields],
+                       axis=1)
+  ious = iou(boxes1=all_boxes,
+             boxes2=all_boxes[dont_care_indices])
+  # Nothing to remove when there are no boxes or no dontcare regions.
+  if ious.size == 0:
+    return img_filtered_annotations
+  # Remove all bounding boxes that overlap with a dontcare region.
+  boxes_to_keep = np.logical_not(np.amax(ious, axis=1) > 0.0)
+  for field in img_all_annotations.keys():
+    img_filtered_annotations[field] = (
+        img_filtered_annotations[field][boxes_to_keep])
+  return img_filtered_annotations
+def read_annotation_file(filename):
+  """Reads a KITTI annotation file.
+  Converts a KITTI annotation file into a dictionary containing all the
+  relevant information. Every non-blank line describes one object as
+  whitespace-separated columns; blank lines are ignored.
+  Args:
+    filename: the path to the annotation text file.
+  Returns:
+    anno: A dictionary mapping each field name to a numpy array with one entry
+    per object. See annotation README file for details on the different
+    fields.
+  """
+  with open(filename) as f:
+    # split() without arguments tolerates repeated whitespace; blank lines
+    # (e.g. a trailing empty line) are skipped instead of raising IndexError.
+    content = [line.split() for line in f if line.strip()]
+  # (field name, column index, converter) for every numeric column.
+  numeric_columns = (
+      ('truncated', 1, float),
+      ('occluded', 2, int),
+      ('alpha', 3, float),
+      ('2d_bbox_left', 4, float),
+      ('2d_bbox_top', 5, float),
+      ('2d_bbox_right', 6, float),
+      ('2d_bbox_bottom', 7, float),
+      ('3d_bbox_height', 8, float),
+      ('3d_bbox_width', 9, float),
+      ('3d_bbox_length', 10, float),
+      ('3d_bbox_x', 11, float),
+      ('3d_bbox_y', 12, float),
+      ('3d_bbox_z', 13, float),
+      ('3d_bbox_rot_y', 14, float),
+  )
+  anno = {}
+  # Class names are lower-cased so they can be matched against the flag values.
+  anno['type'] = np.array([x[0].lower() for x in content])
+  for field, column, convert in numeric_columns:
+    anno[field] = np.array([convert(x[column]) for x in content])
+  return anno
+def main(_):
+  """Runs the KITTI conversion with the values of the command-line flags."""
+  conversion_args = {
+      'data_dir': FLAGS.data_dir,
+      'output_path': FLAGS.output_path,
+      'classes_to_use': FLAGS.classes_to_use,
+      'label_map_path': FLAGS.label_map_path,
+      'validation_set_size': FLAGS.validation_set_size,
+  }
+  convert_kitti_to_tfrecords(**conversion_args)
+# tf.app.run parses the flags and then calls main.
+if __name__ == '__main__':
+  tf.app.run()
--- a/research/object_detection/dataset_tools/create_kitti_tf_record_test.py
+++ b/research/object_detection/dataset_tools/create_kitti_tf_record_test.py
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test for create_kitti_tf_record.py."""
+import os
+import numpy as np
+import PIL.Image
+import tensorflow as tf
+from object_detection.dataset_tools import create_kitti_tf_record
+class DictToTFExampleTest(tf.test.TestCase):
+  """Checks the tf.Example built by create_kitti_tf_record.prepare_example."""
+  def _assertProtoEqual(self, proto_field, expectation):
+    """Helper function to assert if a proto field equals some value.
+    Args:
+      proto_field: The protobuf field to compare.
+      expectation: The expected value of the protobuf field.
+    """
+    proto_list = [p for p in proto_field]
+    self.assertListEqual(proto_list, expectation)
+  def test_dict_to_tf_example(self):
+    """Builds an example for one 256x256 image holding a single 'car' box."""
+    # prepare_example labels the image as PNG, so the fixture is saved as one.
+    image_file_name = 'tmp_image.png'
+    # uint8 pixels, so that the saved file is a well-formed RGB image.
+    image_data = np.random.randint(0, 256, size=(256, 256, 3)).astype(np.uint8)
+    save_path = os.path.join(self.get_temp_dir(), image_file_name)
+    image = PIL.Image.fromarray(image_data, 'RGB')
+    image.save(save_path)
+    annotations = {}
+    annotations['2d_bbox_left'] = np.array([64])
+    annotations['2d_bbox_top'] = np.array([64])
+    annotations['2d_bbox_right'] = np.array([192])
+    annotations['2d_bbox_bottom'] = np.array([192])
+    annotations['type'] = ['car']
+    annotations['truncated'] = np.array([1])
+    annotations['alpha'] = np.array([2])
+    annotations['3d_bbox_height'] = np.array([10])
+    annotations['3d_bbox_width'] = np.array([11])
+    annotations['3d_bbox_length'] = np.array([12])
+    annotations['3d_bbox_x'] = np.array([13])
+    annotations['3d_bbox_y'] = np.array([14])
+    annotations['3d_bbox_z'] = np.array([15])
+    annotations['3d_bbox_rot_y'] = np.array([4])
+    label_map_dict = {
+        'background': 0,
+        'car': 1,
+    }
+    example = create_kitti_tf_record.prepare_example(
+        save_path,
+        annotations,
+        label_map_dict)
+    features = example.features.feature
+    expected_int64 = [
+        ('image/height', [256]),
+        ('image/width', [256]),
+        ('image/object/class/label', [1]),
+    ]
+    for feature_name, expected in expected_int64:
+      self._assertProtoEqual(features[feature_name].int64_list.value, expected)
+    # bytes_list values are byte strings; the expectations are encoded the same
+    # way prepare_example encodes them so the comparison also holds on Python 3.
+    expected_bytes = [
+        ('image/filename', [save_path.encode('utf8')]),
+        ('image/source_id', [save_path.encode('utf8')]),
+        ('image/format', ['png'.encode('utf8')]),
+        ('image/object/class/text', ['car'.encode('utf8')]),
+    ]
+    for feature_name, expected in expected_bytes:
+      self._assertProtoEqual(features[feature_name].bytes_list.value, expected)
+    # The box spans 64..192 of a 256 pixel wide/high image: 0.25..0.75.
+    expected_float = [
+        ('image/object/bbox/xmin', [0.25]),
+        ('image/object/bbox/ymin', [0.25]),
+        ('image/object/bbox/xmax', [0.75]),
+        ('image/object/bbox/ymax', [0.75]),
+        ('image/object/truncated', [1]),
+        ('image/object/alpha', [2]),
+        ('image/object/3d_bbox/height', [10]),
+        ('image/object/3d_bbox/width', [11]),
+        ('image/object/3d_bbox/length', [12]),
+        ('image/object/3d_bbox/x', [13]),
+        ('image/object/3d_bbox/y', [14]),
+        ('image/object/3d_bbox/z', [15]),
+        ('image/object/3d_bbox/rot_y', [4]),
+    ]
+    for feature_name, expected in expected_float:
+      self._assertProtoEqual(features[feature_name].float_list.value, expected)
+if __name__ == '__main__':
+  tf.test.main()
--- a/research/object_detection/create_pascal_tf_record.py
+++ b/research/object_detection/create_pascal_tf_record.py
--- a/research/object_detection/create_pascal_tf_record_test.py
+++ b/research/object_detection/create_pascal_tf_record_test.py
@@ -21,7 +21,7 @@ import numpy as np
 import PIL.Image
 import tensorflow as tf
-from object_detection import create_pascal_tf_record
+from object_detection.dataset_tools import create_pascal_tf_record
 class DictToTFExampleTest(tf.test.TestCase):

--- a/research/object_detection/create_pet_tf_record.py
+++ b/research/object_detection/create_pet_tf_record.py
@@ -33,6 +33,7 @@ import random
 import re
 from lxml import etree
+import numpy as np
 import PIL.Image
 import tensorflow as tf
@@ -44,6 +45,10 @@ flags.DEFINE_string('data_dir', '', 'Root directory to raw pet dataset.')
 flags.DEFINE_string('output_dir', '', 'Path to directory to output TFRecords.')
 flags.DEFINE_string('label_map_path', 'data/pet_label_map.pbtxt',
                    'Path to label map proto')
+flags.DEFINE_boolean('faces_only', True, 'If True, generates bounding boxes '
+                     'for pet faces.  Otherwise generates bounding boxes (as '
+                     'well as segmentations for full pet bodies).  Note that '
+                     'in the latter case, the resulting files are much larger.')
 FLAGS = flags.FLAGS
@@ -62,9 +67,11 @@ def get_class_name_from_filename(file_name):
 def dict_to_tf_example(data,
+                       mask_path,
                       label_map_dict,
                       image_subdirectory,
-                       ignore_difficult_instances=False):
+                       ignore_difficult_instances=False,
+                       faces_only=True):
  """Convert XML derived dict to tf.Example proto.
  Notice that this function normalizes the bounding box coordinates provided
@@ -73,11 +80,14 @@ def dict_to_tf_example(data,
  Args:
    data: dict holding PASCAL XML fields for a single image (obtained by
      running dataset_util.recursive_parse_xml_to_dict)
+    mask_path: String path to PNG encoded mask.
    label_map_dict: A map from string label names to integers ids.
    image_subdirectory: String specifying subdirectory within the
      Pascal dataset directory holding the actual image data.
    ignore_difficult_instances: Whether to skip difficult instances in the
      dataset  (default: False).
+    faces_only: If True, generates bounding boxes for pet faces.  Otherwise
+      generates bounding boxes (as well as segmentations for full pet bodies).
  Returns:
    example: The converted tf.Example.
@@ -94,36 +104,65 @@ def dict_to_tf_example(data,
    raise ValueError('Image format not JPEG')
  key = hashlib.sha256(encoded_jpg).hexdigest()
+  with tf.gfile.GFile(mask_path, 'rb') as fid:
+    encoded_mask_png = fid.read()
+  encoded_png_io = io.BytesIO(encoded_mask_png)
+  mask = PIL.Image.open(encoded_png_io)
+  if mask.format != 'PNG':
+    raise ValueError('Mask format not PNG')
+  mask_np = np.asarray(mask)
+  nonbackground_indices_x = np.any(mask_np != 2, axis=0)
+  nonbackground_indices_y = np.any(mask_np != 2, axis=1)
+  nonzero_x_indices = np.where(nonbackground_indices_x)
+  nonzero_y_indices = np.where(nonbackground_indices_y)
  width = int(data['size']['width'])
  height = int(data['size']['height'])
-  xmin = []
+  xmins = []
-  ymin = []
+  ymins = []
-  xmax = []
+  xmaxs = []
-  ymax = []
+  ymaxs = []
  classes = []
  classes_text = []
  truncated = []
  poses = []
  difficult_obj = []
+  masks = []
  for obj in data['object']:
    difficult = bool(int(obj['difficult']))
    if ignore_difficult_instances and difficult:
      continue
    difficult_obj.append(int(difficult))
-    xmin.append(float(obj['bndbox']['xmin']) / width)
+    if faces_only:
-    ymin.append(float(obj['bndbox']['ymin']) / height)
+      xmin = float(obj['bndbox']['xmin'])
-    xmax.append(float(obj['bndbox']['xmax']) / width)
+      xmax = float(obj['bndbox']['xmax'])
-    ymax.append(float(obj['bndbox']['ymax']) / height)
+      ymin = float(obj['bndbox']['ymin'])
+      ymax = float(obj['bndbox']['ymax'])
+    else:
+      xmin = float(np.min(nonzero_x_indices))
+      xmax = float(np.max(nonzero_x_indices))
+      ymin = float(np.min(nonzero_y_indices))
+      ymax = float(np.max(nonzero_y_indices))
+    xmins.append(xmin / width)
+    ymins.append(ymin / height)
+    xmaxs.append(xmax / width)
+    ymaxs.append(ymax / height)
    class_name = get_class_name_from_filename(data['filename'])
    classes_text.append(class_name.encode('utf8'))
    classes.append(label_map_dict[class_name])
    truncated.append(int(obj['truncated']))
    poses.append(obj['pose'].encode('utf8'))
+    if not faces_only:
+      mask_remapped = mask_np != 2
+      masks.append(mask_remapped)
+  # Flatten the masks once, after the loop, and only when there are any.
+  # np.stack raises ValueError on an empty list, which is always the case when
+  # faces_only is True; create_tf_record catches ValueError and skips the
+  # example, so every example used to be dropped in the default mode.
+  masks_flattened = np.zeros([0], dtype=np.float32)
+  if masks:
+    mask_stack = np.stack(masks).astype(np.float32)
+    masks_flattened = np.reshape(mask_stack, [-1])
-  example = tf.train.Example(features=tf.train.Features(feature={
+  feature_dict = {
      'image/height': dataset_util.int64_feature(height),
      'image/width': dataset_util.int64_feature(width),
      'image/filename': dataset_util.bytes_feature(
@@ -133,16 +172,20 @@ def dict_to_tf_example(data,
      'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
      'image/encoded': dataset_util.bytes_feature(encoded_jpg),
      'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
-      'image/object/bbox/xmin': dataset_util.float_list_feature(xmin),
+      'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
-      'image/object/bbox/xmax': dataset_util.float_list_feature(xmax),
+      'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
-      'image/object/bbox/ymin': dataset_util.float_list_feature(ymin),
+      'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
-      'image/object/bbox/ymax': dataset_util.float_list_feature(ymax),
+      'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
      'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
      'image/object/class/label': dataset_util.int64_list_feature(classes),
      'image/object/difficult': dataset_util.int64_list_feature(difficult_obj),
      'image/object/truncated': dataset_util.int64_list_feature(truncated),
      'image/object/view': dataset_util.bytes_list_feature(poses),
-  }))
+  }
+  if not faces_only:
+    feature_dict['image/object/mask'] = (
+        dataset_util.float_list_feature(masks_flattened.tolist()))
+  example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
  return example
@@ -150,7 +193,8 @@ def create_tf_record(output_filename,
                     label_map_dict,
                     annotations_dir,
                     image_dir,
-                     examples):
+                     examples,
+                     faces_only=True):
  """Creates a TFRecord file from examples.
  Args:
@@ -159,28 +203,35 @@ def create_tf_record(output_filename,
    annotations_dir: Directory where annotation files are stored.
    image_dir: Directory where image files are stored.
    examples: Examples to parse and save to tf record.
+    faces_only: If True, generates bounding boxes for pet faces.  Otherwise
+      generates bounding boxes (as well as segmentations for full pet bodies).
  """
  writer = tf.python_io.TFRecordWriter(output_filename)
  for idx, example in enumerate(examples):
    if idx % 100 == 0:
      logging.info('On image %d of %d', idx, len(examples))
-    path = os.path.join(annotations_dir, 'xmls', example + '.xml')
+    xml_path = os.path.join(annotations_dir, 'xmls', example + '.xml')
+    mask_path = os.path.join(annotations_dir, 'trimaps', example + '.png')
-    if not os.path.exists(path):
+    if not os.path.exists(xml_path):
-      logging.warning('Could not find %s, ignoring example.', path)
+      logging.warning('Could not find %s, ignoring example.', xml_path)
      continue
-    with tf.gfile.GFile(path, 'r') as fid:
+    with tf.gfile.GFile(xml_path, 'r') as fid:
      xml_str = fid.read()
    xml = etree.fromstring(xml_str)
    data = dataset_util.recursive_parse_xml_to_dict(xml)['annotation']
-    tf_example = dict_to_tf_example(data, label_map_dict, image_dir)
+    try:
+      tf_example = dict_to_tf_example(
+          data, mask_path, label_map_dict, image_dir, faces_only=faces_only)
      writer.write(tf_example.SerializeToString())
+    except ValueError:
+      logging.warning('Invalid example: %s, ignoring.', xml_path)
  writer.close()
-# TODO: Add test for pet/PASCAL main files.
+# TODO(derekjchow): Add test for pet/PASCAL main files.
 def main(_):
  data_dir = FLAGS.data_dir
  label_map_dict = label_map_util.get_label_map_dict(FLAGS.label_map_path)
@@ -204,10 +255,16 @@ def main(_):
  train_output_path = os.path.join(FLAGS.output_dir, 'pet_train.record')
  val_output_path = os.path.join(FLAGS.output_dir, 'pet_val.record')
+  # Masks are only written when faces_only is False, so only that mode uses
+  # the *_with_masks output names (the condition used to be inverted).
+  if not FLAGS.faces_only:
+    train_output_path = os.path.join(FLAGS.output_dir,
+                                     'pet_train_with_masks.record')
+    val_output_path = os.path.join(FLAGS.output_dir,
+                                   'pet_val_with_masks.record')
  create_tf_record(train_output_path, label_map_dict, annotations_dir,
-                   image_dir, train_examples)
+                   image_dir, train_examples, faces_only=FLAGS.faces_only)
  create_tf_record(val_output_path, label_map_dict, annotations_dir,
-                   image_dir, val_examples)
+                   image_dir, val_examples, faces_only=FLAGS.faces_only)
 if __name__ == '__main__':
  tf.app.run()
--- a/research/object_detection/g3doc/preparing_inputs.md
+++ b/research/object_detection/g3doc/preparing_inputs.md
@@ -16,11 +16,11 @@ below:
 # From tensorflow/models/research/
 wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
 tar -xvf VOCtrainval_11-May-2012.tar
-python object_detection/create_pascal_tf_record.py \
+python object_detection/dataset_tools/create_pascal_tf_record.py \
    --label_map_path=object_detection/data/pascal_label_map.pbtxt \
    --data_dir=VOCdevkit --year=VOC2012 --set=train \
    --output_path=pascal_train.record
-python object_detection/create_pascal_tf_record.py \
+python object_detection/dataset_tools/create_pascal_tf_record.py \
    --label_map_path=object_detection/data/pascal_label_map.pbtxt \
    --data_dir=VOCdevkit --year=VOC2012 --set=val \
    --output_path=pascal_val.record
@@ -44,7 +44,7 @@ wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz
 wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz
 tar -xvf annotations.tar.gz
 tar -xvf images.tar.gz
-python object_detection/create_pet_tf_record.py \
+python object_detection/dataset_tools/create_pet_tf_record.py \
    --label_map_path=object_detection/data/pet_label_map.pbtxt \
    --data_dir=`pwd` \
    --output_dir=`pwd`