"vscode:/vscode.git/clone" did not exist on "75f1f45396d4b8984caadef349f7226434d94485"
Unverified commit 8518d053, authored by pkulzc, committed by GitHub

Open source MnasFPN and minor fixes to OD API (#8484)

310447280  by lzc:

    Internal change

310420845  by Zhichao Lu:

    Open source the internal Context RCNN code.

--
310362339  by Zhichao Lu:

    Internal change

310259448  by lzc:

    Update required TF version for OD API.

--
310252159  by Zhichao Lu:

    Port patch_ops_test to TF1/TF2 as TPUs.

--
310247180  by Zhichao Lu:

    Ignore keypoint heatmap loss in the regions/bounding boxes with target keypoint
    class but no valid keypoint annotations.

--
310178294  by Zhichao Lu:

    Open source MnasFPN
    https://arxiv.org/abs/1912.01106

--
310094222  by lzc:

    Internal changes.

--
310085250  by lzc:

    Internal Change.

--
310016447  by huizhongc:

    Remove unrecognized classes from labeled_classes.

--
310009470  by rathodv:

    Mark batcher.py as TF1 only.

--
310001984  by rathodv:

    Update core/preprocessor.py to be compatible with TF1/TF2.

--
309455035  by Zhi...
parent ac5fff19
@@ -42,6 +42,8 @@ class InputDataFields(object):
     filename: original filename of the dataset (without common path).
     groundtruth_image_classes: image-level class labels.
     groundtruth_image_confidences: image-level class confidences.
+    groundtruth_labeled_classes: image-level annotation that indicates the
+      classes for which an image has been labeled.
     groundtruth_boxes: coordinates of the ground truth boxes in the image.
     groundtruth_classes: box-level class labels.
     groundtruth_confidences: box-level class confidences. The shape should be
@@ -61,6 +63,7 @@ class InputDataFields(object):
     groundtruth_instance_classes: instance mask-level class labels.
     groundtruth_keypoints: ground truth keypoints.
     groundtruth_keypoint_visibilities: ground truth keypoint visibilities.
+    groundtruth_keypoint_weights: groundtruth weight factor for keypoints.
     groundtruth_label_weights: groundtruth label weights.
     groundtruth_weights: groundtruth weight factor for bounding boxes.
     num_groundtruth_boxes: number of groundtruth boxes.
@@ -68,6 +71,11 @@ class InputDataFields(object):
     true_image_shapes: true shapes of images in the resized images, as resized
       images can be padded with zeros.
     multiclass_scores: the label score per class for each box.
+    context_features: a flattened list of contextual features.
+    context_feature_length: the fixed length of each feature in
+      context_features, used for reshaping.
+    valid_context_size: the valid context size, used in filtering the padded
+      context features.
   """
   image = 'image'
   image_additional_channels = 'image_additional_channels'
@@ -78,6 +86,7 @@ class InputDataFields(object):
   filename = 'filename'
   groundtruth_image_classes = 'groundtruth_image_classes'
   groundtruth_image_confidences = 'groundtruth_image_confidences'
+  groundtruth_labeled_classes = 'groundtruth_labeled_classes'
   groundtruth_boxes = 'groundtruth_boxes'
   groundtruth_classes = 'groundtruth_classes'
   groundtruth_confidences = 'groundtruth_confidences'
@@ -93,12 +102,16 @@ class InputDataFields(object):
   groundtruth_instance_classes = 'groundtruth_instance_classes'
   groundtruth_keypoints = 'groundtruth_keypoints'
   groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities'
+  groundtruth_keypoint_weights = 'groundtruth_keypoint_weights'
   groundtruth_label_weights = 'groundtruth_label_weights'
   groundtruth_weights = 'groundtruth_weights'
   num_groundtruth_boxes = 'num_groundtruth_boxes'
   is_annotated = 'is_annotated'
   true_image_shape = 'true_image_shape'
   multiclass_scores = 'multiclass_scores'
+  context_features = 'context_features'
+  context_feature_length = 'context_feature_length'
+  valid_context_size = 'valid_context_size'


 class DetectionResultFields(object):
@@ -115,6 +128,7 @@ class DetectionResultFields(object):
     detection_masks: contains a segmentation mask for each detection box.
     detection_boundaries: contains an object boundary for each detection box.
     detection_keypoints: contains detection keypoints for each detection box.
+    detection_keypoint_scores: contains detection keypoint scores.
     num_detections: number of detections in the batch.
     raw_detection_boxes: contains decoded detection boxes without Non-Max
       suppression.
@@ -134,6 +148,7 @@ class DetectionResultFields(object):
   detection_masks = 'detection_masks'
   detection_boundaries = 'detection_boundaries'
   detection_keypoints = 'detection_keypoints'
+  detection_keypoint_scores = 'detection_keypoint_scores'
   num_detections = 'num_detections'
   raw_detection_boxes = 'raw_detection_boxes'
   raw_detection_scores = 'raw_detection_scores'
@@ -164,6 +179,7 @@ class BoxListFields(object):
   masks = 'masks'
   boundaries = 'boundaries'
   keypoints = 'keypoints'
+  keypoint_visibilities = 'keypoint_visibilities'
   keypoint_heatmaps = 'keypoint_heatmaps'
   is_crowd = 'is_crowd'
@@ -201,6 +217,7 @@ class TfExampleFields(object):
     source_id: original source of the image
     image_class_text: image-level label in text format
     image_class_label: image-level label in numerical format
+    image_class_confidence: image-level confidence of the label
     object_class_text: labels in text format, e.g. ["person", "cat"]
     object_class_label: labels in numbers, e.g. [16, 8]
     object_bbox_xmin: xmin coordinates of groundtruth box, e.g. 10, 30
@@ -237,6 +254,7 @@ class TfExampleFields(object):
   source_id = 'image/source_id'
   image_class_text = 'image/class/text'
   image_class_label = 'image/class/label'
+  image_class_confidence = 'image/class/confidence'
   object_class_text = 'image/object/class/text'
   object_class_label = 'image/object/class/label'
   object_bbox_ymin = 'image/object/bbox/ymin'
...
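A note on the three new context fields: they travel together. Below is a minimal sketch (not part of this commit; the function name drop_padded_context is hypothetical) of dropping padded rows using valid_context_size, assuming context_features has already been reshaped to [max_context_size, context_feature_length] and that padding rows follow the valid rows:

import tensorflow as tf
from object_detection.core import standard_fields as fields

def drop_padded_context(tensor_dict, max_context_size):
  # valid_context_size is a scalar count of real (non-padding) context rows.
  feats = tensor_dict[fields.InputDataFields.context_features]
  valid = tensor_dict[fields.InputDataFields.valid_context_size]
  # Boolean mask of shape [max_context_size]: True for the first `valid` rows.
  mask = tf.sequence_mask(valid, maxlen=max_context_size)
  return tf.boolean_mask(feats, mask)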
@@ -44,12 +44,17 @@ from object_detection.box_coders import mean_stddev_box_coder
 from object_detection.core import box_coder
 from object_detection.core import box_list
 from object_detection.core import box_list_ops
+from object_detection.core import keypoint_ops
 from object_detection.core import matcher as mat
 from object_detection.core import region_similarity_calculator as sim_calc
 from object_detection.core import standard_fields as fields
 from object_detection.matchers import argmax_matcher
 from object_detection.matchers import bipartite_matcher
 from object_detection.utils import shape_utils
+from object_detection.utils import target_assigner_utils as ta_utils
+
+_DEFAULT_KEYPOINT_OFFSET_STD_DEV = 1.0


 class TargetAssigner(object):
...
@@ -25,6 +25,7 @@ from object_detection.core import standard_fields as fields
 from object_detection.core import target_assigner as targetassigner
 from object_detection.matchers import argmax_matcher
 from object_detection.matchers import bipartite_matcher
+from object_detection.utils import np_box_ops
 from object_detection.utils import test_case
@@ -65,10 +66,10 @@ class TargetAssignerTest(test_case.TestCase):
     self.assertAllClose(cls_weights_out, exp_cls_weights)
     self.assertAllClose(reg_targets_out, exp_reg_targets)
     self.assertAllClose(reg_weights_out, exp_reg_weights)
-    self.assertEquals(cls_targets_out.dtype, np.float32)
-    self.assertEquals(cls_weights_out.dtype, np.float32)
-    self.assertEquals(reg_targets_out.dtype, np.float32)
-    self.assertEquals(reg_weights_out.dtype, np.float32)
+    self.assertEqual(cls_targets_out.dtype, np.float32)
+    self.assertEqual(cls_weights_out.dtype, np.float32)
+    self.assertEqual(reg_targets_out.dtype, np.float32)
+    self.assertEqual(reg_weights_out.dtype, np.float32)

   def test_assign_class_agnostic_with_ignored_matches(self):
     # Note: test is very similar to above. The third box matched with an IOU
@@ -108,10 +109,10 @@ class TargetAssignerTest(test_case.TestCase):
     self.assertAllClose(cls_weights_out, exp_cls_weights)
     self.assertAllClose(reg_targets_out, exp_reg_targets)
     self.assertAllClose(reg_weights_out, exp_reg_weights)
-    self.assertEquals(cls_targets_out.dtype, np.float32)
-    self.assertEquals(cls_weights_out.dtype, np.float32)
-    self.assertEquals(reg_targets_out.dtype, np.float32)
-    self.assertEquals(reg_weights_out.dtype, np.float32)
+    self.assertEqual(cls_targets_out.dtype, np.float32)
+    self.assertEqual(cls_weights_out.dtype, np.float32)
+    self.assertEqual(reg_targets_out.dtype, np.float32)
+    self.assertEqual(reg_weights_out.dtype, np.float32)

   def test_assign_agnostic_with_keypoints(self):
     def graph_fn(anchor_means, groundtruth_box_corners,
@@ -158,10 +159,10 @@ class TargetAssignerTest(test_case.TestCase):
     self.assertAllClose(cls_weights_out, exp_cls_weights)
     self.assertAllClose(reg_targets_out, exp_reg_targets)
     self.assertAllClose(reg_weights_out, exp_reg_weights)
-    self.assertEquals(cls_targets_out.dtype, np.float32)
-    self.assertEquals(cls_weights_out.dtype, np.float32)
-    self.assertEquals(reg_targets_out.dtype, np.float32)
-    self.assertEquals(reg_weights_out.dtype, np.float32)
+    self.assertEqual(cls_targets_out.dtype, np.float32)
+    self.assertEqual(cls_weights_out.dtype, np.float32)
+    self.assertEqual(reg_targets_out.dtype, np.float32)
+    self.assertEqual(reg_weights_out.dtype, np.float32)

   def test_assign_class_agnostic_with_keypoints_and_ignored_matches(self):
     # Note: test is very similar to above. The third box matched with an IOU
@@ -213,10 +214,10 @@ class TargetAssignerTest(test_case.TestCase):
     self.assertAllClose(cls_weights_out, exp_cls_weights)
     self.assertAllClose(reg_targets_out, exp_reg_targets)
     self.assertAllClose(reg_weights_out, exp_reg_weights)
-    self.assertEquals(cls_targets_out.dtype, np.float32)
-    self.assertEquals(cls_weights_out.dtype, np.float32)
-    self.assertEquals(reg_targets_out.dtype, np.float32)
-    self.assertEquals(reg_weights_out.dtype, np.float32)
+    self.assertEqual(cls_targets_out.dtype, np.float32)
+    self.assertEqual(cls_weights_out.dtype, np.float32)
+    self.assertEqual(reg_targets_out.dtype, np.float32)
+    self.assertEqual(reg_weights_out.dtype, np.float32)

   def test_assign_multiclass(self):
@@ -271,10 +272,10 @@ class TargetAssignerTest(test_case.TestCase):
     self.assertAllClose(cls_weights_out, exp_cls_weights)
     self.assertAllClose(reg_targets_out, exp_reg_targets)
     self.assertAllClose(reg_weights_out, exp_reg_weights)
-    self.assertEquals(cls_targets_out.dtype, np.float32)
-    self.assertEquals(cls_weights_out.dtype, np.float32)
-    self.assertEquals(reg_targets_out.dtype, np.float32)
-    self.assertEquals(reg_weights_out.dtype, np.float32)
+    self.assertEqual(cls_targets_out.dtype, np.float32)
+    self.assertEqual(cls_weights_out.dtype, np.float32)
+    self.assertEqual(reg_targets_out.dtype, np.float32)
+    self.assertEqual(reg_weights_out.dtype, np.float32)

   def test_assign_multiclass_with_groundtruth_weights(self):
@@ -379,10 +380,10 @@ class TargetAssignerTest(test_case.TestCase):
     self.assertAllClose(cls_weights_out, exp_cls_weights)
     self.assertAllClose(reg_targets_out, exp_reg_targets)
     self.assertAllClose(reg_weights_out, exp_reg_weights)
-    self.assertEquals(cls_targets_out.dtype, np.float32)
-    self.assertEquals(cls_weights_out.dtype, np.float32)
-    self.assertEquals(reg_targets_out.dtype, np.float32)
-    self.assertEquals(reg_weights_out.dtype, np.float32)
+    self.assertEqual(cls_targets_out.dtype, np.float32)
+    self.assertEqual(cls_weights_out.dtype, np.float32)
+    self.assertEqual(reg_targets_out.dtype, np.float32)
+    self.assertEqual(reg_weights_out.dtype, np.float32)

   def test_assign_empty_groundtruth(self):
@@ -431,10 +432,10 @@ class TargetAssignerTest(test_case.TestCase):
     self.assertAllClose(cls_weights_out, exp_cls_weights)
     self.assertAllClose(reg_targets_out, exp_reg_targets)
     self.assertAllClose(reg_weights_out, exp_reg_weights)
-    self.assertEquals(cls_targets_out.dtype, np.float32)
-    self.assertEquals(cls_weights_out.dtype, np.float32)
-    self.assertEquals(reg_targets_out.dtype, np.float32)
-    self.assertEquals(reg_weights_out.dtype, np.float32)
+    self.assertEqual(cls_targets_out.dtype, np.float32)
+    self.assertEqual(cls_weights_out.dtype, np.float32)
+    self.assertEqual(reg_targets_out.dtype, np.float32)
+    self.assertEqual(reg_weights_out.dtype, np.float32)

   def test_raises_error_on_incompatible_groundtruth_boxes_and_labels(self):
     similarity_calc = region_similarity_calculator.NegSqDistSimilarity()
@@ -1228,5 +1229,7 @@ class CreateTargetAssignerTest(tf.test.TestCase):
         stage='invalid_stage')

 if __name__ == '__main__':
   tf.test.main()
item: {
  id: 1
  name: 'face'
  display_name: 'face'
  keypoints {
    id: 0
    label: "left_eye_center"
  }
  keypoints {
    id: 1
    label: "right_eye_center"
  }
  keypoints {
    id: 2
    label: "nose_tip"
  }
  keypoints {
    id: 3
    label: "mouth_center"
  }
  keypoints {
    id: 4
    label: "left_ear_tragion"
  }
  keypoints {
    id: 5
    label: "right_ear_tragion"
  }
}

item: {
  id: 2
  name: 'Person'
  display_name: 'PERSON'
  keypoints {
    id: 6
    label: "NOSE_TIP"
  }
  keypoints {
    id: 7
    label: "LEFT_EYE"
  }
  keypoints {
    id: 8
    label: "RIGHT_EYE"
  }
  keypoints {
    id: 9
    label: "LEFT_EAR_TRAGION"
  }
  keypoints {
    id: 10
    label: "RIGHT_EAR_TRAGION"
  }
  keypoints {
    id: 11
    label: "LEFT_SHOULDER"
  }
  keypoints {
    id: 12
    label: "RIGHT_SHOULDER"
  }
  keypoints {
    id: 13
    label: "LEFT_ELBOW"
  }
  keypoints {
    id: 14
    label: "RIGHT_ELBOW"
  }
  keypoints {
    id: 15
    label: "LEFT_WRIST"
  }
  keypoints {
    id: 16
    label: "RIGHT_WRIST"
  }
  keypoints {
    id: 17
    label: "LEFT_HIP"
  }
  keypoints {
    id: 18
    label: "RIGHT_HIP"
  }
  keypoints {
    id: 19
    label: "LEFT_KNEE"
  }
  keypoints {
    id: 20
    label: "RIGHT_KNEE"
  }
  keypoints {
    id: 21
    label: "LEFT_ANKLE"
  }
  keypoints {
    id: 22
    label: "RIGHT_ANKLE"
  }
}
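For reference, a hedged sketch of reading a keypoint-aware label map like the one above (the path is hypothetical, and this assumes the StringIntLabelMapItem proto in this release carries the repeated keypoints field shown in the text format):

from google.protobuf import text_format
from object_detection.protos import string_int_label_map_pb2

def load_keypoint_label_map(path):
  label_map = string_int_label_map_pb2.StringIntLabelMap()
  with open(path, 'r') as f:
    text_format.Merge(f.read(), label_map)
  # Map each class name to its {keypoint id: keypoint label} table.
  return {item.name: {kp.id: kp.label for kp in item.keypoints}
          for item in label_map.item}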
@@ -21,6 +21,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import enum
+import numpy as np
 from six.moves import zip
 import tensorflow as tf
@@ -29,7 +31,27 @@ from object_detection.core import standard_fields as fields
 from object_detection.protos import input_reader_pb2
 from object_detection.utils import label_map_util

-slim_example_decoder = tf.contrib.slim.tfexample_decoder
+# pylint: disable=g-import-not-at-top
+try:
+  from tensorflow.contrib import lookup as contrib_lookup
+  from tensorflow.contrib.slim import tfexample_decoder as slim_example_decoder
+except ImportError:
+  # TF 2.0 doesn't ship with contrib.
+  pass
+# pylint: enable=g-import-not-at-top
+
+
+class Visibility(enum.Enum):
+  """Visibility definitions.
+
+  This follows the MS Coco convention (http://cocodataset.org/#format-data).
+  """
+  # Keypoint is not labeled.
+  UNLABELED = 0
+  # Keypoint is labeled but falls outside the object segment (e.g. occluded).
+  NOT_VISIBLE = 1
+  # Keypoint is labeled and visible.
+  VISIBLE = 2


 class _ClassTensorHandler(slim_example_decoder.Tensor):
@@ -69,8 +91,8 @@ class _ClassTensorHandler(slim_example_decoder.Tensor):
       lookup = tf.compat.v2.lookup
       hash_table_class = tf.compat.v2.lookup.StaticHashTable
     except AttributeError:
-      lookup = tf.contrib.lookup
-      hash_table_class = tf.contrib.lookup.HashTable
+      lookup = contrib_lookup
+      hash_table_class = contrib_lookup.HashTable
     name_to_id_table = hash_table_class(
         initializer=lookup.KeyValueTensorInitializer(
             keys=tf.constant(list(name_to_id.keys())),
@@ -144,7 +166,8 @@ class TfExampleDecoder(data_decoder.DataDecoder):
                dct_method='',
                num_keypoints=0,
                num_additional_channels=0,
-               load_multiclass_scores=False):
+               load_multiclass_scores=False,
+               load_context_features=False):
     """Constructor sets keys_to_features and items_to_handlers.

     Args:
@@ -168,6 +191,9 @@ class TfExampleDecoder(data_decoder.DataDecoder):
       num_additional_channels: how many additional channels to use.
       load_multiclass_scores: Whether to load multiclass scores associated with
         boxes.
+      load_context_features: Whether to load information from context_features,
+        to provide additional context to a detection model for training and/or
+        inference.

     Raises:
       ValueError: If `instance_mask_type` option is not one of
@@ -197,6 +223,8 @@ class TfExampleDecoder(data_decoder.DataDecoder):
             tf.VarLenFeature(tf.string),
         'image/class/label':
             tf.VarLenFeature(tf.int64),
+        'image/class/confidence':
+            tf.VarLenFeature(tf.float32),
         # Object boxes and classes.
         'image/object/bbox/xmin':
             tf.VarLenFeature(tf.float32),
@@ -253,6 +281,9 @@ class TfExampleDecoder(data_decoder.DataDecoder):
             slim_example_decoder.Tensor('image/key/sha256')),
         fields.InputDataFields.filename: (
             slim_example_decoder.Tensor('image/filename')),
+        # Image-level labels.
+        fields.InputDataFields.groundtruth_image_confidences: (
+            slim_example_decoder.Tensor('image/class/confidence')),
         # Object boxes and classes.
         fields.InputDataFields.groundtruth_boxes: (
             slim_example_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
@@ -274,6 +305,20 @@ class TfExampleDecoder(data_decoder.DataDecoder):
           'image/object/class/multiclass_scores'] = tf.VarLenFeature(tf.float32)
       self.items_to_handlers[fields.InputDataFields.multiclass_scores] = (
           slim_example_decoder.Tensor('image/object/class/multiclass_scores'))
+    if load_context_features:
+      self.keys_to_features[
+          'image/context_features'] = tf.VarLenFeature(tf.float32)
+      self.items_to_handlers[fields.InputDataFields.context_features] = (
+          slim_example_decoder.ItemHandlerCallback(
+              ['image/context_features', 'image/context_feature_length'],
+              self._reshape_context_features))
+
+      self.keys_to_features[
+          'image/context_feature_length'] = tf.FixedLenFeature((), tf.int64)
+      self.items_to_handlers[fields.InputDataFields.context_feature_length] = (
+          slim_example_decoder.Tensor('image/context_feature_length'))
+
     if num_additional_channels > 0:
       self.keys_to_features[
           'image/additional_channels/encoded'] = tf.FixedLenFeature(
@@ -287,10 +332,17 @@ class TfExampleDecoder(data_decoder.DataDecoder):
           tf.VarLenFeature(tf.float32))
       self.keys_to_features['image/object/keypoint/y'] = (
           tf.VarLenFeature(tf.float32))
+      self.keys_to_features['image/object/keypoint/visibility'] = (
+          tf.VarLenFeature(tf.int64))
       self.items_to_handlers[fields.InputDataFields.groundtruth_keypoints] = (
           slim_example_decoder.ItemHandlerCallback(
               ['image/object/keypoint/y', 'image/object/keypoint/x'],
               self._reshape_keypoints))
+      kpt_vis_field = fields.InputDataFields.groundtruth_keypoint_visibilities
+      self.items_to_handlers[kpt_vis_field] = (
+          slim_example_decoder.ItemHandlerCallback(
+              ['image/object/keypoint/x', 'image/object/keypoint/visibility'],
+              self._reshape_keypoint_visibilities))
     if load_instance_masks:
       if instance_mask_type in (input_reader_pb2.DEFAULT,
                                 input_reader_pb2.NUMERICAL_MASKS):
@@ -363,6 +415,9 @@ class TfExampleDecoder(data_decoder.DataDecoder):
         [None] indicating if the boxes enclose a crowd.

       Optional:
+      fields.InputDataFields.groundtruth_image_confidences - 1D float tensor of
+        shape [None] indicating if a class is present in the image (1.0) or
+        a class is not present in the image (0.0).
       fields.InputDataFields.image_additional_channels - 3D uint8 tensor of
         shape [None, None, num_additional_channels]. 1st dim is height; 2nd dim
         is width; 3rd dim is the number of additional channels.
@@ -371,8 +426,10 @@ class TfExampleDecoder(data_decoder.DataDecoder):
       fields.InputDataFields.groundtruth_group_of - 1D bool tensor of shape
         [None] indicating if the boxes represent `group_of` instances.
       fields.InputDataFields.groundtruth_keypoints - 3D float32 tensor of
-        shape [None, None, 2] containing keypoints, where the coordinates of
-        the keypoints are ordered (y, x).
+        shape [None, num_keypoints, 2] containing keypoints, where the
+        coordinates of the keypoints are ordered (y, x).
+      fields.InputDataFields.groundtruth_keypoint_visibilities - 2D bool
+        tensor of shape [None, num_keypoints] containing keypoint visibilities.
       fields.InputDataFields.groundtruth_instance_masks - 3D float32 tensor of
         shape [None, None, None] containing instance masks.
       fields.InputDataFields.groundtruth_image_classes - 1D uint64 of shape
@@ -380,6 +437,10 @@ class TfExampleDecoder(data_decoder.DataDecoder):
       fields.InputDataFields.multiclass_scores - 1D float32 tensor of shape
         [None * num_classes] containing flattened multiclass scores for
         groundtruth boxes.
+      fields.InputDataFields.context_features - 1D float32 tensor of shape
+        [context_feature_length * num_context_features].
+      fields.InputDataFields.context_feature_length - int32 tensor specifying
+        the length of each feature in context_features.
     """
     serialized_example = tf.reshape(tf_example_string_tensor, shape=[])
     decoder = slim_example_decoder.TFExampleDecoder(self.keys_to_features,
@@ -410,20 +471,34 @@ class TfExampleDecoder(data_decoder.DataDecoder):
             tensor_dict[fields.InputDataFields.groundtruth_weights])[0],
         0), lambda: tensor_dict[fields.InputDataFields.groundtruth_weights],
         default_groundtruth_weights)
+
+    if fields.InputDataFields.groundtruth_keypoints in tensor_dict:
+      # Set all keypoints that are not labeled to NaN.
+      gt_kpt_fld = fields.InputDataFields.groundtruth_keypoints
+      gt_kpt_vis_fld = fields.InputDataFields.groundtruth_keypoint_visibilities
+      visibilities_tiled = tf.tile(
+          tf.expand_dims(tensor_dict[gt_kpt_vis_fld], -1),
+          [1, 1, 2])
+      tensor_dict[gt_kpt_fld] = tf.where(
+          visibilities_tiled,
+          tensor_dict[gt_kpt_fld],
+          np.nan * tf.ones_like(tensor_dict[gt_kpt_fld]))
+
     return tensor_dict

   def _reshape_keypoints(self, keys_to_tensors):
     """Reshape keypoints.

-    The instance segmentation masks are reshaped to [num_instances,
-    num_keypoints, 2].
+    The keypoints are reshaped to [num_instances, num_keypoints, 2].

     Args:
-      keys_to_tensors: a dictionary from keys to tensors.
+      keys_to_tensors: a dictionary from keys to tensors. Expected keys are:
+        'image/object/keypoint/x'
+        'image/object/keypoint/y'

     Returns:
       A 3-D float tensor of shape [num_instances, num_keypoints, 2] with values
-        in {0, 1}.
+        in [0, 1].
     """
     y = keys_to_tensors['image/object/keypoint/y']
     if isinstance(y, tf.SparseTensor):
@@ -437,6 +512,54 @@ class TfExampleDecoder(data_decoder.DataDecoder):
     keypoints = tf.reshape(keypoints, [-1, self._num_keypoints, 2])
     return keypoints

+  def _reshape_keypoint_visibilities(self, keys_to_tensors):
+    """Reshape keypoint visibilities.
+
+    The keypoint visibilities are reshaped to [num_instances, num_keypoints].
+
+    The raw keypoint visibilities are expected to conform to the MS Coco
+    definition. See Visibility enum.
+
+    The returned boolean is True for the labeled case (either
+    Visibility.NOT_VISIBLE or Visibility.VISIBLE). These are the same
+    categories that COCO uses to evaluate keypoint detection performance:
+    http://cocodataset.org/#keypoints-eval
+
+    If image/object/keypoint/visibility is not provided, visibilities will be
+    set to True for finite keypoint coordinate values, and False if the
+    coordinates are NaN.
+
+    Args:
+      keys_to_tensors: a dictionary from keys to tensors. Expected keys are:
+        'image/object/keypoint/x'
+        'image/object/keypoint/visibility'
+
+    Returns:
+      A 2-D bool tensor of shape [num_instances, num_keypoints] with values
+        in {0, 1}. 1 if the keypoint is labeled, 0 otherwise.
+    """
+    x = keys_to_tensors['image/object/keypoint/x']
+    vis = keys_to_tensors['image/object/keypoint/visibility']
+    if isinstance(vis, tf.SparseTensor):
+      vis = tf.sparse_tensor_to_dense(vis)
+    if isinstance(x, tf.SparseTensor):
+      x = tf.sparse_tensor_to_dense(x)
+
+    default_vis = tf.where(
+        tf.math.is_nan(x),
+        Visibility.UNLABELED.value * tf.ones_like(x, dtype=tf.int64),
+        Visibility.VISIBLE.value * tf.ones_like(x, dtype=tf.int64))
+    # Use visibility if provided, otherwise use the default visibility.
+    vis = tf.cond(tf.equal(tf.size(x), tf.size(vis)),
+                  true_fn=lambda: vis,
+                  false_fn=lambda: default_vis)
+
+    vis = tf.math.logical_or(
+        tf.math.equal(vis, Visibility.NOT_VISIBLE.value),
+        tf.math.equal(vis, Visibility.VISIBLE.value))
+    vis = tf.reshape(vis, [-1, self._num_keypoints])
+    return vis
+
   def _reshape_instance_masks(self, keys_to_tensors):
     """Reshape instance segmentation masks.
@@ -460,6 +583,26 @@ class TfExampleDecoder(data_decoder.DataDecoder):
           tf.cast(tf.greater(masks, 0.0), dtype=tf.float32), to_shape)
       return tf.cast(masks, tf.float32)

+  def _reshape_context_features(self, keys_to_tensors):
+    """Reshape context features.
+
+    The instance context_features are reshaped to
+    [num_context_features, context_feature_length].
+
+    Args:
+      keys_to_tensors: a dictionary from keys to tensors.
+
+    Returns:
+      A 2-D float tensor of shape [num_context_features,
+        context_feature_length].
+    """
+    context_feature_length = keys_to_tensors['image/context_feature_length']
+    to_shape = tf.cast(tf.stack([-1, context_feature_length]), tf.int32)
+    context_features = keys_to_tensors['image/context_features']
+    if isinstance(context_features, tf.SparseTensor):
+      context_features = tf.sparse_tensor_to_dense(context_features)
+    context_features = tf.reshape(context_features, to_shape)
+    return context_features
+
   def _decode_png_instance_masks(self, keys_to_tensors):
     """Decode PNG instance segmentation masks and stack into dense tensor.
...
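Taken together, the decoder changes are exercised like this minimal usage sketch (mirroring the tests below; `serialized` stands in for a serialized tf.Example string carrying the keys introduced above):

import tensorflow as tf
from object_detection.core import standard_fields as fields
from object_detection.data_decoders import tf_example_decoder

decoder = tf_example_decoder.TfExampleDecoder(
    num_keypoints=17,            # also wires up the visibility handler
    load_context_features=True)  # parses image/context_features(_length)
tensor_dict = decoder.decode(tf.convert_to_tensor(serialized))

# Unlabeled keypoints come back as NaN; visibilities as a bool tensor.
keypoints = tensor_dict[fields.InputDataFields.groundtruth_keypoints]
visibilities = tensor_dict[
    fields.InputDataFields.groundtruth_keypoint_visibilities]
context = tensor_dict[fields.InputDataFields.context_features]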
@@ -24,8 +24,6 @@ from object_detection.data_decoders import tf_example_decoder
 from object_detection.protos import input_reader_pb2
 from object_detection.utils import dataset_util

-slim_example_decoder = tf.contrib.slim.tfexample_decoder
-

 class TfExampleDecoderTest(tf.test.TestCase):
@@ -265,6 +263,68 @@ class TfExampleDecoderTest(tf.test.TestCase):
         tensor_dict[fields.InputDataFields.groundtruth_boxes])

   def testDecodeKeypoint(self):
+    image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
+    encoded_jpeg = self._EncodeImage(image_tensor)
+    bbox_ymins = [0.0, 4.0]
+    bbox_xmins = [1.0, 5.0]
+    bbox_ymaxs = [2.0, 6.0]
+    bbox_xmaxs = [3.0, 7.0]
+    keypoint_ys = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
+    keypoint_xs = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
+    keypoint_visibility = [1, 2, 0, 1, 0, 2]
+    example = tf.train.Example(
+        features=tf.train.Features(
+            feature={
+                'image/encoded':
+                    dataset_util.bytes_feature(encoded_jpeg),
+                'image/format':
+                    dataset_util.bytes_feature(six.b('jpeg')),
+                'image/object/bbox/ymin':
+                    dataset_util.float_list_feature(bbox_ymins),
+                'image/object/bbox/xmin':
+                    dataset_util.float_list_feature(bbox_xmins),
+                'image/object/bbox/ymax':
+                    dataset_util.float_list_feature(bbox_ymaxs),
+                'image/object/bbox/xmax':
+                    dataset_util.float_list_feature(bbox_xmaxs),
+                'image/object/keypoint/y':
+                    dataset_util.float_list_feature(keypoint_ys),
+                'image/object/keypoint/x':
+                    dataset_util.float_list_feature(keypoint_xs),
+                'image/object/keypoint/visibility':
+                    dataset_util.int64_list_feature(keypoint_visibility),
+            })).SerializeToString()
+
+    example_decoder = tf_example_decoder.TfExampleDecoder(num_keypoints=3)
+    tensor_dict = example_decoder.decode(tf.convert_to_tensor(example))
+
+    self.assertAllEqual((tensor_dict[fields.InputDataFields.groundtruth_boxes]
+                         .get_shape().as_list()), [None, 4])
+    self.assertAllEqual(
+        (tensor_dict[fields.InputDataFields.groundtruth_keypoints].get_shape()
+         .as_list()), [2, 3, 2])
+    with self.test_session() as sess:
+      tensor_dict = sess.run(tensor_dict)
+
+    expected_boxes = np.vstack([bbox_ymins, bbox_xmins, bbox_ymaxs,
+                                bbox_xmaxs]).transpose()
+    self.assertAllEqual(expected_boxes,
+                        tensor_dict[fields.InputDataFields.groundtruth_boxes])
+    expected_keypoints = [
+        [[0.0, 1.0], [1.0, 2.0], [np.nan, np.nan]],
+        [[3.0, 4.0], [np.nan, np.nan], [5.0, 6.0]]]
+    self.assertAllClose(
+        expected_keypoints,
+        tensor_dict[fields.InputDataFields.groundtruth_keypoints])
+    expected_visibility = (
+        (np.array(keypoint_visibility) > 0).reshape((2, 3)))
+    self.assertAllEqual(
+        expected_visibility,
+        tensor_dict[fields.InputDataFields.groundtruth_keypoint_visibilities])
+
+  def testDecodeKeypointNoVisibilities(self):
     image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
     encoded_jpeg = self._EncodeImage(image_tensor)
     bbox_ymins = [0.0, 4.0]
@@ -316,6 +376,11 @@ class TfExampleDecoderTest(tf.test.TestCase):
         expected_keypoints,
         tensor_dict[fields.InputDataFields.groundtruth_keypoints])
+    expected_visibility = np.ones((2, 3))
+    self.assertAllEqual(
+        expected_visibility,
+        tensor_dict[fields.InputDataFields.groundtruth_keypoint_visibilities])

   def testDecodeDefaultGroundtruthWeights(self):
     image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
     encoded_jpeg = self._EncodeImage(image_tensor)
@@ -841,6 +906,34 @@ class TfExampleDecoderTest(tf.test.TestCase):
     self.assertAllEqual(object_weights,
                         tensor_dict[fields.InputDataFields.groundtruth_weights])

+  def testDecodeClassConfidence(self):
+    image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
+    encoded_jpeg = self._EncodeImage(image_tensor)
+    class_confidence = [0.0, 1.0, 0.0]
+
+    example = tf.train.Example(
+        features=tf.train.Features(
+            feature={
+                'image/encoded':
+                    dataset_util.bytes_feature(encoded_jpeg),
+                'image/format':
+                    dataset_util.bytes_feature(six.b('jpeg')),
+                'image/class/confidence':
+                    dataset_util.float_list_feature(class_confidence),
+            })).SerializeToString()
+
+    example_decoder = tf_example_decoder.TfExampleDecoder()
+    tensor_dict = example_decoder.decode(tf.convert_to_tensor(example))
+    self.assertAllEqual(
+        (tensor_dict[fields.InputDataFields.groundtruth_image_confidences]
+         .get_shape().as_list()), [3])
+    with self.test_session() as sess:
+      tensor_dict = sess.run(tensor_dict)
+
+    self.assertAllEqual(
+        class_confidence,
+        tensor_dict[fields.InputDataFields.groundtruth_image_confidences])
+
   def testDecodeInstanceSegmentation(self):
     num_instances = 4
     image_height = 5
@@ -992,6 +1085,87 @@ class TfExampleDecoderTest(tf.test.TestCase):
         tensor_dict[fields.InputDataFields.groundtruth_image_classes],
         np.array([1, 3]))

+  def testDecodeContextFeatures(self):
+    image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
+    encoded_jpeg = self._EncodeImage(image_tensor)
+    bbox_ymins = [0.0, 4.0]
+    bbox_xmins = [1.0, 5.0]
+    bbox_ymaxs = [2.0, 6.0]
+    bbox_xmaxs = [3.0, 7.0]
+    num_features = 8
+    context_feature_length = 10
+    context_features = np.random.random(num_features * context_feature_length)
+    example = tf.train.Example(
+        features=tf.train.Features(
+            feature={
+                'image/encoded':
+                    dataset_util.bytes_feature(encoded_jpeg),
+                'image/format':
+                    dataset_util.bytes_feature(six.b('jpeg')),
+                'image/context_features':
+                    dataset_util.float_list_feature(context_features),
+                'image/context_feature_length':
+                    dataset_util.int64_feature(context_feature_length),
+                'image/object/bbox/ymin':
+                    dataset_util.float_list_feature(bbox_ymins),
+                'image/object/bbox/xmin':
+                    dataset_util.float_list_feature(bbox_xmins),
+                'image/object/bbox/ymax':
+                    dataset_util.float_list_feature(bbox_ymaxs),
+                'image/object/bbox/xmax':
+                    dataset_util.float_list_feature(bbox_xmaxs),
+            })).SerializeToString()
+
+    example_decoder = tf_example_decoder.TfExampleDecoder(
+        load_context_features=True)
+    tensor_dict = example_decoder.decode(tf.convert_to_tensor(example))
+    with self.test_session() as sess:
+      tensor_dict = sess.run(tensor_dict)
+
+    self.assertAllClose(
+        context_features.reshape(num_features, context_feature_length),
+        tensor_dict[fields.InputDataFields.context_features])
+    self.assertAllEqual(
+        context_feature_length,
+        tensor_dict[fields.InputDataFields.context_feature_length])
+
+  def testContextFeaturesNotAvailableByDefault(self):
+    image_tensor = np.random.randint(256, size=(4, 5, 3)).astype(np.uint8)
+    encoded_jpeg = self._EncodeImage(image_tensor)
+    bbox_ymins = [0.0, 4.0]
+    bbox_xmins = [1.0, 5.0]
+    bbox_ymaxs = [2.0, 6.0]
+    bbox_xmaxs = [3.0, 7.0]
+    num_features = 10
+    context_feature_length = 10
+    context_features = np.random.random(num_features * context_feature_length)
+    example = tf.train.Example(
+        features=tf.train.Features(
+            feature={
+                'image/encoded':
+                    dataset_util.bytes_feature(encoded_jpeg),
+                'image/format':
+                    dataset_util.bytes_feature(six.b('jpeg')),
+                'image/context_features':
+                    dataset_util.float_list_feature(context_features),
+                'image/context_feature_length':
+                    dataset_util.int64_feature(context_feature_length),
+                'image/object/bbox/ymin':
+                    dataset_util.float_list_feature(bbox_ymins),
+                'image/object/bbox/xmin':
+                    dataset_util.float_list_feature(bbox_xmins),
+                'image/object/bbox/ymax':
+                    dataset_util.float_list_feature(bbox_ymaxs),
+                'image/object/bbox/xmax':
+                    dataset_util.float_list_feature(bbox_xmaxs),
+            })).SerializeToString()
+
+    example_decoder = tf_example_decoder.TfExampleDecoder()
+    tensor_dict = example_decoder.decode(tf.convert_to_tensor(example))
+    with self.test_session() as sess:
+      tensor_dict = sess.run(tensor_dict)
+    self.assertNotIn(fields.InputDataFields.context_features,
+                     tensor_dict)

 if __name__ == '__main__':
   tf.test.main()
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
r"""Convert raw COCO dataset to TFRecord for object_detection. r"""Convert raw COCO dataset to TFRecord for object_detection.
Please note that this tool creates sharded output files. Please note that this tool creates sharded output files.
...@@ -34,6 +33,7 @@ from __future__ import print_function ...@@ -34,6 +33,7 @@ from __future__ import print_function
import hashlib import hashlib
import io import io
import json import json
import logging
import os import os
import contextlib2 import contextlib2
import numpy as np import numpy as np
...@@ -46,57 +46,69 @@ from object_detection.dataset_tools import tf_record_creation_util ...@@ -46,57 +46,69 @@ from object_detection.dataset_tools import tf_record_creation_util
from object_detection.utils import dataset_util from object_detection.utils import dataset_util
from object_detection.utils import label_map_util from object_detection.utils import label_map_util
flags = tf.app.flags flags = tf.app.flags
tf.flags.DEFINE_boolean('include_masks', False, tf.flags.DEFINE_boolean(
'Whether to include instance segmentations masks ' 'include_masks', False, 'Whether to include instance segmentations masks '
'(PNG encoded) in the result. default: False.') '(PNG encoded) in the result. default: False.')
tf.flags.DEFINE_string('train_image_dir', '', tf.flags.DEFINE_string('train_image_dir', '', 'Training image directory.')
'Training image directory.') tf.flags.DEFINE_string('val_image_dir', '', 'Validation image directory.')
tf.flags.DEFINE_string('val_image_dir', '', tf.flags.DEFINE_string('test_image_dir', '', 'Test image directory.')
'Validation image directory.')
tf.flags.DEFINE_string('test_image_dir', '',
'Test image directory.')
tf.flags.DEFINE_string('train_annotations_file', '', tf.flags.DEFINE_string('train_annotations_file', '',
'Training annotations JSON file.') 'Training annotations JSON file.')
tf.flags.DEFINE_string('val_annotations_file', '', tf.flags.DEFINE_string('val_annotations_file', '',
'Validation annotations JSON file.') 'Validation annotations JSON file.')
tf.flags.DEFINE_string('testdev_annotations_file', '', tf.flags.DEFINE_string('testdev_annotations_file', '',
'Test-dev annotations JSON file.') 'Test-dev annotations JSON file.')
tf.flags.DEFINE_string('train_keypoint_annotations_file', '',
'Training annotations JSON file.')
tf.flags.DEFINE_string('val_keypoint_annotations_file', '',
'Validation annotations JSON file.')
tf.flags.DEFINE_string('output_dir', '/tmp/', 'Output data directory.') tf.flags.DEFINE_string('output_dir', '/tmp/', 'Output data directory.')
FLAGS = flags.FLAGS FLAGS = flags.FLAGS
tf.logging.set_verbosity(tf.logging.INFO) logger = tf.get_logger()
logger.setLevel(logging.INFO)
_COCO_KEYPOINT_NAMES = [
b'nose', b'left_eye', b'right_eye', b'left_ear', b'right_ear',
b'left_shoulder', b'right_shoulder', b'left_elbow', b'right_elbow',
b'left_wrist', b'right_wrist', b'left_hip', b'right_hip',
b'left_knee', b'right_knee', b'left_ankle', b'right_ankle'
]
def create_tf_example(image, def create_tf_example(image,
annotations_list, annotations_list,
image_dir, image_dir,
category_index, category_index,
include_masks=False): include_masks=False,
keypoint_annotations_dict=None):
"""Converts image and annotations to a tf.Example proto. """Converts image and annotations to a tf.Example proto.
Args: Args:
image: dict with keys: image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
[u'license', u'file_name', u'coco_url', u'height', u'width', u'width', u'date_captured', u'flickr_url', u'id']
u'date_captured', u'flickr_url', u'id']
annotations_list: annotations_list:
list of dicts with keys: list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
[u'segmentation', u'area', u'iscrowd', u'image_id', u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
u'bbox', u'category_id', u'id'] coordinates in the official COCO dataset are given as [x, y, width,
Notice that bounding box coordinates in the official COCO dataset are height] tuples using absolute coordinates where x, y represent the
given as [x, y, width, height] tuples using absolute coordinates where top-left (0-indexed) corner. This function converts to the format
x, y represent the top-left (0-indexed) corner. This function converts expected by the Tensorflow Object Detection API (which is which is
to the format expected by the Tensorflow Object Detection API (which is [ymin, xmin, ymax, xmax] with coordinates normalized relative to image
which is [ymin, xmin, ymax, xmax] with coordinates normalized relative size).
to image size).
image_dir: directory containing the image files. image_dir: directory containing the image files.
category_index: a dict containing COCO category information keyed category_index: a dict containing COCO category information keyed by the
by the 'id' field of each category. See the 'id' field of each category. See the label_map_util.create_category_index
label_map_util.create_category_index function. function.
include_masks: Whether to include instance segmentations masks include_masks: Whether to include instance segmentations masks
(PNG encoded) in the result. default: False. (PNG encoded) in the result. default: False.
keypoint_annotations_dict: A dictionary that maps from annotation_id to a
dictionary with keys: [u'keypoints', u'num_keypoints'] represeting the
keypoint information for this person object annotation. If None, then
no keypoint annotations will be populated.
Returns: Returns:
example: The converted tf.Example example: The converted tf.Example
num_annotations_skipped: Number of (invalid) annotations that were ignored. num_annotations_skipped: Number of (invalid) annotations that were ignored.
...@@ -125,7 +137,15 @@ def create_tf_example(image, ...@@ -125,7 +137,15 @@ def create_tf_example(image,
category_ids = [] category_ids = []
area = [] area = []
encoded_mask_png = [] encoded_mask_png = []
keypoints_x = []
keypoints_y = []
keypoints_visibility = []
keypoints_name = []
num_keypoints = []
include_keypoint = keypoint_annotations_dict is not None
num_annotations_skipped = 0 num_annotations_skipped = 0
num_keypoint_annotation_used = 0
num_keypoint_annotation_skipped = 0
for object_annotations in annotations_list: for object_annotations in annotations_list:
(x, y, width, height) = tuple(object_annotations['bbox']) (x, y, width, height) = tuple(object_annotations['bbox'])
if width <= 0 or height <= 0: if width <= 0 or height <= 0:
...@@ -154,6 +174,29 @@ def create_tf_example(image, ...@@ -154,6 +174,29 @@ def create_tf_example(image,
output_io = io.BytesIO() output_io = io.BytesIO()
pil_image.save(output_io, format='PNG') pil_image.save(output_io, format='PNG')
encoded_mask_png.append(output_io.getvalue()) encoded_mask_png.append(output_io.getvalue())
if include_keypoint:
annotation_id = object_annotations['id']
if annotation_id in keypoint_annotations_dict:
num_keypoint_annotation_used += 1
keypoint_annotations = keypoint_annotations_dict[annotation_id]
keypoints = keypoint_annotations['keypoints']
num_kpts = keypoint_annotations['num_keypoints']
keypoints_x_abs = keypoints[::3]
keypoints_x.extend(
[float(x_abs) / image_width for x_abs in keypoints_x_abs])
keypoints_y_abs = keypoints[1::3]
keypoints_y.extend(
[float(y_abs) / image_height for y_abs in keypoints_y_abs])
keypoints_visibility.extend(keypoints[2::3])
keypoints_name.extend(_COCO_KEYPOINT_NAMES)
num_keypoints.append(num_kpts)
else:
keypoints_x.extend([0.0] * len(_COCO_KEYPOINT_NAMES))
keypoints_y.extend([0.0] * len(_COCO_KEYPOINT_NAMES))
keypoints_visibility.extend([0] * len(_COCO_KEYPOINT_NAMES))
keypoints_name.extend(_COCO_KEYPOINT_NAMES)
num_keypoints.append(0)
feature_dict = { feature_dict = {
'image/height': 'image/height':
dataset_util.int64_feature(image_height), dataset_util.int64_feature(image_height),
...@@ -187,12 +230,28 @@ def create_tf_example(image, ...@@ -187,12 +230,28 @@ def create_tf_example(image,
if include_masks: if include_masks:
feature_dict['image/object/mask'] = ( feature_dict['image/object/mask'] = (
dataset_util.bytes_list_feature(encoded_mask_png)) dataset_util.bytes_list_feature(encoded_mask_png))
if include_keypoint:
feature_dict['image/object/keypoint/x'] = (
dataset_util.float_list_feature(keypoints_x))
feature_dict['image/object/keypoint/y'] = (
dataset_util.float_list_feature(keypoints_y))
feature_dict['image/object/keypoint/num'] = (
dataset_util.int64_list_feature(num_keypoints))
feature_dict['image/object/keypoint/visibility'] = (
dataset_util.int64_list_feature(keypoints_visibility))
feature_dict['image/object/keypoint/text'] = (
dataset_util.bytes_list_feature(keypoints_name))
num_keypoint_annotation_skipped = (
len(keypoint_annotations_dict) - num_keypoint_annotation_used)
example = tf.train.Example(features=tf.train.Features(feature=feature_dict)) example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
return key, example, num_annotations_skipped return key, example, num_annotations_skipped, num_keypoint_annotation_skipped
def _create_tf_record_from_coco_annotations( def _create_tf_record_from_coco_annotations(annotations_file, image_dir,
annotations_file, image_dir, output_path, include_masks, num_shards): output_path, include_masks,
num_shards,
keypoint_annotations_file=''):
"""Loads COCO annotation json files and converts to tf.Record format. """Loads COCO annotation json files and converts to tf.Record format.
Args: Args:
...@@ -202,6 +261,9 @@ def _create_tf_record_from_coco_annotations( ...@@ -202,6 +261,9 @@ def _create_tf_record_from_coco_annotations(
include_masks: Whether to include instance segmentations masks include_masks: Whether to include instance segmentations masks
(PNG encoded) in the result. default: False. (PNG encoded) in the result. default: False.
num_shards: number of output file shards. num_shards: number of output file shards.
keypoint_annotations_file: JSON file containing the person keypoint
annotations. If empty, then no person keypoint annotations will be
generated.
""" """
with contextlib2.ExitStack() as tf_record_close_stack, \ with contextlib2.ExitStack() as tf_record_close_stack, \
tf.gfile.GFile(annotations_file, 'r') as fid: tf.gfile.GFile(annotations_file, 'r') as fid:
...@@ -214,8 +276,7 @@ def _create_tf_record_from_coco_annotations( ...@@ -214,8 +276,7 @@ def _create_tf_record_from_coco_annotations(
annotations_index = {} annotations_index = {}
if 'annotations' in groundtruth_data: if 'annotations' in groundtruth_data:
tf.logging.info( logging.info('Found groundtruth annotations. Building annotations index.')
'Found groundtruth annotations. Building annotations index.')
for annotation in groundtruth_data['annotations']: for annotation in groundtruth_data['annotations']:
image_id = annotation['image_id'] image_id = annotation['image_id']
if image_id not in annotations_index: if image_id not in annotations_index:
...@@ -227,21 +288,43 @@ def _create_tf_record_from_coco_annotations( ...@@ -227,21 +288,43 @@ def _create_tf_record_from_coco_annotations(
if image_id not in annotations_index: if image_id not in annotations_index:
missing_annotation_count += 1 missing_annotation_count += 1
annotations_index[image_id] = [] annotations_index[image_id] = []
tf.logging.info('%d images are missing annotations.', logging.info('%d images are missing annotations.', missing_annotation_count)
missing_annotation_count)
    keypoint_annotations_index = {}
    if keypoint_annotations_file:
      with tf.gfile.GFile(keypoint_annotations_file, 'r') as kid:
        keypoint_groundtruth_data = json.load(kid)
      if 'annotations' in keypoint_groundtruth_data:
        for annotation in keypoint_groundtruth_data['annotations']:
          image_id = annotation['image_id']
          if image_id not in keypoint_annotations_index:
            keypoint_annotations_index[image_id] = {}
          keypoint_annotations_index[image_id][annotation['id']] = annotation

    total_num_annotations_skipped = 0
    total_num_keypoint_annotations_skipped = 0
    for idx, image in enumerate(images):
      if idx % 100 == 0:
        logging.info('On image %d of %d', idx, len(images))
      annotations_list = annotations_index[image['id']]
      keypoint_annotations_dict = None
      if keypoint_annotations_file:
        keypoint_annotations_dict = {}
        if image['id'] in keypoint_annotations_index:
          keypoint_annotations_dict = keypoint_annotations_index[image['id']]
      (_, tf_example, num_annotations_skipped,
       num_keypoint_annotations_skipped) = create_tf_example(
           image, annotations_list, image_dir, category_index, include_masks,
           keypoint_annotations_dict)
      total_num_annotations_skipped += num_annotations_skipped
      total_num_keypoint_annotations_skipped += num_keypoint_annotations_skipped
      shard_idx = idx % num_shards
      output_tfrecords[shard_idx].write(tf_example.SerializeToString())
    logging.info('Finished writing, skipped %d annotations.',
                 total_num_annotations_skipped)
    if keypoint_annotations_file:
      logging.info('Finished writing, skipped %d keypoint annotations.',
                   total_num_keypoint_annotations_skipped)
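
The converter above consumes the standard COCO person-keypoints layout: each annotation stores its keypoints as one flat [x1, y1, v1, x2, y2, v2, ...] list. A minimal sketch of that parsing step under those assumptions (the helper name and normalization are illustrative, not part of this file):

import numpy as np

def parse_coco_keypoints(keypoints, image_width, image_height):
  """Splits flat COCO [x, y, v] triplets into normalized coords and visibility."""
  kp = np.asarray(keypoints, dtype=np.float32).reshape(-1, 3)
  xs = kp[:, 0] / image_width   # normalized x in [0, 1]
  ys = kp[:, 1] / image_height  # normalized y in [0, 1]
  # COCO visibility flags: 0 = not labeled, 1 = labeled but occluded,
  # 2 = labeled and visible.
  vs = kp[:, 2].astype(np.int64)
  return xs, ys, vs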

def main(_):
@@ -263,13 +346,15 @@ def main(_):
      FLAGS.train_image_dir,
      train_output_path,
      FLAGS.include_masks,
      num_shards=100,
      keypoint_annotations_file=FLAGS.train_keypoint_annotations_file)
  _create_tf_record_from_coco_annotations(
      FLAGS.val_annotations_file,
      FLAGS.val_image_dir,
      val_output_path,
      FLAGS.include_masks,
      num_shards=100,
      keypoint_annotations_file=FLAGS.val_keypoint_annotations_file)
  _create_tf_record_from_coco_annotations(
      FLAGS.testdev_annotations_file,
      FLAGS.test_image_dir,
...

@@ -20,6 +20,7 @@ import os
import numpy as np
import PIL.Image
import six
import tensorflow as tf

from object_detection.dataset_tools import create_coco_tf_record

@@ -37,6 +38,16 @@ class CreateCocoTFRecordTest(tf.test.TestCase):
    proto_list = [p for p in proto_field]
    self.assertListEqual(proto_list, expectation)
  def _assertProtoClose(self, proto_field, expectation):
    """Helper function to assert if a proto field nearly equals some value.

    Args:
      proto_field: The protobuf field to compare.
      expectation: The expected value of the protobuf field.
    """
    proto_list = [p for p in proto_field]
    self.assertAllClose(proto_list, expectation)

  def test_create_tf_example(self):
    image_file_name = 'tmp_image.jpg'
    image_data = np.random.rand(256, 256, 3)
@@ -78,7 +89,7 @@ class CreateCocoTFRecordTest(tf.test.TestCase):
    }

    (_, example,
     num_annotations_skipped, _) = create_coco_tf_record.create_tf_example(
         image, annotations_list, image_dir, category_index)

    self.assertEqual(num_annotations_skipped, 0)
@@ -88,12 +99,13 @@ class CreateCocoTFRecordTest(tf.test.TestCase):
        example.features.feature['image/width'].int64_list.value, [256])
    self._assertProtoEqual(
        example.features.feature['image/filename'].bytes_list.value,
        [six.b(image_file_name)])
    self._assertProtoEqual(
        example.features.feature['image/source_id'].bytes_list.value,
        [six.b(str(image['id']))])
    self._assertProtoEqual(
        example.features.feature['image/format'].bytes_list.value,
        [six.b('jpeg')])
    self._assertProtoEqual(
        example.features.feature['image/object/bbox/xmin'].float_list.value,
        [0.25])
@@ -108,7 +120,7 @@ class CreateCocoTFRecordTest(tf.test.TestCase):
        [0.75])
    self._assertProtoEqual(
        example.features.feature['image/object/class/text'].bytes_list.value,
        [six.b('cat')])

  def test_create_tf_example_with_instance_masks(self):
    image_file_name = 'tmp_image.jpg'
@@ -144,7 +156,7 @@ class CreateCocoTFRecordTest(tf.test.TestCase):
    }

    (_, example,
     num_annotations_skipped, _) = create_coco_tf_record.create_tf_example(
         image, annotations_list, image_dir, category_index, include_masks=True)

    self.assertEqual(num_annotations_skipped, 0)
@@ -154,12 +166,13 @@ class CreateCocoTFRecordTest(tf.test.TestCase):
        example.features.feature['image/width'].int64_list.value, [8])
    self._assertProtoEqual(
        example.features.feature['image/filename'].bytes_list.value,
        [six.b(image_file_name)])
    self._assertProtoEqual(
        example.features.feature['image/source_id'].bytes_list.value,
        [six.b(str(image['id']))])
    self._assertProtoEqual(
        example.features.feature['image/format'].bytes_list.value,
        [six.b('jpeg')])
    self._assertProtoEqual(
        example.features.feature['image/object/bbox/xmin'].float_list.value,
        [0])
@@ -174,7 +187,7 @@ class CreateCocoTFRecordTest(tf.test.TestCase):
        [1])
    self._assertProtoEqual(
        example.features.feature['image/object/class/text'].bytes_list.value,
        [six.b('dog')])
    encoded_mask_pngs = [
        io.BytesIO(encoded_masks) for encoded_masks in example.features.feature[
            'image/object/mask'].bytes_list.value
@@ -183,13 +196,120 @@ class CreateCocoTFRecordTest(tf.test.TestCase):
        np.array(PIL.Image.open(encoded_mask_png))
        for encoded_mask_png in encoded_mask_pngs
    ]
    self.assertEqual(len(pil_masks), 1)
    self.assertAllEqual(pil_masks[0],
                        [[1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1, 1],
                         [0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1]])

  def test_create_tf_example_with_keypoints(self):
    image_dir = self.get_temp_dir()
    image_file_name = 'tmp_image.jpg'
    image_data = np.random.randint(low=0, high=256, size=(256, 256, 3)).astype(
        np.uint8)
    save_path = os.path.join(image_dir, image_file_name)
    image = PIL.Image.fromarray(image_data, 'RGB')
    image.save(save_path)

    image = {
        'file_name': image_file_name,
        'height': 256,
        'width': 256,
        'id': 11,
    }

    min_x, min_y = 64, 64
    max_x, max_y = 128, 128
    keypoints = []
    num_visible_keypoints = 0
    xv = []
    yv = []
    vv = []
    for _ in range(17):
      xc = min_x + int(np.random.rand()*(max_x - min_x))
      yc = min_y + int(np.random.rand()*(max_y - min_y))
      vis = np.random.randint(0, 3)
      xv.append(xc)
      yv.append(yc)
      vv.append(vis)
      keypoints.extend([xc, yc, vis])
      num_visible_keypoints += (vis > 0)

    annotations_list = [{
        'area': 0.5,
        'iscrowd': False,
        'image_id': 11,
        'bbox': [64, 64, 128, 128],
        'category_id': 1,
        'id': 1000
    }]

    keypoint_annotations_dict = {
        1000: {
            'keypoints': keypoints,
            'num_keypoints': num_visible_keypoints
        }
    }

    category_index = {
        1: {
            'name': 'person',
            'id': 1
        }
    }

    (_, example, _,
     num_keypoint_annotation_skipped) = create_coco_tf_record.create_tf_example(
         image,
         annotations_list,
         image_dir,
         category_index,
         include_masks=False,
         keypoint_annotations_dict=keypoint_annotations_dict)

    self.assertEqual(num_keypoint_annotation_skipped, 0)
    self._assertProtoEqual(
        example.features.feature['image/height'].int64_list.value, [256])
    self._assertProtoEqual(
        example.features.feature['image/width'].int64_list.value, [256])
    self._assertProtoEqual(
        example.features.feature['image/filename'].bytes_list.value,
        [six.b(image_file_name)])
    self._assertProtoEqual(
        example.features.feature['image/source_id'].bytes_list.value,
        [six.b(str(image['id']))])
    self._assertProtoEqual(
        example.features.feature['image/format'].bytes_list.value,
        [six.b('jpeg')])
    self._assertProtoEqual(
        example.features.feature['image/object/bbox/xmin'].float_list.value,
        [0.25])
    self._assertProtoEqual(
        example.features.feature['image/object/bbox/ymin'].float_list.value,
        [0.25])
    self._assertProtoEqual(
        example.features.feature['image/object/bbox/xmax'].float_list.value,
        [0.75])
    self._assertProtoEqual(
        example.features.feature['image/object/bbox/ymax'].float_list.value,
        [0.75])
    self._assertProtoEqual(
        example.features.feature['image/object/class/text'].bytes_list.value,
        [six.b('person')])
    self._assertProtoClose(
        example.features.feature['image/object/keypoint/x'].float_list.value,
        np.array(xv, dtype=np.float32) / 256)
    self._assertProtoClose(
        example.features.feature['image/object/keypoint/y'].float_list.value,
        np.array(yv, dtype=np.float32) / 256)
    self._assertProtoEqual(
        example.features.feature['image/object/keypoint/text'].bytes_list.value,
        create_coco_tf_record._COCO_KEYPOINT_NAMES)
    self._assertProtoEqual(
        example.features.feature[
            'image/object/keypoint/visibility'].int64_list.value, vv)
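
The keypoint features asserted above can be parsed back with a matching feature spec. A short sketch, with the keys taken from the assertions; the VarLen shapes here are an assumption for illustration, not the API's canonical decoder:

import tensorflow as tf

_KEYPOINT_FEATURE_SPEC = {
    'image/object/keypoint/x': tf.VarLenFeature(tf.float32),
    'image/object/keypoint/y': tf.VarLenFeature(tf.float32),
    'image/object/keypoint/visibility': tf.VarLenFeature(tf.int64),
}

def parse_keypoint_features(serialized_example):
  # Returns sparse tensors; densify with tf.sparse.to_dense if needed.
  return tf.parse_single_example(serialized_example, _KEYPOINT_FEATURE_SPEC)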

  def test_create_sharded_tf_record(self):
    tmp_dir = self.get_temp_dir()
    image_paths = ['tmp1_image.jpg', 'tmp2_image.jpg']
...
@@ -19,6 +19,7 @@ import os
import numpy as np
import PIL.Image
import six
import tensorflow as tf

from object_detection.dataset_tools import create_kitti_tf_record

@@ -75,12 +76,13 @@ class CreateKittiTFRecordTest(tf.test.TestCase):
        example.features.feature['image/width'].int64_list.value, [256])
    self._assertProtoEqual(
        example.features.feature['image/filename'].bytes_list.value,
        [six.b(save_path)])
    self._assertProtoEqual(
        example.features.feature['image/source_id'].bytes_list.value,
        [six.b(save_path)])
    self._assertProtoEqual(
        example.features.feature['image/format'].bytes_list.value,
        [six.b('png')])
    self._assertProtoEqual(
        example.features.feature['image/object/bbox/xmin'].float_list.value,
        [0.25])
@@ -95,7 +97,7 @@ class CreateKittiTFRecordTest(tf.test.TestCase):
        [0.75])
    self._assertProtoEqual(
        example.features.feature['image/object/class/text'].bytes_list.value,
        [six.b('car')])
    self._assertProtoEqual(
        example.features.feature['image/object/class/label'].int64_list.value,
        [1])
...

@@ -19,6 +19,7 @@ import os
import numpy as np
import PIL.Image
import six
import tensorflow as tf

from object_detection.dataset_tools import create_pascal_tf_record

@@ -80,12 +81,13 @@ class CreatePascalTFRecordTest(tf.test.TestCase):
        example.features.feature['image/width'].int64_list.value, [256])
    self._assertProtoEqual(
        example.features.feature['image/filename'].bytes_list.value,
        [six.b(image_file_name)])
    self._assertProtoEqual(
        example.features.feature['image/source_id'].bytes_list.value,
        [six.b(image_file_name)])
    self._assertProtoEqual(
        example.features.feature['image/format'].bytes_list.value,
        [six.b('jpeg')])
    self._assertProtoEqual(
        example.features.feature['image/object/bbox/xmin'].float_list.value,
        [0.25])
@@ -100,7 +102,7 @@ class CreatePascalTFRecordTest(tf.test.TestCase):
        [0.75])
    self._assertProtoEqual(
        example.features.feature['image/object/class/text'].bytes_list.value,
        [six.b('person')])
    self._assertProtoEqual(
        example.features.feature['image/object/class/label'].int64_list.value,
        [1])
@@ -111,7 +113,8 @@ class CreatePascalTFRecordTest(tf.test.TestCase):
        example.features.feature['image/object/truncated'].int64_list.value,
        [0])
    self._assertProtoEqual(
        example.features.feature['image/object/view'].bytes_list.value,
        [six.b('')])


if __name__ == '__main__':
...
# Lint as: python2, python3
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -32,12 +33,15 @@ oid_hierarchical_labels_expansion.py \
  --annotation_type=<1 (for boxes and segments) or 2 (for image-level labels)>
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import copy
import json

from absl import app
from absl import flags
import six

flags.DEFINE_string(
    'json_hierarchy_file', None,
@@ -136,7 +140,7 @@ class OIDHierarchicalLabelsExpansion(object):
    # Row header is expected to be the following for segments:
    # ImageID,LabelName,ImageWidth,ImageHeight,XMin,XMax,YMin,YMax,
    # IsGroupOf,Mask
    split_csv_row = six.ensure_str(csv_row).split(',')
    result = [csv_row]
    assert split_csv_row[
        labelname_column_index] in self._hierarchy_keyed_child
@@ -165,7 +169,7 @@ class OIDHierarchicalLabelsExpansion(object):
    """
    # Row header is expected to be exactly:
    # ImageID,Source,LabelName,Confidence
    split_csv_row = six.ensure_str(csv_row).split(',')
    result = [csv_row]
    if int(split_csv_row[confidence_column_index]) == 1:
      assert split_csv_row[
...
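
The six helpers introduced above are what keep byte/str handling uniform across Python 2 and 3: ensure_str normalizes bytes to text before the CSV split, and ensure_binary does the reverse before serialization. A quick standalone illustration:

import six

assert six.ensure_str(b'ImageID,LabelName').split(',') == ['ImageID', 'LabelName']
assert six.ensure_binary('jpeg') == b'jpeg'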
@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six
import tensorflow as tf

from object_detection.core import standard_fields
@@ -61,18 +62,21 @@ def tf_example_from_annotations_data_frame(annotations_data_frame, label_map,
          dataset_util.float_list_feature(
              filtered_data_frame_boxes.XMax.as_matrix()),
      standard_fields.TfExampleFields.object_class_text:
          dataset_util.bytes_list_feature([
              six.ensure_binary(label_text)
              for label_text in filtered_data_frame_boxes.LabelName.as_matrix()
          ]),
      standard_fields.TfExampleFields.object_class_label:
          dataset_util.int64_list_feature(
              filtered_data_frame_boxes.LabelName.map(
                  lambda x: label_map[x]).as_matrix()),
      standard_fields.TfExampleFields.filename:
          dataset_util.bytes_feature(
              six.ensure_binary('{}.jpg'.format(image_id))),
      standard_fields.TfExampleFields.source_id:
          dataset_util.bytes_feature(six.ensure_binary(image_id)),
      standard_fields.TfExampleFields.image_encoded:
          dataset_util.bytes_feature(six.ensure_binary(encoded_image)),
  }

  if 'IsGroupOf' in filtered_data_frame.columns:
@@ -100,7 +104,9 @@ def tf_example_from_annotations_data_frame(annotations_data_frame, label_map,
                image_class_label] = dataset_util.int64_list_feature(
                    filtered_data_frame_labels.LabelName.map(
                        lambda x: label_map[x]).as_matrix())
    feature_map[standard_fields.TfExampleFields
                .image_class_text] = dataset_util.bytes_list_feature([
                    six.ensure_binary(label_text) for label_text in
                    filtered_data_frame_labels.LabelName.as_matrix()
                ])

  return tf.train.Example(features=tf.train.Features(feature=feature_map))
@@ -15,6 +15,7 @@
"""Tests for oid_tfrecord_creation.py."""

import pandas as pd
import six
import tensorflow as tf

from object_detection.dataset_tools import oid_tfrecord_creation

@@ -46,8 +47,7 @@ class TfExampleFromAnnotationsDataFrameTests(tf.test.TestCase):
    tf_example = oid_tfrecord_creation.tf_example_from_annotations_data_frame(
        df[df.ImageID == 'i1'], label_map, 'encoded_image_test')
    self.assertProtoEquals(six.ensure_str("""
      features {
        feature {
          key: "image/encoded"
@@ -94,7 +94,7 @@ class TfExampleFromAnnotationsDataFrameTests(tf.test.TestCase):
        feature {
          key: "image/class/text"
          value { bytes_list { value: ["c"] } } } }
    """), tf_example)

  def test_no_attributes(self):
    label_map, df = create_test_data()
@@ -107,7 +107,7 @@ class TfExampleFromAnnotationsDataFrameTests(tf.test.TestCase):
    tf_example = oid_tfrecord_creation.tf_example_from_annotations_data_frame(
        df[df.ImageID == 'i2'], label_map, 'encoded_image_test')
    self.assertProtoEquals(six.ensure_str("""
      features {
        feature {
          key: "image/encoded"
@@ -136,7 +136,7 @@ class TfExampleFromAnnotationsDataFrameTests(tf.test.TestCase):
        feature {
          key: "image/source_id"
          value { bytes_list { value: "i2" } } } }
    """), tf_example)

  def test_label_filtering(self):
    label_map, df = create_test_data()
@@ -146,7 +146,7 @@ class TfExampleFromAnnotationsDataFrameTests(tf.test.TestCase):
    tf_example = oid_tfrecord_creation.tf_example_from_annotations_data_frame(
        df[df.ImageID == 'i1'], label_map, 'encoded_image_test')
    self.assertProtoEquals(
        six.ensure_str("""
      features {
        feature {
          key: "image/encoded"
@@ -193,7 +193,7 @@ class TfExampleFromAnnotationsDataFrameTests(tf.test.TestCase):
        feature {
          key: "image/class/text"
          value { bytes_list { } } } }
    """), tf_example)


if __name__ == '__main__':
...
# Lint as: python2, python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,6 +19,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from six.moves import range
import tensorflow as tf
...
# Lint as: python2, python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,8 +15,14 @@
# ==============================================================================
"""Tests for tf_record_creation_util.py."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import contextlib2
import six
from six.moves import range
import tensorflow as tf

from object_detection.dataset_tools import tf_record_creation_util

@@ -29,7 +36,7 @@ class OpenOutputTfrecordsTests(tf.test.TestCase):
        tf_record_close_stack,
        os.path.join(tf.test.get_temp_dir(), 'test.tfrec'), 10)
    for idx in range(10):
      output_tfrecords[idx].write(six.ensure_binary('test_{}'.format(idx)))
    for idx in range(10):
      tf_record_path = '{}-{:05d}-of-00010'.format(
...
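
The shards written round-robin in the test above follow the -NNNNN-of-NNNNN naming produced by open_sharded_output_tfrecords, so they can be globbed and read back. A small sketch (the file pattern is illustrative):

import tensorflow as tf

filenames = tf.gfile.Glob('/tmp/test.tfrec-*-of-00010')
dataset = tf.data.TFRecordDataset(filenames)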
@@ -24,7 +24,8 @@ RUN git clone --depth 1 https://github.com/tensorflow/models.git && \
# Install gcloud and gsutil commands
# https://cloud.google.com/sdk/docs/quickstart-debian-ubuntu
RUN apt-get -y update && apt-get install -y gpg-agent && \
    export CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)" && \
    echo "deb http://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - && \
    apt-get update -y && apt-get install google-cloud-sdk -y
@@ -33,8 +34,10 @@ RUN apt-get -y update && apt-get install -y gpg-agent && export CLOUD_SDK_REPO="
# Install the Tensorflow Object Detection API from here
# https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/installation.md
# Install object detection api dependencies - use non-interactive mode to set
# default tzdata config during installation.
RUN export DEBIAN_FRONTEND=noninteractive && \
    apt-get install -y protobuf-compiler python-pil python-lxml python-tk && \
    pip install Cython && \
    pip install contextlib2 && \
    pip install jupyter && \
...
@@ -26,18 +26,20 @@ import numpy as np
from six.moves import range
import tensorflow as tf

from tensorflow.contrib import slim

from object_detection.core import box_list
from object_detection.core import box_list_ops
from object_detection.core import keypoint_ops
from object_detection.core import standard_fields as fields
from object_detection.metrics import coco_evaluation
from object_detection.protos import eval_pb2
from object_detection.utils import label_map_util
from object_detection.utils import object_detection_evaluation
from object_detection.utils import ops
from object_detection.utils import shape_utils
from object_detection.utils import visualization_utils as vis_utils

EVAL_KEYPOINT_METRIC = 'coco_keypoint_metrics'

# A dictionary of metric names to classes that implement the metric. The classes
# in the dictionary must implement
@@ -45,6 +47,8 @@ slim = tf.contrib.slim
EVAL_METRICS_CLASS_DICT = {
    'coco_detection_metrics':
        coco_evaluation.CocoDetectionEvaluator,
    'coco_keypoint_metrics':
        coco_evaluation.CocoKeypointEvaluator,
    'coco_mask_metrics':
        coco_evaluation.CocoMaskEvaluator,
    'oid_challenge_detection_metrics':
@@ -324,7 +328,7 @@ def _run_checkpoint_once(tensor_dict,
  counters = {'skipped': 0, 'success': 0}
  aggregate_result_losses_dict = collections.defaultdict(list)
  with slim.queues.QueueRunners(sess):
    try:
      for batch in range(int(num_batches)):
        if (batch + 1) % 100 == 0:
@@ -591,6 +595,8 @@ def result_dict_for_single_example(image,
      'groundtruth_group_of': [num_boxes] int64 tensor. (Optional)
      'groundtruth_instance_masks': 3D int64 tensor of instance masks
        (Optional).
      'groundtruth_keypoints': [num_boxes, num_keypoints, 2] float32 tensor with
        keypoints (Optional).
    class_agnostic: Boolean indicating whether the detections are class-agnostic
      (i.e. binary). Default False.
    scale_to_absolute: Boolean indicating whether boxes and keypoints should be
@@ -620,7 +626,8 @@ def result_dict_for_single_example(image,
    'groundtruth_group_of': [num_boxes] int64 tensor. (Optional)
    'groundtruth_instance_masks': 3D int64 tensor of instance masks
      (Optional).
    'groundtruth_keypoints': [num_boxes, num_keypoints, 2] float32 tensor with
      keypoints (Optional).
  """
  if groundtruth:
@@ -675,6 +682,10 @@ def result_dict_for_batched_example(images,
  Note that evaluation tools require classes that are 1-indexed, and so this
  function performs the offset. If `class_agnostic` is True, all output classes
  have label 1.

  The groundtruth coordinates of boxes/keypoints in 'groundtruth' dictionary are
  normalized relative to the (potentially padded) input image, while the
  coordinates in 'detection' dictionary are normalized relative to the true
  image shape.

  Args:
    images: A single 4D uint8 image tensor of shape [batch_size, H, W, C].
@@ -696,6 +707,10 @@ def result_dict_for_batched_example(images,
        tensor. (Optional)
      'groundtruth_instance_masks': 4D int64 tensor of instance
        masks (Optional).
      'groundtruth_keypoints': [batch_size, max_number_of_boxes, num_keypoints,
        2] float32 tensor with keypoints (Optional).
      'groundtruth_keypoint_visibilities': [batch_size, max_number_of_boxes,
        num_keypoints] bool tensor with keypoint visibilities (Optional).
    class_agnostic: Boolean indicating whether the detections are class-agnostic
      (i.e. binary). Default False.
    scale_to_absolute: Boolean indicating whether boxes and keypoints should be
@@ -724,7 +739,11 @@ def result_dict_for_batched_example(images,
    'detection_classes': [batch_size, max_detections] int64 tensor of 1-indexed
      classes.
    'detection_masks': [batch_size, max_detections, H, W] float32 tensor of
      binarized masks, reframed to full image masks. (Optional)
    'detection_keypoints': [batch_size, max_detections, num_keypoints, 2]
      float32 tensor containing keypoint coordinates. (Optional)
    'detection_keypoint_scores': [batch_size, max_detections, num_keypoints]
      float32 tensor containing keypoint scores. (Optional)
    'num_detections': [batch_size] int64 tensor containing number of valid
      detections.
    'groundtruth_boxes': [batch_size, num_boxes, 4] float32 tensor of boxes, in
@@ -739,6 +758,10 @@ def result_dict_for_batched_example(images,
    'groundtruth_group_of': [batch_size, num_boxes] int64 tensor. (Optional)
    'groundtruth_instance_masks': 4D int64 tensor of instance masks
      (Optional).
    'groundtruth_keypoints': [batch_size, num_boxes, num_keypoints, 2] float32
      tensor with keypoints (Optional).
    'groundtruth_keypoint_visibilities': [batch_size, num_boxes, num_keypoints]
      bool tensor with keypoint visibilities (Optional).
    'num_groundtruth_boxes': [batch_size] tensor containing the maximum number
      of groundtruth boxes per image.
@@ -828,6 +851,12 @@ def result_dict_for_batched_example(images,
                _scale_keypoint_to_absolute,
                elems=[detection_keypoints, original_image_spatial_shapes],
                dtype=tf.float32))
      if detection_fields.detection_keypoint_scores in detections:
        output_dict[detection_fields.detection_keypoint_scores] = detections[
            detection_fields.detection_keypoint_scores]
      else:
        output_dict[detection_fields.detection_keypoint_scores] = tf.ones_like(
            detections[detection_fields.detection_keypoints][:, :, :, 0])

  if groundtruth:
    if max_gt_boxes is None:
@@ -866,6 +895,28 @@ def result_dict_for_batched_example(images,
          elems=[groundtruth_boxes, true_image_shapes], dtype=tf.float32)
    output_dict[input_data_fields.groundtruth_boxes] = groundtruth_boxes

    if input_data_fields.groundtruth_keypoints in groundtruth:
      # If groundtruth_keypoints is in the groundtruth dictionary, update the
      # coordinates to conform with the true image shape.
      def _scale_keypoints_to_normalized_true_image(args):
        """Scale keypoint coordinates to be relative to the true image shape."""
        keypoints, true_image_shape = args
        true_image_shape = tf.cast(true_image_shape, tf.float32)
        true_height, true_width = true_image_shape[0], true_image_shape[1]
        normalized_window = tf.stack(
            [0.0, 0.0, true_height / image_height, true_width / image_width])
        return keypoint_ops.change_coordinate_frame(keypoints,
                                                    normalized_window)

      groundtruth_keypoints = groundtruth[
          input_data_fields.groundtruth_keypoints]
      groundtruth_keypoints = shape_utils.static_or_dynamic_map_fn(
          _scale_keypoints_to_normalized_true_image,
          elems=[groundtruth_keypoints, true_image_shapes],
          dtype=tf.float32)
      output_dict[
          input_data_fields.groundtruth_keypoints] = groundtruth_keypoints
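
For intuition, change_coordinate_frame re-expresses coordinates relative to a window by subtracting the window origin and dividing by the window size; with window [0, 0, true_h/padded_h, true_w/padded_w] this turns padded-image coordinates into true-image ones. A NumPy sketch of the same arithmetic (an illustration, not the library implementation; keypoints assumed shape [..., 2] in normalized (y, x)):

import numpy as np

def change_coordinate_frame_np(keypoints, window):
  # window: [y_min, x_min, y_max, x_max] in normalized coordinates.
  origin = np.array([window[0], window[1]])
  size = np.array([window[2] - window[0], window[3] - window[1]])
  return (keypoints - origin) / size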
    if scale_to_absolute:
      groundtruth_boxes = output_dict[input_data_fields.groundtruth_boxes]
      output_dict[input_data_fields.groundtruth_boxes] = (
@@ -873,6 +924,14 @@ def result_dict_for_batched_example(images,
              _scale_box_to_absolute,
              elems=[groundtruth_boxes, original_image_spatial_shapes],
              dtype=tf.float32))
      if input_data_fields.groundtruth_keypoints in groundtruth:
        groundtruth_keypoints = output_dict[
            input_data_fields.groundtruth_keypoints]
        output_dict[input_data_fields.groundtruth_keypoints] = (
            shape_utils.static_or_dynamic_map_fn(
                _scale_keypoint_to_absolute,
                elems=[groundtruth_keypoints, original_image_spatial_shapes],
                dtype=tf.float32))

    # For class-agnostic models, groundtruth classes all become 1.
    if class_agnostic:
@@ -893,6 +952,8 @@ def get_evaluators(eval_config, categories, evaluator_options=None):
    categories: A list of dicts, each of which has the following keys -
      'id': (required) an integer id uniquely identifying this category.
      'name': (required) string representing category name e.g., 'cat', 'dog'.
      'keypoints': (optional) dict mapping this category's keypoints to unique
        ids.
    evaluator_options: A dictionary of metric names (see
      EVAL_METRICS_CLASS_DICT) to `DetectionEvaluator` initialization
      keyword arguments. For example:
@@ -919,6 +980,32 @@ def get_evaluators(eval_config, categories, evaluator_options=None):
    evaluators_list.append(EVAL_METRICS_CLASS_DICT[eval_metric_fn_key](
        categories,
        **kwargs_dict))

  if isinstance(eval_config, eval_pb2.EvalConfig):
    parameterized_metrics = eval_config.parameterized_metric
    for parameterized_metric in parameterized_metrics:
      assert parameterized_metric.HasField('parameterized_metric')
      if parameterized_metric.WhichOneof(
          'parameterized_metric') == EVAL_KEYPOINT_METRIC:
        keypoint_metrics = parameterized_metric.coco_keypoint_metrics
        # Create category to keypoints mapping dict.
        category_keypoints = {}
        class_label = keypoint_metrics.class_label
        category = None
        for cat in categories:
          if cat['name'] == class_label:
            category = cat
            break
        if not category:
          continue
        keypoints_for_this_class = category['keypoints']
        category_keypoints = [{
            'id': keypoints_for_this_class[kp_name],
            'name': kp_name
        } for kp_name in keypoints_for_this_class]
        # Create keypoint evaluator for this category.
        evaluators_list.append(EVAL_METRICS_CLASS_DICT[EVAL_KEYPOINT_METRIC](
            category['id'], category_keypoints, class_label,
            keypoint_metrics.keypoint_label_to_sigmas))

  return evaluators_list
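
This branch is driven by the parameterized_metric field of EvalConfig. A hedged sketch of how a keypoint metric might be configured and parsed, using the field names exercised by the tests later in this diff; the sigma values are made up, and `categories` is assumed to carry a 'keypoints' mapping as described in the docstring above:

from google.protobuf import text_format
from object_detection.protos import eval_pb2

config_text = """
parameterized_metric {
  coco_keypoint_metrics {
    class_label: 'person'
    keypoint_label_to_sigmas { key: 'left_eye' value: 0.1 }
    keypoint_label_to_sigmas { key: 'right_eye' value: 0.2 }
  }
}
"""
eval_config = text_format.Parse(config_text, eval_pb2.EvalConfig())
# evaluators = get_evaluators(eval_config, categories)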
...

@@ -27,6 +27,7 @@ import tensorflow as tf

from object_detection import eval_util
from object_detection.core import standard_fields as fields
from object_detection.metrics import coco_evaluation
from object_detection.protos import eval_pb2
from object_detection.utils import test_case

@@ -38,6 +39,26 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
            {'id': 2, 'name': 'dog'},
            {'id': 3, 'name': 'cat'}]

  def _get_categories_list_with_keypoints(self):
    return [{
        'id': 1,
        'name': 'person',
        'keypoints': {
            'left_eye': 0,
            'right_eye': 3
        }
    }, {
        'id': 2,
        'name': 'dog',
        'keypoints': {
            'tail_start': 1,
            'mouth': 2
        }
    }, {
        'id': 3,
        'name': 'cat'
    }]

  def _make_evaluation_dict(self,
                            resized_groundtruth_masks=False,
                            batch_size=1,
@@ -61,6 +82,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
    groundtruth_boxes = tf.constant([[0., 0., 1., 1.]])
    groundtruth_classes = tf.constant([1])
    groundtruth_instance_masks = tf.ones(shape=[1, 20, 20], dtype=tf.uint8)
    groundtruth_keypoints = tf.constant([[0.0, 0.0], [0.5, 0.5], [1.0, 1.0]])
    if resized_groundtruth_masks:
      groundtruth_instance_masks = tf.ones(shape=[1, 10, 10], dtype=tf.uint8)
@@ -72,6 +94,9 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
      groundtruth_instance_masks = tf.tile(
          tf.expand_dims(groundtruth_instance_masks, 0),
          multiples=[batch_size, 1, 1, 1])
      groundtruth_keypoints = tf.tile(
          tf.expand_dims(groundtruth_keypoints, 0),
          multiples=[batch_size, 1, 1])

    detections = {
        detection_fields.detection_boxes: detection_boxes,
@@ -83,6 +108,7 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
    groundtruth = {
        input_data_fields.groundtruth_boxes: groundtruth_boxes,
        input_data_fields.groundtruth_classes: groundtruth_classes,
        input_data_fields.groundtruth_keypoints: groundtruth_keypoints,
        input_data_fields.groundtruth_instance_masks: groundtruth_instance_masks
    }
    if batch_size > 1:
@@ -255,6 +281,49 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
    self.assertAlmostEqual(evaluator[1]._recall_lower_bound, 0.0)
    self.assertAlmostEqual(evaluator[1]._recall_upper_bound, 1.0)

  def test_get_evaluator_with_keypoint_metrics(self):
    eval_config = eval_pb2.EvalConfig()
    person_keypoints_metric = eval_config.parameterized_metric.add()
    person_keypoints_metric.coco_keypoint_metrics.class_label = 'person'
    person_keypoints_metric.coco_keypoint_metrics.keypoint_label_to_sigmas[
        'left_eye'] = 0.1
    person_keypoints_metric.coco_keypoint_metrics.keypoint_label_to_sigmas[
        'right_eye'] = 0.2
    dog_keypoints_metric = eval_config.parameterized_metric.add()
    dog_keypoints_metric.coco_keypoint_metrics.class_label = 'dog'
    dog_keypoints_metric.coco_keypoint_metrics.keypoint_label_to_sigmas[
        'tail_start'] = 0.3
    dog_keypoints_metric.coco_keypoint_metrics.keypoint_label_to_sigmas[
        'mouth'] = 0.4
    categories = self._get_categories_list_with_keypoints()
    evaluator = eval_util.get_evaluators(
        eval_config, categories, evaluator_options=None)

    # Verify keypoint evaluator class variables.
    self.assertLen(evaluator, 3)
    self.assertFalse(evaluator[0]._include_metrics_per_category)
    self.assertEqual(evaluator[1]._category_name, 'person')
    self.assertEqual(evaluator[2]._category_name, 'dog')
    self.assertAllEqual(evaluator[1]._keypoint_ids, [0, 3])
    self.assertAllEqual(evaluator[2]._keypoint_ids, [1, 2])
    self.assertAllClose([0.1, 0.2], evaluator[1]._oks_sigmas)
    self.assertAllClose([0.3, 0.4], evaluator[2]._oks_sigmas)

  def test_get_evaluator_with_unmatched_label(self):
    eval_config = eval_pb2.EvalConfig()
    person_keypoints_metric = eval_config.parameterized_metric.add()
    person_keypoints_metric.coco_keypoint_metrics.class_label = 'unmatched'
    person_keypoints_metric.coco_keypoint_metrics.keypoint_label_to_sigmas[
        'kpt'] = 0.1
    categories = self._get_categories_list_with_keypoints()
    evaluator = eval_util.get_evaluators(
        eval_config, categories, evaluator_options=None)
    self.assertLen(evaluator, 1)
    self.assertNotIsInstance(
        evaluator[0], coco_evaluation.CocoKeypointEvaluator)

  def test_padded_image_result_dict(self):
    input_data_fields = fields.InputDataFields
@@ -263,6 +332,8 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
    detection_boxes = np.array([[[0., 0., 1., 1.]], [[0.0, 0.0, 0.5, 0.5]]],
                               dtype=np.float32)
    detection_keypoints = np.array([[0.0, 0.0], [0.5, 0.5], [1.0, 1.0]],
                                   dtype=np.float32)
    detections = {
        detection_fields.detection_boxes:
            tf.constant(detection_boxes),
@@ -271,7 +342,12 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
        detection_fields.detection_classes:
            tf.constant([[1], [2]]),
        detection_fields.num_detections:
            tf.constant([1, 1]),
        detection_fields.detection_keypoints:
            tf.tile(
                tf.reshape(
                    tf.constant(detection_keypoints), shape=[1, 1, 3, 2]),
                multiples=[2, 1, 1, 1])
    }

    gt_boxes = detection_boxes
@@ -280,6 +356,11 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
            tf.constant(gt_boxes),
        input_data_fields.groundtruth_classes:
            tf.constant([[1.], [1.]]),
        input_data_fields.groundtruth_keypoints:
            tf.tile(
                tf.reshape(
                    tf.constant(detection_keypoints), shape=[1, 1, 3, 2]),
                multiples=[2, 1, 1, 1])
    }

    image = tf.zeros((2, 100, 100, 3), dtype=tf.float32)
@@ -299,11 +380,17 @@ class EvalUtilTest(test_case.TestCase, parameterized.TestCase):
    self.assertAllEqual(
        [[[0., 0., 200., 200.]], [[0.0, 0.0, 150., 150.]]],
        result[input_data_fields.groundtruth_boxes])
    self.assertAllClose([[[[0., 0.], [100., 100.], [200., 200.]]],
                         [[[0., 0.], [150., 150.], [300., 300.]]]],
                        result[input_data_fields.groundtruth_keypoints])
    # Predictions from the model are not scaled.
    self.assertAllEqual(
        [[[0., 0., 200., 200.]], [[0.0, 0.0, 75., 150.]]],
        result[detection_fields.detection_boxes])
    self.assertAllClose([[[[0., 0.], [100., 100.], [200., 200.]]],
                         [[[0., 0.], [75., 150.], [150., 300.]]]],
                        result[detection_fields.detection_keypoints])


if __name__ == '__main__':
...
# Lint as: python2, python3
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -108,7 +109,6 @@ from google.protobuf import text_format
from object_detection import exporter
from object_detection.protos import pipeline_pb2

flags = tf.app.flags
flags.DEFINE_string('input_type', 'image_tensor', 'Type of input node. Can be '
...

# Lint as: python2, python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
...