Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
7af2ff16
Commit
7af2ff16
authored
Jun 11, 2021
by
Vincent Dumoulin
Committed by
A. Unique TensorFlower
Jun 11, 2021
Browse files
Internal change
PiperOrigin-RevId: 378869744
parent
a0be6885
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
199 additions
and
10 deletions
+199
-10
official/vision/beta/data/create_coco_tf_record.py
official/vision/beta/data/create_coco_tf_record.py
+27
-10
official/vision/beta/data/process_coco_few_shot.sh
official/vision/beta/data/process_coco_few_shot.sh
+48
-0
official/vision/beta/data/process_coco_few_shot_json_files.py
...cial/vision/beta/data/process_coco_few_shot_json_files.py
+124
-0
No files found.
official/vision/beta/data/create_coco_tf_record.py
View file @
7af2ff16
...
...
@@ -46,7 +46,7 @@ from official.vision.beta.data import tfrecord_lib
flags
.
DEFINE_boolean
(
'include_masks'
,
False
,
'Whether to include instance segmentations masks '
'(PNG encoded) in the result. default: False.'
)
flags
.
DEFINE_string
(
'image_dir'
,
''
,
'Directory containing images.'
)
flags
.
DEFINE_
multi_
string
(
'image_dir'
,
''
,
'Directory containing images.'
)
flags
.
DEFINE_string
(
'image_info_file'
,
''
,
'File containing image information. '
'Tf Examples in the output files correspond to the image '
...
...
@@ -159,7 +159,7 @@ def encode_caption_annotations(caption_annotations):
def
create_tf_example
(
image
,
image_dir
,
image_dir
s
,
bbox_annotations
=
None
,
id_to_name_map
=
None
,
caption_annotations
=
None
,
...
...
@@ -169,7 +169,7 @@ def create_tf_example(image,
Args:
image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
u'width', u'date_captured', u'flickr_url', u'id']
image_dir: director
y
containing the image files.
image_dir
s
:
list of
director
ies
containing the image files.
bbox_annotations:
list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
...
...
@@ -190,14 +190,31 @@ def create_tf_example(image,
num_annotations_skipped: Number of (invalid) annotations that were ignored.
Raises:
ValueError: if the image pointed to by data['filename'] is not a valid JPEG
ValueError: if the image pointed to by data['filename'] is not a valid JPEG,
does not exist, or is not unique across image directories.
"""
image_height
=
image
[
'height'
]
image_width
=
image
[
'width'
]
filename
=
image
[
'file_name'
]
image_id
=
image
[
'id'
]
if
len
(
image_dirs
)
>
1
:
full_paths
=
[
os
.
path
.
join
(
image_dir
,
filename
)
for
image_dir
in
image_dirs
]
full_existing_paths
=
[
p
for
p
in
full_paths
if
tf
.
io
.
gfile
.
exists
(
p
)]
if
not
full_existing_paths
:
raise
ValueError
(
'{} does not exist across image directories.'
.
format
(
filename
))
if
len
(
full_existing_paths
)
>
1
:
raise
ValueError
(
'{} is not unique across image directories'
.
format
(
filename
))
full_path
,
=
full_existing_paths
# If there is only one image directory, it's not worth checking for existence,
# since trying to open the file will raise an informative error message if it
# does not exist.
else
:
image_dir
,
=
image_dirs
full_path
=
os
.
path
.
join
(
image_dir
,
filename
)
with
tf
.
io
.
gfile
.
GFile
(
full_path
,
'rb'
)
as
fid
:
encoded_jpg
=
fid
.
read
()
...
...
@@ -276,7 +293,7 @@ def _load_images_info(images_info_file):
return
info_dict
[
'images'
]
def
generate_annotations
(
images
,
image_dir
,
def
generate_annotations
(
images
,
image_dir
s
,
img_to_obj_annotation
=
None
,
img_to_caption_annotation
=
None
,
id_to_name_map
=
None
,
include_masks
=
False
):
...
...
@@ -289,12 +306,12 @@ def generate_annotations(images, image_dir,
caption_annotaion
=
(
img_to_caption_annotation
.
get
(
image
[
'id'
],
None
)
if
img_to_caption_annotation
else
None
)
yield
(
image
,
image_dir
,
object_annotation
,
id_to_name_map
,
yield
(
image
,
image_dir
s
,
object_annotation
,
id_to_name_map
,
caption_annotaion
,
include_masks
)
def
_create_tf_record_from_coco_annotations
(
images_info_file
,
image_dir
,
image_dir
s
,
output_path
,
num_shards
,
object_annotations_file
=
None
,
...
...
@@ -309,7 +326,7 @@ def _create_tf_record_from_coco_annotations(images_info_file,
files Eg. 'image_info_test-dev2017.json',
'instance_annotations_train2017.json',
'caption_annotations_train2017.json', etc.
image_dir:
D
irector
y
containing the image files.
image_dir
s
:
List of d
irector
ies
containing the image files.
output_path: Path to output tf.Record file.
num_shards: Number of output files to create.
object_annotations_file: JSON file containing bounding box annotations.
...
...
@@ -333,7 +350,7 @@ def _create_tf_record_from_coco_annotations(images_info_file,
_load_caption_annotations
(
caption_annotations_file
))
coco_annotations_iter
=
generate_annotations
(
images
,
image_dir
,
img_to_obj_annotation
,
img_to_caption_annotation
,
images
,
image_dir
s
,
img_to_obj_annotation
,
img_to_caption_annotation
,
id_to_name_map
=
id_to_name_map
,
include_masks
=
include_masks
)
num_skipped
=
tfrecord_lib
.
write_tf_record_dataset
(
...
...
official/vision/beta/data/process_coco_few_shot.sh
0 → 100644
View file @
7af2ff16
#!/bin/bash
#
# Processes the COCO few-shot benchmark into TFRecord files. Requires `wget`.

# Scratch directory for the downloaded JSON files; deleted at the end.
tmp_dir=$(mktemp -d -t coco-XXXXXXXXXX)
# Where the TFRecord shards are written; overridable with -o.
output_dir="/tmp/coco_few_shot"
while getopts "o:" o; do
  case "${o}" in
    o)
      output_dir=${OPTARG}
      ;;
    *)
      echo "Usage: ${0} [-o <output_dir>]" 1>&2; exit 1;;
  esac
done

# Mirror the official few-shot split files. The -A accept-list restricts the
# recursive download to the 5k validation split plus the 10- and 30-shot
# per-category JSON files.
cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit"
wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \
    -P "${tmp_dir}" -A "5k.json,*10shot*.json,*30shot*.json" \
    "http://${cocosplit_url}/"
# `wget --recursive` recreates the URL's host/path structure under tmp_dir;
# flatten it so later steps can use simple "${tmp_dir}/..." paths.
mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}"
rm -rf "${tmp_dir}/${cocosplit_url}/"

# Fuse the per-category JSON files into one file per (shots, seed) pair
# (produces ${tmp_dir}/{10,30}shot_seed{0..9}.json).
python process_coco_few_shot_json_files.py \
    --logtostderr --workdir="${tmp_dir}"

# Convert each fused (shots, seed) JSON file into 4 TFRecord shards. Both
# train2014 and val2014 image directories are passed because the few-shot
# splits draw images from both.
for seed in {0..9}; do
  for shots in 10 30; do
    python create_coco_tf_record.py \
        --logtostderr \
        --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/train2014 \
        --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/val2014 \
        --image_info_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
        --object_annotations_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
        --caption_annotations_file="" \
        --output_file_prefix="${output_dir}/${shots}shot_seed${seed}" \
        --num_shards=4
  done
done

# Convert the 5k evaluation split (kept in its original datasplit/ location,
# untouched by the fusing step) into 10 TFRecord shards.
python create_coco_tf_record.py \
    --logtostderr \
    --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/train2014 \
    --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/val2014 \
    --image_info_file="${tmp_dir}/datasplit/5k.json" \
    --object_annotations_file="${tmp_dir}/datasplit/5k.json" \
    --caption_annotations_file="" \
    --output_file_prefix="${output_dir}/5k" \
    --num_shards=10

# Clean up the scratch directory.
rm -rf "${tmp_dir}"
official/vision/beta/data/process_coco_few_shot_json_files.py
0 → 100644
View file @
7af2ff16
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes the JSON files for COCO few-shot.
We assume that `workdir` mirrors the contents of
http://dl.yf.io/fs-det/datasets/cocosplit/, which contains the official JSON
files for the few-shot COCO evaluation procedure that Wang et al. (2020)'s
"Frustratingly Simple Few-Shot Object Detection" paper uses.
"""
import
collections
import
itertools
import
json
import
logging
import
os
from
absl
import
app
from
absl
import
flags
import
tensorflow
as
tf
# Route progress messages from main() through TensorFlow's logger at INFO
# level so they are visible on stderr (with --logtostderr).
logger = tf.get_logger()
logger.setLevel(logging.INFO)

# Directory that mirrors http://dl.yf.io/fs-det/datasets/cocosplit/ and into
# which the fused per-(shots, seed) JSON files are written.
flags.DEFINE_string('workdir', None, 'Working directory.')

FLAGS = flags.FLAGS
# The 80 COCO object categories, in the order used to name the per-category
# few-shot JSON files.
CATEGORIES = [
    'airplane', 'apple', 'backpack', 'banana', 'baseball bat',
    'baseball glove', 'bear', 'bed', 'bench', 'bicycle', 'bird', 'boat',
    'book', 'bottle', 'bowl', 'broccoli', 'bus', 'cake', 'car', 'carrot',
    'cat', 'cell phone', 'chair', 'clock', 'couch', 'cow', 'cup',
    'dining table', 'dog', 'donut', 'elephant', 'fire hydrant', 'fork',
    'frisbee', 'giraffe', 'hair drier', 'handbag', 'horse', 'hot dog',
    'keyboard', 'kite', 'knife', 'laptop', 'microwave', 'motorcycle',
    'mouse', 'orange', 'oven', 'parking meter', 'person', 'pizza',
    'potted plant', 'refrigerator', 'remote', 'sandwich', 'scissors',
    'sheep', 'sink', 'skateboard', 'skis', 'snowboard', 'spoon',
    'sports ball', 'stop sign', 'suitcase', 'surfboard', 'teddy bear',
    'tennis racket', 'tie', 'toaster', 'toilet', 'toothbrush',
    'traffic light', 'train', 'truck', 'tv', 'umbrella', 'vase',
    'wine glass', 'zebra'
]
# Benchmark settings: ten random seeds, two shot counts.
SEEDS = list(range(10))
SHOTS = [10, 30]

# Maps each (seed, shots) pair to the list of JSON file suffixes (relative to
# the cocosplit root) holding that setting's per-category examples.
#
# http://dl.yf.io/fs-det/datasets/cocosplit/ is organized like so:
#
#   datasplit/
#     trainvalno5k.json
#     5k.json
#   full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
#   seed{1-9}/
#     full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
#
# i.e. the JSON files for seed 0 live in the root directory rather than in a
# `seed?/` subdirectory, hence the empty prefix for seed 0 below.
FILE_SUFFIXES = collections.defaultdict(list)
for _seed, _shots in itertools.product(SEEDS, SHOTS):
  _subdir = 'seed{}/'.format(_seed) if _seed else ''
  FILE_SUFFIXES[(_seed, _shots)].extend(
      '{}full_box_{}shot_{}_trainval.json'.format(_subdir, _shots, _category)
      for _category in CATEGORIES)
def main(unused_argv):
  """Fuses the per-category few-shot JSON files, one output per (shots, seed).

  For every (seed, shots) combination, reads the 80 per-category JSON files
  listed in FILE_SUFFIXES under --workdir, checks that they agree on their
  metadata, merges their images (deduplicated by id) and annotations, and
  writes the result to `{shots}shot_seed{seed}.json` in the same directory.

  Args:
    unused_argv: Unused positional command-line arguments.

  Raises:
    RuntimeError: If the JSON files for a given (seed, shots) combination
      disagree on their 'info', 'licenses', or 'categories' fields.
  """
  workdir = FLAGS.workdir
  for seed, shots in itertools.product(SEEDS, SHOTS):
    # Retrieve all examples for a given seed and shots setting.
    file_paths = [os.path.join(workdir, suffix)
                  for suffix in FILE_SUFFIXES[(seed, shots)]]
    json_dicts = []
    for file_path in file_paths:
      with tf.io.gfile.GFile(file_path, 'r') as f:
        json_dicts.append(json.load(f))

    # Make sure that all JSON files for a given seed and shots setting have the
    # same metadata. We count on this to fuse them later on.
    metadata_dicts = [{'info': d['info'], 'licenses': d['licenses'],
                       'categories': d['categories']} for d in json_dicts]
    if any(d != metadata_dicts[0] for d in metadata_dicts[1:]):
      raise RuntimeError(
          'JSON files for {} shots (seed {}) '.format(shots, seed) +
          'have different info, licenses, or categories fields')

    # Retrieve images across all JSON files. chain.from_iterable flattens in
    # O(n), unlike sum(..., []) which re-copies the accumulator per file.
    images = list(
        itertools.chain.from_iterable(d['images'] for d in json_dicts))
    # Remove duplicate image entries: keying by image id keeps one entry per
    # id (the last occurrence wins; duplicates are presumed identical since
    # they describe the same COCO image -- confirm against the source files).
    images = list({image['id']: image for image in images}.values())
    output_dict = {
        'info': json_dicts[0]['info'],
        'licenses': json_dicts[0]['licenses'],
        'categories': json_dicts[0]['categories'],
        'images': images,
        'annotations': list(itertools.chain.from_iterable(
            d['annotations'] for d in json_dicts)),
    }

    output_path = os.path.join(workdir,
                               '{}shot_seed{}.json'.format(shots, seed))
    with tf.io.gfile.GFile(output_path, 'w') as f:
      json.dump(output_dict, f)
    logger.info('Processed %d shots (seed %d) and saved to %s', shots, seed,
                output_path)
if __name__ == '__main__':
  # --workdir has no usable default; fail fast if the caller omits it.
  flags.mark_flag_as_required('workdir')
  app.run(main)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment