Commit 93ae9c4d authored by Vincent Dumoulin, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 378869744
parent e3764cd8
@@ -46,7 +46,7 @@ from official.vision.beta.data import tfrecord_lib
 flags.DEFINE_boolean(
     'include_masks', False, 'Whether to include instance segmentations masks '
     '(PNG encoded) in the result. default: False.')
-flags.DEFINE_string('image_dir', '', 'Directory containing images.')
+flags.DEFINE_multi_string('image_dir', '', 'Directory containing images.')
 flags.DEFINE_string(
     'image_info_file', '', 'File containing image information. '
     'Tf Examples in the output files correspond to the image '
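The switch from flags.DEFINE_string to flags.DEFINE_multi_string lets callers repeat --image_dir on the command line, and the flag value becomes a list of directories. A minimal sketch of that absl behavior, using a hypothetical demo module and directory names (not part of this commit):

# demo_multi_string.py -- illustrative only; the flag name mirrors the one above.
from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_multi_string('image_dir', [], 'Directory containing images.')


def main(_):
  # Invoked as:
  #   python demo_multi_string.py --image_dir=/data/train2014 --image_dir=/data/val2014
  # FLAGS.image_dir is then ['/data/train2014', '/data/val2014'].
  print(FLAGS.image_dir)


if __name__ == '__main__':
  app.run(main)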
@@ -159,7 +159,7 @@ def encode_caption_annotations(caption_annotations):
 def create_tf_example(image,
-                      image_dir,
+                      image_dirs,
                       bbox_annotations=None,
                       id_to_name_map=None,
                       caption_annotations=None,
@@ -169,7 +169,7 @@ def create_tf_example(image,
   Args:
     image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
       u'width', u'date_captured', u'flickr_url', u'id']
-    image_dir: directory containing the image files.
+    image_dirs: list of directories containing the image files.
     bbox_annotations:
       list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
         u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
@@ -190,14 +190,31 @@ def create_tf_example(image,
     num_annotations_skipped: Number of (invalid) annotations that were ignored.
   Raises:
-    ValueError: if the image pointed to by data['filename'] is not a valid JPEG
+    ValueError: if the image pointed to by data['filename'] is not a valid JPEG,
+      does not exist, or is not unique across image directories.
   """
   image_height = image['height']
   image_width = image['width']
   filename = image['file_name']
   image_id = image['id']
+  if len(image_dirs) > 1:
+    full_paths = [os.path.join(image_dir, filename) for image_dir in image_dirs]
+    full_existing_paths = [p for p in full_paths if tf.io.gfile.exists(p)]
+    if not full_existing_paths:
+      raise ValueError(
+          '{} does not exist across image directories.'.format(filename))
+    if len(full_existing_paths) > 1:
+      raise ValueError(
+          '{} is not unique across image directories'.format(filename))
+    full_path, = full_existing_paths
+  # If there is only one image directory, it's not worth checking for existence,
+  # since trying to open the file will raise an informative error message if it
+  # does not exist.
+  else:
+    image_dir, = image_dirs
+    full_path = os.path.join(image_dir, filename)
   with tf.io.gfile.GFile(full_path, 'rb') as fid:
     encoded_jpg = fid.read()
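For intuition, here is a small standalone sketch of the lookup rule the new branch implements: a file name must resolve to exactly one existing path across the supplied directories. The helper name and directory paths below are hypothetical and are not part of the commit:

# resolve_image_path.py -- illustrative sketch of the multi-directory lookup.
import os
import tensorflow as tf


def resolve_image_path(filename, image_dirs):
  """Returns the single existing path for `filename` across `image_dirs`."""
  candidates = [os.path.join(d, filename) for d in image_dirs]
  existing = [p for p in candidates if tf.io.gfile.exists(p)]
  if not existing:
    raise ValueError('{} does not exist across image directories.'.format(filename))
  if len(existing) > 1:
    raise ValueError('{} is not unique across image directories'.format(filename))
  return existing[0]


# Example (hypothetical paths):
#   resolve_image_path('COCO_train2014_000000000009.jpg',
#                      ['/data/train2014', '/data/val2014'])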
@@ -276,7 +293,7 @@ def _load_images_info(images_info_file):
   return info_dict['images']
-def generate_annotations(images, image_dir,
+def generate_annotations(images, image_dirs,
                          img_to_obj_annotation=None,
                          img_to_caption_annotation=None, id_to_name_map=None,
                          include_masks=False):
@@ -289,12 +306,12 @@ def generate_annotations(images, image_dir,
     caption_annotaion = (img_to_caption_annotation.get(image['id'], None) if
                          img_to_caption_annotation else None)
-    yield (image, image_dir, object_annotation, id_to_name_map,
+    yield (image, image_dirs, object_annotation, id_to_name_map,
            caption_annotaion, include_masks)
 def _create_tf_record_from_coco_annotations(images_info_file,
-                                            image_dir,
+                                            image_dirs,
                                             output_path,
                                             num_shards,
                                             object_annotations_file=None,
@@ -309,7 +326,7 @@ def _create_tf_record_from_coco_annotations(images_info_file,
       files Eg. 'image_info_test-dev2017.json',
       'instance_annotations_train2017.json',
       'caption_annotations_train2017.json', etc.
-    image_dir: Directory containing the image files.
+    image_dirs: List of directories containing the image files.
     output_path: Path to output tf.Record file.
     num_shards: Number of output files to create.
     object_annotations_file: JSON file containing bounding box annotations.
@@ -333,7 +350,7 @@ def _create_tf_record_from_coco_annotations(images_info_file,
         _load_caption_annotations(caption_annotations_file))
   coco_annotations_iter = generate_annotations(
-      images, image_dir, img_to_obj_annotation, img_to_caption_annotation,
+      images, image_dirs, img_to_obj_annotation, img_to_caption_annotation,
       id_to_name_map=id_to_name_map, include_masks=include_masks)
   num_skipped = tfrecord_lib.write_tf_record_dataset(
......
#!/bin/bash
#
# Processes the COCO few-shot benchmark into TFRecord files. Requires `wget`.
tmp_dir=$(mktemp -d -t coco-XXXXXXXXXX)
output_dir="/tmp/coco_few_shot"
while getopts "o:" o; do
  case "${o}" in
    o) output_dir=${OPTARG} ;;
    *) echo "Usage: ${0} [-o <output_dir>]" 1>&2; exit 1 ;;
  esac
done
cocosplit_url="dl.yf.io/fs-det/datasets/cocosplit"
wget --recursive --no-parent -q --show-progress --progress=bar:force:noscroll \
    -P "${tmp_dir}" -A "5k.json,*10shot*.json,*30shot*.json" \
    "http://${cocosplit_url}/"
mv "${tmp_dir}/${cocosplit_url}/"* "${tmp_dir}"
rm -rf "${tmp_dir}/${cocosplit_url}/"
python process_coco_few_shot_json_files.py \
    --logtostderr --workdir="${tmp_dir}"
for seed in {0..9}; do
  for shots in 10 30; do
    python create_coco_tf_record.py \
      --logtostderr \
      --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/train2014 \
      --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/val2014 \
      --image_info_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
      --object_annotations_file="${tmp_dir}/${shots}shot_seed${seed}.json" \
      --caption_annotations_file="" \
      --output_file_prefix="${output_dir}/${shots}shot_seed${seed}" \
      --num_shards=4
  done
done
python create_coco_tf_record.py \
  --logtostderr \
  --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/train2014 \
  --image_dir=/namespace/vale-project/datasets/mscoco_raw/images/val2014 \
  --image_info_file="${tmp_dir}/datasplit/5k.json" \
  --object_annotations_file="${tmp_dir}/datasplit/5k.json" \
  --caption_annotations_file="" \
  --output_file_prefix="${output_dir}/5k" \
  --num_shards=10
rm -rf "${tmp_dir}"
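After the script finishes, the generated shards can be read back with tf.data as a quick sanity check. The sketch below is illustrative only: the shard file-name pattern produced by tfrecord_lib.write_tf_record_dataset is an assumption, and the path uses the script's default output directory.

# inspect_shards.py -- illustrative; shard naming pattern is an assumption.
import tensorflow as tf

# Default output location of the script above; adjust if -o was used.
shard_paths = tf.io.gfile.glob('/tmp/coco_few_shot/10shot_seed0*')

dataset = tf.data.TFRecordDataset(shard_paths)
for raw_record in dataset.take(1):
  example = tf.train.Example.FromString(raw_record.numpy())
  # Print the feature keys stored for the first record.
  print(sorted(example.features.feature.keys()))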
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processes the JSON files for COCO few-shot.
We assume that `workdir` mirrors the contents of
http://dl.yf.io/fs-det/datasets/cocosplit/, which contains the official JSON
files for the few-shot COCO evaluation procedure that Wang et al. (2020)'s
"Frustratingly Simple Few-Shot Object Detection" paper uses.
"""
import collections
import itertools
import json
import logging
import os
from absl import app
from absl import flags
import tensorflow as tf
logger = tf.get_logger()
logger.setLevel(logging.INFO)
flags.DEFINE_string('workdir', None, 'Working directory.')
FLAGS = flags.FLAGS
CATEGORIES = ['airplane', 'apple', 'backpack', 'banana', 'baseball bat',
              'baseball glove', 'bear', 'bed', 'bench', 'bicycle', 'bird',
              'boat', 'book', 'bottle', 'bowl', 'broccoli', 'bus', 'cake',
              'car', 'carrot', 'cat', 'cell phone', 'chair', 'clock', 'couch',
              'cow', 'cup', 'dining table', 'dog', 'donut', 'elephant',
              'fire hydrant', 'fork', 'frisbee', 'giraffe', 'hair drier',
              'handbag', 'horse', 'hot dog', 'keyboard', 'kite', 'knife',
              'laptop', 'microwave', 'motorcycle', 'mouse', 'orange', 'oven',
              'parking meter', 'person', 'pizza', 'potted plant',
              'refrigerator', 'remote', 'sandwich', 'scissors', 'sheep',
              'sink', 'skateboard', 'skis', 'snowboard', 'spoon', 'sports ball',
              'stop sign', 'suitcase', 'surfboard', 'teddy bear',
              'tennis racket', 'tie', 'toaster', 'toilet', 'toothbrush',
              'traffic light', 'train', 'truck', 'tv', 'umbrella', 'vase',
              'wine glass', 'zebra']
SEEDS = list(range(10))
SHOTS = [10, 30]
FILE_SUFFIXES = collections.defaultdict(list)
for _seed, _shots in itertools.product(SEEDS, SHOTS):
  for _category in CATEGORIES:
    FILE_SUFFIXES[(_seed, _shots)].append(
        '{}full_box_{}shot_{}_trainval.json'.format(
            # http://dl.yf.io/fs-det/datasets/cocosplit/ is organized like so:
            #
            #   datasplit/
            #     trainvalno5k.json
            #     5k.json
            #   full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
            #   seed{1-9}/
            #     full_box_{1,2,3,5,10,30}shot_{category}_trainval.json
            #
            # This means that the JSON files for seed0 are located in the root
            # directory rather than in a `seed?/` subdirectory, hence the
            # conditional expression below.
            '' if _seed == 0 else 'seed{}/'.format(_seed),
            _shots,
            _category))
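# For illustration (comment not in the original file): given the layout above,
# FILE_SUFFIXES[(0, 10)] holds entries such as
# 'full_box_10shot_airplane_trainval.json', while FILE_SUFFIXES[(3, 30)] holds
# entries such as 'seed3/full_box_30shot_airplane_trainval.json'.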
def main(unused_argv):
  workdir = FLAGS.workdir
  for seed, shots in itertools.product(SEEDS, SHOTS):
    # Retrieve all examples for a given seed and shots setting.
    file_paths = [os.path.join(workdir, suffix)
                  for suffix in FILE_SUFFIXES[(seed, shots)]]
    json_dicts = []
    for file_path in file_paths:
      with tf.io.gfile.GFile(file_path, 'r') as f:
        json_dicts.append(json.load(f))
    # Make sure that all JSON files for a given seed and shots setting have the
    # same metadata. We count on this to fuse them later on.
    metadata_dicts = [{'info': d['info'], 'licenses': d['licenses'],
                       'categories': d['categories']} for d in json_dicts]
    if not all(d == metadata_dicts[0] for d in metadata_dicts[1:]):
      raise RuntimeError(
          'JSON files for {} shots (seed {}) '.format(shots, seed) +
          'have different info, licences, or categories fields')
    # Retrieve images across all JSON files.
    images = sum((d['images'] for d in json_dicts), [])
    # Remove duplicate image entries.
    images = list({image['id']: image for image in images}.values())
    output_dict = {
        'info': json_dicts[0]['info'],
        'licenses': json_dicts[0]['licenses'],
        'categories': json_dicts[0]['categories'],
        'images': images,
        'annotations': sum((d['annotations'] for d in json_dicts), [])
    }
    output_path = os.path.join(workdir,
                               '{}shot_seed{}.json'.format(shots, seed))
    with tf.io.gfile.GFile(output_path, 'w') as f:
      json.dump(output_dict, f)
    logger.info('Processed %d shots (seed %d) and saved to %s',
                shots, seed, output_path)


if __name__ == '__main__':
  flags.mark_flag_as_required('workdir')
  app.run(main)
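To see what the merged files look like, a short, illustrative snippet follows; the workdir path is a placeholder for whatever temporary directory the shell script created:

# inspect_merged_json.py -- illustrative only; the workdir path is hypothetical.
import json

with open('/tmp/coco-workdir/10shot_seed0.json') as f:
  merged = json.load(f)

# One entry per unique image; annotations are pooled across the per-category
# source files for this seed/shots setting.
print(len(merged['images']), 'images')
print(len(merged['annotations']), 'annotations')
print(len(merged['categories']), 'categories')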