Unverified Commit e82a2d9f authored by Dan Anghel, committed by GitHub

Push to Github of changes to the TFRecord generation of GLDv2 dataset (#8650)

* First version of working script to download the GLDv2 dataset

* First version of the DELF package installation script

* First working version of the DELF package installation script

* Fixed feedback from PR review

* Push to Github of changes to the TFRecord data generation script for DELF.

* Merged commit includes the following changes:
315363544  

    Added the generation of TRAIN and VALIDATE splits from the train dataset.

--
314676530  

    Updated script to download GLDv2 images for DELF training.

--
314101235  

    Added newly created module 'utils' to the copybara script.

--
313677085  

    Code migration from TF1 to TF2 for:
    - logging (replaced usage of tf.compat.v1.logging.info)
    - testing directories (replaced usage of tf.compat.v1.test.get_temp_dir())
    - feature/object extraction scripts (replaced usage of tf.compat.v1.train.string_input_producer and tf.compat.v1.train.start_queue_runners with PIL; a sketch of the replacement pattern follows below)
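
To make the last replacement concrete, here is a minimal sketch of PIL-based image loading standing in for the TF1 string_input_producer/start_queue_runners input pipeline. The helper name, the RGB conversion, and the example path are illustrative assumptions, not the exact code used in the DELF extraction scripts.

# Minimal sketch (assumed helper name): decode a JPEG with PIL instead of a
# TF1 queue-runner input pipeline.
import numpy as np
from PIL import Image

def _load_image(path):
  """Reads a JPEG file and returns it as a uint8 RGB numpy array."""
  with Image.open(path) as pil_image:
    return np.array(pil_image.convert('RGB'), dtype=np.uint8)

image = _load_image('/tmp/example.jpg')  # Shape: (height, width, 3).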

--
312770828  by 

    Internal change.

--

PiperOrigin-RevId: 315363544
parent a9684184
@@ -17,9 +17,6 @@
The image data set is expected to reside in JPEG files ends up with '.jpg'.
-This script assumes you have downloaded using the provided script:
-https://www.kaggle.com/tobwey/landmark-recognition-challenge-image-downloader
This script converts the training and testing data into
a sharded data set consisting of TFRecord files
  train_directory/train-00000-of-00128
@@ -50,6 +47,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

+import csv
import os

from absl import app
@@ -62,23 +60,48 @@ import tensorflow as tf
FLAGS = flags.FLAGS

flags.DEFINE_string('train_directory', '/tmp/', 'Training data directory.')
-flags.DEFINE_string('test_directory', '/tmp/', 'Testing data directory.')
+flags.DEFINE_string('test_directory', None,
+                    '(Optional) Testing data directory. Required only if '
+                    'test_csv_path is not None.')
flags.DEFINE_string('output_directory', '/tmp/', 'Output data directory.')
flags.DEFINE_string('train_csv_path', '/tmp/train.csv',
                    'Training data csv file path.')
-flags.DEFINE_string('test_csv_path', '/tmp/test.csv',
-                    'Testing data csv file path.')
+flags.DEFINE_string('train_clean_csv_path', None,
+                    ('(Optional) Clean training data csv file path. '
+                     'If provided, filters images keeping the ones listed in '
+                     'this file. In this case, also outputs a CSV file '
+                     'relabeling.csv mapping new labels to old ones.'))
+flags.DEFINE_string('test_csv_path', None,
+                    '(Optional) Testing data csv file path. If None or absent, '
+                    'TFRecords for the images in the test dataset are not '
+                    'generated.')
flags.DEFINE_integer('num_shards', 128, 'Number of shards in output data.')
+flags.DEFINE_boolean('generate_train_validation_splits', False,
+                     '(Optional) Whether to split the train dataset into '
+                     'TRAIN and VALIDATION splits.')
+flags.DEFINE_float('validation_split_size', 0.2,
+                   '(Optional) The size of the VALIDATION split as a fraction '
+                   'of the train dataset.')
+flags.DEFINE_integer('seed', 0,
+                     '(Optional) The seed to be used while shuffling the train '
+                     'dataset when generating the TRAIN and VALIDATION splits. '
+                     'Recommended for splits reproducibility purposes.')
+
+_FILE_IDS_KEY = 'file_ids'
+_IMAGE_PATHS_KEY = 'image_paths'
+_LABELS_KEY = 'labels'
+_TEST_SPLIT = 'test'
+_TRAIN_SPLIT = 'train'
+_VALIDATION_SPLIT = 'validation'


-def _get_image_files_and_labels(name, csv_path, image_dir):
+def _get_all_image_files_and_labels(name, csv_path, image_dir):
  """Process input and get the image file paths, image ids and the labels.

  Args:
    name: 'train' or 'test'.
    csv_path: path to the Google-landmark Dataset csv Data Sources files.
    image_dir: directory that stores downloaded images.

  Returns:
    image_paths: the paths to all images in the image_dir.
    file_ids: the unique ids of images.
@@ -87,21 +110,71 @@ def _get_image_files_and_labels(name, csv_path, image_dir):
  Raises:
    ValueError: if input name is not supported.
  """
-  image_paths = tf.io.gfile.glob(image_dir + '/*.jpg')
+  image_paths = tf.io.gfile.glob(os.path.join(image_dir, '*.jpg'))
  file_ids = [os.path.basename(os.path.normpath(f))[:-4] for f in image_paths]
-  if name == 'train':
+  if name == _TRAIN_SPLIT:
    with tf.io.gfile.GFile(csv_path, 'rb') as csv_file:
      df = pd.read_csv(csv_file)
    df = df.set_index('id')
    labels = [int(df.loc[fid]['landmark_id']) for fid in file_ids]
-  elif name == 'test':
+  elif name == _TEST_SPLIT:
    labels = []
  else:
    raise ValueError('Unsupported dataset split name: %s' % name)

  return image_paths, file_ids, labels


+def _get_clean_train_image_files_and_labels(csv_path, image_dir):
+  """Get image file paths, image ids and labels for the clean training split.
+
+  Args:
+    csv_path: path to the Google-landmark Dataset v2 CSV Data Sources files
+      of the clean train dataset. Assumes CSV header landmark_id;images.
+    image_dir: directory that stores downloaded images.
+
+  Returns:
+    image_paths: the paths to all images in the image_dir.
+    file_ids: the unique ids of images.
+    labels: the landmark id of all images.
+    relabeling: relabeling rules created to replace actual labels with
+      a continuous set of labels.
+  """
+  # Load the content of the CSV file (landmark_id/label -> images).
+  with tf.io.gfile.GFile(csv_path, 'rb') as csv_file:
+    df = pd.read_csv(csv_file)
+
+  # Create the dictionary (key = image_id, value = {label, file_id}).
+  images = {}
+  for _, row in df.iterrows():
+    label = row['landmark_id']
+    for file_id in row['images'].split(' '):
+      images[file_id] = {}
+      images[file_id]['label'] = label
+      images[file_id]['file_id'] = file_id
+
+  # Add the full image path to the dictionary of images.
+  image_paths = tf.io.gfile.glob(os.path.join(image_dir, '*.jpg'))
+  for image_path in image_paths:
+    file_id = os.path.basename(os.path.normpath(image_path))[:-4]
+    if file_id in images:
+      images[file_id]['image_path'] = image_path
+
+  # Explode the dictionary into lists (1 per image attribute).
+  image_paths = []
+  file_ids = []
+  labels = []
+  for _, value in images.items():
+    image_paths.append(value['image_path'])
+    file_ids.append(value['file_id'])
+    labels.append(value['label'])
+
+  # Relabel image labels to contiguous values.
+  unique_labels = sorted(set(labels))
+  relabeling = {label: index for index, label in enumerate(unique_labels)}
+  new_labels = [relabeling[label] for label in labels]
+  return image_paths, file_ids, new_labels, relabeling


def _process_image(filename):
  """Process a single image file.
@@ -190,7 +263,7 @@ def _write_tfrecord(output_prefix, image_paths, file_ids, labels):
  Raises:
    ValueError: if the length of input images, ids and labels don't match
  """
-  if output_prefix == 'test':
+  if output_prefix == _TEST_SPLIT:
    labels = [None] * len(image_paths)
  if not len(image_paths) == len(file_ids) == len(labels):
    raise ValueError('length of image_paths, file_ids, labels shoud be the' +
@@ -213,26 +286,189 @@ def _write_tfrecord(output_prefix, image_paths, file_ids, labels):
    writer.close()


-def _build_tfrecord_dataset(name, csv_path, image_dir):
-  """Build a TFRecord dataset.
-
-  Args:
-    name: 'train' or 'test' to indicate which set of data to be processed.
-    csv_path: path to the Google-landmark Dataset csv Data Sources files.
-    image_dir: directory that stores downloaded images.
-
-  Returns:
-    Nothing. After the function call, sharded TFRecord files are materialized.
-  """
-  image_paths, file_ids, labels = _get_image_files_and_labels(
-      name, csv_path, image_dir)
-  _write_tfrecord(name, image_paths, file_ids, labels)
+def _write_relabeling_rules(relabeling_rules):
+  """Write to a file the relabeling rules when the clean train dataset is used.
+
+  Args:
+    relabeling_rules: dictionary of relabeling rules applied when the clean
+      train dataset is used (key = old_label, value = new_label).
+  """
+  relabeling_file_name = os.path.join(FLAGS.output_directory,
+                                      'relabeling.csv')
+  with tf.io.gfile.GFile(relabeling_file_name, 'w') as relabeling_file:
+    csv_writer = csv.writer(relabeling_file, delimiter=',')
+    csv_writer.writerow(['new_label', 'old_label'])
+    for old_label, new_label in relabeling_rules.items():
+      csv_writer.writerow([new_label, old_label])
+
+
+def _build_train_and_validation_splits(image_paths, file_ids, labels,
+                                       validation_split_size, seed):
+  """Create TRAIN and VALIDATION splits containing all labels in equal proportion.
+
+  Args:
+    image_paths: list of paths to the image files in the train dataset.
+    file_ids: list of image file ids in the train dataset.
+    labels: list of image labels in the train dataset.
+    validation_split_size: size of the VALIDATION split as a ratio of the train
+      dataset.
+    seed: seed to use for shuffling the dataset for reproducibility purposes.
+
+  Returns:
+    splits: tuple containing the TRAIN and VALIDATION splits.
+
+  Raises:
+    ValueError: if the image attributes arrays don't all have the same length,
+      which makes the shuffling impossible.
+  """
+  # Ensure all image attribute arrays have the same length.
+  total_images = len(file_ids)
+  if not (len(image_paths) == total_images and len(labels) == total_images):
+    raise ValueError('Inconsistencies between number of file_ids (%d), number '
+                     'of image_paths (%d) and number of labels (%d). Cannot '
+                     'shuffle the train dataset.' % (total_images,
+                                                     len(image_paths),
+                                                     len(labels)))
+
+  # Stack all image attributes arrays in a single 2D array of dimensions
+  # (3, number of images) and group by label the indices of datapoints in the
+  # image attributes arrays. Explicitly convert label types from 'int' to 'str'
+  # to avoid implicit conversion during stacking with image_paths and file_ids
+  # which are 'str'.
+  labels_str = [str(label) for label in labels]
+  image_attrs = np.stack((image_paths, file_ids, labels_str))
+  image_attrs_idx_by_label = {}
+  for index, label in enumerate(labels):
+    if label not in image_attrs_idx_by_label:
+      image_attrs_idx_by_label[label] = []
+    image_attrs_idx_by_label[label].append(index)
+
+  # Create subsets of image attributes by label, shuffle them separately and
+  # split each subset into TRAIN and VALIDATION splits based on the size of the
+  # validation split.
+  splits = {}
+  rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed)))
+  for label, indexes in image_attrs_idx_by_label.items():
+    # Create the subset for the current label.
+    image_attrs_label = image_attrs[:, indexes]
+    images_per_label = image_attrs_label.shape[1]
+
+    # Shuffle the current label subset.
+    columns_indices = np.arange(images_per_label)
+    rs.shuffle(columns_indices)
+    image_attrs_label = image_attrs_label[:, columns_indices]
+
+    # Split the current label subset into TRAIN and VALIDATION splits.
+    cutoff_idx = max(1, int(validation_split_size * images_per_label))
+    validation_split = image_attrs_label[:, 0 : cutoff_idx]
+    train_split = image_attrs_label[:, cutoff_idx :]
+
+    # Merge the splits of the current subset with the splits of other labels.
+    splits[_VALIDATION_SPLIT] = (
+        np.concatenate((splits[_VALIDATION_SPLIT], validation_split), axis=1)
+        if _VALIDATION_SPLIT in splits else validation_split)
+    splits[_TRAIN_SPLIT] = (
+        np.concatenate((splits[_TRAIN_SPLIT], train_split), axis=1)
+        if _TRAIN_SPLIT in splits else train_split)
+
+  # Unstack the image attribute arrays in the TRAIN and VALIDATION splits and
+  # convert them back to lists. Convert labels back to 'int' from 'str'
+  # following the explicit type change from 'int' to 'str' for stacking.
+  validation_split = splits[_VALIDATION_SPLIT]
+  train_split = splits[_TRAIN_SPLIT]
+  return (
+      {
+          _IMAGE_PATHS_KEY: validation_split[0, :].tolist(),
+          _FILE_IDS_KEY: validation_split[1, :].tolist(),
+          _LABELS_KEY: [int(label) for label in validation_split[2, :].tolist()]
+      }, {
+          _IMAGE_PATHS_KEY: train_split[0, :].tolist(),
+          _FILE_IDS_KEY: train_split[1, :].tolist(),
+          _LABELS_KEY: [int(label) for label in train_split[2, :].tolist()]
+      })
+
+
+def _build_train_tfrecord_dataset(csv_path,
+                                  clean_csv_path,
+                                  image_dir,
+                                  generate_train_validation_splits,
+                                  validation_split_size,
+                                  seed):
+  """Build a TFRecord dataset for the train split.
+
+  Args:
+    csv_path: path to the train Google-landmark Dataset csv Data Sources files.
+    clean_csv_path: path to the Google-landmark Dataset v2 CSV Data Sources
+      files of the clean train dataset.
+    image_dir: directory that stores downloaded images.
+    generate_train_validation_splits: whether to split the train dataset into
+      TRAIN and VALIDATION splits.
+    validation_split_size: size of the VALIDATION split as a ratio of the train
+      dataset. Only used if 'generate_train_validation_splits' is True.
+    seed: seed to use for shuffling the dataset for reproducibility purposes.
+      Only used if 'generate_train_validation_splits' is True.
+
+  Returns:
+    Nothing. After the function call, sharded TFRecord files are materialized.
+
+  Raises:
+    ValueError: if the size of the VALIDATION split is outside (0,1) when TRAIN
+      and VALIDATION splits need to be generated.
+  """
+  # Make sure the size of the VALIDATION split is inside (0, 1) if we need to
+  # generate the TRAIN and VALIDATION splits.
+  if generate_train_validation_splits:
+    if validation_split_size <= 0 or validation_split_size >= 1:
+      raise ValueError('Invalid VALIDATION split size. Expected inside (0,1) '
+                       'but received %f.' % validation_split_size)
+
+  if clean_csv_path:
+    # Load clean train images and labels and write the relabeling rules.
+    (image_paths, file_ids, labels,
+     relabeling_rules) = _get_clean_train_image_files_and_labels(clean_csv_path,
+                                                                 image_dir)
+    _write_relabeling_rules(relabeling_rules)
+  else:
+    # Load all train images.
+    image_paths, file_ids, labels = _get_all_image_files_and_labels(
+        _TRAIN_SPLIT, csv_path, image_dir)
+
+  if generate_train_validation_splits:
+    # Generate the TRAIN and VALIDATION splits and write them to TFRecord.
+    validation_split, train_split = _build_train_and_validation_splits(
+        image_paths, file_ids, labels, validation_split_size, seed)
+
+    _write_tfrecord(_VALIDATION_SPLIT,
+                    validation_split[_IMAGE_PATHS_KEY],
+                    validation_split[_FILE_IDS_KEY],
+                    validation_split[_LABELS_KEY])
+    _write_tfrecord(_TRAIN_SPLIT,
+                    train_split[_IMAGE_PATHS_KEY],
+                    train_split[_FILE_IDS_KEY],
+                    train_split[_LABELS_KEY])
+  else:
+    # Write to TFRecord a single split, TRAIN.
+    _write_tfrecord(_TRAIN_SPLIT, image_paths, file_ids, labels)
+
+
+def _build_test_tfrecord_dataset(csv_path, image_dir):
+  """Build a TFRecord dataset for the 'test' split.
+
+  Args:
+    csv_path: path to the 'test' Google-landmark Dataset csv Data Sources files.
+    image_dir: directory that stores downloaded images.
+
+  Returns:
+    Nothing. After the function call, sharded TFRecord files are materialized.
+  """
+  image_paths, file_ids, labels = _get_all_image_files_and_labels(
+      _TEST_SPLIT, csv_path, image_dir)
+  _write_tfrecord(_TEST_SPLIT, image_paths, file_ids, labels)


def main(unused_argv):
-  _build_tfrecord_dataset('train', FLAGS.train_csv_path, FLAGS.train_directory)
-  _build_tfrecord_dataset('test', FLAGS.test_csv_path, FLAGS.test_directory)
+  _build_train_tfrecord_dataset(FLAGS.train_csv_path,
+                                FLAGS.train_clean_csv_path,
+                                FLAGS.train_directory,
+                                FLAGS.generate_train_validation_splits,
+                                FLAGS.validation_split_size,
+                                FLAGS.seed)
+
+  if FLAGS.test_csv_path is not None:
+    _build_test_tfrecord_dataset(FLAGS.test_csv_path, FLAGS.test_directory)


if __name__ == '__main__':
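
With the new flags in place, a typical invocation of the updated script might look like the sketch below. The script filename and the local paths are assumptions for illustration; only the flag names come from this change.

# Sketch: run the updated TFRecord generation script with the new flags.
# 'build_image_dataset.py' and the /tmp paths are assumed placeholders.
import subprocess
import sys

subprocess.run([
    sys.executable, 'build_image_dataset.py',
    '--train_directory=/tmp/gldv2/train',
    '--train_csv_path=/tmp/gldv2/train.csv',
    '--train_clean_csv_path=/tmp/gldv2/train_clean.csv',
    '--output_directory=/tmp/gldv2/tfrecord',
    '--num_shards=128',
    '--generate_train_validation_splits',
    '--validation_split_size=0.2',
    '--seed=0',
], check=True)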
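When train_clean_csv_path is used, the relabeling.csv file written by _write_relabeling_rules (a comma-delimited file with header new_label,old_label placed in output_directory) can be read back to map the new contiguous labels to the original landmark ids. The sketch below is one possible consumer, not part of the commit; the helper name and example directory are assumptions.

# Sketch: rebuild the new_label -> old_label mapping from relabeling.csv.
# The file name, header and delimiter match _write_relabeling_rules above;
# the helper name and example directory are assumed.
import csv
import os

def read_relabeling_rules(output_directory):
  """Returns a dict mapping new (contiguous) labels to original landmark ids."""
  relabeling_path = os.path.join(output_directory, 'relabeling.csv')
  new_to_old = {}
  with open(relabeling_path, 'r') as relabeling_file:
    for row in csv.DictReader(relabeling_file):
      new_to_old[int(row['new_label'])] = int(row['old_label'])
  return new_to_old

# Example usage:
# rules = read_relabeling_rules('/tmp/gldv2/tfrecord')
# original_landmark_id = rules[0]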