Unverified Commit e82a2d9f authored by Dan Anghel, committed by GitHub

Push to Github of changes to the TFRecord generation of GLDv2 dataset (#8650)

* First version of working script to download the GLDv2 dataset

* First version of the DEFL package installation script

* First working version of the DELF package installation script

* Fixed feedback from PR review

* Push to Github of changes to the TFRecord data generation script for DELF.

* Merged commit includes the following changes:
315363544  

    Added the generation of TRAIN and VALIDATION splits from the train dataset.

--
314676530  

    Updated script to download GLDv2 images for DELF training.

--
314101235  

    Added newly created module 'utils' to the copybara script.

--
313677085  

    Code migration from TF1 to TF2 for:
    - logging (replaced usage of tf.compat.v1.logging.info)
    - testing directories (replaced usage of tf.compat.v1.test.get_temp_dir())
    - feature/object extraction scripts (replaced usage of tf.compat.v1.train.string_input_producer and tf.compat.v1.train.start_queue_runners with PIL)
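As a rough sketch of the kind of replacement involved (illustrative only, not the exact code from this change; the image path is a placeholder):

from absl import logging
from PIL import Image
import tensorflow as tf

# TF2-style logging instead of tf.compat.v1.logging.info.
logging.info('Processing image: %s', 'example.jpg')

# Read image bytes directly with PIL instead of the TF1 queue-based
# tf.compat.v1.train.string_input_producer / start_queue_runners pipeline.
with tf.io.gfile.GFile('/tmp/example.jpg', 'rb') as f:
  image = Image.open(f)
  width, height = image.size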

--
312770828  by 

    Internal change.

--

PiperOrigin-RevId: 315363544
parent a9684184
@@ -17,9 +17,6 @@
The image data set is expected to reside in JPEG files ending with '.jpg'.
This script assumes you have downloaded the images using the provided script:
https://www.kaggle.com/tobwey/landmark-recognition-challenge-image-downloader
This script converts the training and testing data into
a sharded data set consisting of TFRecord files
train_directory/train-00000-of-00128
@@ -50,6 +47,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import csv
import os
from absl import app
@@ -62,23 +60,48 @@ import tensorflow as tf
FLAGS = flags.FLAGS
flags.DEFINE_string('train_directory', '/tmp/', 'Training data directory.')
flags.DEFINE_string('test_directory', '/tmp/', 'Testing data directory.')
flags.DEFINE_string('test_directory', None,
'(Optional) Testing data directory. Required only if '
'test_csv_path is not None.')
flags.DEFINE_string('output_directory', '/tmp/', 'Output data directory.')
flags.DEFINE_string('train_csv_path', '/tmp/train.csv',
'Training data csv file path.')
flags.DEFINE_string('test_csv_path', '/tmp/test.csv',
'Testing data csv file path.')
flags.DEFINE_string('train_clean_csv_path', None,
('(Optional) Clean training data csv file path. '
'If provided, filters images keeping the ones listed in '
'this file. In this case, also outputs a CSV file '
'relabeling.csv mapping new labels to old ones.'))
flags.DEFINE_string('test_csv_path', None,
'(Optional) Testing data csv file path. If None or absent, '
'TFRecords for the images in the test dataset are not '
'generated.')
flags.DEFINE_integer('num_shards', 128, 'Number of shards in output data.')
def _get_image_files_and_labels(name, csv_path, image_dir):
flags.DEFINE_boolean('generate_train_validation_splits', False,
'(Optional) Whether to split the train dataset into '
'TRAIN and VALIDATION splits.')
flags.DEFINE_float('validation_split_size', 0.2,
'(Optional) The size of the VALIDATION split as a fraction '
'of the train dataset.')
flags.DEFINE_integer('seed', 0,
'(Optional) The seed to be used while shuffling the train '
'dataset when generating the TRAIN and VALIDATION splits. '
'Recommended for splits reproducibility purposes.')
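# A hypothetical invocation of this script using the flags above (assuming the
# file is saved as build_image_dataset.py; all paths are placeholders):
#   python build_image_dataset.py \
#     --train_csv_path=/tmp/train.csv \
#     --train_clean_csv_path=/tmp/train_clean.csv \
#     --train_directory=/tmp/train/ \
#     --output_directory=/tmp/tfrecord/ \
#     --num_shards=128 \
#     --generate_train_validation_splits \
#     --validation_split_size=0.2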
_FILE_IDS_KEY = 'file_ids'
_IMAGE_PATHS_KEY = 'image_paths'
_LABELS_KEY = 'labels'
_TEST_SPLIT = 'test'
_TRAIN_SPLIT = 'train'
_VALIDATION_SPLIT = 'validation'
def _get_all_image_files_and_labels(name, csv_path, image_dir):
"""Process input and get the image file paths, image ids and the labels.
Args:
name: 'train' or 'test'.
csv_path: path to the Google-landmark Dataset csv Data Sources files.
image_dir: directory that stores downloaded images.
Returns:
image_paths: the paths to all images in the image_dir.
file_ids: the unique ids of images.
@@ -87,21 +110,71 @@ def _get_image_files_and_labels(name, csv_path, image_dir):
Raises:
ValueError: if input name is not supported.
"""
image_paths = tf.io.gfile.glob(image_dir + '/*.jpg')
image_paths = tf.io.gfile.glob(os.path.join(image_dir, '*.jpg'))
file_ids = [os.path.basename(os.path.normpath(f))[:-4] for f in image_paths]
if name == 'train':
if name == _TRAIN_SPLIT:
with tf.io.gfile.GFile(csv_path, 'rb') as csv_file:
df = pd.read_csv(csv_file)
df = df.set_index('id')
labels = [int(df.loc[fid]['landmark_id']) for fid in file_ids]
elif name == 'test':
elif name == _TEST_SPLIT:
labels = []
else:
raise ValueError('Unsupported dataset split name: %s' % name)
return image_paths, file_ids, labels
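# Example of the file id derivation above for a hypothetical path:
#   os.path.basename(os.path.normpath('/tmp/train/0123456789abcdef.jpg'))[:-4]
#   -> '0123456789abcdef'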
def _get_clean_train_image_files_and_labels(csv_path, image_dir):
"""Get image file paths, image ids and labels for the clean training split.
Args:
csv_path: path to the Google-landmark Dataset v2 CSV Data Sources files
of the clean train dataset. Assumes CSV header landmark_id;images.
image_dir: directory that stores downloaded images.
Returns:
image_paths: the paths to all images in the image_dir.
file_ids: the unique ids of images.
labels: the landmark id of all images.
relabeling: relabeling rules created to replace actual labels with
a continuous set of labels.
"""
# Load the content of the CSV file (landmark_id/label -> images).
with tf.io.gfile.GFile(csv_path, 'rb') as csv_file:
df = pd.read_csv(csv_file)
# Create the dictionary (key = image_id, value = {label, file_id}).
images = {}
for _, row in df.iterrows():
label = row['landmark_id']
for file_id in row['images'].split(' '):
images[file_id] = {}
images[file_id]['label'] = label
images[file_id]['file_id'] = file_id
# Add the full image path to the dictionary of images.
image_paths = tf.io.gfile.glob(os.path.join(image_dir, '*.jpg'))
for image_path in image_paths:
file_id = os.path.basename(os.path.normpath(image_path))[:-4]
if file_id in images:
images[file_id]['image_path'] = image_path
# Explode the dictionary into lists (1 per image attribute).
image_paths = []
file_ids = []
labels = []
for _, value in images.items():
image_paths.append(value['image_path'])
file_ids.append(value['file_id'])
labels.append(value['label'])
# Relabel image labels to contiguous values.
unique_labels = sorted(set(labels))
relabeling = {label: index for index, label in enumerate(unique_labels)}
new_labels = [relabeling[label] for label in labels]
return image_paths, file_ids, new_labels, relabeling
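# Illustration of the relabeling step above on hypothetical landmark ids:
#   labels = [138982, 6051, 138982, 20409]
#   unique_labels = sorted(set(labels))     # [6051, 20409, 138982]
#   relabeling = {6051: 0, 20409: 1, 138982: 2}
#   new_labels = [relabeling[label] for label in labels]   # [2, 0, 2, 1]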
def _process_image(filename):
"""Process a single image file.
@@ -190,7 +263,7 @@ def _write_tfrecord(output_prefix, image_paths, file_ids, labels):
Raises:
ValueError: if the length of input images, ids and labels don't match
"""
if output_prefix == 'test':
if output_prefix == _TEST_SPLIT:
labels = [None] * len(image_paths)
if not len(image_paths) == len(file_ids) == len(labels):
raise ValueError('length of image_paths, file_ids, labels should be the' +
@@ -213,26 +286,189 @@ def _write_tfrecord(output_prefix, image_paths, file_ids, labels):
writer.close()
def _build_tfrecord_dataset(name, csv_path, image_dir):
"""Build a TFRecord dataset.
def _write_relabeling_rules(relabeling_rules):
"""Write to a file the relabeling rules when the clean train dataset is used.
Args:
name: 'train' or 'test' to indicate which set of data to be processed.
csv_path: path to the Google-landmark Dataset csv Data Sources files.
relabeling_rules: dictionary of relabeling rules applied when the clean
train dataset is used (key = old_label, value = new_label).
"""
relabeling_file_name = os.path.join(FLAGS.output_directory,
'relabeling.csv')
with tf.io.gfile.GFile(relabeling_file_name, 'w') as relabeling_file:
csv_writer = csv.writer(relabeling_file, delimiter=',')
csv_writer.writerow(['new_label', 'old_label'])
for old_label, new_label in relabeling_rules.items():
csv_writer.writerow([new_label, old_label])
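# Using the hypothetical relabeling rules {6051: 0, 20409: 1, 138982: 2},
# the generated relabeling.csv would contain:
#   new_label,old_label
#   0,6051
#   1,20409
#   2,138982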
def _build_train_and_validation_splits(image_paths, file_ids, labels,
validation_split_size, seed):
"""Create TRAIN and VALIDATION splits containg all labels in equal proportion.
Args:
image_paths: list of paths to the image files in the train dataset.
file_ids: list of image file ids in the train dataset.
labels: list of image labels in the train dataset.
validation_split_size: size of the VALIDATION split as a ratio of the train
dataset.
seed: seed to use for shuffling the dataset for reproducibility purposes.
Returns:
splits: tuple containing the TRAIN and VALIDATION splits.
Raises:
ValueError: if the image attributes arrays don't all have the same length,
which makes the shuffling impossible.
"""
# Ensure all image attribute arrays have the same length.
total_images = len(file_ids)
if not (len(image_paths) == total_images and len(labels) == total_images):
raise ValueError('Inconsistencies between number of file_ids (%d), number '
'of image_paths (%d) and number of labels (%d). Cannot '
'shuffle the train dataset.' % (total_images,
len(image_paths),
len(labels)))
# Stack all image attributes arrays in a single 2D array of dimensions
# (3, number of images) and group by label the indices of datapoints in the
# image attributes arrays. Explicitly convert label types from 'int' to 'str'
# to avoid implicit conversion during stacking with image_paths and file_ids
# which are 'str'.
labels_str = [str(label) for label in labels]
image_attrs = np.stack((image_paths, file_ids, labels_str))
image_attrs_idx_by_label = {}
for index, label in enumerate(labels):
if label not in image_attrs_idx_by_label:
image_attrs_idx_by_label[label] = []
image_attrs_idx_by_label[label].append(index)
# Create subsets of image attributes by label, shuffle them separately and
# split each subset into TRAIN and VALIDATION splits based on the size of the
# validation split.
splits = {}
rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed)))
for label, indexes in image_attrs_idx_by_label.items():
# Create the subset for the current label.
image_attrs_label = image_attrs[:, indexes]
images_per_label = image_attrs_label.shape[1]
# Shuffle the current label subset.
columns_indices = np.arange(images_per_label)
rs.shuffle(columns_indices)
image_attrs_label = image_attrs_label[:, columns_indices]
# Split the current label subset into TRAIN and VALIDATION splits.
cutoff_idx = max(1, int(validation_split_size * images_per_label))
validation_split = image_attrs_label[:, 0 : cutoff_idx]
train_split = image_attrs_label[:, cutoff_idx : ]
# Merge the splits of the current subset with the splits of other labels.
splits[_VALIDATION_SPLIT] = (
np.concatenate((splits[_VALIDATION_SPLIT], validation_split), axis=1)
if _VALIDATION_SPLIT in splits else validation_split)
splits[_TRAIN_SPLIT] = (
np.concatenate((splits[_TRAIN_SPLIT], train_split), axis=1)
if _TRAIN_SPLIT in splits else train_split)
# Unstack the image attribute arrays in the TRAIN and VALIDATION splits and
# convert them back to lists. Convert labels back to 'int' from 'str'
# following the explicit type change from 'int' to 'str' for stacking.
validation_split = splits[_VALIDATION_SPLIT]
train_split = splits[_TRAIN_SPLIT]
return (
{
_IMAGE_PATHS_KEY: validation_split[0, :].tolist(),
_FILE_IDS_KEY: validation_split[1, :].tolist(),
_LABELS_KEY: [int(label) for label in validation_split[2, :].tolist()]
}, {
_IMAGE_PATHS_KEY: train_split[0, :].tolist(),
_FILE_IDS_KEY: train_split[1, :].tolist(),
_LABELS_KEY: [int(label) for label in train_split[2, :].tolist()]
})
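# Toy example of calling this helper (hypothetical paths, file ids and labels).
# With two images per label and validation_split_size=0.25, the max(1, ...)
# cutoff sends one image per label to VALIDATION and keeps one in TRAIN:
#   validation, train = _build_train_and_validation_splits(
#       image_paths=['/a.jpg', '/b.jpg', '/c.jpg', '/d.jpg'],
#       file_ids=['a', 'b', 'c', 'd'],
#       labels=[0, 0, 1, 1],
#       validation_split_size=0.25,
#       seed=0)
#   # Each returned dict has keys _IMAGE_PATHS_KEY, _FILE_IDS_KEY, _LABELS_KEY.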
def _build_train_tfrecord_dataset(csv_path,
clean_csv_path,
image_dir,
generate_train_validation_splits,
validation_split_size,
seed):
"""Build a TFRecord dataset for the train split.
Args:
csv_path: path to the train Google-landmark Dataset csv Data Sources files.
clean_csv_path: path to the Google-landmark Dataset v2 CSV Data Sources
files of the clean train dataset.
image_dir: directory that stores downloaded images.
generate_train_validation_splits: whether to split the train dataset into
TRAIN and VALIDATION splits.
validation_split_size: size of the VALIDATION split as a ratio of the train
dataset. Only used if 'generate_train_validation_splits' is True.
seed: seed to use for shuffling the dataset for reproducibility purposes.
Only used if 'generate_train_validation_splits' is True.
Returns:
Nothing. After the function call, sharded TFRecord files are materialized.
Raises:
ValueError: if the size of the VALIDATION split is outside (0,1) when TRAIN
and VALIDATION splits need to be generated.
"""
# Make sure the size of the VALIDATION split is inside (0, 1) if we need to
# generate the TRAIN and VALIDATION splits.
if generate_train_validation_splits:
if validation_split_size <= 0 or validation_split_size >= 1:
raise ValueError('Invalid VALIDATION split size. Expected inside (0,1) '
'but received %f.' % validation_split_size)
if clean_csv_path:
# Load clean train images and labels and write the relabeling rules.
(image_paths, file_ids, labels,
relabeling_rules) = _get_clean_train_image_files_and_labels(clean_csv_path,
image_dir)
_write_relabeling_rules(relabeling_rules)
else:
# Load all train images.
image_paths, file_ids, labels = _get_all_image_files_and_labels(
_TRAIN_SPLIT, csv_path, image_dir)
if generate_train_validation_splits:
# Generate the TRAIN and VALIDATION splits and write them to TFRecord.
validation_split, train_split = _build_train_and_validation_splits(
image_paths, file_ids, labels, validation_split_size, seed)
_write_tfrecord(_VALIDATION_SPLIT,
validation_split[_IMAGE_PATHS_KEY],
validation_split[_FILE_IDS_KEY],
validation_split[_LABELS_KEY])
_write_tfrecord(_TRAIN_SPLIT,
train_split[_IMAGE_PATHS_KEY],
train_split[_FILE_IDS_KEY],
train_split[_LABELS_KEY])
else:
# Write to TFRecord a single split, TRAIN.
_write_tfrecord(_TRAIN_SPLIT, image_paths, file_ids, labels)
def _build_test_tfrecord_dataset(csv_path, image_dir):
"""Build a TFRecord dataset for the 'test' split.
image_paths, file_ids, labels = _get_image_files_and_labels(
name, csv_path, image_dir)
_write_tfrecord(name, image_paths, file_ids, labels)
Args:
csv_path: path to the 'test' Google-landmark Dataset csv Data Sources files.
image_dir: directory that stores downloaded images.
Returns:
Nothing. After the function call, sharded TFRecord files are materialized.
"""
image_paths, file_ids, labels = _get_all_image_files_and_labels(
_TEST_SPLIT, csv_path, image_dir)
_write_tfrecord(_TEST_SPLIT, image_paths, file_ids, labels)
def main(unused_argv):
_build_tfrecord_dataset('train', FLAGS.train_csv_path, FLAGS.train_directory)
_build_tfrecord_dataset('test', FLAGS.test_csv_path, FLAGS.test_directory)
_build_train_tfrecord_dataset(FLAGS.train_csv_path,
FLAGS.train_clean_csv_path,
FLAGS.train_directory,
FLAGS.generate_train_validation_splits,
FLAGS.validation_split_size,
FLAGS.seed)
if FLAGS.test_csv_path is not None:
_build_test_tfrecord_dataset(FLAGS.test_csv_path, FLAGS.test_directory)
if __name__ == '__main__':