Unverified Commit e82a2d9f authored by Dan Anghel, committed by GitHub

Push to Github of changes to the TFRecord generation of GLDv2 dataset (#8650)

* First version of working script to download the GLDv2 dataset

* First version of the DEFL package installation script

* First working version of the DELF package installation script

* Fixed feedback from PR review

* Push to Github of changes to the TFRecord data generation script for DELF.

* Merged commit includes the following changes:
315363544  

    Added the generation of TRAIN and VALIDATION splits from the train dataset.

--
314676530  

    Updated script to download GLDv2 images for DELF training.

--
314101235  

    Added newly created module 'utils' to the copybara script.

--
313677085  

    Code migration from TF1 to TF2 for:
    - logging (replaced usage of tf.compat.v1.logging.info)
    - testing directories (replaced usage of tf.compat.v1.test.get_temp_dir())
    - feature/object extraction scripts (replaced usage of tf.compat.v1.train.string_input_producer and tf.compat.v1.train.start_queue_runners with PIL)
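As a rough sketch of the kind of replacement involved (illustrative only, not the exact code from this change; the image path is a placeholder):

from absl import logging
from PIL import Image
import tensorflow as tf

# TF2-style logging instead of tf.compat.v1.logging.info.
logging.info('Processing image: %s', 'example.jpg')

# Read image bytes directly with PIL instead of the TF1 queue-based
# tf.compat.v1.train.string_input_producer / start_queue_runners pipeline.
with tf.io.gfile.GFile('/tmp/example.jpg', 'rb') as f:
  image = Image.open(f)
  width, height = image.size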

--
312770828  by 

    Internal change.

--

PiperOrigin-RevId: 315363544
parent a9684184
@@ -17,9 +17,6 @@
The image data set is expected to reside in JPEG files ending with '.jpg'.
This script assumes you have downloaded the images using the provided script:
https://www.kaggle.com/tobwey/landmark-recognition-challenge-image-downloader
This script converts the training and testing data into
a sharded data set consisting of TFRecord files
train_directory/train-00000-of-00128
@@ -50,6 +47,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import csv
import os
from absl import app
@@ -62,23 +60,48 @@ import tensorflow as tf
FLAGS = flags.FLAGS
flags.DEFINE_string('train_directory', '/tmp/', 'Training data directory.')
flags.DEFINE_string('test_directory', '/tmp/', 'Testing data directory.')
flags.DEFINE_string('test_directory', None,
'(Optional) Testing data directory. Required only if '
'test_csv_path is not None.')
flags.DEFINE_string('output_directory', '/tmp/', 'Output data directory.')
flags.DEFINE_string('train_csv_path', '/tmp/train.csv',
'Training data csv file path.')
flags.DEFINE_string('test_csv_path', '/tmp/test.csv',
'Testing data csv file path.')
flags.DEFINE_string('train_clean_csv_path', None,
('(Optional) Clean training data csv file path. '
'If provided, filters images keeping the ones listed in '
'this file. In this case, also outputs a CSV file '
'relabeling.csv mapping new labels to old ones.'))
flags.DEFINE_string('test_csv_path', None,
'(Optional) Testing data csv file path. If None or absent, '
'TFRecords for the images in the test dataset are not '
'generated.')
flags.DEFINE_integer('num_shards', 128, 'Number of shards in output data.')
def _get_image_files_and_labels(name, csv_path, image_dir):
flags.DEFINE_boolean('generate_train_validation_splits', False,
'(Optional) Whether to split the train dataset into '
'TRAIN and VALIDATION splits.')
flags.DEFINE_float('validation_split_size', 0.2,
'(Optional) The size of the VALIDATION split as a fraction '
'of the train dataset.')
flags.DEFINE_integer('seed', 0,
'(Optional) The seed to be used while shuffling the train '
'dataset when generating the TRAIN and VALIDATION splits. '
'Recommended for splits reproducibility purposes.')
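# A hypothetical invocation of this script using the flags above (assuming the
# file is saved as build_image_dataset.py; all paths are placeholders):
#   python build_image_dataset.py \
#     --train_csv_path=/tmp/train.csv \
#     --train_clean_csv_path=/tmp/train_clean.csv \
#     --train_directory=/tmp/train/ \
#     --output_directory=/tmp/tfrecord/ \
#     --num_shards=128 \
#     --generate_train_validation_splits \
#     --validation_split_size=0.2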
_FILE_IDS_KEY = 'file_ids'
_IMAGE_PATHS_KEY = 'image_paths'
_LABELS_KEY = 'labels'
_TEST_SPLIT = 'test'
_TRAIN_SPLIT = 'train'
_VALIDATION_SPLIT = 'validation'
def _get_all_image_files_and_labels(name, csv_path, image_dir):
"""Process input and get the image file paths, image ids and the labels.
Args:
name: 'train' or 'test'.
csv_path: path to the Google-landmark Dataset csv Data Sources files.
image_dir: directory that stores downloaded images.
Returns:
image_paths: the paths to all images in the image_dir.
file_ids: the unique ids of images.
@@ -87,21 +110,71 @@ def _get_image_files_and_labels(name, csv_path, image_dir):
Raises:
ValueError: if input name is not supported.
"""
image_paths = tf.io.gfile.glob(image_dir + '/*.jpg')
image_paths = tf.io.gfile.glob(os.path.join(image_dir, '*.jpg'))
file_ids = [os.path.basename(os.path.normpath(f))[:-4] for f in image_paths]
if name == 'train':
if name == _TRAIN_SPLIT:
with tf.io.gfile.GFile(csv_path, 'rb') as csv_file:
df = pd.read_csv(csv_file)
df = df.set_index('id')
labels = [int(df.loc[fid]['landmark_id']) for fid in file_ids]
elif name == 'test':
elif name == _TEST_SPLIT:
labels = []
else:
raise ValueError('Unsupported dataset split name: %s' % name)
return image_paths, file_ids, labels
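# Example of the file id derivation above for a hypothetical path:
#   os.path.basename(os.path.normpath('/tmp/train/0123456789abcdef.jpg'))[:-4]
#   -> '0123456789abcdef'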
def _get_clean_train_image_files_and_labels(csv_path, image_dir):
"""Get image file paths, image ids and labels for the clean training split.
Args:
csv_path: path to the Google-landmark Dataset v2 CSV Data Sources files
of the clean train dataset. Assumes CSV header landmark_id;images.
image_dir: directory that stores downloaded images.
Returns:
image_paths: the paths to all images in the image_dir.
file_ids: the unique ids of images.
labels: the landmark id of all images.
relabeling: relabeling rules created to replace actual labels with
a continuous set of labels.
"""
# Load the content of the CSV file (landmark_id/label -> images).
with tf.io.gfile.GFile(csv_path, 'rb') as csv_file:
df = pd.read_csv(csv_file)
# Create the dictionary (key = image_id, value = {label, file_id}).
images = {}
for _, row in df.iterrows():
label = row['landmark_id']
for file_id in row['images'].split(' '):
images[file_id] = {}
images[file_id]['label'] = label
images[file_id]['file_id'] = file_id
# Add the full image path to the dictionary of images.
image_paths = tf.io.gfile.glob(os.path.join(image_dir, '*.jpg'))
for image_path in image_paths:
file_id = os.path.basename(os.path.normpath(image_path))[:-4]
if file_id in images:
images[file_id]['image_path'] = image_path
# Explode the dictionary into lists (1 per image attribute).
image_paths = []
file_ids = []
labels = []
for _, value in images.items():
image_paths.append(value['image_path'])
file_ids.append(value['file_id'])
labels.append(value['label'])
# Relabel image labels to contiguous values.
unique_labels = sorted(set(labels))
relabeling = {label: index for index, label in enumerate(unique_labels)}
new_labels = [relabeling[label] for label in labels]
return image_paths, file_ids, new_labels, relabeling
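# Illustration of the relabeling step above on hypothetical landmark ids:
#   labels = [138982, 6051, 138982, 20409]
#   unique_labels = sorted(set(labels))     # [6051, 20409, 138982]
#   relabeling = {6051: 0, 20409: 1, 138982: 2}
#   new_labels = [relabeling[label] for label in labels]   # [2, 0, 2, 1]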
def _process_image(filename):
"""Process a single image file.
@@ -190,7 +263,7 @@ def _write_tfrecord(output_prefix, image_paths, file_ids, labels):
Raises:
ValueError: if the length of input images, ids and labels don't match
"""
if output_prefix == 'test':
if output_prefix == _TEST_SPLIT:
labels = [None] * len(image_paths)
if not len(image_paths) == len(file_ids) == len(labels):
raise ValueError('length of image_paths, file_ids, labels should be the' +
@@ -213,26 +286,189 @@ def _write_tfrecord(output_prefix, image_paths, file_ids, labels):
writer.close()
def _build_tfrecord_dataset(name, csv_path, image_dir):
"""Build a TFRecord dataset.
def _write_relabeling_rules(relabeling_rules):
"""Write to a file the relabeling rules when the clean train dataset is used.
Args:
name: 'train' or 'test' to indicate which set of data to be processed.
csv_path: path to the Google-landmark Dataset csv Data Sources files.
relabeling_rules: dictionary of relabeling rules applied when the clean
train dataset is used (key = old_label, value = new_label).
"""
relabeling_file_name = os.path.join(FLAGS.output_directory,
'relabeling.csv')
with tf.io.gfile.GFile(relabeling_file_name, 'w') as relabeling_file:
csv_writer = csv.writer(relabeling_file, delimiter=',')
csv_writer.writerow(['new_label', 'old_label'])
for old_label, new_label in relabeling_rules.items():
csv_writer.writerow([new_label, old_label])
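# Using the hypothetical relabeling rules {6051: 0, 20409: 1, 138982: 2},
# the generated relabeling.csv would contain:
#   new_label,old_label
#   0,6051
#   1,20409
#   2,138982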
def _build_train_and_validation_splits(image_paths, file_ids, labels,
validation_split_size, seed):
"""Create TRAIN and VALIDATION splits containg all labels in equal proportion.
Args:
image_paths: list of paths to the image files in the train dataset.
file_ids: list of image file ids in the train dataset.
labels: list of image labels in the train dataset.
validation_split_size: size of the VALIDATION split as a ratio of the train
dataset.
seed: seed to use for shuffling the dataset for reproducibility purposes.
Returns:
splits: tuple containing the TRAIN and VALIDATION splits.
Raises:
ValueError: if the image attributes arrays don't all have the same length,
which makes the shuffling impossible.
"""
# Ensure all image attribute arrays have the same length.
total_images = len(file_ids)
if not (len(image_paths) == total_images and len(labels) == total_images):
raise ValueError('Inconsistencies between number of file_ids (%d), number '
'of image_paths (%d) and number of labels (%d). Cannot '
'shuffle the train dataset.' % (total_images,
len(image_paths),
len(labels)))
# Stack all image attributes arrays in a single 2D array of dimensions
# (3, number of images) and group by label the indices of datapoints in the
# image attributes arrays. Explicitly convert label types from 'int' to 'str'
# to avoid implicit conversion during stacking with image_paths and file_ids
# which are 'str'.
labels_str = [str(label) for label in labels]
image_attrs = np.stack((image_paths, file_ids, labels_str))
image_attrs_idx_by_label = {}
for index, label in enumerate(labels):
if label not in image_attrs_idx_by_label:
image_attrs_idx_by_label[label] = []
image_attrs_idx_by_label[label].append(index)
# Create subsets of image attributes by label, shuffle them separately and
# split each subset into TRAIN and VALIDATION splits based on the size of the
# validation split.
splits = {}
rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed)))
for label, indexes in image_attrs_idx_by_label.items():
# Create the subset for the current label.
image_attrs_label = image_attrs[:, indexes]
images_per_label = image_attrs_label.shape[1]
# Shuffle the current label subset.
columns_indices = np.arange(images_per_label)
rs.shuffle(columns_indices)
image_attrs_label = image_attrs_label[:, columns_indices]
# Split the current label subset into TRAIN and VALIDATION splits.
cutoff_idx = max(1, int(validation_split_size * images_per_label))
validation_split = image_attrs_label[:, 0 : cutoff_idx]
train_split = image_attrs_label[:, cutoff_idx : ]
# Merge the splits of the current subset with the splits of other labels.
splits[_VALIDATION_SPLIT] = (
np.concatenate((splits[_VALIDATION_SPLIT], validation_split), axis=1)
if _VALIDATION_SPLIT in splits else validation_split)
splits[_TRAIN_SPLIT] = (
np.concatenate((splits[_TRAIN_SPLIT], train_split), axis=1)
if _TRAIN_SPLIT in splits else train_split)
# Unstack the image attribute arrays in the TRAIN and VALIDATION splits and
# convert them back to lists. Convert labels back to 'int' from 'str'
# following the explicit type change from 'int' to 'str' for stacking.
validation_split = splits[_VALIDATION_SPLIT]
train_split = splits[_TRAIN_SPLIT]
return (
{
_IMAGE_PATHS_KEY: validation_split[0, :].tolist(),
_FILE_IDS_KEY: validation_split[1, :].tolist(),
_LABELS_KEY: [int(label) for label in validation_split[2, :].tolist()]
}, {
_IMAGE_PATHS_KEY: train_split[0, :].tolist(),
_FILE_IDS_KEY: train_split[1, :].tolist(),
_LABELS_KEY: [int(label) for label in train_split[2, :].tolist()]
})
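# Toy example of calling this helper (hypothetical paths, file ids and labels).
# With two images per label and validation_split_size=0.25, the max(1, ...)
# cutoff sends one image per label to VALIDATION and keeps one in TRAIN:
#   validation, train = _build_train_and_validation_splits(
#       image_paths=['/a.jpg', '/b.jpg', '/c.jpg', '/d.jpg'],
#       file_ids=['a', 'b', 'c', 'd'],
#       labels=[0, 0, 1, 1],
#       validation_split_size=0.25,
#       seed=0)
#   # Each returned dict has keys _IMAGE_PATHS_KEY, _FILE_IDS_KEY, _LABELS_KEY.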
def _build_train_tfrecord_dataset(csv_path,
clean_csv_path,
image_dir,
generate_train_validation_splits,
validation_split_size,
seed):
"""Build a TFRecord dataset for the train split.
Args:
csv_path: path to the train Google-landmark Dataset csv Data Sources files.
clean_csv_path: path to the Google-landmark Dataset v2 CSV Data Sources
files of the clean train dataset.
image_dir: directory that stores downloaded images.
generate_train_validation_splits: whether to split the train dataset into
TRAIN and VALIDATION splits.
validation_split_size: size of the VALIDATION split as a ratio of the train
dataset. Only used if 'generate_train_validation_splits' is True.
seed: seed to use for shuffling the dataset for reproducibility purposes.
Only used if 'generate_train_validation_splits' is True.
Returns:
Nothing. After the function call, sharded TFRecord files are materialized.
Raises:
ValueError: if the size of the VALIDATION split is outside (0,1) when TRAIN
and VALIDATION splits need to be generated.
"""
# Make sure the size of the VALIDATION split is inside (0, 1) if we need to
# generate the TRAIN and VALIDATION splits.
if generate_train_validation_splits:
if validation_split_size <= 0 or validation_split_size >= 1:
raise ValueError('Invalid VALIDATION split size. Expected inside (0,1) '
'but received %f.' % validation_split_size)
if clean_csv_path:
# Load clean train images and labels and write the relabeling rules.
(image_paths, file_ids, labels,
relabeling_rules) = _get_clean_train_image_files_and_labels(clean_csv_path,
image_dir)
_write_relabeling_rules(relabeling_rules)
else:
# Load all train images.
image_paths, file_ids, labels = _get_all_image_files_and_labels(
_TRAIN_SPLIT, csv_path, image_dir)
if generate_train_validation_splits:
# Generate the TRAIN and VALIDATION splits and write them to TFRecord.
validation_split, train_split = _build_train_and_validation_splits(
image_paths, file_ids, labels, validation_split_size, seed)
_write_tfrecord(_VALIDATION_SPLIT,
validation_split[_IMAGE_PATHS_KEY],
validation_split[_FILE_IDS_KEY],
validation_split[_LABELS_KEY])
_write_tfrecord(_TRAIN_SPLIT,
train_split[_IMAGE_PATHS_KEY],
train_split[_FILE_IDS_KEY],
train_split[_LABELS_KEY])
else:
# Write to TFRecord a single split, TRAIN.
_write_tfrecord(_TRAIN_SPLIT, image_paths, file_ids, labels)
def _build_test_tfrecord_dataset(csv_path, image_dir):
"""Build a TFRecord dataset for the 'test' split.
image_paths, file_ids, labels = _get_image_files_and_labels(
name, csv_path, image_dir)
_write_tfrecord(name, image_paths, file_ids, labels)
Args:
csv_path: path to the 'test' Google-landmark Dataset csv Data Sources files.
image_dir: directory that stores downloaded images.
Returns:
Nothing. After the function call, sharded TFRecord files are materialized.
"""
image_paths, file_ids, labels = _get_all_image_files_and_labels(
_TEST_SPLIT, csv_path, image_dir)
_write_tfrecord(_TEST_SPLIT, image_paths, file_ids, labels)
def main(unused_argv):
_build_tfrecord_dataset('train', FLAGS.train_csv_path, FLAGS.train_directory)
_build_tfrecord_dataset('test', FLAGS.test_csv_path, FLAGS.test_directory)
_build_train_tfrecord_dataset(FLAGS.train_csv_path,
FLAGS.train_clean_csv_path,
FLAGS.train_directory,
FLAGS.generate_train_validation_splits,
FLAGS.validation_split_size,
FLAGS.seed)
if FLAGS.test_csv_path is not None:
_build_test_tfrecord_dataset(FLAGS.test_csv_path, FLAGS.test_directory)
if __name__ == '__main__':