Commit c6479e77 authored by Neal Wu

Ensure that shuffle occurs before map

parent 6e52c271
@@ -71,8 +71,6 @@ _NUM_IMAGES = {
     'validation': 10000,
 }
 
-_SHUFFLE_BUFFER = 20000
-
 
 def record_dataset(filenames):
   """Returns an input pipeline Dataset from `filenames`."""
@@ -158,8 +156,9 @@ def input_fn(is_training, data_dir, batch_size, num_epochs=1):
   if is_training:
     # When choosing shuffle buffer sizes, larger sizes result in better
-    # randomness, while smaller sizes have better performance.
-    dataset = dataset.shuffle(buffer_size=_SHUFFLE_BUFFER)
+    # randomness, while smaller sizes have better performance. Because CIFAR-10
+    # is a relatively small dataset, we choose to shuffle the full epoch.
+    dataset = dataset.shuffle(buffer_size=_NUM_IMAGES['train'])
 
   dataset = dataset.map(parse_record)
   dataset = dataset.map(
...
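
Note on the CIFAR-10 change: with shuffle moved ahead of map, the shuffle buffer holds small serialized records rather than parsed float tensors, which is what makes a full-epoch buffer affordable. A rough back-of-the-envelope sketch of the memory tradeoff (the byte sizes are assumptions based on the standard CIFAR-10 binary format, not figures from this commit):

# Rough memory estimate (assumed sizes, not from the commit): shuffling raw
# CIFAR-10 records is far cheaper than shuffling parsed float32 tensors.
RECORD_BYTES = 1 + 32 * 32 * 3   # serialized record: 1 label byte + raw pixels
PARSED_BYTES = 32 * 32 * 3 * 4   # same image parsed to float32
BUFFER_SIZE = 50000              # _NUM_IMAGES['train']: shuffle the full epoch

raw_mb = BUFFER_SIZE * RECORD_BYTES / 2**20
parsed_mb = BUFFER_SIZE * PARSED_BYTES / 2**20
print(f'shuffle before map: ~{raw_mb:.0f} MiB buffered')     # ~147 MiB
print(f'shuffle after map:  ~{parsed_mb:.0f} MiB buffered')  # ~586 MiB

At roughly a quarter of the memory, buffering the whole training set also yields a perfect per-epoch shuffle rather than an approximate one.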
@@ -142,14 +142,15 @@ def input_fn(is_training, data_dir, batch_size, num_epochs=1):
   dataset = dataset.flat_map(tf.data.TFRecordDataset)
 
-  dataset = dataset.map(lambda value: dataset_parser(value, is_training),
-                        num_parallel_calls=5).prefetch(batch_size)
-
   if is_training:
     # When choosing shuffle buffer sizes, larger sizes result in better
     # randomness, while smaller sizes have better performance.
     dataset = dataset.shuffle(buffer_size=_SHUFFLE_BUFFER)
 
+  dataset = dataset.map(lambda value: dataset_parser(value, is_training),
+                        num_parallel_calls=5)
+  dataset = dataset.prefetch(batch_size)
+
   # We call repeat after shuffling, rather than before, to prevent separate
   # epochs from blending together.
   dataset = dataset.repeat(num_epochs)
...
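
This second file makes the same reordering explicit: map and prefetch now run after the training-time shuffle, so parsing cost is paid only as elements leave the buffer. A minimal self-contained sketch of the resulting pipeline order, with a stand-in parser (parse_record_stub and input_fn_sketch are hypothetical names for illustration, not from the repository):

import tensorflow as tf

def parse_record_stub(value, is_training):
  # Stand-in for the real dataset_parser: in the actual model this would
  # decode the serialized record and apply training-time augmentation.
  return value

def input_fn_sketch(filenames, is_training, batch_size, num_epochs=1,
                    shuffle_buffer=20000):
  """Sketch of the pipeline order this commit establishes."""
  dataset = tf.data.Dataset.from_tensor_slices(filenames)
  dataset = dataset.flat_map(tf.data.TFRecordDataset)

  if is_training:
    # Shuffle first, while each element is still a small serialized string.
    dataset = dataset.shuffle(buffer_size=shuffle_buffer)

  # Parse only as elements are drawn from the shuffle buffer.
  dataset = dataset.map(lambda value: parse_record_stub(value, is_training),
                        num_parallel_calls=5)
  dataset = dataset.prefetch(batch_size)

  # Repeat after shuffle so each epoch is a complete, separate pass.
  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(batch_size)
  return dataset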
@@ -179,11 +179,12 @@ def input_fn(data_file, num_epochs, shuffle, batch_size):
   # Extract lines from input files using the Dataset API.
   dataset = tf.data.TextLineDataset(data_file)
 
-  dataset = dataset.map(parse_csv, num_parallel_calls=5)
-
   if shuffle:
     dataset = dataset.shuffle(buffer_size=_SHUFFLE_BUFFER)
 
+  dataset = dataset.map(parse_csv, num_parallel_calls=5)
+
   # We call repeat after shuffling, rather than before, to prevent separate
   # epochs from blending together.
   dataset = dataset.repeat(num_epochs)
@@ -193,6 +194,7 @@ def input_fn(data_file, num_epochs, shuffle, batch_size):
   features, labels = iterator.get_next()
   return features, labels
 
 
 def main(unused_argv):
   # Clean up the model directory if present
   shutil.rmtree(FLAGS.model_dir, ignore_errors=True)
...
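
The in-code comment about calling repeat after shuffling can be seen directly in a toy example. The sketch below (using the TF 2.x eager API for brevity, so it does not match the TF 1.x style of the diff; outputs are random and shown only as examples) contrasts the two orderings:

import tensorflow as tf

# Toy demonstration of why repeat comes after shuffle: shuffling after repeat
# can pull elements of epoch 2 ahead of the last elements of epoch 1.
ds = tf.data.Dataset.range(4)

shuffle_then_repeat = ds.shuffle(buffer_size=4).repeat(2)
repeat_then_shuffle = ds.repeat(2).shuffle(buffer_size=4)

print(list(shuffle_then_repeat.as_numpy_iterator()))
# e.g. [2, 0, 3, 1, 1, 3, 0, 2] -- each epoch is a complete permutation
print(list(repeat_then_shuffle.as_numpy_iterator()))
# e.g. [1, 0, 1, 3, 2, 0, 2, 3] -- elements of both epochs interleave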