Commit 6a2de9bb authored by Bruce Fontaine, committed by A. Unique TensorFlower

Fix NCF input pipeline to avoid reading the same file multiple times in one epoch.

PiperOrigin-RevId: 322415899
parent f97e0231
@@ -25,10 +25,8 @@ import tensorflow.compat.v2 as tf
 # pylint: enable=g-bad-import-order
 
 from official.recommendation import constants as rconst
-from official.recommendation import movielens
 from official.recommendation import data_pipeline
-
-NUM_SHARDS = 16
+from official.recommendation import movielens
 
 
 def create_dataset_from_tf_record_files(input_file_pattern,
@@ -36,32 +34,23 @@ def create_dataset_from_tf_record_files(input_file_pattern,
                                         batch_size,
                                         is_training=True):
   """Creates dataset from (tf)records files for training/evaluation."""
+  if pre_batch_size != batch_size:
+    raise ValueError("Pre-batch ({}) size is not equal to batch "
+                     "size ({})".format(pre_batch_size, batch_size))
+
   files = tf.data.Dataset.list_files(input_file_pattern, shuffle=is_training)
 
-  def make_dataset(files_dataset, shard_index):
-    """Returns dataset for sharded tf record files."""
-    if pre_batch_size != batch_size:
-      raise ValueError("Pre-batch ({}) size is not equal to batch "
-                       "size ({})".format(pre_batch_size, batch_size))
-    files_dataset = files_dataset.shard(NUM_SHARDS, shard_index)
-    dataset = files_dataset.interleave(
-        tf.data.TFRecordDataset,
-        num_parallel_calls=tf.data.experimental.AUTOTUNE)
-    decode_fn = functools.partial(
-        data_pipeline.DatasetManager.deserialize,
-        batch_size=pre_batch_size,
-        is_training=is_training)
-    dataset = dataset.map(
-        decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
-    return dataset
-
-  dataset = tf.data.Dataset.range(NUM_SHARDS)
-  map_fn = functools.partial(make_dataset, files)
-  dataset = dataset.interleave(
-      map_fn,
-      cycle_length=NUM_SHARDS,
+  dataset = files.interleave(
+      tf.data.TFRecordDataset,
+      cycle_length=16,
       num_parallel_calls=tf.data.experimental.AUTOTUNE)
+  decode_fn = functools.partial(
+      data_pipeline.DatasetManager.deserialize,
+      batch_size=pre_batch_size,
+      is_training=is_training)
+  dataset = dataset.map(
+      decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
   dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
   return dataset
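The removed pipeline sharded the shuffled file list NUM_SHARDS ways and interleaved the shards. Since tf.data.Dataset.list_files(..., shuffle=True) reshuffles on every traversal by default, each shard took every 16th file of its own, independently shuffled permutation, so within a single epoch some files could be read by several shards while others were skipped, which matches the behavior the commit message describes. Below is a minimal sketch of that hazard; it is not part of the commit, and the glob path is hypothetical:

import collections

import tensorflow.compat.v2 as tf

NUM_SHARDS = 16  # the constant the commit deletes

# shuffle=True implies reshuffle_each_iteration=True, so every traversal
# of `files` sees a different order ("/tmp/ncf" is a placeholder path).
files = tf.data.Dataset.list_files("/tmp/ncf/*.tfrecord", shuffle=True)

seen = collections.Counter()
for index in range(NUM_SHARDS):
  # Each .shard() iterates `files` from scratch, triggering a fresh shuffle,
  # so shard `index` keeps every 16th file of a different permutation.
  for filename in files.shard(NUM_SHARDS, index):
    seen[filename.numpy()] += 1

print("files read more than once:", sum(1 for n in seen.values() if n > 1))

The patched code drops sharding entirely: files.interleave(tf.data.TFRecordDataset, cycle_length=16, ...) opens each listed file exactly once per epoch while still reading 16 files in parallel.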
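For reference, a hedged usage sketch of the patched function; the file pattern and batch sizes below are illustrative. Because the size check now sits at the top of the function rather than inside a nested dataset factory, a mismatch fails fast at call time:

dataset = create_dataset_from_tf_record_files(
    input_file_pattern="/tmp/ncf/train-*.tfrecord",  # hypothetical pattern
    pre_batch_size=1024,
    batch_size=1024,  # must equal pre_batch_size, else ValueError
    is_training=True)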