Unverified commit 19d4eaaf authored by Taylor Robie, committed by GitHub

Reorder NCF data pipeline (#5536)

* intermediate commit

finish replacing spillover with resampled padding

intermediate commit

* resolve merge conflict

* intermediate commit

* further consolidate the data pipeline

* complete first pass at data pipeline refactor

* remove some leftover code

* fix test

* remove resampling, and move train padding logic into neumf.py

* small tweaks

* fix weight bug

* address PR comments

* fix dict zip. (Reed led me astray)

* delint

* make data test deterministic and delint

* Reed didn't lead me astray. I just can't read.

* more delinting

* even more delinting

* use resampling for last batch padding

* pad last batch with unique data

* Revert "pad last batch with unique data"

This reverts commit cbdf46efcd5c7907038a24105b88d38e7f1d6da2.

* move padded batch to the beginning

* delint

* fix step check for synthetic data
parent 413f15ba
@@ -35,16 +35,16 @@ class Paths(object):
                                                  "positive_shard_{}.pickle")
     self.train_epoch_dir = os.path.join(self.cache_root, "training_epochs")
     self.eval_data_subdir = os.path.join(self.cache_root, "eval_data")
-    self.eval_raw_file = os.path.join(self.eval_data_subdir, "raw.pickle")
-    self.eval_record_template_temp = os.path.join(self.eval_data_subdir,
-                                                  "eval_records.temp")
-    self.eval_record_template = os.path.join(
-        self.eval_data_subdir, "padded_eval_batch_size_{}.tfrecords")
     self.subproc_alive = os.path.join(self.cache_root, "subproc.alive")


 APPROX_PTS_PER_TRAIN_SHARD = 128000

+# Keys for data shards
+TRAIN_KEY = "train"
+EVAL_KEY = "eval"
+
 # In both datasets, each user has at least 20 ratings.
 MIN_NUM_RATINGS = 20

@@ -68,7 +68,9 @@ FLAGFILE_TEMP = "flagfile.temp"
 FLAGFILE = "flagfile"
 READY_FILE_TEMP = "ready.json.temp"
 READY_FILE = "ready.json"
+
 TRAIN_RECORD_TEMPLATE = "train_{}.tfrecords"
+EVAL_RECORD_TEMPLATE = "eval_{}.tfrecords"

 TIMEOUT_SECONDS = 3600 * 2  # If the train loop goes more than two hours without
                             # consuming an epoch of data, this is a good
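The two record templates above are plain format strings. A minimal sketch of how the writer and reader sides use them (the cache path is hypothetical; the zfill and wildcard conventions are taken from make_input_fn further down):

import os

TRAIN_RECORD_TEMPLATE = "train_{}.tfrecords"  # as defined above
EVAL_RECORD_TEMPLATE = "eval_{}.tfrecords"    # as defined above

record_dir = "/tmp/ncf_cache/training_epochs/00000"  # hypothetical cache dir

# Writers zero-pad the shard index so lexicographic order == numeric order.
shard_file = os.path.join(record_dir, TRAIN_RECORD_TEMPLATE.format(str(7).zfill(5)))
print(shard_file)    # /tmp/ncf_cache/training_epochs/00000/train_00007.tfrecords

# Readers glob every shard at once, as make_input_fn does below.
glob_pattern = os.path.join(record_dir, TRAIN_RECORD_TEMPLATE.format("*"))
print(glob_pattern)  # /tmp/ncf_cache/training_epochs/00000/train_*.tfrecords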
......
@@ -57,7 +57,7 @@ DATASET_TO_NUM_USERS_AND_ITEMS = {
 # Number of batches to run per epoch when using synthetic data. At high batch
 # sizes, we run for more batches than with real data, which is good since
 # running more batches reduces noise when measuring the average batches/second.
-_SYNTHETIC_BATCHES_PER_EPOCH = 2000
+SYNTHETIC_BATCHES_PER_EPOCH = 2000


 class NCFDataset(object):

@@ -65,7 +65,7 @@ class NCFDataset(object):
   def __init__(self, user_map, item_map, num_data_readers, cache_paths,
                num_train_positives, deterministic=False):
-    # type: (dict, dict, int, rconst.Paths) -> None
+    # type: (dict, dict, int, rconst.Paths, int, bool) -> None
     """Assign key values for recommendation dataset.

     Args:

@@ -175,7 +175,6 @@ def _filter_index_sort(raw_rating_path, match_mlperf):
 def _train_eval_map_fn(args):
-  # type: (...) -> typing.Dict(np.ndarray)
   """Split training and testing data and generate testing negatives.

   This function is called as part of a multiprocessing map. The principle

@@ -186,9 +185,8 @@ def _train_eval_map_fn(args):
   For each user, all but the last item is written into a pickle file which the
   training data producer can consume on as needed. The last item for a user
-  is a validation point; for each validation point a number of negatives are
-  generated (typically 999). The validation data is returned by this function,
-  as it is held in memory for the remainder of the run.
+  is a validation point; it is written under a separate key and will be used
+  later to generate the evaluation data.

   Args:
     shard: A dict containing the user and item arrays.

@@ -198,16 +196,10 @@ def _train_eval_map_fn(args):
       which validation negatives should be drawn.
     cache_paths: rconst.Paths object containing locations for various cache
       files.
-    seed: Random seed to be used when generating testing negatives.
-    match_mlperf: If True, sample eval negatives with replacement, which the
-      MLPerf reference implementation does.
-
-  Returns:
-    A dict containing the evaluation data for a given shard.
   """
-  shard, shard_id, num_items, cache_paths, seed, match_mlperf = args
-  np.random.seed(seed)
+  shard, shard_id, num_items, cache_paths = args

   users = shard[movielens.USER_COLUMN]
   items = shard[movielens.ITEM_COLUMN]

@@ -218,7 +210,6 @@ def _train_eval_map_fn(args):
                                [users.shape[0]])

   train_blocks = []
-  test_blocks = []
   test_positives = []
   for i in range(len(boundaries) - 1):
     # This is simply a vector of repeated values such that the shard could be

@@ -233,38 +224,30 @@ def _train_eval_map_fn(args):
     block_items = items[boundaries[i]:boundaries[i+1]]
     train_blocks.append((block_user[:-1], block_items[:-1]))

-    test_negatives = stat_utils.sample_with_exclusion(
-        num_items=num_items, positive_set=set(block_items),
-        n=rconst.NUM_EVAL_NEGATIVES, replacement=match_mlperf)
-    test_blocks.append((
-        block_user[0] * np.ones((rconst.NUM_EVAL_NEGATIVES + 1,),
-                                dtype=np.int32),
-        np.array([block_items[-1]] + test_negatives, dtype=np.uint16)
-    ))
     test_positives.append((block_user[0], block_items[-1]))

   train_users = np.concatenate([i[0] for i in train_blocks])
   train_items = np.concatenate([i[1] for i in train_blocks])
+  test_pos_users = np.array([i[0] for i in test_positives],
+                            dtype=train_users.dtype)
+  test_pos_items = np.array([i[1] for i in test_positives],
+                            dtype=train_items.dtype)

   train_shard_fpath = cache_paths.train_shard_template.format(
       str(shard_id).zfill(5))

   with tf.gfile.Open(train_shard_fpath, "wb") as f:
     pickle.dump({
-        movielens.USER_COLUMN: train_users,
-        movielens.ITEM_COLUMN: train_items,
-    }, f)
-
-  test_users = np.concatenate([i[0] for i in test_blocks])
-  test_items = np.concatenate([i[1] for i in test_blocks])
-  assert test_users.shape == test_items.shape
-  assert test_items.shape[0] % (rconst.NUM_EVAL_NEGATIVES + 1) == 0
-
-  return {
-      movielens.USER_COLUMN: test_users,
-      movielens.ITEM_COLUMN: test_items,
-  }
+        rconst.TRAIN_KEY: {
+            movielens.USER_COLUMN: train_users,
+            movielens.ITEM_COLUMN: train_items,
+        },
+        rconst.EVAL_KEY: {
+            movielens.USER_COLUMN: test_pos_users,
+            movielens.ITEM_COLUMN: test_pos_items,
+        }
+    }, f)
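With this change a shard pickle carries both halves of the per-user split under the two new keys. A small sketch of the resulting layout; the literal strings "train"/"eval" and "user_id"/"item_id" stand in for rconst.TRAIN_KEY/rconst.EVAL_KEY and the movielens column constants, and the path is hypothetical:

import pickle
import numpy as np

shard = {
    "train": {  # all but each user's last rating
        "user_id": np.array([0, 0, 0, 1, 1], dtype=np.int32),
        "item_id": np.array([10, 11, 12, 10, 13], dtype=np.uint16),
    },
    "eval": {   # exactly one held-out positive per user in the shard
        "user_id": np.array([0, 1], dtype=np.int32),
        "item_id": np.array([14, 15], dtype=np.uint16),
    },
}

path = "/tmp/positive_shard_00000.pickle"  # hypothetical shard file
with open(path, "wb") as f:
    pickle.dump(shard, f)

with open(path, "rb") as f:
    loaded = pickle.load(f)

assert loaded["train"]["user_id"].shape == loaded["train"]["item_id"].shape
assert loaded["eval"]["user_id"].shape == (2,)  # one eval positive per user

Eval negatives are no longer generated here (and no longer held in memory for the whole run); they are produced later by the async generation process from the "eval" half of each shard.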
@@ -327,38 +310,16 @@ def generate_train_eval_data(df, approx_num_shards, num_items, cache_paths,
                   "negatives per user...".format(rconst.NUM_EVAL_NEGATIVES))
   tf.gfile.MakeDirs(cache_paths.train_shard_subdir)

-  # We choose a different random seed for each process, so that the processes
-  # will not all choose the same random numbers.
-  process_seeds = [np.random.randint(2**32) for _ in range(approx_num_shards)]
-  map_args = [(shards[i], i, num_items, cache_paths, process_seeds[i],
-               match_mlperf)
+  map_args = [(shards[i], i, num_items, cache_paths)
               for i in range(approx_num_shards)]
-  with popen_helper.get_pool(multiprocessing.cpu_count()) as pool:
-    test_shards = pool.map(_train_eval_map_fn, map_args)  # pylint: disable=no-member
-
-  tf.logging.info("Merging test shards...")
-  test_users = np.concatenate([i[movielens.USER_COLUMN] for i in test_shards])
-  test_items = np.concatenate([i[movielens.ITEM_COLUMN] for i in test_shards])
-
-  assert test_users.shape == test_items.shape
-  assert test_items.shape[0] % (rconst.NUM_EVAL_NEGATIVES + 1) == 0
-
-  test_labels = np.zeros(shape=test_users.shape)
-  test_labels[0::(rconst.NUM_EVAL_NEGATIVES + 1)] = 1
-
-  eval_data = ({
-      movielens.USER_COLUMN: test_users,
-      movielens.ITEM_COLUMN: test_items,
-  }, test_labels)
-
-  tf.logging.info("Writing test data to file.")
-  tf.gfile.MakeDirs(cache_paths.eval_data_subdir)
-  with tf.gfile.Open(cache_paths.eval_raw_file, "wb") as f:
-    pickle.dump(eval_data, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+  with popen_helper.get_pool(multiprocessing.cpu_count()) as pool:
+    pool.map(_train_eval_map_fn, map_args)  # pylint: disable=no-member


 def construct_cache(dataset, data_dir, num_data_readers, match_mlperf,
                     deterministic, cache_id=None):
-  # type: (str, str, int, bool, typing.Optional[int]) -> NCFDataset
+  # type: (str, str, int, bool, bool, typing.Optional[int]) -> NCFDataset
   """Load and digest data CSV into a usable form.

   Args:

@@ -419,18 +380,21 @@ def _shutdown(proc):
   """Convenience function to cleanly shut down async generation process."""
   tf.logging.info("Shutting down train data creation subprocess.")
+  try:
     try:
       proc.send_signal(signal.SIGINT)
-      time.sleep(1)
+      time.sleep(5)
       if proc.returncode is not None:
-        return  # SIGINT was handled successfully within 1 sec
+        return  # SIGINT was handled successfully within 5 seconds
     except socket.error:
       pass

-  # Otherwise another second of grace period and then forcibly kill the process.
-  time.sleep(1)
-  proc.terminate()
+    # Otherwise another second of grace period and then force kill the process.
+    time.sleep(1)
+    proc.terminate()
+  except:  # pylint: disable=broad-except
+    tf.logging.error("Data generation subprocess could not be killed.")
@@ -456,18 +420,17 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
       "num_neg": num_neg,
       "num_train_positives": ncf_dataset.num_train_positives,
       "num_items": ncf_dataset.num_items,
+      "num_users": ncf_dataset.num_users,
       "num_readers": ncf_dataset.num_data_readers,
       "epochs_per_cycle": epochs_per_cycle,
       "train_batch_size": batch_size,
       "eval_batch_size": eval_batch_size,
       "num_workers": num_workers,
-      # This allows the training input function to guarantee batch size and
-      # significantly improves performance. (~5% increase in examples/sec on
-      # GPU, and needed for TPU XLA.)
-      "spillover": True,
       "redirect_logs": use_subprocess,
       "use_tf_logging": not use_subprocess,
+      "ml_perf": match_mlperf,
   }
   if ncf_dataset.deterministic:
     flags_["seed"] = stat_utils.random_int32()
   tf.gfile.MakeDirs(data_dir)

@@ -608,12 +571,12 @@ def hash_pipeline(dataset, deterministic):
   tf.logging.info("  [pipeline_hash] All batches hash: {}".format(overall_hash))


-def make_train_input_fn(ncf_dataset):
-  # type: (typing.Optional[NCFDataset]) -> (typing.Callable, str, int)
+def make_input_fn(ncf_dataset, is_training):
+  # type: (typing.Optional[NCFDataset], bool) -> (typing.Callable, str, int)
   """Construct training input_fn for the current epoch."""
   if ncf_dataset is None:
-    return make_train_synthetic_input_fn()
+    return make_synthetic_input_fn(is_training)

   if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
     # The generation subprocess must have been alive at some point, because we

@@ -621,6 +584,7 @@ def make_train_input_fn(ncf_dataset):
     raise ValueError("Generation subprocess unexpectedly died. Data will not "
                      "be available; exiting to avoid waiting forever.")

+  if is_training:
     train_epoch_dir = ncf_dataset.cache_paths.train_epoch_dir
     while not tf.gfile.Exists(train_epoch_dir):
       tf.logging.info("Waiting for {} to exist.".format(train_epoch_dir))

@@ -634,6 +598,10 @@ def make_train_input_fn(ncf_dataset):
     train_data_dirs.sort()  # names are zfilled so that
                             # lexicographic sort == numeric sort
     record_dir = os.path.join(train_epoch_dir, train_data_dirs[0])
+    template = rconst.TRAIN_RECORD_TEMPLATE
+  else:
+    record_dir = ncf_dataset.cache_paths.eval_data_subdir
+    template = rconst.EVAL_RECORD_TEMPLATE

   ready_file = os.path.join(record_dir, rconst.READY_FILE)
   while not tf.gfile.Exists(ready_file):

@@ -643,16 +611,18 @@ def make_train_input_fn(ncf_dataset):
   with tf.gfile.Open(ready_file, "r") as f:
     epoch_metadata = json.load(f)

-  # The data pipeline uses spillover to guarantee static batch sizes. This
-  # means that an extra batch will need to be run every few epochs. TPUs
-  # require that the number of batches to be run is known at the time that
-  # estimator.train() is called, so having the generation pipeline report
-  # number of batches guarantees that this count is correct.
+  # This value is used to check that the batch count from the subprocess
+  # matches the batch count expected by the main thread.
   batch_count = epoch_metadata["batch_count"]

   def input_fn(params):
     """Generated input_fn for the given epoch."""
+    if is_training:
       batch_size = params["batch_size"]
+    else:
+      # Estimator has "eval_batch_size" included in the params, but
+      # TPUEstimator populates "batch_size" to the appropriate value.
+      batch_size = params.get("eval_batch_size") or params["batch_size"]

     if epoch_metadata["batch_size"] != batch_size:
       raise ValueError(
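The two-way lookup above exists because the Estimator flavors expose the eval batch size differently. A tiny sketch of the resolution logic in isolation, with hypothetical params dicts:

def resolve_batch_size(params, is_training):
    # Training always reads "batch_size".
    if is_training:
        return params["batch_size"]
    # TPUEstimator rewrites "batch_size" to the correct per-call value, while
    # plain Estimator forwards the user-supplied "eval_batch_size".
    return params.get("eval_batch_size") or params["batch_size"]

# Plain Estimator during eval: "eval_batch_size" is present and wins.
assert resolve_batch_size(
    {"batch_size": 1024, "eval_batch_size": 100000}, is_training=False) == 100000
# TPUEstimator during eval: only "batch_size" is populated.
assert resolve_batch_size({"batch_size": 12500}, is_training=False) == 12500
# Training path.
assert resolve_batch_size(
    {"batch_size": 1024, "eval_batch_size": 100000}, is_training=True) == 1024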
@@ -662,8 +632,7 @@ def make_train_input_fn(ncf_dataset):
             .format(epoch_metadata["batch_size"], batch_size))

     record_files = tf.data.Dataset.list_files(
-        os.path.join(record_dir, rconst.TRAIN_RECORD_TEMPLATE.format("*")),
-        shuffle=False)
+        os.path.join(record_dir, template.format("*")), shuffle=False)

     interleave = tf.contrib.data.parallel_interleave(
         tf.data.TFRecordDataset,

@@ -673,7 +642,7 @@ def make_train_input_fn(ncf_dataset):
         prefetch_input_elements=4,
     )

-    deserialize = make_deserialize(params, batch_size, True)
+    deserialize = make_deserialize(params, batch_size, is_training)
     dataset = record_files.apply(interleave)
     dataset = dataset.map(deserialize, num_parallel_calls=4)
     dataset = dataset.prefetch(32)

@@ -686,11 +655,12 @@ def make_train_input_fn(ncf_dataset):
   return input_fn, record_dir, batch_count


-def make_train_synthetic_input_fn():
+def make_synthetic_input_fn(is_training):
   """Construct training input_fn that uses synthetic data."""
   def input_fn(params):
     """Generated input_fn for the given epoch."""
-    batch_size = params["batch_size"]
+    batch_size = (params["batch_size"] if is_training else
+                  params["eval_batch_size"] or params["batch_size"])
     num_users = params["num_users"]
     num_items = params["num_items"]

@@ -698,78 +668,26 @@ def make_train_synthetic_input_fn():
                               maxval=num_users)
     items = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
                               maxval=num_items)
+    if is_training:
       labels = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
                                  maxval=2)
       data = {
           movielens.USER_COLUMN: users,
           movielens.ITEM_COLUMN: items,
       }, labels
-    dataset = tf.data.Dataset.from_tensors(data).repeat(
-        _SYNTHETIC_BATCHES_PER_EPOCH)
-    dataset = dataset.prefetch(32)
-    return dataset
-
-  return input_fn, None, _SYNTHETIC_BATCHES_PER_EPOCH
-
-
-def make_pred_input_fn(ncf_dataset):
-  # type: (typing.Optional[NCFDataset]) -> typing.Callable
-  """Construct input_fn for metric evaluation."""
-  if ncf_dataset is None:
-    return make_synthetic_pred_input_fn()
-
-  def input_fn(params):
-    """Input function based on eval batch size."""
-    # Estimator has "eval_batch_size" included in the params, but TPUEstimator
-    # populates "batch_size" to the appropriate value.
-    batch_size = params.get("eval_batch_size") or params["batch_size"]
-    record_file = ncf_dataset.cache_paths.eval_record_template.format(
-        batch_size)
-    while not tf.gfile.Exists(record_file):
-      tf.logging.info(
-          "Waiting for eval data to be written to {}".format(record_file))
-      time.sleep(1)
-
-    dataset = tf.data.TFRecordDataset(record_file)
-    deserialize = make_deserialize(params, batch_size, False)
-    dataset = dataset.map(deserialize, num_parallel_calls=4)
-    dataset = dataset.prefetch(16)
-
-    if params.get("hash_pipeline"):
-      hash_pipeline(dataset, ncf_dataset.deterministic)
-
-    return dataset
-
-  return input_fn
-
-
-def make_synthetic_pred_input_fn():
-  """Construct input_fn for metric evaluation that uses synthetic data."""
-  def input_fn(params):
-    """Generated input_fn for the given epoch."""
-    batch_size = params["eval_batch_size"]
-    num_users = params["num_users"]
-    num_items = params["num_items"]
-
-    users = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
-                              maxval=num_users)
-    items = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
-                              maxval=num_items)
+    else:
       dupe_mask = tf.cast(tf.random_uniform([batch_size], dtype=tf.int32,
                                             minval=0, maxval=2), tf.bool)
       data = {
           movielens.USER_COLUMN: users,
           movielens.ITEM_COLUMN: items,
           rconst.DUPLICATE_MASK: dupe_mask,
       }

     dataset = tf.data.Dataset.from_tensors(data).repeat(
-        _SYNTHETIC_BATCHES_PER_EPOCH)
-    dataset = dataset.prefetch(16)
+        SYNTHETIC_BATCHES_PER_EPOCH)
+    dataset = dataset.prefetch(32)
     return dataset

-  return input_fn
+  return input_fn, None, SYNTHETIC_BATCHES_PER_EPOCH
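The merged function keeps a single data contract for both modes: training batches are (features, labels) tuples, eval batches are a features dict carrying a duplicate mask and no labels. A TF-free sketch of the two shapes; the literal "user_id"/"item_id"/"duplicate_mask" strings stand in for the movielens/rconst constants:

import numpy as np

def synthetic_batch(batch_size, num_users, num_items, is_training):
    users = np.random.randint(0, num_users, size=batch_size, dtype=np.int32)
    items = np.random.randint(0, num_items, size=batch_size, dtype=np.int32)
    if is_training:
        labels = np.random.randint(0, 2, size=batch_size, dtype=np.int32)
        return {"user_id": users, "item_id": items}, labels
    dupe_mask = np.random.randint(0, 2, size=batch_size).astype(bool)
    return {"user_id": users, "item_id": items, "duplicate_mask": dupe_mask}

features, labels = synthetic_batch(8, num_users=100, num_items=50, is_training=True)
eval_features = synthetic_batch(8, num_users=100, num_items=50, is_training=False)
assert labels.shape == (8,) and "duplicate_mask" in eval_features

Returning the same (input_fn, record_dir, batch_count) triple from both the real and synthetic paths is what lets ncf_main.py treat them interchangeably.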
@@ -28,7 +28,9 @@ import tensorflow as tf

 from official.datasets import movielens
 from official.recommendation import constants as rconst
+from official.recommendation import data_async_generation
 from official.recommendation import data_preprocessing
+from official.recommendation import stat_utils


 DATASET = "ml-test"

@@ -121,7 +123,7 @@ class BaseTest(tf.test.TestCase):
     g = tf.Graph()
     with g.as_default():
       input_fn, record_dir, batch_count = \
-        data_preprocessing.make_train_input_fn(ncf_dataset)
+        data_preprocessing.make_input_fn(ncf_dataset, True)
       dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False})
     first_epoch = self.drain_dataset(dataset=dataset, g=g)

     user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}

@@ -134,6 +136,7 @@ class BaseTest(tf.test.TestCase):
     for features, labels in first_epoch:
       for u, i, l in zip(features[movielens.USER_COLUMN],
                          features[movielens.ITEM_COLUMN], labels):
         u_raw = user_inv_map[u]
         i_raw = item_inv_map[i]
         if ((u_raw, i_raw) in self.seen_pairs) != l:

@@ -145,9 +148,7 @@ class BaseTest(tf.test.TestCase):
         train_examples[l].add((u_raw, i_raw))
     num_positives_seen = len(train_examples[True])

-    # The numbers don't match exactly because the last batch spills over into
-    # the next epoch
-    assert ncf_dataset.num_train_positives - num_positives_seen < BATCH_SIZE
+    assert ncf_dataset.num_train_positives == num_positives_seen

     # This check is more heuristic because negatives are sampled with
     # replacement. It only checks that negative generation is reasonably random.

@@ -162,20 +163,42 @@ class BaseTest(tf.test.TestCase):
                        movielens.TIMESTAMP_COLUMN: times})
     cache_paths = rconst.Paths(data_dir=self.temp_data_dir)
     np.random.seed(1)
-    data_preprocessing.generate_train_eval_data(df, approx_num_shards=2,
-                                                num_items=10,
-                                                cache_paths=cache_paths,
-                                                match_mlperf=True)
-    with tf.gfile.Open(cache_paths.eval_raw_file, "rb") as f:
-      eval_data = pickle.load(f)
+
+    num_shards = 2
+    num_items = 10
+    data_preprocessing.generate_train_eval_data(
+        df, approx_num_shards=num_shards, num_items=num_items,
+        cache_paths=cache_paths, match_mlperf=True)
+
+    raw_shards = tf.gfile.ListDirectory(cache_paths.train_shard_subdir)
+    assert len(raw_shards) == num_shards
+
+    sharded_eval_data = []
+    for i in range(2):
+      sharded_eval_data.append(data_async_generation._process_shard(
+          (os.path.join(cache_paths.train_shard_subdir, raw_shards[i]),
+           num_items, rconst.NUM_EVAL_NEGATIVES, stat_utils.random_int32(),
+           False, True)))
+
+    if sharded_eval_data[0][0][0] == 1:
+      # Order is not assured for this part of the pipeline.
+      sharded_eval_data.reverse()
+
+    eval_data = [np.concatenate([shard[i] for shard in sharded_eval_data])
+                 for i in range(3)]
+    eval_data = {
+        movielens.USER_COLUMN: eval_data[0],
+        movielens.ITEM_COLUMN: eval_data[1],
+    }

     eval_items_per_user = rconst.NUM_EVAL_NEGATIVES + 1
-    self.assertAllClose(eval_data[0][movielens.USER_COLUMN],
+    self.assertAllClose(eval_data[movielens.USER_COLUMN],
                         [0] * eval_items_per_user + [1] * eval_items_per_user)

     # Each shard process should generate different random items.
     self.assertNotAllClose(
-        eval_data[0][movielens.ITEM_COLUMN][:eval_items_per_user],
-        eval_data[0][movielens.ITEM_COLUMN][eval_items_per_user:])
+        eval_data[movielens.ITEM_COLUMN][:eval_items_per_user],
+        eval_data[movielens.ITEM_COLUMN][eval_items_per_user:])


 if __name__ == "__main__":
......
@@ -142,7 +142,8 @@ def run_ncf(_):
     cleanup_fn = lambda: None
     num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
         FLAGS.dataset]
-    approx_train_steps = None
+    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
+    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
   else:
     ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
         dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,

@@ -156,8 +157,11 @@ def run_ncf(_):
         cache_id=FLAGS.cache_id)
     num_users = ncf_dataset.num_users
     num_items = ncf_dataset.num_items
-    approx_train_steps = int(ncf_dataset.num_train_positives
-                             * (1 + FLAGS.num_neg) // FLAGS.batch_size)
+    num_train_steps = int(np.ceil(
+        FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
+        (1 + FLAGS.num_neg) / FLAGS.batch_size))
+    num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) *
+                                 ncf_dataset.num_users / eval_batch_size))

   model_helpers.apply_clean(flags.FLAGS)
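Worked example of the new ceil-based step counts, with illustrative ML-1M-scale numbers (none of these values appear in the diff):

import numpy as np

num_train_positives = 994169  # e.g. ML-1M ratings minus one holdout per user
num_users = 6040
num_neg = 4                   # FLAGS.num_neg
batch_size = 16384            # FLAGS.batch_size
eval_batch_size = 100000
epochs_between_evals = 1
NUM_EVAL_NEGATIVES = 999      # rconst.NUM_EVAL_NEGATIVES

num_train_steps = int(np.ceil(
    epochs_between_evals * num_train_positives * (1 + num_neg) / batch_size))
num_eval_steps = int(np.ceil(
    (1 + NUM_EVAL_NEGATIVES) * num_users / eval_batch_size))

print(num_train_steps)  # ceil(4970845 / 16384) = 304
print(num_eval_steps)   # ceil(6040000 / 100000) = 61

Because the counts now round up and the producer pads the final batch instead of spilling examples into the next epoch, the exact equality check in the training loop below can replace the old "within one batch" warning.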
@@ -206,8 +210,8 @@ def run_ncf(_):
       run_params=run_params,
       test_id=FLAGS.benchmark_test_id)

-  pred_input_fn = data_preprocessing.make_pred_input_fn(ncf_dataset=ncf_dataset)
+  pred_input_fn = None
   total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
   for cycle_index in range(total_training_cycle):
     tf.logging.info("Starting a training cycle: {}/{}".format(

@@ -215,20 +219,31 @@ def run_ncf(_):
     # Train the model
     train_input_fn, train_record_dir, batch_count = \
-      data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)
+      data_preprocessing.make_input_fn(
+          ncf_dataset=ncf_dataset, is_training=True)

-    if approx_train_steps and np.abs(approx_train_steps - batch_count) > 1:
-      tf.logging.warning(
-          "Estimated ({}) and reported ({}) number of batches differ by more "
-          "than one".format(approx_train_steps, batch_count))
+    if batch_count != num_train_steps:
+      raise ValueError(
+          "Step counts do not match. ({} vs. {}) The async process is "
+          "producing incorrect shards.".format(batch_count, num_train_steps))

     train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
-                          steps=batch_count)
+                          steps=num_train_steps)
     if train_record_dir:
       tf.gfile.DeleteRecursively(train_record_dir)

     tf.logging.info("Beginning evaluation.")
-    eval_results = eval_estimator.evaluate(pred_input_fn)
+    if pred_input_fn is None:
+      pred_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
+          ncf_dataset=ncf_dataset, is_training=False)
+
+      if eval_batch_count != num_eval_steps:
+        raise ValueError(
+            "Step counts do not match. ({} vs. {}) The async process is "
+            "producing incorrect shards.".format(
+                eval_batch_count, num_eval_steps))
+
+    eval_results = eval_estimator.evaluate(pred_input_fn, steps=num_eval_steps)
     tf.logging.info("Evaluation complete.")

     # Benchmark the evaluation results
......
@@ -48,7 +48,7 @@ do
  # And to confirm that the pipeline is deterministic pass the flag:
  #   --hash_pipeline
  #
- # (`--hash_pipeline` will slow down training)
+ # (`--hash_pipeline` will slow down training, though not as much as one might imagine.)
  python ncf_main.py --model_dir ${MODEL_DIR} \
                     --data_dir ${DATA_DIR} \
                     --dataset ${DATASET} --hooks "" \
......