Unverified commit 19d4eaaf authored by Taylor Robie, committed by GitHub

Reorder NCF data pipeline (#5536)

* intermediate commit

finish replacing spillover with resampled padding

intermediate commit

* resolve merge conflict

* intermediate commit

* further consolidate the data pipeline

* complete first pass at data pipeline refactor

* remove some leftover code

* fix test

* remove resampling, and move train padding logic into neumf.py

* small tweaks

* fix weight bug

* address PR comments

* fix dict zip. (Reed led me astray)

* delint

* make data test deterministic and delint

* Reed didn't lead me astray. I just can't read.

* more delinting

* even more delinting

* use resampling for last batch padding

* pad last batch with unique data

* Revert "pad last batch with unique data"

This reverts commit cbdf46efcd5c7907038a24105b88d38e7f1d6da2.

* move padded batch to the beginning

* delint

* fix step check for synthetic data
parent 413f15ba
@@ -35,16 +35,16 @@ class Paths(object):
                                                  "positive_shard_{}.pickle")
     self.train_epoch_dir = os.path.join(self.cache_root, "training_epochs")
     self.eval_data_subdir = os.path.join(self.cache_root, "eval_data")
-    self.eval_raw_file = os.path.join(self.eval_data_subdir, "raw.pickle")
-    self.eval_record_template_temp = os.path.join(self.eval_data_subdir,
-                                                  "eval_records.temp")
-    self.eval_record_template = os.path.join(
-        self.eval_data_subdir, "padded_eval_batch_size_{}.tfrecords")
     self.subproc_alive = os.path.join(self.cache_root, "subproc.alive")


 APPROX_PTS_PER_TRAIN_SHARD = 128000

+# Keys for data shards
+TRAIN_KEY = "train"
+EVAL_KEY = "eval"
+
 # In both datasets, each user has at least 20 ratings.
 MIN_NUM_RATINGS = 20

@@ -68,7 +68,9 @@ FLAGFILE_TEMP = "flagfile.temp"
 FLAGFILE = "flagfile"
 READY_FILE_TEMP = "ready.json.temp"
 READY_FILE = "ready.json"
+
 TRAIN_RECORD_TEMPLATE = "train_{}.tfrecords"
+EVAL_RECORD_TEMPLATE = "eval_{}.tfrecords"

 TIMEOUT_SECONDS = 3600 * 2  # If the train loop goes more than two hours without
                             # consuming an epoch of data, this is a good
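The two record templates above are plain format strings. A minimal sketch of how the writer and reader sides use them (the cache path is hypothetical; the zfill and wildcard conventions are taken from make_input_fn further down):

import os

TRAIN_RECORD_TEMPLATE = "train_{}.tfrecords"  # as defined above
EVAL_RECORD_TEMPLATE = "eval_{}.tfrecords"    # as defined above

record_dir = "/tmp/ncf_cache/training_epochs/00000"  # hypothetical cache dir

# Writers zero-pad the shard index so lexicographic order == numeric order.
shard_file = os.path.join(record_dir, TRAIN_RECORD_TEMPLATE.format(str(7).zfill(5)))
print(shard_file)    # /tmp/ncf_cache/training_epochs/00000/train_00007.tfrecords

# Readers glob every shard at once, as make_input_fn does below.
glob_pattern = os.path.join(record_dir, TRAIN_RECORD_TEMPLATE.format("*"))
print(glob_pattern)  # /tmp/ncf_cache/training_epochs/00000/train_*.tfrecords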
......
@@ -57,7 +57,7 @@ DATASET_TO_NUM_USERS_AND_ITEMS = {
 # Number of batches to run per epoch when using synthetic data. At high batch
 # sizes, we run for more batches than with real data, which is good since
 # running more batches reduces noise when measuring the average batches/second.
-_SYNTHETIC_BATCHES_PER_EPOCH = 2000
+SYNTHETIC_BATCHES_PER_EPOCH = 2000


 class NCFDataset(object):

@@ -65,7 +65,7 @@ class NCFDataset(object):
   def __init__(self, user_map, item_map, num_data_readers, cache_paths,
                num_train_positives, deterministic=False):
-    # type: (dict, dict, int, rconst.Paths) -> None
+    # type: (dict, dict, int, rconst.Paths, int, bool) -> None
     """Assign key values for recommendation dataset.

     Args:

@@ -175,7 +175,6 @@ def _filter_index_sort(raw_rating_path, match_mlperf):
 def _train_eval_map_fn(args):
-  # type: (...) -> typing.Dict(np.ndarray)
   """Split training and testing data and generate testing negatives.

   This function is called as part of a multiprocessing map. The principle

@@ -186,9 +185,8 @@ def _train_eval_map_fn(args):
   For each user, all but the last item is written into a pickle file which the
   training data producer can consume on as needed. The last item for a user
-  is a validation point; for each validation point a number of negatives are
-  generated (typically 999). The validation data is returned by this function,
-  as it is held in memory for the remainder of the run.
+  is a validation point; it is written under a separate key and will be used
+  later to generate the evaluation data.

   Args:
     shard: A dict containing the user and item arrays.

@@ -198,16 +196,10 @@ def _train_eval_map_fn(args):
       which validation negatives should be drawn.
     cache_paths: rconst.Paths object containing locations for various cache
       files.
-    seed: Random seed to be used when generating testing negatives.
-    match_mlperf: If True, sample eval negatives with replacement, which the
-      MLPerf reference implementation does.
-
-  Returns:
-    A dict containing the evaluation data for a given shard.
   """
-  shard, shard_id, num_items, cache_paths, seed, match_mlperf = args
-  np.random.seed(seed)
+  shard, shard_id, num_items, cache_paths = args

   users = shard[movielens.USER_COLUMN]
   items = shard[movielens.ITEM_COLUMN]

@@ -218,7 +210,6 @@ def _train_eval_map_fn(args):
                                [users.shape[0]])

   train_blocks = []
-  test_blocks = []
   test_positives = []
   for i in range(len(boundaries) - 1):
     # This is simply a vector of repeated values such that the shard could be

@@ -233,38 +224,30 @@ def _train_eval_map_fn(args):
     block_items = items[boundaries[i]:boundaries[i+1]]
     train_blocks.append((block_user[:-1], block_items[:-1]))

-    test_negatives = stat_utils.sample_with_exclusion(
-        num_items=num_items, positive_set=set(block_items),
-        n=rconst.NUM_EVAL_NEGATIVES, replacement=match_mlperf)
-    test_blocks.append((
-        block_user[0] * np.ones((rconst.NUM_EVAL_NEGATIVES + 1,),
-                                dtype=np.int32),
-        np.array([block_items[-1]] + test_negatives, dtype=np.uint16)
-    ))
     test_positives.append((block_user[0], block_items[-1]))

   train_users = np.concatenate([i[0] for i in train_blocks])
   train_items = np.concatenate([i[1] for i in train_blocks])
+  test_pos_users = np.array([i[0] for i in test_positives],
+                            dtype=train_users.dtype)
+  test_pos_items = np.array([i[1] for i in test_positives],
+                            dtype=train_items.dtype)

   train_shard_fpath = cache_paths.train_shard_template.format(
       str(shard_id).zfill(5))

   with tf.gfile.Open(train_shard_fpath, "wb") as f:
     pickle.dump({
-        movielens.USER_COLUMN: train_users,
-        movielens.ITEM_COLUMN: train_items,
-    }, f)
-
-  test_users = np.concatenate([i[0] for i in test_blocks])
-  test_items = np.concatenate([i[1] for i in test_blocks])
-  assert test_users.shape == test_items.shape
-  assert test_items.shape[0] % (rconst.NUM_EVAL_NEGATIVES + 1) == 0
-
-  return {
-      movielens.USER_COLUMN: test_users,
-      movielens.ITEM_COLUMN: test_items,
-  }
+        rconst.TRAIN_KEY: {
+            movielens.USER_COLUMN: train_users,
+            movielens.ITEM_COLUMN: train_items,
+        },
+        rconst.EVAL_KEY: {
+            movielens.USER_COLUMN: test_pos_users,
+            movielens.ITEM_COLUMN: test_pos_items,
+        }
+    }, f)
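With this change a shard pickle carries both halves of the per-user split under the two new keys. A small sketch of the resulting layout; the literal strings "train"/"eval" and "user_id"/"item_id" stand in for rconst.TRAIN_KEY/rconst.EVAL_KEY and the movielens column constants, and the path is hypothetical:

import pickle
import numpy as np

shard = {
    "train": {  # all but each user's last rating
        "user_id": np.array([0, 0, 0, 1, 1], dtype=np.int32),
        "item_id": np.array([10, 11, 12, 10, 13], dtype=np.uint16),
    },
    "eval": {   # exactly one held-out positive per user in the shard
        "user_id": np.array([0, 1], dtype=np.int32),
        "item_id": np.array([14, 15], dtype=np.uint16),
    },
}

path = "/tmp/positive_shard_00000.pickle"  # hypothetical shard file
with open(path, "wb") as f:
    pickle.dump(shard, f)

with open(path, "rb") as f:
    loaded = pickle.load(f)

assert loaded["train"]["user_id"].shape == loaded["train"]["item_id"].shape
assert loaded["eval"]["user_id"].shape == (2,)  # one eval positive per user

Eval negatives are no longer generated here (and no longer held in memory for the whole run); they are produced later by the async generation process from the "eval" half of each shard.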
@@ -327,38 +310,16 @@ def generate_train_eval_data(df, approx_num_shards, num_items, cache_paths,
                   "negatives per user...".format(rconst.NUM_EVAL_NEGATIVES))
   tf.gfile.MakeDirs(cache_paths.train_shard_subdir)

-  # We choose a different random seed for each process, so that the processes
-  # will not all choose the same random numbers.
-  process_seeds = [np.random.randint(2**32) for _ in range(approx_num_shards)]
-  map_args = [(shards[i], i, num_items, cache_paths, process_seeds[i],
-               match_mlperf)
+  map_args = [(shards[i], i, num_items, cache_paths)
               for i in range(approx_num_shards)]
-  with popen_helper.get_pool(multiprocessing.cpu_count()) as pool:
-    test_shards = pool.map(_train_eval_map_fn, map_args)  # pylint: disable=no-member
-
-  tf.logging.info("Merging test shards...")
-  test_users = np.concatenate([i[movielens.USER_COLUMN] for i in test_shards])
-  test_items = np.concatenate([i[movielens.ITEM_COLUMN] for i in test_shards])
-
-  assert test_users.shape == test_items.shape
-  assert test_items.shape[0] % (rconst.NUM_EVAL_NEGATIVES + 1) == 0
-
-  test_labels = np.zeros(shape=test_users.shape)
-  test_labels[0::(rconst.NUM_EVAL_NEGATIVES + 1)] = 1
-
-  eval_data = ({
-      movielens.USER_COLUMN: test_users,
-      movielens.ITEM_COLUMN: test_items,
-  }, test_labels)
-
-  tf.logging.info("Writing test data to file.")
-  tf.gfile.MakeDirs(cache_paths.eval_data_subdir)
-  with tf.gfile.Open(cache_paths.eval_raw_file, "wb") as f:
-    pickle.dump(eval_data, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+  with popen_helper.get_pool(multiprocessing.cpu_count()) as pool:
+    pool.map(_train_eval_map_fn, map_args)  # pylint: disable=no-member


 def construct_cache(dataset, data_dir, num_data_readers, match_mlperf,
                     deterministic, cache_id=None):
-  # type: (str, str, int, bool, typing.Optional[int]) -> NCFDataset
+  # type: (str, str, int, bool, bool, typing.Optional[int]) -> NCFDataset
   """Load and digest data CSV into a usable form.

   Args:

@@ -419,18 +380,21 @@ def _shutdown(proc):
   """Convenience function to cleanly shut down async generation process."""
   tf.logging.info("Shutting down train data creation subprocess.")
+  try:
     try:
       proc.send_signal(signal.SIGINT)
-      time.sleep(1)
+      time.sleep(5)
       if proc.returncode is not None:
-        return  # SIGINT was handled successfully within 1 sec
+        return  # SIGINT was handled successfully within 5 seconds
     except socket.error:
       pass

-  # Otherwise another second of grace period and then forcibly kill the process.
-  time.sleep(1)
-  proc.terminate()
+    # Otherwise another second of grace period and then force kill the process.
+    time.sleep(1)
+    proc.terminate()
+  except:  # pylint: disable=broad-except
+    tf.logging.error("Data generation subprocess could not be killed.")
@@ -456,18 +420,17 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
       "num_neg": num_neg,
       "num_train_positives": ncf_dataset.num_train_positives,
       "num_items": ncf_dataset.num_items,
+      "num_users": ncf_dataset.num_users,
       "num_readers": ncf_dataset.num_data_readers,
       "epochs_per_cycle": epochs_per_cycle,
       "train_batch_size": batch_size,
       "eval_batch_size": eval_batch_size,
       "num_workers": num_workers,
-      # This allows the training input function to guarantee batch size and
-      # significantly improves performance. (~5% increase in examples/sec on
-      # GPU, and needed for TPU XLA.)
-      "spillover": True,
       "redirect_logs": use_subprocess,
       "use_tf_logging": not use_subprocess,
+      "ml_perf": match_mlperf,
   }
   if ncf_dataset.deterministic:
     flags_["seed"] = stat_utils.random_int32()
   tf.gfile.MakeDirs(data_dir)

@@ -608,12 +571,12 @@ def hash_pipeline(dataset, deterministic):
   tf.logging.info("  [pipeline_hash] All batches hash: {}".format(overall_hash))


-def make_train_input_fn(ncf_dataset):
-  # type: (typing.Optional[NCFDataset]) -> (typing.Callable, str, int)
+def make_input_fn(ncf_dataset, is_training):
+  # type: (typing.Optional[NCFDataset], bool) -> (typing.Callable, str, int)
   """Construct training input_fn for the current epoch."""
   if ncf_dataset is None:
-    return make_train_synthetic_input_fn()
+    return make_synthetic_input_fn(is_training)

   if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
     # The generation subprocess must have been alive at some point, because we

@@ -621,6 +584,7 @@ def make_train_input_fn(ncf_dataset):
     raise ValueError("Generation subprocess unexpectedly died. Data will not "
                      "be available; exiting to avoid waiting forever.")

+  if is_training:
     train_epoch_dir = ncf_dataset.cache_paths.train_epoch_dir
     while not tf.gfile.Exists(train_epoch_dir):
       tf.logging.info("Waiting for {} to exist.".format(train_epoch_dir))

@@ -634,6 +598,10 @@ def make_train_input_fn(ncf_dataset):
     train_data_dirs.sort()  # names are zfilled so that
                             # lexicographic sort == numeric sort
     record_dir = os.path.join(train_epoch_dir, train_data_dirs[0])
+    template = rconst.TRAIN_RECORD_TEMPLATE
+  else:
+    record_dir = ncf_dataset.cache_paths.eval_data_subdir
+    template = rconst.EVAL_RECORD_TEMPLATE

   ready_file = os.path.join(record_dir, rconst.READY_FILE)
   while not tf.gfile.Exists(ready_file):

@@ -643,16 +611,18 @@ def make_train_input_fn(ncf_dataset):
   with tf.gfile.Open(ready_file, "r") as f:
     epoch_metadata = json.load(f)

-  # The data pipeline uses spillover to guarantee static batch sizes. This
-  # means that an extra batch will need to be run every few epochs. TPUs
-  # require that the number of batches to be run is known at the time that
-  # estimator.train() is called, so having the generation pipeline report
-  # number of batches guarantees that this count is correct.
+  # This value is used to check that the batch count from the subprocess
+  # matches the batch count expected by the main thread.
   batch_count = epoch_metadata["batch_count"]

   def input_fn(params):
     """Generated input_fn for the given epoch."""
+    if is_training:
       batch_size = params["batch_size"]
+    else:
+      # Estimator has "eval_batch_size" included in the params, but
+      # TPUEstimator populates "batch_size" to the appropriate value.
+      batch_size = params.get("eval_batch_size") or params["batch_size"]

     if epoch_metadata["batch_size"] != batch_size:
       raise ValueError(
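The two-way lookup above exists because the Estimator flavors expose the eval batch size differently. A tiny sketch of the resolution logic in isolation, with hypothetical params dicts:

def resolve_batch_size(params, is_training):
    # Training always reads "batch_size".
    if is_training:
        return params["batch_size"]
    # TPUEstimator rewrites "batch_size" to the correct per-call value, while
    # plain Estimator forwards the user-supplied "eval_batch_size".
    return params.get("eval_batch_size") or params["batch_size"]

# Plain Estimator during eval: "eval_batch_size" is present and wins.
assert resolve_batch_size(
    {"batch_size": 1024, "eval_batch_size": 100000}, is_training=False) == 100000
# TPUEstimator during eval: only "batch_size" is populated.
assert resolve_batch_size({"batch_size": 12500}, is_training=False) == 12500
# Training path.
assert resolve_batch_size(
    {"batch_size": 1024, "eval_batch_size": 100000}, is_training=True) == 1024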
@@ -662,8 +632,7 @@ def make_train_input_fn(ncf_dataset):
             .format(epoch_metadata["batch_size"], batch_size))

     record_files = tf.data.Dataset.list_files(
-        os.path.join(record_dir, rconst.TRAIN_RECORD_TEMPLATE.format("*")),
-        shuffle=False)
+        os.path.join(record_dir, template.format("*")), shuffle=False)

     interleave = tf.contrib.data.parallel_interleave(
         tf.data.TFRecordDataset,

@@ -673,7 +642,7 @@ def make_train_input_fn(ncf_dataset):
         prefetch_input_elements=4,
     )

-    deserialize = make_deserialize(params, batch_size, True)
+    deserialize = make_deserialize(params, batch_size, is_training)
     dataset = record_files.apply(interleave)
     dataset = dataset.map(deserialize, num_parallel_calls=4)
     dataset = dataset.prefetch(32)

@@ -686,11 +655,12 @@ def make_train_input_fn(ncf_dataset):
   return input_fn, record_dir, batch_count


-def make_train_synthetic_input_fn():
+def make_synthetic_input_fn(is_training):
   """Construct training input_fn that uses synthetic data."""
   def input_fn(params):
     """Generated input_fn for the given epoch."""
-    batch_size = params["batch_size"]
+    batch_size = (params["batch_size"] if is_training else
+                  params["eval_batch_size"] or params["batch_size"])
     num_users = params["num_users"]
     num_items = params["num_items"]

@@ -698,78 +668,26 @@ def make_train_synthetic_input_fn():
                               maxval=num_users)
     items = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
                               maxval=num_items)
+    if is_training:
       labels = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
                                  maxval=2)
       data = {
           movielens.USER_COLUMN: users,
           movielens.ITEM_COLUMN: items,
       }, labels
-    dataset = tf.data.Dataset.from_tensors(data).repeat(
-        _SYNTHETIC_BATCHES_PER_EPOCH)
-    dataset = dataset.prefetch(32)
-    return dataset
-
-  return input_fn, None, _SYNTHETIC_BATCHES_PER_EPOCH
-
-
-def make_pred_input_fn(ncf_dataset):
-  # type: (typing.Optional[NCFDataset]) -> typing.Callable
-  """Construct input_fn for metric evaluation."""
-  if ncf_dataset is None:
-    return make_synthetic_pred_input_fn()
-
-  def input_fn(params):
-    """Input function based on eval batch size."""
-    # Estimator has "eval_batch_size" included in the params, but TPUEstimator
-    # populates "batch_size" to the appropriate value.
-    batch_size = params.get("eval_batch_size") or params["batch_size"]
-    record_file = ncf_dataset.cache_paths.eval_record_template.format(
-        batch_size)
-    while not tf.gfile.Exists(record_file):
-      tf.logging.info(
-          "Waiting for eval data to be written to {}".format(record_file))
-      time.sleep(1)
-
-    dataset = tf.data.TFRecordDataset(record_file)
-    deserialize = make_deserialize(params, batch_size, False)
-    dataset = dataset.map(deserialize, num_parallel_calls=4)
-    dataset = dataset.prefetch(16)
-
-    if params.get("hash_pipeline"):
-      hash_pipeline(dataset, ncf_dataset.deterministic)
-
-    return dataset
-
-  return input_fn
-
-
-def make_synthetic_pred_input_fn():
-  """Construct input_fn for metric evaluation that uses synthetic data."""
-  def input_fn(params):
-    """Generated input_fn for the given epoch."""
-    batch_size = params["eval_batch_size"]
-    num_users = params["num_users"]
-    num_items = params["num_items"]
-
-    users = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
-                              maxval=num_users)
-    items = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
-                              maxval=num_items)
+    else:
       dupe_mask = tf.cast(tf.random_uniform([batch_size], dtype=tf.int32,
                                             minval=0, maxval=2), tf.bool)
       data = {
           movielens.USER_COLUMN: users,
           movielens.ITEM_COLUMN: items,
           rconst.DUPLICATE_MASK: dupe_mask,
       }

     dataset = tf.data.Dataset.from_tensors(data).repeat(
-        _SYNTHETIC_BATCHES_PER_EPOCH)
-    dataset = dataset.prefetch(16)
+        SYNTHETIC_BATCHES_PER_EPOCH)
+    dataset = dataset.prefetch(32)
     return dataset

-  return input_fn
+  return input_fn, None, SYNTHETIC_BATCHES_PER_EPOCH
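The merged function keeps a single data contract for both modes: training batches are (features, labels) tuples, eval batches are a features dict carrying a duplicate mask and no labels. A TF-free sketch of the two shapes; the literal "user_id"/"item_id"/"duplicate_mask" strings stand in for the movielens/rconst constants:

import numpy as np

def synthetic_batch(batch_size, num_users, num_items, is_training):
    users = np.random.randint(0, num_users, size=batch_size, dtype=np.int32)
    items = np.random.randint(0, num_items, size=batch_size, dtype=np.int32)
    if is_training:
        labels = np.random.randint(0, 2, size=batch_size, dtype=np.int32)
        return {"user_id": users, "item_id": items}, labels
    dupe_mask = np.random.randint(0, 2, size=batch_size).astype(bool)
    return {"user_id": users, "item_id": items, "duplicate_mask": dupe_mask}

features, labels = synthetic_batch(8, num_users=100, num_items=50, is_training=True)
eval_features = synthetic_batch(8, num_users=100, num_items=50, is_training=False)
assert labels.shape == (8,) and "duplicate_mask" in eval_features

Returning the same (input_fn, record_dir, batch_count) triple from both the real and synthetic paths is what lets ncf_main.py treat them interchangeably.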
@@ -28,7 +28,9 @@ import tensorflow as tf

 from official.datasets import movielens
 from official.recommendation import constants as rconst
+from official.recommendation import data_async_generation
 from official.recommendation import data_preprocessing
+from official.recommendation import stat_utils


 DATASET = "ml-test"

@@ -121,7 +123,7 @@ class BaseTest(tf.test.TestCase):
     g = tf.Graph()
     with g.as_default():
       input_fn, record_dir, batch_count = \
-        data_preprocessing.make_train_input_fn(ncf_dataset)
+        data_preprocessing.make_input_fn(ncf_dataset, True)
       dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False})
     first_epoch = self.drain_dataset(dataset=dataset, g=g)

     user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}

@@ -134,6 +136,7 @@ class BaseTest(tf.test.TestCase):
     for features, labels in first_epoch:
       for u, i, l in zip(features[movielens.USER_COLUMN],
                          features[movielens.ITEM_COLUMN], labels):
         u_raw = user_inv_map[u]
         i_raw = item_inv_map[i]
         if ((u_raw, i_raw) in self.seen_pairs) != l:

@@ -145,9 +148,7 @@ class BaseTest(tf.test.TestCase):
         train_examples[l].add((u_raw, i_raw))
     num_positives_seen = len(train_examples[True])

-    # The numbers don't match exactly because the last batch spills over into
-    # the next epoch
-    assert ncf_dataset.num_train_positives - num_positives_seen < BATCH_SIZE
+    assert ncf_dataset.num_train_positives == num_positives_seen

     # This check is more heuristic because negatives are sampled with
     # replacement. It only checks that negative generation is reasonably random.

@@ -162,20 +163,42 @@ class BaseTest(tf.test.TestCase):
                        movielens.TIMESTAMP_COLUMN: times})
     cache_paths = rconst.Paths(data_dir=self.temp_data_dir)
     np.random.seed(1)
-    data_preprocessing.generate_train_eval_data(df, approx_num_shards=2,
-                                                num_items=10,
-                                                cache_paths=cache_paths,
-                                                match_mlperf=True)
-    with tf.gfile.Open(cache_paths.eval_raw_file, "rb") as f:
-      eval_data = pickle.load(f)
+
+    num_shards = 2
+    num_items = 10
+    data_preprocessing.generate_train_eval_data(
+        df, approx_num_shards=num_shards, num_items=num_items,
+        cache_paths=cache_paths, match_mlperf=True)
+
+    raw_shards = tf.gfile.ListDirectory(cache_paths.train_shard_subdir)
+    assert len(raw_shards) == num_shards
+
+    sharded_eval_data = []
+    for i in range(2):
+      sharded_eval_data.append(data_async_generation._process_shard(
+          (os.path.join(cache_paths.train_shard_subdir, raw_shards[i]),
+           num_items, rconst.NUM_EVAL_NEGATIVES, stat_utils.random_int32(),
+           False, True)))
+
+    if sharded_eval_data[0][0][0] == 1:
+      # Order is not assured for this part of the pipeline.
+      sharded_eval_data.reverse()
+
+    eval_data = [np.concatenate([shard[i] for shard in sharded_eval_data])
+                 for i in range(3)]
+    eval_data = {
+        movielens.USER_COLUMN: eval_data[0],
+        movielens.ITEM_COLUMN: eval_data[1],
+    }

     eval_items_per_user = rconst.NUM_EVAL_NEGATIVES + 1
-    self.assertAllClose(eval_data[0][movielens.USER_COLUMN],
+    self.assertAllClose(eval_data[movielens.USER_COLUMN],
                         [0] * eval_items_per_user + [1] * eval_items_per_user)

     # Each shard process should generate different random items.
     self.assertNotAllClose(
-        eval_data[0][movielens.ITEM_COLUMN][:eval_items_per_user],
-        eval_data[0][movielens.ITEM_COLUMN][eval_items_per_user:])
+        eval_data[movielens.ITEM_COLUMN][:eval_items_per_user],
+        eval_data[movielens.ITEM_COLUMN][eval_items_per_user:])


 if __name__ == "__main__":
......
@@ -142,7 +142,8 @@ def run_ncf(_):
     cleanup_fn = lambda: None
     num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
         FLAGS.dataset]
-    approx_train_steps = None
+    num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
+    num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
   else:
     ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
         dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,

@@ -156,8 +157,11 @@ def run_ncf(_):
         cache_id=FLAGS.cache_id)
     num_users = ncf_dataset.num_users
     num_items = ncf_dataset.num_items
-    approx_train_steps = int(ncf_dataset.num_train_positives
-                             * (1 + FLAGS.num_neg) // FLAGS.batch_size)
+    num_train_steps = int(np.ceil(
+        FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
+        (1 + FLAGS.num_neg) / FLAGS.batch_size))
+    num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) *
+                                 ncf_dataset.num_users / eval_batch_size))

   model_helpers.apply_clean(flags.FLAGS)
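Worked example of the new ceil-based step counts, with illustrative ML-1M-scale numbers (none of these values appear in the diff):

import numpy as np

num_train_positives = 994169  # e.g. ML-1M ratings minus one holdout per user
num_users = 6040
num_neg = 4                   # FLAGS.num_neg
batch_size = 16384            # FLAGS.batch_size
eval_batch_size = 100000
epochs_between_evals = 1
NUM_EVAL_NEGATIVES = 999      # rconst.NUM_EVAL_NEGATIVES

num_train_steps = int(np.ceil(
    epochs_between_evals * num_train_positives * (1 + num_neg) / batch_size))
num_eval_steps = int(np.ceil(
    (1 + NUM_EVAL_NEGATIVES) * num_users / eval_batch_size))

print(num_train_steps)  # ceil(4970845 / 16384) = 304
print(num_eval_steps)   # ceil(6040000 / 100000) = 61

Because the counts now round up and the producer pads the final batch instead of spilling examples into the next epoch, the exact equality check in the training loop below can replace the old "within one batch" warning.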
@@ -206,8 +210,8 @@ def run_ncf(_):
       run_params=run_params,
       test_id=FLAGS.benchmark_test_id)

-  pred_input_fn = data_preprocessing.make_pred_input_fn(ncf_dataset=ncf_dataset)
+  pred_input_fn = None
   total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
   for cycle_index in range(total_training_cycle):
     tf.logging.info("Starting a training cycle: {}/{}".format(

@@ -215,20 +219,31 @@ def run_ncf(_):
     # Train the model
     train_input_fn, train_record_dir, batch_count = \
-      data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)
+      data_preprocessing.make_input_fn(
+          ncf_dataset=ncf_dataset, is_training=True)

-    if approx_train_steps and np.abs(approx_train_steps - batch_count) > 1:
-      tf.logging.warning(
-          "Estimated ({}) and reported ({}) number of batches differ by more "
-          "than one".format(approx_train_steps, batch_count))
+    if batch_count != num_train_steps:
+      raise ValueError(
+          "Step counts do not match. ({} vs. {}) The async process is "
+          "producing incorrect shards.".format(batch_count, num_train_steps))

     train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
-                          steps=batch_count)
+                          steps=num_train_steps)
     if train_record_dir:
       tf.gfile.DeleteRecursively(train_record_dir)

     tf.logging.info("Beginning evaluation.")
-    eval_results = eval_estimator.evaluate(pred_input_fn)
+    if pred_input_fn is None:
+      pred_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
+          ncf_dataset=ncf_dataset, is_training=False)
+
+      if eval_batch_count != num_eval_steps:
+        raise ValueError(
+            "Step counts do not match. ({} vs. {}) The async process is "
+            "producing incorrect shards.".format(
+                eval_batch_count, num_eval_steps))
+
+    eval_results = eval_estimator.evaluate(pred_input_fn, steps=num_eval_steps)
     tf.logging.info("Evaluation complete.")

     # Benchmark the evaluation results
......
@@ -48,7 +48,7 @@ do
  # And to confirm that the pipeline is deterministic pass the flag:
  #   --hash_pipeline
  #
- # (`--hash_pipeline` will slow down training)
+ # (`--hash_pipeline` will slow down training, though not as much as one might imagine.)
  python ncf_main.py --model_dir ${MODEL_DIR} \
                     --data_dir ${DATA_DIR} \
                     --dataset ${DATASET} --hooks "" \
......