"integration-tests/vscode:/vscode.git/clone" did not exist on "4e821c003a7cb055a358cf142dbf01a2f4c1916f"
Unverified commit 19d4eaaf authored by Taylor Robie, committed by GitHub

Reorder NCF data pipeline (#5536)

* intermediate commit

finish replacing spillover with resampled padding

intermediate commit

* resolve merge conflict

* intermediate commit

* further consolidate the data pipeline

* complete first pass at data pipeline refactor

* remove some leftover code

* fix test

* remove resampling, and move train padding logic into neumf.py

* small tweaks

* fix weight bug

* address PR comments

* fix dict zip. (Reed led me astray)

* delint

* make data test deterministic and delint

* Reed didn't lead me astray. I just can't read.

* more delinting

* even more delinting

* use resampling for last batch padding

* pad last batch with unique data

* Revert "pad last batch with unique data"

This reverts commit cbdf46efcd5c7907038a24105b88d38e7f1d6da2.

* move padded batch to the beginning

* delint

* fix step check for synthetic data
parent 413f15ba
@@ -35,16 +35,16 @@ class Paths(object):
"positive_shard_{}.pickle")
self.train_epoch_dir = os.path.join(self.cache_root, "training_epochs")
self.eval_data_subdir = os.path.join(self.cache_root, "eval_data")
self.eval_raw_file = os.path.join(self.eval_data_subdir, "raw.pickle")
self.eval_record_template_temp = os.path.join(self.eval_data_subdir,
"eval_records.temp")
self.eval_record_template = os.path.join(
self.eval_data_subdir, "padded_eval_batch_size_{}.tfrecords")
self.subproc_alive = os.path.join(self.cache_root, "subproc.alive")
APPROX_PTS_PER_TRAIN_SHARD = 128000
# Keys for data shards
TRAIN_KEY = "train"
EVAL_KEY = "eval"
# In both datasets, each user has at least 20 ratings.
MIN_NUM_RATINGS = 20
@@ -68,7 +68,9 @@ FLAGFILE_TEMP = "flagfile.temp"
FLAGFILE = "flagfile"
READY_FILE_TEMP = "ready.json.temp"
READY_FILE = "ready.json"
TRAIN_RECORD_TEMPLATE = "train_{}.tfrecords"
EVAL_RECORD_TEMPLATE = "eval_{}.tfrecords"
TIMEOUT_SECONDS = 3600 * 2 # If the train loop goes more than two hours without
# consuming an epoch of data, this is a good
......
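Note on the record/shard naming above: the `{}` placeholders are `str.format` slots, and (as the `zfill` comment later in this diff indicates) the indices are zero-padded so that a lexicographic directory listing comes back in numeric order. A small illustrative sketch, using the constant from the hunk above (the five-digit padding is an assumption for illustration):

```python
TRAIN_RECORD_TEMPLATE = "train_{}.tfrecords"

# Zero-pad the index so lexicographic sort == numeric sort.
names = [TRAIN_RECORD_TEMPLATE.format(str(i).zfill(5)) for i in (0, 2, 10)]
print(names)          # ['train_00000.tfrecords', 'train_00002.tfrecords', 'train_00010.tfrecords']
print(sorted(names))  # same order; without padding, 'train_10...' would sort before 'train_2...'
```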
@@ -57,7 +57,7 @@ DATASET_TO_NUM_USERS_AND_ITEMS = {
# Number of batches to run per epoch when using synthetic data. At high batch
# sizes, we run for more batches than with real data, which is good since
# running more batches reduces noise when measuring the average batches/second.
_SYNTHETIC_BATCHES_PER_EPOCH = 2000
SYNTHETIC_BATCHES_PER_EPOCH = 2000
class NCFDataset(object):
@@ -65,7 +65,7 @@ class NCFDataset(object):
def __init__(self, user_map, item_map, num_data_readers, cache_paths,
num_train_positives, deterministic=False):
# type: (dict, dict, int, rconst.Paths) -> None
# type: (dict, dict, int, rconst.Paths, int, bool) -> None
"""Assign key values for recommendation dataset.
Args:
@@ -175,7 +175,6 @@ def _filter_index_sort(raw_rating_path, match_mlperf):
def _train_eval_map_fn(args):
# type: (...) -> typing.Dict(np.ndarray)
"""Split training and testing data and generate testing negatives.
This function is called as part of a multiprocessing map. The principle
@@ -186,9 +185,8 @@ def _train_eval_map_fn(args):
For each user, all but the last item is written into a pickle file which the
training data producer can consume on as needed. The last item for a user
is a validation point; for each validation point a number of negatives are
generated (typically 999). The validation data is returned by this function,
as it is held in memory for the remainder of the run.
is a validation point; it is written under a separate key and will be used
later to generate the evaluation data.
Args:
shard: A dict containing the user and item arrays.
@@ -198,16 +196,10 @@
which validation negatives should be drawn.
cache_paths: rconst.Paths object containing locations for various cache
files.
seed: Random seed to be used when generating testing negatives.
match_mlperf: If True, sample eval negative with replacements, which the
MLPerf reference implementation does.
Returns:
A dict containing the evaluation data for a given shard.
"""
shard, shard_id, num_items, cache_paths, seed, match_mlperf = args
np.random.seed(seed)
shard, shard_id, num_items, cache_paths = args
users = shard[movielens.USER_COLUMN]
items = shard[movielens.ITEM_COLUMN]
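The docstring change above describes the reordered split: for every user, all but the last rating goes into the training shard, and the last rating is held out as the evaluation positive (now written under a separate key rather than turned into eval records here). A minimal numpy sketch of that leave-last-out split, with hypothetical toy arrays:

```python
import numpy as np

# Hypothetical toy shard: ratings grouped (sorted) by user.
users = np.array([0, 0, 0, 1, 1], dtype=np.int32)
items = np.array([10, 11, 12, 20, 21], dtype=np.uint16)

# Boundaries between users, mirroring the argwhere/concatenate logic in the diff.
boundaries = np.concatenate(
    ([0], np.argwhere(users[1:] != users[:-1])[:, 0] + 1, [users.shape[0]]))

train_blocks, eval_positives = [], []
for i in range(len(boundaries) - 1):
    block_users = users[boundaries[i]:boundaries[i + 1]]
    block_items = items[boundaries[i]:boundaries[i + 1]]
    train_blocks.append((block_users[:-1], block_items[:-1]))  # all but the last rating
    eval_positives.append((block_users[0], block_items[-1]))   # last rating held out

# eval_positives == [(0, 12), (1, 21)]
```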
@@ -218,7 +210,6 @@
[users.shape[0]])
train_blocks = []
test_blocks = []
test_positives = []
for i in range(len(boundaries) - 1):
# This is simply a vector of repeated values such that the shard could be
@@ -233,39 +224,31 @@
block_items = items[boundaries[i]:boundaries[i+1]]
train_blocks.append((block_user[:-1], block_items[:-1]))
test_negatives = stat_utils.sample_with_exclusion(
num_items=num_items, positive_set=set(block_items),
n=rconst.NUM_EVAL_NEGATIVES, replacement=match_mlperf)
test_blocks.append((
block_user[0] * np.ones((rconst.NUM_EVAL_NEGATIVES + 1,),
dtype=np.int32),
np.array([block_items[-1]] + test_negatives, dtype=np.uint16)
))
test_positives.append((block_user[0], block_items[-1]))
train_users = np.concatenate([i[0] for i in train_blocks])
train_items = np.concatenate([i[1] for i in train_blocks])
test_pos_users = np.array([i[0] for i in test_positives],
dtype=train_users.dtype)
test_pos_items = np.array([i[1] for i in test_positives],
dtype=train_items.dtype)
train_shard_fpath = cache_paths.train_shard_template.format(
str(shard_id).zfill(5))
with tf.gfile.Open(train_shard_fpath, "wb") as f:
pickle.dump({
movielens.USER_COLUMN: train_users,
movielens.ITEM_COLUMN: train_items,
rconst.TRAIN_KEY: {
movielens.USER_COLUMN: train_users,
movielens.ITEM_COLUMN: train_items,
},
rconst.EVAL_KEY: {
movielens.USER_COLUMN: test_pos_users,
movielens.ITEM_COLUMN: test_pos_items,
}
}, f)
test_users = np.concatenate([i[0] for i in test_blocks])
test_items = np.concatenate([i[1] for i in test_blocks])
assert test_users.shape == test_items.shape
assert test_items.shape[0] % (rconst.NUM_EVAL_NEGATIVES + 1) == 0
return {
movielens.USER_COLUMN: test_users,
movielens.ITEM_COLUMN: test_items,
}
def generate_train_eval_data(df, approx_num_shards, num_items, cache_paths,
match_mlperf):
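With this change each shard pickle nests the training positives and the held-out evaluation positives under rconst.TRAIN_KEY / rconst.EVAL_KEY instead of storing flat user/item arrays. A hedged sketch of what a consumer of one of these shards might look like (the shard path below is hypothetical; real paths come from cache_paths.train_shard_template):

```python
import pickle
import tensorflow as tf

from official.datasets import movielens
from official.recommendation import constants as rconst

# Hypothetical path for illustration only.
shard_path = "/tmp/ncf_cache/training_shards/positive_shard_00000.pickle"

with tf.gfile.Open(shard_path, "rb") as f:
    shard = pickle.load(f)

train_users = shard[rconst.TRAIN_KEY][movielens.USER_COLUMN]
train_items = shard[rconst.TRAIN_KEY][movielens.ITEM_COLUMN]
eval_pos_users = shard[rconst.EVAL_KEY][movielens.USER_COLUMN]
eval_pos_items = shard[rconst.EVAL_KEY][movielens.ITEM_COLUMN]
```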
@@ -327,38 +310,16 @@ def generate_train_eval_data(df, approx_num_shards, num_items, cache_paths,
"negatives per user...".format(rconst.NUM_EVAL_NEGATIVES))
tf.gfile.MakeDirs(cache_paths.train_shard_subdir)
# We choose a different random seed for each process, so that the processes
# will not all choose the same random numbers.
process_seeds = [np.random.randint(2**32) for _ in range(approx_num_shards)]
map_args = [(shards[i], i, num_items, cache_paths, process_seeds[i],
match_mlperf)
map_args = [(shards[i], i, num_items, cache_paths)
for i in range(approx_num_shards)]
with popen_helper.get_pool(multiprocessing.cpu_count()) as pool:
test_shards = pool.map(_train_eval_map_fn, map_args) # pylint: disable=no-member
tf.logging.info("Merging test shards...")
test_users = np.concatenate([i[movielens.USER_COLUMN] for i in test_shards])
test_items = np.concatenate([i[movielens.ITEM_COLUMN] for i in test_shards])
assert test_users.shape == test_items.shape
assert test_items.shape[0] % (rconst.NUM_EVAL_NEGATIVES + 1) == 0
test_labels = np.zeros(shape=test_users.shape)
test_labels[0::(rconst.NUM_EVAL_NEGATIVES + 1)] = 1
eval_data = ({
movielens.USER_COLUMN: test_users,
movielens.ITEM_COLUMN: test_items,
}, test_labels)
tf.logging.info("Writing test data to file.")
tf.gfile.MakeDirs(cache_paths.eval_data_subdir)
with tf.gfile.Open(cache_paths.eval_raw_file, "wb") as f:
pickle.dump(eval_data, f, protocol=pickle.HIGHEST_PROTOCOL)
with popen_helper.get_pool(multiprocessing.cpu_count()) as pool:
pool.map(_train_eval_map_fn, map_args) # pylint: disable=no-member
def construct_cache(dataset, data_dir, num_data_readers, match_mlperf,
deterministic, cache_id=None):
# type: (str, str, int, bool, typing.Optional[int]) -> NCFDataset
# type: (str, str, int, bool, bool, typing.Optional[int]) -> NCFDataset
"""Load and digest data CSV into a usable form.
Args:
@@ -420,17 +381,20 @@ def _shutdown(proc):
tf.logging.info("Shutting down train data creation subprocess.")
try:
proc.send_signal(signal.SIGINT)
time.sleep(1)
if proc.returncode is not None:
return # SIGINT was handled successfully within 1 sec
try:
proc.send_signal(signal.SIGINT)
time.sleep(5)
if proc.returncode is not None:
return # SIGINT was handled successfully within 5 seconds
except socket.error:
pass
except socket.error:
pass
# Otherwise another second of grace period and then forcibly kill the process.
time.sleep(1)
proc.terminate()
# Otherwise another second of grace period and then force kill the process.
time.sleep(1)
proc.terminate()
except: # pylint: disable=broad-except
tf.logging.error("Data generation subprocess could not be killed.")
def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
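The reworked `_shutdown` above gives the generation subprocess two chances to exit cleanly: SIGINT with a grace period (grown from 1 to 5 seconds), then a short additional wait before `terminate()`. A generic sketch of that escalation pattern, independent of the TF logging and socket handling in the diff (the helper name is hypothetical):

```python
import signal
import time

def shutdown_gracefully(proc, sigint_grace=5.0):
    """Ask a subprocess (a subprocess.Popen) to exit via SIGINT, then escalate."""
    proc.send_signal(signal.SIGINT)
    deadline = time.time() + sigint_grace
    while time.time() < deadline:
        if proc.poll() is not None:
            return  # SIGINT was handled within the grace period.
        time.sleep(0.1)
    time.sleep(1)     # One more second of grace, mirroring the diff.
    proc.terminate()  # Force the issue.
```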
@@ -456,18 +420,17 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
"num_neg": num_neg,
"num_train_positives": ncf_dataset.num_train_positives,
"num_items": ncf_dataset.num_items,
"num_users": ncf_dataset.num_users,
"num_readers": ncf_dataset.num_data_readers,
"epochs_per_cycle": epochs_per_cycle,
"train_batch_size": batch_size,
"eval_batch_size": eval_batch_size,
"num_workers": num_workers,
# This allows the training input function to guarantee batch size and
# significantly improves performance. (~5% increase in examples/sec on
# GPU, and needed for TPU XLA.)
"spillover": True,
"redirect_logs": use_subprocess,
"use_tf_logging": not use_subprocess,
"ml_perf": match_mlperf,
}
if ncf_dataset.deterministic:
flags_["seed"] = stat_utils.random_int32()
tf.gfile.MakeDirs(data_dir)
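The flags above drop the old spillover mechanism; per the commit notes, the last (partial) batch is instead padded up to the fixed batch size, with the training-side padding logic moving into neumf.py. A generic, hedged sketch of padding a final partial batch so every batch has a static size, with a validity mask so padded rows can be ignored downstream (this is an illustration, not the exact scheme used in neumf.py):

```python
import numpy as np

def pad_last_batch(users, items, batch_size):
    """Pad trailing examples so the array length is a multiple of batch_size.

    Returns padded arrays plus a mask that is False for the padded rows.
    """
    n = users.shape[0]
    pad = (-n) % batch_size
    mask = np.concatenate([np.ones(n, dtype=bool), np.zeros(pad, dtype=bool)])
    users = np.concatenate([users, np.zeros(pad, dtype=users.dtype)])
    items = np.concatenate([items, np.zeros(pad, dtype=items.dtype)])
    return users, items, mask

# Example: 10 examples with batch_size 4 -> 12 rows, the last 2 masked out.
u, i, m = pad_last_batch(np.arange(10), np.arange(10), batch_size=4)
```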
@@ -608,12 +571,12 @@ def hash_pipeline(dataset, deterministic):
tf.logging.info(" [pipeline_hash] All batches hash: {}".format(overall_hash))
def make_train_input_fn(ncf_dataset):
# type: (typing.Optional[NCFDataset]) -> (typing.Callable, str, int)
def make_input_fn(ncf_dataset, is_training):
# type: (typing.Optional[NCFDataset], bool) -> (typing.Callable, str, int)
"""Construct training input_fn for the current epoch."""
if ncf_dataset is None:
return make_train_synthetic_input_fn()
return make_synthetic_input_fn(is_training)
if not tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
# The generation subprocess must have been alive at some point, because we
@@ -621,19 +584,24 @@ def make_train_input_fn(ncf_dataset):
raise ValueError("Generation subprocess unexpectedly died. Data will not "
"be available; exiting to avoid waiting forever.")
train_epoch_dir = ncf_dataset.cache_paths.train_epoch_dir
while not tf.gfile.Exists(train_epoch_dir):
tf.logging.info("Waiting for {} to exist.".format(train_epoch_dir))
time.sleep(1)
if is_training:
train_epoch_dir = ncf_dataset.cache_paths.train_epoch_dir
while not tf.gfile.Exists(train_epoch_dir):
tf.logging.info("Waiting for {} to exist.".format(train_epoch_dir))
time.sleep(1)
train_data_dirs = tf.gfile.ListDirectory(train_epoch_dir)
while not train_data_dirs:
tf.logging.info("Waiting for data folder to be created.")
time.sleep(1)
train_data_dirs = tf.gfile.ListDirectory(train_epoch_dir)
train_data_dirs.sort() # names are zfilled so that
# lexicographic sort == numeric sort
record_dir = os.path.join(train_epoch_dir, train_data_dirs[0])
while not train_data_dirs:
tf.logging.info("Waiting for data folder to be created.")
time.sleep(1)
train_data_dirs = tf.gfile.ListDirectory(train_epoch_dir)
train_data_dirs.sort() # names are zfilled so that
# lexicographic sort == numeric sort
record_dir = os.path.join(train_epoch_dir, train_data_dirs[0])
template = rconst.TRAIN_RECORD_TEMPLATE
else:
record_dir = ncf_dataset.cache_paths.eval_data_subdir
template = rconst.EVAL_RECORD_TEMPLATE
ready_file = os.path.join(record_dir, rconst.READY_FILE)
while not tf.gfile.Exists(ready_file):
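The input function blocks here until the async generation subprocess publishes READY_FILE (`ready.json`) in the record directory. The paired READY_FILE_TEMP constant suggests, though the diff does not show it, a write-then-rename handshake so a reader never observes a partially written metadata file. A hedged sketch of the writer side under that assumption (the helper name and metadata values are illustrative):

```python
import json
import os
import tensorflow as tf

from official.recommendation import constants as rconst

def publish_ready_file(record_dir, batch_size, batch_count):
    """Write metadata to a temp file, then rename it into place."""
    metadata = {"batch_size": batch_size, "batch_count": batch_count}
    temp_path = os.path.join(record_dir, rconst.READY_FILE_TEMP)
    final_path = os.path.join(record_dir, rconst.READY_FILE)
    with tf.gfile.Open(temp_path, "w") as f:
        json.dump(metadata, f)
    tf.gfile.Rename(temp_path, final_path)  # Readers only ever see a complete file.
```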
@@ -643,16 +611,18 @@ def make_train_input_fn(ncf_dataset):
with tf.gfile.Open(ready_file, "r") as f:
epoch_metadata = json.load(f)
# The data pipeline uses spillover to guarantee static batch sizes. This
# means that an extra batch will need to be run every few epochs. TPUs
# require that the number of batches to be run is known at the time that
# estimator.train() is called, so having the generation pipeline report
# number of batches guarantees that this count is correct.
# This value is used to check that the batch count from the subprocess matches
# the batch count expected by the main thread.
batch_count = epoch_metadata["batch_count"]
def input_fn(params):
"""Generated input_fn for the given epoch."""
batch_size = params["batch_size"]
if is_training:
batch_size = params["batch_size"]
else:
# Estimator has "eval_batch_size" included in the params, but TPUEstimator
# populates "batch_size" to the appropriate value.
batch_size = params.get("eval_batch_size") or params["batch_size"]
if epoch_metadata["batch_size"] != batch_size:
raise ValueError(
@@ -662,8 +632,7 @@ def make_train_input_fn(ncf_dataset):
.format(epoch_metadata["batch_size"], batch_size))
record_files = tf.data.Dataset.list_files(
os.path.join(record_dir, rconst.TRAIN_RECORD_TEMPLATE.format("*")),
shuffle=False)
os.path.join(record_dir, template.format("*")), shuffle=False)
interleave = tf.contrib.data.parallel_interleave(
tf.data.TFRecordDataset,
@@ -673,7 +642,7 @@ def make_train_input_fn(ncf_dataset):
prefetch_input_elements=4,
)
deserialize = make_deserialize(params, batch_size, True)
deserialize = make_deserialize(params, batch_size, is_training)
dataset = record_files.apply(interleave)
dataset = dataset.map(deserialize, num_parallel_calls=4)
dataset = dataset.prefetch(32)
@@ -686,11 +655,12 @@ def make_train_input_fn(ncf_dataset):
return input_fn, record_dir, batch_count
def make_train_synthetic_input_fn():
def make_synthetic_input_fn(is_training):
"""Construct training input_fn that uses synthetic data."""
def input_fn(params):
"""Generated input_fn for the given epoch."""
batch_size = params["batch_size"]
batch_size = (params["batch_size"] if is_training else
params["eval_batch_size"] or params["batch_size"])
num_users = params["num_users"]
num_items = params["num_items"]
@@ -698,78 +668,26 @@ def make_train_synthetic_input_fn():
maxval=num_users)
items = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
maxval=num_items)
labels = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
maxval=2)
data = {
movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
}, labels
dataset = tf.data.Dataset.from_tensors(data).repeat(
_SYNTHETIC_BATCHES_PER_EPOCH)
dataset = dataset.prefetch(32)
return dataset
return input_fn, None, _SYNTHETIC_BATCHES_PER_EPOCH
def make_pred_input_fn(ncf_dataset):
# type: (typing.Optional[NCFDataset]) -> typing.Callable
"""Construct input_fn for metric evaluation."""
if ncf_dataset is None:
return make_synthetic_pred_input_fn()
def input_fn(params):
"""Input function based on eval batch size."""
# Estimator has "eval_batch_size" included in the params, but TPUEstimator
# populates "batch_size" to the appropriate value.
batch_size = params.get("eval_batch_size") or params["batch_size"]
record_file = ncf_dataset.cache_paths.eval_record_template.format(
batch_size)
while not tf.gfile.Exists(record_file):
tf.logging.info(
"Waiting for eval data to be written to {}".format(record_file))
time.sleep(1)
dataset = tf.data.TFRecordDataset(record_file)
deserialize = make_deserialize(params, batch_size, False)
dataset = dataset.map(deserialize, num_parallel_calls=4)
dataset = dataset.prefetch(16)
if params.get("hash_pipeline"):
hash_pipeline(dataset, ncf_dataset.deterministic)
return dataset
return input_fn
def make_synthetic_pred_input_fn():
"""Construct input_fn for metric evaluation that uses synthetic data."""
def input_fn(params):
"""Generated input_fn for the given epoch."""
batch_size = params["eval_batch_size"]
num_users = params["num_users"]
num_items = params["num_items"]
users = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
maxval=num_users)
items = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
maxval=num_items)
dupe_mask = tf.cast(tf.random_uniform([batch_size], dtype=tf.int32,
minval=0, maxval=2), tf.bool)
if is_training:
labels = tf.random_uniform([batch_size], dtype=tf.int32, minval=0,
maxval=2)
data = {
movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
}, labels
else:
dupe_mask = tf.cast(tf.random_uniform([batch_size], dtype=tf.int32,
minval=0, maxval=2), tf.bool)
data = {
movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
rconst.DUPLICATE_MASK: dupe_mask,
}
data = {
movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
rconst.DUPLICATE_MASK: dupe_mask,
}
dataset = tf.data.Dataset.from_tensors(data).repeat(
_SYNTHETIC_BATCHES_PER_EPOCH)
dataset = dataset.prefetch(16)
SYNTHETIC_BATCHES_PER_EPOCH)
dataset = dataset.prefetch(32)
return dataset
return input_fn
return input_fn, None, SYNTHETIC_BATCHES_PER_EPOCH
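The synthetic path now returns the same `(input_fn, record_dir, batch_count)` triple as the real pipeline (with `record_dir` set to None), so callers do not need to special-case it. A hedged usage sketch, assuming it runs in the data_preprocessing module's namespace with the params keys shown above and a TF 1.x session:

```python
import tensorflow as tf

input_fn, record_dir, batch_count = make_synthetic_input_fn(is_training=True)

# Outside an Estimator, the input_fn can be exercised directly.
dataset = input_fn({"batch_size": 32, "num_users": 100, "num_items": 50})
features, labels = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    user_batch = sess.run(features[movielens.USER_COLUMN])
    assert user_batch.shape == (32,)
```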
@@ -28,7 +28,9 @@ import tensorflow as tf
from official.datasets import movielens
from official.recommendation import constants as rconst
from official.recommendation import data_async_generation
from official.recommendation import data_preprocessing
from official.recommendation import stat_utils
DATASET = "ml-test"
@@ -121,7 +123,7 @@ class BaseTest(tf.test.TestCase):
g = tf.Graph()
with g.as_default():
input_fn, record_dir, batch_count = \
data_preprocessing.make_train_input_fn(ncf_dataset)
data_preprocessing.make_input_fn(ncf_dataset, True)
dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False})
first_epoch = self.drain_dataset(dataset=dataset, g=g)
user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
@@ -134,6 +136,7 @@ class BaseTest(tf.test.TestCase):
for features, labels in first_epoch:
for u, i, l in zip(features[movielens.USER_COLUMN],
features[movielens.ITEM_COLUMN], labels):
u_raw = user_inv_map[u]
i_raw = item_inv_map[i]
if ((u_raw, i_raw) in self.seen_pairs) != l:
@@ -145,9 +148,7 @@ class BaseTest(tf.test.TestCase):
train_examples[l].add((u_raw, i_raw))
num_positives_seen = len(train_examples[True])
# The numbers don't match exactly because the last batch spills over into
# the next epoch
assert ncf_dataset.num_train_positives - num_positives_seen < BATCH_SIZE
assert ncf_dataset.num_train_positives == num_positives_seen
# This check is more heuristic because negatives are sampled with
# replacement. It only checks that negative generation is reasonably random.
@@ -162,20 +163,42 @@ class BaseTest(tf.test.TestCase):
movielens.TIMESTAMP_COLUMN: times})
cache_paths = rconst.Paths(data_dir=self.temp_data_dir)
np.random.seed(1)
data_preprocessing.generate_train_eval_data(df, approx_num_shards=2,
num_items=10,
cache_paths=cache_paths,
match_mlperf=True)
with tf.gfile.Open(cache_paths.eval_raw_file, "rb") as f:
eval_data = pickle.load(f)
num_shards = 2
num_items = 10
data_preprocessing.generate_train_eval_data(
df, approx_num_shards=num_shards, num_items=num_items,
cache_paths=cache_paths, match_mlperf=True)
raw_shards = tf.gfile.ListDirectory(cache_paths.train_shard_subdir)
assert len(raw_shards) == num_shards
sharded_eval_data = []
for i in range(2):
sharded_eval_data.append(data_async_generation._process_shard(
(os.path.join(cache_paths.train_shard_subdir, raw_shards[i]),
num_items, rconst.NUM_EVAL_NEGATIVES, stat_utils.random_int32(),
False, True)))
if sharded_eval_data[0][0][0] == 1:
# Order is not assured for this part of the pipeline.
sharded_eval_data.reverse()
eval_data = [np.concatenate([shard[i] for shard in sharded_eval_data])
for i in range(3)]
eval_data = {
movielens.USER_COLUMN: eval_data[0],
movielens.ITEM_COLUMN: eval_data[1],
}
eval_items_per_user = rconst.NUM_EVAL_NEGATIVES + 1
self.assertAllClose(eval_data[0][movielens.USER_COLUMN],
self.assertAllClose(eval_data[movielens.USER_COLUMN],
[0] * eval_items_per_user + [1] * eval_items_per_user)
# Each shard process should generate different random items.
self.assertNotAllClose(
eval_data[0][movielens.ITEM_COLUMN][:eval_items_per_user],
eval_data[0][movielens.ITEM_COLUMN][eval_items_per_user:])
eval_data[movielens.ITEM_COLUMN][:eval_items_per_user],
eval_data[movielens.ITEM_COLUMN][eval_items_per_user:])
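The layout this test asserts, one positive followed by NUM_EVAL_NEGATIVES sampled negatives per user, is the same one the removed merging code relied on when it set every (NUM_EVAL_NEGATIVES + 1)-th label to 1. A small sketch of that labeling with a toy negative count:

```python
import numpy as np

NUM_EVAL_NEGATIVES = 3  # Toy value; the real constant is typically 999.
num_eval_users = 2
eval_items_per_user = NUM_EVAL_NEGATIVES + 1

# One positive followed by NUM_EVAL_NEGATIVES negatives per user.
labels = np.zeros(num_eval_users * eval_items_per_user)
labels[0::eval_items_per_user] = 1
# labels -> [1, 0, 0, 0, 1, 0, 0, 0]
```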
if __name__ == "__main__":
@@ -142,7 +142,8 @@ def run_ncf(_):
cleanup_fn = lambda: None
num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
FLAGS.dataset]
approx_train_steps = None
num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
else:
ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
@@ -156,8 +157,11 @@ def run_ncf(_):
cache_id=FLAGS.cache_id)
num_users = ncf_dataset.num_users
num_items = ncf_dataset.num_items
approx_train_steps = int(ncf_dataset.num_train_positives
* (1 + FLAGS.num_neg) // FLAGS.batch_size)
num_train_steps = int(np.ceil(
FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
(1 + FLAGS.num_neg) / FLAGS.batch_size))
num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) *
ncf_dataset.num_users / eval_batch_size))
model_helpers.apply_clean(flags.FLAGS)
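The exact step counts replace the old approximation: training steps come from the number of positives times (1 + num_neg) negatives per positive, and eval steps from (1 + NUM_EVAL_NEGATIVES) candidates per user. A worked example with illustrative numbers (none of these values are taken from the diff):

```python
import numpy as np

epochs_between_evals = 1
num_train_positives = 1000000
num_neg = 4              # negatives sampled per training positive
batch_size = 16384
num_users = 6000
eval_batch_size = 100000
NUM_EVAL_NEGATIVES = 999

num_train_steps = int(np.ceil(
    epochs_between_evals * num_train_positives * (1 + num_neg) / batch_size))
num_eval_steps = int(np.ceil(
    (1 + NUM_EVAL_NEGATIVES) * num_users / eval_batch_size))
# num_train_steps == 306, num_eval_steps == 60
```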
@@ -206,8 +210,8 @@ def run_ncf(_):
run_params=run_params,
test_id=FLAGS.benchmark_test_id)
pred_input_fn = data_preprocessing.make_pred_input_fn(ncf_dataset=ncf_dataset)
pred_input_fn = None
total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
for cycle_index in range(total_training_cycle):
tf.logging.info("Starting a training cycle: {}/{}".format(
@@ -215,20 +219,31 @@
# Train the model
train_input_fn, train_record_dir, batch_count = \
data_preprocessing.make_train_input_fn(ncf_dataset=ncf_dataset)
data_preprocessing.make_input_fn(
ncf_dataset=ncf_dataset, is_training=True)
if approx_train_steps and np.abs(approx_train_steps - batch_count) > 1:
tf.logging.warning(
"Estimated ({}) and reported ({}) number of batches differ by more "
"than one".format(approx_train_steps, batch_count))
if batch_count != num_train_steps:
raise ValueError(
"Step counts do not match. ({} vs. {}) The async process is "
"producing incorrect shards.".format(batch_count, num_train_steps))
train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
steps=batch_count)
steps=num_train_steps)
if train_record_dir:
tf.gfile.DeleteRecursively(train_record_dir)
tf.logging.info("Beginning evaluation.")
eval_results = eval_estimator.evaluate(pred_input_fn)
if pred_input_fn is None:
pred_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
ncf_dataset=ncf_dataset, is_training=False)
if eval_batch_count != num_eval_steps:
raise ValueError(
"Step counts do not match. ({} vs. {}) The async process is "
"producing incorrect shards.".format(
eval_batch_count, num_eval_steps))
eval_results = eval_estimator.evaluate(pred_input_fn, steps=num_eval_steps)
tf.logging.info("Evaluation complete.")
# Benchmark the evaluation results
......
@@ -48,7 +48,7 @@ do
# And to confirm that the pipeline is deterministic pass the flag:
# --hash_pipeline
#
# (`--hash_pipeline` will slow down training)
# (`--hash_pipeline` will slow down training, though not as much as one might imagine.)
python ncf_main.py --model_dir ${MODEL_DIR} \
--data_dir ${DATA_DIR} \
--dataset ${DATASET} --hooks "" \
@@ -61,8 +61,8 @@ do
--layers 256,256,128,64 --num_factors 64 \
--hr_threshold 0.635 \
--ml_perf \
|& tee ${RUN_LOG} \
| grep --line-buffered -E --regexp="(Iteration [0-9]+: HR = [0-9\.]+, NDCG = [0-9\.]+)|(pipeline_hash)"
|& tee ${RUN_LOG} \
| grep --line-buffered -E --regexp="(Iteration [0-9]+: HR = [0-9\.]+, NDCG = [0-9\.]+)|(pipeline_hash)"
END_TIME=$(date +%s)
echo "Run ${i} complete: $(( $END_TIME - $START_TIME )) seconds."
......