Unverified Commit 56cbd1f2 authored by Taylor Robie's avatar Taylor Robie Committed by GitHub

Replace pipeline in NCF (#5786)

* rough pass at carving out existing NCF pipeline

2nd half of rough replacement pass

fix dataset map functions

reduce bias in sample selection

cache pandas work on a daily basis

cleanup and fix batch check for multi gpu

multi device fix

fix treatment of eval data padding

print data producer

replace epoch overlap with padding and masking

move type and shape info into the producer class and update run.sh with larger batch size hyperparams

remove xla for multi GPU

more cleanup

remove model runner altogether

bug fixes

address subtle pipeline hang and improve producer __repr__

fix crash

fix assert

use popen_helper to create pools

add StreamingFilesDataset and abstract data storage to a separate class

bug fix

fix wait bug and add manual stack trace print

more bug fixes and refactor valid point mask to work with TPU sharding

misc bug fixes and adjust dtypes

address crash from decoding bools

fix remaining dtypes and change record writer pattern since it does not append

fix synthetic data

use TPUStrategy instead of TPUEstimator

minor tweaks around moving to TPUStrategy

cleanup some old code

delint and simplify permutation generation

remove low level tf layer definition, use single table with slice for keras, and misc fixes

missed minor point on removing tf layer definition

fix several bugs from recombining layer definitions

delint and add docstrings

Update ncf_test.py. Section for identical inputs and different outputs was removed.

update data test to run against the new producer class

* remove 'deterministic'

* delint

* address PR comments

* change eval_batch_size flag from a string to an int

* Add bisection-based producer for increased scalability, enable fully deterministic data production, and use the materialized and bisection producers to check each other (via expected output MD5s)

* remove references to hash pipeline

* skip bisection when it is not needed

* add unbuffer to run.sh as tee is causing issues

* address PR comments

* address more PR comments

* fix lint errors

* trim lines in resnet keras

* remove mock to debug kokoro failures

* Revert "remove mock to debug kokoro failures"

This reverts commit 63f5827d.

* remove match_mlperf from expected cache keys

* fix test now that cache construction no longer uses match_mlperf

* disable tests to debug test failure

* disable more tests

* completely disable data_test

* restore data test

* add versions to requirements.txt

* update call to TPUStrategy
parents 2c4dc0c0 7021ac1c
@@ -14,36 +14,30 @@
# ============================================================================== # ==============================================================================
"""Central location for NCF specific values.""" """Central location for NCF specific values."""
import os import sys
import time
import numpy as np
from official.datasets import movielens
# ============================================================================== # ==============================================================================
# == Main Thread Data Processing =============================================== # == Main Thread Data Processing ===============================================
# ============================================================================== # ==============================================================================
class Paths(object):
"""Container for various path information used while training NCF."""
def __init__(self, data_dir, cache_id=None):
self.cache_id = cache_id or int(time.time())
self.data_dir = data_dir
self.cache_root = os.path.join(
self.data_dir, "{}_ncf_recommendation_cache".format(self.cache_id))
self.train_shard_subdir = os.path.join(self.cache_root,
"raw_training_shards")
self.train_shard_template = os.path.join(self.train_shard_subdir,
"positive_shard_{}.pickle")
self.train_epoch_dir = os.path.join(self.cache_root, "training_epochs")
self.eval_data_subdir = os.path.join(self.cache_root, "eval_data")
self.subproc_alive = os.path.join(self.cache_root, "subproc.alive") # Keys for data shards
TRAIN_USER_KEY = "train_{}".format(movielens.USER_COLUMN)
TRAIN_ITEM_KEY = "train_{}".format(movielens.ITEM_COLUMN)
TRAIN_LABEL_KEY = "train_labels"
MASK_START_INDEX = "mask_start_index"
VALID_POINT_MASK = "valid_point_mask"
EVAL_USER_KEY = "eval_{}".format(movielens.USER_COLUMN)
EVAL_ITEM_KEY = "eval_{}".format(movielens.ITEM_COLUMN)
USER_MAP = "user_map"
ITEM_MAP = "item_map"
APPROX_PTS_PER_TRAIN_SHARD = 128000 USER_DTYPE = np.int32
ITEM_DTYPE = np.int32
# Keys for data shards
TRAIN_KEY = "train"
EVAL_KEY = "eval"
# In both datasets, each user has at least 20 ratings. # In both datasets, each user has at least 20 ratings.
MIN_NUM_RATINGS = 20 MIN_NUM_RATINGS = 20
@@ -62,21 +56,24 @@ DUPLICATE_MASK = "duplicate_mask"
HR_METRIC_NAME = "HR_METRIC" HR_METRIC_NAME = "HR_METRIC"
NDCG_METRIC_NAME = "NDCG_METRIC" NDCG_METRIC_NAME = "NDCG_METRIC"
# Trying to load a cache created in py2 when running in py3 will cause an
# error due to differences in unicode handling.
RAW_CACHE_FILE = "raw_data_cache_py{}.pickle".format(sys.version_info[0])
CACHE_INVALIDATION_SEC = 3600 * 24
# ============================================================================== # ==============================================================================
# == Subprocess Data Generation ================================================ # == Data Generation ===========================================================
# ============================================================================== # ==============================================================================
CYCLES_TO_BUFFER = 3 # The number of train cycles worth of data to "run ahead" CYCLES_TO_BUFFER = 3 # The number of train cycles worth of data to "run ahead"
# of the main training loop. # of the main training loop.
FLAGFILE_TEMP = "flagfile.temp" # Number of batches to run per epoch when using synthetic data. At high batch
FLAGFILE = "flagfile" # sizes, we run for more batches than with real data, which is good since
READY_FILE_TEMP = "ready.json.temp" # running more batches reduces noise when measuring the average batches/second.
READY_FILE = "ready.json" SYNTHETIC_BATCHES_PER_EPOCH = 2000
TRAIN_RECORD_TEMPLATE = "train_{}.tfrecords"
EVAL_RECORD_TEMPLATE = "eval_{}.tfrecords"
TIMEOUT_SECONDS = 3600 * 2 # If the train loop goes more than two hours without # Only used when StreamingFilesDataset is used.
# consuming an epoch of data, this is a good NUM_FILE_SHARDS = 16
# indicator that the main thread is dead and the TRAIN_FOLDER_TEMPLATE = "training_cycle_{}"
# subprocess is orphaned. EVAL_FOLDER = "eval_data"
SHARD_TEMPLATE = "shard_{}.tfrecords"
@@ -18,19 +18,19 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from collections import defaultdict
import hashlib
import os import os
import pickle
import time
import mock
import numpy as np import numpy as np
import pandas as pd import scipy.stats
import tensorflow as tf import tensorflow as tf
from official.datasets import movielens from official.datasets import movielens
from official.recommendation import constants as rconst from official.recommendation import constants as rconst
from official.recommendation import data_async_generation
from official.recommendation import data_preprocessing from official.recommendation import data_preprocessing
from official.recommendation import stat_utils from official.recommendation import popen_helper
DATASET = "ml-test" DATASET = "ml-test"
@@ -42,10 +42,18 @@ EVAL_BATCH_SIZE = 4000
NUM_NEG = 4 NUM_NEG = 4
END_TO_END_TRAIN_MD5 = "b218738e915e825d03939c5e305a2698"
END_TO_END_EVAL_MD5 = "d753d0f3186831466d6e218163a9501e"
FRESH_RANDOMNESS_MD5 = "63d0dff73c0e5f1048fbdc8c65021e22"
def mock_download(*args, **kwargs): def mock_download(*args, **kwargs):
return return
# The forkpool used by data producers interacts badly with the threading
# used by TestCase. Without this patch tests will hang, and no amount
# of diligent closing and joining within the producer will prevent it.
@mock.patch.object(popen_helper, "get_forkpool", popen_helper.get_fauxpool)
class BaseTest(tf.test.TestCase): class BaseTest(tf.test.TestCase):
def setUp(self): def setUp(self):
self.temp_data_dir = self.get_temp_dir() self.temp_data_dir = self.get_temp_dir()
@@ -65,10 +73,10 @@ class BaseTest(tf.test.TestCase):
scores = np.random.randint(low=0, high=5, size=NUM_PTS) scores = np.random.randint(low=0, high=5, size=NUM_PTS)
times = np.random.randint(low=1000000000, high=1200000000, size=NUM_PTS) times = np.random.randint(low=1000000000, high=1200000000, size=NUM_PTS)
rating_file = os.path.join(ratings_folder, movielens.RATINGS_FILE) self.rating_file = os.path.join(ratings_folder, movielens.RATINGS_FILE)
self.seen_pairs = set() self.seen_pairs = set()
self.holdout = {} self.holdout = {}
with tf.gfile.Open(rating_file, "w") as f: with tf.gfile.Open(self.rating_file, "w") as f:
f.write("user_id,item_id,rating,timestamp\n") f.write("user_id,item_id,rating,timestamp\n")
for usr, itm, scr, ts in zip(users, items, scores, times): for usr, itm, scr, ts in zip(users, items, scores, times):
pair = (usr, itm) pair = (usr, itm)
@@ -85,21 +93,29 @@ class BaseTest(tf.test.TestCase):
data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[DATASET] = (NUM_USERS, data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[DATASET] = (NUM_USERS,
NUM_ITEMS) NUM_ITEMS)
def make_params(self, train_epochs=1):
return {
"train_epochs": train_epochs,
"batches_per_step": 1,
"use_seed": False,
"batch_size": BATCH_SIZE,
"eval_batch_size": EVAL_BATCH_SIZE,
"num_neg": NUM_NEG,
"match_mlperf": True,
"use_tpu": False,
"use_xla_for_gpu": False,
}
def test_preprocessing(self): def test_preprocessing(self):
# For the most part the necessary checks are performed within # For the most part the necessary checks are performed within
# construct_cache() # _filter_index_sort()
ncf_dataset = data_preprocessing.construct_cache(
dataset=DATASET, data_dir=self.temp_data_dir, num_data_readers=2, cache_path = os.path.join(self.temp_data_dir, "test_cache.pickle")
match_mlperf=False, deterministic=False) data, valid_cache = data_preprocessing._filter_index_sort(
assert ncf_dataset.num_users == NUM_USERS self.rating_file, cache_path=cache_path)
assert ncf_dataset.num_items == NUM_ITEMS
assert len(data[rconst.USER_MAP]) == NUM_USERS
time.sleep(1) # Ensure we create the next cache in a new directory. assert len(data[rconst.ITEM_MAP]) == NUM_ITEMS
ncf_dataset = data_preprocessing.construct_cache(
dataset=DATASET, data_dir=self.temp_data_dir, num_data_readers=2,
match_mlperf=True, deterministic=False)
assert ncf_dataset.num_users == NUM_USERS
assert ncf_dataset.num_items == NUM_ITEMS
def drain_dataset(self, dataset, g): def drain_dataset(self, dataset, g):
# type: (tf.data.Dataset, tf.Graph) -> list # type: (tf.data.Dataset, tf.Graph) -> list
@@ -114,29 +130,46 @@ class BaseTest(tf.test.TestCase):
break break
return output return output
def test_end_to_end(self): def _test_end_to_end(self, constructor_type):
ncf_dataset, _ = data_preprocessing.instantiate_pipeline( params = self.make_params(train_epochs=1)
dataset=DATASET, data_dir=self.temp_data_dir, _, _, producer = data_preprocessing.instantiate_pipeline(
batch_size=BATCH_SIZE, eval_batch_size=EVAL_BATCH_SIZE, dataset=DATASET, data_dir=self.temp_data_dir, params=params,
num_cycles=1, num_data_readers=2, num_neg=NUM_NEG) constructor_type=constructor_type, deterministic=True)
producer.start()
producer.join()
assert producer._fatal_exception is None
user_inv_map = {v: k for k, v in producer.user_map.items()}
item_inv_map = {v: k for k, v in producer.item_map.items()}
# ==========================================================================
# == Training Data =========================================================
# ==========================================================================
g = tf.Graph() g = tf.Graph()
with g.as_default(): with g.as_default():
input_fn, record_dir, batch_count = \ input_fn = producer.make_input_fn(is_training=True)
data_preprocessing.make_input_fn(ncf_dataset, True) dataset = input_fn(params)
dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False,
"use_xla_for_gpu": False})
first_epoch = self.drain_dataset(dataset=dataset, g=g) first_epoch = self.drain_dataset(dataset=dataset, g=g)
user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}
counts = defaultdict(int)
train_examples = { train_examples = {
True: set(), True: set(),
False: set(), False: set(),
} }
md5 = hashlib.md5()
for features, labels in first_epoch: for features, labels in first_epoch:
for u, i, l in zip(features[movielens.USER_COLUMN], data_list = [
features[movielens.ITEM_COLUMN], labels): features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
features[rconst.VALID_POINT_MASK], labels]
for i in data_list:
md5.update(i.tobytes())
for u, i, v, l in zip(*data_list):
if not v:
continue # ignore padding
u_raw = user_inv_map[u] u_raw = user_inv_map[u]
i_raw = item_inv_map[i] i_raw = item_inv_map[i]
@@ -145,61 +178,166 @@ class BaseTest(tf.test.TestCase):
# generation, so it will occasionally appear as a negative example # generation, so it will occasionally appear as a negative example
# during training. # during training.
assert not l assert not l
assert i_raw == self.holdout[u_raw][1] self.assertEqual(i_raw, self.holdout[u_raw][1])
train_examples[l].add((u_raw, i_raw)) train_examples[l].add((u_raw, i_raw))
num_positives_seen = len(train_examples[True]) counts[(u_raw, i_raw)] += 1
self.assertRegexpMatches(md5.hexdigest(), END_TO_END_TRAIN_MD5)
assert ncf_dataset.num_train_positives == num_positives_seen num_positives_seen = len(train_examples[True])
self.assertEqual(producer._train_pos_users.shape[0], num_positives_seen)
# This check is more heuristic because negatives are sampled with # This check is more heuristic because negatives are sampled with
# replacement. It only checks that negative generation is reasonably random. # replacement. It only checks that negative generation is reasonably random.
assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9 self.assertGreater(
len(train_examples[False]) / NUM_NEG / num_positives_seen, 0.9)
def test_shard_randomness(self):
users = [0, 0, 0, 0, 1, 1, 1, 1] # This checks that the samples produced are independent by checking the
items = [0, 2, 4, 6, 0, 2, 4, 6] # number of duplicate entries. If workers are not properly independent there
times = [1, 2, 3, 4, 1, 2, 3, 4] # will be lots of repeated pairs.
df = pd.DataFrame({movielens.USER_COLUMN: users, self.assertLess(np.mean(list(counts.values())), 1.1)
movielens.ITEM_COLUMN: items,
movielens.TIMESTAMP_COLUMN: times}) # ==========================================================================
cache_paths = rconst.Paths(data_dir=self.temp_data_dir) # == Eval Data =============================================================
np.random.seed(1) # ==========================================================================
with g.as_default():
num_shards = 2 input_fn = producer.make_input_fn(is_training=False)
num_items = 10 dataset = input_fn(params)
data_preprocessing.generate_train_eval_data(
df, approx_num_shards=num_shards, num_items=num_items, eval_data = self.drain_dataset(dataset=dataset, g=g)
cache_paths=cache_paths, match_mlperf=True)
raw_shards = tf.gfile.ListDirectory(cache_paths.train_shard_subdir)
assert len(raw_shards) == num_shards
sharded_eval_data = []
for i in range(2):
sharded_eval_data.append(data_async_generation._process_shard(
(os.path.join(cache_paths.train_shard_subdir, raw_shards[i]),
num_items, rconst.NUM_EVAL_NEGATIVES, stat_utils.random_int32(),
False, True)))
if sharded_eval_data[0][0][0] == 1:
# Order is not assured for this part of the pipeline.
sharded_eval_data.reverse()
eval_data = [np.concatenate([shard[i] for shard in sharded_eval_data])
for i in range(3)]
eval_data = {
movielens.USER_COLUMN: eval_data[0],
movielens.ITEM_COLUMN: eval_data[1],
}
eval_items_per_user = rconst.NUM_EVAL_NEGATIVES + 1 current_user = None
self.assertAllClose(eval_data[movielens.USER_COLUMN], md5 = hashlib.md5()
[0] * eval_items_per_user + [1] * eval_items_per_user) for features in eval_data:
data_list = [
features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
features[rconst.DUPLICATE_MASK]]
for i in data_list:
md5.update(i.tobytes())
# Each shard process should generate different random items. for idx, (u, i, d) in enumerate(zip(*data_list)):
self.assertNotAllClose( u_raw = user_inv_map[u]
eval_data[movielens.ITEM_COLUMN][:eval_items_per_user], i_raw = item_inv_map[i]
eval_data[movielens.ITEM_COLUMN][eval_items_per_user:]) if current_user is None:
current_user = u
# Ensure that users appear in blocks, as the evaluation logic expects
# this structure.
self.assertEqual(u, current_user)
# The structure of evaluation data is 999 negative examples followed
# by the holdout positive.
if not (idx + 1) % (rconst.NUM_EVAL_NEGATIVES + 1):
# Check that the last element in each chunk is the holdout item.
self.assertEqual(i_raw, self.holdout[u_raw][1])
current_user = None
elif i_raw == self.holdout[u_raw][1]:
# Because the holdout item is not given to the negative generation
# process, it can appear as a negative. In that case, it should be
# masked out as a duplicate. (Since the true positive is placed at
# the end and would therefore lose the tie.)
assert d
else:
# Otherwise check that the other 999 points for a user are selected
# from the negatives.
assert (u_raw, i_raw) not in self.seen_pairs
self.assertRegexpMatches(md5.hexdigest(), END_TO_END_EVAL_MD5)
def _test_fresh_randomness(self, constructor_type):
train_epochs = 5
params = self.make_params(train_epochs=train_epochs)
_, _, producer = data_preprocessing.instantiate_pipeline(
dataset=DATASET, data_dir=self.temp_data_dir, params=params,
constructor_type=constructor_type, deterministic=True)
producer.start()
results = []
g = tf.Graph()
with g.as_default():
for _ in range(train_epochs):
input_fn = producer.make_input_fn(is_training=True)
dataset = input_fn(params)
results.extend(self.drain_dataset(dataset=dataset, g=g))
producer.join()
assert producer._fatal_exception is None
positive_counts, negative_counts = defaultdict(int), defaultdict(int)
md5 = hashlib.md5()
for features, labels in results:
data_list = [
features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
features[rconst.VALID_POINT_MASK], labels]
for i in data_list:
md5.update(i.tobytes())
for u, i, v, l in zip(*data_list):
if not v:
continue # ignore padding
if l:
positive_counts[(u, i)] += 1
else:
negative_counts[(u, i)] += 1
self.assertRegexpMatches(md5.hexdigest(), FRESH_RANDOMNESS_MD5)
# The positive examples should appear exactly once each epoch
self.assertAllEqual(list(positive_counts.values()),
[train_epochs for _ in positive_counts])
# The threshold for the negatives is heuristic, but in general repeats are
# expected, but should not appear too frequently.
pair_cardinality = NUM_USERS * NUM_ITEMS
neg_pair_cardinality = pair_cardinality - len(self.seen_pairs)
# Approximation for the expected number of times that a particular
# negative will appear in a given epoch. Implicit in this calculation is the
# treatment of all negative pairs as equally likely. Normally this is not
# necessarily reasonable; however the generation in self.setUp() will
# approximate this behavior sufficiently for heuristic testing.
e_sample = len(self.seen_pairs) * NUM_NEG / neg_pair_cardinality
# The frequency of occurrence of a given negative pair should follow an
# approximately binomial distribution in the limit that the cardinality of
# the negative pair set >> number of samples per epoch.
approx_pdf = scipy.stats.binom.pmf(k=np.arange(train_epochs+1),
n=train_epochs, p=e_sample)
# Tally the actual observed counts.
count_distribution = [0 for _ in range(train_epochs + 1)]
for i in negative_counts.values():
i = min([i, train_epochs]) # round down tail for simplicity.
count_distribution[i] += 1
count_distribution[0] = neg_pair_cardinality - sum(count_distribution[1:])
# Check that the frequency of negative pairs is approximately binomial.
for i in range(train_epochs + 1):
if approx_pdf[i] < 0.05:
continue # Variance will be high at the tails.
observed_fraction = count_distribution[i] / neg_pair_cardinality
deviation = (2 * abs(observed_fraction - approx_pdf[i]) /
(observed_fraction + approx_pdf[i]))
self.assertLess(deviation, 0.2)
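As a rough numeric illustration of the heuristic above (the pair counts below are hypothetical, not the test's actual cardinalities): if p is the expected number of draws of a single negative pair per epoch, then its total count over train_epochs epochs is approximately Binomial(train_epochs, p).

import numpy as np
import scipy.stats

train_epochs = 5
num_neg = 4
num_positive_pairs = 10000      # hypothetical
num_negative_pairs = 1000000    # hypothetical

# Expected number of draws of any one negative pair in a single epoch.
p = num_positive_pairs * num_neg / num_negative_pairs
# approx_pdf[k]: approximate fraction of negative pairs drawn exactly k times.
approx_pdf = scipy.stats.binom.pmf(k=np.arange(train_epochs + 1),
                                   n=train_epochs, p=p)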
def test_end_to_end_materialized(self):
self._test_end_to_end("materialized")
def test_end_to_end_bisection(self):
self._test_end_to_end("bisection")
def test_fresh_randomness_materialized(self):
self._test_fresh_randomness("materialized")
def test_fresh_randomness_bisection(self):
self._test_fresh_randomness("bisection")
if __name__ == "__main__": if __name__ == "__main__":
......
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains NcfModelRunner, which can train and evaluate an NCF model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import os
import time
import tensorflow as tf
from tensorflow.contrib.compiler import xla
from official.recommendation import constants as rconst
from official.recommendation import data_preprocessing
from official.recommendation import neumf_model
class NcfModelRunner(object):
"""Creates a graph to train/evaluate an NCF model, and runs it.
This class builds both a training model and evaluation model in the graph.
The two models share variables, so that during evaluation, the trained
variables are used.
"""
# _TrainModelProperties and _EvalModelProperties store useful properties of
# the training and evaluation models, respectively.
# _SHARED_MODEL_PROPERTY_FIELDS is their shared fields.
_SHARED_MODEL_PROPERTY_FIELDS = (
# A scalar tf.string placeholder tensor, that will be fed the path to the
# directory storing the TFRecord files for the input data.
"record_files_placeholder",
# The tf.data.Iterator to iterate over the input data.
"iterator",
# A scalar float tensor representing the model loss.
"loss",
# The batch size, as a Python int.
"batch_size",
# The op to run the model. For the training model, this trains the model
# for one step. For the evaluation model, this computes the metrics and
# updates the metric variables.
"run_model_op")
_TrainModelProperties = namedtuple("_TrainModelProperties", # pylint: disable=invalid-name
_SHARED_MODEL_PROPERTY_FIELDS)
_EvalModelProperties = namedtuple( # pylint: disable=invalid-name
"_EvalModelProperties", _SHARED_MODEL_PROPERTY_FIELDS + (
# A dict from metric name to metric tensor.
"metrics",
# Initializes the metric variables.
"metric_initializer",))
def __init__(self, ncf_dataset, params, num_train_steps, num_eval_steps,
use_while_loop):
self._num_train_steps = num_train_steps
self._num_eval_steps = num_eval_steps
self._use_while_loop = use_while_loop
with tf.Graph().as_default() as self._graph:
if params["use_xla_for_gpu"]:
# The XLA functions we use require resource variables.
tf.enable_resource_variables()
self._ncf_dataset = ncf_dataset
self._global_step = tf.train.create_global_step()
self._train_model_properties = self._build_model(params, num_train_steps,
is_training=True)
self._eval_model_properties = self._build_model(params, num_eval_steps,
is_training=False)
initializer = tf.global_variables_initializer()
self._graph.finalize()
self._session = tf.Session(graph=self._graph)
self._session.run(initializer)
def _compute_metric_mean(self, metric_name):
"""Computes the mean from a call tf tf.metrics.mean().
tf.metrics.mean() already returns the mean, so normally this call is
unnecessary. But, if tf.metrics.mean() is called inside a tf.while_loop, the
mean cannot be accessed outside the while loop. Calling this function
recomputes the mean from the variables created by tf.metrics.mean(),
allowing the mean to be accessed outside the while loop.
Args:
metric_name: The string passed to the 'name' argument of tf.metrics.mean()
Returns:
The mean of the metric.
"""
metric_vars = tf.get_collection(tf.GraphKeys.METRIC_VARIABLES)
total_suffix = metric_name + "/total:0"
total_vars = [v for v in metric_vars if v.name.endswith(total_suffix)]
assert len(total_vars) == 1., (
"Found {} metric variables ending with '{}' but expected to find "
"exactly 1. All metric variables: {}".format(
len(total_vars), total_suffix, metric_vars))
total_var = total_vars[0]
count_suffix = metric_name + "/count:0"
count_vars = [v for v in metric_vars if v.name.endswith(count_suffix)]
assert len(count_vars) == 1., (
"Found {} metric variables ending with '{}' but expected to find "
"exactly 1. All metric variables: {}".format(
len(count_vars), count_suffix, metric_vars))
count_var = count_vars[0]
return total_var / count_var
def _build_model(self, params, num_steps, is_training):
"""Builds the NCF model.
Args:
params: A dict of hyperparameters.
is_training: If True, build the training model. If False, build the
evaluation model.
Returns:
A _TrainModelProperties if is_training is True, or an _EvalModelProperties
otherwise.
"""
record_files_placeholder = tf.placeholder(tf.string, ())
input_fn, _, _ = \
data_preprocessing.make_input_fn(
ncf_dataset=self._ncf_dataset, is_training=is_training,
record_files=record_files_placeholder)
dataset = input_fn(params)
iterator = dataset.make_initializable_iterator()
model_fn = neumf_model.neumf_model_fn
if params["use_xla_for_gpu"]:
model_fn = xla.estimator_model_fn(model_fn)
if is_training:
return self._build_train_specific_graph(
iterator, model_fn, params, record_files_placeholder, num_steps)
else:
return self._build_eval_specific_graph(
iterator, model_fn, params, record_files_placeholder, num_steps)
def _build_train_specific_graph(self, iterator, model_fn, params,
record_files_placeholder, num_train_steps):
"""Builds the part of the model that is specific to training."""
def build():
features, labels = iterator.get_next()
estimator_spec = model_fn(
features, labels, tf.estimator.ModeKeys.TRAIN, params)
with tf.control_dependencies([estimator_spec.train_op]):
run_model_op = self._global_step.assign_add(1)
return run_model_op, estimator_spec.loss
if self._use_while_loop:
def body(i):
run_model_op_single_step, _ = build()
with tf.control_dependencies([run_model_op_single_step]):
return i + 1
run_model_op = tf.while_loop(lambda i: i < num_train_steps, body, [0],
parallel_iterations=1)
loss = None
else:
run_model_op, loss = build()
return self._TrainModelProperties(
record_files_placeholder, iterator, loss, params["batch_size"],
run_model_op)
def _build_eval_specific_graph(self, iterator, model_fn, params,
record_files_placeholder, num_eval_steps):
"""Builds the part of the model that is specific to evaluation."""
def build():
features = iterator.get_next()
estimator_spec = model_fn(
features, None, tf.estimator.ModeKeys.EVAL, params)
run_model_op = tf.group(*(update_op for _, update_op in
estimator_spec.eval_metric_ops.values()))
eval_metric_tensors = {k: tensor for (k, (tensor, _))
in estimator_spec.eval_metric_ops.items()}
return run_model_op, estimator_spec.loss, eval_metric_tensors
if self._use_while_loop:
def body(i):
run_model_op_single_step, _, _ = build()
with tf.control_dependencies([run_model_op_single_step]):
return i + 1
run_model_op = tf.while_loop(lambda i: i < num_eval_steps, body, [0],
parallel_iterations=1)
loss = None
eval_metric_tensors = {
"HR": self._compute_metric_mean(rconst.HR_METRIC_NAME),
"NDCG": self._compute_metric_mean(rconst.NDCG_METRIC_NAME),
}
else:
run_model_op, loss, eval_metric_tensors = build()
metric_initializer = tf.variables_initializer(
tf.get_collection(tf.GraphKeys.METRIC_VARIABLES))
return self._EvalModelProperties(
record_files_placeholder, iterator, loss, params["eval_batch_size"],
run_model_op, eval_metric_tensors, metric_initializer)
def _train_or_eval(self, model_properties, num_steps, is_training):
"""Either trains or evaluates, depending on whether `is_training` is True.
Args:
model_properties: _TrainModelProperties or an _EvalModelProperties
containing the properties of the training or evaluation graph.
num_steps: The number of steps to train or evaluate for.
is_training: If True, run the training model. If False, run the evaluation
model.
Returns:
record_dir: The directory of TFRecords where the training/evaluation input
data was read from.
"""
if self._ncf_dataset is not None:
epoch_metadata, record_dir, template = data_preprocessing.get_epoch_info(
is_training=is_training, ncf_dataset=self._ncf_dataset)
batch_count = epoch_metadata["batch_count"]
if batch_count != num_steps:
raise ValueError(
"Step counts do not match. ({} vs. {}) The async process is "
"producing incorrect shards.".format(batch_count, num_steps))
record_files = os.path.join(record_dir, template.format("*"))
initializer_feed_dict = {
model_properties.record_files_placeholder: record_files}
del batch_count
else:
initializer_feed_dict = None
record_dir = None
self._session.run(model_properties.iterator.initializer,
initializer_feed_dict)
fetches = (model_properties.run_model_op,)
if model_properties.loss is not None:
fetches += (model_properties.loss,)
mode = "Train" if is_training else "Eval"
start = None
times_to_run = 1 if self._use_while_loop else num_steps
for i in range(times_to_run):
fetches_ = self._session.run(fetches)
if i % 100 == 0:
if start is None:
# Only start the timer after 100 steps so there is a warmup.
start = time.time()
start_step = i
if model_properties.loss is not None:
_, loss = fetches_
tf.logging.info("{} Loss = {}".format(mode, loss))
end = time.time()
if start is not None:
print("{} peformance: {} examples/sec".format(
mode, (i - start_step) * model_properties.batch_size / (end - start)))
return record_dir
def train(self):
"""Trains the graph for a single cycle."""
record_dir = self._train_or_eval(self._train_model_properties,
self._num_train_steps, is_training=True)
if record_dir:
# We delete the record_dir because each cycle, new TFRecords are generated
# by the async process.
tf.gfile.DeleteRecursively(record_dir)
def eval(self):
"""Evaluates the graph on the eval data.
Returns:
A dict of evaluation results.
"""
self._session.run(self._eval_model_properties.metric_initializer)
self._train_or_eval(self._eval_model_properties, self._num_eval_steps,
is_training=False)
eval_results = {
'global_step': self._session.run(self._global_step)}
for key, val in self._eval_model_properties.metrics.items():
val_ = self._session.run(val)
tf.logging.info("{} = {}".format(key, self._session.run(val)))
eval_results[key] = val_
return eval_results
@@ -24,13 +24,11 @@ import mock
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from absl import flags
from absl.testing import flagsaver from absl.testing import flagsaver
from official.recommendation import constants as rconst from official.recommendation import constants as rconst
from official.recommendation import data_preprocessing from official.recommendation import data_pipeline
from official.recommendation import neumf_model from official.recommendation import neumf_model
from official.recommendation import ncf_main from official.recommendation import ncf_main
from official.recommendation import stat_utils
NUM_TRAIN_NEG = 4 NUM_TRAIN_NEG = 4
@@ -56,6 +54,13 @@ class NcfTest(tf.test.TestCase):
top_k=rconst.TOP_K, match_mlperf=False): top_k=rconst.TOP_K, match_mlperf=False):
rconst.TOP_K = top_k rconst.TOP_K = top_k
rconst.NUM_EVAL_NEGATIVES = predicted_scores_by_user.shape[1] - 1 rconst.NUM_EVAL_NEGATIVES = predicted_scores_by_user.shape[1] - 1
batch_size = items_by_user.shape[0]
users = np.repeat(np.arange(batch_size)[:, np.newaxis],
rconst.NUM_EVAL_NEGATIVES + 1, axis=1)
users, items, duplicate_mask = \
data_pipeline.BaseDataConstructor._assemble_eval_batch(
users, items_by_user[:, -1:], items_by_user[:, :-1], batch_size)
g = tf.Graph() g = tf.Graph()
with g.as_default(): with g.as_default():
@@ -63,8 +68,7 @@ class NcfTest(tf.test.TestCase):
predicted_scores_by_user.reshape((-1, 1)), tf.float32) predicted_scores_by_user.reshape((-1, 1)), tf.float32)
softmax_logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype), softmax_logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype),
logits], axis=1) logits], axis=1)
duplicate_mask = tf.convert_to_tensor( duplicate_mask = tf.convert_to_tensor(duplicate_mask, tf.float32)
stat_utils.mask_duplicates(items_by_user, axis=1), tf.float32)
metric_ops = neumf_model.compute_eval_loss_and_metrics( metric_ops = neumf_model.compute_eval_loss_and_metrics(
logits=logits, softmax_logits=softmax_logits, logits=logits, softmax_logits=softmax_logits,
@@ -81,21 +85,19 @@ class NcfTest(tf.test.TestCase):
sess.run(init) sess.run(init)
return sess.run([hr[1], ndcg[1]]) return sess.run([hr[1], ndcg[1]])
def test_hit_rate_and_ndcg(self): def test_hit_rate_and_ndcg(self):
# Test with no duplicate items # Test with no duplicate items
predictions = np.array([ predictions = np.array([
[1., 2., 0.], # In top 2 [2., 0., 1.], # In top 2
[2., 1., 0.], # In top 1 [1., 0., 2.], # In top 1
[0., 2., 1.], # In top 3 [2., 1., 0.], # In top 3
[2., 3., 4.] # In top 3 [3., 4., 2.] # In top 3
]) ])
items = np.array([ items = np.array([
[1, 2, 3],
[2, 3, 1], [2, 3, 1],
[3, 2, 1], [3, 1, 2],
[2, 1, 3], [2, 1, 3],
[1, 3, 2],
]) ])
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1) hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1)
@@ -130,16 +132,16 @@ class NcfTest(tf.test.TestCase):
# Test with duplicate items. In the MLPerf case, we treat the duplicates as # Test with duplicate items. In the MLPerf case, we treat the duplicates as
# a single item. Otherwise, we treat the duplicates as separate items. # a single item. Otherwise, we treat the duplicates as separate items.
predictions = np.array([ predictions = np.array([
[1., 2., 2., 3.], # In top 4. MLPerf: In top 3 [2., 2., 3., 1.], # In top 4. MLPerf: In top 3
[3., 1., 0., 2.], # In top 1. MLPerf: In top 1 [1., 0., 2., 3.], # In top 1. MLPerf: In top 1
[0., 2., 3., 2.], # In top 4. MLPerf: In top 3 [2., 3., 2., 0.], # In top 4. MLPerf: In top 3
[3., 2., 4., 2.] # In top 2. MLPerf: In top 2 [2., 4., 2., 3.] # In top 2. MLPerf: In top 2
]) ])
items = np.array([ items = np.array([
[1, 2, 2, 3], [2, 2, 3, 1],
[1, 2, 3, 4], [2, 3, 4, 1],
[1, 2, 3, 2], [2, 3, 2, 1],
[4, 3, 2, 1], [3, 2, 1, 4],
]) ])
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1) hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1)
self.assertAlmostEqual(hr, 1 / 4) self.assertAlmostEqual(hr, 1 / 4)
@@ -180,59 +182,6 @@ class NcfTest(tf.test.TestCase):
self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3) + self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3) +
2 * math.log(2) / math.log(4)) / 4) 2 * math.log(2) / math.log(4)) / 4)
# Test with duplicate items, where the predictions for the same item can
# differ. In the MLPerf case, we should take the first prediction.
predictions = np.array([
[3., 2., 4., 4.], # In top 3. MLPerf: In top 2
[3., 4., 2., 4.], # In top 3. MLPerf: In top 3
[2., 3., 4., 1.], # In top 3. MLPerf: In top 2
[4., 3., 5., 2.] # In top 2. MLPerf: In top 1
])
items = np.array([
[1, 2, 2, 3],
[4, 3, 3, 2],
[2, 1, 1, 1],
[4, 2, 2, 1],
])
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1)
self.assertAlmostEqual(hr, 0 / 4)
self.assertAlmostEqual(ndcg, 0 / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 2)
self.assertAlmostEqual(hr, 1 / 4)
self.assertAlmostEqual(ndcg, (math.log(2) / math.log(3)) / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 3)
self.assertAlmostEqual(hr, 4 / 4)
self.assertAlmostEqual(ndcg, (math.log(2) / math.log(3) +
3 * math.log(2) / math.log(4)) / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 4)
self.assertAlmostEqual(hr, 4 / 4)
self.assertAlmostEqual(ndcg, (math.log(2) / math.log(3) +
3 * math.log(2) / math.log(4)) / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1,
match_mlperf=True)
self.assertAlmostEqual(hr, 1 / 4)
self.assertAlmostEqual(ndcg, 1 / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 2,
match_mlperf=True)
self.assertAlmostEqual(hr, 3 / 4)
self.assertAlmostEqual(ndcg, (1 + 2 * math.log(2) / math.log(3)) / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 3,
match_mlperf=True)
self.assertAlmostEqual(hr, 4 / 4)
self.assertAlmostEqual(ndcg, (1 + 2 * math.log(2) / math.log(3) +
math.log(2) / math.log(4)) / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 4,
match_mlperf=True)
self.assertAlmostEqual(hr, 4 / 4)
self.assertAlmostEqual(ndcg, (1 + 2 * math.log(2) / math.log(3) +
math.log(2) / math.log(4)) / 4)
_BASE_END_TO_END_FLAGS = { _BASE_END_TO_END_FLAGS = {
"batch_size": 1024, "batch_size": 1024,
@@ -241,33 +190,15 @@ class NcfTest(tf.test.TestCase):
} }
@flagsaver.flagsaver(**_BASE_END_TO_END_FLAGS) @flagsaver.flagsaver(**_BASE_END_TO_END_FLAGS)
@mock.patch.object(data_preprocessing, "SYNTHETIC_BATCHES_PER_EPOCH", 100) @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end(self): def test_end_to_end(self):
ncf_main.main(None) ncf_main.main(None)
@flagsaver.flagsaver(ml_perf=True, **_BASE_END_TO_END_FLAGS) @flagsaver.flagsaver(ml_perf=True, **_BASE_END_TO_END_FLAGS)
@mock.patch.object(data_preprocessing, "SYNTHETIC_BATCHES_PER_EPOCH", 100) @mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_mlperf(self): def test_end_to_end_mlperf(self):
ncf_main.main(None) ncf_main.main(None)
@flagsaver.flagsaver(use_estimator=False, **_BASE_END_TO_END_FLAGS)
@mock.patch.object(data_preprocessing, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_no_estimator(self):
ncf_main.main(None)
flags.FLAGS.ml_perf = True
ncf_main.main(None)
@flagsaver.flagsaver(use_estimator=False, **_BASE_END_TO_END_FLAGS)
@mock.patch.object(data_preprocessing, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_while_loop(self):
# We cannot set use_while_loop = True in the flagsaver constructor, because
# if the flagsaver sets it to True before setting use_estimator to False,
# the flag validator will throw an error.
flags.FLAGS.use_while_loop = True
ncf_main.main(None)
flags.FLAGS.ml_perf = True
ncf_main.main(None)
if __name__ == "__main__": if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO) tf.logging.set_verbosity(tf.logging.INFO)
......
@@ -76,44 +76,24 @@ def neumf_model_fn(features, labels, mode, params):
tf.set_random_seed(stat_utils.random_int32()) tf.set_random_seed(stat_utils.random_int32())
users = features[movielens.USER_COLUMN] users = features[movielens.USER_COLUMN]
items = tf.cast(features[movielens.ITEM_COLUMN], tf.int32) items = features[movielens.ITEM_COLUMN]
keras_model = params.get("keras_model") logits = construct_model(users, items, params).output
if keras_model:
logits = keras_model([users, items],
training=mode == tf.estimator.ModeKeys.TRAIN)
else:
keras_model = construct_model(users=users, items=items, params=params)
logits = keras_model.output
if not params["use_estimator"] and "keras_model" not in params:
# When we are not using estimator, we need to reuse the Keras model when
# this model_fn is called again, so that the variables are shared between
# training and eval. So we mutate params to add the Keras model.
params["keras_model"] = keras_model
# Softmax with the first column of zeros is equivalent to sigmoid. # Softmax with the first column of zeros is equivalent to sigmoid.
softmax_logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype), softmax_logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype),
logits], axis=1) logits], axis=1)
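A quick numerical check of that comment (purely illustrative, in NumPy rather than the model's TF tensors): prepending a column of zeros and taking a softmax reproduces the sigmoid of the original logits.

import numpy as np

z = np.array([[-1.3], [0.2], [2.5]])                  # example logits, shape (batch, 1)
two_col = np.concatenate([np.zeros_like(z), z], axis=1)
softmax = np.exp(two_col) / np.exp(two_col).sum(axis=1, keepdims=True)
sigmoid = 1.0 / (1.0 + np.exp(-z))
assert np.allclose(softmax[:, 1:], sigmoid)           # softmax([0, z])[1] == sigmoid(z)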
if mode == tf.estimator.ModeKeys.PREDICT: if mode == tf.estimator.ModeKeys.EVAL:
predictions = {
movielens.ITEM_COLUMN: items,
movielens.RATING_COLUMN: logits,
}
if params["use_tpu"]:
return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, predictions=predictions)
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
elif mode == tf.estimator.ModeKeys.EVAL:
duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32) duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
return compute_eval_loss_and_metrics( return compute_eval_loss_and_metrics(
logits, softmax_logits, duplicate_mask, params["num_neg"], logits, softmax_logits, duplicate_mask, params["num_neg"],
params["match_mlperf"], params["match_mlperf"],
use_tpu_spec=params["use_tpu"] or params["use_xla_for_gpu"]) use_tpu_spec=params["use_xla_for_gpu"])
elif mode == tf.estimator.ModeKeys.TRAIN: elif mode == tf.estimator.ModeKeys.TRAIN:
labels = tf.cast(labels, tf.int32) labels = tf.cast(labels, tf.int32)
valid_pt_mask = features[rconst.VALID_POINT_MASK]
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam") mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR, mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR,
@@ -135,7 +115,8 @@ def neumf_model_fn(features, labels, mode, params):
value=mlperf_helper.TAGS.BCE) value=mlperf_helper.TAGS.BCE)
loss = tf.losses.sparse_softmax_cross_entropy( loss = tf.losses.sparse_softmax_cross_entropy(
labels=labels, labels=labels,
logits=softmax_logits logits=softmax_logits,
weights=tf.cast(valid_pt_mask, tf.float32)
) )
# This tensor is used by logging hooks. # This tensor is used by logging hooks.
@@ -151,9 +132,6 @@ def neumf_model_fn(features, labels, mode, params):
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
train_op = tf.group(minimize_op, update_ops) train_op = tf.group(minimize_op, update_ops)
if params["use_tpu"]:
return tf.contrib.tpu.TPUEstimatorSpec(
mode=mode, loss=loss, train_op=train_op)
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
else: else:
@@ -161,21 +139,18 @@ def neumf_model_fn(features, labels, mode, params):
def construct_model(users, items, params): def construct_model(users, items, params):
# type: (tf.Tensor, tf.Tensor, dict) -> tf.Tensor # type: (tf.Tensor, tf.Tensor, dict) -> tf.keras.Model
"""Initialize NeuMF model. """Initialize NeuMF model.
Args: Args:
users: Tensor of user ids. users: Tensor of user ids.
items: Tensor of item ids. items: Tensor of item ids.
params: Dict of hyperparameters. params: Dict of hyperparameters.
Raises: Raises:
ValueError: if the first model layer is not even. ValueError: if the first model layer is not even.
Returns: Returns:
logits: network logits model: a keras Model for computing the logits
""" """
num_users = params["num_users"] num_users = params["num_users"]
num_items = params["num_items"] num_items = params["num_items"]
@@ -194,82 +169,39 @@ def construct_model(users, items, params):
raise ValueError("The first layer size should be multiple of 2!") raise ValueError("The first layer size should be multiple of 2!")
# Input variables # Input variables
user_input = tf.keras.layers.Input(tensor=users) user_input = tf.keras.layers.Input(tensor=users, name="user_input")
item_input = tf.keras.layers.Input(tensor=items) item_input = tf.keras.layers.Input(tensor=items, name="item_input")
batch_size = user_input.get_shape()[0]
# Initializer for embedding layers
if params["use_tpu"]: embedding_initializer = "glorot_uniform"
with tf.variable_scope("embed_weights", reuse=tf.AUTO_REUSE):
cmb_embedding_user = tf.get_variable( # It turns out to be significantly more efficient to store the MF and MLP
name="embeddings_mf_user", # embedding portions in the same table, and then slice as needed.
shape=[num_users, mf_dim + model_layers[0] // 2], mf_slice_fn = lambda x: x[:, :mf_dim]
initializer=tf.glorot_uniform_initializer()) mlp_slice_fn = lambda x: x[:, mf_dim:]
embedding_user = tf.keras.layers.Embedding(
cmb_embedding_item = tf.get_variable( num_users, mf_dim + model_layers[0] // 2,
name="embeddings_mf_item", embeddings_initializer=embedding_initializer,
shape=[num_items, mf_dim + model_layers[0] // 2], embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
initializer=tf.glorot_uniform_initializer()) input_length=1, name="embedding_user")(user_input)
cmb_user_latent = tf.keras.layers.Lambda(lambda ids: tf.gather( embedding_item = tf.keras.layers.Embedding(
cmb_embedding_user, ids))(user_input) num_items, mf_dim + model_layers[0] // 2,
embeddings_initializer=embedding_initializer,
cmb_item_latent = tf.keras.layers.Lambda(lambda ids: tf.gather( embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
cmb_embedding_item, ids))(item_input) input_length=1, name="embedding_item")(item_input)
mlp_user_latent = tf.keras.layers.Lambda( # GMF part
lambda x: tf.slice(x, [0, 0], [batch_size, model_layers[0] // 2]) mf_user_latent = tf.keras.layers.Lambda(
)(cmb_user_latent) mf_slice_fn, name="embedding_user_mf")(embedding_user)
mf_item_latent = tf.keras.layers.Lambda(
mlp_item_latent = tf.keras.layers.Lambda( mf_slice_fn, name="embedding_item_mf")(embedding_item)
lambda x: tf.slice(x, [0, 0], [batch_size, model_layers[0] // 2])
)(cmb_item_latent) # MLP part
mlp_user_latent = tf.keras.layers.Lambda(
mf_user_latent = tf.keras.layers.Lambda( mlp_slice_fn, name="embedding_user_mlp")(embedding_user)
lambda x: tf.slice(x, [0, model_layers[0] // 2], [batch_size, mf_dim]) mlp_item_latent = tf.keras.layers.Lambda(
)(cmb_user_latent) mlp_slice_fn, name="embedding_item_mlp")(embedding_item)
mf_item_latent = tf.keras.layers.Lambda(
lambda x: tf.slice(x, [0, model_layers[0] // 2], [batch_size, mf_dim])
)(cmb_item_latent)
else:
# Initializer for embedding layers
embedding_initializer = "glorot_uniform"
# Embedding layers of GMF and MLP
mf_embedding_user = tf.keras.layers.Embedding(
num_users,
mf_dim,
embeddings_initializer=embedding_initializer,
embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
input_length=1)
mf_embedding_item = tf.keras.layers.Embedding(
num_items,
mf_dim,
embeddings_initializer=embedding_initializer,
embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
input_length=1)
mlp_embedding_user = tf.keras.layers.Embedding(
num_users,
model_layers[0]//2,
embeddings_initializer=embedding_initializer,
embeddings_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[0]),
input_length=1)
mlp_embedding_item = tf.keras.layers.Embedding(
num_items,
model_layers[0]//2,
embeddings_initializer=embedding_initializer,
embeddings_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[0]),
input_length=1)
# GMF part
mf_user_latent = mf_embedding_user(user_input)
mf_item_latent = mf_embedding_item(item_input)
# MLP part
mlp_user_latent = mlp_embedding_user(user_input)
mlp_item_latent = mlp_embedding_item(item_input)
# Element-wise multiply # Element-wise multiply
mf_vector = tf.keras.layers.multiply([mf_user_latent, mf_item_latent]) mf_vector = tf.keras.layers.multiply([mf_user_latent, mf_item_latent])
@@ -352,7 +284,7 @@ def compute_eval_loss_and_metrics(logits, # type: tf.Tensor
Args: Args:
logits: A tensor containing the predicted logits for each user. The shape logits: A tensor containing the predicted logits for each user. The shape
of logits is (num_users_per_batch * (1 + NUM_EVAL_NEGATIVES),) Logits of logits is (num_users_per_batch * (1 + NUM_EVAL_NEGATIVES),) Logits
for a user are grouped, and the first element of the group is the true for a user are grouped, and the last element of the group is the true
element. element.
softmax_logits: The same tensor, but with zeros left-appended. softmax_logits: The same tensor, but with zeros left-appended.
@@ -377,9 +309,9 @@ def compute_eval_loss_and_metrics(logits, # type: tf.Tensor
# Examples are provided by the eval Dataset in a structured format, so eval # Examples are provided by the eval Dataset in a structured format, so eval
# labels can be reconstructed on the fly. # labels can be reconstructed on the fly.
eval_labels = tf.reshape(tf.one_hot( eval_labels = tf.reshape(shape=(-1,), tensor=tf.one_hot(
tf.zeros(shape=(logits_by_user.shape[0],), dtype=tf.int32), tf.zeros(shape=(logits_by_user.shape[0],), dtype=tf.int32) +
logits_by_user.shape[1], dtype=tf.int32), (-1,)) rconst.NUM_EVAL_NEGATIVES, logits_by_user.shape[1], dtype=tf.int32))
eval_labels_float = tf.cast(eval_labels, tf.float32) eval_labels_float = tf.cast(eval_labels, tf.float32)
@@ -463,7 +395,8 @@ def compute_top_k_and_ndcg(logits, # type: tf.Tensor
# perform matrix multiplications very quickly. This is similar to np.argwhere. # perform matrix multiplications very quickly. This is similar to np.argwhere.
# However this is a special case because the target will only appear in # However this is a special case because the target will only appear in
# sort_indices once. # sort_indices once.
one_hot_position = tf.cast(tf.equal(sort_indices, 0), tf.int32) one_hot_position = tf.cast(tf.equal(sort_indices, rconst.NUM_EVAL_NEGATIVES),
tf.int32)
sparse_positions = tf.multiply( sparse_positions = tf.multiply(
one_hot_position, tf.range(logits_by_user.shape[1])[tf.newaxis, :]) one_hot_position, tf.range(logits_by_user.shape[1])[tf.newaxis, :])
position_vector = tf.reduce_sum(sparse_positions, axis=1) position_vector = tf.reduce_sum(sparse_positions, axis=1)
......
@@ -16,21 +16,45 @@
import contextlib import contextlib
import multiprocessing import multiprocessing
import os import multiprocessing.pool
import sys
_PYTHON = sys.executable def get_forkpool(num_workers, init_worker=None, closing=True):
if not _PYTHON: pool = multiprocessing.Pool(processes=num_workers, initializer=init_worker)
raise RuntimeError("Could not find path to Python interpreter in order to " return contextlib.closing(pool) if closing else pool
"spawn subprocesses.")
_ASYNC_GEN_PATH = os.path.join(os.path.dirname(__file__),
"data_async_generation.py")
INVOCATION = [_PYTHON, _ASYNC_GEN_PATH] def get_threadpool(num_workers, init_worker=None, closing=True):
pool = multiprocessing.pool.ThreadPool(processes=num_workers,
initializer=init_worker)
return contextlib.closing(pool) if closing else pool
def get_pool(num_workers, init_worker=None): class FauxPool(object):
return contextlib.closing(multiprocessing.Pool( """Mimic a pool using for loops.
processes=num_workers, initializer=init_worker))
This class is used in place of proper pools when true determinism is desired
for testing or debugging.
"""
def __init__(self, *args, **kwargs):
pass
def map(self, func, iterable, chunksize=None):
return [func(i) for i in iterable]
def imap(self, func, iterable, chunksize=1):
for i in iterable:
yield func(i)
def close(self):
pass
def terminate(self):
pass
def join(self):
pass
def get_fauxpool(num_workers, init_worker=None, closing=True):
pool = FauxPool(processes=num_workers, initializer=init_worker)
return contextlib.closing(pool) if closing else pool
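A minimal usage sketch (the square helper is hypothetical; it assumes popen_helper is importable as in the tests): the fauxpool exposes the same map/imap surface as the real pools but runs everything serially in-process, which is why it stays deterministic and plays nicely with TestCase threading.

from official.recommendation import popen_helper

def square(x):
  return x * x

with popen_helper.get_fauxpool(num_workers=4) as pool:
  results = pool.map(square, range(8))   # runs in a plain for loop, fully deterministic
# contextlib.closing calls pool.close() on exit; FauxPool.close() is a no-op.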
@@ -27,7 +27,7 @@ mkdir -p ${LOCAL_TEST_DIR}
TPU=${TPU:-""} TPU=${TPU:-""}
if [[ -z ${TPU} ]]; then if [[ -z ${TPU} ]]; then
DEVICE_FLAG="--num_gpus -1 --use_xla_for_gpu" DEVICE_FLAG="--num_gpus -1" # --use_xla_for_gpu"
else else
DEVICE_FLAG="--tpu ${TPU} --num_gpus 0" DEVICE_FLAG="--tpu ${TPU} --num_gpus 0"
fi fi
@@ -54,25 +54,25 @@ do
# To reduce variation set the seed flag: # To reduce variation set the seed flag:
# --seed ${i} # --seed ${i}
#
# And to confirm that the pipeline is deterministic pass the flag: python -u ncf_main.py \
# --hash_pipeline --model_dir ${MODEL_DIR} \
# --data_dir ${DATA_DIR} \
# (`--hash_pipeline` will slow down training, though not as much as one might imagine.) --dataset ${DATASET} --hooks "" \
python ncf_main.py --model_dir ${MODEL_DIR} \ ${DEVICE_FLAG} \
--data_dir ${DATA_DIR} \ --clean \
--dataset ${DATASET} --hooks "" \ --train_epochs 14 \
${DEVICE_FLAG} \ --batch_size 98304 \
--clean \ --eval_batch_size 160000 \
--train_epochs 20 \ --learning_rate 0.00382059 \
--batch_size 2048 \ --beta1 0.783529 \
--eval_batch_size 100000 \ --beta2 0.909003 \
--learning_rate 0.0005 \ --epsilon 1.45439e-07 \
--layers 256,256,128,64 --num_factors 64 \ --layers 256,256,128,64 --num_factors 64 \
--hr_threshold 0.635 \ --hr_threshold 0.635 \
--ml_perf \ --ml_perf \
|& tee ${RUN_LOG} \ |& tee ${RUN_LOG} \
| grep --line-buffered -E --regexp="(Iteration [0-9]+: HR = [0-9\.]+, NDCG = [0-9\.]+)|(pipeline_hash)|(MLPerf time:)" | grep --line-buffered -E --regexp="(Iteration [0-9]+: HR = [0-9\.]+, NDCG = [0-9\.]+, Loss = [0-9\.]+)|(pipeline_hash)|(MLPerf time:)"
END_TIME=$(date +%s) END_TIME=$(date +%s)
echo "Run ${i} complete: $(( $END_TIME - $START_TIME )) seconds." echo "Run ${i} complete: $(( $END_TIME - $START_TIME )) seconds."
......
@@ -18,71 +18,45 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import os
import numpy as np import numpy as np
def random_int32(): def random_int32():
return np.random.randint(low=0, high=np.iinfo(np.int32).max, dtype=np.int32) return np.random.randint(low=0, high=np.iinfo(np.int32).max, dtype=np.int32)
def sample_with_exclusion(num_items, positive_set, n, replacement=True):
# type: (int, typing.Iterable, int, bool) -> list
"""Vectorized negative sampling.
This function samples from the positive set's conjugate, both with and def permutation(args):
without replacement. """Fork safe permutation function.
Performance: This function can be called within a multiprocessing worker and give
This algorithm generates a vector of candidate values based on the expected appropriately random results.
number needed such that at least k are not in the positive set, where k
is the number of false negatives still needed. An additional factor of
safety of 1.2 is used during the generation to minimize the chance of having
to perform another generation cycle.
While this approach generates more values than needed and then discards some
of them, vectorized generation is inexpensive and turns out to be much
faster than generating points one at a time. (And it defers quite a bit
of work to NumPy which has much better multi-core utilization than native
Python.)
Args: Args:
num_items: The cardinality of the entire set of items. args: A size two tuple that will be unpacked into the size of the permutation
positive_set: The set of positive items which should not be included as and the random seed. This form is used because starmap is not universally
negatives. available.
n: The number of negatives to generate.
replacement: Whether to sample with (True) or without (False) replacement.
Returns: returns:
A list of generated negatives. A NumPy array containing a random permutation.
""" """
x, seed = args
# If seed is None NumPy will seed randomly.
state = np.random.RandomState(seed=seed) # pylint: disable=no-member
output = np.arange(x, dtype=np.int32)
state.shuffle(output)
return output
def very_slightly_biased_randint(max_val_vector):
sample_dtype = np.uint64
out_dtype = max_val_vector.dtype
samples = np.random.randint(low=0, high=np.iinfo(sample_dtype).max,
size=max_val_vector.shape, dtype=sample_dtype)
return np.mod(samples, max_val_vector.astype(sample_dtype)).astype(out_dtype)
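The "very slight" bias comes from taking a uint64 sample modulo max_val: unless the sampler's range is an exact multiple of max_val, the smallest residues are marginally over-represented. A toy version of the same effect, shrinking the sample space from 2**64 to 16 so the imbalance is visible:

import numpy as np

samples = np.arange(16)               # pretend the sampler covered 0..15 uniformly
counts = np.bincount(samples % 3)     # counts == [6, 5, 5]
# Residue 0 is slightly over-represented; with a uint64 sample space the relative
# excess is on the order of max_val / 2**64, which is negligible for int32 ranges.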
if not isinstance(positive_set, set):
positive_set = set(positive_set)
p = 1 - len(positive_set) / num_items
n_attempt = int(n * (1 / p) * 1.2) # factor of 1.2 for safety
# If sampling is performed with replacement, candidates are appended.
# Otherwise, they should be added with a set union to remove duplicates.
if replacement:
negatives = []
else:
negatives = set()
while len(negatives) < n:
negative_candidates = np.random.randint(
low=0, high=num_items, size=(n_attempt,))
if replacement:
negatives.extend(
[i for i in negative_candidates if i not in positive_set]
)
else:
negatives |= (set(negative_candidates) - positive_set)
if not replacement:
negatives = list(negatives)
np.random.shuffle(negatives) # list(set(...)) is not order guaranteed, but
# in practice tends to be quite ordered.
return negatives[:n]
def mask_duplicates(x, axis=1): # type: (np.ndarray, int) -> np.ndarray def mask_duplicates(x, axis=1): # type: (np.ndarray, int) -> np.ndarray
"""Identify duplicates from sampling with replacement. """Identify duplicates from sampling with replacement.
......
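The body of mask_duplicates is collapsed above. Purely as an illustration of the idea (a sketch, not necessarily this module's implementation): duplicates along an axis can be flagged by sorting, comparing neighbours, and scattering the flags back, so that every occurrence after the first is marked.

import numpy as np

def mask_duplicates_sketch(x, axis=1):
  """Mark repeated values along `axis` with 1 (first occurrences stay 0)."""
  order = np.argsort(x, axis=axis, kind="stable")
  sorted_x = np.take_along_axis(x, order, axis=axis)

  # In sorted order, a value equal to its predecessor is a duplicate.
  dup_sorted = np.zeros_like(x)
  tail = [slice(None)] * x.ndim
  tail[axis] = slice(1, None)
  head = [slice(None)] * x.ndim
  head[axis] = slice(None, -1)
  dup_sorted[tuple(tail)] = (
      sorted_x[tuple(tail)] == sorted_x[tuple(head)]).astype(x.dtype)

  # Scatter the duplicate flags back to the original (unsorted) positions.
  out = np.empty_like(x)
  np.put_along_axis(out, order, dup_sorted, axis=axis)
  return out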
@@ -2,9 +2,10 @@ google-api-python-client>=1.6.7
google-cloud-bigquery>=0.31.0 google-cloud-bigquery>=0.31.0
kaggle>=1.3.9 kaggle>=1.3.9
mlperf_compliance==0.0.10 mlperf_compliance==0.0.10
numpy numpy>=1.15.4
oauth2client>=4.1.2 oauth2client>=4.1.2
pandas pandas>=0.22.0
psutil>=5.4.3 psutil>=5.4.3
py-cpuinfo>=3.3.0 py-cpuinfo>=3.3.0
scipy>=0.19.1
typing typing
@@ -228,5 +228,3 @@ class DummyContextManager(object):
def __exit__(self, *args): def __exit__(self, *args):
pass pass
@@ -34,7 +34,7 @@ import typing
import tensorflow as tf import tensorflow as tf
_MIN_VERSION = (0, 0, 6) _MIN_VERSION = (0, 0, 10)
_STACK_OFFSET = 2 _STACK_OFFSET = 2
SUDO = "sudo" if os.geteuid() else "" SUDO = "sudo" if os.geteuid() else ""
@@ -186,60 +186,6 @@ def clear_system_caches():
raise ValueError("Failed to clear caches") raise ValueError("Failed to clear caches")
def stitch_ncf():
"""Format NCF logs for MLPerf compliance."""
if not LOGGER.enabled:
return
if LOGGER.log_file is None or not tf.gfile.Exists(LOGGER.log_file):
tf.logging.warning("Could not find log file to stitch.")
return
log_lines = []
num_eval_users = None
start_time = None
stop_time = None
with tf.gfile.Open(LOGGER.log_file, "r") as f:
for line in f:
parsed_line = parse_line(line)
if not parsed_line:
tf.logging.warning("Failed to parse line: {}".format(line))
continue
log_lines.append(parsed_line)
if parsed_line.tag == TAGS.RUN_START:
assert start_time is None
start_time = float(parsed_line.timestamp)
if parsed_line.tag == TAGS.RUN_STOP:
assert stop_time is None
stop_time = float(parsed_line.timestamp)
if (parsed_line.tag == TAGS.EVAL_HP_NUM_USERS and parsed_line.value
is not None and "DEFERRED" not in parsed_line.value):
assert num_eval_users is None or num_eval_users == parsed_line.value
num_eval_users = parsed_line.value
log_lines.pop()
for i, parsed_line in enumerate(log_lines):
if parsed_line.tag == TAGS.EVAL_HP_NUM_USERS:
log_lines[i] = ParsedLine(*parsed_line[:-1], value=num_eval_users)
log_lines = sorted([unparse_line(i) for i in log_lines])
output_path = os.getenv("STITCHED_COMPLIANCE_FILE", None)
if output_path:
with tf.gfile.Open(output_path, "w") as f:
for line in log_lines:
f.write(line + "\n")
else:
for line in log_lines:
print(line)
sys.stdout.flush()
if start_time is not None and stop_time is not None:
tf.logging.info("MLPerf time: {:.1f} sec.".format(stop_time - start_time))
if __name__ == "__main__": if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO) tf.logging.set_verbosity(tf.logging.INFO)
with LOGGER(True): with LOGGER(True):
......
@@ -146,6 +146,10 @@ no-space-check=
# else. # else.
single-line-if-stmt=yes single-line-if-stmt=yes
# Allow URLs and comment type annotations to exceed the max line length as neither can be easily
# split across lines.
ignore-long-lines=^\s*(?:(# )?<?https?://\S+>?$|# type:)
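For example (the URL below is only an illustration), lines of these two shapes would be exempted from the line-length check by that regex:

# type: (tf.data.Dataset, tf.Graph) -> list
# https://example.com/a/very/long/documentation/url/that/would/exceed/the/line/limit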
[VARIABLES] [VARIABLES]
......