Unverified commit 8ff9eb54, authored by Shining Sun and committed by GitHub

Remove contrib imports, or move them inline (#6591)

* Remove contrib imports, or move them inline

* Use exposed API for FixedLenFeature

* Replace tf.logging with absl logging

* Change GFile to v2 APIs

* Replace tf.logging with absl logging in movielens

* Fixing an import bug

* Change gfile to v2 APIs in code

* Swap to keras optimizer v2

* Bug fix for optimizer

* Change tf.log to tf.keras.backend.log

* Change the loss function to keras loss

* Convert another loss to keras loss

* Resolve comments and fix lint

* Add a doc string

* Fix existing tests and add new tests for DS

* Added tests for multi-replica

* Fix lint

* Resolve comments

* Make estimator run in TF 2.0

* Use compat v1 loss

* Fix lint issue
parent 139dd8e9
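
The bullet list above reduces to a small set of one-for-one API swaps that repeat throughout the diff below. A minimal sketch of the pattern, assuming TensorFlow 1.13+ (where `tf.io.gfile` is available) with `absl-py` installed; the path and file names here are placeholders for illustration, not anything from the repository:

```python
import os

from absl import logging
import tensorflow as tf

data_dir = "/tmp/movielens"  # hypothetical path, for illustration only

# File system: tf.gfile.* (removed in TF 2.x) -> tf.io.gfile.*
tf.io.gfile.makedirs(data_dir)                      # was tf.gfile.MakeDirs
ratings = os.path.join(data_dir, "ratings.csv")
with tf.io.gfile.GFile(ratings, "w") as f:          # was tf.gfile.Open
  f.write("user_id,item_id,rating,timestamp\n")
if tf.io.gfile.exists(ratings):                     # was tf.gfile.Exists
  print(tf.io.gfile.listdir(data_dir))              # was tf.gfile.ListDirectory
tf.io.gfile.rmtree(data_dir)                        # was tf.gfile.DeleteRecursively

# Logging: tf.logging.* (removed in TF 2.x) -> absl logging
logging.set_verbosity(logging.INFO)                 # was tf.logging.set_verbosity
logging.info("MovieLens scratch directory cleaned up.")  # was tf.logging.info
```
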
......@@ -33,6 +33,7 @@ import six
from six.moves import urllib # pylint: disable=redefined-builtin
from absl import app as absl_app
from absl import flags
from absl import logging
import tensorflow as tf
# pylint: enable=g-bad-import-order
......@@ -100,10 +101,10 @@ def _download_and_clean(dataset, data_dir):
expected_files = ["{}.zip".format(dataset), RATINGS_FILE, MOVIES_FILE]
tf.gfile.MakeDirs(data_subdir)
tf.io.gfile.makedirs(data_subdir)
if set(expected_files).intersection(
tf.gfile.ListDirectory(data_subdir)) == set(expected_files):
tf.logging.info("Dataset {} has already been downloaded".format(dataset))
tf.io.gfile.listdir(data_subdir)) == set(expected_files):
logging.info("Dataset {} has already been downloaded".format(dataset))
return
url = "{}{}.zip".format(_DATA_URL, dataset)
......@@ -114,9 +115,9 @@ def _download_and_clean(dataset, data_dir):
zip_path, _ = urllib.request.urlretrieve(url, zip_path)
statinfo = os.stat(zip_path)
# A new line to clear the carriage return from download progress
# tf.logging.info is not applicable here
# logging.info is not applicable here
print()
tf.logging.info(
logging.info(
"Successfully downloaded {} {} bytes".format(
zip_path, statinfo.st_size))
......@@ -127,16 +128,16 @@ def _download_and_clean(dataset, data_dir):
else:
_regularize_20m_dataset(temp_dir)
for fname in tf.gfile.ListDirectory(temp_dir):
if not tf.gfile.Exists(os.path.join(data_subdir, fname)):
tf.gfile.Copy(os.path.join(temp_dir, fname),
os.path.join(data_subdir, fname))
for fname in tf.io.gfile.listdir(temp_dir):
if not tf.io.gfile.exists(os.path.join(data_subdir, fname)):
tf.io.gfile.copy(os.path.join(temp_dir, fname),
os.path.join(data_subdir, fname))
else:
tf.logging.info("Skipping copy of {}, as it already exists in the "
"destination folder.".format(fname))
logging.info("Skipping copy of {}, as it already exists in the "
"destination folder.".format(fname))
finally:
tf.gfile.DeleteRecursively(temp_dir)
tf.io.gfile.rmtree(temp_dir)
def _transform_csv(input_path, output_path, names, skip_first, separator=","):
......@@ -152,8 +153,8 @@ def _transform_csv(input_path, output_path, names, skip_first, separator=","):
if six.PY2:
names = [n.decode("utf-8") for n in names]
with tf.gfile.Open(output_path, "wb") as f_out, \
tf.gfile.Open(input_path, "rb") as f_in:
with tf.io.gfile.GFile(output_path, "wb") as f_out, \
tf.io.gfile.GFile(input_path, "rb") as f_in:
# Write column names to the csv.
f_out.write(",".join(names).encode("utf-8"))
......@@ -199,7 +200,7 @@ def _regularize_1m_dataset(temp_dir):
output_path=os.path.join(temp_dir, MOVIES_FILE),
names=MOVIE_COLUMNS, skip_first=False, separator="::")
tf.gfile.DeleteRecursively(working_dir)
tf.io.gfile.rmtree(working_dir)
def _regularize_20m_dataset(temp_dir):
......@@ -233,7 +234,7 @@ def _regularize_20m_dataset(temp_dir):
output_path=os.path.join(temp_dir, MOVIES_FILE),
names=MOVIE_COLUMNS, skip_first=True, separator=",")
tf.gfile.DeleteRecursively(working_dir)
tf.io.gfile.rmtree(working_dir)
def download(dataset, data_dir):
......@@ -244,14 +245,14 @@ def download(dataset, data_dir):
def ratings_csv_to_dataframe(data_dir, dataset):
with tf.gfile.Open(os.path.join(data_dir, dataset, RATINGS_FILE)) as f:
with tf.io.gfile.GFile(os.path.join(data_dir, dataset, RATINGS_FILE)) as f:
return pd.read_csv(f, encoding="utf-8")
def csv_to_joint_dataframe(data_dir, dataset):
ratings = ratings_csv_to_dataframe(data_dir, dataset)
with tf.gfile.Open(os.path.join(data_dir, dataset, MOVIES_FILE)) as f:
with tf.io.gfile.GFile(os.path.join(data_dir, dataset, MOVIES_FILE)) as f:
movies = pd.read_csv(f, encoding="utf-8")
df = ratings.merge(movies, on=ITEM_COLUMN)
......@@ -302,7 +303,6 @@ def main(_):
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
define_data_download_flags()
FLAGS = flags.FLAGS
absl_app.run(main)
......@@ -33,7 +33,7 @@ import numpy as np
import six
from six.moves import queue
import tensorflow as tf
from tensorflow.contrib.tpu.python.tpu.datasets import StreamingFilesDataset
from absl import logging
from official.datasets import movielens
from official.recommendation import constants as rconst
......@@ -57,17 +57,17 @@ Eval:
_TRAIN_FEATURE_MAP = {
movielens.USER_COLUMN: tf.FixedLenFeature([], dtype=tf.string),
movielens.ITEM_COLUMN: tf.FixedLenFeature([], dtype=tf.string),
rconst.MASK_START_INDEX: tf.FixedLenFeature([1], dtype=tf.string),
"labels": tf.FixedLenFeature([], dtype=tf.string),
movielens.USER_COLUMN: tf.io.FixedLenFeature([], dtype=tf.string),
movielens.ITEM_COLUMN: tf.io.FixedLenFeature([], dtype=tf.string),
rconst.MASK_START_INDEX: tf.io.FixedLenFeature([1], dtype=tf.string),
"labels": tf.io.FixedLenFeature([], dtype=tf.string),
}
_EVAL_FEATURE_MAP = {
movielens.USER_COLUMN: tf.FixedLenFeature([], dtype=tf.string),
movielens.ITEM_COLUMN: tf.FixedLenFeature([], dtype=tf.string),
rconst.DUPLICATE_MASK: tf.FixedLenFeature([], dtype=tf.string)
movielens.USER_COLUMN: tf.io.FixedLenFeature([], dtype=tf.string),
movielens.ITEM_COLUMN: tf.io.FixedLenFeature([], dtype=tf.string),
rconst.DUPLICATE_MASK: tf.io.FixedLenFeature([], dtype=tf.string)
}
......@@ -200,7 +200,7 @@ class DatasetManager(object):
def start_construction(self):
if self._stream_files:
tf.gfile.MakeDirs(self.current_data_root)
tf.io.gfile.makedirs(self.current_data_root)
template = os.path.join(self.current_data_root, rconst.SHARD_TEMPLATE)
self._writers = [tf.io.TFRecordWriter(template.format(i))
for i in range(rconst.NUM_FILE_SHARDS)]
......@@ -261,6 +261,10 @@ class DatasetManager(object):
file_pattern = os.path.join(
epoch_data_dir, rconst.SHARD_TEMPLATE.format("*"))
# TODO: remove this contrib import
# pylint: disable=line-too-long
from tensorflow.contrib.tpu.python.tpu.datasets import StreamingFilesDataset
# pylint: enable=line-too-long
dataset = StreamingFilesDataset(
files=file_pattern, worker_job=popen_helper.worker_job(),
num_parallel_reads=rconst.NUM_FILE_SHARDS, num_epochs=1,
......@@ -388,7 +392,7 @@ class BaseDataConstructor(threading.Thread):
self._shuffle_with_forkpool = not stream_files
if stream_files:
self._shard_root = epoch_dir or tempfile.mkdtemp(prefix="ncf_")
atexit.register(tf.gfile.DeleteRecursively, dirname=self._shard_root)
atexit.register(tf.io.gfile.rmtree, dirname=self._shard_root)
else:
self._shard_root = None
......@@ -517,7 +521,7 @@ class BaseDataConstructor(threading.Thread):
time.sleep(0.01)
count += 1
if count >= 100 and np.log10(count) == np.round(np.log10(count)):
tf.logging.info(
logging.info(
"Waited {} times for training data to be consumed".format(count))
def _construct_training_epoch(self):
......@@ -537,7 +541,7 @@ class BaseDataConstructor(threading.Thread):
pool.map(self._get_training_batch, map_args)
self._train_dataset.end_construction()
tf.logging.info("Epoch construction complete. Time: {:.1f} seconds".format(
logging.info("Epoch construction complete. Time: {:.1f} seconds".format(
timeit.default_timer() - start_time))
@staticmethod
......@@ -619,7 +623,7 @@ class BaseDataConstructor(threading.Thread):
pool.map(self._get_eval_batch, map_args)
self._eval_dataset.end_construction()
tf.logging.info("Eval construction complete. Time: {:.1f} seconds".format(
logging.info("Eval construction complete. Time: {:.1f} seconds".format(
timeit.default_timer() - start_time))
def make_input_fn(self, is_training):
......@@ -760,7 +764,7 @@ class MaterializedDataConstructor(BaseDataConstructor):
self._per_user_neg_count[i] = self._num_items - positives.shape[0]
self._negative_table[i, :self._per_user_neg_count[i]] = negatives
tf.logging.info("Negative sample table built. Time: {:.1f} seconds".format(
logging.info("Negative sample table built. Time: {:.1f} seconds".format(
timeit.default_timer() - start_time))
def lookup_negative_items(self, negative_users, **kwargs):
......@@ -813,7 +817,7 @@ class BisectionDataConstructor(BaseDataConstructor):
self._total_negatives = np.concatenate([
self._index_segment(i) for i in range(self._num_users)])
tf.logging.info("Negative total vector built. Time: {:.1f} seconds".format(
logging.info("Negative total vector built. Time: {:.1f} seconds".format(
timeit.default_timer() - start_time))
def lookup_negative_items(self, negative_users, **kwargs):
......
......@@ -28,6 +28,7 @@ import typing
import numpy as np
import pandas as pd
import tensorflow as tf
from absl import logging
# pylint: enable=wrong-import-order
from official.datasets import movielens
......@@ -80,9 +81,9 @@ def _filter_index_sort(raw_rating_path, cache_path):
IDs to regularized user IDs, and a dict mapping raw item IDs to regularized
item IDs.
"""
valid_cache = tf.gfile.Exists(cache_path)
valid_cache = tf.io.gfile.exists(cache_path)
if valid_cache:
with tf.gfile.Open(cache_path, "rb") as f:
with tf.io.gfile.GFile(cache_path, "rb") as f:
cached_data = pickle.load(f)
cache_age = time.time() - cached_data.get("create_time", 0)
......@@ -94,13 +95,13 @@ def _filter_index_sort(raw_rating_path, cache_path):
valid_cache = False
if not valid_cache:
tf.logging.info("Removing stale raw data cache file.")
tf.gfile.Remove(cache_path)
logging.info("Removing stale raw data cache file.")
tf.io.gfile.remove(cache_path)
if valid_cache:
data = cached_data
else:
with tf.gfile.Open(raw_rating_path) as f:
with tf.io.gfile.GFile(raw_rating_path) as f:
df = pd.read_csv(f)
# Get the info of users who have more than 20 ratings on items
......@@ -112,7 +113,7 @@ def _filter_index_sort(raw_rating_path, cache_path):
original_items = df[movielens.ITEM_COLUMN].unique()
# Map the ids of user and item to 0 based index for following processing
tf.logging.info("Generating user_map and item_map...")
logging.info("Generating user_map and item_map...")
user_map = {user: index for index, user in enumerate(original_users)}
item_map = {item: index for index, item in enumerate(original_items)}
......@@ -134,7 +135,7 @@ def _filter_index_sort(raw_rating_path, cache_path):
# This sort is used to shard the dataframe by user, and later to select
# the last item for a user to be used in validation.
tf.logging.info("Sorting by user, timestamp...")
logging.info("Sorting by user, timestamp...")
# This sort is equivalent to
# df.sort_values([movielens.USER_COLUMN, movielens.TIMESTAMP_COLUMN],
......@@ -167,8 +168,8 @@ def _filter_index_sort(raw_rating_path, cache_path):
"create_time": time.time(),
}
tf.logging.info("Writing raw data cache.")
with tf.gfile.Open(cache_path, "wb") as f:
logging.info("Writing raw data cache.")
with tf.io.gfile.GFile(cache_path, "wb") as f:
pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
# TODO(robieta): MLPerf cache clear.
......@@ -189,7 +190,7 @@ def instantiate_pipeline(dataset, data_dir, params, constructor_type=None,
deterministic: Tell the data constructor to produce deterministically.
epoch_dir: Directory in which to store the training epochs.
"""
tf.logging.info("Beginning data preprocessing.")
logging.info("Beginning data preprocessing.")
st = timeit.default_timer()
raw_rating_path = os.path.join(data_dir, dataset, movielens.RATINGS_FILE)
......@@ -227,8 +228,8 @@ def instantiate_pipeline(dataset, data_dir, params, constructor_type=None,
)
run_time = timeit.default_timer() - st
tf.logging.info("Data preprocessing complete. Time: {:.1f} sec."
.format(run_time))
logging.info("Data preprocessing complete. Time: {:.1f} sec."
.format(run_time))
print(producer)
return num_users, num_items, producer
......@@ -58,7 +58,7 @@ class BaseTest(tf.test.TestCase):
def setUp(self):
self.temp_data_dir = self.get_temp_dir()
ratings_folder = os.path.join(self.temp_data_dir, DATASET)
tf.gfile.MakeDirs(ratings_folder)
tf.io.gfile.makedirs(ratings_folder)
np.random.seed(0)
raw_user_ids = np.arange(NUM_USERS * 3)
np.random.shuffle(raw_user_ids)
......@@ -76,7 +76,7 @@ class BaseTest(tf.test.TestCase):
self.rating_file = os.path.join(ratings_folder, movielens.RATINGS_FILE)
self.seen_pairs = set()
self.holdout = {}
with tf.gfile.Open(self.rating_file, "w") as f:
with tf.io.gfile.GFile(self.rating_file, "w") as f:
f.write("user_id,item_id,rating,timestamp\n")
for usr, itm, scr, ts in zip(users, items, scores, times):
pair = (usr, itm)
......@@ -341,5 +341,4 @@ class BaseTest(tf.test.TestCase):
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
tf.test.main()
......@@ -20,12 +20,12 @@ from __future__ import division
from __future__ import print_function
import json
import logging
import os
# pylint: disable=g-bad-import-order
import numpy as np
from absl import flags
from absl import logging
import tensorflow as tf
# pylint: enable=g-bad-import-order
......@@ -109,18 +109,6 @@ def parse_flags(flags_obj):
}
def get_optimizer(params):
optimizer = tf.train.AdamOptimizer(
learning_rate=params["learning_rate"],
beta1=params["beta1"],
beta2=params["beta2"],
epsilon=params["epsilon"])
if params["use_tpu"]:
optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
return optimizer
def get_distribution_strategy(params):
"""Returns the distribution strategy to use."""
if params["turn_off_distribution_strategy"]:
......@@ -132,14 +120,14 @@ def get_distribution_strategy(params):
"oauth2client.transport"]:
logging.getLogger(name).setLevel(logging.ERROR)
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
tpu=params["tpu"],
zone=params["tpu_zone"],
project=params["tpu_gcp_project"],
coordinator_name="coordinator"
)
tf.logging.info("Issuing reset command to TPU to ensure a clean state.")
logging.info("Issuing reset command to TPU to ensure a clean state.")
tf.Session.reset(tpu_cluster_resolver.get_master())
# Estimator looks at the master it connects to for MonitoredTrainingSession
......@@ -153,7 +141,7 @@ def get_distribution_strategy(params):
}
os.environ['TF_CONFIG'] = json.dumps(tf_config_env)
distribution = tf.contrib.distribute.TPUStrategy(
distribution = tf.distribute.experimental.TPUStrategy(
tpu_cluster_resolver, steps_per_run=100)
else:
......
......@@ -25,7 +25,6 @@ from __future__ import print_function
import contextlib
import heapq
import json
import logging
import math
import multiprocessing
import os
......@@ -36,10 +35,10 @@ import typing
import numpy as np
from absl import app as absl_app
from absl import flags
from absl import logging
import tensorflow as tf
# pylint: enable=g-bad-import-order
from tensorflow.contrib.compiler import xla
from official.datasets import movielens
from official.recommendation import constants as rconst
from official.recommendation import data_pipeline
......@@ -73,7 +72,9 @@ def construct_estimator(model_dir, params):
model_fn = neumf_model.neumf_model_fn
if params["use_xla_for_gpu"]:
tf.logging.info("Using XLA for GPU for training and evaluation.")
# TODO: remove the contrib import
from tensorflow.contrib.compiler import xla
logging.info("Using XLA for GPU for training and evaluation.")
model_fn = xla.estimator_model_fn(model_fn)
estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir,
config=run_config, params=params)
......@@ -133,7 +134,7 @@ def run_ncf(_):
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
for cycle_index in range(total_training_cycle):
assert FLAGS.epochs_between_evals == 1 or not mlperf_helper.LOGGER.enabled
tf.logging.info("Starting a training cycle: {}/{}".format(
logging.info("Starting a training cycle: {}/{}".format(
cycle_index + 1, total_training_cycle))
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
......@@ -143,13 +144,13 @@ def run_ncf(_):
estimator.train(input_fn=train_input_fn, hooks=train_hooks,
steps=num_train_steps)
tf.logging.info("Beginning evaluation.")
logging.info("Beginning evaluation.")
eval_input_fn = producer.make_input_fn(is_training=False)
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
value=cycle_index)
eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
tf.logging.info("Evaluation complete.")
logging.info("Evaluation complete.")
hr = float(eval_results[rconst.HR_KEY])
ndcg = float(eval_results[rconst.NDCG_KEY])
......@@ -169,7 +170,7 @@ def run_ncf(_):
# Benchmark the evaluation results
benchmark_logger.log_evaluation_result(eval_results)
# Log the HR and NDCG results.
tf.logging.info(
logging.info(
"Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
cycle_index + 1, hr, ndcg, loss))
......@@ -189,6 +190,6 @@ def run_ncf(_):
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
logging.set_verbosity(logging.INFO)
ncf_common.define_ncf_flags()
absl_app.run(main)
......@@ -27,6 +27,7 @@ import os
# pylint: disable=g-bad-import-order
from absl import app as absl_app
from absl import flags
from absl import logging
import tensorflow as tf
# pylint: enable=g-bad-import-order
......@@ -46,9 +47,10 @@ FLAGS = flags.FLAGS
def _keras_loss(y_true, y_pred):
# Here we are using the exact same loss used by the estimator
loss = tf.losses.sparse_softmax_cross_entropy(
labels=tf.cast(y_true, tf.int32),
logits=y_pred)
loss = tf.keras.losses.sparse_categorical_crossentropy(
y_pred=y_pred,
y_true=tf.cast(y_true, tf.int32),
from_logits=True)
return loss
......@@ -66,7 +68,7 @@ def _get_metric_fn(params):
# repetition correction
dup_mask = tf.zeros([batch_size, 1])
cross_entropy, metric_fn, in_top_k, ndcg, metric_weights = (
_, _, in_top_k, _, _ = (
neumf_model.compute_eval_loss_and_metrics_helper(
logits,
softmax_logits,
......@@ -155,13 +157,13 @@ def _get_keras_model(params):
# is designed to be of batch_size 1 for each replica.
user_input = tf.keras.layers.Input(
shape=(batch_size,),
batch_size=1,
batch_size=params["batches_per_step"],
name=movielens.USER_COLUMN,
dtype=tf.int32)
item_input = tf.keras.layers.Input(
shape=(batch_size,),
batch_size=1,
batch_size=params["batches_per_step"],
name=movielens.ITEM_COLUMN,
dtype=tf.int32)
......@@ -193,7 +195,7 @@ def run_ncf(_):
"""Run NCF training and eval with Keras."""
# TODO(seemuch): Support different train and eval batch sizes
if FLAGS.eval_batch_size != FLAGS.batch_size:
tf.logging.warning(
logging.warning(
"The Keras implementation of NCF currently does not support batch_size "
"!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
"batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size)
......@@ -226,7 +228,11 @@ def run_ncf(_):
strategy = ncf_common.get_distribution_strategy(params)
with distribution_utils.get_strategy_scope(strategy):
keras_model = _get_keras_model(params)
optimizer = ncf_common.get_optimizer(params)
optimizer = tf.keras.optimizers.Adam(
learning_rate=params["learning_rate"],
beta_1=params["beta1"],
beta_2=params["beta2"],
epsilon=params["epsilon"])
time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
keras_model.compile(
......@@ -241,14 +247,14 @@ def run_ncf(_):
time_callback],
verbose=2)
tf.logging.info("Training done. Start evaluating")
logging.info("Training done. Start evaluating")
eval_results = keras_model.evaluate(
eval_input_dataset,
steps=num_eval_steps,
verbose=2)
tf.logging.info("Keras evaluation is done.")
logging.info("Keras evaluation is done.")
stats = build_stats(history, eval_results, time_callback)
return stats
......@@ -298,6 +304,5 @@ def main(_):
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
ncf_common.define_ncf_flags()
absl_app.run(main)
......@@ -32,6 +32,7 @@ from official.recommendation import ncf_common
from official.recommendation import ncf_estimator_main
from official.recommendation import ncf_keras_main
from official.utils.testing import integration
from tensorflow.python.eager import context # pylint: disable=ungrouped-imports
NUM_TRAIN_NEG = 4
......@@ -204,7 +205,7 @@ class NcfTest(tf.test.TestCase):
integration.run_synthetic(
ncf_keras_main.main, tmp_root=self.get_temp_dir(), max_train=None,
extra_flags=self._BASE_END_TO_END_FLAGS +
['-distribution_strategy', 'off', '-log_steps', '100'])
['-distribution_strategy', 'off'])
@mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_keras_mlperf(self):
......@@ -212,10 +213,29 @@ class NcfTest(tf.test.TestCase):
ncf_keras_main.main, tmp_root=self.get_temp_dir(), max_train=None,
extra_flags=self._BASE_END_TO_END_FLAGS +
['-ml_perf', 'True',
'-distribution_strategy', 'off',
'-log_steps', '100'])
'-distribution_strategy', 'off'])
@mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_keras_1_gpu(self):
if context.num_gpus() < 1:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available".
format(1, context.num_gpus()))
integration.run_synthetic(
ncf_keras_main.main, tmp_root=self.get_temp_dir(), max_train=None,
extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '1'])
@mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_keras_2_gpu(self):
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available".
format(2, context.num_gpus()))
integration.run_synthetic(
ncf_keras_main.main, tmp_root=self.get_temp_dir(), max_train=None,
extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '2'])
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
tf.test.main()
......@@ -109,12 +109,20 @@ def neumf_model_fn(features, labels, mode, params):
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
value=params["epsilon"])
optimizer = ncf_common.get_optimizer(params)
optimizer = tf.compat.v1.train.AdamOptimizer(
learning_rate=params["learning_rate"],
beta1=params["beta1"],
beta2=params["beta2"],
epsilon=params["epsilon"])
if params["use_tpu"]:
# TODO: remove this contrib import
optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN,
value=mlperf_helper.TAGS.BCE)
loss = tf.losses.sparse_softmax_cross_entropy(
loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(
labels=labels,
logits=softmax_logits,
weights=tf.cast(valid_pt_mask, tf.float32)
......@@ -123,14 +131,14 @@ def neumf_model_fn(features, labels, mode, params):
# This tensor is used by logging hooks.
tf.identity(loss, name="cross_entropy")
global_step = tf.train.get_global_step()
tvars = tf.trainable_variables()
global_step = tf.compat.v1.train.get_global_step()
tvars = tf.compat.v1.trainable_variables()
gradients = optimizer.compute_gradients(
loss, tvars, colocate_gradients_with_ops=True)
gradients = _sparse_to_dense_grads(gradients)
minimize_op = optimizer.apply_gradients(
gradients, global_step=global_step, name="train")
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
train_op = tf.group(minimize_op, update_ops)
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
......@@ -381,15 +389,17 @@ def compute_eval_loss_and_metrics_helper(logits, # type: tf.Tensor
# ignore padded examples
example_weights *= tf.cast(expanded_metric_weights, tf.float32)
cross_entropy = tf.losses.sparse_softmax_cross_entropy(
cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy(
logits=softmax_logits, labels=eval_labels, weights=example_weights)
def metric_fn(top_k_tensor, ndcg_tensor, weight_tensor):
return {
rconst.HR_KEY: tf.metrics.mean(top_k_tensor, weights=weight_tensor,
name=rconst.HR_METRIC_NAME),
rconst.NDCG_KEY: tf.metrics.mean(ndcg_tensor, weights=weight_tensor,
name=rconst.NDCG_METRIC_NAME),
rconst.HR_KEY: tf.compat.v1.metrics.mean(top_k_tensor,
weights=weight_tensor,
name=rconst.HR_METRIC_NAME),
rconst.NDCG_KEY: tf.compat.v1.metrics.mean(ndcg_tensor,
weights=weight_tensor,
name=rconst.NDCG_METRIC_NAME)
}
return cross_entropy, metric_fn, in_top_k, ndcg, metric_weights
......@@ -428,7 +438,7 @@ def compute_top_k_and_ndcg(logits, # type: tf.Tensor
# Determine the location of the first element in each row after the elements
# are sorted.
sort_indices = tf.contrib.framework.argsort(
sort_indices = tf.argsort(
logits_by_user, axis=1, direction="DESCENDING")
# Use matrix multiplication to extract the position of the true item from the
......@@ -443,7 +453,8 @@ def compute_top_k_and_ndcg(logits, # type: tf.Tensor
position_vector = tf.reduce_sum(sparse_positions, axis=1)
in_top_k = tf.cast(tf.less(position_vector, rconst.TOP_K), tf.float32)
ndcg = tf.log(2.) / tf.log(tf.cast(position_vector, tf.float32) + 2)
ndcg = tf.math.log(2.) / tf.math.log(
tf.cast(position_vector, tf.float32) + 2)
ndcg *= in_top_k
# If a row is a padded row, all but the first element will be a duplicate.
......
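
On the estimator path the hunks above fall back to `tf.compat.v1` symbols, while the Keras path swaps in the v2 optimizer and loss. A self-contained sketch of that Keras-side pairing, assuming TF 1.14+/2.x (where `tf.keras.optimizers.Adam` accepts `learning_rate`); the hyperparameter values below are placeholders, not the ones the NCF flags produce:

```python
import numpy as np
import tensorflow as tf

# Placeholder hyperparameters for illustration; the real values come from the
# NCF flag/params plumbing rather than from this sketch.
params = {"learning_rate": 0.001, "beta1": 0.9, "beta2": 0.999, "epsilon": 1e-8}

# Keras v2 replacement for tf.train.AdamOptimizer (note the beta_1/beta_2 spelling).
optimizer = tf.keras.optimizers.Adam(
    learning_rate=params["learning_rate"],
    beta_1=params["beta1"],
    beta_2=params["beta2"],
    epsilon=params["epsilon"])


def keras_loss(y_true, y_pred):
  # Keras counterpart of tf.losses.sparse_softmax_cross_entropy: pass raw
  # logits and let from_logits=True apply the softmax internally. Returns
  # per-example losses, which Keras reduces during compile()/fit().
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=tf.cast(y_true, tf.int32), y_pred=y_pred, from_logits=True)


# Tiny smoke test on random 2-class logits.
logits = tf.constant(np.random.randn(4, 2), dtype=tf.float32)
labels = tf.constant([0, 1, 1, 0])
print(keras_loss(labels, logits))
```
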