Commit 901c4cc4 authored by Vinh Nguyen

Merge remote-tracking branch 'upstream/master' into amp_resnet50

parents ef30de93 824ff2d6
......@@ -26,7 +26,7 @@ import pandas as pd
import tensorflow as tf
# pylint: disable=g-bad-import-order
from official.boosted_trees import train_higgs
from official.r1.boosted_trees import train_higgs
from official.utils.misc import keras_utils
from official.utils.testing import integration
......@@ -133,7 +133,7 @@ class BaseTest(tf.test.TestCase):
"--eval_start", "12",
"--eval_count", "8",
],
synth=False, max_train=None)
synth=False)
self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, "checkpoint")))
@unittest.skipIf(keras_utils.is_v2_0(), "TF 1.0 only test.")
......@@ -152,7 +152,7 @@ class BaseTest(tf.test.TestCase):
"--eval_start", "12",
"--eval_count", "8",
],
synth=False, max_train=None)
synth=False)
self.assertTrue(tf.gfile.Exists(os.path.join(model_dir, "checkpoint")))
self.assertTrue(tf.gfile.Exists(os.path.join(export_dir)))
......
......@@ -168,13 +168,15 @@ class BaseTest(tf.test.TestCase):
def test_cifar10_end_to_end_synthetic_v1(self):
integration.run_synthetic(
main=cifar10_main.run_cifar, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '1', '-batch_size', '4']
extra_flags=['-resnet_version', '1', '-batch_size', '4',
'--max_train_steps', '1']
)
def test_cifar10_end_to_end_synthetic_v2(self):
integration.run_synthetic(
main=cifar10_main.run_cifar, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '2', '-batch_size', '4']
extra_flags=['-resnet_version', '2', '-batch_size', '4',
'--max_train_steps', '1']
)
......
......@@ -282,41 +282,43 @@ class BaseTest(tf.test.TestCase):
def test_imagenet_end_to_end_synthetic_v1(self):
integration.run_synthetic(
main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '1', '-batch_size', '4']
extra_flags=['-resnet_version', '1', '-batch_size', '4',
'--max_train_steps', '1']
)
def test_imagenet_end_to_end_synthetic_v2(self):
integration.run_synthetic(
main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '2', '-batch_size', '4']
extra_flags=['-resnet_version', '2', '-batch_size', '4',
'--max_train_steps', '1']
)
def test_imagenet_end_to_end_synthetic_v1_tiny(self):
integration.run_synthetic(
main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '1', '-batch_size', '4',
'-resnet_size', '18']
'-resnet_size', '18', '--max_train_steps', '1']
)
def test_imagenet_end_to_end_synthetic_v2_tiny(self):
integration.run_synthetic(
main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '2', '-batch_size', '4',
'-resnet_size', '18']
'-resnet_size', '18', '--max_train_steps', '1']
)
def test_imagenet_end_to_end_synthetic_v1_huge(self):
integration.run_synthetic(
main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '1', '-batch_size', '4',
'-resnet_size', '200']
'-resnet_size', '200', '--max_train_steps', '1']
)
def test_imagenet_end_to_end_synthetic_v2_huge(self):
integration.run_synthetic(
main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
extra_flags=['-resnet_version', '2', '-batch_size', '4',
'-resnet_size', '200']
'-resnet_size', '200', '--max_train_steps', '1']
)
......
......@@ -730,9 +730,11 @@ def define_resnet_flags(resnet_size_choices=None, dynamic_loss_scale=False,
dynamic_loss_scale=dynamic_loss_scale,
fp16_implementation=fp16_implementation,
loss_scale=True,
tf_data_experimental_slack=True)
tf_data_experimental_slack=True,
max_train_steps=True)
flags_core.define_image()
flags_core.define_benchmark()
flags_core.define_distribution()
flags.adopt_module_key_flags(flags_core)
flags.DEFINE_enum(
......@@ -768,16 +770,6 @@ def define_resnet_flags(resnet_size_choices=None, dynamic_loss_scale=False,
'If True, uses `tf.estimator.train_and_evaluate` for the training '
'and evaluation loop, instead of separate calls to `classifier.train` '
'and `classifier.evaluate`, which is the default behavior.'))
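For context, a hedged sketch (not part of this file) of the two loops this flag selects between, using the public TF 1.x Estimator API; the function name and step counts are illustrative only:
import tensorflow as tf

def run_training(classifier, train_input_fn, eval_input_fn,
                 use_train_and_evaluate):
  if use_train_and_evaluate:
    # Single combined loop driven by tf.estimator.train_and_evaluate.
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=1000)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
  else:
    # Default behavior: separate train and evaluate calls.
    classifier.train(input_fn=train_input_fn, steps=1000)
    classifier.evaluate(input_fn=eval_input_fn)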
flags.DEFINE_string(
name='worker_hosts', default=None,
help=flags_core.help_wrap(
'Comma-separated list of worker ip:port pairs for running '
'multi-worker models with DistributionStrategy. The user would '
'start the program on each host with identical value for this flag.'))
flags.DEFINE_integer(
name='task_index', default=-1,
help=flags_core.help_wrap('If multi-worker training, the task_index of '
'this worker.'))
flags.DEFINE_bool(
name='enable_lars', default=False,
help=flags_core.help_wrap(
......
# Predicting Income with the Census Income Dataset
Note that this implementation is based on TF 1.x.
It is subject to being moved to the R1 archive folder.
## Overview
The [Census Income Data Set](https://archive.ics.uci.edu/ml/datasets/Census+Income) contains over 48,000 samples with attributes including age, occupation, education, and income (a binary label, either `>50K` or `<=50K`). The dataset is split into roughly 32,000 training and 16,000 testing samples.
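As a quick illustration of the binary label described above, here is a minimal, hedged sketch of loading the raw CSV with pandas and deriving the `>50K` label. The local file path and exact column ordering are assumptions for illustration only; the repository's own input pipeline lives in `census_dataset.py`.

```python
import pandas as pd

# Standard UCI Census Income (Adult) columns, assumed ordering.
COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket",
]

# "adult.data" is a placeholder path for the downloaded training split.
train_df = pd.read_csv(
    "adult.data", names=COLUMNS, skipinitialspace=True, na_values="?")

# The label is binary: 1 if income is ">50K", else 0.
train_df["label"] = (
    train_df["income_bracket"].str.strip() == ">50K").astype(int)

print(train_df.shape)            # roughly 32,000 rows for the training split
print(train_df["label"].mean())  # fraction of >50K samples
```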
......
......@@ -22,8 +22,8 @@ import tensorflow as tf
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.wide_deep import census_dataset
from official.wide_deep import wide_deep_run_loop
from official.r1.wide_deep import census_dataset
from official.r1.wide_deep import wide_deep_run_loop
def define_census_flags():
......
......@@ -24,8 +24,8 @@ import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.misc import keras_utils
from official.utils.testing import integration
from official.wide_deep import census_dataset
from official.wide_deep import census_main
from official.r1.wide_deep import census_dataset
from official.r1.wide_deep import census_main
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
......@@ -139,7 +139,7 @@ class BaseTest(tf.test.TestCase):
'--model_type', 'wide',
'--download_if_missing=false'
],
synth=False, max_train=None)
synth=False)
@unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
def test_end_to_end_deep(self):
......@@ -150,7 +150,7 @@ class BaseTest(tf.test.TestCase):
'--model_type', 'deep',
'--download_if_missing=false'
],
synth=False, max_train=None)
synth=False)
@unittest.skipIf(keras_utils.is_v2_0(), 'TF 1.0 only test.')
def test_end_to_end_wide_deep(self):
......@@ -161,7 +161,7 @@ class BaseTest(tf.test.TestCase):
'--model_type', 'wide_deep',
'--download_if_missing=false'
],
synth=False, max_train=None)
synth=False)
if __name__ == '__main__':
......
......@@ -27,8 +27,8 @@ import tensorflow as tf
from official.datasets import movielens
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.wide_deep import movielens_dataset
from official.wide_deep import wide_deep_run_loop
from official.r1.wide_deep import movielens_dataset
from official.r1.wide_deep import wide_deep_run_loop
def define_movie_flags():
......
......@@ -26,8 +26,8 @@ import tensorflow as tf # pylint: disable=g-bad-import-order
from official.datasets import movielens
from official.utils.misc import keras_utils
from official.utils.testing import integration
from official.wide_deep import movielens_dataset
from official.wide_deep import movielens_main
from official.r1.wide_deep import movielens_dataset
from official.r1.wide_deep import movielens_main
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
......@@ -112,7 +112,7 @@ class BaseTest(tf.test.TestCase):
"--train_epochs", "1",
"--epochs_between_evals", "1"
],
synth=False, max_train=None)
synth=False)
if __name__ == "__main__":
......
......@@ -143,37 +143,32 @@ class DatasetManager(object):
if is_training:
return {
movielens.USER_COLUMN:
tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
movielens.ITEM_COLUMN:
tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
rconst.VALID_POINT_MASK:
tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
"labels":
tf.io.FixedLenFeature([batch_size], dtype=tf.int64)
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64)
}
else:
return {
movielens.USER_COLUMN:
tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
movielens.ITEM_COLUMN:
tf.io.FixedLenFeature([batch_size], dtype=tf.int64),
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64),
rconst.DUPLICATE_MASK:
tf.io.FixedLenFeature([batch_size], dtype=tf.int64)
tf.io.FixedLenFeature([batch_size, 1], dtype=tf.int64)
}
features = tf.io.parse_single_example(
serialized_data, _get_feature_map(batch_size, is_training=is_training))
users = tf.reshape(
tf.cast(features[movielens.USER_COLUMN], rconst.USER_DTYPE),
(batch_size,))
items = tf.reshape(
tf.cast(features[movielens.ITEM_COLUMN], rconst.ITEM_DTYPE),
(batch_size,))
users = tf.cast(features[movielens.USER_COLUMN], rconst.USER_DTYPE)
items = tf.cast(features[movielens.ITEM_COLUMN], rconst.ITEM_DTYPE)
if is_training:
valid_point_mask = tf.reshape(
tf.cast(features[movielens.ITEM_COLUMN], tf.bool), (batch_size,))
fake_dup_mask = tf.zeros_like(features[movielens.USER_COLUMN])
valid_point_mask = tf.cast(features[rconst.VALID_POINT_MASK], tf.bool)
fake_dup_mask = tf.zeros_like(users)
return {
movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
......@@ -184,20 +179,15 @@ class DatasetManager(object):
rconst.DUPLICATE_MASK: fake_dup_mask
}
else:
labels = tf.reshape(
tf.cast(tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool),
(batch_size, 1))
fake_valid_pt_mask = tf.cast(
tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool)
labels = tf.cast(tf.zeros_like(users), tf.bool)
fake_valid_pt_mask = tf.cast(tf.zeros_like(users), tf.bool)
return {
movielens.USER_COLUMN:
users,
movielens.ITEM_COLUMN:
items,
rconst.DUPLICATE_MASK:
tf.reshape(
tf.cast(features[rconst.DUPLICATE_MASK], tf.bool),
(batch_size,)),
tf.cast(features[rconst.DUPLICATE_MASK], tf.bool),
rconst.VALID_POINT_MASK:
fake_valid_pt_mask,
rconst.TRAIN_LABEL_KEY:
......@@ -221,8 +211,8 @@ class DatasetManager(object):
if self._is_training:
mask_start_index = data.pop(rconst.MASK_START_INDEX)
batch_size = data[movielens.ITEM_COLUMN].shape[0]
data[rconst.VALID_POINT_MASK] = np.less(
np.arange(batch_size), mask_start_index)
data[rconst.VALID_POINT_MASK] = np.expand_dims(
np.less(np.arange(batch_size), mask_start_index), -1)
if self._stream_files:
example_bytes = self.serialize(data)
......@@ -313,19 +303,21 @@ class DatasetManager(object):
else:
types = {movielens.USER_COLUMN: rconst.USER_DTYPE,
movielens.ITEM_COLUMN: rconst.ITEM_DTYPE}
shapes = {movielens.USER_COLUMN: tf.TensorShape([batch_size]),
movielens.ITEM_COLUMN: tf.TensorShape([batch_size])}
shapes = {
movielens.USER_COLUMN: tf.TensorShape([batch_size, 1]),
movielens.ITEM_COLUMN: tf.TensorShape([batch_size, 1])
}
if self._is_training:
types[rconst.VALID_POINT_MASK] = np.bool
shapes[rconst.VALID_POINT_MASK] = tf.TensorShape([batch_size])
shapes[rconst.VALID_POINT_MASK] = tf.TensorShape([batch_size, 1])
types = (types, np.bool)
shapes = (shapes, tf.TensorShape([batch_size]))
shapes = (shapes, tf.TensorShape([batch_size, 1]))
else:
types[rconst.DUPLICATE_MASK] = np.bool
shapes[rconst.DUPLICATE_MASK] = tf.TensorShape([batch_size])
shapes[rconst.DUPLICATE_MASK] = tf.TensorShape([batch_size, 1])
data_generator = functools.partial(
self.data_generator, epochs_between_evals=epochs_between_evals)
......@@ -554,12 +546,17 @@ class BaseDataConstructor(threading.Thread):
items = np.concatenate([items, item_pad])
labels = np.concatenate([labels, label_pad])
self._train_dataset.put(i, {
movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
rconst.MASK_START_INDEX: np.array(mask_start_index, dtype=np.int32),
"labels": labels,
})
self._train_dataset.put(
i, {
movielens.USER_COLUMN:
np.reshape(users, (self.train_batch_size, 1)),
movielens.ITEM_COLUMN:
np.reshape(items, (self.train_batch_size, 1)),
rconst.MASK_START_INDEX:
np.array(mask_start_index, dtype=np.int32),
"labels":
np.reshape(labels, (self.train_batch_size, 1)),
})
def _wait_to_construct_train_epoch(self):
count = 0
......@@ -649,11 +646,15 @@ class BaseDataConstructor(threading.Thread):
users, items, duplicate_mask = self._assemble_eval_batch(
users, positive_items, negative_items, self._eval_users_per_batch)
self._eval_dataset.put(i, {
movielens.USER_COLUMN: users.flatten(),
movielens.ITEM_COLUMN: items.flatten(),
rconst.DUPLICATE_MASK: duplicate_mask.flatten(),
})
self._eval_dataset.put(
i, {
movielens.USER_COLUMN:
np.reshape(users.flatten(), (self.eval_batch_size, 1)),
movielens.ITEM_COLUMN:
np.reshape(items.flatten(), (self.eval_batch_size, 1)),
rconst.DUPLICATE_MASK:
np.reshape(duplicate_mask.flatten(), (self.eval_batch_size, 1)),
})
def _construct_eval_epoch(self):
"""Loop to construct data for evaluation."""
......@@ -720,24 +721,37 @@ class DummyConstructor(threading.Thread):
num_users = params["num_users"]
num_items = params["num_items"]
users = tf.random.uniform([batch_size], dtype=tf.int32, minval=0,
users = tf.random.uniform([batch_size, 1],
dtype=tf.int32,
minval=0,
maxval=num_users)
items = tf.random.uniform([batch_size], dtype=tf.int32, minval=0,
items = tf.random.uniform([batch_size, 1],
dtype=tf.int32,
minval=0,
maxval=num_items)
if is_training:
valid_point_mask = tf.cast(tf.random.uniform(
[batch_size], dtype=tf.int32, minval=0, maxval=2), tf.bool)
labels = tf.cast(tf.random.uniform(
[batch_size], dtype=tf.int32, minval=0, maxval=2), tf.bool)
valid_point_mask = tf.cast(
tf.random.uniform([batch_size, 1],
dtype=tf.int32,
minval=0,
maxval=2), tf.bool)
labels = tf.cast(
tf.random.uniform([batch_size, 1],
dtype=tf.int32,
minval=0,
maxval=2), tf.bool)
data = {
movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
rconst.VALID_POINT_MASK: valid_point_mask,
}, labels
else:
dupe_mask = tf.cast(tf.random.uniform([batch_size], dtype=tf.int32,
minval=0, maxval=2), tf.bool)
dupe_mask = tf.cast(
tf.random.uniform([batch_size, 1],
dtype=tf.int32,
minval=0,
maxval=2), tf.bool)
data = {
movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
......
......@@ -168,8 +168,11 @@ class BaseTest(tf.test.TestCase):
md5 = hashlib.md5()
for features, labels in first_epoch:
data_list = [
features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
features[rconst.VALID_POINT_MASK], labels]
features[movielens.USER_COLUMN].flatten(),
features[movielens.ITEM_COLUMN].flatten(),
features[rconst.VALID_POINT_MASK].flatten(),
labels.flatten()
]
for i in data_list:
md5.update(i.tobytes())
......@@ -216,8 +219,10 @@ class BaseTest(tf.test.TestCase):
md5 = hashlib.md5()
for features in eval_data:
data_list = [
features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
features[rconst.DUPLICATE_MASK]]
features[movielens.USER_COLUMN].flatten(),
features[movielens.ITEM_COLUMN].flatten(),
features[rconst.DUPLICATE_MASK].flatten()
]
for i in data_list:
md5.update(i.tobytes())
......@@ -276,8 +281,11 @@ class BaseTest(tf.test.TestCase):
md5 = hashlib.md5()
for features, labels in results:
data_list = [
features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
features[rconst.VALID_POINT_MASK], labels]
features[movielens.USER_COLUMN].flatten(),
features[movielens.ITEM_COLUMN].flatten(),
features[rconst.VALID_POINT_MASK].flatten(),
labels.flatten()
]
for i in data_list:
md5.update(i.tobytes())
......
......@@ -37,7 +37,6 @@ from official.utils.flags import core as flags_core
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
FLAGS = flags.FLAGS
......@@ -60,13 +59,8 @@ def get_inputs(params):
dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
constructor_type=FLAGS.constructor_type,
deterministic=FLAGS.seed is not None)
num_train_steps = (producer.train_batches_per_epoch //
params["batches_per_step"])
num_eval_steps = (producer.eval_batches_per_epoch //
params["batches_per_step"])
assert not producer.train_batches_per_epoch % params["batches_per_step"]
assert not producer.eval_batches_per_epoch % params["batches_per_step"]
num_train_steps = producer.train_batches_per_epoch
num_eval_steps = producer.eval_batches_per_epoch
return num_users, num_items, num_train_steps, num_eval_steps, producer
......@@ -74,18 +68,13 @@ def get_inputs(params):
def parse_flags(flags_obj):
"""Convenience function to turn flags into params."""
num_gpus = flags_core.get_num_gpus(flags_obj)
num_devices = FLAGS.num_tpu_shards if FLAGS.tpu else num_gpus or 1
batch_size = (flags_obj.batch_size + num_devices - 1) // num_devices
eval_divisor = (rconst.NUM_EVAL_NEGATIVES + 1) * num_devices
batch_size = flags_obj.batch_size
eval_batch_size = flags_obj.eval_batch_size or flags_obj.batch_size
eval_batch_size = ((eval_batch_size + eval_divisor - 1) //
eval_divisor * eval_divisor // num_devices)
return {
"train_epochs": flags_obj.train_epochs,
"batches_per_step": num_devices,
"batches_per_step": 1,
"use_seed": flags_obj.seed is not None,
"batch_size": batch_size,
"eval_batch_size": eval_batch_size,
......@@ -95,6 +84,7 @@ def parse_flags(flags_obj):
"mf_regularization": flags_obj.mf_regularization,
"mlp_reg_layers": [float(reg) for reg in flags_obj.mlp_regularization],
"num_neg": flags_obj.num_neg,
"distribution_strategy": flags_obj.distribution_strategy,
"num_gpus": num_gpus,
"use_tpu": flags_obj.tpu is not None,
"tpu": flags_obj.tpu,
......@@ -115,7 +105,7 @@ def parse_flags(flags_obj):
}
def get_distribution_strategy(params):
def get_v1_distribution_strategy(params):
"""Returns the distribution strategy to use."""
if params["use_tpu"]:
# Some of the networking libraries are quite chatty.
......
......@@ -66,7 +66,7 @@ def construct_estimator(model_dir, params):
Returns:
An Estimator or TPUEstimator.
"""
distribution = ncf_common.get_distribution_strategy(params)
distribution = ncf_common.get_v1_distribution_strategy(params)
run_config = tf.estimator.RunConfig(train_distribute=distribution,
eval_distribute=distribution)
......
......@@ -82,7 +82,6 @@ def create_dataset_from_data_producer(producer, params):
Returns:
Processed training features.
"""
labels = tf.expand_dims(labels, -1)
fake_dup_mask = tf.zeros_like(features[movielens.USER_COLUMN])
features[rconst.DUPLICATE_MASK] = fake_dup_mask
features[rconst.TRAIN_LABEL_KEY] = labels
......@@ -106,7 +105,6 @@ def create_dataset_from_data_producer(producer, params):
Processed evaluation features.
"""
labels = tf.cast(tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool)
labels = tf.expand_dims(labels, -1)
fake_valid_pt_mask = tf.cast(
tf.zeros_like(features[movielens.USER_COLUMN]), tf.bool)
features[rconst.VALID_POINT_MASK] = fake_valid_pt_mask
......@@ -119,7 +117,10 @@ def create_dataset_from_data_producer(producer, params):
return train_input_dataset, eval_input_dataset
def create_ncf_input_data(params, producer=None, input_meta_data=None):
def create_ncf_input_data(params,
producer=None,
input_meta_data=None,
strategy=None):
"""Creates NCF training/evaluation dataset.
Args:
......@@ -130,13 +131,31 @@ def create_ncf_input_data(params, producer=None, input_meta_data=None):
input_meta_data: A dictionary of input metadata to be used when reading data
from tf record files. Must be specified when params["train_input_dataset"]
is specified.
strategy: Distribution strategy used for distributed training. If specified,
it is used to check that the evaluation batch size is a multiple of the
total number of devices used.
Returns:
(training dataset, evaluation dataset, train steps per epoch,
eval steps per epoch)
Raises:
ValueError: If data is being generated online when using TPUs.
"""
# NCF evaluation metric calculation logic assumes that the evaluation data
# sample size is a multiple of (1 + number of negative samples in
# evaluation) for each device. As such, the evaluation batch size must be a
# multiple of (number of replicas * (1 + number of negative samples)).
num_devices = strategy.num_replicas_in_sync if strategy else 1
if (params["eval_batch_size"] % (num_devices *
(1 + rconst.NUM_EVAL_NEGATIVES))):
raise ValueError("Evaluation batch size must be divisible by {} "
"times {}".format(num_devices,
(1 + rconst.NUM_EVAL_NEGATIVES)))
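For illustration, a minimal standalone sketch of the divisibility rule enforced above (not part of this file); NUM_EVAL_NEGATIVES = 999 is assumed here to stand in for rconst.NUM_EVAL_NEGATIVES, and 160000 mirrors the eval_batch_size set by the benchmarks later in this commit:
NUM_EVAL_NEGATIVES = 999  # assumed value of rconst.NUM_EVAL_NEGATIVES

def check_eval_batch_size(eval_batch_size, num_devices):
  # Each device must receive whole groups of (1 positive + 999 negatives).
  divisor = num_devices * (1 + NUM_EVAL_NEGATIVES)
  if eval_batch_size % divisor:
    raise ValueError("Evaluation batch size must be divisible by {} "
                     "times {}".format(num_devices, 1 + NUM_EVAL_NEGATIVES))

check_eval_batch_size(160000, num_devices=8)   # OK: 160000 == 20 * 8 * 1000
# check_eval_batch_size(100000, num_devices=8) # raises: 100000 % 8000 != 0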
if params["train_dataset_path"]:
assert params["eval_dataset_path"]
train_dataset = create_dataset_from_tf_record_files(
params["train_dataset_path"],
input_meta_data["train_prebatch_size"],
......@@ -148,34 +167,18 @@ def create_ncf_input_data(params, producer=None, input_meta_data=None):
params["eval_batch_size"],
is_training=False)
# TODO(b/259377621): Remove number of devices (i.e.
# params["batches_per_step"]) in input pipeline logic and only use
# global batch size instead.
num_train_steps = int(
np.ceil(input_meta_data["num_train_steps"] /
params["batches_per_step"]))
num_eval_steps = (
input_meta_data["num_eval_steps"] // params["batches_per_step"])
num_train_steps = int(input_meta_data["num_train_steps"])
num_eval_steps = int(input_meta_data["num_eval_steps"])
else:
assert producer
if params["use_tpu"]:
raise ValueError("TPU training does not support data producer yet. "
"Use pre-processed data.")
assert producer
# Start retrieving data from producer.
train_dataset, eval_dataset = create_dataset_from_data_producer(
producer, params)
num_train_steps = (
producer.train_batches_per_epoch // params["batches_per_step"])
num_eval_steps = (
producer.eval_batches_per_epoch // params["batches_per_step"])
assert not producer.train_batches_per_epoch % params["batches_per_step"]
assert not producer.eval_batches_per_epoch % params["batches_per_step"]
# It is required that for distributed training, the dataset must call
# batch(). The parameter of batch() here is the number of replicas involved,
# such that each replica evenly gets a slice of data.
# drop_remainder = True, as we would like the batch call to return a fixed
# shape vs None; this prevents an expensive broadcast during weighted_loss
batches_per_step = params["batches_per_step"]
train_dataset = train_dataset.batch(batches_per_step, drop_remainder=True)
eval_dataset = eval_dataset.batch(batches_per_step, drop_remainder=True)
num_train_steps = producer.train_batches_per_epoch
num_eval_steps = producer.eval_batches_per_epoch
return train_dataset, eval_dataset, num_train_steps, num_eval_steps
......@@ -181,6 +181,13 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_ctl_run_eagerly_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.early_stopping = True
FLAGS.run_eagerly = True
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_ctl_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
......@@ -192,6 +199,7 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
self._setup()
FLAGS.early_stopping = True
FLAGS.num_gpus = 2
FLAGS.eval_batch_size = 160000
self._run_and_report_benchmark()
def benchmark_2_gpus_ctl_early_stop(self):
......@@ -200,10 +208,11 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.keras_use_ctl = True
FLAGS.early_stopping = True
FLAGS.num_gpus = 2
FLAGS.eval_batch_size = 160000
self._run_and_report_benchmark()
#############################################
# Tests below with mlperf in the test name are of two types
# Tests below with mlperf in the test name are of two types:
# 1) 1 GPU tests are based on MLPerf 0.5 and the TensorFlow pulled submission.
# 2) 8 GPU tests are based on MLPerf 0.5 and use NVIDIA's hyper parameters.
#
......@@ -254,6 +263,14 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.train_epochs = 7
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_ctl_run_eagerly_mlperf_like(self):
"""1 GPU using CTL with eager and distribution strategy."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.run_eagerly = True
FLAGS.train_epochs = 7
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_ctl_mlperf_like(self):
"""1 GPU using CTL with XLA."""
self._setup()
......@@ -268,6 +285,7 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.eval_batch_size = 160000
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
......@@ -280,6 +298,7 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.eval_batch_size = 160000
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
......@@ -287,19 +306,6 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark_mlperf_like()
def benchmark_xla_8_gpu_mlperf_like(self):
"""8 GPU using keras fit/compile with XLA."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.enable_xla = True
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
FLAGS.epsilon = 1e-8
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_ctl_mlperf_like(self):
"""8 GPU using CTL."""
self._setup()
......@@ -307,20 +313,7 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
FLAGS.epsilon = 1e-8
self._run_and_report_benchmark_mlperf_like()
def benchmark_xla_8_gpu_ctl_mlperf_like(self):
"""8 GPU using CTL with XLA."""
self._setup()
FLAGS.keras_use_ctl = True
FLAGS.enable_xla = True
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.eval_batch_size = 160000
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
......@@ -341,6 +334,7 @@ class NCFKerasSynth(NCFKerasBenchmarkBase):
default_flags['num_gpus'] = 1
default_flags['train_epochs'] = 8
default_flags['batch_size'] = 99000
default_flags['eval_batch_size'] = 160000
default_flags['learning_rate'] = 0.00382059
default_flags['beta1'] = 0.783529
default_flags['beta2'] = 0.909003
......