Unverified commit 965cc3ee authored by Ayushman Kumar, committed by GitHub

Merge pull request #7 from tensorflow/master

updated
parents 1f3247f4 1f685c54
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# ResNet in TensorFlow

Deep residual networks, or ResNets for short, provided the breakthrough idea of
......
...@@ -329,6 +329,37 @@ def learning_rate_with_decay(
  return learning_rate_fn
def per_replica_batch_size(batch_size, num_gpus):
  """For multi-gpu, batch-size must be a multiple of the number of GPUs.

  Note that distribution strategy handles this automatically when used with
  Keras. For using with Estimator, we need to get per GPU batch.

  Args:
    batch_size: Global batch size to be divided among devices. This should be
      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
    num_gpus: How many GPUs are used with DistributionStrategies.

  Returns:
    Batch size per device.

  Raises:
    ValueError: if batch_size is not divisible by number of devices
  """
  if num_gpus <= 1:
    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    err = ('When running with multiple GPUs, batch size '
           'must be a multiple of the number of available GPUs. Found {} '
           'GPUs with a batch size of {}; try --batch_size={} instead.'
          ).format(num_gpus, batch_size, batch_size - remainder)
    raise ValueError(err)
  return int(batch_size / num_gpus)
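As a quick, illustrative check of the arithmetic above (not part of the patch; the numbers mirror the old unit test), the helper divides the global batch size evenly across GPUs and rejects sizes that do not divide cleanly:

```python
# Illustrative only.
assert per_replica_batch_size(147, num_gpus=0) == 147   # single device: unchanged
assert per_replica_batch_size(147, num_gpus=7) == 21    # 147 / 7 examples per GPU
try:
  per_replica_batch_size(147, num_gpus=5)               # 147 % 5 != 0
except ValueError as e:
  print(e)  # suggests --batch_size=145 instead
```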
def resnet_model_fn(features, labels, mode, model_class,
                    resnet_size, weight_decay, learning_rate_fn, momentum,
                    data_format, resnet_version, loss_scale,
...@@ -620,7 +651,7 @@ def resnet_main(
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=per_replica_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=num_epochs,
        dtype=flags_core.get_tf_dtype(flags_obj),
...@@ -631,7 +662,7 @@ def resnet_main(
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=per_replica_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1,
        dtype=flags_core.get_tf_dtype(flags_obj))
......
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Transformer Translation Model

This is an implementation of the Transformer translation model as described in the [Attention is All You Need](https://arxiv.org/abs/1706.03762) paper. Based on the code provided by the authors: [Transformer code](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py) from [Tensor2Tensor](https://github.com/tensorflow/tensor2tensor). Also, check out the [tutorial](https://www.tensorflow.org/beta/tutorials/text/transformer) on Transformer in TF 2.0.
......
...@@ -562,6 +562,36 @@ def construct_estimator(flags_obj, params, schedule_manager):
      },
      config=run_config)
def per_replica_batch_size(batch_size, num_gpus):
  """For multi-gpu, batch-size must be a multiple of the number of GPUs.

  Note that distribution strategy handles this automatically when used with
  Keras. For using with Estimator, we need to get per GPU batch.

  Args:
    batch_size: Global batch size to be divided among devices. This should be
      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
    num_gpus: How many GPUs are used with DistributionStrategies.

  Returns:
    Batch size per device.

  Raises:
    ValueError: if batch_size is not divisible by number of devices
  """
  if num_gpus <= 1:
    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    err = ('When running with multiple GPUs, batch size '
           'must be a multiple of the number of available GPUs. Found {} '
           'GPUs with a batch size of {}; try --batch_size={} instead.'
          ).format(num_gpus, batch_size, batch_size - remainder)
    raise ValueError(err)
  return int(batch_size / num_gpus)
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.
...@@ -605,8 +635,8 @@ def run_transformer(flags_obj):
  total_batch_size = params["batch_size"]
  if not params["use_tpu"]:
    params["batch_size"] = per_replica_batch_size(params["batch_size"],
                                                  num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
......
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Predicting Income with the Census Income Dataset

The implementation is based on TensorFlow 1.x.
It is subject to being moved to the R1 archive folder.

## Overview

The [Census Income Data Set](https://archive.ics.uci.edu/ml/datasets/Census+Income) contains over 48,000 samples with attributes including age, occupation, education, and income (a binary label, either `>50K` or `<=50K`). The dataset is split into roughly 32,000 training and 16,000 testing samples.
......
...@@ -331,7 +331,7 @@ class DatasetManager(object):
    """Returns batches for training."""
    # Estimator passes batch_size during training and eval_batch_size during
    # eval.
    param_batch_size = (params["batch_size"] if self._is_training else
                        params.get("eval_batch_size") or params["batch_size"])
    if batch_size != param_batch_size:
...@@ -713,7 +713,7 @@ class DummyConstructor(threading.Thread):
    """Returns dummy input batches for training."""
    # Estimator passes batch_size during training and eval_batch_size during
    # eval.
    batch_size = (params["batch_size"] if is_training else
                  params.get("eval_batch_size") or params["batch_size"])
    num_users = params["num_users"]
......
...@@ -149,7 +149,7 @@ def define_ncf_flags():
  flags_core.define_base(model_dir=True, clean=True, train_epochs=True,
                         epochs_between_evals=True, export_dir=False,
                         run_eagerly=True, stop_threshold=True, num_gpu=True,
                         distribution_strategy=True)
  flags_core.define_performance(
      synthetic_data=True,
      dtype=True,
...@@ -167,8 +167,7 @@ def define_ncf_flags():
      model_dir="/tmp/ncf/",
      data_dir="/tmp/movielens-data/",
      train_epochs=2,
      batch_size=99000,
      hooks="ProfilerHook",
      tpu=None
  )
......
...@@ -29,7 +29,7 @@ import os
from absl import app
from absl import flags
from absl import logging
import tensorflow.compat.v2 as tf
# pylint: enable=g-bad-import-order

from official.recommendation import constants as rconst
......
...@@ -23,18 +23,15 @@ import unittest
import numpy as np
import tensorflow as tf

from tensorflow.python.eager import context  # pylint: disable=ungrouped-imports
from official.recommendation import constants as rconst
from official.recommendation import data_pipeline
from official.recommendation import neumf_model
from official.recommendation import ncf_common
from official.recommendation import ncf_estimator_main
from official.recommendation import ncf_keras_main
from official.recommendation import neumf_model
from official.utils.misc import keras_utils
from official.utils.testing import integration
from tensorflow.python.eager import context  # pylint: disable=ungrouped-imports

NUM_TRAIN_NEG = 4
...@@ -190,20 +187,6 @@ class NcfTest(tf.test.TestCase):
  _BASE_END_TO_END_FLAGS = ['-batch_size', '1044', '-train_epochs', '1']

  @unittest.skipIf(keras_utils.is_v2_0(), "TODO(b/136018594)")
  @unittest.mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  def test_end_to_end_estimator(self):
    integration.run_synthetic(
        ncf_estimator_main.main, tmp_root=self.get_temp_dir(),
        extra_flags=self._BASE_END_TO_END_FLAGS)

  @unittest.skipIf(keras_utils.is_v2_0(), "TODO(b/136018594)")
  @unittest.mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  def test_end_to_end_estimator_mlperf(self):
    integration.run_synthetic(
        ncf_estimator_main.main, tmp_root=self.get_temp_dir(),
        extra_flags=self._BASE_END_TO_END_FLAGS + ['-ml_perf', 'True'])

  @unittest.mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
  def test_end_to_end_keras_no_dist_strat(self):
    integration.run_synthetic(
......
...@@ -126,7 +126,6 @@ def neumf_model_fn(features, labels, mode, params):
        weights=tf.cast(valid_pt_mask, tf.float32)
    )

    # This tensor is used by logging hooks.
    tf.identity(loss, name="cross_entropy")

    global_step = tf.compat.v1.train.get_global_step()
......
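Aside (not in the diff): the `tf.identity(loss, name="cross_entropy")` above gives the loss a stable graph name so that Estimator hooks can log it by name. A hedged, illustrative sketch of such a hook wiring (the Estimator object itself is assumed):

```python
import tensorflow as tf  # TF 1.x style, matching this file

# Log the tensor named above every 100 training steps.
logging_hook = tf.estimator.LoggingTensorHook(
    tensors={"cross_entropy": "cross_entropy"}, every_n_iter=100)
# estimator.train(input_fn=train_input_fn, hooks=[logging_hook])
```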
#!/bin/bash
set -e
# Example settings:
# export TPU="taylorrobie-tpu-0"
# export BUCKET="gs://taylorrobie-tpu-test-bucket-2"
# Remove IDE "not assigned" warning highlights.
TPU=${TPU:-""}
BUCKET=${BUCKET:-""}
if [[ -z ${TPU} ]]; then
echo "Please set 'TPU' to the name of the TPU to be used."
exit 1
fi
if [[ -z ${BUCKET} ]]; then
echo "Please set 'BUCKET' to the GCS bucket to be used."
exit 1
fi
./run.sh
...@@ -139,13 +139,10 @@ class StandardEvaluable(runnable.AbstractEvaluable):
      eval_fn = tf.function(eval_fn)
    self.eval_loop_fn = utils.create_loop_fn(eval_fn)

    # TODO(b/147718615): When async RPC is enabled in eager runtime, we make
    # eval iterator as a class member so it doesn't get destroyed when out of
    # the function scope.
    self.eval_iter = tf.nest.map_structure(iter, self.eval_dataset)
    eval_iter = tf.nest.map_structure(iter, self.eval_dataset)

    self.eval_begin()

    self.eval_loop_fn(eval_iter, num_steps)
    return self.eval_end()

  def eval_begin(self):
......
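For intuition only (not from the patch), the pattern changed above, building a fresh local iterator on every evaluation call and driving a compiled step function over it, looks roughly like this minimal sketch; `eval_step`, the dataset, and the step count are made-up stand-ins:

```python
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices(tf.range(10)).batch(2)

@tf.function
def eval_step(iterator):
  batch = next(iterator)        # pull one batch inside the compiled function
  return tf.reduce_sum(batch)

def evaluate(num_steps):
  eval_iter = iter(dataset)     # local iterator, rebuilt on every call
  total = tf.constant(0)
  for _ in range(num_steps):
    total += eval_step(eval_iter)
  return total

print(evaluate(num_steps=3).numpy())  # 1 + 5 + 9 = 15
```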
...@@ -94,6 +94,7 @@ def initialize_common_flags():
def strategy_flags_dict():
  """Returns TPU and/or GPU related flags in a dictionary."""
  return {
      'distribution_strategy': FLAGS.strategy_type,
      # TPUStrategy related flags.
      'tpu': FLAGS.tpu,
      # MultiWorkerMirroredStrategy related flags.
......
...@@ -40,7 +40,7 @@ def _collective_communication(all_reduce_alg):
    tf.distribute.experimental.CollectiveCommunication object

  Raises:
    ValueError: if `all_reduce_alg` not in [None, "ring", "nccl"]
  """
  collective_communication_options = {
      None: tf.distribute.experimental.CollectiveCommunication.AUTO,
...@@ -50,7 +50,7 @@ def _collective_communication(all_reduce_alg):
  if all_reduce_alg not in collective_communication_options:
    raise ValueError(
        "When used with `multi_worker_mirrored`, valid values for "
        "all_reduce_alg are [`ring`, `nccl`]. Supplied value: {}".format(
            all_reduce_alg))
  return collective_communication_options[all_reduce_alg]
...@@ -66,7 +66,7 @@ def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
    tf.distribute.CrossDeviceOps object or None.

  Raises:
    ValueError: if `all_reduce_alg` not in [None, "nccl", "hierarchical_copy"].
  """
  if all_reduce_alg is None:
    return None
...@@ -77,7 +77,7 @@ def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
  if all_reduce_alg not in mirrored_all_reduce_options:
    raise ValueError(
        "When used with `mirrored`, valid values for all_reduce_alg are "
        "[`nccl`, `hierarchical_copy`]. Supplied value: {}".format(
            all_reduce_alg))
  cross_device_ops_class = mirrored_all_reduce_options[all_reduce_alg]
  return cross_device_ops_class(num_packs=num_packs)
...@@ -92,9 +92,9 @@ def get_distribution_strategy(distribution_strategy="mirrored",
  Args:
    distribution_strategy: a string specifying which distribution strategy to
      use. Accepted values are "off", "one_device", "mirrored",
      "parameter_server", "multi_worker_mirrored", and "tpu" -- case insensitive.
      "off" means not to use Distribution Strategy; "tpu" means to use
      TPUStrategy using `tpu_address`.
    num_gpus: Number of GPUs to run this model.
    all_reduce_alg: Optional. Specifies which algorithm to use when performing
...@@ -109,7 +109,7 @@ def get_distribution_strategy(distribution_strategy="mirrored",
  Returns:
    tf.distribute.DistributionStrategy object.

  Raises:
    ValueError: if `distribution_strategy` is "off" or "one_device" and
      `num_gpus` is larger than 1; or `num_gpus` is negative or if
      `distribution_strategy` is `tpu` but `tpu_address` is not specified.
  """
...@@ -121,7 +121,7 @@ def get_distribution_strategy(distribution_strategy="mirrored",
    if num_gpus > 1:
      raise ValueError(
          "When {} GPUs are specified, distribution_strategy "
          "flag cannot be set to `off`.".format(num_gpus))
    return None

  if distribution_strategy == "tpu":
...@@ -157,141 +157,6 @@ def get_distribution_strategy(distribution_strategy="mirrored",
        "Unrecognized Distribution Strategy: %r" % distribution_strategy)
def per_replica_batch_size(batch_size, num_gpus):
  """For multi-gpu, batch-size must be a multiple of the number of GPUs.

  Note that distribution strategy handles this automatically when used with
  Keras. For using with Estimator, we need to get per GPU batch.

  Args:
    batch_size: Global batch size to be divided among devices. This should be
      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
    num_gpus: How many GPUs are used with DistributionStrategies.

  Returns:
    Batch size per device.

  Raises:
    ValueError: if batch_size is not divisible by number of devices
  """
  if num_gpus <= 1:
    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    err = ('When running with multiple GPUs, batch size '
           'must be a multiple of the number of available GPUs. Found {} '
           'GPUs with a batch size of {}; try --batch_size={} instead.'
          ).format(num_gpus, batch_size, batch_size - remainder)
    raise ValueError(err)
  return int(batch_size / num_gpus)
# The `SyntheticDataset` is a temporary solution for generating synthetic data
# directly on devices. It is only useful for Keras with Distribution
# Strategies. We will have better support in `tf.data` or Distribution Strategy
# later.
class SyntheticDataset(object):
  """A dataset that generates synthetic data on each device."""

  def __init__(self, dataset, split_by=1):
    # dataset.take(1) doesn't have GPU kernel.
    with tf.device('device:CPU:0'):
      tensor = tf.data.experimental.get_single_element(dataset.take(1))
    flat_tensor = tf.nest.flatten(tensor)
    variable_data = []
    initializers = []
    for t in flat_tensor:
      rebatched_t = tf.split(t, num_or_size_splits=split_by, axis=0)[0]
      assert rebatched_t.shape.is_fully_defined(), rebatched_t.shape
      v = tf.compat.v1.get_local_variable(self._random_name(),
                                          initializer=rebatched_t)
      variable_data.append(v)
      initializers.append(v.initializer)
    input_data = tf.nest.pack_sequence_as(tensor, variable_data)
    self._iterator = SyntheticIterator(input_data, initializers)

  def _random_name(self, size=10, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

  def __iter__(self):
    return self._iterator

  def make_one_shot_iterator(self):
    return self._iterator

  def make_initializable_iterator(self):
    return self._iterator


class SyntheticIterator(object):
  """A dataset that generates synthetic data on each device."""

  def __init__(self, input_data, initializers):
    self._input_data = input_data
    self._initializers = initializers

  def get_next(self):
    return self._input_data

  def next(self):
    return self.__next__()

  def __next__(self):
    try:
      return self.get_next()
    except tf.errors.OutOfRangeError:
      raise StopIteration

  def initialize(self):
    if tf.executing_eagerly():
      return tf.no_op()
    else:
      return self._initializers
def _monkey_patch_dataset_method(strategy):
  """Monkey-patch `strategy`'s `make_dataset_iterator` method."""
  def make_dataset(self, dataset):
    logging.info('Using pure synthetic data.')
    with self.scope():
      if self.extended._global_batch_size:  # pylint: disable=protected-access
        return SyntheticDataset(dataset, self.num_replicas_in_sync)
      else:
        return SyntheticDataset(dataset)

  def make_iterator(self, dataset):
    dist_dataset = make_dataset(self, dataset)
    return iter(dist_dataset)

  strategy.orig_make_dataset_iterator = strategy.make_dataset_iterator
  strategy.make_dataset_iterator = make_iterator
  strategy.orig_distribute_dataset = strategy.experimental_distribute_dataset
  strategy.experimental_distribute_dataset = make_dataset


def _undo_monkey_patch_dataset_method(strategy):
  if hasattr(strategy, 'orig_make_dataset_iterator'):
    strategy.make_dataset_iterator = strategy.orig_make_dataset_iterator
  if hasattr(strategy, 'orig_distribute_dataset'):
    # Restore the original distribute-dataset method.
    strategy.experimental_distribute_dataset = strategy.orig_distribute_dataset


def set_up_synthetic_data():
  _monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
  _monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
  _monkey_patch_dataset_method(
      tf.distribute.experimental.MultiWorkerMirroredStrategy)


def undo_set_up_synthetic_data():
  _undo_monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
  _undo_monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
  _undo_monkey_patch_dataset_method(
      tf.distribute.experimental.MultiWorkerMirroredStrategy)
def configure_cluster(worker_hosts=None, task_index=-1):
  """Set multi-worker cluster spec in TF_CONFIG environment variable.
...@@ -301,21 +166,21 @@ def configure_cluster(worker_hosts=None, task_index=-1):
  Returns:
    Number of workers in the cluster.
  """
  tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
  if tf_config:
    num_workers = (len(tf_config["cluster"].get("chief", [])) +
                   len(tf_config["cluster"].get("worker", [])))
  elif worker_hosts:
    workers = worker_hosts.split(",")
    num_workers = len(workers)
    if num_workers > 1 and task_index < 0:
      raise ValueError("Must specify task_index when number of workers > 1")
    task_index = 0 if num_workers == 1 else task_index
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": {
            "worker": workers
        },
        "task": {"type": "worker", "index": task_index}
    })
  else:
    num_workers = 1
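As an illustration (not part of the change), calling the helper above with a two-worker host list leaves `TF_CONFIG` set roughly as follows. It assumes `TF_CONFIG` was not already set, and the hosts and index here are made-up values:

```python
import json
import os

num_workers = configure_cluster(
    worker_hosts="10.0.0.1:2222,10.0.0.2:2222", task_index=0)
print(num_workers)  # 2, per the docstring above
print(json.loads(os.environ["TF_CONFIG"]))
# {'cluster': {'worker': ['10.0.0.1:2222', '10.0.0.2:2222']},
#  'task': {'type': 'worker', 'index': 0}}
```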
......
...@@ -45,21 +45,5 @@ class GetDistributionStrategyTest(tf.test.TestCase):
      self.assertIn('GPU', device)


class PerReplicaBatchSizeTest(tf.test.TestCase):
  """Tests for per_replica_batch_size."""

  def test_batch_size(self):
    self.assertEquals(
        distribution_utils.per_replica_batch_size(147, num_gpus=0), 147)
    self.assertEquals(
        distribution_utils.per_replica_batch_size(147, num_gpus=1), 147)
    self.assertEquals(
        distribution_utils.per_replica_batch_size(147, num_gpus=7), 21)

  def test_batch_size_with_remainder(self):
    with self.assertRaises(ValueError):
      distribution_utils.per_replica_batch_size(147, num_gpus=5)


if __name__ == "__main__":
  tf.test.main()
...@@ -164,6 +164,18 @@ def get_profiler_callback(model_dir, profile_steps, enable_tensorboard,
  return ProfilerCallback(model_dir, start_step, stop_step, steps_per_epoch)


class SimpleCheckpoint(tf.keras.callbacks.Callback):
  """Keras callback to save tf.train.Checkpoints."""

  def __init__(self, checkpoint_manager):
    super(SimpleCheckpoint, self).__init__()
    self.checkpoint_manager = checkpoint_manager

  def on_epoch_end(self, epoch, logs=None):
    step_counter = self.checkpoint_manager._step_counter.numpy()  # pylint: disable=protected-access
    self.checkpoint_manager.save(checkpoint_number=step_counter)
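A minimal sketch (not from the patch) of how the new callback might be wired into Keras training. The tiny model, the temp directory, and the one-step checkpoint interval are made-up, and it assumes a TF 2.x runtime where `CheckpointManager` accepts `step_counter` and `checkpoint_interval`:

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.compile(optimizer="sgd", loss="mse")

checkpoint = tf.train.Checkpoint(model=model, optimizer=model.optimizer)
manager = tf.train.CheckpointManager(
    checkpoint, directory="/tmp/ckpts", max_to_keep=3,
    step_counter=model.optimizer.iterations, checkpoint_interval=1)

# The callback saves a checkpoint numbered by the optimizer step at each epoch end.
model.fit(tf.zeros([8, 4]), tf.zeros([8, 1]),
          epochs=2,
          callbacks=[SimpleCheckpoint(manager)])
```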
class ProfilerCallback(tf.keras.callbacks.Callback):
  """Save profiles in specified step range to log directory."""
......
# Object Detection Models on TensorFlow 2

**Note**: This repository is still under construction.
More features and instructions will be added soon.

## Prerequisite

To get started, download the code from the TensorFlow models GitHub repository or
use the pre-installed Google Cloud VM.

```bash
git clone https://github.com/tensorflow/models.git
```

Next, make sure to use TensorFlow 2.1+ on Google Cloud. Also, here are
a few packages you need to install to get started:

```bash
sudo apt-get install -y python-tk && \
pip3 install -r ~/models/official/requirements.txt
```
## Train RetinaNet on TPU

### Train a vanilla ResNet-50 based RetinaNet.

```bash
...@@ -30,7 +30,7 @@ RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
...@@ -60,7 +60,7 @@ following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
...@@ -86,7 +86,6 @@ python3 ~/models/official/vision/detection/main.py \
--config_file="my_retinanet.yaml"
```

```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
...@@ -123,6 +122,118 @@ use_tpu: False
"
```
---
## Train Mask R-CNN on TPU
### Train a vanilla ResNet-50 based Mask R-CNN.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } }"
```
### Train a custom Mask R-CNN using the config file.
First, create a YAML config file, e.g. *my_maskrcnn.yaml*.
This file specifies the parameters to be overridden,
which should at least include the following fields.
```YAML
# my_maskrcnn.yaml
train:
  train_file_pattern: <path to the TFRecord training data>
eval:
  eval_file_pattern: <path to the TFRecord validation data>
  val_json_file: <path to the validation annotation JSON file>
```
Once the YAML config file is created, you can launch the training using the
following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
## Train Mask R-CNN on GPU
Training on GPU is similar to that on TPU. The major change is the strategy type
(use
"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
for multiple GPUs and
"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
for a single GPU).

Multi-GPU example (assuming there are 8 GPUs connected to the host):
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=mirrored \
--num_gpus=8 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
Single-GPU example:

```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=one_device \
--num_gpus=1 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
An example with inline configuration (YAML or JSON format):
```
python3 ~/models/official/vision/detection/main.py \
--model_dir=<model folder> \
--strategy_type=one_device \
--num_gpus=1 \
--mode=train \
--model=mask_rcnn \
--params_override="eval:
eval_file_pattern: <Eval TFRecord file pattern>
batch_size: 8
val_json_file: <COCO format groundtruth JSON file>
predict:
predict_batch_size: 8
architecture:
use_bfloat16: False
maskrcnn_parser:
use_bfloat16: Flase
train:
total_steps: 1000
batch_size: 8
train_file_pattern: <Eval TFRecord file pattern>
use_tpu: False
"
```
Note: The JSON groundtruth file is useful for the [COCO dataset](http://cocodataset.org/#home) and can be
downloaded from the [COCO website](http://cocodataset.org/#download). For custom datasets, it is unnecessary because the groundtruth can be included in the TFRecord files.
......
...@@ -14,8 +14,16 @@
# ==============================================================================
"""Base config template."""
# pylint: disable=line-too-long
BACKBONES = [
    'resnet',
]

MULTILEVEL_FEATURES = [
    'fpn',
]

# pylint: disable=line-too-long
# For ResNet, this freezes the variables of the first conv1 and conv2_x
# layers [1], which leads to higher training speed and slightly better testing
# accuracy. The intuition is that the low-level architecture (e.g., ResNet-50)
...@@ -24,7 +32,6 @@
# Note that we need the trailing `/` to avoid an incorrect match.
# [1]: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L198
RESNET_FROZEN_VAR_PREFIX = r'(resnet\d+)\/(conv2d(|_([1-9]|10))|batch_normalization(|_([1-9]|10)))\/'
REGULARIZATION_VAR_REGEX = r'.*(kernel|weight):0$'
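As a quick, illustrative check (not part of the config), the frozen-variable prefix matches conv1/conv2_x variable names and stops at `conv2d_10`, while the regularization regex picks out kernel and weight variables. The variable names below are hypothetical examples:

```python
import re

assert re.match(RESNET_FROZEN_VAR_PREFIX, 'resnet50/conv2d_3/kernel:0')
assert re.match(RESNET_FROZEN_VAR_PREFIX, 'resnet50/batch_normalization_10/gamma:0')
assert re.match(RESNET_FROZEN_VAR_PREFIX, 'resnet50/conv2d_11/kernel:0') is None
assert re.match(REGULARIZATION_VAR_REGEX, 'resnet50/conv2d_3/kernel:0')
assert re.match(REGULARIZATION_VAR_REGEX, 'resnet50/conv2d_3/bias:0') is None
```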
BASE_CFG = {
...@@ -41,6 +48,7 @@ BASE_CFG = {
    'optimizer': {
        'type': 'momentum',
        'momentum': 0.9,
        'nesterov': True,  # `False` is better for TPU v3-128.
    },
    'learning_rate': {
        'type': 'step',
...@@ -49,21 +57,25 @@ BASE_CFG = {
        'init_learning_rate': 0.08,
        'learning_rate_levels': [0.008, 0.0008],
        'learning_rate_steps': [15000, 20000],
        'total_steps': 22500,
    },
    'checkpoint': {
        'path': '',
        'prefix': '',
    },
    # One can use 'RESNET_FROZEN_VAR_PREFIX' to speed up ResNet training
    # when loading from the checkpoint.
    'frozen_variable_prefix': '',
    'train_file_pattern': '',
    'train_dataset_type': 'tfrecord',
    # TODO(b/142174042): Support transpose_input option.
    'transpose_input': False,
    'regularization_variable_regex': REGULARIZATION_VAR_REGEX,
    'l2_weight_decay': 0.0001,
    'gradient_clip_norm': 0.0,
    'input_sharding': False,
},
    'eval': {
        'input_sharding': True,
        'batch_size': 8,
        'eval_samples': 5000,
        'min_eval_interval': 180,
...@@ -74,38 +86,42 @@ BASE_CFG = {
        'val_json_file': '',
        'eval_file_pattern': '',
        'eval_dataset_type': 'tfrecord',
        # When visualizing images, set evaluation batch size to 40 to avoid
        # potential OOM.
        'num_images_to_visualize': 0,
    },
    'predict': {
        'batch_size': 8,
    },
    'architecture': {
        'backbone': 'resnet',
        'min_level': 3,
        'max_level': 7,
        'multilevel_features': 'fpn',
        'use_bfloat16': True,
        # Note that `num_classes` is the total number of classes including
        # one background class whose index is 0.
        'num_classes': 91,
    },
    'anchor': {
        'num_scales': 3,
        'aspect_ratios': [1.0, 2.0, 0.5],
        'anchor_size': 4.0,
    },
    'norm_activation': {
        'activation': 'relu',
        'batch_norm_momentum': 0.997,
        'batch_norm_epsilon': 1e-4,
        'batch_norm_trainable': True,
        'use_sync_bn': False,
    },
    'resnet': {
        'resnet_depth': 50,
        'batch_norm': {
            'batch_norm_momentum': 0.997,
            'batch_norm_epsilon': 1e-4,
            'batch_norm_trainable': True,
            'use_sync_bn': False,
        },
    },
    'fpn': {
        'min_level': 3,
        'max_level': 7,
        'fpn_feat_dims': 256,
        'use_separable_conv': False,
        'use_batch_norm': True,
        'batch_norm': {
            'batch_norm_momentum': 0.997,
            'batch_norm_epsilon': 1e-4,
            'batch_norm_trainable': True,
            'use_sync_bn': False,
        },
    },
    'postprocess': {
        'use_batched_nms': False,
...@@ -116,5 +132,4 @@ BASE_CFG = {
    },
    'enable_summary': False,
}
# pylint: enable=line-too-long
...@@ -28,13 +28,12 @@ MASKRCNN_CFG.override({
    },
    'architecture': {
        'parser': 'maskrcnn_parser',
        'min_level': 2,
        'max_level': 6,
        'use_bfloat16': True,
        'include_mask': True,
        'mask_target_size': 28,
    },
    'maskrcnn_parser': {
        'use_bfloat16': True,
        'output_size': [1024, 1024],
        'num_channels': 3,
        'rpn_match_threshold': 0.7,
...@@ -46,74 +45,32 @@ MASKRCNN_CFG.override({
        'aug_scale_max': 1.0,
        'skip_crowd_during_training': True,
        'max_num_instances': 100,
        'include_mask': True,
        'mask_crop_size': 112,
    },
    'anchor': {
        'min_level': 2,
        'max_level': 6,
        'num_scales': 1,
        'anchor_size': 8,
    },
    'fpn': {
        'min_level': 2,
        'max_level': 6,
    },
    'nasfpn': {
        'min_level': 2,
        'max_level': 6,
    },
    # tunable_nasfpn:strip_begin
    'tunable_nasfpn_v1': {
        'min_level': 2,
        'max_level': 6,
    },
    # tunable_nasfpn:strip_end
    'rpn_head': {
        'min_level': 2,
        'max_level': 6,
        'anchors_per_location': 3,
        'num_convs': 2,
        'num_filters': 256,
        'use_separable_conv': False,
        'use_batch_norm': False,
        'batch_norm': {
            'batch_norm_momentum': 0.997,
            'batch_norm_epsilon': 1e-4,
            'batch_norm_trainable': True,
            'use_sync_bn': False,
        },
    },
    'frcnn_head': {
        # Note that `num_classes` is the total number of classes including
        # one background class whose index is 0.
        'num_classes': 91,
        'num_convs': 0,
        'num_filters': 256,
        'use_separable_conv': False,
        'num_fcs': 2,
        'fc_dims': 1024,
        'use_batch_norm': False,
        'batch_norm': {
            'batch_norm_momentum': 0.997,
            'batch_norm_epsilon': 1e-4,
            'batch_norm_trainable': True,
            'use_sync_bn': False,
        },
    },
    'mrcnn_head': {
        'num_classes': 91,
        'mask_target_size': 28,
        'num_convs': 4,
        'num_filters': 256,
        'use_separable_conv': False,
        'use_batch_norm': False,
        'batch_norm': {
            'batch_norm_momentum': 0.997,
            'batch_norm_epsilon': 1e-4,
            'batch_norm_trainable': True,
            'use_sync_bn': False,
        },
    },
    'rpn_score_loss': {
        'rpn_batch_size_per_im': 256,
...@@ -147,23 +104,10 @@ MASKRCNN_CFG.override({
    },
    'mask_sampling': {
        'num_mask_samples_per_image': 128,  # Typically = `num_samples_per_image` * `fg_fraction`.
        'mask_target_size': 28,
    },
    'postprocess': {
        'use_batched_nms': False,
        'max_total_size': 100,
        'nms_iou_threshold': 0.5,
        'score_threshold': 0.05,
        'pre_nms_num_boxes': 1000,
    },
}, is_strict=False)

MASKRCNN_RESTRICTIONS = [
    'architecture.use_bfloat16 == maskrcnn_parser.use_bfloat16',
    'architecture.include_mask == maskrcnn_parser.include_mask',
    'anchor.min_level == rpn_head.min_level',
    'anchor.max_level == rpn_head.max_level',
    'mrcnn_head.mask_target_size == mask_sampling.mask_target_size',
]
# pylint: enable=line-too-long