Unverified Commit 965cc3ee authored by Ayushman Kumar, committed by GitHub

Merge pull request #7 from tensorflow/master

updated
parents 1f3247f4 1f685c54
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# ResNet in TensorFlow
Deep residual networks, or ResNets for short, provided the breakthrough idea of
......
......@@ -329,6 +329,37 @@ def learning_rate_with_decay(
return learning_rate_fn
def per_replica_batch_size(batch_size, num_gpus):
"""For multi-gpu, batch-size must be a multiple of the number of GPUs.
Note that distribution strategy handles this automatically when used with
Keras. When using Estimator, we need to compute the per-GPU batch size ourselves.
Args:
batch_size: Global batch size to be divided among devices. This should be
equal to num_gpus times the single-GPU batch_size for multi-gpu training.
num_gpus: How many GPUs are used with DistributionStrategies.
Returns:
Batch size per device.
Raises:
ValueError: if batch_size is not divisible by the number of devices.
"""
if num_gpus <= 1:
return batch_size
remainder = batch_size % num_gpus
if remainder:
err = ('When running with multiple GPUs, batch size '
'must be a multiple of the number of available GPUs. Found {} '
'GPUs with a batch size of {}; try --batch_size={} instead.'
).format(num_gpus, batch_size, batch_size - remainder)
raise ValueError(err)
return int(batch_size / num_gpus)
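# Usage sketch (the values below are illustrative, not from the original code):
#
#   per_replica_batch_size(256, num_gpus=4)   # -> 64
#   per_replica_batch_size(256, num_gpus=1)   # -> 256, single device is a no-op
#   per_replica_batch_size(147, num_gpus=5)   # -> ValueError, 147 % 5 != 0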
def resnet_model_fn(features, labels, mode, model_class,
resnet_size, weight_decay, learning_rate_fn, momentum,
data_format, resnet_version, loss_scale,
......@@ -620,7 +651,7 @@ def resnet_main(
return input_function(
is_training=True,
data_dir=flags_obj.data_dir,
batch_size=distribution_utils.per_replica_batch_size(
batch_size=per_replica_batch_size(
flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
num_epochs=num_epochs,
dtype=flags_core.get_tf_dtype(flags_obj),
......@@ -631,7 +662,7 @@ def resnet_main(
return input_function(
is_training=False,
data_dir=flags_obj.data_dir,
batch_size=distribution_utils.per_replica_batch_size(
batch_size=per_replica_batch_size(
flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
num_epochs=1,
dtype=flags_core.get_tf_dtype(flags_obj))
......
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Transformer Translation Model
This is an implementation of the Transformer translation model as described in the [Attention is All You Need](https://arxiv.org/abs/1706.03762) paper. Based on the code provided by the authors: [Transformer code](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py) from [Tensor2Tensor](https://github.com/tensorflow/tensor2tensor). Also, check out the [tutorial](https://www.tensorflow.org/beta/tutorials/text/transformer) on Transformer in TF 2.0.
......
......@@ -562,6 +562,36 @@ def construct_estimator(flags_obj, params, schedule_manager):
},
config=run_config)
def per_replica_batch_size(batch_size, num_gpus):
"""For multi-gpu, batch-size must be a multiple of the number of GPUs.
Note that distribution strategy handles this automatically when used with
Keras. When using Estimator, we need to compute the per-GPU batch size ourselves.
Args:
batch_size: Global batch size to be divided among devices. This should be
equal to num_gpus times the single-GPU batch_size for multi-gpu training.
num_gpus: How many GPUs are used with DistributionStrategies.
Returns:
Batch size per device.
Raises:
ValueError: if batch_size is not divisible by the number of devices.
"""
if num_gpus <= 1:
return batch_size
remainder = batch_size % num_gpus
if remainder:
err = ('When running with multiple GPUs, batch size '
'must be a multiple of the number of available GPUs. Found {} '
'GPUs with a batch size of {}; try --batch_size={} instead.'
).format(num_gpus, batch_size, batch_size - remainder)
raise ValueError(err)
return int(batch_size / num_gpus)
def run_transformer(flags_obj):
"""Create tf.Estimator to train and evaluate transformer model.
......@@ -605,8 +635,8 @@ def run_transformer(flags_obj):
total_batch_size = params["batch_size"]
if not params["use_tpu"]:
params["batch_size"] = distribution_utils.per_replica_batch_size(
params["batch_size"], num_gpus)
params["batch_size"] = per_replica_batch_size(params["batch_size"],
num_gpus)
schedule_manager = schedule.Manager(
train_steps=flags_obj.train_steps,
......
![No Maintenance Intended](https://img.shields.io/badge/No%20Maintenance%20Intended-%E2%9C%95-red.svg)
![TensorFlow Requirement: 1.x](https://img.shields.io/badge/TensorFlow%20Requirement-1.x-brightgreen)
![TensorFlow 2 Not Supported](https://img.shields.io/badge/TensorFlow%202%20Not%20Supported-%E2%9C%95-red.svg)
# Predicting Income with the Census Income Dataset
Note that the implementation is based on TF 1.x.
It is subject to being moved to the R1 archive folder.
The implementation is based on TensorFlow 1.x.
## Overview
The [Census Income Data Set](https://archive.ics.uci.edu/ml/datasets/Census+Income) contains over 48,000 samples with attributes including age, occupation, education, and income (a binary label, either `>50K` or `<=50K`). The dataset is split into roughly 32,000 training and 16,000 testing samples.
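As a minimal illustration of the label encoding (the helper name below is hypothetical and not part of this model's code), the income column maps to a binary target as follows:

```python
def income_to_label(income_bracket):
  """Map the raw income string to a 0/1 target (hypothetical helper)."""
  return int(income_bracket.strip() == ">50K")

assert income_to_label(">50K") == 1
assert income_to_label("<=50K") == 0
```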
......
......@@ -331,7 +331,7 @@ class DatasetManager(object):
"""Returns batches for training."""
# Estimator passes batch_size during training and eval_batch_size during
# eval. TPUEstimator only passes batch_size.
# eval.
param_batch_size = (params["batch_size"] if self._is_training else
params.get("eval_batch_size") or params["batch_size"])
if batch_size != param_batch_size:
......@@ -713,7 +713,7 @@ class DummyConstructor(threading.Thread):
"""Returns dummy input batches for training."""
# Estimator passes batch_size during training and eval_batch_size during
# eval. TPUEstimator only passes batch_size.
# eval.
batch_size = (params["batch_size"] if is_training else
params.get("eval_batch_size") or params["batch_size"])
num_users = params["num_users"]
......
......@@ -149,7 +149,7 @@ def define_ncf_flags():
flags_core.define_base(model_dir=True, clean=True, train_epochs=True,
epochs_between_evals=True, export_dir=False,
run_eagerly=True, stop_threshold=True, num_gpu=True,
hooks=True, distribution_strategy=True)
distribution_strategy=True)
flags_core.define_performance(
synthetic_data=True,
dtype=True,
......@@ -167,8 +167,7 @@ def define_ncf_flags():
model_dir="/tmp/ncf/",
data_dir="/tmp/movielens-data/",
train_epochs=2,
batch_size=256,
hooks="ProfilerHook",
batch_size=99000,
tpu=None
)
......
......@@ -29,7 +29,7 @@ import os
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import tensorflow.compat.v2 as tf
# pylint: enable=g-bad-import-order
from official.recommendation import constants as rconst
......
......@@ -23,18 +23,15 @@ import unittest
import numpy as np
import tensorflow as tf
from tensorflow.python.eager import context # pylint: disable=ungrouped-imports
from official.recommendation import constants as rconst
from official.recommendation import data_pipeline
from official.recommendation import neumf_model
from official.recommendation import ncf_common
from official.recommendation import ncf_estimator_main
from official.recommendation import ncf_keras_main
from official.recommendation import neumf_model
from official.utils.misc import keras_utils
from official.utils.testing import integration
from tensorflow.python.eager import context # pylint: disable=ungrouped-imports
NUM_TRAIN_NEG = 4
......@@ -190,20 +187,6 @@ class NcfTest(tf.test.TestCase):
_BASE_END_TO_END_FLAGS = ['-batch_size', '1044', '-train_epochs', '1']
@unittest.skipIf(keras_utils.is_v2_0(), "TODO(b/136018594)")
@unittest.mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_estimator(self):
integration.run_synthetic(
ncf_estimator_main.main, tmp_root=self.get_temp_dir(),
extra_flags=self._BASE_END_TO_END_FLAGS)
@unittest.skipIf(keras_utils.is_v2_0(), "TODO(b/136018594)")
@unittest.mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_estimator_mlperf(self):
integration.run_synthetic(
ncf_estimator_main.main, tmp_root=self.get_temp_dir(),
extra_flags=self._BASE_END_TO_END_FLAGS + ['-ml_perf', 'True'])
@unittest.mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_keras_no_dist_strat(self):
integration.run_synthetic(
......
......@@ -126,7 +126,6 @@ def neumf_model_fn(features, labels, mode, params):
weights=tf.cast(valid_pt_mask, tf.float32)
)
# This tensor is used by logging hooks.
tf.identity(loss, name="cross_entropy")
global_step = tf.compat.v1.train.get_global_step()
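# Background note (not in the original file): naming a tensor, as with
# tf.identity(loss, name="cross_entropy") above, lets an Estimator logging hook
# fetch it by name, for example:
#
#   tf.estimator.LoggingTensorHook({"cross_entropy": "cross_entropy"},
#                                  every_n_iter=100)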
......
#!/bin/bash
set -e
# Example settings:
# export TPU="taylorrobie-tpu-0"
# export BUCKET="gs://taylorrobie-tpu-test-bucket-2"
# Remove IDE "not assigned" warning highlights.
TPU=${TPU:-""}
BUCKET=${BUCKET:-""}
if [[ -z ${TPU} ]]; then
echo "Please set 'TPU' to the name of the TPU to be used."
exit 1
fi
if [[ -z ${BUCKET} ]]; then
echo "Please set 'BUCKET' to the GCS bucket to be used."
exit 1
fi
./run.sh
......@@ -139,13 +139,10 @@ class StandardEvaluable(runnable.AbstractEvaluable):
eval_fn = tf.function(eval_fn)
self.eval_loop_fn = utils.create_loop_fn(eval_fn)
# TODO(b/147718615): When async RPC is enabled in eager runtime, we make
# eval iterator as a class member so it doesn't get destroyed when out of
# the function scope.
self.eval_iter = tf.nest.map_structure(iter, self.eval_dataset)
eval_iter = tf.nest.map_structure(iter, self.eval_dataset)
self.eval_begin()
self.eval_loop_fn(self.eval_iter, num_steps)
self.eval_loop_fn(eval_iter, num_steps)
return self.eval_end()
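# Sketch of the pattern above (illustrative, not part of the file): a fresh
# iterator is now built from the (possibly nested) eval dataset on every call,
# so evaluation always starts from the beginning of the data:
#
#   eval_iter = tf.nest.map_structure(iter, eval_dataset)
#   for _ in range(num_steps):
#     batch = tf.nest.map_structure(next, eval_iter)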
def eval_begin(self):
......
......@@ -94,6 +94,7 @@ def initialize_common_flags():
def strategy_flags_dict():
"""Returns TPU and/or GPU related flags in a dictionary."""
return {
'distribution_strategy': FLAGS.strategy_type,
# TPUStrategy related flags.
'tpu': FLAGS.tpu,
# MultiWorkerMirroredStrategy related flags.
......
......@@ -40,7 +40,7 @@ def _collective_communication(all_reduce_alg):
tf.distribute.experimental.CollectiveCommunication object
Raises:
ValueError: if `all_reduce_alg` not in [None, 'ring', 'nccl']
ValueError: if `all_reduce_alg` not in [None, "ring", "nccl"]
"""
collective_communication_options = {
None: tf.distribute.experimental.CollectiveCommunication.AUTO,
......@@ -50,7 +50,7 @@ def _collective_communication(all_reduce_alg):
if all_reduce_alg not in collective_communication_options:
raise ValueError(
"When used with `multi_worker_mirrored`, valid values for "
"all_reduce_alg are ['ring', 'nccl']. Supplied value: {}".format(
"all_reduce_alg are [`ring`, `nccl`]. Supplied value: {}".format(
all_reduce_alg))
return collective_communication_options[all_reduce_alg]
......@@ -66,7 +66,7 @@ def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
tf.distribute.CrossDeviceOps object or None.
Raises:
ValueError: if `all_reduce_alg` not in [None, 'nccl', 'hierarchical_copy'].
ValueError: if `all_reduce_alg` not in [None, "nccl", "hierarchical_copy"].
"""
if all_reduce_alg is None:
return None
......@@ -77,7 +77,7 @@ def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
if all_reduce_alg not in mirrored_all_reduce_options:
raise ValueError(
"When used with `mirrored`, valid values for all_reduce_alg are "
"['nccl', 'hierarchical_copy']. Supplied value: {}".format(
"[`nccl`, `hierarchical_copy`]. Supplied value: {}".format(
all_reduce_alg))
cross_device_ops_class = mirrored_all_reduce_options[all_reduce_alg]
return cross_device_ops_class(num_packs=num_packs)
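# Rough equivalents of what these helpers configure (values are illustrative):
#
#   tf.distribute.MirroredStrategy(
#       cross_device_ops=tf.distribute.NcclAllReduce(num_packs=2))
#
#   tf.distribute.experimental.MultiWorkerMirroredStrategy(
#       communication=tf.distribute.experimental.CollectiveCommunication.RING)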
......@@ -92,9 +92,9 @@ def get_distribution_strategy(distribution_strategy="mirrored",
Args:
distribution_strategy: a string specifying which distribution strategy to
use. Accepted values are 'off', 'one_device', 'mirrored',
'parameter_server', 'multi_worker_mirrored', and 'tpu' -- case insensitive.
'off' means not to use Distribution Strategy; 'tpu' means to use
use. Accepted values are "off", "one_device", "mirrored",
"parameter_server", "multi_worker_mirrored", and "tpu" -- case insensitive.
"off" means not to use Distribution Strategy; "tpu" means to use
TPUStrategy using `tpu_address`.
num_gpus: Number of GPUs to run this model.
all_reduce_alg: Optional. Specifies which algorithm to use when performing
......@@ -109,7 +109,7 @@ def get_distribution_strategy(distribution_strategy="mirrored",
Returns:
tf.distribute.DistributionStrategy object.
Raises:
ValueError: if `distribution_strategy` is 'off' or 'one_device' and
ValueError: if `distribution_strategy` is "off" or "one_device" and
`num_gpus` is larger than 1; or `num_gpus` is negative or if
`distribution_strategy` is `tpu` but `tpu_address` is not specified.
"""
......@@ -121,7 +121,7 @@ def get_distribution_strategy(distribution_strategy="mirrored",
if num_gpus > 1:
raise ValueError(
"When {} GPUs are specified, distribution_strategy "
"flag cannot be set to 'off'.".format(num_gpus))
"flag cannot be set to `off`.".format(num_gpus))
return None
if distribution_strategy == "tpu":
......@@ -157,141 +157,6 @@ def get_distribution_strategy(distribution_strategy="mirrored",
"Unrecognized Distribution Strategy: %r" % distribution_strategy)
def per_replica_batch_size(batch_size, num_gpus):
"""For multi-gpu, batch-size must be a multiple of the number of GPUs.
Note that distribution strategy handles this automatically when used with
Keras. When using Estimator, we need to compute the per-GPU batch size ourselves.
Args:
batch_size: Global batch size to be divided among devices. This should be
equal to num_gpus times the single-GPU batch_size for multi-gpu training.
num_gpus: How many GPUs are used with DistributionStrategies.
Returns:
Batch size per device.
Raises:
ValueError: if batch_size is not divisible by the number of devices.
"""
if num_gpus <= 1:
return batch_size
remainder = batch_size % num_gpus
if remainder:
err = ('When running with multiple GPUs, batch size '
'must be a multiple of the number of available GPUs. Found {} '
'GPUs with a batch size of {}; try --batch_size={} instead.'
).format(num_gpus, batch_size, batch_size - remainder)
raise ValueError(err)
return int(batch_size / num_gpus)
# The `SyntheticDataset` is a temporary solution for generating synthetic data
# directly on devices. It is only useful for Keras with Distribution
# Strategies. We will have better support in `tf.data` or Distribution Strategy
# later.
class SyntheticDataset(object):
"""A dataset that generates synthetic data on each device."""
def __init__(self, dataset, split_by=1):
# dataset.take(1) doesn't have GPU kernel.
with tf.device('device:CPU:0'):
tensor = tf.data.experimental.get_single_element(dataset.take(1))
flat_tensor = tf.nest.flatten(tensor)
variable_data = []
initializers = []
for t in flat_tensor:
rebatched_t = tf.split(t, num_or_size_splits=split_by, axis=0)[0]
assert rebatched_t.shape.is_fully_defined(), rebatched_t.shape
v = tf.compat.v1.get_local_variable(self._random_name(),
initializer=rebatched_t)
variable_data.append(v)
initializers.append(v.initializer)
input_data = tf.nest.pack_sequence_as(tensor, variable_data)
self._iterator = SyntheticIterator(input_data, initializers)
def _random_name(self, size=10, chars=string.ascii_uppercase + string.digits):
return ''.join(random.choice(chars) for _ in range(size))
def __iter__(self):
return self._iterator
def make_one_shot_iterator(self):
return self._iterator
def make_initializable_iterator(self):
return self._iterator
class SyntheticIterator(object):
"""A dataset that generates synthetic data on each device."""
def __init__(self, input_data, initializers):
self._input_data = input_data
self._initializers = initializers
def get_next(self):
return self._input_data
def next(self):
return self.__next__()
def __next__(self):
try:
return self.get_next()
except tf.errors.OutOfRangeError:
raise StopIteration
def initialize(self):
if tf.executing_eagerly():
return tf.no_op()
else:
return self._initializers
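# Rough usage sketch (not part of the original code): conceptually, a real
# dataset is wrapped so that every "batch" returned is the cached first
# element, taking the input pipeline out of benchmark measurements.
#
#   real_ds = tf.data.Dataset.from_tensor_slices(tf.zeros([64, 8])).batch(32)
#   synthetic = SyntheticDataset(real_ds)
#   batch = next(iter(synthetic))   # always the same cached batch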
def _monkey_patch_dataset_method(strategy):
"""Monkey-patch `strategy`'s `make_dataset_iterator` method."""
def make_dataset(self, dataset):
logging.info('Using pure synthetic data.')
with self.scope():
if self.extended._global_batch_size: # pylint: disable=protected-access
return SyntheticDataset(dataset, self.num_replicas_in_sync)
else:
return SyntheticDataset(dataset)
def make_iterator(self, dataset):
dist_dataset = make_dataset(self, dataset)
return iter(dist_dataset)
strategy.orig_make_dataset_iterator = strategy.make_dataset_iterator
strategy.make_dataset_iterator = make_iterator
strategy.orig_distribute_dataset = strategy.experimental_distribute_dataset
strategy.experimental_distribute_dataset = make_dataset
def _undo_monkey_patch_dataset_method(strategy):
if hasattr(strategy, 'orig_make_dataset_iterator'):
strategy.make_dataset_iterator = strategy.orig_make_dataset_iterator
if hasattr(strategy, 'orig_distribute_dataset'):
strategy.experimental_distribute_dataset = strategy.orig_distribute_dataset
def set_up_synthetic_data():
_monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
_monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
_monkey_patch_dataset_method(
tf.distribute.experimental.MultiWorkerMirroredStrategy)
def undo_set_up_synthetic_data():
_undo_monkey_patch_dataset_method(tf.distribute.OneDeviceStrategy)
_undo_monkey_patch_dataset_method(tf.distribute.MirroredStrategy)
_undo_monkey_patch_dataset_method(
tf.distribute.experimental.MultiWorkerMirroredStrategy)
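# Intended usage sketch (illustrative): patch the strategy classes before the
# dataset is distributed, then restore the original methods afterwards.
#
#   set_up_synthetic_data()
#   strategy = tf.distribute.MirroredStrategy()
#   dist_ds = strategy.experimental_distribute_dataset(real_ds)  # now synthetic
#   ...
#   undo_set_up_synthetic_data()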
def configure_cluster(worker_hosts=None, task_index=-1):
"""Set multi-worker cluster spec in TF_CONFIG environment variable.
......@@ -301,21 +166,21 @@ def configure_cluster(worker_hosts=None, task_index=-1):
Returns:
Number of workers in the cluster.
"""
tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
if tf_config:
num_workers = (len(tf_config['cluster'].get('chief', [])) +
len(tf_config['cluster'].get('worker', [])))
num_workers = (len(tf_config["cluster"].get("chief", [])) +
len(tf_config["cluster"].get("worker", [])))
elif worker_hosts:
workers = worker_hosts.split(',')
workers = worker_hosts.split(",")
num_workers = len(workers)
if num_workers > 1 and task_index < 0:
raise ValueError('Must specify task_index when number of workers > 1')
raise ValueError("Must specify task_index when number of workers > 1")
task_index = 0 if num_workers == 1 else task_index
os.environ['TF_CONFIG'] = json.dumps({
'cluster': {
'worker': workers
os.environ["TF_CONFIG"] = json.dumps({
"cluster": {
"worker": workers
},
'task': {'type': 'worker', 'index': task_index}
"task": {"type": "worker", "index": task_index}
})
else:
num_workers = 1
......
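# Example of the resulting environment (hosts are illustrative): calling
#   configure_cluster("10.0.0.1:5000,10.0.0.2:5000", task_index=0)
# on the first worker sets TF_CONFIG to
#   {"cluster": {"worker": ["10.0.0.1:5000", "10.0.0.2:5000"]},
#    "task": {"type": "worker", "index": 0}}
# and returns 2 as the number of workers.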
......@@ -45,21 +45,5 @@ class GetDistributionStrategyTest(tf.test.TestCase):
self.assertIn('GPU', device)
class PerReplicaBatchSizeTest(tf.test.TestCase):
"""Tests for per_replica_batch_size."""
def test_batch_size(self):
self.assertEquals(
distribution_utils.per_replica_batch_size(147, num_gpus=0), 147)
self.assertEquals(
distribution_utils.per_replica_batch_size(147, num_gpus=1), 147)
self.assertEquals(
distribution_utils.per_replica_batch_size(147, num_gpus=7), 21)
def test_batch_size_with_remainder(self):
with self.assertRaises(ValueError):
distribution_utils.per_replica_batch_size(147, num_gpus=5)
if __name__ == "__main__":
tf.test.main()
......@@ -164,6 +164,18 @@ def get_profiler_callback(model_dir, profile_steps, enable_tensorboard,
return ProfilerCallback(model_dir, start_step, stop_step, steps_per_epoch)
class SimpleCheckpoint(tf.keras.callbacks.Callback):
"""Keras callback to save tf.train.Checkpoints."""
def __init__(self, checkpoint_manager):
super(SimpleCheckpoint, self).__init__()
self.checkpoint_manager = checkpoint_manager
def on_epoch_end(self, epoch, logs=None):
step_counter = self.checkpoint_manager._step_counter.numpy() # pylint: disable=protected-access
self.checkpoint_manager.save(checkpoint_number=step_counter)
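# Usage sketch (names are illustrative): pair the callback with a
# tf.train.CheckpointManager created with a step counter, e.g.
#
#   ckpt = tf.train.Checkpoint(model=model, optimizer=optimizer)
#   manager = tf.train.CheckpointManager(
#       ckpt, directory=model_dir, max_to_keep=3,
#       step_counter=optimizer.iterations)  # read by on_epoch_end above
#   model.fit(train_ds, epochs=5, callbacks=[SimpleCheckpoint(manager)])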
class ProfilerCallback(tf.keras.callbacks.Callback):
"""Save profiles in specified step range to log directory."""
......
# Object Detection Models on TensorFlow 2.0
# Object Detection Models on TensorFlow 2
**Note**: The repo is still under construction. More features and instructions
will be added soon.
**Note**: This repository is still under construction.
More features and instructions will be added soon.
## Prerequisite
To get started, make sure to use TensorFlow 2.1+ on Google Cloud. Also, here are
a few packages you need to install to get started:
To get started, download the code from the TensorFlow models GitHub repository or
use the pre-installed Google Cloud VM.
```bash
sudo apt-get install -y python-tk && \
pip install Cython matplotlib opencv-python-headless pyyaml Pillow && \
pip install 'git+https://github.com/cocodataset/cocoapi#egg=pycocotools&subdirectory=PythonAPI'
git clone https://github.com/tensorflow/models.git
```
Next, download the code from the TensorFlow models GitHub repository or use the
pre-installed Google Cloud VM.
Next, make sure to use TensorFlow 2.1+ on Google Cloud. Also, here are
a few packages you need to install to get started:
```bash
git clone https://github.com/tensorflow/models.git
sudo apt-get install -y python-tk && \
pip3 install -r ~/models/official/requirements.txt
```
## Train RetinaNet on TPU
### Train a vanilla ResNet-50 based RetinaNet.
```bash
......@@ -30,7 +30,7 @@ RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python ~/models/official/vision/detection/main.py \
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
......@@ -60,7 +60,7 @@ following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python ~/models/official/vision/detection/main.py \
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
......@@ -86,7 +86,6 @@ python3 ~/models/official/vision/detection/main.py \
--config_file="my_retinanet.yaml"
```
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
......@@ -123,6 +122,118 @@ use_tpu: False
"
```
---
## Train Mask R-CNN on TPU
### Train a vanilla ResNet-50 based Mask R-CNN.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } }"
```
### Train a custom Mask R-CNN using the config file.
First, create a YAML config file, e.g. *my_maskrcnn.yaml*.
This file specifies the parameters to be overridden,
which should at least include the following fields.
```YAML
# my_maskrcnn.yaml
train:
train_file_pattern: <path to the TFRecord training data>
eval:
eval_file_pattern: <path to the TFRecord validation data>
val_json_file: <path to the validation annotation JSON file>
```
Once the YAML config file is created, you can launch the training using the
following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
## Train Mask R-CNN on GPU
Training on GPU is similar to training on TPU. The major change is the strategy type
(use
"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
for multiple GPUs and
"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
for a single GPU).
Multi-GPU example (assuming there are 8 GPUs connected to the host):
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=mirrored \
--num_gpus=8 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
Single-GPU example:
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=one_device \
--num_gpus=1 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
An example with inline configuration (YAML or JSON format):
```bash
python3 ~/models/official/vision/detection/main.py \
--model_dir=<model folder> \
--strategy_type=one_device \
--num_gpus=1 \
--mode=train \
--model=mask_rcnn \
--params_override="eval:
eval_file_pattern: <Eval TFRecord file pattern>
batch_size: 8
val_json_file: <COCO format groundtruth JSON file>
predict:
predict_batch_size: 8
architecture:
use_bfloat16: False
maskrcnn_parser:
use_bfloat16: False
train:
total_steps: 1000
batch_size: 8
train_file_pattern: <Train TFRecord file pattern>
use_tpu: False
"
```
Note: The JSON groundtruth file is useful for the [COCO dataset](http://cocodataset.org/#home) and can be
downloaded from the [COCO website](http://cocodataset.org/#download). For custom datasets, it is unnecessary because the groundtruth can be included in the TFRecord files.
......
......@@ -14,8 +14,16 @@
# ==============================================================================
"""Base config template."""
# pylint: disable=line-too-long
BACKBONES = [
'resnet',
]
MULTILEVEL_FEATURES = [
'fpn',
]
# pylint: disable=line-too-long
# For ResNet, this freezes the variables of the first conv1 and conv2_x
# layers [1], which leads to higher training speed and slightly better testing
# accuracy. The intuition is that the low-level architecture (e.g., ResNet-50)
......@@ -24,7 +32,6 @@
# Note that we need the trailing `/` to avoid an incorrect match.
# [1]: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L198
RESNET_FROZEN_VAR_PREFIX = r'(resnet\d+)\/(conv2d(|_([1-9]|10))|batch_normalization(|_([1-9]|10)))\/'
REGULARIZATION_VAR_REGEX = r'.*(kernel|weight):0$'
BASE_CFG = {
......@@ -41,6 +48,7 @@ BASE_CFG = {
'optimizer': {
'type': 'momentum',
'momentum': 0.9,
'nesterov': True, # `False` is better for TPU v3-128.
},
'learning_rate': {
'type': 'step',
......@@ -49,21 +57,25 @@ BASE_CFG = {
'init_learning_rate': 0.08,
'learning_rate_levels': [0.008, 0.0008],
'learning_rate_steps': [15000, 20000],
'total_steps': 22500,
},
'checkpoint': {
'path': '',
'prefix': '',
},
'frozen_variable_prefix': RESNET_FROZEN_VAR_PREFIX,
# One can use 'RESNET_FROZEN_VAR_PREFIX' to speed up ResNet training
# when loading from the checkpoint.
'frozen_variable_prefix': '',
'train_file_pattern': '',
'train_dataset_type': 'tfrecord',
# TODO(b/142174042): Support transpose_input option.
'transpose_input': False,
'regularization_variable_regex': REGULARIZATION_VAR_REGEX,
'l2_weight_decay': 0.0001,
'gradient_clip_norm': 0.0,
'input_sharding': False,
},
'eval': {
'input_sharding': True,
'batch_size': 8,
'eval_samples': 5000,
'min_eval_interval': 180,
......@@ -74,38 +86,42 @@ BASE_CFG = {
'val_json_file': '',
'eval_file_pattern': '',
'eval_dataset_type': 'tfrecord',
# When visualizing images, set evaluation batch size to 40 to avoid
# potential OOM.
'num_images_to_visualize': 0,
},
'predict': {
'batch_size': 8,
},
'anchor': {
'architecture': {
'backbone': 'resnet',
'min_level': 3,
'max_level': 7,
'multilevel_features': 'fpn',
'use_bfloat16': True,
# Note that `num_classes` is the total number of classes including
# one background class whose index is 0.
'num_classes': 91,
},
'anchor': {
'num_scales': 3,
'aspect_ratios': [1.0, 2.0, 0.5],
'anchor_size': 4.0,
},
'norm_activation': {
'activation': 'relu',
'batch_norm_momentum': 0.997,
'batch_norm_epsilon': 1e-4,
'batch_norm_trainable': True,
'use_sync_bn': False,
},
'resnet': {
'resnet_depth': 50,
'batch_norm': {
'batch_norm_momentum': 0.997,
'batch_norm_epsilon': 1e-4,
'batch_norm_trainable': True,
'use_sync_bn': False,
},
},
'fpn': {
'min_level': 3,
'max_level': 7,
'fpn_feat_dims': 256,
'use_separable_conv': False,
'use_batch_norm': True,
'batch_norm': {
'batch_norm_momentum': 0.997,
'batch_norm_epsilon': 1e-4,
'batch_norm_trainable': True,
'use_sync_bn': False,
},
},
'postprocess': {
'use_batched_nms': False,
......@@ -116,5 +132,4 @@ BASE_CFG = {
},
'enable_summary': False,
}
# pylint: enable=line-too-long
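# Rough mental model of how a derived config (e.g. the Mask R-CNN config below)
# overrides these defaults; the real override helper lives elsewhere in this
# codebase, and the function here only illustrates deep-merge semantics:
#
#   def deep_override(base, overrides):
#     for key, value in overrides.items():
#       if isinstance(value, dict) and isinstance(base.get(key), dict):
#         deep_override(base[key], value)
#       else:
#         base[key] = value   # scalars and lists replace the defaults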
......@@ -28,13 +28,12 @@ MASKRCNN_CFG.override({
},
'architecture': {
'parser': 'maskrcnn_parser',
'backbone': 'resnet',
'multilevel_features': 'fpn',
'use_bfloat16': True,
'min_level': 2,
'max_level': 6,
'include_mask': True,
'mask_target_size': 28,
},
'maskrcnn_parser': {
'use_bfloat16': True,
'output_size': [1024, 1024],
'num_channels': 3,
'rpn_match_threshold': 0.7,
......@@ -46,74 +45,32 @@ MASKRCNN_CFG.override({
'aug_scale_max': 1.0,
'skip_crowd_during_training': True,
'max_num_instances': 100,
'include_mask': True,
'mask_crop_size': 112,
},
'anchor': {
'min_level': 2,
'max_level': 6,
'num_scales': 1,
'anchor_size': 8,
},
'fpn': {
'min_level': 2,
'max_level': 6,
},
'nasfpn': {
'min_level': 2,
'max_level': 6,
},
# tunable_nasfpn:strip_begin
'tunable_nasfpn_v1': {
'min_level': 2,
'max_level': 6,
},
# tunable_nasfpn:strip_end
'rpn_head': {
'min_level': 2,
'max_level': 6,
'anchors_per_location': 3,
'num_convs': 2,
'num_filters': 256,
'use_separable_conv': False,
'use_batch_norm': False,
'batch_norm': {
'batch_norm_momentum': 0.997,
'batch_norm_epsilon': 1e-4,
'batch_norm_trainable': True,
'use_sync_bn': False,
},
},
'frcnn_head': {
# Note that `num_classes` is the total number of classes including
# one background class whose index is 0.
'num_classes': 91,
'num_convs': 0,
'num_filters': 256,
'use_separable_conv': False,
'num_fcs': 2,
'fc_dims': 1024,
'use_batch_norm': False,
'batch_norm': {
'batch_norm_momentum': 0.997,
'batch_norm_epsilon': 1e-4,
'batch_norm_trainable': True,
'use_sync_bn': False,
},
},
'mrcnn_head': {
'num_classes': 91,
'mask_target_size': 28,
'num_convs': 4,
'num_filters': 256,
'use_separable_conv': False,
'use_batch_norm': False,
'batch_norm': {
'batch_norm_momentum': 0.997,
'batch_norm_epsilon': 1e-4,
'batch_norm_trainable': True,
'use_sync_bn': False,
},
},
'rpn_score_loss': {
'rpn_batch_size_per_im': 256,
......@@ -147,23 +104,10 @@ MASKRCNN_CFG.override({
},
'mask_sampling': {
'num_mask_samples_per_image': 128, # Typically = `num_samples_per_image` * `fg_fraction`.
'mask_target_size': 28,
},
'postprocess': {
'use_batched_nms': False,
'max_total_size': 100,
'nms_iou_threshold': 0.5,
'score_threshold': 0.05,
'pre_nms_num_boxes': 1000,
},
}, is_strict=False)
MASKRCNN_RESTRICTIONS = [
'architecture.use_bfloat16 == maskrcnn_parser.use_bfloat16',
'architecture.include_mask == maskrcnn_parser.include_mask',
'anchor.min_level == rpn_head.min_level',
'anchor.max_level == rpn_head.max_level',
'mrcnn_head.mask_target_size == mask_sampling.mask_target_size',
]
# pylint: enable=line-too-long
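# Illustration of what a restriction such as
# 'anchor.min_level == rpn_head.min_level' is expected to enforce; the real
# validation lives in the config utilities, and this is only a sketch:
#
#   def check_restriction(cfg, restriction):
#     left, right = (s.strip() for s in restriction.split('=='))
#     def lookup(path):
#       node = cfg
#       for part in path.split('.'):
#         node = node[part]
#       return node
#     assert lookup(left) == lookup(right), restriction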