Unverified Commit 09d9656f authored by Srihari Humbarwadi's avatar Srihari Humbarwadi Committed by GitHub
Browse files

Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

parents ac671306 49a5706c
......@@ -58,12 +58,12 @@ In the near future, we will add:
| Model | Reference (Paper) |
|-------|-------------------|
| [ALBERT (A Lite BERT)](nlp/albert) | [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) |
| [BERT (Bidirectional Encoder Representations from Transformers)](nlp/bert) | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) |
| [ALBERT (A Lite BERT)](nlp/MODEL_GARDEN.md#available-model-configs) | [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) |
| [BERT (Bidirectional Encoder Representations from Transformers)](nlp/MODEL_GARDEN.md#available-model-configs) | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) |
| [NHNet (News Headline generation model)](projects/nhnet) | [Generating Representative Headlines for News Stories](https://arxiv.org/abs/2001.09386) |
| [Transformer](nlp/transformer) | [Attention Is All You Need](https://arxiv.org/abs/1706.03762) |
| [Transformer](nlp/MODEL_GARDEN.md#available-model-configs) | [Attention Is All You Need](https://arxiv.org/abs/1706.03762) |
| [XLNet](nlp/xlnet) | [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) |
| [MobileBERT](nlp/projects/mobilebert) | [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) |
| [MobileBERT](projects/mobilebert) | [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) |
### Recommendation
......
......@@ -14,10 +14,13 @@
"""Tests for distribution util functions."""
import sys
import tensorflow as tf
from official.common import distribute_utils
TPU_TEST = 'test_tpu' in sys.argv[0]
class DistributeUtilsTest(tf.test.TestCase):
"""Tests for distribute util functions."""
......@@ -51,6 +54,9 @@ class DistributeUtilsTest(tf.test.TestCase):
self.assertIn('GPU', ds.extended.worker_devices[0])
def test_mirrored_strategy(self):
# CPU only.
_ = distribute_utils.get_distribution_strategy(num_gpus=0)
# 5 GPUs.
ds = distribute_utils.get_distribution_strategy(num_gpus=5)
self.assertEquals(ds.num_replicas_in_sync, 5)
self.assertEquals(len(ds.extended.worker_devices), 5)
......@@ -78,10 +84,26 @@ class DistributeUtilsTest(tf.test.TestCase):
self.assertIsInstance(
ds, tf.distribute.experimental.MultiWorkerMirroredStrategy)
with self.assertRaisesRegex(
ValueError,
'When used with `multi_worker_mirrored`, valid values.*'):
_ = distribute_utils.get_distribution_strategy(
'multi_worker_mirrored', all_reduce_alg='dummy')
def test_no_strategy(self):
ds = distribute_utils.get_distribution_strategy('off')
self.assertIs(ds, tf.distribute.get_strategy())
def test_tpu_strategy(self):
if not TPU_TEST:
self.skipTest('Only Cloud TPU VM instances can have local TPUs.')
with self.assertRaises(ValueError):
_ = distribute_utils.get_distribution_strategy('tpu')
ds = distribute_utils.get_distribution_strategy('tpu', tpu_address='local')
self.assertIsInstance(
ds, tf.distribute.TPUStrategy)
def test_invalid_strategy(self):
with self.assertRaisesRegexp(
ValueError,
......
......@@ -28,7 +28,7 @@ from official.core import config_definitions
from official.modeling import optimization
class PruningActions:
class PruningAction:
"""Train action to updates pruning related information.
This action updates pruning steps at the end of trainig loop, and log
......@@ -66,7 +66,7 @@ class PruningActions:
"""Update pruning step and log pruning summaries.
Args:
output: The train output to test.
output: The train output.
"""
self.update_pruning_step.on_epoch_end(batch=None)
self.pruning_summaries.on_epoch_begin(epoch=None)
......@@ -81,8 +81,11 @@ class EMACheckpointing:
than training.
"""
def __init__(self, export_dir: str, optimizer: tf.keras.optimizers.Optimizer,
checkpoint: tf.train.Checkpoint, max_to_keep: int = 1):
def __init__(self,
export_dir: str,
optimizer: tf.keras.optimizers.Optimizer,
checkpoint: tf.train.Checkpoint,
max_to_keep: int = 1):
"""Initializes the instance.
Args:
......@@ -99,8 +102,7 @@ class EMACheckpointing:
'EMACheckpointing action')
export_dir = os.path.join(export_dir, 'ema_checkpoints')
tf.io.gfile.makedirs(
os.path.dirname(export_dir))
tf.io.gfile.makedirs(os.path.dirname(export_dir))
self._optimizer = optimizer
self._checkpoint = checkpoint
self._checkpoint_manager = tf.train.CheckpointManager(
......@@ -113,7 +115,7 @@ class EMACheckpointing:
"""Swaps model weights, and saves the checkpoint.
Args:
output: The train or eval output to test.
output: The train or eval output.
"""
self._optimizer.swap_weights()
self._checkpoint_manager.save(checkpoint_number=self._optimizer.iterations)
......@@ -173,10 +175,9 @@ class RecoveryCondition:
@gin.configurable
def get_eval_actions(
params: config_definitions.ExperimentConfig,
trainer: base_trainer.Trainer,
model_dir: str) -> List[orbit.Action]:
def get_eval_actions(params: config_definitions.ExperimentConfig,
trainer: base_trainer.Trainer,
model_dir: str) -> List[orbit.Action]:
"""Gets eval actions for TFM trainer."""
eval_actions = []
# Adds ema checkpointing action to save the average weights under
......@@ -202,7 +203,7 @@ def get_train_actions(
# Adds pruning callback actions.
if hasattr(params.task, 'pruning'):
train_actions.append(
PruningActions(
PruningAction(
export_dir=model_dir,
model=trainer.model,
optimizer=trainer.optimizer))
......
......@@ -27,14 +27,16 @@ from official.core import actions
from official.modeling import optimization
class TestModel(tf.Module):
class TestModel(tf.keras.Model):
def __init__(self):
self.value = tf.Variable(0)
super().__init__()
self.value = tf.Variable(0.0)
self.dense = tf.keras.layers.Dense(2)
_ = self.dense(tf.zeros((2, 2), tf.float32))
@tf.function(input_signature=[])
def __call__(self):
return self.value
def call(self, x, training=None):
return self.value + x
class ActionsTest(tf.test.TestCase, parameterized.TestCase):
......@@ -43,7 +45,7 @@ class ActionsTest(tf.test.TestCase, parameterized.TestCase):
combinations.combine(
distribution=[
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
strategy_combinations.one_device_strategy,
],))
def test_ema_checkpointing(self, distribution):
with distribution.scope():
......@@ -62,18 +64,25 @@ class ActionsTest(tf.test.TestCase, parameterized.TestCase):
model.value.assign(3)
# Checks model.value is 3
self.assertEqual(model(), 3)
self.assertEqual(model(0.), 3)
ema_action = actions.EMACheckpointing(directory, optimizer, checkpoint)
ema_action({})
self.assertNotEmpty(
tf.io.gfile.glob(os.path.join(directory, 'ema_checkpoints')))
checkpoint.read(tf.train.latest_checkpoint(
os.path.join(directory, 'ema_checkpoints')))
checkpoint.read(
tf.train.latest_checkpoint(
os.path.join(directory, 'ema_checkpoints')))
# Checks model.value is 0 after swapping.
self.assertEqual(model(), 0)
self.assertEqual(model(0.), 0)
# Raises an error for a normal optimizer.
with self.assertRaisesRegex(ValueError,
'Optimizer has to be instance of.*'):
_ = actions.EMACheckpointing(directory, tf.keras.optimizers.SGD(),
checkpoint)
@combinations.generate(
combinations.combine(
......@@ -102,6 +111,21 @@ class ActionsTest(tf.test.TestCase, parameterized.TestCase):
with self.assertRaises(RuntimeError):
recover_condition(outputs)
@combinations.generate(
combinations.combine(
distribution=[
strategy_combinations.one_device_strategy_gpu,
strategy_combinations.one_device_strategy,
],))
def test_pruning(self, distribution):
with distribution.scope():
directory = self.get_temp_dir()
model = TestModel()
optimizer = tf.keras.optimizers.SGD()
pruning = actions.PruningAction(directory, model, optimizer)
pruning({})
if __name__ == '__main__':
tf.test.main()
......@@ -247,14 +247,12 @@ class Trainer(_AsyncTrainer):
self._validation_loss = tf.keras.metrics.Mean(
"validation_loss", dtype=tf.float32)
model_metrics = model.metrics if hasattr(model, "metrics") else []
self._train_metrics = self.task.build_metrics(
training=True) + model_metrics
self._validation_metrics = self.task.build_metrics(
training=False) + model_metrics
self.init_async()
if train:
self._train_metrics = self.task.build_metrics(
training=True) + model_metrics
train_dataset = train_dataset or self.distribute_dataset(
self.task.build_inputs, self.config.task.train_data)
orbit.StandardTrainer.__init__(
......@@ -266,6 +264,8 @@ class Trainer(_AsyncTrainer):
use_tpu_summary_optimization=config.trainer.allow_tpu_summary))
if evaluate:
self._validation_metrics = self.task.build_metrics(
training=False) + model_metrics
validation_dataset = validation_dataset or self.distribute_dataset(
self.task.build_inputs, self.config.task.validation_data)
orbit.StandardEvaluator.__init__(
......@@ -370,16 +370,6 @@ class Trainer(_AsyncTrainer):
"""Accesses the training checkpoint."""
return self._checkpoint
# TODO(yejiayu): Remove this once all deps are fixed.
def add_recovery(self, params: TrainerConfig,
checkpoint_manager: tf.train.CheckpointManager):
if params.recovery_max_trials >= 0:
self._recovery = Recovery(
loss_upper_bound=params.loss_upper_bound,
recovery_begin_steps=params.recovery_begin_steps,
recovery_max_trials=params.recovery_max_trials,
checkpoint_manager=checkpoint_manager)
def train_loop_end(self):
"""See base class."""
self.join()
......
......@@ -16,10 +16,13 @@
import abc
import functools
import time
from typing import Any, Callable, Dict, Mapping, List, Optional, Text, Union
from absl import logging
import tensorflow as tf
from tensorflow.python.saved_model.model_utils import export_utils
MAX_DIRECTORY_CREATION_ATTEMPTS = 10
class ExportModule(tf.Module, metaclass=abc.ABCMeta):
......@@ -89,7 +92,8 @@ def export(export_module: ExportModule,
export_savedmodel_dir: Text,
checkpoint_path: Optional[Text] = None,
timestamped: bool = True,
save_options: Optional[tf.saved_model.SaveOptions] = None) -> Text:
save_options: Optional[tf.saved_model.SaveOptions] = None,
checkpoint: Optional[tf.train.Checkpoint] = None) -> Text:
"""Exports to SavedModel format.
Args:
......@@ -101,6 +105,8 @@ def export(export_module: ExportModule,
checkpoint_path: Object-based checkpoint path or directory.
timestamped: Whether to export the savedmodel to a timestamped directory.
save_options: `SaveOptions` for `tf.saved_model.save`.
checkpoint: An optional tf.train.Checkpoint. If provided, the export module
will use it to read the weights.
Returns:
The savedmodel directory path.
......@@ -109,7 +115,8 @@ def export(export_module: ExportModule,
if ckpt_dir_or_file is not None and tf.io.gfile.isdir(ckpt_dir_or_file):
ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
if ckpt_dir_or_file:
checkpoint = tf.train.Checkpoint(model=export_module.model)
if checkpoint is None:
checkpoint = tf.train.Checkpoint(model=export_module.model)
checkpoint.read(
ckpt_dir_or_file).assert_existing_objects_matched().expect_partial()
if isinstance(function_keys, list):
......@@ -119,15 +126,48 @@ def export(export_module: ExportModule,
}
else:
raise ValueError(
"If the function_keys is a list, it must contain a single element. %s"
'If the function_keys is a list, it must contain a single element. %s'
% function_keys)
signatures = export_module.get_inference_signatures(function_keys)
if timestamped:
export_dir = export_utils.get_timestamped_export_dir(
export_savedmodel_dir).decode("utf-8")
export_dir = get_timestamped_export_dir(export_savedmodel_dir).decode(
'utf-8')
else:
export_dir = export_savedmodel_dir
tf.saved_model.save(
export_module, export_dir, signatures=signatures, options=save_options)
return export_dir
def get_timestamped_export_dir(export_dir_base):
"""Builds a path to a new subdirectory within the base directory.
Args:
export_dir_base: A string containing a directory to write the exported graph
and checkpoints.
Returns:
The full path of the new subdirectory (which is not actually created yet).
Raises:
RuntimeError: if repeated attempts fail to obtain a unique timestamped
directory name.
"""
attempts = 0
while attempts < MAX_DIRECTORY_CREATION_ATTEMPTS:
timestamp = int(time.time())
result_dir = tf.io.gfile.join(
tf.compat.as_bytes(export_dir_base), tf.compat.as_bytes(str(timestamp)))
if not tf.io.gfile.exists(result_dir):
# Collisions are still possible (though extremely unlikely): this
# directory is not actually created yet, but it will be almost
# instantly on return from this function.
return result_dir
time.sleep(1)
attempts += 1
logging.warning('Directory %s already exists; retrying (attempt %s/%s)',
str(result_dir), attempts, MAX_DIRECTORY_CREATION_ATTEMPTS)
raise RuntimeError('Failed to obtain a unique export directory name after '
f'{MAX_DIRECTORY_CREATION_ATTEMPTS} attempts.')
......@@ -121,6 +121,13 @@ class ExportBaseTest(tf.test.TestCase):
output = module.serve(inputs)
self.assertAllClose(output['outputs'].numpy(), 1.11)
def test_get_timestamped_export_dir(self):
export_dir = self.get_temp_dir()
timed_dir = export_base.get_timestamped_export_dir(
export_dir_base=export_dir)
self.assertFalse(tf.io.gfile.exists(timed_dir))
self.assertIn(export_dir, str(timed_dir))
if __name__ == '__main__':
tf.test.main()
......@@ -14,13 +14,13 @@
"""Training utils."""
import copy
import dataclasses
import json
import os
import pprint
from typing import Any, Callable, Dict, List, Optional, Union
from absl import logging
import dataclasses
import gin
import orbit
import tensorflow as tf
......@@ -244,49 +244,87 @@ class ParseConfigOptions:
return name in dataclasses.asdict(self)
def parse_configuration(flags_obj, lock_return=True, print_return=True):
"""Parses ExperimentConfig from flags."""
class ExperimentParser:
"""Constructs the Experiment config from Flags or equivalent object.
Most of the cases, users only need to call the `parse()` function:
```
builder = ExperimentParser(FLAGS)
params = builder.parse()
```
if flags_obj.experiment is None:
raise ValueError('The flag --experiment must be specified.')
# 1. Get the default config from the registered experiment.
params = exp_factory.get_exp_config(flags_obj.experiment)
# 2. Get the first level of override from `--config_file`.
# `--config_file` is typically used as a template that specifies the common
# override for a particular experiment.
for config_file in flags_obj.config_file or []:
params = hyperparams.override_params_dict(
params, config_file, is_strict=True)
# 3. Override the TPU address and tf.data service address.
params.override({
'runtime': {
'tpu': flags_obj.tpu,
},
})
if ('tf_data_service' in flags_obj and flags_obj.tf_data_service and
isinstance(params.task, config_definitions.TaskConfig)):
The advanced users can modify the flow by calling the parse_*() functions
separately.
"""
def __init__(self, flags_obj):
self._flags_obj = flags_obj
def parse(self):
"""Overrall process of constructing Experiment config."""
params = self.base_experiment()
params = self.parse_config_file(params)
params = self.parse_runtime(params)
params = self.parse_data_service(params)
params = self.parse_params_override(params)
return params
def base_experiment(self):
"""Get the base experiment config from --experiment field."""
if self._flags_obj.experiment is None:
raise ValueError('The flag --experiment must be specified.')
return exp_factory.get_exp_config(self._flags_obj.experiment)
def parse_config_file(self, params):
"""Override the configs of params from the config_file."""
for config_file in self._flags_obj.config_file or []:
params = hyperparams.override_params_dict(
params, config_file, is_strict=True)
return params
def parse_runtime(self, params):
"""Override the runtime configs of params from flags."""
# Override the TPU address and tf.data service address.
params.override({
'task': {
'train_data': {
'tf_data_service_address': flags_obj.tf_data_service,
},
'validation_data': {
'tf_data_service_address': flags_obj.tf_data_service,
}
}
'runtime': {
'tpu': self._flags_obj.tpu,
},
})
return params
def parse_data_service(self, params):
"""Override the data service configs of params from flags."""
if ('tf_data_service' in self._flags_obj and
self._flags_obj.tf_data_service and
isinstance(params.task, config_definitions.TaskConfig)):
params.override({
'task': {
'train_data': {
'tf_data_service_address': self._flags_obj.tf_data_service,
},
'validation_data': {
'tf_data_service_address': self._flags_obj.tf_data_service,
}
}
})
return params
def parse_params_override(self, params):
# Get the second level of override from `--params_override`.
# `--params_override` is typically used as a further override over the
# template. For example, one may define a particular template for training
# ResNet50 on ImageNet in a config file and pass it via `--config_file`,
# then define different learning rates and pass it via `--params_override`.
if self._flags_obj.params_override:
params = hyperparams.override_params_dict(
params, self._flags_obj.params_override, is_strict=True)
return params
def parse_configuration(flags_obj, lock_return=True, print_return=True):
"""Parses ExperimentConfig from flags."""
# 4. Get the second level of override from `--params_override`.
# `--params_override` is typically used as a further override over the
# template. For example, one may define a particular template for training
# ResNet50 on ImageNet in a config file and pass it via `--config_file`,
# then define different learning rates and pass it via `--params_override`.
if flags_obj.params_override:
params = hyperparams.override_params_dict(
params, flags_obj.params_override, is_strict=True)
params = ExperimentParser(flags_obj).parse()
params.validate()
if lock_return:
......
......@@ -13,14 +13,38 @@
# limitations under the License.
"""Tests for official.core.train_utils."""
import json
import os
import pprint
import numpy as np
import tensorflow as tf
from official.core import exp_factory
from official.core import test_utils
from official.core import train_utils
from official.modeling import hyperparams
@exp_factory.register_config_factory('foo')
def foo():
"""Multitask experiment for test."""
experiment_config = hyperparams.Config(
default_params={
'runtime': {
'tpu': 'fake',
},
'task': {
'model': {
'model_id': 'bar',
},
},
'trainer': {
'train_steps': -1,
'validation_steps': -1,
},
})
return experiment_config
class TrainUtilsTest(tf.test.TestCase):
......@@ -93,6 +117,82 @@ class TrainUtilsTest(tf.test.TestCase):
]
self.assertEqual(actual, expected)
def test_construct_experiment_from_flags(self):
options = train_utils.ParseConfigOptions(
experiment='foo',
config_file=[],
tpu='bar',
tf_data_service='',
params_override='task.model.model_id=new,'
'trainer.train_steps=10,'
'trainer.validation_steps=11')
builder = train_utils.ExperimentParser(options)
params_from_obj = builder.parse()
params_from_func = train_utils.parse_configuration(options)
pp = pprint.PrettyPrinter()
self.assertEqual(
pp.pformat(params_from_obj.as_dict()),
pp.pformat(params_from_func.as_dict()))
self.assertEqual(params_from_obj.runtime.tpu, 'bar')
self.assertEqual(params_from_obj.task.model.model_id, 'new')
self.assertEqual(params_from_obj.trainer.train_steps, 10)
self.assertEqual(params_from_obj.trainer.validation_steps, 11)
class BestCheckpointExporterTest(tf.test.TestCase):
def test_maybe_export(self):
model_dir = self.create_tempdir().full_path
best_ckpt_path = os.path.join(model_dir, 'best_ckpt-1')
metric_name = 'test_metric|metric_1'
exporter = train_utils.BestCheckpointExporter(
model_dir, metric_name, 'higher')
v = tf.Variable(1.0)
checkpoint = tf.train.Checkpoint(v=v)
ret = exporter.maybe_export_checkpoint(
checkpoint, {'test_metric': {'metric_1': 5.0}}, 100)
with self.subTest(name='Successful first save.'):
self.assertEqual(ret, True)
v_2 = tf.Variable(2.0)
checkpoint_2 = tf.train.Checkpoint(v=v_2)
checkpoint_2.restore(best_ckpt_path)
self.assertEqual(v_2.numpy(), 1.0)
v = tf.Variable(3.0)
checkpoint = tf.train.Checkpoint(v=v)
ret = exporter.maybe_export_checkpoint(
checkpoint, {'test_metric': {'metric_1': 6.0}}, 200)
with self.subTest(name='Successful better metic save.'):
self.assertEqual(ret, True)
v_2 = tf.Variable(2.0)
checkpoint_2 = tf.train.Checkpoint(v=v_2)
checkpoint_2.restore(best_ckpt_path)
self.assertEqual(v_2.numpy(), 3.0)
v = tf.Variable(5.0)
checkpoint = tf.train.Checkpoint(v=v)
ret = exporter.maybe_export_checkpoint(
checkpoint, {'test_metric': {'metric_1': 1.0}}, 300)
with self.subTest(name='Worse metic no save.'):
self.assertEqual(ret, False)
v_2 = tf.Variable(2.0)
checkpoint_2 = tf.train.Checkpoint(v=v_2)
checkpoint_2.restore(best_ckpt_path)
self.assertEqual(v_2.numpy(), 3.0)
def test_export_best_eval_metric(self):
model_dir = self.create_tempdir().full_path
metric_name = 'test_metric|metric_1'
exporter = train_utils.BestCheckpointExporter(model_dir, metric_name,
'higher')
exporter.export_best_eval_metric({'test_metric': {'metric_1': 5.0}}, 100)
with tf.io.gfile.GFile(os.path.join(model_dir, 'info.json'),
'rb') as reader:
metric = json.loads(reader.read())
self.assertAllEqual(
metric,
{'test_metric': {'metric_1': 5.0}, 'best_ckpt_global_step': 100.0})
if __name__ == '__main__':
tf.test.main()
# ALBERT (ALBERT: A Lite BERT for Self-supervised Learning of Language Representations)
**WARNING**: This directory is deprecated.
See `nlp/docs/MODEL_GARDEN.md` for the new ALBERT implementation.
# Object Detection Models on TensorFlow 2
**WARNING**: This repository will be deprecated and replaced by the solid
implementations inside vision/beta/.
## Prerequsite
To get started, download the code from TensorFlow models GitHub repository or
use the pre-installed Google Cloud VM.
```bash
git clone https://github.com/tensorflow/models.git
```
Next, make sure to use TensorFlow 2.1+ on Google Cloud. Also here are
a few package you need to install to get started:
```bash
sudo apt-get install -y python-tk && \
pip3 install -r ~/models/official/requirements.txt
```
## Train RetinaNet on TPU
### Train a vanilla ResNet-50 based RetinaNet.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--params_override="{ type: retinanet, train: { checkpoint: { path: ${RESNET_CHECKPOINT?}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
```
The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
Note: The ResNet implementation under
[detection/](https://github.com/tensorflow/models/tree/master/official/legacy/detection)
is currently different from the one under
[classification/](https://github.com/tensorflow/models/tree/master/official/vision/image_classification),
so the checkpoints are not compatible.
We will unify the implementation soon.
### Train a SpineNet-49 based RetinaNet.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--params_override="{ type: retinanet, architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
```
### Train a custom RetinaNet using the config file.
First, create a YAML config file, e.g. *my_retinanet.yaml*. This file specifies
the parameters to be overridden, which should at least include the following
fields.
```YAML
# my_retinanet.yaml
type: 'retinanet'
train:
train_file_pattern: <path to the TFRecord training data>
eval:
eval_file_pattern: <path to the TFRecord validation data>
val_json_file: <path to the validation annotation JSON file>
```
Once the YAML config file is created, you can launch the training using the
following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--config_file="my_retinanet.yaml"
```
## Train RetinaNet on GPU
Training on GPU is similar to that on TPU. The major change is the strategy
type (use "[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)" for multiple GPU and
"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)" for single GPU).
Multi-GPUs example (assuming there are 8GPU connected to the host):
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=mirrored \
--num_gpus=8 \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--config_file="my_retinanet.yaml"
```
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=one_device \
--num_gpus=1 \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--config_file="my_retinanet.yaml"
```
An example with inline configuration (YAML or JSON format):
```
python3 ~/models/official/legacy/detection/main.py \
--model_dir=<model folder> \
--strategy_type=one_device \
--num_gpus=1 \
--mode=train \
--params_override="eval:
eval_file_pattern: <Eval TFRecord file pattern>
batch_size: 8
val_json_file: <COCO format groundtruth JSON file>
predict:
predict_batch_size: 8
architecture:
use_bfloat16: False
train:
total_steps: 1
batch_size: 8
train_file_pattern: <Eval TFRecord file pattern>
use_tpu: False
"
```
---
## Train Mask R-CNN on TPU
### Train a vanilla ResNet-50 based Mask R-CNN.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } }"
```
The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
Note: The ResNet implementation under
[detection/](https://github.com/tensorflow/models/tree/master/official/legacy/detection)
is currently different from the one under
[classification/](https://github.com/tensorflow/models/tree/master/official/vision/image_classification),
so the checkpoints are not compatible.
We will unify the implementation soon.
### Train a SpineNet-49 based Mask R-CNN.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--model=mask_rcnn \
--params_override="{architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
```
### Train a custom Mask R-CNN using the config file.
First, create a YAML config file, e.g. *my_maskrcnn.yaml*.
This file specifies the parameters to be overridden,
which should at least include the following fields.
```YAML
# my_maskrcnn.yaml
train:
train_file_pattern: <path to the TFRecord training data>
eval:
eval_file_pattern: <path to the TFRecord validation data>
val_json_file: <path to the validation annotation JSON file>
```
Once the YAML config file is created, you can launch the training using the
following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
## Train Mask R-CNN on GPU
Training on GPU is similar to that on TPU. The major change is the strategy type
(use
"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
for multiple GPU and
"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
for single GPU).
Multi-GPUs example (assuming there are 8GPU connected to the host):
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=mirrored \
--num_gpus=8 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=one_device \
--num_gpus=1 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
An example with inline configuration (YAML or JSON format):
```
python3 ~/models/official/legacy/detection/main.py \
--model_dir=<model folder> \
--strategy_type=one_device \
--num_gpus=1 \
--mode=train \
--model=mask_rcnn \
--params_override="eval:
eval_file_pattern: <Eval TFRecord file pattern>
batch_size: 8
val_json_file: <COCO format groundtruth JSON file>
predict:
predict_batch_size: 8
architecture:
use_bfloat16: False
train:
total_steps: 1000
batch_size: 8
train_file_pattern: <Eval TFRecord file pattern>
use_tpu: False
"
```
## Train ShapeMask on TPU
### Train a ResNet-50 based ShapeMask.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
SHAPE_PRIOR_PATH="<path to shape priors>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=shapemask \
--params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } shapemask_head: {use_category_for_mask: true, shape_prior_path: ${SHAPE_PRIOR_PATH}} }"
```
The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
The shape priors can be downloaded [here]
(https://storage.googleapis.com/cloud-tpu-checkpoints/shapemask/kmeans_class_priors_91x20x32x32.npy)
### Train a custom ShapeMask using the config file.
First, create a YAML config file, e.g. *my_shapemask.yaml*.
This file specifies the parameters to be overridden:
```YAML
# my_shapemask.yaml
train:
train_file_pattern: <path to the TFRecord training data>
total_steps: <total steps to train>
batch_size: <training batch size>
eval:
eval_file_pattern: <path to the TFRecord validation data>
val_json_file: <path to the validation annotation JSON file>
batch_size: <evaluation batch size>
shapemask_head:
shape_prior_path: <path to shape priors>
```
Once the YAML config file is created, you can launch the training using the
following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=shapemask \
--config_file="my_shapemask.yaml"
```
## Train ShapeMask on GPU
Training on GPU is similar to that on TPU. The major change is the strategy type
(use
"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
for multiple GPU and
"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
for single GPU).
Multi-GPUs example (assuming there are 8GPU connected to the host):
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=mirrored \
--num_gpus=8 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=shapemask \
--config_file="my_shapemask.yaml"
```
A single GPU example
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/legacy/detection/main.py \
--strategy_type=one_device \
--num_gpus=1 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=shapemask \
--config_file="my_shapemask.yaml"
```
An example with inline configuration (YAML or JSON format):
```
python3 ~/models/official/legacy/detection/main.py \
--model_dir=<model folder> \
--strategy_type=one_device \
--num_gpus=1 \
--mode=train \
--model=shapemask \
--params_override="eval:
eval_file_pattern: <Eval TFRecord file pattern>
batch_size: 8
val_json_file: <COCO format groundtruth JSON file>
train:
total_steps: 1000
batch_size: 8
train_file_pattern: <Eval TFRecord file pattern>
use_tpu: False
"
```
### Run the evaluation (after training)
```
python3 /usr/share/models/official/legacy/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=eval \
--model=shapemask \
--params_override="{eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN}, eval_samples: 5000 } }"
```
`MODEL_DIR` needs to point to the trained path of ShapeMask model.
Change `strategy_type=mirrored` and `num_gpus=1` to run on a GPU.
Note: The JSON groundtruth file is useful for [COCO dataset](http://cocodataset.org/#home) and can be
downloaded from the [COCO website](http://cocodataset.org/#download). For custom dataset, it is unncessary because the groundtruth can be included in the TFRecord files.
## References
1. [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002).
Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, and Piotr Dollár. IEEE
International Conference on Computer Vision (ICCV), 2017.
......@@ -14,11 +14,11 @@
"""Factory to provide model configs."""
from official.legacy.detection.configs import maskrcnn_config
from official.legacy.detection.configs import olnmask_config
from official.legacy.detection.configs import retinanet_config
from official.legacy.detection.configs import shapemask_config
from official.modeling.hyperparams import params_dict
from official.vision.detection.configs import maskrcnn_config
from official.vision.detection.configs import olnmask_config
from official.vision.detection.configs import retinanet_config
from official.vision.detection.configs import shapemask_config
def config_generator(model):
......
......@@ -14,8 +14,8 @@
"""Config template to train Mask R-CNN."""
from official.legacy.detection.configs import base_config
from official.modeling.hyperparams import params_dict
from official.vision.detection.configs import base_config
# pylint: disable=line-too-long
......
......@@ -14,8 +14,8 @@
"""Config template to train Object Localization Network (OLN)."""
from official.legacy.detection.configs import base_config
from official.modeling.hyperparams import params_dict
from official.vision.detection.configs import base_config
# pylint: disable=line-too-long
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment