Commit c320b6ef authored by zhenyi

tf2 detection

parent 0fc002df
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Interface to run mask rcnn model in different distributed strategies."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import abc
import os
import six
import math
import multiprocessing
import tensorflow as tf
from mask_rcnn.utils.logging_formatter import logging
from mask_rcnn.utils.distributed_utils import MPI_is_distributed
from mask_rcnn.utils.distributed_utils import MPI_local_rank
from mask_rcnn.utils.distributed_utils import MPI_rank
from mask_rcnn.hooks.logging_hook import AutoLoggingHook
from mask_rcnn.utils.lazy_imports import LazyImport
hvd = LazyImport("horovod.tensorflow")
from tensorflow.core.protobuf import rewriter_config_pb2
from mask_rcnn import evaluation
from mask_rcnn.hyperparameters import params_io
from mask_rcnn.hooks import CheckpointSaverHook
from mask_rcnn.hooks import PretrainedWeightsLoadingHook
def get_training_hooks(mode, model_dir, checkpoint_path=None, skip_checkpoint_variables=None):
assert mode in ('train', 'eval')
training_hooks = [
AutoLoggingHook(
# log_every_n_steps=RUNNING_CONFIG.display_step,
log_every_n_steps=5 if "NGC_JOB_ID" not in os.environ else 100,
# warmup_steps=RUNNING_CONFIG.warmup_steps,
warmup_steps=100,
is_training=True
)
]
if not MPI_is_distributed() or MPI_rank() == 0:
training_hooks.append(PretrainedWeightsLoadingHook(
prefix="resnet50/",
checkpoint_path=checkpoint_path,
skip_variables_regex=skip_checkpoint_variables
))
if MPI_is_distributed() and mode == "train":
training_hooks.append(hvd.BroadcastGlobalVariablesHook(root_rank=0))
if not MPI_is_distributed() or MPI_rank() == 0:
training_hooks.append(CheckpointSaverHook(
checkpoint_dir=model_dir,
checkpoint_basename="model.ckpt"
))
return training_hooks
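# Illustrative usage (hedged sketch; the paths below are hypothetical):
#
#   hooks = get_training_hooks(
#       mode="train",
#       model_dir="/results/mrcnn",                # hypothetical model dir
#       checkpoint_path="/weights/resnet50.ckpt",  # hypothetical backbone ckpt
#       skip_checkpoint_variables=None
#   )
#   # The returned list is then passed to `estimator.train(..., hooks=hooks)`,
#   # as done by the executers below.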
@six.add_metaclass(abc.ABCMeta)
class BaseExecuter(object):
"""Interface to run Mask RCNN model in TPUs/GPUs.
Arguments:
flags: FLAGS object passed from the user.
model_config: Model configuration needed to run distribution strategy.
model_fn: Model function to be passed to Estimator.
"""
def __init__(self, runtime_config, model_fn):
self._runtime_config = runtime_config
self._model_fn = model_fn
os.environ['CUDA_CACHE_DISABLE'] = '0'
os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
os.environ['TF_ADJUST_HUE_FUSED'] = '1'
os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
@staticmethod
def _get_session_config(mode, use_xla, use_amp, use_tf_distributed=False, allow_xla_at_inference=False):
assert mode in ('train', 'eval')
rewrite_options = rewriter_config_pb2.RewriterConfig(
# arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
# arithmetic_optimization=rewriter_config_pb2.RewriterConfig.ON,
# constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
# constant_folding=rewriter_config_pb2.RewriterConfig.ON, # TO TEST
# debug_stripper=rewriter_config_pb2.RewriterConfig.OFF,
# debug_stripper=rewriter_config_pb2.RewriterConfig.ON, # TO TEST
# dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
# dependency_optimization=rewriter_config_pb2.RewriterConfig.ON, # TO TEST
# disable_model_pruning=False, # INCOMPATIBLE with AMP
# function_optimization=True,
# implementation_selector=True,
# loop_optimization=rewriter_config_pb2.RewriterConfig.OFF,
# loop_optimization=rewriter_config_pb2.RewriterConfig.ON, # TO TEST
# The default setting (SCHEDULING and SWAPPING HEURISTICS only)
# memory_optimization=rewriter_config_pb2.RewriterConfig.DEFAULT_MEM_OPT,
# Disabled in the meta-optimizer.
# memory_optimization=rewriter_config_pb2.RewriterConfig.NO_MEM_OPT,
# Driven by manual op-level annotations.
# memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL,
# Swapping heuristic will move a tensor from the GPU to the CPU and move it
# back when needed to reduce peak memory usage.
# memory_optimization=rewriter_config_pb2.RewriterConfig.SWAPPING_HEURISTICS,
# Recomputation heuristics will recompute ops (such as Relu activation)
# during backprop instead of storing them, reducing peak memory usage.
# memory_optimization=rewriter_config_pb2.RewriterConfig.RECOMPUTATION_HEURISTICS,
# Scheduling will split big ops such as AddN and try to enforce a schedule of
# the new computations that decreases peak memory usage.
# memory_optimization=rewriter_config_pb2.RewriterConfig.SCHEDULING_HEURISTICS,
# Use any combination of swapping and recomputation heuristics.
# memory_optimization=rewriter_config_pb2.RewriterConfig.HEURISTICS,
meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.TWO,
# meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE,
# meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.DEFAULT_NUM_ITERS,
# pin_to_host_optimization=rewriter_config_pb2.RewriterConfig.OFF,
# pin_to_host_optimization=rewriter_config_pb2.RewriterConfig.ON, # TO TEST
#
# remapping=rewriter_config_pb2.RewriterConfig.OFF,
# remapping=rewriter_config_pb2.RewriterConfig.ON, # TO TEST
# scoped_allocator_optimization=rewriter_config_pb2.RewriterConfig.OFF,
# scoped_allocator_optimization=rewriter_config_pb2.RewriterConfig.ON, # TO TEST
# shape_optimization=rewriter_config_pb2.RewriterConfig.OFF,
# shape_optimization=rewriter_config_pb2.RewriterConfig.ON, # TO TEST
)
if use_amp:
logging.info("[%s] AMP is activated - Experiment Feature" % mode)
rewrite_options.auto_mixed_precision = True
config = tf.compat.v1.ConfigProto(
allow_soft_placement=True,
log_device_placement=False,
graph_options=tf.compat.v1.GraphOptions(
rewrite_options=rewrite_options,
# infer_shapes=True # Heavily drops throughput by 30%
)
)
if use_tf_distributed:
config.gpu_options.force_gpu_compatible = False
else:
config.gpu_options.force_gpu_compatible = True # Force pinned memory
if MPI_is_distributed():
config.gpu_options.visible_device_list = str(MPI_local_rank())
if use_xla and (mode == "train" or allow_xla_at_inference):
logging.info("[%s] XLA is activated - Experiment Feature" % mode)
config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
# config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_2
if mode == 'train':
config.intra_op_parallelism_threads = 1 # Avoid pool of Eigen threads
if MPI_is_distributed():
config.inter_op_parallelism_threads = max(2, multiprocessing.cpu_count() // hvd.local_size())
elif not use_tf_distributed:
config.inter_op_parallelism_threads = 4
return config
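# Illustrative usage (hedged): with AMP and XLA requested in 'train' mode,
# the returned proto enables the auto_mixed_precision grappler pass and
# level-1 global JIT:
#
#   config = BaseExecuter._get_session_config('train', use_xla=True, use_amp=True)
#   # config.graph_options.rewrite_options.auto_mixed_precision -> ON
#   # config.graph_options.optimizer_options.global_jit_level -> ON_1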
@abc.abstractmethod
def build_strategy_configuration(self, mode):
"""Builds run configuration for distributed train/eval.
Returns:
RunConfig with distribution strategy configurations
to pass to the constructor of TPUEstimator/Estimator.
"""
raise NotImplementedError('Must be implemented in subclass')
def build_model_parameters(self, mode):
"""Builds model parameter."""
assert mode in ('train', 'eval')
batch_size = self._runtime_config.train_batch_size if mode == 'train' else self._runtime_config.eval_batch_size
params = dict(
self._runtime_config.values(),
mode=mode,
batch_size=batch_size,
model_dir=self._runtime_config.model_dir,
)
if mode == 'eval':
params = dict(
params,
augment_input_data=False,
)
return params
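# Example (hedged): for mode='eval' the returned dict is the runtime config
# values augmented with e.g.
#   {'mode': 'eval', 'batch_size': <eval_batch_size>,
#    'model_dir': <model_dir>, 'augment_input_data': False}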
def build_mask_rcnn_estimator(self, params, run_config, mode):
"""Creates TPUEstimator/Estimator instance.
Arguments:
params: A dictionary to pass to Estimator `model_fn`.
run_config: RunConfig instance specifying distribution strategy
configurations.
mode: Mode -- one of 'train' or 'eval'.
Returns:
A tf.estimator.Estimator instance.
"""
assert mode in ('train', 'eval')
return tf.estimator.Estimator(
model_fn=self._model_fn,
model_dir=self._runtime_config.model_dir,
config=run_config,
params=params
)
def _save_config(self):
"""Save parameters to config files if model_dir is defined."""
model_dir = self._runtime_config.model_dir
if model_dir is not None:
if not tf.io.gfile.exists(model_dir):
tf.io.gfile.makedirs(model_dir)
params_io.save_hparams_to_yaml(self._runtime_config, model_dir + '/params.yaml')
def _write_summary(self, summary_dir, eval_results, predictions, current_step):
if not self._runtime_config.visualize_images_summary:
predictions = None
evaluation.write_summary(eval_results, summary_dir, current_step, predictions=predictions)
def train(self, train_input_fn, run_eval_after_train=False, eval_input_fn=None):
"""Run distributed training on Mask RCNN model."""
self._save_config()
train_run_config = self.build_strategy_configuration('train')
train_params = self.build_model_parameters('train')
train_estimator = self.build_mask_rcnn_estimator(train_params, train_run_config, 'train')
train_estimator.train(
input_fn=train_input_fn,
max_steps=self._runtime_config.total_steps,
hooks=get_training_hooks(
mode="train",
model_dir=self._runtime_config.model_dir,
checkpoint_path=self._runtime_config.checkpoint,
skip_checkpoint_variables=self._runtime_config.skip_checkpoint_variables
)
)
if not run_eval_after_train:
return None
if eval_input_fn is None:
raise ValueError('Eval input_fn must be passed to conduct evaluation after training.')
eval_run_config = self.build_strategy_configuration('eval')
eval_params = self.build_model_parameters('eval')
eval_estimator = self.build_mask_rcnn_estimator(eval_params, eval_run_config, 'eval')
last_ckpt = tf.train.latest_checkpoint(self._runtime_config.model_dir, latest_filename=None)
logging.info("Restoring parameters from %s\n" % last_ckpt)
eval_results, predictions = evaluation.evaluate(
eval_estimator,
eval_input_fn,
self._runtime_config.eval_samples,
self._runtime_config.eval_batch_size,
self._runtime_config.include_mask,
self._runtime_config.val_json_file,
report_frequency=self._runtime_config.report_frequency
)
output_dir = os.path.join(self._runtime_config.model_dir, 'eval')
tf.io.gfile.makedirs(output_dir)
# Summary writer writes out eval metrics.
self._write_summary(output_dir, eval_results, predictions, self._runtime_config.total_steps)
return eval_results
def train_and_eval(self, train_input_fn, eval_input_fn):
"""Run distributed train and eval on Mask RCNN model."""
self._save_config()
output_dir = os.path.join(self._runtime_config.model_dir, 'eval')
tf.io.gfile.makedirs(output_dir)
train_run_config = self.build_strategy_configuration('train')
train_params = self.build_model_parameters('train')
train_estimator = self.build_mask_rcnn_estimator(train_params, train_run_config, 'train')
eval_estimator = None
eval_results = None
num_cycles = math.ceil(self._runtime_config.total_steps / self._runtime_config.num_steps_per_eval)
training_hooks = get_training_hooks(
mode="train",
model_dir=self._runtime_config.model_dir,
checkpoint_path=self._runtime_config.checkpoint,
skip_checkpoint_variables=self._runtime_config.skip_checkpoint_variables
)
for cycle in range(1, num_cycles + 1):
if not MPI_is_distributed() or MPI_rank() == 0:
print() # Visual Spacing
logging.info("=================================")
logging.info(' Start training cycle %02d' % cycle)
logging.info("=================================\n")
max_cycle_step = min(int(cycle * self._runtime_config.num_steps_per_eval), self._runtime_config.total_steps)
PROFILER_ENABLED = False
if (not MPI_is_distributed() or MPI_rank() == 0) and PROFILER_ENABLED:
profiler_context_manager = tf.contrib.tfprof.ProfileContext
else:
from contextlib import suppress
profiler_context_manager = lambda *args, **kwargs: suppress() # No-Op context manager
with profiler_context_manager(
'/workspace/profiling/',
trace_steps=range(100, 200, 3),
dump_steps=[200]
) as pctx:
if (not MPI_is_distributed() or MPI_rank() == 0) and PROFILER_ENABLED:
opts = tf.compat.v1.profiler.ProfileOptionBuilder.time_and_memory()
pctx.add_auto_profiling('op', opts, [150, 200])
train_estimator.train(
input_fn=train_input_fn,
max_steps=max_cycle_step,
hooks=training_hooks,
)
if not MPI_is_distributed() or MPI_rank() == 0:
print() # Visual Spacing
logging.info("=================================")
logging.info(' Start evaluation cycle %02d' % cycle)
logging.info("=================================\n")
if eval_estimator is None:
eval_run_config = self.build_strategy_configuration('eval')
eval_params = self.build_model_parameters('eval')
eval_estimator = self.build_mask_rcnn_estimator(eval_params, eval_run_config, 'eval')
last_ckpt = tf.train.latest_checkpoint(self._runtime_config.model_dir, latest_filename=None)
logging.info("Restoring parameters from %s\n" % last_ckpt)
eval_results, predictions = evaluation.evaluate(
eval_estimator,
eval_input_fn,
self._runtime_config.eval_samples,
self._runtime_config.eval_batch_size,
self._runtime_config.include_mask,
self._runtime_config.val_json_file,
report_frequency=self._runtime_config.report_frequency
)
self._write_summary(output_dir, eval_results, predictions, max_cycle_step)
if MPI_is_distributed():
from mpi4py import MPI
MPI.COMM_WORLD.Barrier() # Waiting for all MPI processes to sync
return eval_results
def eval(self, eval_input_fn):
"""Run distributed eval on Mask RCNN model."""
output_dir = os.path.join(self._runtime_config.model_dir, 'eval')
tf.io.gfile.makedirs(output_dir)
# Summary writer writes out eval metrics.
run_config = self.build_strategy_configuration('eval')
eval_params = self.build_model_parameters('eval')
eval_estimator = self.build_mask_rcnn_estimator(eval_params, run_config, 'eval')
logging.info('Starting to evaluate.')
last_ckpt = tf.train.latest_checkpoint(self._runtime_config.model_dir, latest_filename=None)
if last_ckpt is not None:
logging.info("Restoring parameters from %s\n" % last_ckpt)
current_step = int(os.path.basename(last_ckpt).split('-')[1])
else:
logging.warning(
"Could not find trained model in model_dir: `%s`, running initialization to predict\n" %
self._runtime_config.model_dir
)
current_step = 0
eval_results, predictions = evaluation.evaluate(
eval_estimator,
eval_input_fn,
self._runtime_config.eval_samples,
self._runtime_config.eval_batch_size,
self._runtime_config.include_mask,
self._runtime_config.val_json_file
)
self._write_summary(output_dir, eval_results, predictions, current_step)
if current_step >= self._runtime_config.total_steps:
logging.info('Evaluation finished after training step %d' % current_step)
return eval_results
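# Illustrative driver code (hedged sketch; `runtime_config`, `model_fn` and
# the input functions are assumed to be built by the caller):
#
#   executer = EstimatorExecuter(runtime_config, model_fn)
#   if runtime_config.mode == 'train_and_eval':
#       executer.train_and_eval(train_input_fn, eval_input_fn)
#   elif runtime_config.mode == 'train':
#       executer.train(train_input_fn, run_eval_after_train=True,
#                      eval_input_fn=eval_input_fn)
#   else:
#       executer.eval(eval_input_fn)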
class EstimatorExecuter(BaseExecuter):
"""Interface that runs Mask RCNN model using TPUEstimator."""
def __init__(self, runtime_config, model_fn):
super(EstimatorExecuter, self).__init__(runtime_config, model_fn)
if MPI_is_distributed():
os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
os.environ['HOROVOD_NUM_NCCL_STREAMS'] = '1'
# os.environ['HOROVOD_AUTOTUNE'] = '2'
hvd.init()
logging.info("Horovod successfully initialized ...")
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_GPU_THREAD_COUNT'] = '1' if not MPI_is_distributed() else str(hvd.size())
os.environ['TF_SYNC_ON_FINISH'] = '0'
def build_strategy_configuration(self, mode):
"""Retrieves model configuration for running TF Estimator."""
run_config = tf.estimator.RunConfig(
tf_random_seed=(
self._runtime_config.seed
if not MPI_is_distributed() or self._runtime_config.seed is None else
self._runtime_config.seed + MPI_rank()
),
model_dir=self._runtime_config.model_dir,
save_summary_steps=None, # disabled
save_checkpoints_steps=None, # disabled
save_checkpoints_secs=None, # disabled
keep_checkpoint_max=20, # keep the 20 most recent checkpoints
keep_checkpoint_every_n_hours=None, # disabled
log_step_count_steps=None, # disabled
session_config=self._get_session_config(
mode=mode,
use_xla=self._runtime_config.xla,
use_amp=self._runtime_config.amp,
use_tf_distributed=False,
allow_xla_at_inference=self._runtime_config.allow_xla_at_inference # TODO: Remove when XLA at inference fixed
),
protocol=None,
device_fn=None,
train_distribute=None,
eval_distribute=None,
experimental_distribute=None
)
return run_config
class TFDistributedExecuter(BaseExecuter):
"""Interface that runs Mask RCNN model using MultiWorkerMirroredStrategy."""
@staticmethod
def is_eval_task():
return tf.distribute.cluster_resolver.TFConfigClusterResolver().task_type == 'evaluator'
def build_strategy_configuration(self, mode):
"""Retrieves model configuration for MultiWorkerMirroredStrategy."""
distributed_strategy = tf.distribute.MirroredStrategy()
# distributed_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
# tf.distribute.experimental.CollectiveCommunication.NCCL
# )
run_config = tf.estimator.RunConfig(
tf_random_seed=self._runtime_config.seed,
model_dir=self._runtime_config.model_dir,
save_summary_steps=None, # disabled
save_checkpoints_steps=None, # disabled
save_checkpoints_secs=None, # disabled
keep_checkpoint_max=20, # keep the 20 most recent checkpoints
keep_checkpoint_every_n_hours=None, # disabled
log_step_count_steps=None, # disabled
session_config=self._get_session_config(
mode=mode,
use_xla=self._runtime_config.xla,
use_amp=self._runtime_config.amp,
use_tf_distributed=True,
# TODO: Remove when XLA at inference fixed
allow_xla_at_inference=self._runtime_config.allow_xla_at_inference
),
protocol=None,
device_fn=None,
train_distribute=distributed_strategy if mode == "train" else None,
eval_distribute=None,
experimental_distribute=None
)
return run_config
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions to perform COCO evaluation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import operator
import pprint
import six
import time
import io
from PIL import Image
import numpy as np
import tensorflow as tf
from mask_rcnn.utils.logging_formatter import logging
from mask_rcnn import coco_metric
from mask_rcnn.utils import coco_utils
from mask_rcnn.object_detection import visualization_utils
import dllogger
from dllogger import Verbosity
def process_prediction_for_eval(prediction):
"""Process the model prediction for COCO eval."""
image_info = prediction['image_info']
box_coordinates = prediction['detection_boxes']
processed_box_coordinates = np.zeros_like(box_coordinates)
for image_id in range(box_coordinates.shape[0]):
scale = image_info[image_id][2]
for box_id in range(box_coordinates.shape[1]):
# Map [y1, x1, y2, x2] -> [x1, y1, w, h] and multiply detections
# by image scale.
y1, x1, y2, x2 = box_coordinates[image_id, box_id, :]
new_box = scale * np.array([x1, y1, x2 - x1, y2 - y1])
processed_box_coordinates[image_id, box_id, :] = new_box
prediction['detection_boxes'] = processed_box_coordinates
return prediction
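# Worked example: with image scale 2.0 and a detection box
# [y1, x1, y2, x2] = [10., 20., 50., 80.], the mapping above yields
# [x1, y1, w, h] * scale = [20., 10., 60., 40.] * 2.0 = [40., 20., 120., 80.],
# i.e. COCO-format coordinates in the original image resolution.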
def compute_coco_eval_metric(predictor,
num_batches=-1,
include_mask=True,
annotation_json_file="",
eval_batch_size=-1,
report_frequency=None):
"""Compute COCO eval metric given a prediction generator.
Args:
predictor: a generator that iteratively pops a dictionary of predictions
with the format compatible with COCO eval tool.
num_batches: the number of batches to be aggregated in eval. This is how
many times that the predictor gets pulled.
include_mask: a boolean that indicates whether we include the mask eval.
annotation_json_file: the annotation json file of the eval dataset.
Returns:
eval_results: the aggregated COCO metric eval results.
"""
if annotation_json_file == "":
annotation_json_file = None
use_groundtruth_from_json = (annotation_json_file is not None)
predictions = dict()
batch_idx = 0
if use_groundtruth_from_json:
eval_metric = coco_metric.EvaluationMetric(annotation_json_file, include_mask=include_mask)
else:
eval_metric = coco_metric.EvaluationMetric(filename=None, include_mask=include_mask)
def evaluation_preds(preds):
# Essential to avoid modifying the source dict
_preds = copy.deepcopy(preds)
for k, v in six.iteritems(_preds):
_preds[k] = np.concatenate(_preds[k], axis=0)
if 'orig_images' in _preds and _preds['orig_images'].shape[0] > 10:
# Only samples a few images for visualization.
_preds['orig_images'] = _preds['orig_images'][:10]
if use_groundtruth_from_json:
eval_results = eval_metric.predict_metric_fn(_preds)
else:
images, annotations = coco_utils.extract_coco_groundtruth(_preds, include_mask)
coco_dataset = coco_utils.create_coco_format_dataset(images, annotations)
eval_results = eval_metric.predict_metric_fn(_preds, groundtruth_data=coco_dataset)
return eval_results
# Take into account cuDNN & Tensorflow warmup
# Drop N first steps for avg throughput calculation
BURNIN_STEPS = 100
model_throughput_list = list()
inference_time_list = list()
while num_batches < 0 or batch_idx < num_batches:
try:
step_t0 = time.time()
step_predictions = six.next(predictor)
batch_time = time.time() - step_t0
throughput = eval_batch_size / batch_time
model_throughput_list.append(throughput)
inference_time_list.append(batch_time)
logging.info('Running inference on batch %03d/%03d... - Step Time: %.4fs - Throughput: %.1f imgs/s' % (
batch_idx + 1,
num_batches,
batch_time,
throughput
))
except StopIteration:
logging.info('Got StopIteration at batch %d.' % (batch_idx + 1))
break
step_predictions = process_prediction_for_eval(step_predictions)
for k, v in step_predictions.items():
if k not in predictions:
predictions[k] = [v]
else:
predictions[k].append(v)
batch_idx = batch_idx + 1
# If report_frequency is set, report intermediate metrics every
# `report_frequency` batches, i.e. every eval_batch_size * report_frequency samples.
if report_frequency and batch_idx % report_frequency == 0:
eval_results = evaluation_preds(preds=predictions)
logging.info('Eval results: %s' % pprint.pformat(eval_results, indent=4))
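# With the step times sorted in ascending order, the max of the first p%
# of entries equals the p-th percentile latency (computed below).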
inference_time_list.sort()
eval_results = evaluation_preds(preds=predictions)
average_time = np.mean(inference_time_list)
latency_50 = max(inference_time_list[:int(len(inference_time_list) * 0.5)])
latency_90 = max(inference_time_list[:int(len(inference_time_list) * 0.90)])
latency_95 = max(inference_time_list[:int(len(inference_time_list) * 0.95)])
latency_99 = max(inference_time_list[:int(len(inference_time_list) * 0.99)])
latency_100 = max(inference_time_list)
print() # Visual Spacing
logging.info("# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #")
logging.info(" Evaluation Performance Summary ")
logging.info("# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #")
# Sum per-step inference times (seconds) to get the total processing time.
total_processing_hours, rem = divmod(np.sum(inference_time_list), 3600)
total_processing_minutes, total_processing_seconds = divmod(rem, 60)
if len(model_throughput_list) > BURNIN_STEPS:
# Take into account cuDNN & Tensorflow warmup
# Drop N first steps for avg throughput calculation
# Also drop last step which may have a different batch size
avg_throughput = np.mean(model_throughput_list[BURNIN_STEPS:-1])
else:
avg_throughput = -1.
print() # Visual Spacing
logging.info("Average throughput: {throughput:.1f} samples/sec".format(throughput=avg_throughput))
logging.info("Inference Latency Average (s) = {avg:.4f}".format(avg=average_time))
logging.info("Inference Latency 50% (s) = {cf_50:.4f}".format(cf_50=latency_50))
logging.info("Inference Latency 90% (s) = {cf_90:.4f}".format(cf_90=latency_90))
logging.info("Inference Latency 95% (s) = {cf_95:.4f}".format(cf_95=latency_95))
logging.info("Inference Latency 99% (s) = {cf_99:.4f}".format(cf_99=latency_99))
logging.info("Inference Latency 100% (s) = {cf_100:.4f}".format(cf_100=latency_100))
logging.info("Total processed steps: {total_steps}".format(total_steps=len(model_throughput_list)))
logging.info(
"Total processing time: {hours}h {minutes:02d}m {seconds:02d}s".format(
hours=total_processing_hours,
minutes=int(total_processing_minutes),
seconds=int(total_processing_seconds)
)
)
dllogger.log(step=(), data={"avg_inference_throughput": avg_throughput}, verbosity=Verbosity.DEFAULT)
avg_inference_time = float(total_processing_hours * 3600 + int(total_processing_minutes) * 60 +
int(total_processing_seconds))
dllogger.log(step=(), data={"avg_inference_time": avg_inference_time}, verbosity=Verbosity.DEFAULT)
logging.info("==================== Metrics ====================")
# logging.info('Eval Epoch results: %s' % pprint.pformat(eval_results, indent=4))
for key, value in sorted(eval_results.items(), key=operator.itemgetter(0)):
logging.info("%s: %.9f" % (key, value))
print() # Visual Spacing
return eval_results, predictions
def evaluate(eval_estimator,
input_fn,
num_eval_samples,
eval_batch_size,
include_mask=True,
validation_json_file="",
report_frequency=None):
"""Runs COCO evaluation once."""
predictor = eval_estimator.predict(
input_fn=input_fn,
yield_single_examples=False
)
# Every predictor.next() gets a batch of prediction (a dictionary).
num_eval_times = num_eval_samples // eval_batch_size
assert num_eval_times > 0, 'num_eval_samples must be >= eval_batch_size!'
eval_results, predictions = compute_coco_eval_metric(
predictor,
num_eval_times,
include_mask,
validation_json_file,
eval_batch_size=eval_batch_size,
report_frequency=report_frequency
)
return eval_results, predictions
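# Illustrative usage (hedged sketch; the estimator, input_fn and annotation
# path are assumed to be provided by the caller):
#
#   eval_results, predictions = evaluate(
#       eval_estimator,
#       input_fn=eval_input_fn,
#       num_eval_samples=5000,
#       eval_batch_size=8,
#       include_mask=True,
#       validation_json_file="/data/annotations/instances_val2017.json"  # hypothetical
#   )
#   # eval_results maps COCO metric names (e.g. 'AP', 'AP50') to floats.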
def write_summary(eval_results, summary_dir, current_step, predictions=None):
"""Write out eval results for the checkpoint."""
with tf.Graph().as_default():
summaries = []
# Summary writer writes out eval metrics.
try:
# Tensorflow 1.x
summary_writer = tf.compat.v1.summary.FileWriter(summary_dir)
except AttributeError:
# Tensorflow 2.x
summary_writer = tf.summary.create_file_writer(summary_dir)
# The TF2 writer must be installed as the default for the tf.summary calls
# below; the TF1 FileWriter has no such method, so guard the call.
if hasattr(summary_writer, "set_as_default"):
summary_writer.set_as_default()
eval_results_dict = {}
for metric in eval_results:
try:
summaries.append(tf.compat.v1.Summary.Value(tag=metric, simple_value=eval_results[metric]))
eval_results_dict[metric] = float(eval_results[metric])
except AttributeError:
tf.summary.scalar(name=metric, data=eval_results[metric], step=current_step)
eval_results_dict[metric] = float(eval_results[metric])
dllogger.log(step=(), data=eval_results_dict, verbosity=Verbosity.DEFAULT)
if isinstance(predictions, dict) and predictions:
images_summary = get_image_summary(predictions, current_step)
try:
summaries += images_summary
except TypeError:
summaries.append(images_summary)
try:
# tf_summaries = tf.compat.v1.Summary(value=list(summaries))
tf_summaries = tf.compat.v1.Summary(value=summaries)
summary_writer.add_summary(tf_summaries, current_step)
summary_writer.flush()
except AttributeError:
tf.summary.flush(summary_writer)
def generate_image_preview(image, boxes, scores, classes, gt_boxes=None, segmentations=None):
"""Creates an image summary given predictions."""
max_boxes_to_draw = 100
min_score_thresh = 0.1
# Visualizes the predictions.
image_with_detections = visualization_utils.visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes=classes,
scores=scores,
category_index={},
instance_masks=segmentations,
use_normalized_coordinates=False,
max_boxes_to_draw=max_boxes_to_draw,
min_score_thresh=min_score_thresh,
agnostic_mode=False
)
if gt_boxes is not None:
# Visualizes the groundtruth boxes. They are in black by default.
image_with_detections = visualization_utils.visualize_boxes_and_labels_on_image_array(
image_with_detections,
gt_boxes,
classes=None,
scores=None,
category_index={},
use_normalized_coordinates=False,
max_boxes_to_draw=max_boxes_to_draw,
agnostic_mode=True
)
return image_with_detections
def generate_image_buffer(input_image):
buf = io.BytesIO()
# NumPy image arrays are (height, width); PIL sizes are (width, height).
h, w = input_image.shape[:2]
ratio = 1024 / h
new_size = [int(h * ratio), int(w * ratio)]
image = Image.fromarray(input_image.astype(np.uint8))
image.thumbnail(new_size)  # Preserves aspect ratio while fitting within new_size
image.save(buf, format='png')
return buf.getvalue()
def get_image_summary(predictions, current_step, max_images=10):
"""Write out image and prediction for summary."""
if 'orig_images' not in predictions:
logging.info('Missing orig_images in predictions: %s', predictions.keys())
return
max_images = min(
len(predictions['orig_images']) * predictions['orig_images'][0].shape[0],
max_images
)
_detection_boxes = np.concatenate(predictions['detection_boxes'], axis=0)
_detection_scores = np.concatenate(predictions['detection_scores'], axis=0)
_detection_classes = np.concatenate(predictions['detection_classes'], axis=0)
_image_info = np.concatenate(predictions['image_info'], axis=0)
_num_detections = np.concatenate(predictions['num_detections'], axis=0)
_orig_images = np.concatenate(predictions['orig_images'], axis=0)
if 'detection_masks' in predictions:
_detection_masks = np.concatenate(predictions['detection_masks'], axis=0)
else:
_detection_masks = None
if 'groundtruth_boxes' in predictions:
_groundtruth_boxes = np.concatenate(predictions['groundtruth_boxes'], axis=0)
else:
_groundtruth_boxes = None
_orig_images = _orig_images * 255
_orig_images = _orig_images.astype(np.uint8)
image_previews = []
for i in range(max_images):
num_detections = min(len(_detection_boxes[i]), int(_num_detections[i]))
detection_boxes = _detection_boxes[i][:num_detections]
detection_scores = _detection_scores[i][:num_detections]
detection_classes = _detection_classes[i][:num_detections]
image = _orig_images[i]
image_height = image.shape[0]
image_width = image.shape[1]
# Rescale the box to fit the visualization image.
h, w = _image_info[i][3:5]
detection_boxes = detection_boxes / np.array([w, h, w, h])
detection_boxes = detection_boxes * np.array([image_width, image_height, image_width, image_height])
if _groundtruth_boxes is not None:
gt_boxes = _groundtruth_boxes[i]
gt_boxes = gt_boxes * np.array([image_height, image_width, image_height, image_width])
else:
gt_boxes = None
if _detection_masks is not None:
instance_masks = _detection_masks[i][0:num_detections]
segmentations = coco_metric.generate_segmentation_from_masks(
instance_masks,
detection_boxes,
image_height,
image_width
)
else:
segmentations = None
# process_prediction_for_eval() converted boxes to [x, y, w, h]. Convert
# back to corner format and reorder to the [ymin, xmin, ymax, xmax]
# layout expected by the visualization utils.
xmin, ymin, w, h = np.split(detection_boxes, 4, axis=-1)
xmax = xmin + w
ymax = ymin + h
boxes_to_visualize = np.concatenate([ymin, xmin, ymax, xmax], axis=-1)
image_preview = generate_image_preview(
image,
boxes=boxes_to_visualize,
scores=detection_scores,
classes=detection_classes.astype(np.int32),
gt_boxes=gt_boxes,
segmentations=segmentations
)
image_previews.append(image_preview)
try:
summaries = []
for i, image_preview in enumerate(image_previews):
image_buffer = generate_image_buffer(image_preview)
image_summary = tf.compat.v1.Summary.Image(encoded_image_string=image_buffer)
image_value = tf.compat.v1.Summary.Value(tag='%d_input' % i, image=image_summary)
summaries.append(image_value)
except AttributeError:
image_previews = np.array(image_previews)
summaries = tf.summary.image(
name='image_summary',
data=image_previews,
step=current_step,
max_outputs=max_images
)
return summaries
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from mask_rcnn.hooks.ckpt_hook import CheckpointSaverHook
from mask_rcnn.hooks.pretrained_restore_hook import PretrainedWeightsLoadingHook
__all__ = [
"CheckpointSaverHook",
"PretrainedWeightsLoadingHook",
]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tensorflow as tf
from mask_rcnn.utils.logging_formatter import logging
__all__ = ["CheckpointSaverHook"]
class CheckpointSaverHook(tf.estimator.SessionRunHook):
"""Saves checkpoints every N steps or seconds."""
def __init__(self, checkpoint_dir, checkpoint_basename="model.ckpt"):
"""Initializes a `CheckpointSaverHook`.
Args:
checkpoint_dir: `str`, base directory for the checkpoint files.
checkpoint_basename: `str`, base name for the checkpoint files.
"""
logging.info("Create CheckpointSaverHook.")
self._saver = None
self._checkpoint_dir = checkpoint_dir
self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
self._steps_per_run = 1
self._is_initialized = False
self._global_step_tensor = None
self._summary_writer = None
def _set_steps_per_run(self, steps_per_run):
self._steps_per_run = steps_per_run
def begin(self):
self._global_step_tensor = tf.compat.v1.train.get_or_create_global_step()
self._saver = tf.compat.v1.train.Saver()
from tensorflow.python.training import summary_io
self._summary_writer = summary_io.SummaryWriterCache.get(self._checkpoint_dir)
if self._global_step_tensor is None:
raise RuntimeError(
"Global step should be created to use CheckpointSaverHook."
)
def after_create_session(self, session, coord):
if not self._is_initialized:
global_step = session.run(self._global_step_tensor)
from tensorflow.python.keras.backend import get_graph
default_graph = get_graph()
# Write the graph and saver_def on the first after_create_session() call.
# This cannot be done in begin(), since other hooks may still modify the
# graph there; the graph is finalized after all begin() calls.
tf.io.write_graph(
default_graph.as_graph_def(add_shapes=True),
self._checkpoint_dir,
"graph.pbtxt"
)
saver_def = self._saver.saver_def
from tensorflow.python.framework import meta_graph
meta_graph_def = meta_graph.create_meta_graph_def(
graph_def=default_graph.as_graph_def(add_shapes=True),
saver_def=saver_def
)
self._summary_writer.add_graph(default_graph)
self._summary_writer.add_meta_graph(meta_graph_def)
# The checkpoint saved here is the state at step "global_step".
self._save(session, global_step)
self._is_initialized = True
def end(self, session):
last_step = session.run(self._global_step_tensor)
self._save(session, last_step)
def _save(self, session, step):
"""Saves the latest checkpoint, returns should_stop."""
logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
self._saver.save(session, self._save_path, global_step=step)
self._summary_writer.add_session_log(
tf.compat.v1.SessionLog(status=tf.compat.v1.SessionLog.CHECKPOINT, checkpoint_path=self._save_path),
step
)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import operator
import time
import numpy as np
import tensorflow as tf
from distutils.version import LooseVersion
from mask_rcnn.utils.logging_formatter import logging
from mask_rcnn.utils import meters
from mask_rcnn.utils.decorators import atexit_hook
from mask_rcnn.utils.distributed_utils import MPI_is_distributed
from mask_rcnn.utils.distributed_utils import MPI_rank_and_size
from mask_rcnn.utils.distributed_utils import MPI_size
from mask_rcnn.utils.logging_backend import LoggingBackend
from mask_rcnn.utils.logging_backend import RuntimeMode
from mask_rcnn.utils.metric_tracking import clear_registered_metrics
from mask_rcnn.utils.metric_tracking import TF_METRICS
from mask_rcnn.utils.metric_tracking import KERAS_MODELS
from mask_rcnn.utils.lazy_imports import LazyImport
hvd = LazyImport("horovod.tensorflow")
__all__ = ["AutoLoggingHook"]
@atexit_hook
class _AutoLoggingHook(tf.estimator.SessionRunHook):
def __init__(self, log_every_n_steps=200, warmup_steps=500, is_training=True):
"""
AutoLogging Hook for Tensorflow
:param log_every_n_steps: log will be output on the console every N steps
:param warmup_steps: integers, numbers of steps considered as warmup
:param is_training: boolean
"""
self._logging_proxy = LoggingBackend()
self._initialized = False
self._metrics = copy.copy(TF_METRICS)
self._batch_size_tensor = None
self._AMP_steps_since_last_loss_scale = None
self._AMP_loss_scale_tensor = None
self._current_step = None
self._amp_steps_non_skipped = None
self._warmup_steps = warmup_steps
self._log_every_n_steps = log_every_n_steps
self._step_t0 = None
self._session_t0 = None
self._session_run_times = list()
self._global_step_tensor = None
self._is_training = is_training
self._runtime_mode = RuntimeMode.TRAIN if is_training else RuntimeMode.VALIDATION
self._model_throughput = meters.MovingAverageMeter(window_size=1000)
self._model_stats = None
self._n_gpus = None
def __atexit__(self):
if self._initialized:
total_processing_time = int(np.sum(self._session_run_times))
try:
avg_throughput = self._model_throughput.read()
except ValueError:
avg_throughput = -1
self._logging_proxy.log_summary(
is_train=self._is_training,
total_steps=self._current_step,
total_processing_time=total_processing_time,
avg_throughput=avg_throughput
)
metric_data = dict()
for key, value in self._metrics.items():
try:
metric_data[key] = value["aggregator"].read()
except ValueError:
pass
self._logging_proxy.log_final_metrics(metric_data=metric_data, runtime_mode=self._runtime_mode)
def begin(self):
"""Called once before using the session.
When called, the default graph is the one that will be launched in the
session. The hook can modify the graph by adding new operations to it.
After the `begin()` call the graph will be finalized and the other callbacks
can not modify the graph anymore. Second call of `begin()` on the same
graph, should not change the graph.
"""
from tensorflow.python.keras.backend import get_graph
_graph = get_graph()
try:
self._batch_size_tensor = None
for tensor in _graph.as_graph_def().node:
if "IteratorGetNext" in tensor.name:
_input_tensor = _graph.get_tensor_by_name(tensor.name + ":0")
try:
self._batch_size_tensor = tf.shape(input=_input_tensor)[0]
except TypeError: # Ragged Tensor
self._batch_size_tensor = _input_tensor.bounding_shape()[0]
break
else:
raise RuntimeError(
"Tensor `{}` could not be found. "
"Make sure you are using tf.data API".format("IteratorGetNext")
)
except RuntimeError:
raise
except Exception as e:
raise RuntimeError(
"Impossible to fetch the tensor: `IteratorGetNext`. Make sure you are using tf.data API."
) from e
self._global_step_tensor = tf.compat.v1.train.get_or_create_global_step()
try:
self._AMP_loss_scale_tensor = _graph.get_tensor_by_name("current_loss_scale/Read/ReadVariableOp:0")
# NOTE: assumed tensor name -- the step counter is expected under `good_steps`
# (the exact name may vary across TF versions); if the lookup fails, the
# except clause below simply disables AMP logging.
self._AMP_steps_since_last_loss_scale = _graph.get_tensor_by_name("good_steps/Read/ReadVariableOp:0")
except RuntimeError:
raise
# TF-AMP is not activated
except Exception:
pass
# if self._is_training:
# self.runtime_data["params_count"] = tf.reduce_sum(
# [tf.reduce_prod(v.shape) for v in tf.trainable_variables()]
# )
def end(self, session): # pylint: disable=unused-argument
"""Called at the end of session.
The `session` argument can be used in case the hook wants to run final ops,
such as saving a last checkpoint.
If `session.run()` raises exception other than OutOfRangeError or
StopIteration then `end()` is not called.
Note the difference between `end()` and `after_run()` behavior when
`session.run()` raises OutOfRangeError or StopIteration. In that case
`end()` is called but `after_run()` is not called.
Args:
session: A TensorFlow Session that will be soon closed.
"""
self._session_run_times.append(time.time() - self._session_t0)
def after_create_session(self, session, coord): # pylint: disable=unused-argument
"""Called when new TensorFlow session is created.
This is called to signal the hooks that a new session has been created. This
has two essential differences with the situation in which `begin` is called:
* When this is called, the graph is finalized and ops can no longer be added
to the graph.
* This method will also be called as a result of recovering a wrapped
session, not only at the beginning of the overall session.
Args:
session: A TensorFlow Session that has been created.
coord: A Coordinator object which keeps track of all threads.
"""
# ========= Collect the number of GPUs ======== #
if self._is_training:
if MPI_is_distributed():
self._n_gpus = MPI_size()
elif tf.distribute.has_strategy():
self._n_gpus = tf.distribute.get_strategy().num_replicas_in_sync
else:
self._n_gpus = 1
else:
self._n_gpus = 1
# =========== TensorFlow Hook Setup =========== #
_global_step, _metrics = setup_tensorflow_hook(
sess=session,
logging_proxy=self._logging_proxy,
is_training=self._is_training,
is_initialized=self._initialized
)
if _global_step >= 0:
self._current_step = self._amp_steps_non_skipped = _global_step
self._metrics.update(_metrics)
if not self._is_training:
for metric_name in self._metrics.keys():
self._metrics[metric_name]["aggregator"].reset()
self._initialized = True
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #
self._session_t0 = time.time()
def before_run(self, run_context): # pylint: disable=unused-argument
"""Called before each call to run().
You can return from this call a `SessionRunArgs` object indicating ops or
tensors to add to the upcoming `run()` call. These ops/tensors will be run
together with the ops/tensors originally passed to the original run() call.
The run args you return can also contain feeds to be added to the run()
call.
The `run_context` argument is a `SessionRunContext` that provides
information about the upcoming `run()` call: the originally requested
op/tensors, the TensorFlow Session.
At this point graph is finalized and you can not add ops.
Args:
run_context: A `SessionRunContext` object.
Returns:
None or a `SessionRunArgs` object.
"""
self._current_step += 1
request_fetches = {
"global_step": self._global_step_tensor, "metrics": dict(), "batch_size": self._batch_size_tensor
}
if self._is_training and self._AMP_steps_since_last_loss_scale is not None:
request_fetches["AMP"] = {
"steps_since_last_loss_scale": self._AMP_steps_since_last_loss_scale,
"current_loss_scale": self._AMP_loss_scale_tensor,
}
if self._current_step % self._log_every_n_steps == 0:
for key, value in self._metrics.items():
request_fetches["metrics"][key] = value["tensor"]
self._step_t0 = time.time()
return tf.estimator.SessionRunArgs(request_fetches)
def after_run(self, run_context, run_values): # pylint: disable=unused-argument
"""Called after each call to run().
The `run_values` argument contains results of requested ops/tensors by
`before_run()`.
The `run_context` argument is the same one send to `before_run` call.
`run_context.request_stop()` can be called to stop the iteration.
If `session.run()` raises any exceptions then `after_run()` is not called.
Args:
run_context: A `SessionRunContext` object.
run_values: A SessionRunValues object.
"""
batch_time = time.time() - self._step_t0
_global_step = run_values.results["global_step"]
if self._is_training and self._AMP_steps_since_last_loss_scale is not None:
try:
AMP_steps_since_last_loss_scale = run_values.results["AMP"]["steps_since_last_loss_scale"]
AMP_loss_scale = run_values.results["AMP"]["current_loss_scale"]
except KeyError:
AMP_steps_since_last_loss_scale = None
AMP_loss_scale = None
if AMP_steps_since_last_loss_scale is not None:
# Step has been skipped
if _global_step != (self._amp_steps_non_skipped + 1):
logging.warning(
"AMP - Training iteration `#{step}` has been skipped and loss rescaled. "
"New Loss Scale: {loss_scale}\n".format(step=self._current_step, loss_scale=AMP_loss_scale)
)
else:
self._amp_steps_non_skipped += 1
if AMP_steps_since_last_loss_scale == 0:
logging.warning(
"AMP - Training iteration `#{step}` - Loss scale has been automatically increased. "
"New Loss Scale: {loss_scale}\n".format(step=self._current_step, loss_scale=AMP_loss_scale)
)
else:
AMP_steps_since_last_loss_scale = None
AMP_loss_scale = None
def get_model_throughput():
gpu_batch_size = run_values.results["batch_size"]
return gpu_batch_size / batch_time * self._n_gpus
# def get_model_stats():
# return get_tf_model_statistics(batch_size=run_values.results["batch_size"], scope_name=None)
#
# if self._model_stats is None:
# self._model_stats = get_model_stats()
is_log_step = self._current_step % self._log_every_n_steps == 0
if is_log_step:
if self._current_step > self._warmup_steps:
try:
model_throughput = self._model_throughput.read()
except ValueError:
model_throughput = get_model_throughput()
else:
model_throughput = get_model_throughput()
self._logging_proxy.log_step(iteration=self._current_step, throughput=model_throughput, gpu_stats=[])
self._logging_proxy.log_amp_runtime(
current_loss_scale=AMP_loss_scale,
steps_non_skipped=_global_step,
steps_since_last_scale=AMP_steps_since_last_loss_scale,
)
metric_data = dict()
for name, value in sorted(run_values.results["metrics"].items(), key=operator.itemgetter(0)):
self._metrics[name]["aggregator"].record(value)
metric_data[name] = self._metrics[name]["aggregator"].read()
self._logging_proxy.log_metrics(
metric_data=metric_data, iteration=self._current_step, runtime_mode=self._runtime_mode
)
print() # Visual Spacing
elif self._current_step > self._warmup_steps:
# Do not store speed for log step due to additional fetches
self._model_throughput.record(get_model_throughput())
class _SlaveGPUsHook(tf.estimator.SessionRunHook):
def after_create_session(self, session, coord):
with logging.temp_verbosity(logging.INFO): # Do not warn user about metric cleaning
clear_registered_metrics()
def real_autologging_hook(*args, **kwargs):
replica_id = tf.distribute.get_replica_context().replica_id_in_sync_group
# Do not set a logging hook for GPUs != 0
if MPI_rank_and_size()[0] != 0 or (isinstance(replica_id, tf.Tensor) and tf.get_static_value(replica_id) != 0):
return _SlaveGPUsHook()
else:
_ = LoggingBackend() # Making sure the backend is defined before any hook due to __atexit__ hook
return _AutoLoggingHook(*args, **kwargs)
def collect_registered_metrics():
if TF_METRICS: # if not empty
metrics = copy.copy(TF_METRICS)
# Do not warn user about metric cleaning
with logging.temp_verbosity(logging.INFO):
clear_registered_metrics()
return metrics
else:
return dict()
def get_model_variables():
"""return model variables: global variables without optimizer's variables"""
return [
# yapf: disable
var for var in tf.compat.v1.global_variables() if (
var.name[-11:] not in "/Momentum:0" and
var.name[-11:] not in "/Adadelta:0" and
var.name[-13:] not in "/Adadelta_1:0" and
var.name[-7:] not in "/Adam:0" and
var.name[-9:] not in "/Adam_1:0" and
var.name[-10:] not in "/Adagrad:0" and
var.name[-10:] not in "/RMSProp:0" and
var.name[-12:] not in "/RMSProp_1:0" and
var.name[-16:] not in "/LARSOptimizer:0"
)
# yapf: enable
]
def get_trainable_variables():
"""Get a list of trainable TensorFlow variables.
Returns
-------
list of Tensor
A list of trainable TensorFlow variables, sorted by name.
"""
if KERAS_MODELS or LooseVersion(tf.__version__) >= LooseVersion("2.0.0"):
logging.warning(
"In TF2.x, only trainable variables created with Keras Models are captured for logging.\n"
"In TF1.x, if any keras model is defined. Only variables created inside Keras Models will be logged."
)
var_list = list()
for model in KERAS_MODELS:
var_list.extend(model.trainable_variables)
# Keep only a list of unique variables (remove potential duplicates)
var_list = list(set(var_list))
# clearing the list of Keras Model to avoid memory leaks
KERAS_MODELS.clear()
return [var for var in sorted(var_list, key=lambda v: v.name)]
else:
# return tf.trainable_variables() # deprecated in TF2.x
from tensorflow.python.keras.backend import get_graph
return get_graph().get_collection('trainable_variables')
def setup_tensorflow_hook(sess, logging_proxy, is_training, is_initialized):
global_step = -1
if is_training:
if not is_initialized:
_global_step_tensor = tf.compat.v1.train.get_or_create_global_step()
global_step = sess.run(_global_step_tensor)
trainable_variables = get_trainable_variables()
def count_weights_in_varlist(var_list):
return np.sum([np.prod(s.get_shape()) for s in var_list])
logging_proxy.log_git_status()
logging_proxy.log_model_statistics(
model_statistics={
"# Trainable Weights": "{:,}".format(int(count_weights_in_varlist(trainable_variables))),
"# Model Weights": "{:,}".format(int(count_weights_in_varlist(get_model_variables()))),
}
)
logging_proxy.log_trainable_variables([(var.name, var.get_shape()) for var in trainable_variables])
else:
if not is_initialized:
global_step = 0
metrics = collect_registered_metrics()
logging_proxy.log_runtime(is_train=is_training)
return global_step, metrics
AutoLoggingHook = real_autologging_hook
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import re
import tensorflow as tf
from mask_rcnn.utils.logging_formatter import logging
from mask_rcnn.utils.distributed_utils import MPI_rank
__all__ = ["PretrainedWeightsLoadingHook"]
# pylint: disable=protected-access
# Currently variable_scope doesn't provide very good APIs to access
# all variables under scope and retrieve and check existing scopes.
def get_variable_full_name(var):
"""Returns the full name of a variable.
For normal Variables, this is the same as the var.op.name. For
sliced or PartitionedVariables, this name is the same for all the
slices/partitions. In both cases, this is normally the name used in
a checkpoint file.
Args:
var: A `Variable` object.
Returns:
A string that is the full name.
"""
if var._save_slice_info:
return var._save_slice_info.full_name
else:
return var.op.name
def assign_from_checkpoint(model_path, var_list, ignore_missing_vars=False):
"""Creates an operation to assign specific variables from a checkpoint.
Args:
model_path: The full path to the model checkpoint. To get latest checkpoint
use `model_path = tf.train.latest_checkpoint(checkpoint_dir)`
var_list: A list of (possibly partitioned) `Variable` objects or a
dictionary mapping names in the checkpoint to the corresponding variables
or list of variables to initialize from that checkpoint value. For
partitioned Variables, the name in the checkpoint must be the full
variable, not the name of the partitioned variable, eg. "my_var" rather
than "my_var/part_4". If empty, returns no_op(), {}.
ignore_missing_vars: Boolean, if True ignore variables missing in the
checkpoint with a warning instead of failing.
Returns:
the restore_op and the feed_dict that need to be run to restore var_list.
Raises:
ValueError: If `ignore_missing_vars` is False and the checkpoint specified
at `model_path` is missing one of the variables in `var_list`.
"""
# Normalize var_list into a dictionary mapping names in the
# checkpoint to the list of variables to initialize from that
# checkpoint variable. Sliced (including partitioned) variables will
# end up under the same key.
grouped_vars = {}
if isinstance(var_list, (tuple, list)):
for var in var_list:
ckpt_name = get_variable_full_name(var)
if ckpt_name not in grouped_vars:
grouped_vars[ckpt_name] = []
grouped_vars[ckpt_name].append(var)
else:
for ckpt_name, value in var_list.items():
if isinstance(value, (tuple, list)):
grouped_vars[ckpt_name] = value
else:
grouped_vars[ckpt_name] = [value]
# Read each checkpoint entry. Create a placeholder variable and
# add the (possibly sliced) data from the checkpoint to the feed_dict.
reader = tf.compat.v1.train.NewCheckpointReader(model_path)
feed_dict = {}
assign_ops = []
for ckpt_name in grouped_vars:
if not reader.has_tensor(ckpt_name):
log_str = 'Checkpoint is missing variable [%s]' % ckpt_name
if ignore_missing_vars:
logging.warning(log_str)
continue
else:
raise ValueError(log_str)
ckpt_value = reader.get_tensor(ckpt_name)
for var in grouped_vars[ckpt_name]:
placeholder_tensor = tf.compat.v1.placeholder(
dtype=var.dtype.base_dtype,
shape=var.get_shape(),
name='placeholder/' + var.op.name
)
assign_ops.append(var.assign(placeholder_tensor))
if not var._save_slice_info:
if var.get_shape() != ckpt_value.shape:
raise ValueError(
'Total size of new array must be unchanged for %s '
'lh_shape: [%s], rh_shape: [%s]' %
(ckpt_name, str(ckpt_value.shape), str(var.get_shape())))
feed_dict[placeholder_tensor] = ckpt_value.reshape(ckpt_value.shape)
else:
slice_dims = zip(var._save_slice_info.var_offset,
var._save_slice_info.var_shape)
slice_dims = [(start, start + size) for (start, size) in slice_dims]
slice_dims = [slice(*x) for x in slice_dims]
# NumPy requires a tuple (not a list) of slices for multi-dimensional indexing.
slice_value = ckpt_value[tuple(slice_dims)]
slice_value = slice_value.reshape(var._save_slice_info.var_shape)
feed_dict[placeholder_tensor] = slice_value
print_op = tf.print(
"[GPU %02d] Restoring pretrained weights (%d Tensors) from: %s" % (
MPI_rank(),
len(assign_ops),
model_path
),
output_stream=sys.stdout
)
with tf.control_dependencies([print_op]):
assign_op = tf.group(*assign_ops)
return assign_op, feed_dict
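# Illustrative usage (hedged sketch; `ckpt_path` is a hypothetical path):
#
#   var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)
#   assign_op, feed_dict = assign_from_checkpoint(
#       model_path=ckpt_path, var_list=var_list, ignore_missing_vars=True)
#   with tf.compat.v1.Session() as sess:
#       sess.run(assign_op, feed_dict=feed_dict)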
def build_assignment_map(prefix=None, skip_variables_regex=None):
"""Generate assignment map for loading checkpoints."""
all_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope=prefix)
if not prefix:
prefix = ''
assignment_map = {}
for var in all_vars:
var_name = var.name
if (
var_name[-11:] in "/Momentum:0" or
var_name[-11:] in "/Adadelta:0" or
var_name[-13:] in "/Adadelta_1:0" or
var_name[-7:] in "/Adam:0" or
var_name[-9:] in "/Adam_1:0" or
var_name[-10:] in "/Adagrad:0" or
var_name[-10:] in "/RMSProp:0" or
var_name[-12:] in "/RMSProp_1:0" or
var_name[-16:] in "/LARSOptimizer:0"
):
continue
# Trim the index of the variable.
if ':' in var_name:
var_name = var_name[:var_name.rindex(':')]
if skip_variables_regex and re.match(skip_variables_regex, var_name[len(prefix):]):
continue
assignment_map[var_name[len(prefix):]] = var
# assignment_map[var_name] = var
return assignment_map
class PretrainedWeightsLoadingHook(tf.estimator.SessionRunHook):
def __init__(self, prefix, checkpoint_path, skip_variables_regex=None):
self._prefix = prefix
self._checkpoint_path = checkpoint_path
self._skip_variables_regex = skip_variables_regex
self._is_initialized = False
self._init_op = None
self._init_feed_dict = None
def begin(self):
vars_to_load = build_assignment_map(
prefix=self._prefix,
skip_variables_regex=self._skip_variables_regex
)
self._init_op, self._init_feed_dict = assign_from_checkpoint(
model_path=self._checkpoint_path,
var_list=vars_to_load,
ignore_missing_vars=False
)
def after_create_session(self, session, coord=None):
if not self._is_initialized:
session.run(self._init_op, feed_dict=self._init_feed_dict)
logging.info("Pretrained weights loaded with success...\n")
self._is_initialized = True
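# Example usage (sketch; the checkpoint path is illustrative): attach the hook
# to an Estimator so the backbone weights are restored once, right after the
# session is created.
#
#     hook = PretrainedWeightsLoadingHook(
#         prefix='resnet50/',
#         checkpoint_path='/path/to/pretrained/resnet50.ckpt',
#         skip_variables_regex='^NO_SKIP$'
#     )
#     estimator.train(input_fn=train_input_fn, hooks=[hook])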
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defining common model params used across all the models."""
from absl import flags
def define_hparams_flags():
flags.DEFINE_string(
'log_path',
default="./mrcnn.json",
help=(
'The path where dllogger json file will be saved. Please include the'
' name of the json file as well.'
)
)
flags.DEFINE_string(
'data_dir',
default=None,
help=(
'The directory where the input data is stored. Please see the model'
' specific README.md for the expected data format.'
)
)
flags.DEFINE_string('checkpoint', default='', help='Checkpoint filepath')
flags.DEFINE_integer(
'eval_batch_size',
default=8,
help='Batch size for evaluation.'
)
flags.DEFINE_bool(
'eval_after_training',
default=True,
help='Run one eval after the training finishes.'
)
    flags.DEFINE_integer('eval_samples', default=5000, help='Number of samples to evaluate.')
flags.DEFINE_bool(
'include_groundtruth_in_features',
default=False,
help=(
'If `val_json_file` is not provided, one can also read groundtruth'
' from input by setting `include_groundtruth_in_features`=True'
)
)
# Gradient clipping is a fairly coarse heuristic to stabilize training.
# This model clips the gradient by its L2 norm globally (i.e., across
# all variables), using a threshold obtained from multiplying this
# parameter with sqrt(number_of_weights), to have a meaningful value
# across both training phases and different sizes of imported modules.
    # Reference value: 0.02; for 25M weights this yields a clip norm of 10.
    # Zero or a negative number means no clipping.
flags.DEFINE_float("global_gradient_clip_ratio", default=-1.0, help="Global Gradient Clipping Ratio")
flags.DEFINE_float("init_learning_rate", default=2.5e-3, help="Initial Learning Rate")
flags.DEFINE_float("warmup_learning_rate", default=0., help="Warmup Learning Rate Decay Factor")
    flags.DEFINE_bool('finetune_bn', False, 'Whether batch norm layers run in training mode')
flags.DEFINE_float("l2_weight_decay", default=1e-4, help="l2 regularization weight")
    flags.DEFINE_string('mode', default='train_and_eval', help='Mode to run: train, eval, or train_and_eval')
flags.DEFINE_string(
'model_dir',
default=None,
help='The directory where the model and training/evaluation summaries are stored.'
)
flags.DEFINE_float("momentum", default=0.9, help="Optimizer Momentum")
flags.DEFINE_integer('num_steps_per_eval', default=2500, help='Number of steps per evaluation epoch.')
flags.DEFINE_integer('save_checkpoints_steps', default=2500, help='Save a checkpoint every N steps.')
flags.DEFINE_integer('seed', default=None, help='Set a debug seed for reproducibility.')
flags.DEFINE_integer('train_batch_size', default=2, help='Batch size for training.')
flags.DEFINE_integer(
'total_steps',
default=938240,
help=(
'The number of steps to use for training. This flag'
' should be adjusted according to the --train_batch_size flag.'
)
)
flags.DEFINE_list(
'learning_rate_decay_levels',
default=['0.1', '0.01'],
help=(
'The learning rate decay levels which modify the learning rate using the formula:'
' `lr = decay * init_lr`. Decay factor applied at learning_rate_steps.'
)
)
flags.DEFINE_list(
'learning_rate_steps',
default=['480000', '640000'],
help=(
'The steps at which learning rate changes. This flag'
' should be adjusted according to the --train_batch_size flag.'
)
)
    flags.DEFINE_integer('warmup_steps', default=1000, help='The number of steps to apply the warmup learning rate for')
flags.DEFINE_bool('amp', default=False, help='Enable automatic mixed precision')
flags.DEFINE_bool(
'use_batched_nms',
default=False,
help='Enable Batched NMS at inference.'
)
flags.DEFINE_bool(
'use_custom_box_proposals_op',
default=False,
help='Use GenerateBoundingBoxProposals op.'
)
flags.DEFINE_bool('use_fake_data', False, 'Use fake input.')
flags.DEFINE_bool(
'use_tf_distributed',
default=False,
help='Use tensorflow distributed API'
)
flags.DEFINE_bool('xla', default=False, help='Enable XLA JIT Compiler.')
flags.DEFINE_string('training_file_pattern', default="", help='TFRecords file pattern for the training files')
flags.DEFINE_string('validation_file_pattern', default="", help='TFRecords file pattern for the validation files')
flags.DEFINE_string('val_json_file', default="", help='Filepath for the validation json file')
############################# TO BE REMOVED ###################################
flags.DEFINE_integer(
'report_frequency',
default=None,
help='The amount of batches in between accuracy reports at evaluation time'
)
############################# TO BE REMOVED ###################################
############################### ISSUES TO FIX - FLAGS #############################
# TODO: Remove when XLA at inference fixed
flags.DEFINE_bool(
'allow_xla_at_inference',
default=False,
help='Enable XLA JIT Compiler at Inference'
)
return flags.FLAGS
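# Example usage (sketch; `main` is an illustrative entry point): define the
# flags once at start-up and let absl parse argv.
#
#     from absl import app
#
#     FLAGS = define_hparams_flags()
#
#     def main(argv):
#         del argv  # unused
#         print(FLAGS.train_batch_size, FLAGS.total_steps)
#
#     if __name__ == '__main__':
#         app.run(main)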
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions to override model parameters from command-line flags."""
from mask_rcnn.hyperparameters import params_dict
ESSENTIAL_FLAGS = ['tpu', 'data_dir', 'model_dir']
def override_params_from_input_flags(params, input_flags):
"""Update params dictionary with input flags.
Args:
params: ParamsDict object containing dictionary of model parameters.
input_flags: All the flags with non-null value of overridden model
parameters.
Returns:
ParamsDict object containing dictionary of model parameters.
"""
if params is None:
        raise ValueError('Input dictionary is empty. It is expected to be loaded with default values.')
if not isinstance(params, params_dict.ParamsDict):
raise ValueError('The base parameter set must be a ParamsDict, was: {}'.format(type(params)))
essential_flag_dict = {}
for key in ESSENTIAL_FLAGS:
flag_value = input_flags.get_flag_value(key, None)
if flag_value is None:
            raise ValueError('Flag {} must not be None.'.format(key))
else:
essential_flag_dict[key] = flag_value
params_dict.override_params_dict(params, essential_flag_dict, is_strict=False)
normal_flag_dict = get_dictionary_from_flags(params.as_dict(), input_flags)
params_dict.override_params_dict(params, normal_flag_dict, is_strict=False)
return params
def get_dictionary_from_flags(params, input_flags):
"""Generate dictionary from non-null flags.
Args:
params: Python dictionary of model parameters.
input_flags: All the flags with non-null value of overridden model
parameters.
Returns:
Python dict of overriding model parameters.
"""
flag_dict = {}
for k, v in params.items():
if isinstance(v, dict):
d = get_dictionary_from_flags(v, input_flags)
flag_dict[k] = d
else:
try:
flag_value = input_flags.get_flag_value(k, None)
if flag_value is not None:
flag_dict[k] = flag_value
except AttributeError:
flag_dict[k] = v
return flag_dict
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
import warnings
import six
import yaml
import tensorflow as tf
from mask_rcnn.utils.logging_formatter import logging
class _Hyperparameters(object):
"""_Hyperparameters class to generate final hparams from various inputs."""
def __init__(self, default_hparams_file, specific_hparams_file, input_flags, hparams_overrides):
"""Initialze and load parameter dictionary with different input sources.
Args:
default_hparams_file: YAML storing default values of all hyperparameters.
specific_hparams_file: YAML file storing accelerator specific values of
hyperparameters to override the default values.
input_flags: Command line flags values for hyperparameters. [This is
for backward compatibility, so that users passing hyperparameters as
regular flags should not run into trouble].
            hparams_overrides: A kv string representing which hyperparameters need
                to be overridden from the command line.
Raises:
ValueError: Raised when 'default_hparams_file' is not readable.
"""
if not tf.io.gfile.exists(default_hparams_file):
raise ValueError(
'Expected a valid path to a YAML file, which represents the default '
'hyperparameters file. {}'.format(default_hparams_file)
)
self._params = {}
self._params_source = {}
self._default_hparams_file = default_hparams_file
self._specific_hparams_file = specific_hparams_file
self._input_flags = input_flags
self._hparams_overrides = hparams_overrides
def get_parameters(self, log_params):
"""Returns the dictionary loaded with final values of all hyperparameters.
Args:
            log_params: Bool specifying whether the final hyperparameter values
                should be logged.
Returns:
Python dictionary with all the final hyperparameters.
"""
self._params, self._params_source = load_from_file(
self._params, self._params_source, self._default_hparams_file
)
self._params, self._params_source = load_from_file(
self._params, self._params_source, self._specific_hparams_file
)
self._params, self._params_source = load_from_input_flags(self._params, self._params_source, self._input_flags)
self._params, self._params_source = load_from_hparams_overrides(
self._params, self._params_source, self._hparams_overrides
)
if log_params:
self.log_parameters()
return self._params
def log_parameters(self):
"""Log the hyperparameters value along with the source of those values.
"""
params_log = ''
for k in self._params:
params_log += k + ': \t' + str(self._params[k])
params_log += ' \t[' + self._params_source[k] + ']\n'
logging.info('\nModel hyperparameters [source]:\n%s', params_log)
def load_from_file(params, params_source, file_path):
"""Given a path to a YAML file, read the file and load it to dictionary.
Args:
params: Python dictionary of hyperparameters.
params_source: Python dictionary to record source of hyperparameters.
file_path: Python string containing path to file.
Returns:
Python dict of hyperparameters.
"""
if file_path is None:
return params, params_source
if not tf.io.gfile.exists(file_path):
warnings.warn('Could not read Hyperparameter file : ' + file_path, RuntimeWarning)
return params, params_source
with tf.io.gfile.GFile(file_path, 'r') as f:
        overrides = yaml.safe_load(f)
for key, value in six.iteritems(overrides):
params[key] = value
params_source[key] = os.path.basename(file_path)
return params, params_source
# TODO(amangu): Once the global hyperparameter flags are removed, this
# function will no longer be needed; remove it at that point.
def load_from_input_flags(params, params_source, input_flags):
"""Update params dictionary with input flags.
Args:
params: Python dictionary of hyperparameters.
params_source: Python dictionary to record source of hyperparameters.
input_flags: All the flags with non-null value of overridden
hyperparameters.
Returns:
Python dict of hyperparameters.
"""
if params is None:
        raise ValueError('Input dictionary is empty. It is expected to be loaded with default values.')
if not isinstance(params, dict):
raise ValueError('The base parameter set must be a Python dict, was: {}'.format(type(params)))
for key in params:
flag_value = input_flags.get_flag_value(key, None)
if flag_value is not None:
params[key] = flag_value
params_source[key] = 'Command-line flags'
return params, params_source
# TODO(amangu): Add tests to verify different dtypes of params.
def load_from_hparams_overrides(params, params_source, hparams_overrides):
"""Given a dictionary of hyperparameters and a list of overrides, merge them.
Args:
params: Python dict containing a base hyperparameters set.
params_source: Python dictionary to record source of hyperparameters.
hparams_overrides: Python list of strings. This is a set of k=v overrides
for the hyperparameters in `params`; if `k=v1` in `params` but `k=v2` in
`hparams_overrides`, the second value wins and the value for `k` is `v2`.
Returns:
Python dict of hyperparameters.
"""
if params is None:
        raise ValueError('Input dictionary is empty. It is expected to be loaded with default values.')
if not isinstance(params, dict):
raise ValueError('The base hyperparameters set must be a Python dict, was: {}'.format(type(params)))
if hparams_overrides is None:
return params, params_source
if isinstance(hparams_overrides, six.string_types):
hparams_overrides = [hparams_overrides]
if not isinstance(hparams_overrides, list):
raise ValueError(
'Expected that hparams_overrides would be `None`, a single string, or a'
' list of strings, was: {}'.format(type(hparams_overrides))
)
for kv_pair in hparams_overrides:
if not isinstance(kv_pair, six.string_types):
raise ValueError(
'Expected that hparams_overrides would contain Python list of strings,'
' but encountered an item: {}'.format(type(kv_pair))
)
        # Split only on the first '=' so that values containing '=' are kept intact.
        key, value = kv_pair.split('=', 1)
parser = type(params[key])
if parser is bool:
params[key] = value not in ('0', 'False', 'false')
else:
params[key] = parser(value)
params_source[key] = 'Command-line `hparams` flag'
return params, params_source
def get_hyperparameters(default_hparams_file, specific_hparams_file, input_flags, hparams_overrides, log_params=True):
"""Single function to get hparams for any model using different sources.
Args:
default_hparams_file: YAML storing default values of all hyperparameters.
specific_hparams_file: YAML file storing accelerator specific values of
hyperparameters to override the default values.
input_flags: Command line flags values for hyperparameters. [This is
for backward compatibility, so that users passing hyperparameters as
regular flags should not run into trouble].
        hparams_overrides: A kv string representing which hyperparameters need to
            be overridden from the command line.
        log_params: Bool specifying whether the final hyperparameter values
            should be logged.
Returns:
Python dictionary with all the final hyperparameters.
"""
parameter = _Hyperparameters(default_hparams_file, specific_hparams_file, input_flags, hparams_overrides)
return parameter.get_parameters(log_params)
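# Example usage (sketch; file names and the flags object are assumptions):
# merge the default YAML, an accelerator-specific YAML, parsed command-line
# flags, and explicit `k=v` overrides into one final dictionary.
#
#     params = get_hyperparameters(
#         default_hparams_file='configs/default.yaml',
#         specific_hparams_file='configs/gpu.yaml',
#         input_flags=FLAGS,
#         hparams_overrides='train_batch_size=4,init_learning_rate=0.005'
#     )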
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parameters used to build Mask-RCNN model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from argparse import Namespace
class _Namespace(Namespace):
def values(self):
return self.__dict__
def default_config():
return _Namespace(**dict(
# input pre-processing parameters
image_size=(832, 1344),
augment_input_data=True,
gt_mask_size=112,
# dataset specific parameters
num_classes=91,
# num_classes=81,
skip_crowd_during_training=True,
use_category=True,
# Region Proposal Network
rpn_positive_overlap=0.7,
rpn_negative_overlap=0.3,
rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5,
rpn_min_size=0.,
# Proposal layer.
batch_size_per_im=512,
fg_fraction=0.25,
fg_thresh=0.5,
bg_thresh_hi=0.5,
bg_thresh_lo=0.,
# Faster-RCNN heads.
fast_rcnn_mlp_head_dim=1024,
bbox_reg_weights=(10., 10., 5., 5.),
# Mask-RCNN heads.
include_mask=True, # whether or not to include mask branch. # ===== Not existing in MLPerf ===== #
mrcnn_resolution=28,
# training
train_rpn_pre_nms_topn=2000,
train_rpn_post_nms_topn=1000,
train_rpn_nms_threshold=0.7,
# evaluation
test_detections_per_image=100,
test_nms=0.5,
test_rpn_pre_nms_topn=1000,
test_rpn_post_nms_topn=1000,
test_rpn_nms_thresh=0.7,
# model architecture
min_level=2,
max_level=6,
num_scales=1,
aspect_ratios=[(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)],
anchor_scale=8.0,
# localization loss
rpn_box_loss_weight=1.0,
fast_rcnn_box_loss_weight=1.0,
mrcnn_weight_loss_mask=1.0,
# ---------- Training configurations ----------
# Skips loading variables from the resnet checkpoint. It is used for
# skipping nonexistent variables from the constructed graph. The list
# of loaded variables is constructed from the scope 'resnetX', where 'X'
# is depth of the resnet model. Supports regular expression.
skip_checkpoint_variables='^NO_SKIP$',
# ---------- Eval configurations ----------
# Visualizes images and detection boxes on TensorBoard.
visualize_images_summary=False,
))
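# Example usage (sketch): the returned config behaves like an argparse
# Namespace with an extra `values()` accessor for the underlying dict.
#
#     config = default_config()
#     config.train_rpn_post_nms_topn = 2000  # attribute-style override
#     assert config.num_classes == 91
#     param_dict = config.values()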
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A parameter dictionary class which supports the nest structure."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import copy
import re
import six
import yaml
import tensorflow as tf
# regex pattern that matches on key-value pairs in a comma-separated
# key-value pair string. It splits each k-v pair on the = sign, and
# matches on values that are within single quotes, double quotes, single
# values (e.g. floats, ints, etc.), and a lists within brackets.
_PARAM_RE = re.compile(
r"""
(?P<name>[a-zA-Z][\w\.]*) # variable name: "var" or "x"
\s*=\s*
((?P<val>\'[^\]]*\' # single quote
|
\"[^\]]*\" # double quote
|
[^,\[]* # single value
|
\[[^\]]*\])) # list of values
($|,\s*)""", re.VERBOSE
)
class ParamsDict(object):
"""A hyperparameter container class."""
RESERVED_ATTR = ['_locked', '_restrictions']
def __init__(self, default_params=None, restrictions=None):
"""Instantiate a ParamsDict.
Instantiate a ParamsDict given a set of default parameters and a list of
restrictions. Upon initialization, it validates itself by checking all the
        defined restrictions, and raises an error if it finds an inconsistency.
Args:
default_params: a Python dict or another ParamsDict object including the
default parameters to initialize.
restrictions: a list of strings, which define a list of restrictions to
ensure the consistency of different parameters internally. Each
restriction string is defined as a binary relation with a set of
operators, including {'==', '!=', '<', '<=', '>', '>='}.
"""
self._locked = False
self._restrictions = []
if restrictions:
self._restrictions = restrictions
if default_params is None:
default_params = {}
self.override(default_params, is_strict=False)
self.validate()
def _set(self, k, v):
if isinstance(v, dict):
self.__dict__[k] = ParamsDict(v)
else:
self.__dict__[k] = copy.deepcopy(v)
def __setattr__(self, k, v):
"""Sets the value of the existing key.
Note that this does not allow directly defining a new key. Use the
`override` method with `is_strict=False` instead.
Args:
k: the key string.
v: the value to be used to set the key `k`.
Raises:
KeyError: if k is not defined in the ParamsDict.
"""
if k not in ParamsDict.RESERVED_ATTR:
if k not in self.__dict__.keys():
                raise KeyError(
                    'The key `{}` does not exist. '
                    'To extend the existing keys, use '
                    '`override` with `is_strict` = False.'.format(k)
                )
if self._locked:
                raise ValueError('The ParamsDict has been locked. No change is allowed.')
self._set(k, v)
def __getattr__(self, k):
"""Gets the value of the existing key.
Args:
k: the key string.
Returns:
the value of the key.
Raises:
KeyError: if k is not defined in the ParamsDict.
"""
if k not in self.__dict__.keys():
raise KeyError('The key `{}` does not exist. '.format(k))
return self.__dict__[k]
def override(self, override_params, is_strict=True):
"""Override the ParamsDict with a set of given params.
Args:
override_params: a dict or a ParamsDict specifying the parameters to
be overridden.
is_strict: a boolean specifying whether override is strict or not. If
True, keys in `override_params` must be present in the ParamsDict.
If False, keys in `override_params` can be different from what is
currently defined in the ParamsDict. In this case, the ParamsDict will
be extended to include the new keys.
"""
if self._locked:
raise ValueError('The ParamsDict has been locked. No change is allowed.')
if isinstance(override_params, ParamsDict):
override_params = override_params.as_dict()
self._override(override_params, is_strict) # pylint: disable=protected-access
def _override(self, override_dict, is_strict=True):
"""The implementation of `override`."""
for k, v in six.iteritems(override_dict):
            if k in ParamsDict.RESERVED_ATTR:
                raise KeyError('The key `{}` is internally reserved and cannot be overridden.'.format(k))
if k not in self.__dict__.keys():
if is_strict:
raise KeyError(
'The key `{}` does not exist. '
'To extend the existing keys, use '
'`override` with `is_strict` = False.'.format(k)
)
else:
self._set(k, v)
else:
if isinstance(v, dict):
self.__dict__[k]._override(v, is_strict) # pylint: disable=protected-access
elif isinstance(v, ParamsDict):
self.__dict__[k]._override(v.as_dict(), is_strict) # pylint: disable=protected-access
else:
self.__dict__[k] = copy.deepcopy(v)
def lock(self):
"""Makes the ParamsDict immutable."""
self._locked = True
def as_dict(self):
"""Returns a dict representation of ParamsDict.
For the nested ParamsDict, a nested dict will be returned.
"""
params_dict = {}
for k, v in six.iteritems(self.__dict__):
if k not in ParamsDict.RESERVED_ATTR:
if isinstance(v, ParamsDict):
params_dict[k] = v.as_dict()
else:
params_dict[k] = copy.deepcopy(v)
return params_dict
def validate(self):
"""Validate the parameters consistency based on the restrictions.
        This method validates the internal consistency using the pre-defined list of
        restrictions. A restriction is defined as a string which specifies a binary
        operation. The supported binary operations are {'==', '!=', '<', '<=', '>',
        '>='}. Note that the meaning of these operators is consistent with the
        underlying Python implementation. Users should make sure the restrictions
        they define make sense for the parameter types involved.
For example, for a ParamsDict like the following
```
a:
a1: 1
a2: 2
b:
bb:
bb1: 10
bb2: 20
ccc:
a1: 1
a3: 3
```
one can define two restrictions like this
['a.a1 == b.ccc.a1', 'a.a2 <= b.bb.bb2']
What it enforces are:
        - a.a1 = 1 == b.ccc.a1 = 1
- a.a2 = 2 <= b.bb.bb2 = 20
Raises:
KeyError: if any of the following happens
(1) any of parameters in any of restrictions is not defined in
ParamsDict,
(2) any inconsistency violating the restriction is found.
ValueError: if the restriction defined in the string is not supported.
"""
def _get_kv(dotted_string, params_dict):
tokenized_params = dotted_string.split('.')
v = params_dict
for t in tokenized_params:
v = v[t]
return tokenized_params[-1], v
def _get_kvs(tokens, params_dict):
if len(tokens) != 2:
raise ValueError('Only support binary relation in restriction.')
stripped_tokens = [t.strip() for t in tokens]
left_k, left_v = _get_kv(stripped_tokens[0], params_dict)
right_k, right_v = _get_kv(stripped_tokens[1], params_dict)
return left_k, left_v, right_k, right_v
params_dict = self.as_dict()
for restriction in self._restrictions:
            # Check the two-character operators ('<=', '>=') before their
            # single-character prefixes ('<', '>'); otherwise 'a <= b' would be
            # split on '<' and the '<=' branch would never be reached.
            if '==' in restriction:
                tokens = restriction.split('==')
                _, left_v, _, right_v = _get_kvs(tokens, params_dict)
                if left_v != right_v:
                    raise KeyError('Found inconsistency between key `{}` and key `{}`.'.format(tokens[0], tokens[1]))
            elif '!=' in restriction:
                tokens = restriction.split('!=')
                _, left_v, _, right_v = _get_kvs(tokens, params_dict)
                if left_v == right_v:
                    raise KeyError('Found inconsistency between key `{}` and key `{}`.'.format(tokens[0], tokens[1]))
            elif '<=' in restriction:
                tokens = restriction.split('<=')
                _, left_v, _, right_v = _get_kvs(tokens, params_dict)
                if left_v > right_v:
                    raise KeyError('Found inconsistency between key `{}` and key `{}`.'.format(tokens[0], tokens[1]))
            elif '<' in restriction:
                tokens = restriction.split('<')
                _, left_v, _, right_v = _get_kvs(tokens, params_dict)
                if left_v >= right_v:
                    raise KeyError('Found inconsistency between key `{}` and key `{}`.'.format(tokens[0], tokens[1]))
            elif '>=' in restriction:
                tokens = restriction.split('>=')
                _, left_v, _, right_v = _get_kvs(tokens, params_dict)
                if left_v < right_v:
                    raise KeyError('Found inconsistency between key `{}` and key `{}`.'.format(tokens[0], tokens[1]))
            elif '>' in restriction:
                tokens = restriction.split('>')
                _, left_v, _, right_v = _get_kvs(tokens, params_dict)
                if left_v <= right_v:
                    raise KeyError('Found inconsistency between key `{}` and key `{}`.'.format(tokens[0], tokens[1]))
            else:
                raise ValueError('Unsupported relation in restriction.')
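# Example usage (sketch): a ParamsDict with one restriction. `validate()` is
# run on construction and can be re-run after overrides; it raises KeyError
# when the relation no longer holds.
#
#     params = ParamsDict(
#         default_params={'a': {'a1': 1}, 'b': {'ccc': {'a1': 1}}},
#         restrictions=['a.a1 == b.ccc.a1']
#     )
#     params.override({'a': {'a1': 2}}, is_strict=True)
#     params.validate()  # raises KeyError: a.a1 != b.ccc.a1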
def read_yaml_to_params_dict(file_path):
"""Reads a YAML file to a ParamsDict."""
with tf.io.gfile.GFile(file_path, 'r') as f:
        params_dict = yaml.safe_load(f)
return ParamsDict(params_dict)
def save_params_dict_to_yaml(params, file_path):
"""Saves the input ParamsDict to a YAML file."""
with tf.io.gfile.GFile(file_path, 'w') as f:
def _my_list_rep(dumper, data):
# u'tag:yaml.org,2002:seq' is the YAML internal tag for sequence.
return dumper.represent_sequence(u'tag:yaml.org,2002:seq', data, flow_style=True)
yaml.add_representer(list, _my_list_rep)
yaml.dump(params.as_dict(), f, default_flow_style=False)
def nested_csv_str_to_json_str(csv_str):
"""Converts a nested (using '.') comma-separated k=v string to a JSON string.
Converts a comma-separated string of key/value pairs that supports
nesting of keys to a JSON string. Nesting is implemented using
'.' between levels for a given key.
Spacing between commas and = is supported (e.g. there is no difference between
"a=1,b=2", "a = 1, b = 2", or "a=1, b=2") but there should be no spaces before
keys or after values (e.g. " a=1,b=2" and "a=1,b=2 " are not supported).
Note that this will only support values supported by CSV, meaning
values such as nested lists (e.g. "a=[[1,2,3],[4,5,6]]") are not
supported. Strings are supported as well, e.g. "a='hello'".
An example conversion would be:
"a=1, b=2, c.a=2, c.b=3, d.a.a=5"
to
"{ a: 1, b : 2, c: {a : 2, b : 3}, d: {a: {a : 5}}}"
Args:
csv_str: the comma separated string.
Returns:
the converted JSON string.
Raises:
        ValueError: If csv_str is not a valid comma-separated string of key/value
            pairs or if the string is formatted incorrectly.
"""
if not csv_str:
return ''
formatted_entries = []
nested_map = collections.defaultdict(list)
pos = 0
while pos < len(csv_str):
m = _PARAM_RE.match(csv_str, pos)
if not m:
            raise ValueError('Malformed hyperparameter value while parsing CSV string: %s' % csv_str[pos:])
pos = m.end()
# Parse the values.
m_dict = m.groupdict()
name = m_dict['name']
v = m_dict['val']
name_nested = name.split('.')
if len(name_nested) > 1:
grouping = name_nested[0]
value = '.'.join(name_nested[1:]) + '=' + v
nested_map[grouping].append(value)
else:
formatted_entries.append('%s : %s' % (name, v))
for grouping, value in nested_map.items():
value = ','.join(value)
value = nested_csv_str_to_json_str(value)
formatted_entries.append('%s : %s' % (grouping, value))
return '{' + ', '.join(formatted_entries) + '}'
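# Example (sketch): nested keys are grouped by their '.' prefix and the result
# is a JSON-ish string that yaml can parse back into a nested dict, e.g.
#
#     s = nested_csv_str_to_json_str('a=1, b=2, c.a=2, c.b=3')
#     # s is roughly '{a : 1, b : 2, c : {a : 2, b : 3}}'
#     d = yaml.safe_load(s)  # {'a': 1, 'b': 2, 'c': {'a': 2, 'b': 3}}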
def override_params_dict(params, dict_or_string_or_yaml_file, is_strict):
"""Override a given ParamsDict using a dict, JSON/YAML/CSV string or YAML file.
The logic of the function is outlined below:
1. Test that the input is a dict. If not, proceed to 2.
    2. Test that the input is a string. If not, raise a ValueError for the unknown input type.
2.1. Test if the string is in a CSV format. If so, parse.
If not, proceed to 2.2.
2.2. Try loading the string as a YAML/JSON. If successful, parse to
dict and use it to override. If not, proceed to 2.3.
2.3. Try using the string as a file path and load the YAML file.
Args:
params: a ParamsDict object to be overridden.
dict_or_string_or_yaml_file: a Python dict, JSON/YAML/CSV string or
path to a YAML file specifying the parameters to be overridden.
is_strict: a boolean specifying whether override is strict or not.
Returns:
params: the overridden ParamsDict object.
Raises:
ValueError: if failed to override the parameters.
"""
if not dict_or_string_or_yaml_file:
return params
if isinstance(dict_or_string_or_yaml_file, dict):
params.override(dict_or_string_or_yaml_file, is_strict)
elif isinstance(dict_or_string_or_yaml_file, six.string_types):
try:
            dict_or_string_or_yaml_file = nested_csv_str_to_json_str(dict_or_string_or_yaml_file)
except ValueError:
pass
        params_dict = yaml.safe_load(dict_or_string_or_yaml_file)
if isinstance(params_dict, dict):
params.override(params_dict, is_strict)
else:
with tf.io.gfile.GFile(dict_or_string_or_yaml_file) as f:
                params.override(yaml.safe_load(f), is_strict)
else:
raise ValueError('Unknown input type to parse.')
return params
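# Example usage (sketch; 'my_params.yaml' is an illustrative path): a dict, a
# CSV/JSON/YAML string, or a YAML file path can all drive the override.
#
#     params = ParamsDict({'train_batch_size': 2, 'amp': False})
#     override_params_dict(params, 'train_batch_size=4', is_strict=True)
#     override_params_dict(params, {'amp': True}, is_strict=True)
#     override_params_dict(params, 'my_params.yaml', is_strict=False)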
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#============================================================================
"""Utils to handle parameters IO."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import yaml
import tensorflow as tf
def save_hparams_to_yaml(hparams, file_path):
with tf.io.gfile.GFile(file_path, 'w') as f:
try:
hparams_val = hparams.values()
except AttributeError:
hparams_val = hparams.__dict__
yaml.dump(hparams_val, f)
def override_hparams(hparams, dict_or_string_or_yaml_file):
"""Override a given hparams using a dict or a string or a JSON file.
Args:
hparams: a HParams object to be overridden.
dict_or_string_or_yaml_file: a Python dict, or a comma-separated string,
or a path to a YAML file specifying the parameters to be overridden.
Returns:
hparams: the overridden HParams object.
Raises:
ValueError: if failed to override the parameters.
"""
if not dict_or_string_or_yaml_file:
return hparams
if isinstance(dict_or_string_or_yaml_file, dict):
for key, val in dict_or_string_or_yaml_file.items():
if key not in hparams:
try: # TF 1.x
hparams.add_hparam(key, val)
except AttributeError: # TF 2.x
try: # Dict
hparams[key] = val
except TypeError: # Namespace
setattr(hparams, key, val)
else:
raise ValueError("Parameter `%s` is already defined" % key)
# hparams.override_from_dict(dict_or_string_or_yaml_file)
elif isinstance(dict_or_string_or_yaml_file, six.string_types):
try:
hparams.parse(dict_or_string_or_yaml_file)
except ValueError as parse_error:
try:
with tf.io.gfile.GFile(dict_or_string_or_yaml_file) as f:
                    hparams.override_from_dict(yaml.safe_load(f))
            except Exception as read_error:
                parse_message = 'Failed to parse config string: %s\n' % str(parse_error)
                read_message = 'Failed to parse yaml file provided. %s' % str(read_error)
raise ValueError(parse_message + read_message)
else:
raise ValueError('Unknown input type to parse.')
return hparams
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model definition for the Mask-RCNN Model.
Defines model_fn of Mask-RCNN for TF Estimator. The model_fn includes Mask-RCNN
model architecture, loss function, learning rate schedule, and evaluation
procedure.
"""
import itertools
import tensorflow as tf
from mask_rcnn import anchors
from mask_rcnn.models import fpn
from mask_rcnn.models import heads
from mask_rcnn.models import resnet
from mask_rcnn.training import losses, learning_rates
from mask_rcnn.ops import postprocess_ops
from mask_rcnn.ops import roi_ops
from mask_rcnn.ops import spatial_transform_ops
from mask_rcnn.ops import training_ops
from mask_rcnn.utils.logging_formatter import logging
from mask_rcnn.utils.distributed_utils import MPI_is_distributed
from mask_rcnn.utils.distributed_utils import MPI_local_rank
from mask_rcnn.utils.meters import StandardMeter
from mask_rcnn.utils.metric_tracking import register_metric
from mask_rcnn.utils.lazy_imports import LazyImport
hvd = LazyImport("horovod.tensorflow")
MODELS = dict()
def create_optimizer(learning_rate, params):
"""Creates optimized based on the specified flags."""
optimizer = tf.compat.v1.train.MomentumOptimizer(learning_rate, momentum=params['momentum'])
if MPI_is_distributed():
optimizer = hvd.DistributedOptimizer(
optimizer,
name=None,
device_dense='/gpu:0',
device_sparse='',
# compression=hvd.Compression.fp16,
compression=hvd.Compression.none,
sparse_as_dense=False
)
if params["amp"]:
loss_scale = tf.train.experimental.DynamicLossScale(
initial_loss_scale=(2 ** 12),
increment_period=2000,
multiplier=2.0
)
optimizer = tf.compat.v1.train.experimental.MixedPrecisionLossScaleOptimizer(optimizer, loss_scale=loss_scale)
return optimizer
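# Example usage (sketch): `params` only needs the keys read above; the
# learning rate is typically a scalar tensor from the schedule in
# `learning_rates`, but a Python float works too.
#
#     opt = create_optimizer(
#         learning_rate=2.5e-3,
#         params={'momentum': 0.9, 'amp': False}
#     )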
def compute_model_statistics(batch_size, is_training=True):
"""Compute number of parameters and FLOPS."""
options = tf.compat.v1.profiler.ProfileOptionBuilder.float_operation()
options['output'] = 'none'
from tensorflow.python.keras.backend import get_graph
flops = tf.compat.v1.profiler.profile(get_graph(), options=options).total_float_ops
flops_per_image = flops / batch_size
logging.info('[%s Compute Statistics] %.1f GFLOPS/image' % (
"Training" if is_training else "Inference",
flops_per_image/1e9
))
def build_model_graph(features, labels, is_training, params):
"""Builds the forward model graph."""
model_outputs = {}
is_gpu_inference = not is_training and params['use_batched_nms']
batch_size, image_height, image_width, _ = features['images'].get_shape().as_list()
if 'source_ids' not in features:
features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)
all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
params['num_scales'], params['aspect_ratios'],
params['anchor_scale'],
(image_height, image_width))
MODELS["backbone"] = resnet.Resnet_Model(
"resnet50",
data_format='channels_last',
trainable=is_training,
finetune_bn=params['finetune_bn']
)
backbone_feats = MODELS["backbone"](
features['images'],
training=is_training,
)
MODELS["FPN"] = fpn.FPNNetwork(params['min_level'], params['max_level'], trainable=is_training)
fpn_feats = MODELS["FPN"](backbone_feats, training=is_training)
model_outputs.update({'fpn_features': fpn_feats})
def rpn_head_fn(features, min_level=2, max_level=6, num_anchors=3):
"""Region Proposal Network (RPN) for Mask-RCNN."""
scores_outputs = dict()
box_outputs = dict()
MODELS["RPN_Heads"] = heads.RPN_Head_Model(name="rpn_head", num_anchors=num_anchors, trainable=is_training)
for level in range(min_level, max_level + 1):
scores_outputs[level], box_outputs[level] = MODELS["RPN_Heads"](features[level], training=is_training)
return scores_outputs, box_outputs
rpn_score_outputs, rpn_box_outputs = rpn_head_fn(
features=fpn_feats,
min_level=params['min_level'],
max_level=params['max_level'],
        num_anchors=len(params['aspect_ratios']) * params['num_scales']
)
if is_training:
rpn_pre_nms_topn = params['train_rpn_pre_nms_topn']
rpn_post_nms_topn = params['train_rpn_post_nms_topn']
rpn_nms_threshold = params['train_rpn_nms_threshold']
else:
rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
rpn_post_nms_topn = params['test_rpn_post_nms_topn']
rpn_nms_threshold = params['test_rpn_nms_thresh']
if params['use_custom_box_proposals_op']:
rpn_box_scores, rpn_box_rois = roi_ops.custom_multilevel_propose_rois(
scores_outputs=rpn_score_outputs,
box_outputs=rpn_box_outputs,
all_anchors=all_anchors,
image_info=features['image_info'],
rpn_pre_nms_topn=rpn_pre_nms_topn,
rpn_post_nms_topn=rpn_post_nms_topn,
rpn_nms_threshold=rpn_nms_threshold,
rpn_min_size=params['rpn_min_size']
)
else:
rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
scores_outputs=rpn_score_outputs,
box_outputs=rpn_box_outputs,
all_anchors=all_anchors,
image_info=features['image_info'],
rpn_pre_nms_topn=rpn_pre_nms_topn,
rpn_post_nms_topn=rpn_post_nms_topn,
rpn_nms_threshold=rpn_nms_threshold,
rpn_min_size=params['rpn_min_size'],
bbox_reg_weights=None,
use_batched_nms=params['use_batched_nms']
)
rpn_box_rois = tf.cast(rpn_box_rois, dtype=tf.float32)
if is_training:
rpn_box_rois = tf.stop_gradient(rpn_box_rois)
        rpn_box_scores = tf.stop_gradient(rpn_box_scores)  # TODO (Jonathan): currently unused; decide whether to keep it.
# Sampling
box_targets, class_targets, rpn_box_rois, proposal_to_label_map = training_ops.proposal_label_op(
rpn_box_rois,
labels['gt_boxes'],
labels['gt_classes'],
batch_size_per_im=params['batch_size_per_im'],
fg_fraction=params['fg_fraction'],
fg_thresh=params['fg_thresh'],
bg_thresh_hi=params['bg_thresh_hi'],
bg_thresh_lo=params['bg_thresh_lo']
)
# Performs multi-level RoIAlign.
box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
features=fpn_feats,
boxes=rpn_box_rois,
output_size=7,
is_gpu_inference=is_gpu_inference
)
MODELS["Box_Head"] = heads.Box_Head_Model(
num_classes=params['num_classes'],
mlp_head_dim=params['fast_rcnn_mlp_head_dim'],
trainable=is_training
)
class_outputs, box_outputs, _ = MODELS["Box_Head"](inputs=box_roi_features)
if not is_training:
if params['use_batched_nms']:
generate_detections_fn = postprocess_ops.generate_detections_gpu
else:
generate_detections_fn = postprocess_ops.generate_detections_tpu
detections = generate_detections_fn(
class_outputs=class_outputs,
box_outputs=box_outputs,
anchor_boxes=rpn_box_rois,
image_info=features['image_info'],
pre_nms_num_detections=params['test_rpn_post_nms_topn'],
post_nms_num_detections=params['test_detections_per_image'],
nms_threshold=params['test_nms'],
bbox_reg_weights=params['bbox_reg_weights']
)
model_outputs.update({
'num_detections': detections[0],
'detection_boxes': detections[1],
'detection_classes': detections[2],
'detection_scores': detections[3],
})
else: # is training
encoded_box_targets = training_ops.encode_box_targets(
boxes=rpn_box_rois,
gt_boxes=box_targets,
gt_labels=class_targets,
bbox_reg_weights=params['bbox_reg_weights']
)
model_outputs.update({
'rpn_score_outputs': rpn_score_outputs,
'rpn_box_outputs': rpn_box_outputs,
'class_outputs': class_outputs,
'box_outputs': box_outputs,
'class_targets': class_targets,
'box_targets': encoded_box_targets,
'box_rois': rpn_box_rois,
})
# Faster-RCNN mode.
if not params['include_mask']:
return model_outputs
# Mask sampling
if not is_training:
selected_box_rois = model_outputs['detection_boxes']
class_indices = model_outputs['detection_classes']
        # If using GPU for inference, delay the cast until the Gather ops show
        # up, since GPU inference handles floating point better.
        # TODO(laigd): revisit this when newer versions of the GPU libraries
        # are released.
if not params['use_batched_nms']:
class_indices = tf.cast(class_indices, dtype=tf.int32)
else:
selected_class_targets, selected_box_targets, \
selected_box_rois, proposal_to_label_map = training_ops.select_fg_for_masks(
class_targets=class_targets,
box_targets=box_targets,
boxes=rpn_box_rois,
proposal_to_label_map=proposal_to_label_map,
max_num_fg=int(params['batch_size_per_im'] * params['fg_fraction'])
)
class_indices = tf.cast(selected_class_targets, dtype=tf.int32)
mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
features=fpn_feats,
boxes=selected_box_rois,
output_size=14,
is_gpu_inference=is_gpu_inference
)
MODELS["Mask_Head"] = heads.Mask_Head_Model(
class_indices,
num_classes=params['num_classes'],
mrcnn_resolution=params['mrcnn_resolution'],
is_gpu_inference=is_gpu_inference,
trainable=is_training,
name="mask_head"
)
mask_outputs = MODELS["Mask_Head"](inputs=mask_roi_features)
if MPI_local_rank() == 0:
# Print #FLOPs in model.
compute_model_statistics(batch_size, is_training=is_training)
if is_training:
mask_targets = training_ops.get_mask_targets(
fg_boxes=selected_box_rois,
fg_proposal_to_label_map=proposal_to_label_map,
fg_box_targets=selected_box_targets,
mask_gt_labels=labels['cropped_gt_masks'],
output_size=params['mrcnn_resolution']
)
model_outputs.update({
'mask_outputs': mask_outputs,
'mask_targets': mask_targets,
'selected_class_targets': selected_class_targets,
})
else:
model_outputs.update({
'detection_masks': tf.nn.sigmoid(mask_outputs),
})
return model_outputs
def _model_fn(features, labels, mode, params):
"""Model defination for the Mask-RCNN model based on ResNet.
Args:
features: the input image tensor and auxiliary information, such as
`image_info` and `source_ids`. The image tensor has a shape of
[batch_size, height, width, 3]. The height and width are fixed and equal.
labels: the input labels in a dictionary. The labels include score targets
and box targets which are dense label maps. The labels are generated from
get_input_fn function in data/dataloader.py
        mode: the mode of the Estimator, one of TRAIN, EVAL, and PREDICT.
        params: the dictionary defining the hyperparameters of the model. The
            default settings are provided by `default_config()` in the Mask-RCNN
            parameters module.
    Returns:
        spec: the EstimatorSpec to run training, evaluation, or prediction.
"""
# Set up training loss and learning rate.
global_step = tf.compat.v1.train.get_or_create_global_step()
if mode == tf.estimator.ModeKeys.PREDICT:
if params['include_groundtruth_in_features'] and 'labels' in features:
            # Include groundtruth for eval.
labels = features['labels']
else:
labels = None
if 'features' in features:
features = features['features']
            # Otherwise, it is in export mode and the features are passed in directly.
model_outputs = build_model_graph(features, labels, mode == tf.estimator.ModeKeys.TRAIN, params)
model_outputs.update({
'source_id': features['source_ids'],
'image_info': features['image_info'],
})
if mode == tf.estimator.ModeKeys.PREDICT and 'orig_images' in features:
model_outputs['orig_images'] = features['orig_images']
# First check if it is in PREDICT mode or EVAL mode to fill out predictions.
# Predictions are used during the eval step to generate metrics.
if mode in [tf.estimator.ModeKeys.PREDICT, tf.estimator.ModeKeys.EVAL]:
predictions = {}
try:
model_outputs['orig_images'] = features['orig_images']
except KeyError:
pass
if labels and params['include_groundtruth_in_features']:
            # Labels can only be embedded in predictions. A prediction value
            # cannot be a dictionary.
predictions.update(labels)
model_outputs.pop('fpn_features', None)
predictions.update(model_outputs)
if mode == tf.estimator.ModeKeys.PREDICT:
# If we are doing PREDICT, we can return here.
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    # score_loss and box_loss are for logging; only total_loss is optimized.
total_rpn_loss, rpn_score_loss, rpn_box_loss = losses.rpn_loss(
score_outputs=model_outputs['rpn_score_outputs'],
box_outputs=model_outputs['rpn_box_outputs'],
labels=labels,
params=params
)
total_fast_rcnn_loss, fast_rcnn_class_loss, fast_rcnn_box_loss = losses.fast_rcnn_loss(
class_outputs=model_outputs['class_outputs'],
box_outputs=model_outputs['box_outputs'],
class_targets=model_outputs['class_targets'],
box_targets=model_outputs['box_targets'],
params=params
)
# Only training has the mask loss.
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/model_builder.py
if mode == tf.estimator.ModeKeys.TRAIN and params['include_mask']:
mask_loss = losses.mask_rcnn_loss(
mask_outputs=model_outputs['mask_outputs'],
mask_targets=model_outputs['mask_targets'],
select_class_targets=model_outputs['selected_class_targets'],
params=params
)
else:
mask_loss = 0.
trainable_variables = list(itertools.chain.from_iterable([model.trainable_variables for model in MODELS.values()]))
l2_regularization_loss = params['l2_weight_decay'] * tf.add_n([
tf.nn.l2_loss(v)
for v in trainable_variables
if not any([pattern in v.name for pattern in ["batch_normalization", "bias", "beta"]])
])
total_loss = total_rpn_loss + total_fast_rcnn_loss + mask_loss + l2_regularization_loss
if mode == tf.estimator.ModeKeys.EVAL:
# Predictions can only contain a dict of tensors, not a dict of dict of
# tensors. These outputs are not used for eval purposes.
del predictions['rpn_score_outputs']
del predictions['rpn_box_outputs']
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions,
loss=total_loss
)
if mode == tf.estimator.ModeKeys.TRAIN:
learning_rate = learning_rates.step_learning_rate_with_linear_warmup(
global_step=global_step,
init_learning_rate=params['init_learning_rate'],
warmup_learning_rate=params['warmup_learning_rate'],
warmup_steps=params['warmup_steps'],
learning_rate_levels=params['learning_rate_levels'],
learning_rate_steps=params['learning_rate_steps']
)
optimizer = create_optimizer(learning_rate, params)
grads_and_vars = optimizer.compute_gradients(total_loss, trainable_variables, colocate_gradients_with_ops=True)
gradients, variables = zip(*grads_and_vars)
grads_and_vars = []
# Special treatment for biases (beta is named as bias in reference model)
# Reference: https://github.com/ddkang/Detectron/blob/80f3295308/lib/modeling/optimizer.py#L109
for grad, var in zip(gradients, variables):
if grad is not None and any([pattern in var.name for pattern in ["bias", "beta"]]):
grad = 2.0 * grad
grads_and_vars.append((grad, var))
train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
else:
train_op = None
learning_rate = None
replica_id = tf.distribute.get_replica_context().replica_id_in_sync_group
if not isinstance(replica_id, tf.Tensor) or tf.get_static_value(replica_id) == 0:
register_metric(name="L2 loss", tensor=l2_regularization_loss, aggregator=StandardMeter())
register_metric(name="Mask loss", tensor=mask_loss, aggregator=StandardMeter())
register_metric(name="Total loss", tensor=total_loss, aggregator=StandardMeter())
register_metric(name="RPN box loss", tensor=rpn_box_loss, aggregator=StandardMeter())
register_metric(name="RPN score loss", tensor=rpn_score_loss, aggregator=StandardMeter())
register_metric(name="RPN total loss", tensor=total_rpn_loss, aggregator=StandardMeter())
register_metric(name="FastRCNN class loss", tensor=fast_rcnn_class_loss, aggregator=StandardMeter())
register_metric(name="FastRCNN box loss", tensor=fast_rcnn_box_loss, aggregator=StandardMeter())
register_metric(name="FastRCNN total loss", tensor=total_fast_rcnn_loss, aggregator=StandardMeter())
register_metric(name="Learning rate", tensor=learning_rate, aggregator=StandardMeter())
return tf.estimator.EstimatorSpec(
mode=mode,
loss=total_loss,
train_op=train_op,
)
def mask_rcnn_model_fn(features, labels, mode, params):
"""Mask-RCNN model."""
return _model_fn(
features,
labels,
mode,
params
)
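# Example usage (sketch; `run_config`, `params`, and `train_input_fn` are
# assumed to be built elsewhere, e.g. by the executer and the dataloader):
#
#     estimator = tf.estimator.Estimator(
#         model_fn=mask_rcnn_model_fn,
#         model_dir='/tmp/mask_rcnn',
#         config=run_config,
#         params=params
#     )
#     estimator.train(input_fn=train_input_fn, max_steps=params['total_steps'])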
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Feature Pyramid Network.
Feature Pyramid Networks were proposed in:
[1] Tsung-Yi Lin, Piotr Dollar, Ross Girshick, Kaiming He, Bharath Hariharan,
    and Serge Belongie. Feature Pyramid Networks for Object Detection. CVPR 2017.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from mask_rcnn.ops import spatial_transform_ops
class FPNNetwork(tf.keras.models.Model):
def __init__(self, min_level=3, max_level=7, filters=256, trainable=True):
"""Generates multiple scale feature pyramid (FPN).
Args:
feats_bottom_up: a dictionary of tensor with level as keys and bottom up
feature tensors as values. They are the features to generate FPN features.
min_level: the minimum level number to generate FPN features.
max_level: the maximum level number to generate FPN features.
filters: the FPN filter size.
Returns:
feats: a dictionary of tensor with level as keys and the generated FPN
features as values.
"""
super(FPNNetwork, self).__init__(name="fpn", trainable=trainable)
self._local_layers = dict()
self._min_level = min_level
self._max_level = max_level
self._filters = filters
self._backbone_max_level = 5 # max(feats_bottom_up.keys())
self._upsample_max_level = (
self._backbone_max_level if self._max_level > self._backbone_max_level else self._max_level
)
self._local_layers["stage1"] = dict()
for level in range(self._min_level, self._upsample_max_level + 1):
self._local_layers["stage1"][level] = tf.keras.layers.Conv2D(
filters=self._filters,
kernel_size=(1, 1),
padding='same',
name='l%d' % level,
trainable=trainable
)
self._local_layers["stage2"] = dict()
# add post-hoc 3x3 convolution kernel
for level in range(self._min_level, self._upsample_max_level + 1):
self._local_layers["stage2"][level] = tf.keras.layers.Conv2D(
filters=self._filters,
strides=(1, 1),
kernel_size=(3, 3),
padding='same',
name='post_hoc_d%d' % level,
trainable=trainable
)
self._local_layers["stage3_1"] = dict()
self._local_layers["stage3_2"] = dict()
if self._max_level == self._upsample_max_level + 1:
self._local_layers["stage3_1"] = tf.keras.layers.MaxPool2D(
pool_size=1,
strides=2,
padding='valid',
name='p%d' % self._max_level,
trainable=trainable
)
else:
for level in range(self._upsample_max_level + 1, self._max_level + 1):
self._local_layers["stage3_2"][level] = tf.keras.layers.Conv2D(
filters=self._filters,
strides=(2, 2),
kernel_size=(3, 3),
padding='same',
name='p%d' % level,
trainable=trainable
)
def call(self, inputs, *args, **kwargs):
feats_bottom_up = inputs
# lateral connections
feats_lateral = {}
for level in range(self._min_level, self._upsample_max_level + 1):
feats_lateral[level] = self._local_layers["stage1"][level](feats_bottom_up[level])
# add top-down path
feats = {self._upsample_max_level: feats_lateral[self._upsample_max_level]}
for level in range(self._upsample_max_level - 1, self._min_level - 1, -1):
feats[level] = spatial_transform_ops.nearest_upsampling(
feats[level + 1], 2
) + feats_lateral[level]
# add post-hoc 3x3 convolution kernel
for level in range(self._min_level, self._upsample_max_level + 1):
feats[level] = self._local_layers["stage2"][level](feats[level])
if self._max_level == self._upsample_max_level + 1:
feats[self._max_level] = self._local_layers["stage3_1"](feats[self._max_level - 1])
else:
for level in range(self._upsample_max_level + 1, self._max_level + 1):
feats[level] = self._local_layers["stage3_2"][level](feats[level - 1])
return feats
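# Example usage (sketch): feed a dict of bottom-up features keyed by level,
# with the usual 2x downsampling between consecutive levels (shapes are
# illustrative for a 256x256 input).
#
#     fpn = FPNNetwork(min_level=2, max_level=6, filters=256)
#     feats_bottom_up = {
#         level: tf.random.normal([1, 256 // 2 ** level, 256 // 2 ** level, 256])
#         for level in range(2, 6)
#     }
#     fpn_feats = fpn(feats_bottom_up)  # dict with FPN levels 2..6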
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions to build various prediction heads in Mask-RCNN."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
__all__ = ["RPN_Head_Model", "Box_Head_Model", "Mask_Head_Model"]
class RPN_Head_Model(tf.keras.models.Model):
def __init__(self, name, num_anchors, trainable, *args, **kwargs):
        """Shared RPN heads."""
        super(RPN_Head_Model, self).__init__(name=name, trainable=trainable, *args, **kwargs)
self._local_layers = dict()
# TODO(chiachenc): check the channel depth of the first convolution.
self._local_layers["conv1"] = tf.keras.layers.Conv2D(
256,
kernel_size=(3, 3),
strides=(1, 1),
activation=tf.nn.relu,
bias_initializer=tf.keras.initializers.Zeros(),
kernel_initializer=tf.random_normal_initializer(stddev=0.01),
padding='same',
trainable=trainable,
name='rpn'
)
# Proposal classification scores
# scores = tf.keras.layers.Conv2D(
self._local_layers["conv2"] = tf.keras.layers.Conv2D(
num_anchors,
kernel_size=(1, 1),
strides=(1, 1),
bias_initializer=tf.keras.initializers.Zeros(),
kernel_initializer=tf.random_normal_initializer(stddev=0.01),
padding='valid',
trainable=trainable,
name='rpn-class'
)
# Proposal bbox regression deltas
# bboxes = tf.keras.layers.Conv2D(
self._local_layers["conv3"] = tf.keras.layers.Conv2D(
4 * num_anchors,
kernel_size=(1, 1),
strides=(1, 1),
bias_initializer=tf.keras.initializers.Zeros(),
kernel_initializer=tf.random_normal_initializer(stddev=0.01),
padding='valid',
trainable=trainable,
name='rpn-box'
)
def call(self, inputs, *args, **kwargs):
net = self._local_layers["conv1"](inputs)
scores = self._local_layers["conv2"](net)
bboxes = self._local_layers["conv3"](net)
return scores, bboxes
class Box_Head_Model(tf.keras.Model):
def __init__(self, num_classes=91, mlp_head_dim=1024, name="box_head", trainable=True, *args, **kwargs):
"""Box and class branches for the Mask-RCNN model.
Args:
roi_features: A ROI feature tensor of shape
[batch_size, num_rois, height_l, width_l, num_filters].
num_classes: a integer for the number of classes.
mlp_head_dim: a integer that is the hidden dimension in the fully-connected
layers.
"""
super(Box_Head_Model, self).__init__(name=name, trainable=trainable, *args, **kwargs)
self._num_classes = num_classes
self._mlp_head_dim = mlp_head_dim
self._dense_fc6 = tf.keras.layers.Dense(
units=mlp_head_dim,
activation=tf.nn.relu,
trainable=trainable,
name='fc6'
)
self._dense_fc7 = tf.keras.layers.Dense(
units=mlp_head_dim,
activation=tf.nn.relu,
trainable=trainable,
name='fc7'
)
self._dense_class = tf.keras.layers.Dense(
num_classes,
kernel_initializer=tf.random_normal_initializer(stddev=0.01),
bias_initializer=tf.keras.initializers.Zeros(),
trainable=trainable,
name='class-predict'
)
self._dense_box = tf.keras.layers.Dense(
num_classes * 4,
kernel_initializer=tf.random_normal_initializer(stddev=0.001),
bias_initializer=tf.keras.initializers.Zeros(),
trainable=trainable,
name='box-predict'
)
def call(self, inputs, **kwargs):
"""
Returns:
class_outputs: a tensor with a shape of
[batch_size, num_rois, num_classes], representing the class predictions.
box_outputs: a tensor with a shape of
[batch_size, num_rois, num_classes * 4], representing the box predictions.
box_features: a tensor with a shape of
[batch_size, num_rois, mlp_head_dim], representing the box features.
"""
# reshape inputs before FC.
batch_size, num_rois, height, width, filters = inputs.get_shape().as_list()
net = tf.reshape(inputs, [batch_size, num_rois, height * width * filters])
net = self._dense_fc6(net)
box_features = self._dense_fc7(net)
class_outputs = self._dense_class(box_features)
box_outputs = self._dense_box(box_features)
return class_outputs, box_outputs, box_features
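# Usage sketch (illustrative only): 7x7x256 ROI-aligned features per box are
# the usual Mask-RCNN setting; the batch size and ROI count below are
# assumptions chosen for the example.
def _box_head_usage_sketch():
    box_head = Box_Head_Model(num_classes=91, mlp_head_dim=1024)
    class_outputs, box_outputs, box_features = box_head(tf.zeros([2, 512, 7, 7, 256]))
    # class_outputs: [2, 512, 91]; box_outputs: [2, 512, 364];
    # box_features: [2, 512, 1024].
    return class_outputs, box_outputs, box_features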
class Mask_Head_Model(tf.keras.Model):
@staticmethod
def _get_stddev_equivalent_to_msra_fill(kernel_size, fan_out):
"""Returns the stddev of random normal initialization as MSRAFill."""
# Reference: https://github.com/pytorch/pytorch/blob/master/caffe2/operators/filler_op.h#L445-L463
# For example, kernel size is (3, 3) and fan out is 256, stddev is 0.029.
# stddev = (2/(3*3*256))^0.5 = 0.029
return (2 / (kernel_size[0] * kernel_size[1] * fan_out)) ** 0.5
def __init__(
self,
class_indices,
num_classes=91,
mrcnn_resolution=28,
is_gpu_inference=False,
name="mask_head",
trainable=True,
*args,
**kwargs
):
"""Mask branch for the Mask-RCNN model.
Args:
roi_features: A ROI feature tensor of shape
[batch_size, num_rois, height_l, width_l, num_filters].
class_indices: a Tensor of shape [batch_size, num_rois], indicating
which class the ROI is.
num_classes: an integer for the number of classes.
mrcnn_resolution: an integer that is the resolution of masks.
is_gpu_inference: whether to build the model for GPU inference.
"""
super(Mask_Head_Model, self).__init__(name=name, trainable=trainable, *args, **kwargs)
self._class_indices = class_indices
self._num_classes = num_classes
self._mrcnn_resolution = mrcnn_resolution
self._is_gpu_inference = is_gpu_inference
self._conv_stage1 = list()
kernel_size = (3, 3)
fan_out = 256
init_stddev = Mask_Head_Model._get_stddev_equivalent_to_msra_fill(kernel_size, fan_out)
for conv_id in range(4):
self._conv_stage1.append(tf.keras.layers.Conv2D(
fan_out,
kernel_size=kernel_size,
strides=(1, 1),
padding='same',
dilation_rate=(1, 1),
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(stddev=init_stddev),
bias_initializer=tf.keras.initializers.Zeros(),
trainable=trainable,
name='mask-conv-l%d' % conv_id
))
kernel_size = (2, 2)
fan_out = 256
init_stddev = Mask_Head_Model._get_stddev_equivalent_to_msra_fill(kernel_size, fan_out)
self._conv_stage2 = tf.keras.layers.Conv2DTranspose(
fan_out,
kernel_size=kernel_size,
strides=(2, 2),
padding='valid',
activation=tf.nn.relu,
kernel_initializer=tf.random_normal_initializer(stddev=init_stddev),
bias_initializer=tf.keras.initializers.Zeros(),
trainable=trainable,
name='conv5-mask'
)
kernel_size = (1, 1)
fan_out = self._num_classes
init_stddev = Mask_Head_Model._get_stddev_equivalent_to_msra_fill(kernel_size, fan_out)
self._conv_stage3 = tf.keras.layers.Conv2D(
fan_out,
kernel_size=kernel_size,
strides=(1, 1),
padding='valid',
kernel_initializer=tf.random_normal_initializer(stddev=init_stddev),
bias_initializer=tf.keras.initializers.Zeros(),
trainable=trainable,
name='mask_fcn_logits'
)
def call(self, inputs, **kwargs):
"""
Returns:
mask_outputs: a tensor with a shape of
[batch_size, num_masks, mask_height, mask_width],
representing the mask predictions.
fg_gather_indices: a tensor with a shape of [batch_size, num_masks, 2],
representing the fg mask targets.
Raises:
ValueError: If boxes is not a rank-3 tensor or the last dimension of
boxes is not 4.
"""
batch_size, num_rois, height, width, filters = inputs.get_shape().as_list()
net = tf.reshape(inputs, [-1, height, width, filters])
for conv_id in range(4):
net = self._conv_stage1[conv_id](net)
net = self._conv_stage2(net)
mask_outputs = self._conv_stage3(net)
mask_outputs = tf.reshape(
mask_outputs,
[-1, num_rois, self._mrcnn_resolution, self._mrcnn_resolution, self._num_classes]
)
with tf.name_scope('masks_post_processing'):
mask_outputs = tf.transpose(a=mask_outputs, perm=[0, 1, 4, 2, 3])
indices_dtype = tf.float32 if self._is_gpu_inference else tf.int32
if batch_size == 1:
indices = tf.reshape(
tf.reshape(
tf.range(num_rois, dtype=indices_dtype),
[batch_size, num_rois, 1]
) * self._num_classes + tf.expand_dims(self._class_indices, axis=-1),
[batch_size, -1]
)
indices = tf.cast(indices, tf.int32)
mask_outputs = tf.gather(
tf.reshape(mask_outputs, [batch_size, -1, self._mrcnn_resolution, self._mrcnn_resolution]),
indices,
axis=1
)
mask_outputs = tf.squeeze(mask_outputs, axis=1)
mask_outputs = tf.reshape(
mask_outputs,
[batch_size, num_rois, self._mrcnn_resolution, self._mrcnn_resolution])
else:
batch_indices = (
tf.expand_dims(tf.range(batch_size, dtype=indices_dtype), axis=1) *
tf.ones([1, num_rois], dtype=indices_dtype)
)
mask_indices = (
tf.expand_dims(tf.range(num_rois, dtype=indices_dtype), axis=0) *
tf.ones([batch_size, 1], dtype=indices_dtype)
)
gather_indices = tf.stack([batch_indices, mask_indices, self._class_indices], axis=2)
if self._is_gpu_inference:
gather_indices = tf.cast(gather_indices, dtype=tf.int32)
mask_outputs = tf.gather_nd(mask_outputs, gather_indices)
return mask_outputs
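# Usage sketch (illustrative only): the shapes below are assumptions, with
# 8 ROIs per image and 14x14x256 ROI-aligned features, the usual mask-branch
# input. `class_indices` selects which class channel is kept for each ROI.
def _mask_head_usage_sketch():
    class_indices = tf.zeros([2, 8], dtype=tf.int32)
    mask_head = Mask_Head_Model(class_indices=class_indices, num_classes=91, mrcnn_resolution=28)
    masks = mask_head(tf.zeros([2, 8, 14, 14, 256]))
    return masks  # [2, 8, 28, 28]: one 28x28 mask logit map per ROI.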
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import tensorflow as tf
__all__ = ["KerasMockLayer"]
class KerasMockLayer(tf.Module):
"""
    This class reproduces the important Keras Layer APIs without enforcing a variable scope.
"""
def __init__(self, trainable=True, *args, **kwargs):
super(KerasMockLayer, self).__init__(*args, **kwargs)
self._local_layers = dict()
self._trainable = trainable
@property
def trainable(self):
return self._trainable
@trainable.setter
def trainable(self, value):
self._trainable = value
        # Propagate to every sub-layer registered in `_local_layers`,
        # including those nested one dict level deep.
        for layer in self._local_layers.values():
            if isinstance(layer, dict):
                for sublayer in layer.values():
                    sublayer.trainable = value
            else:
                layer.trainable = value
@property
def variables(self):
"""Returns the list of all layer variables/weights.
Alias of `self.weights`.
Returns:
A list of variables.
"""
return self.weights
@property
def trainable_variables(self):
return self.trainable_weights
@property
def non_trainable_variables(self):
return self.non_trainable_weights
@property
def weights(self):
"""Returns the list of all layer variables/weights.
Returns:
A list of variables.
"""
return self.trainable_weights + self.non_trainable_weights
@property
def name(self):
return self._name
@property
def trainable_weights(self):
layers = list()
for layer in self._local_layers.values():
if not isinstance(layer, dict):
layers.append(layer)
else:
for sublayer in layer.values():
layers.append(sublayer)
return list(itertools.chain.from_iterable([layer.trainable_variables for layer in layers]))
@property
def non_trainable_weights(self):
layers = list()
for layer in self._local_layers.values():
if not isinstance(layer, dict):
layers.append(layer)
else:
for sublayer in layer.values():
layers.append(sublayer)
return list(itertools.chain.from_iterable([layer.non_trainable_weights for layer in layers]))
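# Usage sketch (illustrative only): sub-layers must be registered in
# `self._local_layers`, either directly or one dict level deep, for the
# weight-aggregation properties above to find them.
def _keras_mock_layer_sketch():
    layer = KerasMockLayer(trainable=True)
    layer._local_layers["conv"] = tf.keras.layers.Conv2D(8, 3)
    layer._local_layers["nested"] = {"dense": tf.keras.layers.Dense(4)}
    # Variables only appear in layer.weights once the sub-layers are built,
    # e.g. after calling them on an input tensor.
    _ = layer._local_layers["conv"](tf.zeros([1, 8, 8, 3]))
    _ = layer._local_layers["nested"]["dense"](tf.zeros([1, 4]))
    return layer.weights  # 4 variables: conv kernel/bias + dense kernel/bias.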
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Resnet."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from mask_rcnn.models.keras_utils import KerasMockLayer
_BATCH_NORM_DECAY = 0.997
_BATCH_NORM_EPSILON = 1e-4
class BNReLULayer(KerasMockLayer):
def __init__(self, trainable, relu=True, init_zero=False, data_format='channels_last'):
"""Performs a batch normalization followed by a ReLU.
Args:
inputs: `Tensor` of shape `[batch, channels, ...]`.
trainable: `bool` for whether to finetune the batchnorm layer.
relu: `bool` if False, omits the ReLU operation.
init_zero: `bool` if True, initializes scale parameter of batch
normalization with 0 instead of 1 (default).
data_format: `str` either "channels_first" for `[batch, channels, height,
width]` or "channels_last for `[batch, height, width, channels]`.
name: the name of the batch normalization layer
Returns:
A normalized `Tensor` with the same `data_format`.
"""
super(BNReLULayer, self).__init__(trainable=trainable)
if init_zero:
gamma_initializer = tf.keras.initializers.Zeros()
else:
gamma_initializer = tf.keras.initializers.Ones()
if data_format == 'channels_first':
axis = 1
else:
axis = 3
self._local_layers = dict()
self._local_layers["batchnorm"] = tf.keras.layers.BatchNormalization(
axis=axis,
momentum=_BATCH_NORM_DECAY,
epsilon=_BATCH_NORM_EPSILON,
center=True,
scale=True,
trainable=self._trainable,
gamma_initializer=gamma_initializer,
fused=True,
name="batch_normalization"
)
if relu:
self._local_layers["relu"] = tf.keras.layers.ReLU()
def __call__(self, inputs, training=False, *args, **kwargs):
net = self._local_layers["batchnorm"](inputs, training=training and self._trainable)
try:
return self._local_layers["relu"](net)
except KeyError:
return net
class FixedPaddingLayer(KerasMockLayer):
def __init__(self, kernel_size, data_format='channels_last', trainable=True):
"""Pads the input along the spatial dimensions independently of input size.
Args:
kernel_size: `int` kernel size to be used for `conv2d` or max_pool2d`
operations. Should be a positive integer.
          data_format: `str` either "channels_first" for `[batch, channels, height,
            width]` or "channels_last" for `[batch, height, width, channels]`.
"""
super(FixedPaddingLayer, self).__init__(trainable=trainable)
pad_total = kernel_size - 1
pad_beg = pad_total // 2
pad_end = pad_total - pad_beg
if data_format == 'channels_first':
self._paddings = [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]]
else:
self._paddings = [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]
def __call__(self, inputs, *args, **kwargs):
"""
Args:
inputs: `Tensor` of size `[batch, channels, height, width]` or
`[batch, height, width, channels]` depending on `data_format`.
Returns:
A padded `Tensor` of the same `data_format` with size either intact
(if `kernel_size == 1`) or padded (if `kernel_size > 1`).
"""
return tf.pad(tensor=inputs, paddings=self._paddings)
class Conv2dFixedPadding(KerasMockLayer):
def __init__(self, filters, kernel_size, strides, data_format='channels_last', trainable=False):
"""Strided 2-D convolution with explicit padding.
The padding is consistent and is based only on `kernel_size`, not on the
dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
Args:
inputs: `Tensor` of size `[batch, channels, height_in, width_in]`.
filters: `int` number of filters in the convolution.
kernel_size: `int` size of the kernel to be used in the convolution.
strides: `int` strides of the convolution.
data_format: `str` either "channels_first" for `[batch, channels, height,
width]` or "channels_last for `[batch, height, width, channels]`.
Returns:
A `Tensor` of shape `[batch, filters, height_out, width_out]`.
"""
super(Conv2dFixedPadding, self).__init__(trainable=trainable)
if strides > 1:
self._local_layers["fixed_padding"] = FixedPaddingLayer(kernel_size=kernel_size, data_format=data_format)
self._local_layers["conv2d"] = tf.keras.layers.Conv2D(
filters=filters,
kernel_size=kernel_size,
strides=strides,
padding=('SAME' if strides == 1 else 'VALID'),
use_bias=False,
kernel_initializer=tf.keras.initializers.VarianceScaling(),
data_format=data_format,
trainable=self._trainable,
name="conv2d"
)
def __call__(self, inputs, *args, **kwargs):
try:
net = self._local_layers["fixed_padding"](inputs)
except KeyError:
net = inputs
return self._local_layers["conv2d"](net)
class ResidualBlock(KerasMockLayer):
def __init__(self, filters, trainable, finetune_bn, strides, use_projection=False, data_format='channels_last'):
"""Standard building block for residual networks with BN after convolutions.
Args:
filters: `int` number of filters for the first two convolutions. Note that
the third and final convolution will use 4 times as many filters.
finetune_bn: `bool` for whether the model is in training.
strides: `int` block stride. If greater than 1, this block will ultimately downsample the input.
use_projection: `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
data_format: `str` either "channels_first" for `[batch, channels, height, width]`
or "channels_last for `[batch, height, width, channels]`.
"""
super(ResidualBlock, self).__init__(trainable=trainable)
self._finetune_bn = finetune_bn
if use_projection:
self._local_layers["projection"] = dict()
self._local_layers["projection"]["conv2d"] = Conv2dFixedPadding(
filters=filters,
kernel_size=1,
strides=strides,
data_format=data_format,
trainable=trainable
)
self._local_layers["projection"]["batchnorm"] = BNReLULayer(
trainable=finetune_bn and trainable,
relu=False,
init_zero=False,
data_format=data_format,
)
self._local_layers["conv2d_1"] = Conv2dFixedPadding(
trainable=trainable,
filters=filters,
kernel_size=3,
strides=strides,
data_format=data_format,
)
self._local_layers["conv2d_2"] = Conv2dFixedPadding(
trainable=trainable,
filters=filters,
kernel_size=3,
strides=1,
data_format=data_format,
)
self._local_layers["batchnorm_1"] = BNReLULayer(
trainable=finetune_bn and trainable,
relu=True,
init_zero=False,
data_format=data_format,
)
self._local_layers["batchnorm_2"] = BNReLULayer(
trainable=finetune_bn and trainable,
relu=False,
init_zero=True,
data_format=data_format,
)
self._local_layers["activation"] = tf.keras.layers.ReLU()
def __call__(self, inputs, training=False):
"""
Args:
inputs: `Tensor` of size `[batch, channels, height, width]`.
Returns:
The output `Tensor` of the block.
"""
try:
# Projection shortcut in first layer to match filters and strides
shortcut = self._local_layers["projection"]["conv2d"](inputs=inputs)
shortcut = self._local_layers["projection"]["batchnorm"](
inputs=shortcut,
training=training and self._trainable and self._finetune_bn
)
except KeyError:
shortcut = inputs
net = inputs
for i in range(1, 3):
net = self._local_layers["conv2d_%d" % i](inputs=net)
net = self._local_layers["batchnorm_%d" % i](
inputs=net,
training=training and self._trainable and self._finetune_bn
)
return self._local_layers["activation"](net + shortcut)
class BottleneckBlock(KerasMockLayer):
def __init__(self, filters, trainable, finetune_bn, strides, use_projection=False, data_format='channels_last'):
"""Bottleneck block variant for residual networks with BN after convolutions.
Args:
filters: `int` number of filters for the first two convolutions. Note that
the third and final convolution will use 4 times as many filters.
finetune_bn: `bool` for whether the model is in training.
strides: `int` block stride. If greater than 1, this block will ultimately downsample the input.
use_projection: `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
data_format: `str` either "channels_first" for `[batch, channels, height, width]`
or "channels_last for `[batch, height, width, channels]`.
"""
super(BottleneckBlock, self).__init__(trainable=trainable)
self._finetune_bn = finetune_bn
if use_projection:
# Projection shortcut only in first block within a group. Bottleneck blocks
# end with 4 times the number of filters.
filters_out = 4 * filters
self._local_layers["projection"] = dict()
self._local_layers["projection"]["conv2d"] = Conv2dFixedPadding(
filters=filters_out,
kernel_size=1,
strides=strides,
data_format=data_format,
trainable=trainable
)
self._local_layers["projection"]["batchnorm"] = BNReLULayer(
trainable=finetune_bn and trainable,
relu=False,
init_zero=False,
data_format=data_format,
)
self._local_layers["conv2d_1"] = Conv2dFixedPadding(
filters=filters,
kernel_size=1,
strides=1,
data_format=data_format,
trainable=trainable
)
self._local_layers["conv2d_2"] = Conv2dFixedPadding(
filters=filters,
kernel_size=3,
strides=strides,
data_format=data_format,
trainable=trainable
)
self._local_layers["conv2d_3"] = Conv2dFixedPadding(
filters=4 * filters,
kernel_size=1,
strides=1,
data_format=data_format,
trainable=trainable
)
self._local_layers["batchnorm_1"] = BNReLULayer(
trainable=finetune_bn and trainable,
relu=True,
init_zero=False,
data_format=data_format,
)
self._local_layers["batchnorm_2"] = BNReLULayer(
trainable=finetune_bn and trainable,
relu=True,
init_zero=False,
data_format=data_format,
)
self._local_layers["batchnorm_3"] = BNReLULayer(
trainable=finetune_bn and trainable,
relu=False,
init_zero=True,
data_format=data_format,
)
self._local_layers["activation"] = tf.keras.layers.ReLU()
def __call__(self, inputs, training=False):
"""
Args:
inputs: `Tensor` of size `[batch, channels, height, width]`.
Returns:
The output `Tensor` of the block.
"""
try:
# Projection shortcut in first layer to match filters and strides
shortcut = self._local_layers["projection"]["conv2d"](inputs=inputs)
shortcut = self._local_layers["projection"]["batchnorm"](
inputs=shortcut,
training=training and self._trainable and self._finetune_bn
)
except KeyError:
shortcut = inputs
net = inputs
for i in range(1, 4):
net = self._local_layers["conv2d_%d" % i](inputs=net)
net = self._local_layers["batchnorm_%d" % i](
inputs=net,
training=training and self._trainable and self._finetune_bn
)
return self._local_layers["activation"](net + shortcut)
class BlockGroup(KerasMockLayer):
def __init__(self, filters, block_layer, n_blocks, strides, trainable, finetune_bn, data_format='channels_last'):
"""Creates one group of blocks for the ResNet model.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]`.
filters: `int` number of filters for the first convolution of the layer.
block_layer: `layer` for the block to use within the model
n_blocks: `int` number of blocks contained in the layer.
strides: `int` stride to use for the first convolution of the layer. If
greater than 1, this layer will downsample the input.
finetune_bn: `bool` for whether the model is training.
name: `str`name for the Tensor output of the block layer.
data_format: `str` either "channels_first" for `[batch, channels, height,
width]` or "channels_last for `[batch, height, width, channels]`.
Returns:
The output `Tensor` of the block layer.
"""
super(BlockGroup, self).__init__(trainable=trainable)
self._finetune_bn = finetune_bn
self._n_blocks = n_blocks
for block_id in range(self._n_blocks):
# Only the first block per block_group uses projection shortcut and strides.
self._local_layers["block_%d" % (block_id + 1)] = block_layer(
filters=filters,
finetune_bn=finetune_bn,
trainable=trainable,
strides=strides if block_id == 0 else 1,
use_projection=block_id == 0,
data_format=data_format
)
def __call__(self, inputs, training=False):
net = inputs
for block_id in range(self._n_blocks):
net = self._local_layers["block_%d" % (block_id + 1)](net, training=training and self._trainable)
return net
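# Behavior sketch (illustrative only): only the first block of a group
# downsamples and projects. A conv3_x-style group built as below gets stride 2
# and a projection shortcut in "block_1", and stride-1 identity shortcuts in
# "block_2".."block_4", so H and W are halved exactly once per group.
def _block_group_sketch():
    group = BlockGroup(filters=128, block_layer=BottleneckBlock, n_blocks=4,
                       strides=2, trainable=True, finetune_bn=False)
    return group(tf.zeros([1, 256, 256, 256]), training=False)  # -> [1, 128, 128, 512]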
class Resnet_Model(KerasMockLayer, tf.keras.models.Model):
def __init__(self, resnet_model, data_format='channels_last', trainable=True, finetune_bn=False, *args, **kwargs):
"""
Our actual ResNet network. We return the output of c2, c3,c4,c5
N.B. batch norm is always run with trained parameters, as we use very small
batches when training the object layers.
Args:
resnet_model: model type. Authorized Values: (resnet18, resnet34, resnet50, resnet101, resnet152, resnet200)
data_format: `str` either "channels_first" for
`[batch, channels, height, width]` or "channels_last for `[batch, height, width, channels]`.
finetune_bn: `bool` for whether the model is training.
Returns the ResNet model for a given size and number of output classes.
"""
model_params = {
'resnet18': {'block': ResidualBlock, 'layers': [2, 2, 2, 2]},
'resnet34': {'block': ResidualBlock, 'layers': [3, 4, 6, 3]},
'resnet50': {'block': BottleneckBlock, 'layers': [3, 4, 6, 3]},
'resnet101': {'block': BottleneckBlock, 'layers': [3, 4, 23, 3]},
'resnet152': {'block': BottleneckBlock, 'layers': [3, 8, 36, 3]},
'resnet200': {'block': BottleneckBlock, 'layers': [3, 24, 36, 3]}
}
if resnet_model not in model_params:
raise ValueError('Not a valid resnet_model: %s' % resnet_model)
super(Resnet_Model, self).__init__(trainable=trainable, name=resnet_model, *args, **kwargs)
self._finetune_bn = finetune_bn
self._data_format = data_format
self._block_layer = model_params[resnet_model]['block']
self._n_layers = model_params[resnet_model]['layers']
self._local_layers["conv2d"] = Conv2dFixedPadding(
filters=64,
kernel_size=7,
strides=2,
data_format=self._data_format,
# Freeze at conv2d and batchnorm first 11 layers based on reference model.
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194
trainable=False
)
self._local_layers["batchnorm"] = BNReLULayer(
relu=True,
init_zero=False,
data_format=self._data_format,
# Freeze at conv2d and batchnorm first 11 layers based on reference model.
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194
trainable=False
)
self._local_layers["maxpool2d"] = tf.keras.layers.MaxPool2D(
pool_size=3,
strides=2,
padding='SAME',
data_format=self._data_format
)
self._local_layers["block_1"] = BlockGroup(
filters=64,
strides=1,
n_blocks=self._n_layers[0],
block_layer=self._block_layer,
data_format=self._data_format,
# Freeze at conv2d and batchnorm first 11 layers based on reference model.
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194
trainable=False,
finetune_bn=False
)
self._local_layers["block_2"] = BlockGroup(
filters=128,
strides=2,
n_blocks=self._n_layers[1],
block_layer=self._block_layer,
data_format=self._data_format,
# Freeze at conv2d and batchnorm first 11 layers based on reference model.
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194
trainable=self._trainable,
finetune_bn=self._finetune_bn
)
self._local_layers["block_3"] = BlockGroup(
filters=256,
strides=2,
n_blocks=self._n_layers[2],
block_layer=self._block_layer,
data_format=self._data_format,
# Freeze at conv2d and batchnorm first 11 layers based on reference model.
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194
trainable=self._trainable,
finetune_bn=self._finetune_bn
)
self._local_layers["block_4"] = BlockGroup(
filters=512,
strides=2,
n_blocks=self._n_layers[3],
block_layer=self._block_layer,
data_format=self._data_format,
# Freeze at conv2d and batchnorm first 11 layers based on reference model.
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194
trainable=self._trainable,
finetune_bn=self._finetune_bn
)
def call(self, inputs, training=True, *args, **kwargs):
"""Creation of the model graph."""
net = self._local_layers["conv2d"](inputs=inputs)
net = self._local_layers["batchnorm"](
inputs=net,
training=False
)
net = self._local_layers["maxpool2d"](net)
c2 = self._local_layers["block_1"](
inputs=net,
training=False,
)
c3 = self._local_layers["block_2"](
inputs=c2,
training=training,
)
c4 = self._local_layers["block_3"](
inputs=c3,
training=training,
)
c5 = self._local_layers["block_4"](
inputs=c4,
training=training,
)
return {2: c2, 3: c3, 4: c4, 5: c5}
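# Usage sketch (illustrative only; the 1024x1024 input is an assumption): the
# backbone returns a dict keyed by feature level, with strides 4/8/16/32
# relative to the input image.
def _resnet_backbone_sketch():
    backbone = Resnet_Model("resnet50", trainable=True, finetune_bn=False)
    feats = backbone(tf.zeros([1, 1024, 1024, 3]), training=True)
    # feats[2]: [1, 256, 256, 256], feats[3]: [1, 128, 128, 512],
    # feats[4]: [1, 64, 64, 1024],  feats[5]: [1, 32, 32, 2048]
    return feats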