Commit d0d91e12 authored by huchen

Merge branch 'tf2' into 'main'

tf2 detection

See merge request dcutoolkit/deeplearing/dlexamples_new!2
parents 2795dc1f c320b6ef
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions to perform COCO evaluation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import operator
import pprint
import six
import time
import io
from PIL import Image
import numpy as np
import tensorflow as tf
from mask_rcnn.utils.logging_formatter import logging
from mask_rcnn import coco_metric
from mask_rcnn.utils import coco_utils
from mask_rcnn.object_detection import visualization_utils
import dllogger
from dllogger import Verbosity
def process_prediction_for_eval(prediction):
"""Process the model prediction for COCO eval."""
image_info = prediction['image_info']
box_coordinates = prediction['detection_boxes']
processed_box_coordinates = np.zeros_like(box_coordinates)
for image_id in range(box_coordinates.shape[0]):
scale = image_info[image_id][2]
for box_id in range(box_coordinates.shape[1]):
# Map [y1, x1, y2, x2] -> [x1, y1, w, h] and multiply detections
# by image scale.
y1, x1, y2, x2 = box_coordinates[image_id, box_id, :]
new_box = scale * np.array([x1, y1, x2 - x1, y2 - y1])
processed_box_coordinates[image_id, box_id, :] = new_box
prediction['detection_boxes'] = processed_box_coordinates
return prediction
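# Worked example (hypothetical values): for a detection box
# [y1, x1, y2, x2] = [10., 20., 50., 80.] and an image scale of 2.0,
# the mapping above produces scale * [x1, y1, x2 - x1, y2 - y1]
# = 2.0 * [20., 10., 60., 40.] = [40., 20., 120., 80.],
# i.e. the [x, y, width, height] layout expected by the COCO evaluation tools.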
def compute_coco_eval_metric(predictor,
num_batches=-1,
include_mask=True,
annotation_json_file="",
eval_batch_size=-1,
report_frequency=None):
"""Compute COCO eval metric given a prediction generator.
Args:
predictor: a generator that iteratively pops a dictionary of predictions
with the format compatible with COCO eval tool.
num_batches: the number of batches to be aggregated in eval. This is how
many times that the predictor gets pulled.
include_mask: a boolean that indicates whether we include the mask eval.
annotation_json_file: the annotation json file of the eval dataset.
Returns:
eval_results: the aggregated COCO metric eval results.
"""
if annotation_json_file == "":
annotation_json_file = None
use_groundtruth_from_json = (annotation_json_file is not None)
predictions = dict()
batch_idx = 0
if use_groundtruth_from_json:
eval_metric = coco_metric.EvaluationMetric(annotation_json_file, include_mask=include_mask)
else:
eval_metric = coco_metric.EvaluationMetric(filename=None, include_mask=include_mask)
def evaluation_preds(preds):
# Essential to avoid modifying the source dict
_preds = copy.deepcopy(preds)
for k, v in six.iteritems(_preds):
_preds[k] = np.concatenate(_preds[k], axis=0)
if 'orig_images' in _preds and _preds['orig_images'].shape[0] > 10:
# Only samples a few images for visualization.
_preds['orig_images'] = _preds['orig_images'][:10]
if use_groundtruth_from_json:
eval_results = eval_metric.predict_metric_fn(_preds)
else:
images, annotations = coco_utils.extract_coco_groundtruth(_preds, include_mask)
coco_dataset = coco_utils.create_coco_format_dataset(images, annotations)
eval_results = eval_metric.predict_metric_fn(_preds, groundtruth_data=coco_dataset)
return eval_results
# Take into account cuDNN & Tensorflow warmup
# Drop N first steps for avg throughput calculation
BURNIN_STEPS = 100
model_throughput_list = list()
inference_time_list = list()
while num_batches < 0 or batch_idx < num_batches:
try:
step_t0 = time.time()
step_predictions = six.next(predictor)
batch_time = time.time() - step_t0
throughput = eval_batch_size / batch_time
model_throughput_list.append(throughput)
inference_time_list.append(batch_time)
logging.info('Running inference on batch %03d/%03d... - Step Time: %.4fs - Throughput: %.1f imgs/s' % (
batch_idx + 1,
num_batches,
batch_time,
throughput
))
except StopIteration:
logging.info('Got StopIteration at batch %d.' % (batch_idx + 1))
break
step_predictions = process_prediction_for_eval(step_predictions)
for k, v in step_predictions.items():
if k not in predictions:
predictions[k] = [v]
else:
predictions[k].append(v)
batch_idx = batch_idx + 1
# Optionally report intermediate eval results every `report_frequency` batches.
# Each such report therefore covers eval_batch_size * report_frequency samples.
if report_frequency and batch_idx % report_frequency == 0:
eval_results = evaluation_preds(preds=predictions)
logging.info('Eval results: %s' % pprint.pformat(eval_results, indent=4))
inference_time_list.sort()
eval_results = evaluation_preds(preds=predictions)
average_time = np.mean(inference_time_list)
latency_50 = max(inference_time_list[:int(len(inference_time_list) * 0.5)])
latency_90 = max(inference_time_list[:int(len(inference_time_list) * 0.90)])
latency_95 = max(inference_time_list[:int(len(inference_time_list) * 0.95)])
latency_99 = max(inference_time_list[:int(len(inference_time_list) * 0.99)])
latency_100 = max(inference_time_list[:int(len(inference_time_list) * 1)])
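# Since inference_time_list is sorted in ascending order, the max over the
# first q-fraction of entries is the q-th percentile latency. For example
# (hypothetical count), with 200 recorded batch times latency_90 is the
# largest of the 180 fastest batches.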
print() # Visual Spacing
logging.info("# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #")
logging.info(" Evaluation Performance Summary ")
logging.info("# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #")
# Total processing time is the sum of per-batch inference times, in seconds.
total_processing_hours, rem = divmod(np.sum(inference_time_list), 3600)
total_processing_minutes, total_processing_seconds = divmod(rem, 60)
if len(model_throughput_list) > BURNIN_STEPS:
# Take into account cuDNN & Tensorflow warmup
# Drop N first steps for avg throughput calculation
# Also drop last step which may have a different batch size
avg_throughput = np.mean(model_throughput_list[BURNIN_STEPS:-1])
else:
avg_throughput = -1.
print() # Visual Spacing
logging.info("Average throughput: {throughput:.1f} samples/sec".format(throughput=avg_throughput))
logging.info("Inference Latency Average (s) = {avg:.4f}".format(avg=average_time))
logging.info("Inference Latency 50% (s) = {cf_50:.4f}".format(cf_50=latency_50))
logging.info("Inference Latency 90% (s) = {cf_90:.4f}".format(cf_90=latency_90))
logging.info("Inference Latency 95% (s) = {cf_95:.4f}".format(cf_95=latency_95))
logging.info("Inference Latency 99% (s) = {cf_99:.4f}".format(cf_99=latency_99))
logging.info("Inference Latency 100% (s) = {cf_100:.4f}".format(cf_100=latency_100))
logging.info("Total processed steps: {total_steps}".format(total_steps=len(model_throughput_list)))
logging.info(
"Total processing time: {hours}h {minutes:02d}m {seconds:02d}s".format(
hours=total_processing_hours,
minutes=int(total_processing_minutes),
seconds=int(total_processing_seconds)
)
)
dllogger.log(step=(), data={"avg_inference_throughput": avg_throughput}, verbosity=Verbosity.DEFAULT)
avg_inference_time = float(total_processing_hours * 3600 + int(total_processing_minutes) * 60 +
int(total_processing_seconds))
dllogger.log(step=(), data={"avg_inference_time": avg_inference_time}, verbosity=Verbosity.DEFAULT)
logging.info("==================== Metrics ====================")
# logging.info('Eval Epoch results: %s' % pprint.pformat(eval_results, indent=4))
for key, value in sorted(eval_results.items(), key=operator.itemgetter(0)):
logging.info("%s: %.9f" % (key, value))
print() # Visual Spacing
return eval_results, predictions
def evaluate(eval_estimator,
input_fn,
num_eval_samples,
eval_batch_size,
include_mask=True,
validation_json_file="",
report_frequency=None):
"""Runs COCO evaluation once."""
predictor = eval_estimator.predict(
input_fn=input_fn,
yield_single_examples=False
)
# Every predictor.next() gets a batch of prediction (a dictionary).
num_eval_times = num_eval_samples // eval_batch_size
assert num_eval_times > 0, 'num_eval_samples must be >= eval_batch_size!'
eval_results, predictions = compute_coco_eval_metric(
predictor,
num_eval_times,
include_mask,
validation_json_file,
eval_batch_size=eval_batch_size,
report_frequency=report_frequency
)
return eval_results, predictions
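# Usage sketch (hypothetical estimator, input_fn and file paths):
#   eval_results, predictions = evaluate(
#       eval_estimator=estimator,
#       input_fn=eval_input_fn,
#       num_eval_samples=5000,
#       eval_batch_size=8,
#       include_mask=True,
#       validation_json_file='/data/annotations/instances_val2017.json'
#   )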
def write_summary(eval_results, summary_dir, current_step, predictions=None):
"""Write out eval results for the checkpoint."""
with tf.Graph().as_default():
summaries = []
# Summary writer writes out eval metrics.
try:
# Tensorflow 1.x
summary_writer = tf.compat.v1.summary.FileWriter(summary_dir)
except AttributeError:
# Tensorflow 2.x
summary_writer = tf.summary.create_file_writer(summary_dir)
summary_writer.as_default()
eval_results_dict = {}
for metric in eval_results:
try:
summaries.append(tf.compat.v1.Summary.Value(tag=metric, simple_value=eval_results[metric]))
eval_results_dict[metric] = float(eval_results[metric])
except AttributeError:
tf.summary.scalar(name=metric, data=eval_results[metric], step=current_step)
eval_results_dict[metric] = float(eval_results[metric])
dllogger.log(step=(), data=eval_results_dict, verbosity=Verbosity.DEFAULT)
if isinstance(predictions, dict) and predictions:
images_summary = get_image_summary(predictions, current_step)
try:
summaries += images_summary
except TypeError:
summaries.append(images_summary)
try:
# tf_summaries = tf.compat.v1.Summary(value=list(summaries))
tf_summaries = tf.compat.v1.Summary(value=summaries)
summary_writer.add_summary(tf_summaries, current_step)
summary_writer.flush()
except AttributeError:
tf.summary.flush(summary_writer)
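# Usage sketch (hypothetical summary directory and step): after an eval pass,
#   write_summary(eval_results, summary_dir='/results/eval', current_step=90000, predictions=predictions)
# writes a scalar summary for every COCO metric and, when predictions are
# provided, image previews of the detections.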
def generate_image_preview(image, boxes, scores, classes, gt_boxes=None, segmentations=None):
"""Creates an image summary given predictions."""
max_boxes_to_draw = 100
min_score_thresh = 0.1
# Visualizes the predictions.
image_with_detections = visualization_utils.visualize_boxes_and_labels_on_image_array(
image,
boxes,
classes=classes,
scores=scores,
category_index={},
instance_masks=segmentations,
use_normalized_coordinates=False,
max_boxes_to_draw=max_boxes_to_draw,
min_score_thresh=min_score_thresh,
agnostic_mode=False
)
if gt_boxes is not None:
# Visualizes the groundtruth boxes. They are in black by default.
image_with_detections = visualization_utils.visualize_boxes_and_labels_on_image_array(
image_with_detections,
gt_boxes,
classes=None,
scores=None,
category_index={},
use_normalized_coordinates=False,
max_boxes_to_draw=max_boxes_to_draw,
agnostic_mode=True
)
return image_with_detections
def generate_image_buffer(input_image):
buf = io.BytesIO()
w, h = input_image.shape[:2]
ratio = 1024 / w
new_size = [int(w * ratio), int(h * ratio)]
image = Image.fromarray(input_image.astype(np.uint8))
image.thumbnail(new_size)
image.save(buf, format='png')
return buf.getvalue()
def get_image_summary(predictions, current_step, max_images=10):
"""Write out image and prediction for summary."""
if 'orig_images' not in predictions:
logging.info('Missing orig_images in predictions: %s', predictions.keys())
return
max_images = min(
len(predictions['orig_images']) * predictions['orig_images'][0].shape[0],
max_images
)
_detection_boxes = np.concatenate(predictions['detection_boxes'], axis=0)
_detection_scores = np.concatenate(predictions['detection_scores'], axis=0)
_detection_classes = np.concatenate(predictions['detection_classes'], axis=0)
_image_info = np.concatenate(predictions['image_info'], axis=0)
_num_detections = np.concatenate(predictions['num_detections'], axis=0)
_orig_images = np.concatenate(predictions['orig_images'], axis=0)
if 'detection_masks' in predictions:
_detection_masks = np.concatenate(predictions['detection_masks'], axis=0)
else:
_detection_masks = None
if 'groundtruth_boxes' in predictions:
_groundtruth_boxes = np.concatenate(predictions['groundtruth_boxes'], axis=0)
else:
_groundtruth_boxes = None
_orig_images = _orig_images * 255
_orig_images = _orig_images.astype(np.uint8)
image_previews = []
for i in range(max_images):
num_detections = min(len(_detection_boxes[i]), int(_num_detections[i]))
detection_boxes = _detection_boxes[i][:num_detections]
detection_scores = _detection_scores[i][:num_detections]
detection_classes = _detection_classes[i][:num_detections]
image = _orig_images[i]
image_height = image.shape[0]
image_width = image.shape[1]
# Rescale the box to fit the visualization image.
h, w = _image_info[i][3:5]
detection_boxes = detection_boxes / np.array([w, h, w, h])
detection_boxes = detection_boxes * np.array([image_width, image_height, image_width, image_height])
if _groundtruth_boxes is not None:
gt_boxes = _groundtruth_boxes[i]
gt_boxes = gt_boxes * np.array([image_height, image_width, image_height, image_width])
else:
gt_boxes = None
if _detection_masks is not None:
instance_masks = _detection_masks[i][0:num_detections]
segmentations = coco_metric.generate_segmentation_from_masks(
instance_masks,
detection_boxes,
image_height,
image_width
)
else:
segmentations = None
# Convert boxes from [x, y, w, h] to [x1, y1, x2, y2]. Since
# process_prediction_for_eval() stored them in [x, y] order, they also need
# to be flipped back to the [y, x] order expected by the visualization utils.
xmin, ymin, w, h = np.split(detection_boxes, 4, axis=-1)
xmax = xmin + w
ymax = ymin + h
boxes_to_visualize = np.concatenate([ymin, xmin, ymax, xmax], axis=-1)
image_preview = generate_image_preview(
image,
boxes=boxes_to_visualize,
scores=detection_scores,
classes=detection_classes.astype(np.int32),
gt_boxes=gt_boxes,
segmentations=segmentations
)
image_previews.append(image_preview)
try:
summaries = []
for i, image_preview in enumerate(image_previews):
image_buffer = generate_image_buffer(image_preview)
image_summary = tf.compat.v1.Summary.Image(encoded_image_string=image_buffer)
image_value = tf.compat.v1.Summary.Value(tag='%d_input' % i, image=image_summary)
summaries.append(image_value)
except AttributeError:
image_previews = np.array(image_previews)
summaries = tf.summary.image(
name='image_summary',
data=image_previews,
step=current_step,
max_outputs=max_images
)
return summaries
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from mask_rcnn.hooks.ckpt_hook import CheckpointSaverHook
from mask_rcnn.hooks.pretrained_restore_hook import PretrainedWeightsLoadingHook
__all__ = [
"CheckpointSaverHook",
"PretrainedWeightsLoadingHook",
]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tensorflow as tf
from mask_rcnn.utils.logging_formatter import logging
__all__ = ["CheckpointSaverHook"]
class CheckpointSaverHook(tf.estimator.SessionRunHook):
"""Saves checkpoints every N steps or seconds."""
def __init__(self, checkpoint_dir, checkpoint_basename="model.ckpt"):
"""Initializes a `CheckpointSaverHook`.
Args:
checkpoint_dir: `str`, base directory for the checkpoint files.
checkpoint_basename: `str`, base name for the checkpoint files.
Raises:
ValueError: One of `save_steps` or `save_secs` should be set.
ValueError: At most one of `saver` or `scaffold` should be set.
"""
logging.info("Create CheckpointSaverHook.")
self._saver = None
self._checkpoint_dir = checkpoint_dir
self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
self._steps_per_run = 1
self._is_initialized = False
self._global_step_tensor = None
self._summary_writer = None
def _set_steps_per_run(self, steps_per_run):
self._steps_per_run = steps_per_run
def begin(self):
self._global_step_tensor = tf.compat.v1.train.get_or_create_global_step()
self._saver = tf.compat.v1.train.Saver()
from tensorflow.python.training import summary_io
self._summary_writer = summary_io.SummaryWriterCache.get(self._checkpoint_dir)
if self._global_step_tensor is None:
raise RuntimeError(
"Global step should be created to use CheckpointSaverHook."
)
def after_create_session(self, session, coord):
if not self._is_initialized:
global_step = session.run(self._global_step_tensor)
from tensorflow.python.keras.backend import get_graph
default_graph = get_graph()
# We write the graph and saver_def on the first call of after_create_session
# rather than in begin(), because other hooks may still modify the graph and
# add variables during begin(). The graph is finalized after all begin() calls.
tf.io.write_graph(
default_graph.as_graph_def(add_shapes=True),
self._checkpoint_dir,
"graph.pbtxt"
)
saver_def = self._saver.saver_def
from tensorflow.python.framework import meta_graph
meta_graph_def = meta_graph.create_meta_graph_def(
graph_def=default_graph.as_graph_def(add_shapes=True),
saver_def=saver_def
)
self._summary_writer.add_graph(default_graph)
self._summary_writer.add_meta_graph(meta_graph_def)
# The checkpoint saved here is the state at step "global_step".
self._save(session, global_step)
self._is_initialized = True
def end(self, session):
last_step = session.run(self._global_step_tensor)
self._save(session, last_step)
def _save(self, session, step):
"""Saves the latest checkpoint, returns should_stop."""
logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
self._saver.save(session, self._save_path, global_step=step)
self._summary_writer.add_session_log(
tf.compat.v1.SessionLog(status=tf.compat.v1.SessionLog.CHECKPOINT, checkpoint_path=self._save_path),
step
)
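# Usage sketch (hypothetical estimator and checkpoint directory):
#   saver_hook = CheckpointSaverHook(checkpoint_dir='/results/checkpoints')
#   estimator.train(input_fn=train_input_fn, hooks=[saver_hook], max_steps=1000)
# A checkpoint is written right after session creation and once more when the
# session ends.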
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import operator
import time
import numpy as np
import tensorflow as tf
from distutils.version import LooseVersion
from mask_rcnn.utils.logging_formatter import logging
from mask_rcnn.utils import meters
from mask_rcnn.utils.decorators import atexit_hook
from mask_rcnn.utils.distributed_utils import MPI_is_distributed
from mask_rcnn.utils.distributed_utils import MPI_rank_and_size
from mask_rcnn.utils.distributed_utils import MPI_size
from mask_rcnn.utils.logging_backend import LoggingBackend
from mask_rcnn.utils.logging_backend import RuntimeMode
from mask_rcnn.utils.metric_tracking import clear_registered_metrics
from mask_rcnn.utils.metric_tracking import TF_METRICS
from mask_rcnn.utils.metric_tracking import KERAS_MODELS
from mask_rcnn.utils.lazy_imports import LazyImport
hvd = LazyImport("horovod.tensorflow")
__all__ = ["AutoLoggingHook"]
@atexit_hook
class _AutoLoggingHook(tf.estimator.SessionRunHook):
def __init__(self, log_every_n_steps=200, warmup_steps=500, is_training=True):
"""
AutoLogging hook for TensorFlow.
:param log_every_n_steps: a log line is written to the console every N steps
:param warmup_steps: integer, number of steps considered as warmup
:param is_training: boolean
"""
self._logging_proxy = LoggingBackend()
self._initialized = False
self._metrics = copy.copy(TF_METRICS)
self._batch_size_tensor = None
self._AMP_steps_since_last_loss_scale = None
self._AMP_loss_scale_tensor = None
self._current_step = None
self._amp_steps_non_skipped = None
self._warmup_steps = warmup_steps
self._log_every_n_steps = log_every_n_steps
self._step_t0 = None
self._session_t0 = None
self._session_run_times = list()
self._global_step_tensor = None
self._is_training = is_training
self._runtime_mode = RuntimeMode.TRAIN if is_training else RuntimeMode.VALIDATION
self._model_throughput = meters.MovingAverageMeter(window_size=1000)
self._model_stats = None
self._n_gpus = None
def __atexit__(self):
if self._initialized:
total_processing_time = int(np.sum(self._session_run_times))
try:
avg_throughput = self._model_throughput.read()
except ValueError:
avg_throughput = -1
self._logging_proxy.log_summary(
is_train=self._is_training,
total_steps=self._current_step,
total_processing_time=total_processing_time,
avg_throughput=avg_throughput
)
metric_data = dict()
for key, value in self._metrics.items():
try:
metric_data[key] = value["aggregator"].read()
except ValueError:
pass
self._logging_proxy.log_final_metrics(metric_data=metric_data, runtime_mode=self._runtime_mode)
def begin(self):
"""Called once before using the session.
When called, the default graph is the one that will be launched in the
session. The hook can modify the graph by adding new operations to it.
After the `begin()` call the graph will be finalized and the other callbacks
can not modify the graph anymore. Second call of `begin()` on the same
graph, should not change the graph.
"""
from tensorflow.python.keras.backend import get_graph
_graph = get_graph()
try:
self._batch_size_tensor = None
for tensor in _graph.as_graph_def().node:
if "IteratorGetNext" in tensor.name:
_input_tensor = _graph.get_tensor_by_name(tensor.name + ":0")
try:
self._batch_size_tensor = tf.shape(input=_input_tensor)[0]
except TypeError: # Ragged Tensor
self._batch_size_tensor = _input_tensor.bounding_shape()[0]
break
else:
raise RuntimeError(
"Tensor `{}` could not be found. "
"Make sure you are using tf.data API".format("IteratorGetNext")
)
except RuntimeError:
raise
except Exception as e:
raise RuntimeError(
"Impossible to fetch the tensor: `IteratorGetNext`. Make sure you are using tf.data API."
) from e
self._global_step_tensor = tf.compat.v1.train.get_or_create_global_step()
try:
self._AMP_loss_scale_tensor = _graph.get_tensor_by_name("current_loss_scale/Read/ReadVariableOp:0")
# NOTE: this currently fetches the same tensor as the loss scale above; the
# "steps since last loss scale" counter lives under a separate variable in
# TF-AMP graphs.
self._AMP_steps_since_last_loss_scale = _graph.get_tensor_by_name("current_loss_scale/Read/ReadVariableOp:0")
except RuntimeError:
raise
# TF-AMP is not activated
except Exception:
pass
# if self._is_training:
# self.runtime_data["params_count"] = tf.reduce_sum(
# [tf.reduce_prod(v.shape) for v in tf.trainable_variables()]
# )
def end(self, session): # pylint: disable=unused-argument
"""Called at the end of session.
The `session` argument can be used in case the hook wants to run final ops,
such as saving a last checkpoint.
If `session.run()` raises exception other than OutOfRangeError or
StopIteration then `end()` is not called.
Note the difference between `end()` and `after_run()` behavior when
`session.run()` raises OutOfRangeError or StopIteration. In that case
`end()` is called but `after_run()` is not called.
Args:
session: A TensorFlow Session that will be soon closed.
"""
self._session_run_times.append(time.time() - self._session_t0)
def after_create_session(self, session, coord): # pylint: disable=unused-argument
"""Called when new TensorFlow session is created.
This is called to signal the hooks that a new session has been created. This
has two essential differences with the situation in which `begin` is called:
* When this is called, the graph is finalized and ops can no longer be added
to the graph.
* This method will also be called as a result of recovering a wrapped
session, not only at the beginning of the overall session.
Args:
session: A TensorFlow Session that has been created.
coord: A Coordinator object which keeps track of all threads.
"""
# ========= Collect the number of GPUs ======== #
if self._is_training:
if MPI_is_distributed():
self._n_gpus = MPI_size()
elif tf.distribute.has_strategy():
self._n_gpus = tf.distribute.get_strategy().num_replicas_in_sync
else:
self._n_gpus = 1
else:
self._n_gpus = 1
# =========== TensorFlow Hook Setup =========== #
_global_step, _metrics = setup_tensorflow_hook(
sess=session,
logging_proxy=self._logging_proxy,
is_training=self._is_training,
is_initialized=self._initialized
)
if _global_step >= 0:
self._current_step = self._amp_steps_non_skipped = _global_step
self._metrics.update(_metrics)
if not self._is_training:
for metric_name in self._metrics.keys():
self._metrics[metric_name]["aggregator"].reset()
self._initialized = True
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #
self._session_t0 = time.time()
def before_run(self, run_context): # pylint: disable=unused-argument
"""Called before each call to run().
You can return from this call a `SessionRunArgs` object indicating ops or
tensors to add to the upcoming `run()` call. These ops/tensors will be run
together with the ops/tensors originally passed to the original run() call.
The run args you return can also contain feeds to be added to the run()
call.
The `run_context` argument is a `SessionRunContext` that provides
information about the upcoming `run()` call: the originally requested
op/tensors, the TensorFlow Session.
At this point graph is finalized and you can not add ops.
Args:
run_context: A `SessionRunContext` object.
Returns:
None or a `SessionRunArgs` object.
"""
self._current_step += 1
request_fetches = {
"global_step": self._global_step_tensor, "metrics": dict(), "batch_size": self._batch_size_tensor
}
if self._is_training and self._AMP_steps_since_last_loss_scale is not None:
request_fetches["AMP"] = {
"steps_since_last_loss_scale": self._AMP_steps_since_last_loss_scale,
"current_loss_scale": self._AMP_loss_scale_tensor,
}
if self._current_step % self._log_every_n_steps == 0:
for key, value in self._metrics.items():
request_fetches["metrics"][key] = value["tensor"]
self._step_t0 = time.time()
return tf.estimator.SessionRunArgs(request_fetches)
def after_run(self, run_context, run_values): # pylint: disable=unused-argument
"""Called after each call to run().
The `run_values` argument contains results of requested ops/tensors by
`before_run()`.
The `run_context` argument is the same one send to `before_run` call.
`run_context.request_stop()` can be called to stop the iteration.
If `session.run()` raises any exceptions then `after_run()` is not called.
Args:
run_context: A `SessionRunContext` object.
run_values: A SessionRunValues object.
"""
batch_time = time.time() - self._step_t0
_global_step = run_values.results["global_step"]
if self._is_training and self._AMP_steps_since_last_loss_scale is not None:
try:
AMP_steps_since_last_loss_scale = run_values.results["AMP"]["steps_since_last_loss_scale"]
AMP_loss_scale = run_values.results["AMP"]["current_loss_scale"]
except KeyError:
AMP_steps_since_last_loss_scale = None
AMP_loss_scale = None
if AMP_steps_since_last_loss_scale is not None:
# Step has been skipped
if _global_step != (self._amp_steps_non_skipped + 1):
logging.warning(
"AMP - Training iteration `#{step}` has been skipped and loss rescaled. "
"New Loss Scale: {loss_scale}\n".format(step=self._current_step, loss_scale=AMP_loss_scale)
)
else:
self._amp_steps_non_skipped += 1
if AMP_steps_since_last_loss_scale == 0:
logging.warning(
"AMP - Training iteration `#{step}` - Loss scale has been automatically increased. "
"New Loss Scale: {loss_scale}\n".format(step=self._current_step, loss_scale=AMP_loss_scale)
)
else:
AMP_steps_since_last_loss_scale = None
AMP_loss_scale = None
def get_model_throughput():
gpu_batch_size = run_values.results["batch_size"]
return gpu_batch_size / batch_time * self._n_gpus
# def get_model_stats():
# return get_tf_model_statistics(batch_size=run_values.results["batch_size"], scope_name=None)
#
# if self._model_stats is None:
# self._model_stats = get_model_stats()
is_log_step = self._current_step % self._log_every_n_steps == 0
if is_log_step:
if self._current_step > self._warmup_steps:
try:
model_throughput = self._model_throughput.read()
except ValueError:
model_throughput = get_model_throughput()
else:
model_throughput = get_model_throughput()
self._logging_proxy.log_step(iteration=self._current_step, throughput=model_throughput, gpu_stats=[])
self._logging_proxy.log_amp_runtime(
current_loss_scale=AMP_loss_scale,
steps_non_skipped=_global_step,
steps_since_last_scale=AMP_steps_since_last_loss_scale,
)
metric_data = dict()
for name, value in sorted(run_values.results["metrics"].items(), key=operator.itemgetter(0)):
self._metrics[name]["aggregator"].record(value)
metric_data[name] = self._metrics[name]["aggregator"].read()
self._logging_proxy.log_metrics(
metric_data=metric_data, iteration=self._current_step, runtime_mode=self._runtime_mode
)
print() # Visual Spacing
elif self._current_step > self._warmup_steps:
# Do not store speed for log step due to additional fetches
self._model_throughput.record(get_model_throughput())
class _SlaveGPUsHook(tf.estimator.SessionRunHook):
def after_create_session(self, session, coord):
with logging.temp_verbosity(logging.INFO): # Do not warn user about metric cleaning
clear_registered_metrics()
def real_autologging_hook(*args, **kwargs):
replica_id = tf.distribute.get_replica_context().replica_id_in_sync_group
# Do not set a logging hook for GPUs != 0
if MPI_rank_and_size()[0] != 0 or (isinstance(replica_id, tf.Tensor) and tf.get_static_value(replica_id) != 0):
return _SlaveGPUsHook()
else:
_ = LoggingBackend() # Making sure the backend is defined before any hook due to __atexit__ hook
return _AutoLoggingHook(*args, **kwargs)
def collect_registered_metrics():
if TF_METRICS: # if not empty
metrics = copy.copy(TF_METRICS)
# Do not warn user about metric cleaning
with logging.temp_verbosity(logging.INFO):
clear_registered_metrics()
return metrics
else:
return dict()
def get_model_variables():
"""return model variables: global variables without optimizer's variables"""
return [
# yapf: disable
var for var in tf.compat.v1.global_variables() if (
var.name[-11:] not in "/Momentum:0" and
var.name[-11:] not in "/Adadelta:0" and
var.name[-13:] not in "/Adadelta_1:0" and
var.name[-7:] not in "/Adam:0" and
var.name[-9:] not in "/Adam_1:0" and
var.name[-10:] not in "/Adagrad:0" and
var.name[-10:] not in "/RMSProp:0" and
var.name[-12:] not in "/RMSProp_1:0" and
var.name[-16:] not in "/LARSOptimizer:0"
)
# yapf: enable
]
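# For example (hypothetical variable names), 'resnet50/conv1/kernel/Momentum:0'
# is an optimizer slot variable and is filtered out by the suffix checks above,
# while 'resnet50/conv1/kernel:0' is kept as a model variable.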
def get_trainable_variables():
"""Get a list of trainable TensorFlow variables.
Parameters
----------
train_only : boolean
If True, only get the trainable variables.
Returns
-------
list of Tensor
A list of trainable TensorFlow variables
Examples
--------
"""
if KERAS_MODELS or LooseVersion(tf.__version__) >= LooseVersion("2.0.0"):
logging.warning(
"In TF2.x, only trainable variables created with Keras Models are captured for logging.\n"
"In TF1.x, if any Keras model is defined, only variables created inside Keras models will be logged."
)
var_list = list()
for model in KERAS_MODELS:
var_list.extend(model.trainable_variables)
# Keep only a list of unique variables (remove potential duplicates)
var_list = list(set(var_list))
# clearing the list of Keras Model to avoid memory leaks
KERAS_MODELS.clear()
return [var for var in sorted(var_list, key=lambda v: v.name)]
else:
# return tf.trainable_variables() # deprecated in TF2.x
from tensorflow.python.keras.backend import get_graph
return get_graph().get_collection('trainable_variables')
def setup_tensorflow_hook(sess, logging_proxy, is_training, is_initialized):
global_step = -1
if is_training:
if not is_initialized:
_global_step_tensor = tf.compat.v1.train.get_or_create_global_step()
global_step = sess.run(_global_step_tensor)
trainable_variables = get_trainable_variables()
def count_weights_in_varlist(var_list):
return np.sum([np.prod(s.get_shape()) for s in var_list])
logging_proxy.log_git_status()
logging_proxy.log_model_statistics(
model_statistics={
"# Trainable Weights": "{:,}".format(int(count_weights_in_varlist(trainable_variables))),
"# Model Weights": "{:,}".format(int(count_weights_in_varlist(get_model_variables()))),
}
)
logging_proxy.log_trainable_variables([(var.name, var.get_shape()) for var in trainable_variables])
else:
if not is_initialized:
global_step = 0
metrics = collect_registered_metrics()
logging_proxy.log_runtime(is_train=is_training)
return global_step, metrics
AutoLoggingHook = lambda *args, **kwargs: real_autologging_hook(*args, **kwargs)
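# Usage sketch (hypothetical values): attach the hook to an Estimator so that
# throughput and registered metrics are logged every N steps on rank 0 only.
#   logging_hook = AutoLoggingHook(log_every_n_steps=100, warmup_steps=200, is_training=True)
#   estimator.train(input_fn=train_input_fn, hooks=[logging_hook], max_steps=10000)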
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions to override model parameters from command-line flags."""
from mask_rcnn.hyperparameters import params_dict
ESSENTIAL_FLAGS = ['tpu', 'data_dir', 'model_dir']
def override_params_from_input_flags(params, input_flags):
"""Update params dictionary with input flags.
Args:
params: ParamsDict object containing dictionary of model parameters.
input_flags: All the flags with non-null value of overridden model
parameters.
Returns:
ParamsDict object containing dictionary of model parameters.
"""
if params is None:
raise ValueError('Input dictionary is empty. It is expected to be loaded with default values.')
if not isinstance(params, params_dict.ParamsDict):
raise ValueError('The base parameter set must be a ParamsDict, was: {}'.format(type(params)))
essential_flag_dict = {}
for key in ESSENTIAL_FLAGS:
flag_value = input_flags.get_flag_value(key, None)
if flag_value is None:
raise ValueError('Flag {} must not be None.'.format(key))
else:
essential_flag_dict[key] = flag_value
params_dict.override_params_dict(params, essential_flag_dict, is_strict=False)
normal_flag_dict = get_dictionary_from_flags(params.as_dict(), input_flags)
params_dict.override_params_dict(params, normal_flag_dict, is_strict=False)
return params
def get_dictionary_from_flags(params, input_flags):
"""Generate dictionary from non-null flags.
Args:
params: Python dictionary of model parameters.
input_flags: All the flags with non-null value of overridden model
parameters.
Returns:
Python dict of overriding model parameters.
"""
flag_dict = {}
for k, v in params.items():
if isinstance(v, dict):
d = get_dictionary_from_flags(v, input_flags)
flag_dict[k] = d
else:
try:
flag_value = input_flags.get_flag_value(k, None)
if flag_value is not None:
flag_dict[k] = flag_value
except AttributeError:
flag_dict[k] = v
return flag_dict
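# Usage sketch (hypothetical params and absl flags):
#   from absl import flags
#   params = params_dict.ParamsDict({'tpu': '', 'data_dir': '', 'model_dir': '', 'train_batch_size': 8})
#   params = override_params_from_input_flags(params, flags.FLAGS)
# Every flag in ESSENTIAL_FLAGS must be set; other flags override matching keys
# only when they carry a non-null value.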
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#============================================================================
"""Utils to handle parameters IO."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import yaml
import tensorflow as tf
def save_hparams_to_yaml(hparams, file_path):
with tf.io.gfile.GFile(file_path, 'w') as f:
try:
hparams_val = hparams.values()
except AttributeError:
hparams_val = hparams.__dict__
yaml.dump(hparams_val, f)
def override_hparams(hparams, dict_or_string_or_yaml_file):
"""Override a given hparams using a dict or a string or a JSON file.
Args:
hparams: a HParams object to be overridden.
dict_or_string_or_yaml_file: a Python dict, or a comma-separated string,
or a path to a YAML file specifying the parameters to be overridden.
Returns:
hparams: the overridden HParams object.
Raises:
ValueError: if failed to override the parameters.
"""
if not dict_or_string_or_yaml_file:
return hparams
if isinstance(dict_or_string_or_yaml_file, dict):
for key, val in dict_or_string_or_yaml_file.items():
if key not in hparams:
try: # TF 1.x
hparams.add_hparam(key, val)
except AttributeError: # TF 2.x
try: # Dict
hparams[key] = val
except TypeError: # Namespace
setattr(hparams, key, val)
else:
raise ValueError("Parameter `%s` is already defined" % key)
# hparams.override_from_dict(dict_or_string_or_yaml_file)
elif isinstance(dict_or_string_or_yaml_file, six.string_types):
try:
hparams.parse(dict_or_string_or_yaml_file)
except ValueError as parse_error:
try:
with tf.io.gfile.GFile(dict_or_string_or_yaml_file) as f:
hparams.override_from_dict(yaml.safe_load(f))
except Exception as read_error:
parse_message = ('Failed to parse config string: %s\n' % parse_error)
read_message = ('Failed to parse the YAML file provided: %s' % read_error)
raise ValueError(parse_message + read_message)
else:
raise ValueError('Unknown input type to parse.')
return hparams
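# Usage sketch (hypothetical override values and file path):
#   hparams = override_hparams(hparams, {'train_batch_size': 4})
#   hparams = override_hparams(hparams, 'learning_rate=0.01,momentum=0.9')
#   hparams = override_hparams(hparams, '/path/to/overrides.yaml')
# The dict branch adds new keys (and raises if a key is already defined); the
# string branch is parsed by hparams.parse(), falling back to reading it as a
# YAML file.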