"backend/apps/vscode:/vscode.git/clone" did not exist on "642c352c69ceadb118ed5347c091c761a5a65a8b"
Commit ee3997b3 authored by qianyj's avatar qianyj
Browse files

new tf branch for dtk21.10.1

parent 2795dc1f
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""DeepSpeech2 model configuration.
References:
https://arxiv.org/abs/1512.02595
Deep Speech 2: End-to-End Speech Recognition in English and Mandarin
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
import constants
from cnn_util import log_fn
from models import model as model_lib
from tensorflow.python.ops import variables # pylint: disable=g-direct-tensorflow-import
class DeepSpeechDecoder(object):
  """Greedy decoder implementation for Deep Speech model."""

  def __init__(self, labels, blank_index=28):
    """Decoder initialization.

    Args:
      labels: a string specifying the speech labels for the decoder to use.
      blank_index: an integer specifying index for the blank character.
        Defaults to 28.
    """
    self.labels = labels
    self.blank_index = blank_index
    self.int_to_char = dict(enumerate(labels))

  @staticmethod
  def _load_edit_distance_module():
    """Imports and returns the `nltk.metrics.distance` module.

    The import is done lazily so that nltk is only required when an error
    rate is actually computed.

    Returns:
      The `nltk.metrics.distance` module.

    Raises:
      ImportError: with an installation hint if nltk itself is missing; any
        other import failure is re-raised unchanged.
    """
    try:
      from nltk.metrics import distance  # pylint: disable=g-import-not-at-top
    except ImportError as e:
      # Exceptions have no `.message` attribute on Python 3, so inspect
      # `e.name` (Python 3) and the message text (Python 2) instead.
      missing_module = getattr(e, 'name', None) or ''
      nltk_is_missing = (missing_module.split('.')[0] == 'nltk' or
                         'nltk.metrics' in str(e))
      if not nltk_is_missing:
        raise
      raise ImportError('To use the experimental deepspeech model, you must '
                        'pip install -U nltk')
    return distance

  def convert_to_string(self, sequence):
    """Convert a sequence of indexes into corresponding string."""
    return ''.join([self.int_to_char[i] for i in sequence])

  def wer(self, decode, target):
    """Computes the word-level edit distance used for Word Error Rate (WER).

    Both sentences are tokenized on whitespace, every distinct word is mapped
    to a single character, and the edit distance between the two resulting
    strings is returned. The value is not divided by the number of target
    words here.

    Args:
      decode: string of the decoded output.
      target: a string for the ground truth label.

    Returns:
      The edit distance between the word sequences of `decode` and `target`.

    Raises:
      ImportError: if nltk is not installed.
    """
    distance = self._load_edit_distance_module()
    decode_words = decode.split()
    target_words = target.split()
    # Map each word to a new char.
    words = set(decode_words + target_words)
    word2char = dict(zip(words, range(len(words))))
    new_decode = [chr(word2char[w]) for w in decode_words]
    new_target = [chr(word2char[w]) for w in target_words]
    return distance.edit_distance(''.join(new_decode), ''.join(new_target))

  def cer(self, decode, target):
    """Computes the character-level edit distance used for CER.

    The value is not divided by the target length here.

    Args:
      decode: a string of the decoded output.
      target: a string for the ground truth label.

    Returns:
      The edit distance between the two given strings.

    Raises:
      ImportError: if nltk is not installed.
    """
    distance = self._load_edit_distance_module()
    return distance.edit_distance(decode, target)

  def decode(self, char_indexes):
    """Decode a sequence of class indexes using the greedy algorithm.

    Consecutive repeated indexes are merged first, then every occurrence of
    the blank index is dropped.

    Args:
      char_indexes: an iterable of integer class indexes.

    Returns:
      The decoded string.
    """
    # Merge repeated chars.
    merge = [k for k, _ in itertools.groupby(char_indexes)]
    # Remove the blank index in the decoded sequence.
    merge_remove_blank = [k for k in merge if k != self.blank_index]
    return self.convert_to_string(merge_remove_blank)

  def decode_logits(self, logits):
    """Decode the best guess from logits using greedy algorithm."""
    # Choose the class with maximum probability along axis 1.
    best = list(np.argmax(logits, axis=1))
    return self.decode(best)
class DeepSpeech2Model(model_lib.Model):
  """Define DeepSpeech2 model."""

  # Supported rnn cells.
  SUPPORTED_RNNS = {
      'lstm': tf.nn.rnn_cell.BasicLSTMCell,
      # `RNNCell` is the abstract base class and cannot be constructed with
      # `num_units`; the plain recurrent cell is `BasicRNNCell`.
      'rnn': tf.nn.rnn_cell.BasicRNNCell,
      'gru': tf.nn.rnn_cell.GRUCell,
  }

  # Parameters for batch normalization.
  BATCH_NORM_EPSILON = 1e-5
  BATCH_NORM_DECAY = 0.997

  # Filters of convolution layer
  CONV_FILTERS = 32

  def __init__(self,
               num_rnn_layers=5,
               rnn_type='lstm',
               is_bidirectional=True,
               rnn_hidden_size=800,
               use_bias=True,
               params=None):
    """Initialize DeepSpeech2 model.

    Args:
      num_rnn_layers: an integer, the number of rnn layers (default: 5).
      rnn_type: a string, one of the supported rnn cells: gru, rnn or lstm.
      is_bidirectional: a boolean to indicate if the rnn layer is
        bidirectional.
      rnn_hidden_size: an integer for the number of hidden units in the RNN
        cell.
      use_bias: a boolean specifying whether to use a bias in the last fc
        layer.
      params: the params from BenchmarkCNN.
    """
    super(DeepSpeech2Model, self).__init__(
        'deepspeech2',
        batch_size=128,
        learning_rate=0.0005,
        fp16_loss_scale=128,
        params=params)
    self.num_rnn_layers = num_rnn_layers
    self.rnn_type = rnn_type
    self.is_bidirectional = is_bidirectional
    self.rnn_hidden_size = rnn_hidden_size
    self.use_bias = use_bias
    self.num_feature_bins = 161
    self.max_time_steps = 3494
    self.max_label_length = 576

  def _batch_norm(self, inputs, training):
    """Batch normalization layer.

    Note that the momentum to use will affect validation accuracy over time.
    Batch norm has different behaviors during training/evaluation. With a
    large momentum, the model takes longer to get a near-accurate estimation
    of the moving mean/variance over the entire training dataset, which means
    we need more iterations to see good evaluation results. If the training
    data is evenly distributed over the feature space, we can also try setting
    a smaller momentum (such as 0.1) to get good evaluation result sooner.

    Args:
      inputs: input data for batch norm layer.
      training: a boolean to indicate if it is in training stage.

    Returns:
      tensor output from batch norm layer.
    """
    return tf.layers.batch_normalization(
        inputs=inputs,
        momentum=DeepSpeech2Model.BATCH_NORM_DECAY,
        epsilon=DeepSpeech2Model.BATCH_NORM_EPSILON,
        fused=True,
        training=training)

  def _conv_bn_layer(self, inputs, padding, filters, kernel_size, strides,
                     layer_id, training):
    """Defines 2D convolutional + batch normalization layer.

    Args:
      inputs: input data for convolution layer.
      padding: padding to be applied before convolution layer.
      filters: an integer, number of output filters in the convolution.
      kernel_size: a tuple specifying the height and width of the 2D
        convolution window.
      strides: a tuple specifying the stride length of the convolution.
      layer_id: an integer specifying the layer index.
      training: a boolean to indicate which stage we are in (training/eval).

    Returns:
      tensor output from the current layer.
    """
    # Perform symmetric padding on the feature dimension of time_step
    # This step is required to avoid issues when RNN output sequence is
    # shorter than the label length.
    inputs = tf.pad(
        inputs,
        [[0, 0], [padding[0], padding[0]], [padding[1], padding[1]], [0, 0]])
    inputs = tf.layers.conv2d(
        inputs=inputs,
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding='valid',
        use_bias=False,
        activation=tf.nn.relu6,
        name='cnn_{}'.format(layer_id))
    return self._batch_norm(inputs, training)

  def _rnn_layer(self, inputs, rnn_cell, rnn_hidden_size, layer_id,
                 use_batch_norm, is_bidirectional, training):
    """Defines a batch normalization + rnn layer.

    Args:
      inputs: input tensors for the current layer.
      rnn_cell: RNN cell class to instantiate; it is called with `num_units`
        and `name`.
      rnn_hidden_size: an integer for the dimensionality of the rnn output
        space.
      layer_id: an integer for the index of current layer.
      use_batch_norm: a boolean specifying whether to perform batch
        normalization on input states.
      is_bidirectional: a boolean specifying whether the rnn layer is
        bi-directional.
      training: a boolean to indicate which stage we are in (training/eval).

    Returns:
      tensor output for the current layer. For a bidirectional layer this is
      the forward and backward outputs concatenated on the last axis.
    """
    if use_batch_norm:
      inputs = self._batch_norm(inputs, training)

    # Construct forward/backward RNN cells.
    fw_cell = rnn_cell(
        num_units=rnn_hidden_size, name='rnn_fw_{}'.format(layer_id))

    if is_bidirectional:
      bw_cell = rnn_cell(
          num_units=rnn_hidden_size, name='rnn_bw_{}'.format(layer_id))
      outputs, _ = tf.nn.bidirectional_dynamic_rnn(
          cell_fw=fw_cell,
          cell_bw=bw_cell,
          inputs=inputs,
          dtype=tf.float32,
          swap_memory=True)
      rnn_outputs = tf.concat(outputs, -1)
    else:
      # dynamic_rnn returns an (outputs, state) pair; only the outputs tensor
      # is passed on to the next layer.
      rnn_outputs, _ = tf.nn.dynamic_rnn(
          fw_cell, inputs, dtype=tf.float32, swap_memory=True)

    return rnn_outputs

  def get_input_data_types(self, subset):
    """Returns the list of data types of the inputs."""
    del subset  # Same data types for both train and validation subsets.
    return [self.data_type, tf.int32, tf.int32, tf.int32]

  def get_input_shapes(self, subset):
    """Returns the list of shapes of the padded inputs."""
    del subset  # Same shapes for both train and validation subsets
    return [
        [self.batch_size, self.max_time_steps, self.num_feature_bins, 1],
        [self.batch_size, self.max_label_length],
        [self.batch_size, 1],
        [self.batch_size, 1],
    ]

  def get_synthetic_inputs(self, input_name, nclass):
    """Builds random inputs with the shapes from `get_input_shapes`.

    Args:
      input_name: name given to the local variable holding the spectrogram.
      nclass: unused by this model.

    Returns:
      A list [inputs, labels, input_lengths, label_lengths].
    """
    inputs = tf.random_uniform(self.get_input_shapes('train')[0],
                               dtype=self.get_input_data_types('train')[0])
    inputs = variables.VariableV1(inputs, trainable=False,
                                  collections=[tf.GraphKeys.LOCAL_VARIABLES],
                                  name=input_name)
    labels = tf.convert_to_tensor(
        np.random.randint(28, size=[self.batch_size, self.max_label_length]))
    input_lengths = tf.convert_to_tensor(
        [self.max_time_steps] * self.batch_size)
    label_lengths = tf.convert_to_tensor(
        [self.max_label_length] * self.batch_size)
    return [inputs, labels, input_lengths, label_lengths]

  # TODO(laigd): support fp16.
  # TODO(laigd): support multiple gpus.
  def build_network(self, inputs, phase_train=True, nclass=29):
    """Builds the forward pass of the deepspeech2 model.

    Args:
      inputs: The input list of the model.
      phase_train: True during training. False during evaluation.
      nclass: Number of classes that the input spectrogram can belong to.

    Returns:
      A BuildNetworkResult which contains the logits and model-specific extra
        information.
    """
    inputs = inputs[0]  # Get the spectrogram feature.

    # Two cnn layers.
    inputs = self._conv_bn_layer(
        inputs,
        padding=(20, 5),
        filters=DeepSpeech2Model.CONV_FILTERS,
        kernel_size=(41, 11),
        strides=(2, 2),
        layer_id=1,
        training=phase_train)

    inputs = self._conv_bn_layer(
        inputs,
        padding=(10, 5),
        filters=DeepSpeech2Model.CONV_FILTERS,
        kernel_size=(21, 11),
        strides=(2, 1),
        layer_id=2,
        training=phase_train)

    # output of conv_layer2 with the shape of
    # [batch_size (N), times (T), features (F), channels (C)].
    # Convert the conv output to rnn input.
    feat_size = inputs.get_shape().as_list()[2]
    inputs = tf.reshape(
        inputs,
        [self.batch_size, -1, feat_size * DeepSpeech2Model.CONV_FILTERS])

    # RNN layers.
    rnn_cell = DeepSpeech2Model.SUPPORTED_RNNS[self.rnn_type]
    for layer_counter in xrange(self.num_rnn_layers):
      # No batch normalization on the first layer.
      use_batch_norm = (layer_counter != 0)
      inputs = self._rnn_layer(inputs, rnn_cell, self.rnn_hidden_size,
                               layer_counter + 1, use_batch_norm,
                               self.is_bidirectional, phase_train)

    # FC layer with batch norm.
    inputs = self._batch_norm(inputs, phase_train)
    logits = tf.layers.dense(inputs, nclass, use_bias=self.use_bias)

    return model_lib.BuildNetworkResult(logits=logits, extra_info=None)

  def loss_function(self, inputs, build_network_result):
    """Computes the ctc loss for the current batch of predictions.

    Args:
      inputs: the input list of the model.
      build_network_result: a BuildNetworkResult returned by build_network().

    Returns:
      The loss tensor of the model.
    """
    logits = build_network_result.logits
    actual_time_steps = inputs[2]
    probs = tf.nn.softmax(logits)
    ctc_time_steps = tf.shape(probs)[1]
    # Scale the input lengths by the ratio of output time steps to
    # max_time_steps.
    ctc_input_length = tf.to_float(
        tf.multiply(actual_time_steps, ctc_time_steps))
    ctc_input_length = tf.to_int32(
        tf.floordiv(ctc_input_length, tf.to_float(self.max_time_steps)))

    label_length = inputs[3]
    label_length = tf.to_int32(tf.squeeze(label_length))
    ctc_input_length = tf.to_int32(tf.squeeze(ctc_input_length))

    labels = inputs[1]
    sparse_labels = tf.to_int32(
        tf.keras.backend.ctc_label_dense_to_sparse(labels, label_length))
    # Transpose the first two axes and take the log; epsilon avoids log(0).
    y_pred = tf.log(
        tf.transpose(probs, perm=[1, 0, 2]) + tf.keras.backend.epsilon())

    losses = tf.expand_dims(
        tf.nn.ctc_loss(
            labels=sparse_labels,
            inputs=y_pred,
            sequence_length=ctc_input_length,
            ignore_longer_outputs_than_inputs=True),
        axis=1)
    loss = tf.reduce_mean(losses)
    return loss

  PROBABILITY_TENSOR = 'deepspeech2_prob'
  LABEL_TENSOR = 'deepspeech2_label'

  def accuracy_function(self, inputs, logits):
    """Returns the ops to evaluate the model performance."""
    # Get probabilities of each predicted class
    probs = tf.nn.softmax(logits)
    assert probs.shape.as_list()[0] == self.batch_size
    return {
        (constants.UNREDUCED_ACCURACY_OP_PREFIX + self.PROBABILITY_TENSOR):
            probs,
        (constants.UNREDUCED_ACCURACY_OP_PREFIX + self.LABEL_TENSOR):
            inputs[1],
    }

  def postprocess(self, results):
    """Postprocess results returned from model in Python.

    Decodes predictions and targets greedily, logs the mean CER and WER over
    the batch, and returns placeholder accuracy values.

    Args:
      results: a dict holding the PROBABILITY_TENSOR and LABEL_TENSOR entries.

    Returns:
      A dict with 'top_1_accuracy' and 'top_5_accuracy' both set to 0.
    """
    probs = results[self.PROBABILITY_TENSOR]

    total_wer, total_cer = 0, 0
    speech_labels = " abcdefghijklmnopqrstuvwxyz'-"
    greedy_decoder = DeepSpeechDecoder(speech_labels)

    # Evaluate the performance using WER (Word Error Rate) and CER (Character
    # Error Rate) as metrics.
    targets = results[self.LABEL_TENSOR]  # The ground truth transcript
    for i in range(self.batch_size):
      # Decode string.
      predicted_str = greedy_decoder.decode_logits(probs[i])
      expected_str = greedy_decoder.decode(targets[i])
      # An empty reference transcript would otherwise divide by zero.
      num_chars = max(len(expected_str), 1)
      num_words = max(len(expected_str.split()), 1)
      # Compute CER.
      total_cer += greedy_decoder.cer(predicted_str, expected_str) / num_chars
      # Compute WER.
      total_wer += greedy_decoder.wer(predicted_str, expected_str) / num_words

    # Get mean value
    total_cer /= self.batch_size
    total_wer /= self.batch_size

    log_fn('total CER: {:f}; total WER: {:f}; total example: {:d}.'.format(
        total_cer, total_wer, self.batch_size))
    # TODO(laigd): get rid of top_N_accuracy bindings in benchmark_cnn.py
    return {'top_1_accuracy': 0., 'top_5_accuracy': 0.}
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Import official resnet models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow.compat.v1 as tf
import datasets
from models import model as model_lib
class ImagenetResnetModel(model_lib.CNNModel):
  """Benchmark wrapper around the official ImageNet ResNet models."""

  def __init__(self, resnet_size, version=2, params=None):
    """Sets up defaults that work for Imagenet data.

    Args:
      resnet_size: The number of convolutional layers needed in the model.
      version: 1 or 2 for v1 or v2, respectively.
      params: params passed by BenchmarkCNN.
    """
    # ResNet-50 defaults to a batch of 128; every other size uses 32.
    batch_size = 128 if resnet_size == 50 else 32
    default_learning_rate = 0.0125 * batch_size / 32
    super(ImagenetResnetModel, self).__init__(
        'official_resnet_{}_v{}'.format(resnet_size, version),
        224,
        batch_size,
        default_learning_rate,
        params=params)
    self.resnet_size = resnet_size
    self.version = version

  def get_learning_rate(self, global_step, batch_size):
    """Returns a piecewise-constant learning rate schedule.

    The base rate is scaled by `batch_size / default_batch_size` and then
    multiplied by 1, 0.1, 0.01, 0.001 and 0.0001, switching at epochs 30, 60,
    80 and 90.

    Args:
      global_step: the global step tensor.
      batch_size: the batch size used to convert epochs to steps.

    Returns:
      The learning rate tensor.
    """
    steps_per_epoch = float(datasets.IMAGENET_NUM_TRAIN_IMAGES) / batch_size
    boundaries = []
    for epoch in (30, 60, 80, 90):
      boundaries.append(int(steps_per_epoch * epoch))
    scaled_rate = self.learning_rate / self.default_batch_size * batch_size
    values = [
        factor * scaled_rate for factor in (1, 0.1, 0.01, 0.001, 0.0001)
    ]
    return tf.train.piecewise_constant(global_step, boundaries, values)

  def build_network(self, images, phase_train=True, nclass=1001,
                    data_type=tf.float32):
    """Builds the official ResNet and returns float32 logits.

    Args:
      images: the input image tensor; it is cast to `data_type`.
      phase_train: True during training. False during evaluation.
      nclass: not used by this method.
      data_type: dtype the images are cast to before the forward pass.

    Returns:
      A BuildNetworkResult holding the logits cast to float32.

    Raises:
      ImportError: if the official models package is not importable.
    """
    # pylint: disable=g-import-not-at-top
    try:
      from official.r1.resnet.imagenet_main import ImagenetModel
    except ImportError:
      tf.logging.fatal('Please include tensorflow/models to the PYTHONPATH.')
      raise
    cast_images = tf.cast(images, data_type)
    # The official model dtype seems to be ignored, as the dtype it uses is
    # the dtype of the input images. Doesn't hurt to set it though.
    resnet = ImagenetModel(resnet_size=self.resnet_size,
                           resnet_version=self.version,
                           dtype=data_type)
    float_logits = tf.cast(resnet(cast_images, phase_train), tf.float32)
    return model_lib.BuildNetworkResult(logits=float_logits, extra_info=None)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Mobilenet Base Class, branched from slim for fp16 performance study."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import contextlib
import copy
import os
import tensorflow.compat.v1 as tf
from tensorflow.contrib import slim as contrib_slim
slim = contrib_slim
@slim.add_arg_scope
def apply_activation(x, name=None, activation_fn=None):
  """Applies `activation_fn` to `x`, or returns `x` if no function is given.

  Args:
    x: the input value.
    name: name forwarded to `activation_fn`.
    activation_fn: optional callable taking `x` and a `name` keyword.

  Returns:
    `activation_fn(x, name=name)` when `activation_fn` is truthy, else `x`.
  """
  if not activation_fn:
    return x
  return activation_fn(x, name=name)
def _fixed_padding(inputs, kernel_size, rate=1):
  """Pads the input along the spatial dimensions independently of input size.

  Pads the input such that if it was used in a convolution with 'VALID'
  padding, the output would have the same dimensions as if the unpadded input
  was used in a convolution with 'SAME' padding.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    kernel_size: The [height, width] kernel to be used in the conv2d or
      max_pool2d operation.
    rate: An integer, rate for atrous convolution.

  Returns:
    output: A tensor of size [batch, height_out, width_out, channels] with the
      input, either intact (if kernel_size == 1) or padded (if
      kernel_size > 1).
  """
  pad_beg = []
  pad_end = []
  # Height uses kernel_size[0] and width uses kernel_size[1], so non-square
  # kernels are padded correctly on each axis.
  for extent in (kernel_size[0], kernel_size[1]):
    effective_extent = extent + (extent - 1) * (rate - 1)
    pad_total = effective_extent - 1
    pad_beg.append(pad_total // 2)
    # Any odd remainder goes to the end side.
    pad_end.append(pad_total - pad_total // 2)
  padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]],
                                  [pad_beg[1], pad_end[1]], [0, 0]])
  return padded_inputs
def _make_divisible(v, divisor, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v
@contextlib.contextmanager
def _set_arg_scope_defaults(defaults):
"""Sets arg scope defaults for all items present in defaults.
Args:
defaults: dictionary/list of pairs, containing a mapping from
function to a dictionary of default args.
Yields:
context manager where all defaults are set.
"""
if hasattr(defaults, 'items'):
items = list(defaults.items())
else:
items = defaults
if not items:
yield
else:
func, default_arg = items[0]
with slim.arg_scope(func, **default_arg):
with _set_arg_scope_defaults(items[1:]):
yield
@slim.add_arg_scope
def depth_multiplier(output_params,
                     multiplier,
                     divisible_by=8,
                     min_depth=8,
                     **unused_kwargs):
  """Scales `output_params['num_outputs']` in place by `multiplier`.

  Args:
    output_params: dict of op parameters; modified in place when it holds a
      'num_outputs' entry and left untouched otherwise.
    multiplier: factor applied to the 'num_outputs' value.
    divisible_by: passed to `_make_divisible` as the divisor.
    min_depth: passed to `_make_divisible` as the minimum value.
    **unused_kwargs: ignored.
  """
  if 'num_outputs' in output_params:
    scaled_depth = output_params['num_outputs'] * multiplier
    output_params['num_outputs'] = _make_divisible(scaled_depth, divisible_by,
                                                   min_depth)
_Op = collections.namedtuple('Op', ['op', 'params', 'multiplier_func'])


def op(opfunc, **params):
  """Bundles an op function with its parameters and multiplier function.

  Args:
    opfunc: the callable that builds the layer.
    **params: parameters stored for `opfunc`. The optional keyword
      `multiplier_transform` selects the multiplier function and is removed
      from the stored parameters. The historical misspelling
      `multiplier_transorm` is still accepted; if both are given, the
      correctly spelled one wins.

  Returns:
    An `_Op` namedtuple (op, params, multiplier_func). `multiplier_func`
    defaults to `depth_multiplier` when neither keyword is passed.
  """
  if 'multiplier_transform' in params:
    multiplier = params.pop('multiplier_transform')
    # Drop the legacy spelling too so it never leaks into the op parameters.
    params.pop('multiplier_transorm', None)
  elif 'multiplier_transorm' in params:
    multiplier = params.pop('multiplier_transorm')
  else:
    multiplier = depth_multiplier
  return _Op(opfunc, params=params, multiplier_func=multiplier)
class NoOpScope(object):
  """Context manager that does nothing on entry or exit."""

  def __enter__(self):
    """Enters the scope; nothing is bound by `with ... as`."""
    return None

  def __exit__(self, exc_type, exc_value, traceback):
    """Leaves the scope without suppressing any exception."""
    suppress_exception = False
    return suppress_exception
def safe_arg_scope(funcs, **kwargs):
  """Returns `slim.arg_scope` with all None arguments removed.

  Useful when a None value should be interpreted as "do not overwrite this
  parameter value".

  Args:
    funcs: Functions to pass to `arg_scope`.
    **kwargs: Arguments to pass to `arg_scope`.

  Returns:
    arg_scope, or a no-op context manager when every argument is None.
  """
  kept_args = {}
  for name, value in kwargs.items():
    if value is not None:
      kept_args[name] = value
  if not kept_args:
    return NoOpScope()
  return slim.arg_scope(funcs, **kept_args)
@slim.add_arg_scope
def mobilenet_base(  # pylint: disable=invalid-name
    inputs,
    conv_defs,
    multiplier=1.0,
    final_endpoint=None,
    output_stride=None,
    use_explicit_padding=False,
    scope=None,
    is_training=False):
  """Mobilenet base network.

  Constructs a network from inputs to the given final endpoint. By default
  the network is constructed in inference mode. To create network
  in training mode use:

  with slim.arg_scope(mobilenet.training_scope()):
     logits, endpoints = mobilenet_base(...)

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    conv_defs: A dict whose 'spec' entry is a list of op(...) layers
      specifying the net architecture, with optional 'defaults' and
      'overrides' entries.
    multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    final_endpoint: The name of the last layer, for early termination. For
      V1-based networks the last layer is "layer_14", for V2: "layer_20".
    output_stride: An integer that specifies the requested ratio of input to
      output spatial resolution. If not None, then we invoke atrous
      convolution if necessary to prevent the network from reducing the
      spatial resolution of the activation maps. Allowed values are 1 or any
      even number, excluding zero. Typical values are 8 (accurate fully
      convolutional mode), 16 (fast fully convolutional mode), and 32
      (classification mode).

      NOTE- output_stride relies on all consequent operators to support
      dilated operators via "rate" parameter. This might require wrapping
      non-conv operators to operate properly.
    use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
      inputs so that the output dimensions are the same as if 'SAME' padding
      were used.
    scope: optional variable scope.
    is_training: How to setup batch_norm and other ops. Note: most of the
      time this does not need be set directly. Use mobilenet.training_scope()
      to set up training instead. This parameter is here for backward
      compatibility only. It is safe to set it to the value matching
      training_scope(is_training=...). It is also safe to explicitly set
      it to False, even if there is an outer training_scope set to training.
      (The network will be built in inference mode). If this is set to None,
      no arg_scope is added for slim.batch_norm's is_training parameter.

  Returns:
    tensor_out: output tensor.
    end_points: a set of activations for external use, for example summaries
      or losses.

  Raises:
    ValueError: depth_multiplier <= 0, or the target output_stride is not
      allowed.
  """
  if multiplier <= 0:
    raise ValueError('multiplier is not greater than zero.')

  # Set conv defs defaults and overrides.
  conv_defs_defaults = conv_defs.get('defaults', {})
  conv_defs_overrides = conv_defs.get('overrides', {})
  if use_explicit_padding:
    # Copy first so the caller's overrides are not mutated.
    conv_defs_overrides = copy.deepcopy(conv_defs_overrides)
    conv_defs_overrides[
        (slim.conv2d, slim.separable_conv2d)] = {'padding': 'VALID'}

  if output_stride is not None and (
      output_stride == 0 or (output_stride > 1 and output_stride % 2)):
    raise ValueError('Output stride must be None, 1 or a multiple of 2.')

  # a) Set the tensorflow scope
  # b) set padding to default: note we might consider removing this
  # since it is also set by mobilenet_scope
  # c) set all defaults
  # d) set all extra overrides.
  with _scope_all(scope, default_scope='Mobilenet'):
    with safe_arg_scope([slim.batch_norm], is_training=is_training):
      with _set_arg_scope_defaults(conv_defs_defaults):
        with _set_arg_scope_defaults(conv_defs_overrides):
          # The current_stride variable keeps track of the output stride of
          # the activations, i.e., the running product of convolution strides
          # up to the current network layer. This allows us to invoke atrous
          # convolution whenever applying the next convolution would result
          # in the activations having output stride larger than the target
          # output_stride.
          current_stride = 1

          # The atrous convolution rate parameter.
          rate = 1

          net = inputs
          end_points = {}
          # Maps the name scope of each layer's output to its endpoint name.
          scopes = {}
          for index, op_def in enumerate(conv_defs['spec']):
            params = dict(op_def.params)
            op_def.multiplier_func(params, multiplier)
            stride = params.get('stride', 1)
            reached_target_stride = (
                output_stride is not None and
                current_stride == output_stride)
            if reached_target_stride:
              # If we have reached the target output_stride, then we need to
              # employ atrous convolution with stride=1 and multiply the
              # atrous rate by the current unit's stride for use in
              # subsequent layers.
              layer_stride, layer_rate = 1, rate
              rate *= stride
            else:
              layer_stride, layer_rate = stride, 1
              current_stride *= stride

            # Update params.
            params['stride'] = layer_stride
            # Only insert rate to params if rate > 1.
            if layer_rate > 1:
              params['rate'] = layer_rate

            # Set padding
            if use_explicit_padding:
              if 'kernel_size' in params:
                net = _fixed_padding(net, params['kernel_size'], layer_rate)
              else:
                params['use_explicit_padding'] = True

            end_point = 'layer_%d' % (index + 1)
            try:
              net = op_def.op(net, **params)
            except Exception:
              print('Failed to create op %i: %r params: %r' %
                    (index, op_def, params))
              raise
            end_points[end_point] = net
            scopes[os.path.dirname(net.name)] = end_point
            if final_endpoint is not None and end_point == final_endpoint:
              break

          # Add all tensors that end with 'output' to endpoints.
          for operation in net.graph.get_operations():
            op_scope = os.path.dirname(operation.name)
            op_basename = os.path.basename(operation.name)
            if op_scope in scopes and operation.name.endswith('output'):
              endpoint_key = scopes[op_scope] + '/' + op_basename
              end_points[endpoint_key] = operation.outputs[0]
          return net, end_points
@contextlib.contextmanager
def _scope_all(scope, default_scope=None):
  """Opens a variable scope and re-enters its original name scope.

  Args:
    scope: the variable scope (or name) to open.
    default_scope: default name used by `tf.variable_scope`.

  Yields:
    The opened variable scope.
  """
  with tf.variable_scope(scope, default_name=default_scope) as var_scope:
    with tf.name_scope(var_scope.original_name_scope):
      yield var_scope
@slim.add_arg_scope
def mobilenet(inputs,
              num_classes=1001,
              prediction_fn=slim.softmax,
              reuse=None,
              scope='Mobilenet',
              base_only=False,
              **mobilenet_args):
  """Mobilenet model for classification, supports both V1 and V2.

  Note: default mode is inference, use mobilenet.training_scope to create
  training network.

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    num_classes: number of predicted classes. If 0 or None, the logits layer
      is omitted and the input features to the logits layer (before dropout)
      are returned instead.
    prediction_fn: a function to get predictions out of logits
      (default softmax).
    reuse: whether or not the network and its variables should be reused. To
      be able to reuse 'scope' must be given.
    scope: Optional variable_scope.
    base_only: if True will only create the base of the network (no pooling
      and no logits).
    **mobilenet_args: passed to mobilenet_base verbatim.
      - conv_defs: list of conv defs
      - multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
      - output_stride: will ensure that the last layer has at most total
      stride. If the architecture calls for more stride than that provided
      (e.g. output_stride=16, but the architecture has 5 stride=2 operators),
      it will replace output_stride with fractional convolutions using Atrous
      Convolutions.

  Returns:
    logits: the pre-softmax activations, a tensor of size
      [batch_size, num_classes]
    end_points: a dictionary from components of the network to the
      corresponding activation tensor.

  Raises:
    ValueError: Input rank is invalid.
  """
  is_training = mobilenet_args.get('is_training', False)
  input_rank = len(inputs.get_shape().as_list())
  if input_rank != 4:
    raise ValueError('Expected rank 4 input, was: %d' % input_rank)

  with tf.variable_scope(scope, 'Mobilenet', reuse=reuse) as var_scope:
    named_inputs = tf.identity(inputs, 'input')
    net, end_points = mobilenet_base(
        named_inputs, scope=var_scope, **mobilenet_args)
    if base_only:
      return net, end_points

    net = tf.identity(net, name='embedding')

    with tf.variable_scope('Logits'):
      net = global_pool(net)
      end_points['global_pool'] = net
      if not num_classes:
        return net, end_points
      net = slim.dropout(net, scope='Dropout', is_training=is_training)
      # 1 x 1 x num_classes
      # Note: legacy scope name.
      conv_logits = slim.conv2d(
          net,
          num_classes, [1, 1],
          activation_fn=None,
          normalizer_fn=None,
          biases_initializer=tf.zeros_initializer(),
          scope='Conv2d_1c_1x1')
      # Drop the two singleton spatial axes left by global pooling.
      squeezed_logits = tf.squeeze(conv_logits, [1, 2])
      logits = tf.identity(squeezed_logits, name='output')
    end_points['Logits'] = logits
    if prediction_fn:
      end_points['Predictions'] = prediction_fn(logits, 'Predictions')
  return logits, end_points
def global_pool(input_tensor, pool_op=tf.nn.avg_pool):
  """Applies avg pool to produce 1x1 output.

  NOTE: This function is functionally equivalent to reduce_mean, but it has
  baked in average pool which has better support across hardware.

  Args:
    input_tensor: input tensor
    pool_op: pooling op (avg pool is default)

  Returns:
    a tensor batch_size x 1 x 1 x depth.
  """
  static_shape = input_tensor.get_shape().as_list()
  height, width = static_shape[1], static_shape[2]
  if height is not None and width is not None:
    kernel_size = [1, height, width, 1]
  else:
    # Spatial size is not known statically, so read it from the dynamic shape.
    kernel_size = tf.convert_to_tensor(
        [1, tf.shape(input_tensor)[1],
         tf.shape(input_tensor)[2], 1])
  pooled = pool_op(
      input_tensor, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID')
  # Recover output shape, for unknown shape.
  pooled.set_shape([None, 1, 1, None])
  return pooled
def training_scope(is_training=True,
                   weight_decay=0.00004,
                   stddev=0.09,
                   dropout_keep_prob=0.8,
                   bn_decay=0.997):
  """Defines Mobilenet training scope.

  Usage:
     with tf.contrib.slim.arg_scope(mobilenet.training_scope()):
       logits, endpoints = mobilenet_v2.mobilenet(input_tensor)

     # the network created will be trainable with dropout/batch norm
     # initialized appropriately.

  Args:
    is_training: if set to False this will ensure that all customizations are
      set to non-training mode. This might be helpful for code that is reused
      across both training/evaluation, but most of the time training_scope
      with value False is not needed. If this is set to None, the parameter
      is not added to the batch_norm arg_scope.
    weight_decay: The weight decay to use for regularizing the model.
    stddev: Standard deviation for initialization, if negative uses xavier.
    dropout_keep_prob: dropout keep probability (not set if equals to None).
    bn_decay: decay for the batch norm moving averages (not set if equals to
      None).

  Returns:
    An argument scope to use via arg_scope.
  """
  # Note: do not introduce parameters that would change the inference
  # model here (for example whether to use bias), modify conv_def instead.
  if stddev < 0:
    weights_init = slim.initializers.xavier_initializer()
  else:
    weights_init = tf.truncated_normal_initializer(stddev=stddev)

  conv_and_fc_ops = [slim.conv2d, slim.fully_connected, slim.separable_conv2d]
  with slim.arg_scope(
      conv_and_fc_ops,
      weights_initializer=weights_init,
      normalizer_fn=slim.batch_norm):
    with slim.arg_scope([mobilenet_base, mobilenet], is_training=is_training):
      with safe_arg_scope(
          [slim.batch_norm], decay=bn_decay, is_training=is_training):
        with safe_arg_scope(
            [slim.dropout],
            is_training=is_training,
            keep_prob=dropout_keep_prob):
          # Set weight_decay for conv2d weights only; separable_conv2d gets
          # no regularizer.
          with slim.arg_scope(
              [slim.conv2d],
              weights_regularizer=slim.l2_regularizer(weight_decay)):
            with slim.arg_scope(
                [slim.separable_conv2d],
                weights_regularizer=None) as final_scope:
              return final_scope
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSD300 Model Configuration.
References:
Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
Cheng-Yang Fu, Alexander C. Berg
SSD: Single Shot MultiBox Detector
arXiv:1512.02325
Ported from MLPerf reference implementation:
https://github.com/mlperf/reference/tree/ssd/single_stage_detector/ssd
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing
import os
import re
import threading
import tensorflow.compat.v1 as tf
# pylint: disable=g-direct-tensorflow-import
import constants
import mlperf
import ssd_constants
from cnn_util import log_fn
from models import model as model_lib
from models import resnet_model
from tensorflow.contrib import layers as contrib_layers
from tensorflow.python.ops import variables
# Variable scope under which the backbone layers are built in `add_inference`;
# also used as the scope filter when backbone variables are collected for
# checkpoint restore in `_collect_backbone_vars`.
BACKBONE_MODEL_SCOPE_NAME = 'resnet34_backbone'
class SSD300Model(model_lib.CNNModel):
"""Single Shot Multibox Detection (SSD) model for 300x300 image datasets."""
def __init__(self, label_num=ssd_constants.NUM_CLASSES, batch_size=32,
             learning_rate=1e-3, backbone='resnet34', params=None):
  """Sets up SSD300 hyper-parameters and evaluation bookkeeping.

  Args:
    label_num: number of labels predicted per box; stored as `label_num`.
    batch_size: forwarded to the base model constructor.
    learning_rate: forwarded to the base model constructor.
    backbone: name of the backbone network; only 'resnet34' is accepted.
    params: forwarded to the base model constructor.

  Raises:
    ValueError: if `backbone` is anything other than 'resnet34'.
  """
  super(SSD300Model, self).__init__(
      'ssd300', 300, batch_size, learning_rate, params=params)

  # For COCO dataset, 80 categories + 1 background = 81 labels
  self.label_num = label_num

  # Currently only support ResNet-34 as backbone model
  if backbone != 'resnet34':
    raise ValueError('Invalid backbone model %s for SSD.' % backbone)
  mlperf.logger.log(key=mlperf.tags.BACKBONE, value=backbone)

  # Per feature map (ResNet34 layer, Conv7, Conv8_2, Conv9_2, Conv10_2,
  # Conv11_2): number of channels feeding the prediction heads.
  self.out_chan = [256, 512, 512, 256, 256, 256]
  mlperf.logger.log(
      key=mlperf.tags.LOC_CONF_OUT_CHANNELS, value=self.out_chan)

  # Per feature map: number of default boxes per cell, for the scales
  # 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4
  self.num_dboxes = [4, 6, 6, 6, 4, 4]
  mlperf.logger.log(
      key=mlperf.tags.NUM_DEFAULTS_PER_CELL, value=self.num_dboxes)

  # TODO(haoyuzhang): in order to correctly restore in replicated mode, need
  # to create a saver for each tower before graph is finalized. Use variable
  # manager for better efficiency.
  self.backbone_savers = []

  # Predictions collected during the eval stage, keyed by image id. Each value
  # is a dict holding:
  #   source_id: raw ID of image
  #   raw_shape: raw shape of image
  #   pred_box: encoded box coordinates of prediction
  #   pred_scores: scores of classes in prediction
  self.predictions = {}

  # Global step when predictions are collected.
  self.eval_global_step = 0

  # Average precision. In asynchronous eval mode, this is the latest AP we
  # get so far and may not be the results at current eval step.
  self.eval_coco_ap = 0

  # Asynchronous evaluation machinery, all created lazily. When enabled, a
  # separate process (async_eval_process) continuously pulls intermediate
  # results from the predictions queue (a multiprocessing queue), processes
  # them, and pushes final results into the results queue (another
  # multiprocessing queue). The main thread pushes messages into the
  # predictions queue and starts a separate thread that continuously pulls
  # messages from the results queue to update final results.
  #   Predictions queue message: (evaluation step, predictions)
  #   Results queue message:     (evaluation step, final results)
  self.async_eval_process = None
  self.async_eval_predictions_queue = None
  self.async_eval_results_queue = None
  self.async_eval_results_getter_thread = None

  # The MLPerf reference uses a starting lr of 1e-3 at bs=32.
  self.base_lr_batch_size = 32
def skip_final_affine_layer(self):
  """Always returns True for this model."""
  return True
def gpu_preprocess_nhwc(self, images, phase_train=True):
  """Applies SSD image preprocessing via the `ssd_dataloader` helpers.

  Color jitter is applied only when `phase_train` is true; normalization is
  always applied.

  Args:
    images: batch of images handed to the `ssd_dataloader` helpers
      (presumably NHWC, per the method name -- confirm against the caller).
    phase_train: whether training-time color jitter should be applied.

  Returns:
    The normalized (and, when training, color-jittered) images.

  Raises:
    ImportError: if `ssd_dataloader` cannot be imported.
  """
  try:
    import ssd_dataloader  # pylint: disable=g-import-not-at-top
  except ImportError:
    # Each fragment ends with a space so that adjacent string literals do not
    # run together in the final message.
    raise ImportError('To use the COCO dataset, you must clone the '
                      'repo https://github.com/tensorflow/models and add '
                      'tensorflow/models and tensorflow/models/research to '
                      'the PYTHONPATH, and compile the protobufs by '
                      'following https://github.com/tensorflow/models/blob/'
                      'master/research/object_detection/g3doc/installation.md'
                      '#protobuf-compilation ; To evaluate using COCO '
                      'metric, download and install Python COCO API from '
                      'https://github.com/cocodataset/cocoapi')

  if phase_train:
    images = ssd_dataloader.color_jitter(
        images, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05)
  images = ssd_dataloader.normalize_image(images)
  return images
def add_backbone_model(self, cnn):
  """Adds the ResNet-34 backbone, modified for SSD, to `cnn`.

  Differences from the stock ResNet-34 layout: block group 3 uses stride 1 in
  every block, and block group 4 is omitted entirely.

  Args:
    cnn: network builder exposing `conv` and `mpool`, and accepted by
      `resnet_model.residual_block`.
  """
  # --------------------------------------------------------------------------
  # Resnet-34 backbone model -- modified for SSD
  # --------------------------------------------------------------------------

  # Input 300x300, output 150x150
  cnn.conv(64, 7, 7, 2, 2, mode='SAME_RESNET', use_batch_norm=True)
  cnn.mpool(3, 3, 2, 2, mode='SAME')

  resnet34_layers = [3, 4, 6, 3]
  version = 'v1'

  # ResNet-34 block group 1
  # Input 150x150, output 75x75
  # Every block uses stride 1; no projection-shortcut flag is passed here.
  for _ in range(resnet34_layers[0]):
    resnet_model.residual_block(cnn, 64, 1, version)

  # ResNet-34 block group 2
  # Input 75x75, output 38x38
  for i in range(resnet34_layers[1]):
    stride = 2 if i == 0 else 1
    resnet_model.residual_block(cnn, 128, stride, version, i == 0)

  # ResNet-34 block group 3
  # This block group is modified: every block, including the first, uses
  # stride 1 so that the image size does not change in this group of layers
  # (the original ResNet-34 uses stride 2 for the first block).
  # Input 38x38, output 38x38
  # The stride is stated explicitly instead of reusing whatever value the
  # previous loop happened to leave in `stride`.
  group3_stride = 1
  for i in range(resnet34_layers[2]):
    resnet_model.residual_block(cnn, 256, group3_stride, version, i == 0)

  # ResNet-34 block group 4: removed final block group
  # The original ResNet-34 model would add `resnet34_layers[3]` blocks of 512
  # filters here; they are intentionally omitted for SSD.
def add_inference(self, cnn):
  """Builds the full SSD300 network on `cnn` and returns its output layer.

  The backbone is created under `BACKBONE_MODEL_SCOPE_NAME`, followed by five
  pairs of extra SSD convolutions and, for every collected feature map, one
  location head and one confidence head. The head outputs are concatenated
  into a single tensor of shape [batch_size, NUM_SSD_BOXES, 4 + label_num].

  Args:
    cnn: network builder; its `top_layer` and `top_size` are updated.

  Returns:
    The packed location/confidence tensor, also stored in `cnn.top_layer`.
  """
  cnn.use_batch_norm = True
  cnn.batch_norm_config = {
      'decay': ssd_constants.BATCH_NORM_DECAY,
      'epsilon': ssd_constants.BATCH_NORM_EPSILON,
      'scale': True,
  }

  with tf.variable_scope(BACKBONE_MODEL_SCOPE_NAME):
    self.add_backbone_model(cnn)

  # --------------------------------------------------------------------------
  # SSD additional layers
  # --------------------------------------------------------------------------

  def extra_conv(depth, k_size, stride, mode):
    """Square-kernel convolution without batch norm, Xavier-initialized."""
    return cnn.conv(
        depth, k_size, k_size, stride, stride,
        mode=mode,
        use_batch_norm=False,
        kernel_initializer=contrib_layers.xavier_initializer())

  # One row per extra stage:
  #   (1x1 conv depth, 3x3 conv depth, 3x3 conv stride, 3x3 conv padding)
  extra_stages = (
      (256, 512, 2, 'same'),   # Conv7_1, Conv7_2:   38x38 -> 19x19
      (256, 512, 2, 'same'),   # Conv8_1, Conv8_2:   19x19 -> 10x10
      (128, 256, 2, 'same'),   # Conv9_1, Conv9_2:   10x10 -> 5x5
      (128, 256, 1, 'valid'),  # Conv10_1, Conv10_2: 5x5 -> 3x3
      (128, 256, 1, 'valid'),  # Conv11_1, Conv11_2: 3x3 -> 1x1
  )

  # Activations for feature maps of different layers; the first entry is the
  # backbone output.
  self.activations = [cnn.top_layer]
  for reduce_depth, out_depth, out_stride, out_mode in extra_stages:
    extra_conv(reduce_depth, 1, 1, 'valid')
    self.activations.append(extra_conv(out_depth, 3, out_stride, out_mode))

  def head_conv(depth, feature_map, channels_in):
    """3x3 stride-1 prediction convolution: no activation, no batch norm."""
    return cnn.conv(
        depth, 3, 3, 1, 1,
        input_layer=feature_map,
        num_channels_in=channels_in,
        activation=None,
        use_batch_norm=False,
        kernel_initializer=contrib_layers.xavier_initializer())

  def flatten_per_box(raw, boxes_per_cell, values_per_box, grid):
    """Reshapes a head output to [batch_size, boxes * grid * grid, values]."""
    # [batch_size, boxes * values, grid, grid]
    #   -> [batch_size, boxes, values, grid, grid]
    split = tf.reshape(
        raw, [self.batch_size, boxes_per_cell, values_per_box, grid, grid])
    #   -> [batch_size, boxes, grid, grid, values]
    moved = tf.transpose(split, [0, 1, 3, 4, 2])
    #   -> [batch_size, boxes * grid * grid, values]
    return tf.reshape(moved, [self.batch_size, -1, values_per_box])

  self.loc = []
  self.conf = []
  for nd, ac, oc in zip(self.num_dboxes, self.activations, self.out_chan):
    loc_raw = head_conv(nd * 4, ac, oc)
    # The last static dimension of the location head is taken as the spatial
    # size and reused for the confidence head.
    # NOTE(review): this matches a channels-first conv output -- confirm.
    scale = loc_raw.get_shape()[-1]
    self.loc.append(flatten_per_box(loc_raw, nd, 4, scale))

    conf_raw = head_conv(nd * self.label_num, ac, oc)
    self.conf.append(flatten_per_box(conf_raw, nd, self.label_num, scale))

  # Shape of locs: [batch_size, NUM_SSD_BOXES, 4]
  # Shape of confs: [batch_size, NUM_SSD_BOXES, label_num]
  locs = tf.concat(self.loc, 1)
  confs = tf.concat(self.conf, 1)

  # Pack location and confidence outputs into a single output layer
  # Shape of logits: [batch_size, NUM_SSD_BOXES, 4+label_num]
  cnn.top_layer = tf.concat([locs, confs], 2)
  cnn.top_size = 4 + self.label_num

  return cnn.top_layer
def get_learning_rate(self, global_step, batch_size):
  """Builds the learning-rate schedule: linear warmup, then step decay.

  After warmup the rate is piecewise constant, decayed by 1, 0.1 and 0.01 at
  steps 160000 and 200000 rescaled by `base_lr_batch_size / batch_size`.
  During warmup it ramps linearly from 0 up to the scaled base rate.

  Args:
    global_step: tensor holding the current training step.
    batch_size: total batch size used to rescale the schedule.

  Returns:
    A tensor with the learning rate for `global_step`.
  """
  peak_lr = self.get_scaled_base_learning_rate(batch_size)

  # Defined in MLPerf reference model
  step_boundaries = [
      step * self.base_lr_batch_size // batch_size
      for step in (160000, 200000)
  ]
  stepped_lr = tf.train.piecewise_constant(
      global_step,
      step_boundaries,
      [peak_lr * factor for factor in (1, 0.1, 0.01)])

  # NOTE(review): 118287 looks like the COCO training-set size, making this a
  # 5-epoch warmup -- confirm against ssd_constants.
  warmup_steps = int(118287 / batch_size * 5)
  ramp = peak_lr * tf.cast(global_step, tf.float32)
  warmup_lr = ramp / tf.cast(warmup_steps, tf.float32)

  return tf.cond(
      global_step < warmup_steps, lambda: warmup_lr, lambda: stepped_lr)
def get_scaled_base_learning_rate(self, batch_size):
  """Returns the base learning rate scaled for `batch_size`.

  With `variable_update == 'replicated'` gradients are summed rather than
  averaged which, with the sgd and momentum optimizers, multiplies the
  effective learning rate by the number of GPUs; dividing the base rate by
  `num_gpus` cancels that out. The result is then scaled linearly by
  `batch_size / base_lr_batch_size`.

  Args:
    batch_size: Total batch-size.

  Returns:
    Base learning rate to use to create lr schedule.
  """
  lr = self.learning_rate
  if self.params.variable_update == 'replicated':
    lr = self.learning_rate / self.params.num_gpus
  batch_ratio = batch_size / self.base_lr_batch_size
  return lr * batch_ratio
def _collect_backbone_vars(self):
  """Maps official-model checkpoint names to this model's backbone variables.

  Only global variables whose scope matches `BACKBONE_MODEL_SCOPE_NAME` are
  considered, and only those whose name parses as a conv2d or batchnorm
  variable end up in the result.

  Returns:
    A dict from variable name in the checkpoint to the variable in this graph.
  """
  # Assume variables in the checkpoint are following the naming convention of
  # a model checkpoint trained with TF official model
  # TODO(haoyuzhang): the following variable name parsing is hacky and easy
  # to break if there is change in naming convention of either benchmarks or
  # official models.
  ckpt_name_to_var = {}
  for var in tf.get_collection(
      tf.GraphKeys.GLOBAL_VARIABLES, scope='.*'+ BACKBONE_MODEL_SCOPE_NAME):
    if 'conv2d' in var.name:
      # conv2d variable example (model <-- checkpoint):
      #   v/cg/conv24/conv2d/kernel:0 <-- conv2d_24/kernel
      ckpt_layer_name = 'conv2d'
      parsed = re.search(r'conv(\d+)/conv2d/(.+):', var.name)
    elif 'batchnorm' in var.name:
      # batchnorm variable example:
      #   v/cg/conv24/batchnorm25/gamma:0 <-- batch_normalization_25/gamma
      ckpt_layer_name = 'batch_normalization'
      parsed = re.search(r'batchnorm(\d+)/(.+):', var.name)
    else:
      continue

    if not parsed:
      continue
    ckpt_name = self._var_name_in_official_model_ckpt(
        ckpt_layer_name, int(parsed.group(1)), parsed.group(2))
    ckpt_name_to_var[ckpt_name] = var
  return ckpt_name_to_var
def _var_name_in_official_model_ckpt(self, layer_name, layer_id, param_name):
  """Builds a variable name following the TF official models convention.

  The result is `<layer_name>/<param_name>` when `layer_id` is not positive
  and `<layer_name>_<layer_id>/<param_name>` otherwise.
  """
  id_suffix = '_' + str(layer_id) if layer_id > 0 else ''
  return layer_name + id_suffix + '/' + param_name
def loss_function(self, inputs, build_network_result):
  """Returns the sum of the classification and localization losses.

  Also emits scalar summaries named 'box_loss' and 'class_loss'.

  Args:
    inputs: tuple whose last three elements are the ground-truth box
      locations, ground-truth labels and number of ground-truth boxes.
    build_network_result: object whose `logits` attribute is the packed
      model output.

  Returns:
    The total loss tensor (`class_loss + box_loss`).
  """
  # Unpack model output back to locations and confidence scores of predictions
  # Shape of pred_loc: [batch_size, NUM_SSD_BOXES, 4]
  # Shape of pred_label: [batch_size, NUM_SSD_BOXES, label_num]
  pred_loc, pred_label = tf.split(
      build_network_result.logits, [4, self.label_num], 2)

  # Shape of gt_loc: [batch_size, NUM_SSD_BOXES, 4]
  # Shape of gt_label: [batch_size, NUM_SSD_BOXES, 1]
  # Shape of num_gt: [batch_size]
  _, gt_loc, gt_label, num_gt = inputs
  gt_label = tf.cast(gt_label, tf.int32)

  box_loss = self._localization_loss(pred_loc, gt_loc, gt_label, num_gt)
  class_loss = self._classification_loss(pred_label, gt_label, num_gt)

  for tag, value in (('box_loss', box_loss), ('class_loss', class_loss)):
    tf.summary.scalar(tag, tf.reduce_mean(value))

  return class_loss + box_loss
def _localization_loss(self, pred_loc, gt_loc, gt_label, num_matched_boxes):
  """Computes the localization loss.

  Computes the localization loss using smooth l1 (Huber) loss, counting only
  anchors whose ground-truth label is greater than 0.

  Args:
    pred_loc: a flatten tensor that includes all predicted locations. The
      shape is [batch_size, num_anchors, 4].
    gt_loc: a tensor representing box regression targets in
      [batch_size, num_anchors, 4].
    gt_label: a tensor that represents the classification groundtruth targets.
      The shape is [batch_size, num_anchors, 1].
    num_matched_boxes: the number of anchors that are matched to a groundtruth
      targets, used as the loss normalizer. The shape is [batch_size].

  Returns:
    box_loss: a scalar tensor, the batch mean of the per-image box regression
      loss divided by `num_matched_boxes`.
  """
  # Squeeze only the trailing label axis. An axis-less squeeze would also drop
  # the batch axis whenever batch_size is 1 and leave the result to implicit
  # broadcasting.
  mask = tf.greater(tf.squeeze(gt_label, axis=2), 0)
  float_mask = tf.cast(mask, tf.float32)

  # Per-coordinate Huber loss summed over the 4 box coordinates:
  # [batch_size, num_anchors, 4] -> [batch_size, num_anchors]
  smooth_l1 = tf.reduce_sum(tf.losses.huber_loss(
      gt_loc, pred_loc,
      reduction=tf.losses.Reduction.NONE
  ), axis=2)
  smooth_l1 = tf.multiply(smooth_l1, float_mask)
  box_loss = tf.reduce_sum(smooth_l1, axis=1)

  return tf.reduce_mean(box_loss / num_matched_boxes)
def _classification_loss(self, pred_label, gt_label, num_matched_boxes):
  """Computes the classification loss.

  Computes the classification loss with hard negative mining: every positive
  anchor (label > 0) contributes, plus, per image, the negatives with the
  largest cross entropy, up to `NEGS_PER_POSITIVE` times the number of matched
  boxes (capped at `NUM_SSD_BOXES`).

  Args:
    pred_label: a flatten tensor that includes all predicted class. The shape
      is [batch_size, num_anchors, num_classes].
    gt_label: a tensor that represents the classification groundtruth targets.
      The shape is [batch_size, num_anchors, 1].
    num_matched_boxes: the number of anchors that are matched to a groundtruth
      targets. This is used as the loss normalizer.

  Returns:
    class_loss: a scalar tensor, the batch mean of the per-image
      classification loss divided by `num_matched_boxes`.
  """
  cross_entropy = tf.losses.sparse_softmax_cross_entropy(
      gt_label, pred_label, reduction=tf.losses.Reduction.NONE)

  # Squeeze only the trailing label axis. An axis-less squeeze would also drop
  # the batch axis whenever batch_size is 1 and leave the result to implicit
  # broadcasting.
  mask = tf.greater(tf.squeeze(gt_label, axis=2), 0)
  float_mask = tf.cast(mask, tf.float32)

  # Hard example mining
  neg_masked_cross_entropy = cross_entropy * (1 - float_mask)
  # argsort of a descending argsort gives each anchor's rank among the
  # negative losses of its image (0 = largest loss).
  relative_position = tf.argsort(
      tf.argsort(
          neg_masked_cross_entropy, direction='DESCENDING'))
  # tf.cast replaces the deprecated tf.to_int32.
  num_neg_boxes = tf.minimum(
      tf.cast(num_matched_boxes, tf.int32) * ssd_constants.NEGS_PER_POSITIVE,
      ssd_constants.NUM_SSD_BOXES)
  # Keep only the anchors ranked below the per-image negative budget.
  top_k_neg_mask = tf.cast(tf.less(
      relative_position,
      tf.tile(num_neg_boxes[:, tf.newaxis], (1, ssd_constants.NUM_SSD_BOXES))
  ), tf.float32)

  class_loss = tf.reduce_sum(
      tf.multiply(cross_entropy, float_mask + top_k_neg_mask), axis=1)

  return tf.reduce_mean(class_loss / num_matched_boxes)
def add_backbone_saver(self):
  """Creates a saver for the backbone variables and records it.

  The saver is built from the mapping between variable names in the backbone
  checkpoint and the variables of this SSD model, and appended to
  `backbone_savers`.
  """
  name_to_var = self._collect_backbone_vars()
  backbone_saver = tf.train.Saver(name_to_var)
  self.backbone_savers.append(backbone_saver)
def load_backbone_model(self, sess, backbone_model_path):
  """Restores `backbone_model_path` into `sess` with every backbone saver."""
  for backbone_saver in self.backbone_savers:
    backbone_saver.restore(sess, backbone_model_path)
def get_input_data_types(self, subset):
  """Returns the dtypes of the input tensors for `subset`.

  The first four dtypes are common to all subsets; 'validation' has one
  additional int32 input at the end.
  """
  dtypes = [self.data_type, tf.float32, tf.float32, tf.float32]
  if subset == 'validation':
    dtypes.append(tf.int32)
  return dtypes
def get_input_shapes(self, subset):
  """Return encoded tensor shapes for train and eval data respectively.

  Validation data shapes:
    1. images
    2. ground truth locations of boxes
    3. ground truth classes of objects in boxes
    4. source image IDs
    5. raw image shapes

  Training data shapes:
    1. images
    2. ground truth locations of boxes
    3. ground truth classes of objects in boxes
    4. numbers of objects in images
  """
  image_shape = [
      self.batch_size, self.image_size, self.image_size, self.depth]

  if subset == 'validation':
    num_boxes = ssd_constants.MAX_NUM_EVAL_BOXES
    trailing_shapes = [[self.batch_size], [self.batch_size, 3]]
  else:
    num_boxes = ssd_constants.NUM_SSD_BOXES
    trailing_shapes = [[self.batch_size]]

  box_shapes = [
      [self.batch_size, num_boxes, 4],
      [self.batch_size, num_boxes, 1],
  ]
  return [image_shape] + box_shapes + trailing_shapes
def accuracy_function(self, inputs, logits):
  """Returns the ops to measure the mean precision of the model.

  Args:
    inputs: validation input tuple; its last two elements are the source
      image ids and the raw image shapes.
    logits: packed model output of shape
      [batch_size, NUM_SSD_BOXES, 4 + label_num].

  Returns:
    A dict, keyed by `constants.UNREDUCED_ACCURACY_OP_PREFIX` plus the
    `ssd_constants` names, holding the decoded predicted boxes, the softmax
    class scores, the source ids and the raw shapes.

  Raises:
    ImportError: if `ssd_dataloader` or the `object_detection` modules cannot
      be imported.
  """
  try:
    import ssd_dataloader  # pylint: disable=g-import-not-at-top
    from object_detection.box_coders import faster_rcnn_box_coder  # pylint: disable=g-import-not-at-top
    from object_detection.core import box_coder  # pylint: disable=g-import-not-at-top
    from object_detection.core import box_list  # pylint: disable=g-import-not-at-top
  except ImportError:
    # Each fragment ends with a space so that adjacent string literals do not
    # run together in the final message.
    raise ImportError('To use the COCO dataset, you must clone the '
                      'repo https://github.com/tensorflow/models and add '
                      'tensorflow/models and tensorflow/models/research to '
                      'the PYTHONPATH, and compile the protobufs by '
                      'following https://github.com/tensorflow/models/blob/'
                      'master/research/object_detection/g3doc/installation.md'
                      '#protobuf-compilation ; To evaluate using COCO '
                      'metric, download and install Python COCO API from '
                      'https://github.com/cocodataset/cocoapi')

  # Unpack model output back to locations and confidence scores of predictions
  # pred_locs: relative locations (coordinates) of objects in all SSD boxes
  # shape: [batch_size, NUM_SSD_BOXES, 4]
  # pred_labels: confidence scores of objects being of all categories
  # shape: [batch_size, NUM_SSD_BOXES, label_num]
  pred_locs, pred_labels = tf.split(logits, [4, self.label_num], 2)

  # Decode the predicted box offsets against the default (anchor) boxes.
  ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
      scale_factors=ssd_constants.BOX_CODER_SCALES)
  anchors = box_list.BoxList(
      tf.convert_to_tensor(ssd_dataloader.DefaultBoxes()('ltrb')))
  pred_boxes = box_coder.batch_decode(
      encoded_boxes=pred_locs, box_coder=ssd_box_coder, anchors=anchors)

  pred_scores = tf.nn.softmax(pred_labels, axis=2)

  # TODO(haoyuzhang): maybe use `gt_boxes` and `gt_classes` for visualization.
  _, gt_boxes, gt_classes, source_id, raw_shape = inputs  # pylint: disable=unused-variable

  return {
      (constants.UNREDUCED_ACCURACY_OP_PREFIX +
       ssd_constants.PRED_BOXES): pred_boxes,
      (constants.UNREDUCED_ACCURACY_OP_PREFIX +
       ssd_constants.PRED_SCORES): pred_scores,
      # TODO(haoyuzhang): maybe use these values for visualization.
      # constants.UNREDUCED_ACCURACY_OP_PREFIX+'gt_boxes': gt_boxes,
      # constants.UNREDUCED_ACCURACY_OP_PREFIX+'gt_classes': gt_classes,
      (constants.UNREDUCED_ACCURACY_OP_PREFIX +
       ssd_constants.SOURCE_ID): source_id,
      (constants.UNREDUCED_ACCURACY_OP_PREFIX +
       ssd_constants.RAW_SHAPE): raw_shape
  }
def postprocess(self, results):
  """Postprocess results returned from model.

  Accumulates per-image predictions until at least
  `ssd_constants.COCO_NUM_VAL_IMAGES` have been collected, then computes COCO
  mAP, either synchronously or, when `params.collect_eval_results_async` is
  set, by handing the decoded predictions to a background process.

  Args:
    results: dict with the predicted boxes, predicted scores, source ids and
      raw shapes under the `ssd_constants` keys, plus 'global_step'.

  Returns:
    A dict with 'top_1_accuracy' and 'top_5_accuracy' entries. 'top_5_accuracy'
    is always a dummy 0. 'top_1_accuracy' is the latest known COCO AP, except
    right after predictions are queued for async eval, where it is 0. After a
    synchronous mAP computation the dict also carries every COCO metric under
    `constants.SIMPLE_VALUE_RESULT_PREFIX`.

  Raises:
    ImportError: if `coco_metric` cannot be imported.
  """
  try:
    import coco_metric  # pylint: disable=g-import-not-at-top
  except ImportError:
    # Each fragment ends with a space so that adjacent string literals do not
    # run together in the final message.
    raise ImportError('To use the COCO dataset, you must clone the '
                      'repo https://github.com/tensorflow/models and add '
                      'tensorflow/models and tensorflow/models/research to '
                      'the PYTHONPATH, and compile the protobufs by '
                      'following https://github.com/tensorflow/models/blob/'
                      'master/research/object_detection/g3doc/installation.md'
                      '#protobuf-compilation ; To evaluate using COCO '
                      'metric, download and install Python COCO API from '
                      'https://github.com/cocodataset/cocoapi')

  pred_boxes = results[ssd_constants.PRED_BOXES]
  pred_scores = results[ssd_constants.PRED_SCORES]
  # TODO(haoyuzhang): maybe use these values for visualization.
  # gt_boxes = results['gt_boxes']
  # gt_classes = results['gt_classes']
  source_id = results[ssd_constants.SOURCE_ID]
  raw_shape = results[ssd_constants.RAW_SHAPE]

  # COCO evaluation requires processing COCO_NUM_VAL_IMAGES exactly once. Due
  # to rounding errors (i.e., COCO_NUM_VAL_IMAGES % batch_size != 0), setting
  # `num_eval_epochs` to 1 is not enough and will often miss some images. We
  # expect user to set `num_eval_epochs` to >1, which will leave some unused
  # images from previous steps in `predictions`. Here we check if we are doing
  # eval at a new global step.
  if results['global_step'] > self.eval_global_step:
    self.eval_global_step = results['global_step']
    self.predictions.clear()

  # Keyed by image id, so an image seen more than once is stored only once.
  for i, sid in enumerate(source_id):
    self.predictions[int(sid)] = {
        ssd_constants.PRED_BOXES: pred_boxes[i],
        ssd_constants.PRED_SCORES: pred_scores[i],
        ssd_constants.SOURCE_ID: source_id[i],
        ssd_constants.RAW_SHAPE: raw_shape[i]
    }

  # COCO metric calculates mAP only after a full epoch of evaluation. Return
  # dummy results for top_N_accuracy to be compatible with benchmark_cnn.py.
  if len(self.predictions) >= ssd_constants.COCO_NUM_VAL_IMAGES:
    log_fn('Got results for all {:d} eval examples. Calculate mAP...'.format(
        ssd_constants.COCO_NUM_VAL_IMAGES))

    annotation_file = os.path.join(self.params.data_dir,
                                   ssd_constants.ANNOTATION_FILE)
    # Size of predictions before decoding about 15--30GB, while size after
    # decoding is 100--200MB. When using async eval mode, decoding takes
    # 20--30 seconds of main thread time but is necessary to avoid OOM during
    # inter-process communication.
    decoded_preds = coco_metric.decode_predictions(self.predictions.values())
    self.predictions.clear()

    if self.params.collect_eval_results_async:
      def _eval_results_getter():
        """Iteratively get eval results from async eval process."""
        while True:
          step, eval_results = self.async_eval_results_queue.get()
          self.eval_coco_ap = eval_results['COCO/AP']
          mlperf.logger.log_eval_accuracy(
              self.eval_coco_ap, step, self.batch_size * self.params.num_gpus,
              ssd_constants.COCO_NUM_TRAIN_IMAGES)
          if self.reached_target():
            # Reached target, clear all pending messages in predictions queue
            # and insert poison pill to stop the async eval process.
            while not self.async_eval_predictions_queue.empty():
              self.async_eval_predictions_queue.get()
            self.async_eval_predictions_queue.put('STOP')
            break

      if not self.async_eval_process:
        # Limiting the number of messages in predictions queue to prevent OOM.
        # Each message (predictions data) can potentially consume a lot of
        # memory, and normally there should only be few messages in the queue.
        # If often blocked on this, consider reducing eval frequency.
        self.async_eval_predictions_queue = multiprocessing.Queue(2)
        self.async_eval_results_queue = multiprocessing.Queue()

        # Reason to use a Process as opposed to Thread is mainly the
        # computationally intensive eval runner. Python multithreading is not
        # truly running in parallel, a runner thread would get significantly
        # delayed (or alternatively delay the main thread).
        self.async_eval_process = multiprocessing.Process(
            target=coco_metric.async_eval_runner,
            args=(self.async_eval_predictions_queue,
                  self.async_eval_results_queue,
                  annotation_file))
        self.async_eval_process.daemon = True
        self.async_eval_process.start()

        self.async_eval_results_getter_thread = threading.Thread(
            target=_eval_results_getter, args=())
        self.async_eval_results_getter_thread.daemon = True
        self.async_eval_results_getter_thread.start()

      self.async_eval_predictions_queue.put(
          (self.eval_global_step, decoded_preds))
      return {'top_1_accuracy': 0, 'top_5_accuracy': 0.}

    eval_results = coco_metric.compute_map(decoded_preds, annotation_file)
    self.eval_coco_ap = eval_results['COCO/AP']
    ret = {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
    for metric_key, metric_value in eval_results.items():
      ret[constants.SIMPLE_VALUE_RESULT_PREFIX + metric_key] = metric_value
    mlperf.logger.log_eval_accuracy(self.eval_coco_ap, self.eval_global_step,
                                    self.batch_size * self.params.num_gpus,
                                    ssd_constants.COCO_NUM_TRAIN_IMAGES)
    return ret

  log_fn('Got {:d} out of {:d} eval examples.'
         ' Waiting for the remaining to calculate mAP...'.format(
             len(self.predictions), ssd_constants.COCO_NUM_VAL_IMAGES))
  return {'top_1_accuracy': self.eval_coco_ap, 'top_5_accuracy': 0.}
def get_synthetic_inputs(self, input_name, nclass):
  """Generating synthetic data matching real data shape and type.

  Args:
    input_name: name given to the local variable holding the images.
    nclass: unused.

  Returns:
    A tuple (images, boxes, classes, nboxes) of random-uniform tensors shaped
    like the training inputs; `images` is wrapped in a non-trainable local
    variable.
  """
  image_shape = self.get_input_shapes('train')[0]
  random_images = tf.random_uniform(image_shape, dtype=self.data_type)
  images = variables.VariableV1(
      random_images,
      trainable=False,
      collections=[tf.GraphKeys.LOCAL_VARIABLES],
      name=input_name)

  num_anchors = ssd_constants.NUM_SSD_BOXES
  boxes = tf.random_uniform(
      [self.batch_size, num_anchors, 4], dtype=tf.float32)
  classes = tf.random_uniform(
      [self.batch_size, num_anchors, 1], dtype=tf.float32)
  nboxes = tf.random_uniform(
      [self.batch_size], minval=1, maxval=10, dtype=tf.float32)
  return (images, boxes, classes, nboxes)
def reached_target(self):
  """Tells whether the latest COCO AP has reached the configured target.

  Returns:
    `params.stop_at_top_1_accuracy` itself when it is falsy (no target set),
    otherwise whether `eval_coco_ap` is at least that target.
  """
  target = self.params.stop_at_top_1_accuracy
  if not target:
    return target
  return self.eval_coco_ap >= target
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment