Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
4cc1fa0f
Commit
4cc1fa0f
authored
May 01, 2017
by
Martin Wicke
Committed by
GitHub
May 01, 2017
Browse files
Merge pull request #1432 from alexgorban/master
Open source release of Attention OCR
parents
3a3c5b9d
9beaea41
Changes
24
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
823 additions
and
0 deletions
+823
-0
attention_ocr/python/sequence_layers.py
attention_ocr/python/sequence_layers.py
+422
-0
attention_ocr/python/sequence_layers_test.py
attention_ocr/python/sequence_layers_test.py
+112
-0
attention_ocr/python/train.py
attention_ocr/python/train.py
+209
-0
attention_ocr/python/utils.py
attention_ocr/python/utils.py
+80
-0
No files found.
attention_ocr/python/sequence_layers.py
0 → 100644
View file @
4cc1fa0f
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Various implementations of sequence layers for character prediction.
A 'sequence layer' is a part of a computation graph which is responsible for
producing a sequence of characters using extracted image features. There are
many reasonable ways to implement such layers. All of them are using RNNs.
This module provides implementations which use the 'attention' mechanism to
spatially 'pool' image features and also can use a previously predicted
character to predict the next (aka auto regression).
Usage:
Select one of available classes, e.g. Attention or use a wrapper function to
pick one based on your requirements:
layer_class = sequence_layers.get_layer_class(use_attention=True,
use_autoregression=True)
layer = layer_class(net, labels_one_hot, model_params, method_params)
char_logits = layer.create_logits()
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
import
abc
import
logging
import
numpy
as
np
import
tensorflow
as
tf
from
tensorflow.contrib
import
slim
def orthogonal_initializer(shape, dtype=tf.float32, *args, **kwargs):
  """Generates orthonormal matrices with random values.

  Orthonormal initialization is important for RNNs:
    http://arxiv.org/abs/1312.6120
    http://smerity.com/articles/2016/orthogonal_init.html

  For non-square shapes the returned matrix is semi-orthonormal: whichever of
  rows/columns is larger ends up holding the orthonormal vectors. The matrix
  is produced via SVD of a random Gaussian matrix (the Lasagne/Theano recipe);
  both u and v from the SVD are orthogonal, we simply pick the factor whose
  shape matches.

  Args:
    shape: a shape of the tensor matrix to initialize.
    dtype: a dtype of the initialized tensor.
    *args: not used.
    **kwargs: not used.

  Returns:
    An initialized tensor (a tf constant of the requested shape/dtype).
  """
  del args, kwargs  # Unused, accepted only for initializer-API compatibility.
  # Collapse all trailing dimensions into one so we can run a 2-D SVD.
  flat_shape = (shape[0], np.prod(shape[1:]))
  gaussian = np.random.randn(*flat_shape)
  u, _, v = np.linalg.svd(gaussian, full_matrices=False)
  # Exactly one of u / v has the flattened target shape; take that one.
  orthonormal = u if u.shape == flat_shape else v
  return tf.constant(orthonormal.reshape(shape), dtype=dtype)
# Hyper parameters shared by all sequence layers. The typename now matches the
# bound name (it previously read 'SequenceLogitsParams', which made reprs and
# pickled instances misleading).
SequenceLayerParams = collections.namedtuple('SequenceLayerParams', [
    'num_lstm_units',        # Width of the LSTM decoder cell.
    'weight_decay',          # L2 regularization strength for softmax weights.
    'lstm_state_clip_value'  # Cell-state clipping value passed to LSTMCell.
])
class SequenceLayerBase(object):
  """A base abstract class for all sequence layers.

  A child class has to define following methods:
    get_train_input
    get_eval_input
    unroll_cell
  """
  __metaclass__ = abc.ABCMeta

  def __init__(self, net, labels_one_hot, model_params, method_params):
    """Stores argument in member variable for further use.

    Args:
      net: A tensor with shape [batch_size, num_features, feature_size] which
        contains some extracted image features.
      labels_one_hot: An optional (can be None) ground truth labels for the
        input features. Is a tensor with shape
        [batch_size, seq_length, num_char_classes]
      model_params: A namedtuple with model parameters (model.ModelParams).
      method_params: A SequenceLayerParams instance.
    """
    self._params = model_params
    self._mparams = method_params
    self._net = net
    self._labels_one_hot = labels_one_hot
    self._batch_size = net.get_shape().dims[0].value

    # Char-logit tensors are created lazily (inside the LSTM decoder loop) and
    # memoized per character index in this dict.
    self._char_logits = {}
    regularizer = slim.l2_regularizer(self._mparams.weight_decay)
    # Shared softmax projection from LSTM outputs to character classes.
    self._softmax_w = slim.model_variable(
        'softmax_w',
        [self._mparams.num_lstm_units, self._params.num_char_classes],
        initializer=orthogonal_initializer,
        regularizer=regularizer)
    self._softmax_b = slim.model_variable(
        'softmax_b', [self._params.num_char_classes],
        initializer=tf.zeros_initializer(),
        regularizer=regularizer)

  @abc.abstractmethod
  def get_train_input(self, prev, i):
    """Returns a sample to be used to predict a character during training.

    This function is used as a loop_function for an RNN decoder.

    Args:
      prev: output tensor from previous step of the RNN. A tensor with shape:
        [batch_size, num_char_classes].
      i: index of a character in the output sequence.

    Returns:
      A tensor with shape [batch_size, ?] - depth depends on implementation
      details.
    """
    pass

  @abc.abstractmethod
  def get_eval_input(self, prev, i):
    """Returns a sample to be used to predict a character during inference.

    This function is used as a loop_function for an RNN decoder.

    Args:
      prev: output tensor from previous step of the RNN. A tensor with shape:
        [batch_size, num_char_classes].
      i: index of a character in the output sequence.

    Returns:
      A tensor with shape [batch_size, ?] - depth depends on implementation
      details.
    """
    raise AssertionError('Not implemented')

  @abc.abstractmethod
  def unroll_cell(self, decoder_inputs, initial_state, loop_function, cell):
    """Unrolls an RNN cell for all inputs.

    This is a placeholder to call some RNN decoder. It has a similar to
    tf.seq2seq.rnn_decode interface.

    Args:
      decoder_inputs: A list of 2D Tensors* [batch_size x input_size]. In fact,
        most of existing decoders in presence of a loop_function use only the
        first element to determine batch_size and length of the list to
        determine number of steps.
      initial_state: 2D Tensor with shape [batch_size x cell.state_size].
      loop_function: function will be applied to the i-th output in order to
        generate the i+1-st input (see self.get_input).
      cell: rnn_cell.RNNCell defining the cell function and size.

    Returns:
      A tuple of the form (outputs, state), where:
        outputs: A list of character logits of the same length as
          decoder_inputs of 2D Tensors with shape [batch_size x num_characters].
        state: The state of each cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
    """
    pass

  def is_training(self):
    """Returns True if the layer is created for training stage."""
    # Ground truth labels are only supplied at training time.
    return self._labels_one_hot is not None

  def char_logit(self, inputs, char_index):
    """Creates logits for a character if required.

    Args:
      inputs: A tensor with shape [batch_size, ?] (depth is implementation
        dependent).
      char_index: A integer index of a character in the output sequence.

    Returns:
      A tensor with shape [batch_size, num_char_classes]
    """
    if char_index not in self._char_logits:
      self._char_logits[char_index] = tf.nn.xw_plus_b(inputs, self._softmax_w,
                                                      self._softmax_b)
    return self._char_logits[char_index]

  def char_one_hot(self, logit):
    """Creates one hot encoding for a logit of a character.

    Args:
      logit: A tensor with shape [batch_size, num_char_classes].

    Returns:
      A tensor with shape [batch_size, num_char_classes]
    """
    prediction = tf.argmax(logit, dimension=1)
    return slim.one_hot_encoding(prediction, self._params.num_char_classes)

  def get_input(self, prev, i):
    """A wrapper for get_train_input and get_eval_input.

    Args:
      prev: output tensor from previous step of the RNN. A tensor with shape:
        [batch_size, num_char_classes].
      i: index of a character in the output sequence.

    Returns:
      A tensor with shape [batch_size, ?] - depth depends on implementation
      details.
    """
    fetch = self.get_train_input if self.is_training() else self.get_eval_input
    return fetch(prev, i)

  def create_logits(self):
    """Creates character sequence logits for a net specified in the constructor.

    A "main" method for the sequence layer which glues together all pieces.

    Returns:
      A tensor with shape [batch_size, seq_length, num_char_classes].
    """
    with tf.variable_scope('LSTM'):
      first_label = self.get_input(prev=None, i=0)
      # Only the first input matters: the decoder derives the batch size from
      # it and uses the list length as the number of unroll steps.
      decoder_inputs = [first_label] + [None] * (self._params.seq_length - 1)
      lstm_cell = tf.contrib.rnn.LSTMCell(
          self._mparams.num_lstm_units,
          use_peepholes=False,
          cell_clip=self._mparams.lstm_state_clip_value,
          state_is_tuple=True,
          initializer=orthogonal_initializer)
      lstm_outputs, _ = self.unroll_cell(
          decoder_inputs=decoder_inputs,
          initial_state=lstm_cell.zero_state(self._batch_size, tf.float32),
          loop_function=self.get_input,
          cell=lstm_cell)

    with tf.variable_scope('logits'):
      # Project each per-step LSTM output to character logits and add a time
      # axis so they can be concatenated into [batch, seq, classes].
      logits_list = [
          tf.expand_dims(self.char_logit(step_output, step), dim=1)
          for step, step_output in enumerate(lstm_outputs)
      ]

    return tf.concat(logits_list, 1)
class NetSlice(SequenceLayerBase):
  """A layer which uses a subset of image features to predict each character.
  """

  def __init__(self, *args, **kwargs):
    super(NetSlice, self).__init__(*args, **kwargs)
    # All-zeros "previous character" placeholder, used where no prediction or
    # ground truth is available yet.
    self._zero_label = tf.zeros(
        [self._batch_size, self._params.num_char_classes])

  def get_image_feature(self, char_index):
    """Returns a subset of image features for a character.

    Args:
      char_index: an index of a character.

    Returns:
      A tensor with shape [batch_size, ?]. The output depth depends on the
      depth of input net.
    """
    batch_size, features_num, _ = [d.value for d in self._net.get_shape()]
    slice_len = int(features_num / self._params.seq_length)
    # In case when features_num != seq_length, we just pick a subset of image
    # features, this choice is arbitrary and there is no intuitive geometrical
    # interpretation. If features_num is not dividable by seq_length there will
    # be unused image features.
    net_slice = self._net[:, char_index:char_index + slice_len, :]
    feature = tf.reshape(net_slice, [batch_size, -1])
    logging.debug('Image feature: %s', feature)
    return feature

  def get_eval_input(self, prev, i):
    """See SequenceLayerBase.get_eval_input for details."""
    del prev  # Previous prediction is not used without autoregression.
    return self.get_image_feature(i)

  def get_train_input(self, prev, i):
    """See SequenceLayerBase.get_train_input for details."""
    # Training and evaluation inputs are identical for this layer.
    return self.get_eval_input(prev, i)

  def unroll_cell(self, decoder_inputs, initial_state, loop_function, cell):
    """See SequenceLayerBase.unroll_cell for details."""
    return tf.contrib.legacy_seq2seq.rnn_decoder(
        decoder_inputs=decoder_inputs,
        initial_state=initial_state,
        cell=cell,
        loop_function=self.get_input)
class NetSliceWithAutoregression(NetSlice):
  """A layer similar to NetSlice, but it also uses auto regression.

  The "auto regression" means that we use network output for previous character
  as a part of input for the current character.
  """
  # NOTE: the original version declared an __init__ that only forwarded to
  # super(); it was redundant and has been removed — NetSlice.__init__ runs
  # unchanged.

  def get_eval_input(self, prev, i):
    """See SequenceLayerBase.get_eval_input for details."""
    if i == 0:
      # No previous prediction exists for the first character.
      prev = self._zero_label
    else:
      # Feed back the one-hot encoding of the previous predicted character.
      logit = self.char_logit(prev, char_index=i - 1)
      prev = self.char_one_hot(logit)
    image_feature = self.get_image_feature(char_index=i)
    return tf.concat([image_feature, prev], 1)

  def get_train_input(self, prev, i):
    """See SequenceLayerBase.get_train_input for details."""
    if i == 0:
      prev = self._zero_label
    else:
      # During training use the ground-truth previous character instead of the
      # model's own (possibly wrong) prediction (teacher forcing).
      prev = self._labels_one_hot[:, i - 1, :]
    image_feature = self.get_image_feature(i)
    return tf.concat([image_feature, prev], 1)
class Attention(SequenceLayerBase):
  """A layer which uses attention mechanism to select image features."""

  def __init__(self, *args, **kwargs):
    super(Attention, self).__init__(*args, **kwargs)
    # Constant zero "previous character" input; the attention decoder pulls
    # image features itself, so the RNN input carries no information here.
    self._zero_label = tf.zeros(
        [self._batch_size, self._params.num_char_classes])

  def get_eval_input(self, prev, i):
    """See SequenceLayerBase.get_eval_input for details."""
    del prev, i
    # The attention_decoder will fetch image features from the net, no need for
    # extra inputs.
    return self._zero_label

  def get_train_input(self, prev, i):
    """See SequenceLayerBase.get_train_input for details."""
    return self.get_eval_input(prev, i)

  def unroll_cell(self, decoder_inputs, initial_state, loop_function, cell):
    """See SequenceLayerBase.unroll_cell for details."""
    return tf.contrib.legacy_seq2seq.attention_decoder(
        decoder_inputs=decoder_inputs,
        initial_state=initial_state,
        attention_states=self._net,
        cell=cell,
        loop_function=self.get_input)
class AttentionWithAutoregression(Attention):
  """A layer which uses both attention and auto regression."""
  # NOTE: the original version declared an __init__ that only forwarded to
  # super(); it was redundant and has been removed — Attention.__init__ runs
  # unchanged.

  def get_train_input(self, prev, i):
    """See SequenceLayerBase.get_train_input for details."""
    if i == 0:
      return self._zero_label
    # TODO(gorban): update to gradually introduce gt labels.
    # Teacher forcing: feed the ground-truth previous character.
    return self._labels_one_hot[:, i - 1, :]

  def get_eval_input(self, prev, i):
    """See SequenceLayerBase.get_eval_input for details."""
    if i == 0:
      return self._zero_label
    # Feed back the one-hot encoding of the previous predicted character.
    logit = self.char_logit(prev, char_index=i - 1)
    return self.char_one_hot(logit)
def get_layer_class(use_attention, use_autoregression):
  """A convenience function to get a layer class based on requirements.

  Args:
    use_attention: if True a returned class will use attention.
    use_autoregression: if True a returned class will use auto regression.

  Returns:
    One of available sequence layers (child classes for SequenceLayerBase).
  """
  # Dispatch on the (attention, autoregression) pair; bool() normalizes any
  # truthy/falsy argument, so all four combinations are always covered.
  layer_class = {
      (True, True): AttentionWithAutoregression,
      (True, False): Attention,
      (False, True): NetSliceWithAutoregression,
      (False, False): NetSlice,
  }[(bool(use_attention), bool(use_autoregression))]
  logging.debug('Use %s as a layer class', layer_class.__name__)
  return layer_class
attention_ocr/python/sequence_layers_test.py
0 → 100644
View file @
4cc1fa0f
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for sequence_layers."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
numpy
as
np
import
tensorflow
as
tf
from
tensorflow.contrib
import
slim
import
model
import
sequence_layers
def fake_net(batch_size, num_features, feature_size):
  """Creates a random float32 tensor shaped like extracted image features."""
  random_values = np.random.uniform(
      size=(batch_size, num_features, feature_size))
  return tf.convert_to_tensor(random_values, dtype=tf.float32)
def fake_labels(batch_size, seq_length, num_char_classes):
  """Creates random one-hot ground-truth labels for testing."""
  # Random integer class ids in [0, num_char_classes).
  random_ids = np.random.randint(
      low=0, high=num_char_classes, size=(batch_size, seq_length))
  labels_np = tf.convert_to_tensor(random_ids)
  return slim.one_hot_encoding(labels_np, num_classes=num_char_classes)
def create_layer(layer_class, batch_size, seq_length, num_char_classes):
  """Builds a sequence layer of the given class on top of fake data."""
  model_params = model.ModelParams(
      num_char_classes=num_char_classes,
      seq_length=seq_length,
      num_views=1,
      null_code=num_char_classes)
  # More features than sequence steps on purpose, to exercise the slicing.
  net = fake_net(
      batch_size=batch_size, num_features=seq_length * 5, feature_size=6)
  labels_one_hot = fake_labels(batch_size, seq_length, num_char_classes)
  layer_params = sequence_layers.SequenceLayerParams(
      num_lstm_units=10, weight_decay=0.00004, lstm_state_clip_value=10.0)
  return layer_class(net, labels_one_hot, model_params, layer_params)
class SequenceLayersTest(tf.test.TestCase):
  """Shape tests for all available sequence layer implementations."""

  def _check_char_logits_shape(self, layer_class):
    """Builds `layer_class` on fake data and verifies the logits shape.

    The four public tests previously duplicated this body verbatim; it is
    factored out so each test is a one-liner.

    Args:
      layer_class: one of the sequence_layers layer classes to exercise.
    """
    batch_size = 2
    seq_length = 4
    num_char_classes = 3

    layer = create_layer(layer_class, batch_size, seq_length, num_char_classes)
    char_logits = layer.create_logits()

    self.assertEqual(
        tf.TensorShape([batch_size, seq_length, num_char_classes]),
        char_logits.get_shape())

  def test_net_slice_char_logits_with_correct_shape(self):
    self._check_char_logits_shape(sequence_layers.NetSlice)

  def test_net_slice_with_autoregression_char_logits_with_correct_shape(self):
    self._check_char_logits_shape(sequence_layers.NetSliceWithAutoregression)

  def test_attention_char_logits_with_correct_shape(self):
    self._check_char_logits_shape(sequence_layers.Attention)

  def test_attention_with_autoregression_char_logits_with_correct_shape(self):
    self._check_char_logits_shape(sequence_layers.AttentionWithAutoregression)
# Allow running this test module directly.
if __name__ == '__main__':
  tf.test.main()
attention_ocr/python/train.py
0 → 100644
View file @
4cc1fa0f
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to train the Attention OCR model.
A simple usage example:
python train.py
"""
import
collections
import
logging
import
tensorflow
as
tf
from
tensorflow.contrib
import
slim
from
tensorflow
import
app
from
tensorflow.python.platform
import
flags
from
tensorflow.contrib.tfprof
import
model_analyzer
import
data_provider
import
common_flags
FLAGS = flags.FLAGS
common_flags.define()

# Command-line flags specific to the training script.
# yapf: disable
flags.DEFINE_integer('task', 0,
                     'The Task ID. This value is used when training with '
                     'multiple workers to identify each worker.')

flags.DEFINE_integer('ps_tasks', 0,
                     'The number of parameter servers. If the value is 0, then'
                     ' the parameters are handled locally by the worker.')

flags.DEFINE_integer('save_summaries_secs', 60,
                     'The frequency with which summaries are saved, in '
                     'seconds.')

flags.DEFINE_integer('save_interval_secs', 600,
                     'Frequency in seconds of saving the model.')

flags.DEFINE_integer('max_number_of_steps', int(1e10),
                     'The maximum number of gradient steps.')

flags.DEFINE_string('checkpoint_inception', '',
                    'Checkpoint to recover inception weights from.')

flags.DEFINE_float('clip_gradient_norm', 2.0,
                   'If greater than 0 then the gradients would be clipped by '
                   'it.')

flags.DEFINE_bool('sync_replicas', False,
                  'If True will synchronize replicas during training.')

flags.DEFINE_integer('replicas_to_aggregate', 1,
                     'The number of gradients updates before updating params.')

flags.DEFINE_integer('total_num_replicas', 1,
                     'Total number of worker replicas.')

flags.DEFINE_integer('startup_delay_steps', 15,
                     'Number of training steps between replicas startup.')

flags.DEFINE_boolean('reset_train_dir', False,
                     'If true will delete all files in the train_log_dir')

flags.DEFINE_boolean('show_graph_stats', False,
                     'Output model size stats to stderr.')
# yapf: enable
# Hyper parameters consumed by the training loop; values are read from flags
# in get_training_hparams().
TrainingHParams = collections.namedtuple('TrainingHParams', [
    'learning_rate',
    'optimizer',
    'momentum',
    'use_augment_input',
])
def get_training_hparams():
  """Bundles the training-related command line flags into TrainingHParams."""
  return TrainingHParams(
      learning_rate=FLAGS.learning_rate,
      optimizer=FLAGS.optimizer,
      momentum=FLAGS.momentum,
      use_augment_input=FLAGS.use_augment_input)
def create_optimizer(hparams):
  """Creates an optimizer based on the specified hyper parameters.

  Args:
    hparams: a TrainingHParams instance; hparams.optimizer selects the
      optimizer ('momentum', 'adam', 'adadelta', 'adagrad' or 'rmsprop'),
      hparams.learning_rate is passed to all of them and hparams.momentum is
      used by the momentum-based ones.

  Returns:
    A tf.train Optimizer instance.

  Raises:
    ValueError: if hparams.optimizer is not one of the supported names.
  """
  if hparams.optimizer == 'momentum':
    optimizer = tf.train.MomentumOptimizer(
        hparams.learning_rate, momentum=hparams.momentum)
  elif hparams.optimizer == 'adam':
    optimizer = tf.train.AdamOptimizer(hparams.learning_rate)
  elif hparams.optimizer == 'adadelta':
    optimizer = tf.train.AdadeltaOptimizer(hparams.learning_rate)
  elif hparams.optimizer == 'adagrad':
    optimizer = tf.train.AdagradOptimizer(hparams.learning_rate)
  elif hparams.optimizer == 'rmsprop':
    optimizer = tf.train.RMSPropOptimizer(
        hparams.learning_rate, momentum=hparams.momentum)
  else:
    # Previously an unknown name fell through to `return optimizer` and
    # crashed with an opaque UnboundLocalError; fail fast with a clear error.
    raise ValueError('Unsupported optimizer: %s' % hparams.optimizer)
  return optimizer
def train(loss, init_fn, hparams):
  """Wraps slim.learning.train to run a training loop.

  Args:
    loss: a loss tensor
    init_fn: A callable to be executed after all other initialization is done.
    hparams: a model hyper parameters
  """
  optimizer = create_optimizer(hparams)

  if FLAGS.sync_replicas:
    replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
    optimizer = tf.LegacySyncReplicasOptimizer(
        opt=optimizer,
        replicas_to_aggregate=FLAGS.replicas_to_aggregate,
        replica_id=replica_id,
        total_num_replicas=FLAGS.total_num_replicas)
    sync_optimizer = optimizer
  else:
    sync_optimizer = None
  # Both the sync and async paths use a zero startup delay.
  # NOTE(review): the --startup_delay_steps flag is declared but never
  # consulted here -- confirm whether that is intentional.
  startup_delay_steps = 0

  train_op = slim.learning.create_train_op(
      loss,
      optimizer,
      summarize_gradients=True,
      clip_gradient_norm=FLAGS.clip_gradient_norm)

  slim.learning.train(
      train_op=train_op,
      logdir=FLAGS.train_log_dir,
      graph=loss.graph,
      master=FLAGS.master,
      is_chief=(FLAGS.task == 0),
      number_of_steps=FLAGS.max_number_of_steps,
      save_summaries_secs=FLAGS.save_summaries_secs,
      save_interval_secs=FLAGS.save_interval_secs,
      startup_delay_steps=startup_delay_steps,
      sync_optimizer=sync_optimizer,
      init_fn=init_fn)
def prepare_training_dir():
  """Creates, resets or reuses FLAGS.train_log_dir before training starts."""
  if not tf.gfile.Exists(FLAGS.train_log_dir):
    logging.info('Create a new training directory %s', FLAGS.train_log_dir)
    tf.gfile.MakeDirs(FLAGS.train_log_dir)
  elif FLAGS.reset_train_dir:
    # Wipe the previous run's logs/checkpoints and start from scratch.
    logging.info('Reset the training directory %s', FLAGS.train_log_dir)
    tf.gfile.DeleteRecursively(FLAGS.train_log_dir)
    tf.gfile.MakeDirs(FLAGS.train_log_dir)
  else:
    logging.info('Use already existing training directory %s',
                 FLAGS.train_log_dir)
def calculate_graph_metrics():
  """Returns the total number of trainable parameters in the default graph."""
  # print_model_analysis also writes a per-variable breakdown to stderr.
  param_stats = model_analyzer.print_model_analysis(
      tf.get_default_graph(),
      tfprof_options=model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
  return param_stats.total_parameters
def main(_):
  """Builds the model from flags and runs the training loop."""
  prepare_training_dir()

  dataset = common_flags.create_dataset(split_name=FLAGS.split_name)
  model = common_flags.create_model(dataset.num_char_classes,
                                    dataset.max_sequence_length,
                                    dataset.num_of_views, dataset.null_code)
  hparams = get_training_hparams()

  # If ps_tasks is zero, the local device is used. When using multiple
  # (non-local) replicas, the ReplicaDeviceSetter distributes the variables
  # across the different devices.
  device_setter = tf.train.replica_device_setter(
      FLAGS.ps_tasks, merge_devices=True)
  with tf.device(device_setter):
    data = data_provider.get_data(
        dataset,
        FLAGS.batch_size,
        augment=hparams.use_augment_input,
        central_crop_size=common_flags.get_crop_size())
    endpoints = model.create_base(data.images, data.labels_one_hot)
    total_loss = model.create_loss(data, endpoints)
    model.create_summaries(data, endpoints, dataset.charset, is_training=True)
    init_fn = model.create_init_fn_to_restore(FLAGS.checkpoint,
                                              FLAGS.checkpoint_inception)
    if FLAGS.show_graph_stats:
      logging.info('Total number of weights in the graph: %s',
                   calculate_graph_metrics())
    train(total_loss, init_fn, hparams)
# Script entry point: let tf.app parse flags and dispatch to main().
if __name__ == '__main__':
  app.run()
attention_ocr/python/utils.py
0 → 100644
View file @
4cc1fa0f
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions to support building models for StreetView text transcription."""
import
tensorflow
as
tf
from
tensorflow.contrib
import
slim
def logits_to_log_prob(logits):
  """Computes log probabilities using numerically stable trick.

  This uses two numerical stability tricks:
  1) softmax(x) = softmax(x - c) where c is a constant applied to all
  arguments. If we set c = max(x) then the softmax is more numerically
  stable.
  2) log softmax(x) is not numerically stable, but we can stabilize it
  by using the identity log softmax(x) = x - log sum exp(x)

  Args:
    logits: Tensor of arbitrary shape whose last dimension contains logits.

  Returns:
    A tensor of the same shape as the input, but with corresponding log
    probabilities.
  """
  with tf.variable_scope('log_probabilities'):
    # Reduce over the last axis, keeping it so results broadcast back.
    last_axis = len(logits.shape.as_list()) - 1
    max_logits = tf.reduce_max(
        logits, reduction_indices=last_axis, keep_dims=True)
    # Trick 1: shift by the max for a numerically safe softmax.
    safe_logits = tf.subtract(logits, max_logits)
    sum_exp = tf.reduce_sum(
        tf.exp(safe_logits), reduction_indices=last_axis, keep_dims=True)
    # Trick 2: log softmax(x) = x - log sum exp(x).
    log_probs = tf.subtract(safe_logits, tf.log(sum_exp))
  return log_probs
def variables_to_restore(scope=None, strip_scope=False):
  """Returns a dictionary of variables to restore.

  The previous docstring documented parameters that do not exist
  (`methods_names`, `model_scope`); it now describes the actual signature.

  Args:
    scope: an optional variable scope name; when set, only variables whose
      names start with this scope are returned.
    strip_scope: if True, the scope prefix (including the trailing '/') is
      removed from the returned variable names. Ignored when scope is None.

  Returns:
    a dictionary mapping variable names to variables for restore.
  """
  if scope:
    variable_map = {}
    method_variables = slim.get_variables_to_restore(include=[scope])
    for var in method_variables:
      if strip_scope:
        # Drop '<scope>/' from the front of the op name.
        var_name = var.op.name[len(scope) + 1:]
      else:
        var_name = var.op.name
      variable_map[var_name] = var
    return variable_map
  else:
    return {v.op.name: v for v in slim.get_variables_to_restore()}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment