ModelZoo / ResNet50_tensorflow / Commits / 4364390a

Commit 4364390a, authored Nov 13, 2017 by Ivan Bogatyy, committed by calberti, Nov 13, 2017

Release DRAGNN bulk networks (#2785)

* Release DRAGNN bulk networks

parent 638fd759
Showing 20 changed files with 1798 additions and 226 deletions (+1798, -226)
research/syntaxnet/dragnn/python/bulk_component_test.py          (+11, -3)
research/syntaxnet/dragnn/python/component.py                    (+19, -10)
research/syntaxnet/dragnn/python/composite_optimizer.py          (+16, -19)
research/syntaxnet/dragnn/python/composite_optimizer_test.py     (+12, -10)
research/syntaxnet/dragnn/python/dragnn_model_saver.py           (+85, -0)
research/syntaxnet/dragnn/python/dragnn_model_saver_lib.py       (+244, -0)
research/syntaxnet/dragnn/python/dragnn_model_saver_lib_test.py  (+131, -0)
research/syntaxnet/dragnn/python/dragnn_ops.py                   (+5, -5)
research/syntaxnet/dragnn/python/graph_builder.py                (+80, -13)
research/syntaxnet/dragnn/python/graph_builder_test.py           (+66, -16)
research/syntaxnet/dragnn/python/lexicon.py                      (+2, -1)
research/syntaxnet/dragnn/python/lexicon_test.py                 (+8, -6)
research/syntaxnet/dragnn/python/network_units.py                (+642, -126)
research/syntaxnet/dragnn/python/network_units_test.py           (+266, -3)
research/syntaxnet/dragnn/python/perf_test_data/master-spec      (+171, -0)
research/syntaxnet/dragnn/python/perf_test_data/params           (+0, -0)
research/syntaxnet/dragnn/python/perf_test_data/sample_docs.pickle  (+0, -0)
research/syntaxnet/dragnn/python/render_spec_with_graphviz_test.py  (+1, -1)
research/syntaxnet/dragnn/python/sentence_io.py                  (+30, -7)
research/syntaxnet/dragnn/python/sentence_io_test.py             (+9, -6)
research/syntaxnet/dragnn/python/bulk_component_test.py

@@ -41,9 +41,6 @@ from dragnn.python import dragnn_ops
from dragnn.python import network_units
from syntaxnet import sentence_pb2
import dragnn.python.load_dragnn_cc_impl
import syntaxnet.load_parser_ops

FLAGS = tf.app.flags.FLAGS

@@ -473,6 +470,17 @@ class BulkComponentTest(test_util.TensorFlowTestCase):
        [2], [-1], [-1], [-1], [2], [3], [-1], [-1]])

  def testBuildLossFailsOnNoExamples(self):
    with tf.Graph().as_default():
      logits = tf.constant([[0.5], [-0.5], [0.5], [-0.5]])
      gold = tf.constant([-1, -1, -1, -1])
      result = bulk_component.build_cross_entropy_loss(logits, gold)

      # Expect loss computation to generate a runtime error due to the gold
      # tensor containing no valid examples.
      with self.test_session() as sess:
        with self.assertRaises(tf.errors.InvalidArgumentError):
          sess.run(result)


if __name__ == '__main__':
  googletest.main()
research/syntaxnet/dragnn/python/component.py

@@ -46,9 +46,8 @@ class MasterState(object):
  """Simple utility to encapsulate tensors associated with the master state.

  Attributes:
    handle: string tensor handle to the underlying nlp_saft::dragnn::MasterState
    current_batch_size: int tensor containing the batch size following the most
      recent MasterState::Reset().
    handle: string tensor handle to the underlying ComputeSession.
    current_batch_size: int tensor containing the current batch size.
  """

  def __init__(self, handle, current_batch_size):

@@ -390,7 +389,11 @@ class DynamicComponentBuilder(ComponentBuilderBase):
      correctly predicted actions, and the total number of actions.
    """
    logging.info('Building component: %s', self.spec.name)
    with tf.control_dependencies(
        [tf.assert_equal(self.training_beam_size, 1)]):
    # Add 0 to training_beam_size to disable eager static evaluation.
    # This is possible because tensorflow's constant_value does not
    # propagate arithmetic operations.
    with tf.control_dependencies(
        [tf.assert_equal(self.training_beam_size + 0, 1)]):
      stride = state.current_batch_size * self.training_beam_size

      cost = tf.constant(0.)

@@ -462,10 +465,10 @@ class DynamicComponentBuilder(ComponentBuilderBase):
      # Saves completed arrays and return final state and cost.
      state.handle = output[0]
      cost = output[1]
      correct = output[2]
      total = output[3]
      arrays = output[4:]
      cost = output[1]

      # Store handles to the final output for use in subsequent tasks.
      network_state = network_states[self.name]

@@ -475,6 +478,9 @@ class DynamicComponentBuilder(ComponentBuilderBase):
            array=arrays[index])

      # Normalize the objective by the total # of steps taken.
      # Note: Total could be zero by a number of reasons, including:
      # * Oracle labels not being emitted.
      # * No steps being taken if component is terminal at the start of a batch.
      with tf.control_dependencies([tf.assert_greater(total, 0)]):
        cost /= tf.to_float(total)

@@ -524,11 +530,14 @@ class DynamicComponentBuilder(ComponentBuilderBase):
          during_training=during_training)
      next_arrays = update_tensor_arrays(network_tensors, arrays)
      with tf.control_dependencies([x.flow for x in next_arrays]):
        logits = self.network.get_logits(network_tensors)
        logits = tf.cond(self.locally_normalize,
                         lambda: tf.nn.log_softmax(logits), lambda: logits)
        handle = dragnn_ops.advance_from_prediction(
            handle, logits, component=self.name)
        if self.num_actions == 1:
          # deterministic; take oracle transition
          handle = dragnn_ops.advance_from_oracle(handle, component=self.name)
        else:
          # predict next transition using network logits
          logits = self.network.get_logits(network_tensors)
          logits = tf.cond(self.locally_normalize,
                           lambda: tf.nn.log_softmax(logits), lambda: logits)
          handle = dragnn_ops.advance_from_prediction(
              handle, logits, component=self.name)
      return [handle] + next_arrays

    # Create the TensorArray's to store activations for downstream/recurrent
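Aside: the `+ 0` in the new assert exists only to keep the check in the graph; the added comment states that TensorFlow's constant_value does not propagate arithmetic operations. A minimal sketch of the behavior this relies on, written against the TF 1.x API used in this repo (illustrative only, not part of the commit):

  import tensorflow as tf
  from tensorflow.python.framework import tensor_util

  beam_size = tf.constant(1)
  print(tensor_util.constant_value(beam_size))      # 1: value is statically known
  print(tensor_util.constant_value(beam_size + 0))  # None: arithmetic is not folded,
                                                    # so the assert is evaluated at run time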
research/syntaxnet/dragnn/python/composite_optimizer.py

@@ -12,8 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""An optimizer that switches between several methods."""

import functools
import tensorflow as tf
from tensorflow.python.training import optimizer

@@ -28,7 +29,7 @@ class CompositeOptimizer(optimizer.Optimizer):
               optimizer2,
               switch,
               use_locking=False,
               name='Composite'):
               name="Composite"):
    """Construct a new Composite optimizer.

    Args:

@@ -47,24 +48,20 @@ class CompositeOptimizer(optimizer.Optimizer):
    self._switch = switch

  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    return tf.cond(self._switch,
                   lambda: self._optimizer1.apply_gradients(
                       grads_and_vars, global_step, name),
                   lambda: self._optimizer2.apply_gradients(
                       grads_and_vars, global_step, name))
    return tf.cond(self._switch,
                   functools.partial(self._optimizer1.apply_gradients,
                                     grads_and_vars, global_step, name),
                   functools.partial(self._optimizer2.apply_gradients,
                                     grads_and_vars, global_step, name))

  def get_slot(self, var, name):
    slot1 = self._optimizer1.get_slot(var, name)
    slot2 = self._optimizer2.get_slot(var, name)
    if slot1 and slot2:
      raise LookupError('Slot named %s for variable %s populated for both '
                        'optimizers' % (name, var.name))
    return slot1 or slot2
    if name.startswith("c1-"):
      return self._optimizer1.get_slot(var, name[3:])
    else:
      return self._optimizer2.get_slot(var, name[3:])

  def get_slot_names(self):
    return sorted(self._optimizer1.get_slot_names() +
                  self._optimizer2.get_slot_names())
    opt1_names = self._optimizer1.get_slot_names()
    opt2_names = self._optimizer2.get_slot_names()
    return sorted(["c1-{}".format(name) for name in opt1_names] +
                  ["c2-{}".format(name) for name in opt2_names])
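After this change, slot names reported by CompositeOptimizer carry a "c1-"/"c2-" prefix identifying the owning sub-optimizer, and get_slot strips the prefix before delegating. A minimal usage sketch, assuming standard TF 1.x Adam and Momentum optimizers (variable names and values here are illustrative, not from the commit):

  import tensorflow as tf
  from dragnn.python import composite_optimizer

  # Illustrative setup: use Adam for the first 100 steps, then Momentum.
  step = tf.get_variable('step', [], dtype=tf.int32,
                         initializer=tf.zeros_initializer(), trainable=False)
  weights = tf.get_variable('weights', [10], dtype=tf.float32)
  loss = tf.reduce_sum(weights * weights)

  optimizer = composite_optimizer.CompositeOptimizer(
      tf.train.AdamOptimizer(0.05),
      tf.train.MomentumOptimizer(0.05, 0.5),
      switch=tf.less(step, 100))
  train_op = optimizer.minimize(loss, global_step=step)

  # Slot names are now namespaced by sub-optimizer; get_slot strips the prefix.
  print(optimizer.get_slot_names())             # e.g. ['c1-m', 'c1-v', 'c2-momentum']
  adam_m = optimizer.get_slot(weights, 'c1-m')  # delegates to optimizer1 as slot 'm'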
research/syntaxnet/dragnn/python/composite_optimizer_test.py

@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Tests for CompositeOptimizer."""

@@ -99,8 +98,8 @@ class CompositeOptimizerTest(test_util.TensorFlowTestCase):
        optimizer1 = MockAdamOptimizer(0.05)
        optimizer2 = MockMomentumOptimizer(0.05, 0.5)
        switch = tf.less(step, 100)
        optimizer = composite_optimizer.CompositeOptimizer(
            optimizer1, optimizer2, switch)
        optimizer = composite_optimizer.CompositeOptimizer(
            optimizer1, optimizer2, switch)
        train_op = optimizer.minimize(loss)
        sess.run(tf.global_variables_initializer())

@@ -111,16 +110,19 @@ class CompositeOptimizerTest(test_util.TensorFlowTestCase):
          sess.run(train_op)
          sess.run(tf.assign_add(step, 1))

          slot_names = optimizer.get_slot_names()
          self.assertItemsEqual(
              slot_names,
              ["m", "v", "momentum", "adam_counter", "momentum_counter"])
          adam_counter = sess.run(optimizer.get_slot(w, "adam_counter"))
          momentum_counter = sess.run(optimizer.get_slot(w, "momentum_counter"))
          adam_slots = ["c1-m", "c1-v", "c1-adam_counter"]
          momentum_slots = ["c2-momentum", "c2-momentum_counter"]
          self.assertItemsEqual(slot_names, adam_slots + momentum_slots)
          adam_counter = sess.run(optimizer.get_slot(w, "c1-adam_counter"))
          momentum_counter = sess.run(
              optimizer.get_slot(w, "c2-momentum_counter"))

          self.assertEqual(adam_counter, min(iteration + 1, 100))
          self.assertEqual(momentum_counter, max(iteration - 99, 0))

          if iteration % 20 == 0:
            logging.info("%d %s %d %d", iteration,
                         sess.run([switch, step, w, b]),
                         adam_counter, momentum_counter)
            logging.info("%d %s %d %d", iteration,
                         sess.run([switch, step, w, b]),
                         adam_counter, momentum_counter)


if __name__ == "__main__":
  googletest.main()
research/syntaxnet/dragnn/python/dragnn_model_saver.py  (new file, mode 100644)

# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Converter for DRAGNN checkpoint+master-spec files to TF SavedModels.

This script loads a DRAGNN model from a checkpoint and master-spec and saves it
to a TF SavedModel checkpoint. The checkpoint and master-spec together must
form a complete model - see the conll_checkpoint_converter.py for an example
of how to convert CONLL checkpoints, since they are not complete.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from google.protobuf import text_format
from dragnn.protos import spec_pb2
from dragnn.python import dragnn_model_saver_lib as saver_lib

flags = tf.app.flags
FLAGS = flags.FLAGS

flags.DEFINE_string('master_spec', None, 'Path to task context with '
                    'inputs and parameters for feature extractors.')
flags.DEFINE_string('params_path', None, 'Path to trained model parameters.')
flags.DEFINE_string('export_path', '', 'Output path for exported servo model.')
flags.DEFINE_bool('export_moving_averages', False,
                  'Whether to export the moving average parameters.')


def export(master_spec_path, params_path, export_path, export_moving_averages):
  """Restores a model and exports it in SavedModel form.

  This method loads a graph specified by the spec at master_spec_path and the
  params in params_path. It then saves the model in SavedModel format to the
  location specified in export_path.

  Args:
    master_spec_path: Path to a proto-text master spec.
    params_path: Path to the parameters file to export.
    export_path: Path to export the SavedModel to.
    export_moving_averages: Whether to export the moving average parameters.
  """
  graph = tf.Graph()
  master_spec = spec_pb2.MasterSpec()
  with tf.gfile.FastGFile(master_spec_path) as fin:
    text_format.Parse(fin.read(), master_spec)

  # Remove '/' if it exists at the end of the export path, ensuring that
  # path utils work correctly.
  stripped_path = export_path.rstrip('/')
  saver_lib.clean_output_paths(stripped_path)

  short_to_original = saver_lib.shorten_resource_paths(master_spec)
  saver_lib.export_master_spec(master_spec, graph)
  saver_lib.export_to_graph(master_spec, params_path, stripped_path, graph,
                            export_moving_averages)
  saver_lib.export_assets(master_spec, short_to_original, stripped_path)


def main(unused_argv):
  # Run the exporter.
  export(FLAGS.master_spec, FLAGS.params_path, FLAGS.export_path,
         FLAGS.export_moving_averages)
  tf.logging.info('Export complete.')


if __name__ == '__main__':
  tf.app.run()
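A sketch of driving this exporter directly from Python, mirroring the flags defined above; all paths are placeholders, not from the commit:

  from dragnn.python import dragnn_model_saver

  dragnn_model_saver.export(
      master_spec_path='/tmp/dragnn_model/master-spec',   # proto-text MasterSpec
      params_path='/tmp/dragnn_model/params',             # trained checkpoint
      export_path='/tmp/dragnn_savedmodel',               # SavedModel output dir
      export_moving_averages=False)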
research/syntaxnet/dragnn/python/dragnn_model_saver_lib.py  (new file, mode 100644)

# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""A program to export a DRAGNN model via SavedModel."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import tempfile
import tensorflow as tf

from google.protobuf import text_format
from dragnn.protos import spec_pb2
from dragnn.python import graph_builder

# The saved model tags to export. The same set of tags must be specified when
# loading the saved model.
_SAVED_MODEL_TAGS = [tf.saved_model.tag_constants.SERVING]


def clean_output_paths(stripped_path):
  """Ensures that the output path is cleaned and ready to receive a model."""
  # If the export path's directory doesn't exist, create it.
  export_directory = os.path.dirname(stripped_path)
  if not tf.gfile.Exists(export_directory):
    tf.logging.info('%s does not exist; creating it.' % export_directory)
    tf.gfile.MakeDirs(export_directory)

  # Remove any existing model on this export path, since exporting will fail
  # if the model directory already exists.
  if tf.gfile.Exists(stripped_path):
    tf.logging.info('%s already exists; deleting it.' % stripped_path)
    tf.gfile.DeleteRecursively(stripped_path)


def shorten_resource_paths(master_spec):
  """Shortens the resource file paths in a MasterSpec.

  Replaces resource paths in the MasterSpec with shortened paths and builds a
  mapping from the shortened path to the original path. Note that shortened
  paths are relative to the 'assets.extra' directory of the SavedModel. Also
  removes resources from FixedFeatureChannel, since they are not exported.

  NB: The format of the shortened resource paths should be considered an
  implementation detail and may change.

  Args:
    master_spec: MasterSpec proto to sanitize.

  Returns:
    Dict mapping from shortened resource path to original resource path.
  """
  for component_spec in master_spec.component:
    for feature_spec in component_spec.fixed_feature:
      feature_spec.ClearField('pretrained_embedding_matrix')
      feature_spec.ClearField('vocab')

  shortened_to_original = {}
  original_to_shortened = {}
  for component_index, component_spec in enumerate(master_spec.component):
    component_name = 'component_{}_{}'.format(component_index,
                                              component_spec.name)
    for resource_index, resource_spec in enumerate(component_spec.resource):
      resource_name = 'resource_{}_{}'.format(resource_index,
                                              resource_spec.name)
      for part_index, part in enumerate(resource_spec.part):
        part_name = 'part_{}'.format(part_index)
        shortened_path = os.path.join('resources', component_name,
                                      resource_name, part_name)
        if part.file_pattern not in original_to_shortened:
          shortened_to_original[shortened_path] = part.file_pattern
          original_to_shortened[part.file_pattern] = shortened_path
        part.file_pattern = original_to_shortened[part.file_pattern]
  return shortened_to_original


def export_master_spec(master_spec, external_graph):
  """Exports a MasterSpec.

  Args:
    master_spec: MasterSpec proto.
    external_graph: tf.Graph that will be used to export the SavedModel.
  """
  # Implementation note: We can't export the original MasterSpec file directly
  # because it uses short paths. We also can't replace the original MasterSpec
  # file with the new version, because the file may have other users.
  # Write the new spec to a temp file and export it. The basename will be
  # exported in the SavedModel, so use mkdtemp() with a fixed basename.
  master_spec_path = os.path.join(tempfile.mkdtemp(), 'master_spec')
  with tf.gfile.FastGFile(master_spec_path, 'w') as fout:
    fout.write(text_format.MessageToString(master_spec))

  with external_graph.as_default():
    asset_file_tensor = tf.constant(
        master_spec_path, name='master_spec_filepath')
    tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, asset_file_tensor)


def export_assets(master_spec, shortened_to_original, saved_model_path):
  """Exports the assets in a master_spec into a SavedModel directory.

  This method exports a master_spec and associated files into the SavedModel's
  'assets.extra' directory (which is unmanaged). All resources are added to the
  'assets.extra' directory using sanitized paths. The master spec itself is
  located at the base of the assets.extra directory.

  NB: Only exports resource files in MasterSpec.component.resource, not the
  embedding init resources in FixedFeatureChannel.

  Args:
    master_spec: Proto master spec.
    shortened_to_original: Mapping returned by shorten_resource_paths().
    saved_model_path: Path to an already-created SavedModel directory.
  """
  if not tf.gfile.Exists(saved_model_path):
    tf.logging.fatal('Unable to export assets - directory %s does not exist!' %
                     saved_model_path)

  asset_dir = os.path.join(saved_model_path, 'assets.extra')
  tf.logging.info('Exporting assets to model at %s' % asset_dir)

  # First, write the MasterSpec that will be used to export the data.
  tf.gfile.MakeDirs(asset_dir)
  with tf.gfile.FastGFile(os.path.join(asset_dir, 'master_spec'),
                          'w') as out_file:
    out_file.write(text_format.MessageToString(master_spec))

  # Then, copy all the asset files.
  for component_spec in master_spec.component:
    for resource_spec in component_spec.resource:
      tf.logging.info('Copying assets for resource %s/%s.' %
                      (component_spec.name, resource_spec.name))
      for part in resource_spec.part:
        original_file = shortened_to_original[part.file_pattern]
        new_file = os.path.join(asset_dir, part.file_pattern)
        tf.logging.info('Asset %s was renamed to %s.' % (original_file,
                                                         new_file))
        if tf.gfile.Exists(new_file):
          tf.logging.info('%s already exists, skipping copy.' % (new_file))
        else:
          new_dir = os.path.dirname(new_file)
          tf.gfile.MakeDirs(new_dir)
          tf.logging.info('Copying %s to %s' % (original_file, new_dir))
          tf.gfile.Copy(original_file, new_file, overwrite=True)
  tf.logging.info('Asset export complete.')


def export_to_graph(master_spec,
                    params_path,
                    export_path,
                    external_graph,
                    export_moving_averages,
                    signature_name='model'):
  """Restores a model and exports it in SavedModel form.

  This method loads a graph specified by the master_spec and the params in
  params_path into the graph given in external_graph. It then saves the model
  in SavedModel format to the location specified in export_path.

  Args:
    master_spec: Proto master spec.
    params_path: Path to the parameters file to export.
    export_path: Path to export the SavedModel to.
    external_graph: A tf.Graph() object to build the graph inside.
    export_moving_averages: Whether to export the moving average parameters.
    signature_name: Name of the signature to insert.
  """
  tf.logging.info(
      'Exporting graph with signature_name "%s" and use_moving_averages = %s' %
      (signature_name, export_moving_averages))

  tf.logging.info('Building the graph')
  with external_graph.as_default(), tf.device('/device:CPU:0'):
    hyperparam_config = spec_pb2.GridPoint()
    hyperparam_config.use_moving_average = export_moving_averages
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    post_restore_hook = builder.build_post_restore_hook()
    annotation = builder.add_annotation()
    builder.add_saver()

  # Resets session.
  session_config = tf.ConfigProto(
      log_device_placement=False,
      intra_op_parallelism_threads=10,
      inter_op_parallelism_threads=10)

  with tf.Session(graph=external_graph, config=session_config) as session:
    tf.logging.info('Initializing variables...')
    session.run(tf.global_variables_initializer())

    tf.logging.info('Loading params...')
    session.run('save/restore_all', {'save/Const:0': params_path})

    tf.logging.info('Saving.')
    with tf.device('/device:CPU:0'):
      saved_model_builder = tf.saved_model.builder.SavedModelBuilder(
          export_path)
      signature_map = {
          signature_name:
              tf.saved_model.signature_def_utils.build_signature_def(
                  inputs={
                      'inputs':
                          tf.saved_model.utils.build_tensor_info(
                              annotation['input_batch'])
                  },
                  outputs={
                      'annotations':
                          tf.saved_model.utils.build_tensor_info(
                              annotation['annotations'])
                  },
                  method_name=tf.saved_model.signature_constants.
                  PREDICT_METHOD_NAME),
      }

      tf.logging.info('Input is: %s', annotation['input_batch'].name)
      tf.logging.info('Output is: %s', annotation['annotations'].name)

      saved_model_builder.add_meta_graph_and_variables(
          session,
          tags=_SAVED_MODEL_TAGS,
          legacy_init_op=tf.group(
              post_restore_hook,
              builder.build_warmup_graph(
                  tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS)[0])),
          signature_def_map=signature_map,
          assets_collection=tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS))

    saved_model_builder.save()
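Once exported, the model can be loaded back through the standard SavedModel loader using the same SERVING tag and the 'model' signature built above. A minimal sketch, assuming an export path from the previous step (the path and the sentence batch are placeholders):

  import tensorflow as tf

  export_path = '/tmp/dragnn_savedmodel'  # assumed output of export_to_graph()

  with tf.Session(graph=tf.Graph()) as sess:
    # The same tags used at export time (_SAVED_MODEL_TAGS) must be supplied here.
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], export_path)

    # Look up the tensors registered under the 'model' signature.
    signature = meta_graph.signature_def['model']
    input_name = signature.inputs['inputs'].name
    output_name = signature.outputs['annotations'].name

    # serialized_sentences would be a list of serialized syntaxnet Sentence protos.
    serialized_sentences = []
    annotations = sess.run(output_name, {input_name: serialized_sentences})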
research/syntaxnet/dragnn/python/dragnn_model_saver_lib_test.py  (new file, mode 100644)

# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Test for dragnn.python.dragnn_model_saver_lib."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import tensorflow as tf

from google.protobuf import text_format
from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest

from dragnn.protos import spec_pb2
from dragnn.python import dragnn_model_saver_lib

FLAGS = tf.app.flags.FLAGS


def setUpModule():
  if not hasattr(FLAGS, 'test_srcdir'):
    FLAGS.test_srcdir = ''
  if not hasattr(FLAGS, 'test_tmpdir'):
    FLAGS.test_tmpdir = tf.test.get_temp_dir()


class DragnnModelSaverLibTest(test_util.TensorFlowTestCase):

  def LoadSpec(self, spec_path):
    master_spec = spec_pb2.MasterSpec()
    root_dir = os.path.join(FLAGS.test_srcdir, 'dragnn/python')
    with file(os.path.join(root_dir, 'testdata', spec_path), 'r') as fin:
      text_format.Parse(fin.read().replace('TOPDIR', root_dir), master_spec)
    return master_spec

  def CreateLocalSpec(self, spec_path):
    master_spec = self.LoadSpec(spec_path)
    master_spec_name = os.path.basename(spec_path)
    outfile = os.path.join(FLAGS.test_tmpdir, master_spec_name)
    fout = open(outfile, 'w')
    fout.write(text_format.MessageToString(master_spec))
    return outfile

  def ValidateAssetExistence(self, master_spec, export_path):
    asset_path = os.path.join(export_path, 'assets.extra')

    # The master spec should exist.
    expected_path = os.path.join(asset_path, 'master_spec')
    tf.logging.info('Validating existence of %s' % expected_path)
    self.assertTrue(os.path.isfile(expected_path))

    # For every part in every resource in every component, the resource should
    # exist at [export_path]/assets.extra/[component file path]
    path_list = []
    for component_spec in master_spec.component:
      for resource_spec in component_spec.resource:
        for part in resource_spec.part:
          expected_path = os.path.join(asset_path,
                                       part.file_pattern.strip(os.path.sep))
          tf.logging.info('Validating existence of %s' % expected_path)
          self.assertTrue(os.path.isfile(expected_path))
          path_list.append(expected_path)

    # Return a set of all unique paths.
    return set(path_list)

  def testModelExport(self):
    # Get the master spec and params for this graph.
    master_spec = self.LoadSpec('ud-hungarian.master-spec')
    params_path = os.path.join(
        FLAGS.test_srcdir, 'dragnn/python/testdata'
        '/ud-hungarian.params')

    # Export the graph via SavedModel. (Here, we maintain a handle to the graph
    # for comparison, but that's usually not necessary.)
    export_path = os.path.join(FLAGS.test_tmpdir, 'export')
    saver_graph = tf.Graph()

    shortened_to_original = dragnn_model_saver_lib.shorten_resource_paths(
        master_spec)

    dragnn_model_saver_lib.export_master_spec(master_spec, saver_graph)

    dragnn_model_saver_lib.export_to_graph(
        master_spec,
        params_path,
        export_path,
        saver_graph,
        export_moving_averages=False)

    # Export the assets as well.
    dragnn_model_saver_lib.export_assets(master_spec, shortened_to_original,
                                         export_path)

    # Validate that the assets are all in the exported directory.
    path_set = self.ValidateAssetExistence(master_spec, export_path)

    # This master-spec has 4 unique assets. If there are more, we have not
    # uniquified the assets properly.
    self.assertEqual(len(path_set), 4)

    # Restore the graph from the checkpoint into a new Graph object.
    restored_graph = tf.Graph()
    restoration_config = tf.ConfigProto(
        log_device_placement=False,
        intra_op_parallelism_threads=10,
        inter_op_parallelism_threads=10)

    with tf.Session(graph=restored_graph, config=restoration_config) as sess:
      tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                                 export_path)


if __name__ == '__main__':
  googletest.main()
research/syntaxnet/dragnn/python/dragnn_ops.py

@@ -16,9 +16,9 @@
"""Groups the DRAGNN TensorFlow ops in one module."""

try:
  from dragnn.core.ops.gen_dragnn_bulk_ops import *
  from dragnn.core.ops.gen_dragnn_ops import *
except ImportError as e:
  raise e
from dragnn.core.ops.gen_dragnn_bulk_ops import *
from dragnn.core.ops.gen_dragnn_ops import *

import dragnn.python.load_dragnn_cc_impl
import syntaxnet.load_parser_ops
research/syntaxnet/dragnn/python/graph_builder.py

@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Builds a DRAGNN graph for local training."""

import collections
import tensorflow as tf

from tensorflow.core.protobuf import saver_pb2
from tensorflow.python.platform import tf_logging as logging

@@ -32,6 +32,37 @@ except KeyError, e:
  logging.info(str(e))


def _validate_grid_point(hyperparams, is_sub_optimizer=False):
  """Validates that a grid point's configuration is reasonable.

  Args:
    hyperparams (spec_pb2.GridPoint): Grid point to validate.
    is_sub_optimizer (bool): Whether this optimizer is a sub-optimizer of
      a composite optimizer.

  Raises:
    ValueError: If the grid point is not valid.
  """
  valid_methods = ('gradient_descent', 'adam', 'lazyadam', 'momentum',
                   'composite')
  if hyperparams.learning_method not in valid_methods:
    raise ValueError('Unknown learning method (optimizer)')

  if is_sub_optimizer:
    for base_only_field in ('decay_steps', 'decay_base', 'decay_staircase'):
      if hyperparams.HasField(base_only_field):
        raise ValueError('Field {} is not valid for sub-optimizers of a '
                         'composite optimizer.'.format(base_only_field))

  if hyperparams.learning_method == 'composite':
    spec = hyperparams.composite_optimizer_spec
    if spec.switch_after_steps < 1:
      raise ValueError('switch_after_steps {} not valid for composite '
                       'optimizer!'.format(spec.switch_after_steps))
    for sub_optimizer in (spec.method1, spec.method2):
      _validate_grid_point(sub_optimizer, is_sub_optimizer=True)


def _create_learning_rate(hyperparams, step_var):
  """Creates learning rate var, with decay and switching for CompositeOptimizer.

@@ -40,21 +71,31 @@ def _create_learning_rate(hyperparams, step_var):
      learning_method to determine optimizer class to use.
    step_var: tf.Variable, global training step.

  Raises:
    ValueError: If the composite optimizer is set, but not correctly configured.

  Returns:
    a scalar `Tensor`, the learning rate based on current step and hyperparams.
  """
  if hyperparams.learning_method != 'composite':
    base_rate = hyperparams.learning_rate
    adjusted_steps = step_var
  else:
    spec = hyperparams.composite_optimizer_spec
    switch = tf.less(step_var, spec.switch_after_steps)
    base_rate = tf.cond(switch, lambda: tf.constant(spec.method1.learning_rate),
                        lambda: tf.constant(spec.method2.learning_rate))
    if spec.reset_learning_rate:
      adjusted_steps = tf.cond(switch, lambda: step_var,
                               lambda: step_var - spec.switch_after_steps)
    else:
      adjusted_steps = step_var

  return tf.train.exponential_decay(base_rate, step_var,
                                    hyperparams.decay_steps,
                                    hyperparams.decay_base,
  return tf.train.exponential_decay(
      learning_rate=base_rate,
      global_step=adjusted_steps,
      decay_steps=hyperparams.decay_steps,
      decay_rate=hyperparams.decay_base,
      staircase=hyperparams.decay_staircase)

@@ -158,6 +199,7 @@ class MasterBuilder(object):
    self.spec = master_spec
    self.hyperparams = (
        spec_pb2.GridPoint() if hyperparam_config is None else hyperparam_config)
    _validate_grid_point(self.hyperparams)
    self.pool_scope = pool_scope

    # Set the graph-level random seed before creating the Components so the ops

@@ -260,6 +302,25 @@ class MasterBuilder(object):
    all_nodes['run'] = run_op
    return all_nodes

  def build_warmup_graph(self, asset_dir):
    """Builds a warmup graph.

    This graph performs a MasterSpec asset location rewrite via
    SetAssetDirectory, then grabs a ComputeSession and immediately returns it.
    By grabbing a session, we cause the underlying transition systems to cache
    their static data reads.

    Args:
      asset_dir: The base directory to append to all resources.

    Returns:
      A single op suitable for passing to the legacy_init_op of the ModelSaver.
    """
    with tf.control_dependencies([dragnn_ops.set_asset_directory(asset_dir)]):
      session = self._get_compute_session()
      release_op = dragnn_ops.release_session(session)
      return tf.group(release_op, name='run')

  def build_training(self,
                     handle,
                     compute_gradients=True,

@@ -408,6 +469,8 @@ class MasterBuilder(object):
      # Restore that subsequent builds don't use average by default.
      self.read_from_avg = False

    cost = tf.check_numerics(cost, message='Cost is not finite.')

    # Returns named access to common outputs.
    outputs = {
        'cost': cost,

@@ -447,8 +510,14 @@ class MasterBuilder(object):
    Returns:
      setup_op - An op that, when run, guarantees all setup ops will run.
    """
    with tf.control_dependencies(
        [comp.build_post_restore_hook() for comp in self.components]):
    control_ops = []
    for comp in self.components:
      hook = comp.build_post_restore_hook()
      if isinstance(hook, collections.Iterable):
        control_ops.extend(hook)
      else:
        control_ops.append(hook)
    with tf.control_dependencies(control_ops):
      return tf.no_op(name='post_restore_hook_master')

  def build_inference(self, handle, use_moving_average=False):

@@ -597,10 +666,8 @@ class MasterBuilder(object):
  def add_saver(self):
    """Adds a Saver for all variables in the graph."""
    logging.info('Saving non-quantized variables:\n\t%s', '\n\t'.join(
        [x.name for x in tf.global_variables() if 'quantized' not in x.name]))
    logging.info('Saving variables:\n\t%s',
                 '\n\t'.join([x.name for x in tf.global_variables()]))
    self.saver = tf.train.Saver(
        var_list=[
            x for x in tf.global_variables() if 'quantized' not in x.name
        ],
        var_list=[x for x in tf.global_variables()],
        write_version=saver_pb2.SaverDef.V1)
research/syntaxnet/dragnn/python/graph_builder_test.py

@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Tests for graph_builder."""

@@ -35,14 +34,8 @@ from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest
from tensorflow.python.platform import tf_logging as logging

import dragnn.python.load_dragnn_cc_impl
import syntaxnet.load_parser_ops

FLAGS = tf.app.flags.FLAGS

if not hasattr(FLAGS, 'test_srcdir'):
  FLAGS.test_srcdir = ''
if not hasattr(FLAGS, 'test_tmpdir'):
  FLAGS.test_tmpdir = tf.test.get_temp_dir()

_DUMMY_GOLD_SENTENCE = """
token {

@@ -157,6 +150,13 @@ token {
]


def setUpModule():
  if not hasattr(FLAGS, 'test_srcdir'):
    FLAGS.test_srcdir = ''
  if not hasattr(FLAGS, 'test_tmpdir'):
    FLAGS.test_tmpdir = tf.test.get_temp_dir()


def _as_op(x):
  """Always returns the tf.Operation associated with a node."""
  return x.op if isinstance(x, tf.Tensor) else x

@@ -264,7 +264,8 @@ class GraphBuilderTest(test_util.TensorFlowTestCase):
    gold_doc_2 = sentence_pb2.Sentence()
    text_format.Parse(_DUMMY_GOLD_SENTENCE_2, gold_doc_2)
    reader_strings = [
        gold_doc.SerializeToString(), gold_doc_2.SerializeToString()
        gold_doc.SerializeToString(),
        gold_doc_2.SerializeToString()
    ]
    tf.logging.info('Generating graph with config: %s', hyperparam_config)
    with tf.Graph().as_default():

@@ -294,18 +295,35 @@ class GraphBuilderTest(test_util.TensorFlowTestCase):
    self.RunTraining(
        self.MakeHyperparams(learning_method='adam', use_moving_average=True))

  def testTrainingWithLazyAdamAndNoAveraging(self):
    """Adds code coverage for lazy ADAM without the use of moving averaging."""
    self.RunTraining(
        self.MakeHyperparams(
            learning_method='lazyadam', use_moving_average=False))

  def testTrainingWithCompositeOptimizer(self):
    """Adds code coverage for CompositeOptimizer."""
    self.RunCompositeOptimizerTraining(False)

  def testTrainingWithCompositeOptimizerResetLearningRate(self):
    """Adds code coverage for CompositeOptimizer."""
    self.RunCompositeOptimizerTraining(True)

  def RunCompositeOptimizerTraining(self, reset_learning_rate):
    grid_point = self.MakeHyperparams(learning_method='composite')
    grid_point.composite_optimizer_spec.method1.learning_method = 'adam'
    grid_point.composite_optimizer_spec.method2.learning_method = 'momentum'
    grid_point.composite_optimizer_spec.method2.momentum = 0.9
    spec = grid_point.composite_optimizer_spec
    spec.reset_learning_rate = reset_learning_rate
    spec.switch_after_steps = 1
    spec.method1.learning_method = 'adam'
    spec.method2.learning_method = 'momentum'
    spec.method2.momentum = 0.9
    self.RunTraining(grid_point)

  def RunFullTrainingAndInference(self,
                                  test_name,
                                  master_spec_path=None,
                                  master_spec=None,
                                  hyperparam_config=None,
                                  component_weights=None,
                                  unroll_using_oracle=None,
                                  num_evaluated_components=1,

@@ -320,7 +338,8 @@ class GraphBuilderTest(test_util.TensorFlowTestCase):
    gold_doc_2 = sentence_pb2.Sentence()
    text_format.Parse(_DUMMY_GOLD_SENTENCE_2, gold_doc_2)
    gold_reader_strings = [
        gold_doc.SerializeToString(), gold_doc_2.SerializeToString()
        gold_doc.SerializeToString(),
        gold_doc_2.SerializeToString()
    ]

    test_doc = sentence_pb2.Sentence()

@@ -328,8 +347,10 @@ class GraphBuilderTest(test_util.TensorFlowTestCase):
    test_doc_2 = sentence_pb2.Sentence()
    text_format.Parse(_DUMMY_TEST_SENTENCE_2, test_doc_2)
    test_reader_strings = [
        test_doc.SerializeToString(), test_doc.SerializeToString(),
        test_doc_2.SerializeToString(), test_doc.SerializeToString()
        test_doc.SerializeToString(),
        test_doc.SerializeToString(),
        test_doc_2.SerializeToString(),
        test_doc.SerializeToString()
    ]

    if batch_size_limit is not None:

@@ -338,7 +359,8 @@ class GraphBuilderTest(test_util.TensorFlowTestCase):
    with tf.Graph().as_default():
      tf.set_random_seed(1)
      hyperparam_config = spec_pb2.GridPoint()
      if not hyperparam_config:
        hyperparam_config = spec_pb2.GridPoint()
      builder = graph_builder.MasterBuilder(
          master_spec, hyperparam_config, pool_scope=test_name)
      target = spec_pb2.TrainTarget()

@@ -493,6 +515,22 @@ class GraphBuilderTest(test_util.TensorFlowTestCase):
        expected_num_actions=12,
        expected=_TAGGER_PARSER_EXPECTED_SENTENCES)

  def testTaggerParserNanDeath(self):
    hyperparam_config = spec_pb2.GridPoint()
    hyperparam_config.learning_rate = 1.0

    # The large learning rate should trigger check_numerics.
    with self.assertRaisesRegexp(tf.errors.InvalidArgumentError,
                                 'Cost is not finite'):
      self.RunFullTrainingAndInference(
          'tagger-parser',
          'tagger_parser_master_spec.textproto',
          hyperparam_config=hyperparam_config,
          component_weights=[0., 1., 1.],
          unroll_using_oracle=[False, True, True],
          expected_num_actions=12,
          expected=_TAGGER_PARSER_EXPECTED_SENTENCES)

  def testTaggerParserWithAttention(self):
    spec = self.LoadSpec('tagger_parser_master_spec.textproto')

@@ -621,6 +659,18 @@ class GraphBuilderTest(test_util.TensorFlowTestCase):
    self.checkOpOrder('annotations', anno['annotations'],
                      ['GetSession', 'ReleaseSession'])

  def testWarmupGetsAndReleasesSession(self):
    """Checks that create_warmup_graph creates Get and ReleaseSession."""
    test_name = 'warmup-graph-structure'
    with tf.Graph().as_default():
      # Build the actual graphs. The choice of spec is arbitrary, as long as
      # training and annotation nodes can be constructed.
      builder, _ = self.getBuilderAndTarget(test_name)
      warmup = builder.build_warmup_graph('foo')
      self.checkOpOrder('annotations', warmup,
                        ['SetAssetDirectory', 'GetSession', 'ReleaseSession'])

  def testAttachDataReader(self):
    """Checks that train['run'] and 'annotations' call AttachDataReader."""
    test_name = 'attach-data-reader'
research/syntaxnet/dragnn/python/lexicon.py

@@ -28,7 +28,8 @@ def create_lexicon_context(path):
  context = task_spec_pb2.TaskSpec()
  for name in [
      'word-map', 'tag-map', 'tag-to-category', 'lcword-map', 'category-map',
      'char-map', 'char-ngram-map', 'label-map', 'prefix-table', 'suffix-table'
      'char-map', 'char-ngram-map', 'label-map', 'prefix-table', 'suffix-table',
      'known-word-map'
  ]:
    context.input.add(name=name).part.add(file_pattern=os.path.join(path, name))
  return context
research/syntaxnet/dragnn/python/lexicon_test.py

@@ -28,13 +28,7 @@ from dragnn.python import lexicon
from syntaxnet import parser_trainer
from syntaxnet import task_spec_pb2

import syntaxnet.load_parser_ops

FLAGS = tf.app.flags.FLAGS

if not hasattr(FLAGS, 'test_srcdir'):
  FLAGS.test_srcdir = ''
if not hasattr(FLAGS, 'test_tmpdir'):
  FLAGS.test_tmpdir = tf.test.get_temp_dir()

_EXPECTED_CONTEXT = r"""

@@ -48,9 +42,17 @@ input { name: "char-ngram-map" Part { file_pattern: "/tmp/char-ngram-map" } }
input { name: "label-map" Part { file_pattern: "/tmp/label-map" } }
input { name: "prefix-table" Part { file_pattern: "/tmp/prefix-table" } }
input { name: "suffix-table" Part { file_pattern: "/tmp/suffix-table" } }
input { name: "known-word-map" Part { file_pattern: "/tmp/known-word-map" } }
"""


def setUpModule():
  if not hasattr(FLAGS, 'test_srcdir'):
    FLAGS.test_srcdir = ''
  if not hasattr(FLAGS, 'test_tmpdir'):
    FLAGS.test_tmpdir = tf.test.get_temp_dir()


class LexiconTest(tf.test.TestCase):

  def testCreateLexiconContext(self):
research/syntaxnet/dragnn/python/network_units.py

@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Basic network units used in assembling DRAGNN graphs."""

from __future__ import absolute_import

@@ -21,6 +20,8 @@ from __future__ import print_function
import abc

import numpy as np
import tensorflow as tf

from tensorflow.python.ops import nn
from tensorflow.python.ops import tensor_array_ops as ta

@@ -141,17 +142,22 @@ def add_embeddings(channel_id, feature_spec, seed=None):
    embeddings = syntaxnet_ops.word_embedding_initializer(
        vectors=feature_spec.pretrained_embedding_matrix.part[0].file_pattern,
        vocabulary=feature_spec.vocab.part[0].file_pattern,
        num_special_embeddings=1,
        embedding_init=1.0,
        seed=seed1,
        seed2=seed2)
    return tf.get_variable(name, initializer=tf.reshape(embeddings, shape))
    return tf.get_variable(
        name,
        initializer=tf.reshape(embeddings, shape),
        trainable=not feature_spec.is_constant)
  else:
    return tf.get_variable(
        name,
        shape,
        initializer=tf.random_normal_initializer(
            stddev=1.0 / feature_spec.embedding_dim**.5, seed=seed))
            stddev=1.0 / feature_spec.embedding_dim**.5, seed=seed),
        trainable=not feature_spec.is_constant)


def embedding_lookup(embedding_matrix, indices, ids, weights, size):

@@ -183,7 +189,7 @@ def fixed_feature_lookup(component, state, channel_id, stride):
  Args:
    component: Component object in which to look up the fixed features.
    state: MasterState object for the live nlp_saft::dragnn::MasterState.
    state: MasterState object for the live ComputeSession.
    channel_id: int id of the fixed feature to look up.
    stride: int Tensor of current batch * beam size.

@@ -228,6 +234,100 @@ def get_input_tensor(fixed_embeddings, linked_embeddings):
  return tf.concat([e.tensor for e in embeddings], 1)


def add_var_initialized(name, shape, init_type, divisor=1.0, stddev=1e-4):
  """Creates a tf.Variable with the given shape and initialization.

  Args:
    name: variable name
    shape: variable shape
    init_type: type of initialization (random, xavier, identity, varscale)
    divisor: numerator for identity initialization where in_dim != out_dim,
      should divide both in_dim and out_dim
    stddev: standard deviation for random normal initialization

  Returns:
    tf.Variable object with the given shape and initialization

  Raises:
    ValueError: if identity initialization is specified for a tensor of rank < 4
    NotImplementedError: if an unimplemented type of initialization is specified
  """
  if init_type == 'random':
    # Random normal initialization
    return tf.get_variable(
        name,
        shape=shape,
        initializer=tf.random_normal_initializer(stddev=stddev),
        dtype=tf.float32)
  if init_type == 'xavier':
    # Xavier normal initialization (Glorot and Bengio, 2010):
    # http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf
    return tf.get_variable(
        name,
        shape=shape,
        initializer=tf.contrib.layers.xavier_initializer(),
        dtype=tf.float32)
  if init_type == 'varscale':
    # Variance scaling initialization (He at al. 2015):
    # https://arxiv.org/abs/1502.01852
    return tf.get_variable(
        name,
        shape=shape,
        initializer=tf.contrib.layers.variance_scaling_initializer(),
        dtype=tf.float32)
  if init_type == 'identity':
    # "Identity initialization" described in Yu and Koltun (2015):
    # https://arxiv.org/abs/1511.07122v3 eqns. (4) and (5)
    rank = len(shape)
    square = shape[-1] == shape[-2]
    if rank < 2:
      raise ValueError(
          'Identity initialization requires a tensor with rank >= 2. The given '
          'shape has rank ' + str(rank))
    if shape[-1] % divisor != 0 or shape[-2] % divisor != 0:
      raise ValueError('Divisor must divide both shape[-1]=' + str(shape[-1]) +
                       ' and shape[-2]=' + str(shape[-2]) + '. Divisor is: ' +
                       str(divisor))

    # If the desired shape is > 2 dimensions, we only want to set the values
    # in the middle along the last two dims.
    middle_indices = [int(s / 2) for s in shape]
    middle_indices = middle_indices[:-2]
    base_array = NotImplemented
    if square:
      if rank == 2:
        base_array = np.eye(shape[-1])
      else:
        base_array = np.zeros(shape, dtype=np.float32)
        base_array[[[i] for i in middle_indices]] = np.eye(shape[-1])
    else:
      # NOTE(strubell): We use NumPy's RNG here and not TensorFlow's because
      # constructing this matrix with tf ops is tedious and harder to read.
      base_array = np.random.normal(
          size=shape, loc=0, scale=stddev).astype(np.float32)
      m = divisor / shape[-1]
      identity = np.eye(int(divisor))
      x_stretch = int(shape[-1] / divisor)
      y_stretch = int(shape[-2] / divisor)
      x_stretched_ident = np.repeat(identity, x_stretch, 1)
      xy_stretched_ident = np.repeat(x_stretched_ident, y_stretch, 0)
      indices = np.where(xy_stretched_ident == 1.0)
      if rank == 2:
        base_array[indices[0], indices[1]] = m
      else:
        arr = base_array[[[i] for i in middle_indices]][0]
        arr[indices[0], indices[1]] = m
        base_array[[[i] for i in middle_indices]] = arr
    return tf.get_variable(name, initializer=base_array)

  raise NotImplementedError('Initialization type ' + init_type +
                            ' is not implemented.')
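As a concrete illustration of the non-square identity branch above, a small NumPy-only sketch of the stretched-identity pattern the code writes into the weight matrix; the shape and divisor are chosen purely for illustration, and the random-noise background is replaced by zeros for clarity:

  import numpy as np

  # Illustrative values: a [4, 2] weight matrix with divisor=2.
  shape = [4, 2]
  divisor = 2.0
  m = divisor / shape[-1]                    # value written on the "identity" cells
  identity = np.eye(int(divisor))            # 2x2 identity
  x_stretch = int(shape[-1] / divisor)       # 1
  y_stretch = int(shape[-2] / divisor)       # 2
  xy_stretched_ident = np.repeat(np.repeat(identity, x_stretch, 1), y_stretch, 0)

  base = np.zeros(shape, dtype=np.float32)   # noise background omitted for clarity
  rows, cols = np.where(xy_stretched_ident == 1.0)
  base[rows, cols] = m
  print(base)
  # [[1. 0.]
  #  [1. 0.]
  #  [0. 1.]
  #  [0. 1.]]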
def
get_input_tensor_with_stride
(
fixed_embeddings
,
linked_embeddings
,
stride
):
"""Constructs an input tensor with a separate dimension for steps.
...
...
@@ -304,8 +404,8 @@ def lookup_named_tensor(name, named_tensors):
for
named_tensor
in
named_tensors
:
if
named_tensor
.
name
==
name
:
return
named_tensor
raise
KeyError
(
'Name "%s" not found in named tensors: %s'
%
(
name
,
named_tensors
))
raise
KeyError
(
'Name "%s" not found in named tensors: %s'
%
(
name
,
named_tensors
))
def
activation_lookup_recurrent
(
component
,
state
,
channel_id
,
source_array
,
...
...
@@ -317,7 +417,7 @@ def activation_lookup_recurrent(component, state, channel_id, source_array,
Args:
component: Component object in which to look up the fixed features.
state: MasterState object for the live
nlp_saft::dragnn::MasterState
.
state: MasterState object for the live
ComputeSession
.
channel_id: int id of the fixed feature to look up.
source_array: TensorArray from which to fetch feature vectors, expected to
have size [steps + 1] elements of shape [stride, D] each.
...
...
@@ -381,7 +481,7 @@ def activation_lookup_other(component, state, channel_id, source_tensor,
Args:
component: Component object in which to look up the fixed features.
state: MasterState object for the live
nlp_saft::dragnn::MasterState
.
state: MasterState object for the live
ComputeSession
.
channel_id: int id of the fixed feature to look up.
source_tensor: Tensor from which to fetch feature vectors. Expected to have
have shape [steps + 1, stride, D].
...
...
@@ -494,8 +594,8 @@ class LayerNorm(object):
# Compute layer normalization using the batch_normalization function.
variance_epsilon
=
1E-12
outputs
=
nn
.
batch_normalization
(
inputs
,
mean
,
variance
,
beta
,
gamma
,
variance_epsilon
)
outputs
=
nn
.
batch_normalization
(
inputs
,
mean
,
variance
,
beta
,
gamma
,
variance_epsilon
)
outputs
.
set_shape
(
inputs_shape
)
return
outputs
...
...
@@ -529,12 +629,13 @@ class Layer(object):
TensorArray object
"""
check
.
Gt
(
self
.
dim
,
0
,
'Cannot create array when dimension is dynamic'
)
tensor_array
=
ta
.
TensorArray
(
dtype
=
tf
.
float32
,
size
=
0
,
dynamic_size
=
True
,
clear_after_read
=
False
,
infer_shape
=
False
,
name
=
'%s_array'
%
self
.
name
)
tensor_array
=
ta
.
TensorArray
(
dtype
=
tf
.
float32
,
size
=
0
,
dynamic_size
=
True
,
clear_after_read
=
False
,
infer_shape
=
False
,
name
=
'%s_array'
%
self
.
name
)
# Start each array with all zeros. Special values will still be learned via
# the extra embedding dimension stored for each linked feature channel.
...
...
@@ -588,9 +689,6 @@ def maybe_apply_dropout(inputs, keep_prob, per_sequence, stride=None):
shape of |inputs|, containing the masked or original inputs, depending on
whether dropout was actually performed.
"""
check
.
Ge
(
inputs
.
get_shape
().
ndims
,
2
,
'inputs must be rank 2 or 3'
)
check
.
Le
(
inputs
.
get_shape
().
ndims
,
3
,
'inputs must be rank 2 or 3'
)
flat
=
(
inputs
.
get_shape
().
ndims
==
2
)
if
keep_prob
>=
1.0
:
return
inputs
...
...
@@ -598,6 +696,11 @@ def maybe_apply_dropout(inputs, keep_prob, per_sequence, stride=None):
if
not
per_sequence
:
return
tf
.
nn
.
dropout
(
inputs
,
keep_prob
)
# We only check the dims if we are applying per-sequence dropout
check
.
Ge
(
inputs
.
get_shape
().
ndims
,
2
,
'inputs must be rank 2 or 3'
)
check
.
Le
(
inputs
.
get_shape
().
ndims
,
3
,
'inputs must be rank 2 or 3'
)
flat
=
(
inputs
.
get_shape
().
ndims
==
2
)
check
.
NotNone
(
stride
,
'per-sequence dropout requires stride'
)
dim
=
inputs
.
get_shape
().
as_list
()[
-
1
]
check
.
NotNone
(
dim
,
'inputs must have static activation dimension, but have '
...
...
@@ -629,7 +732,7 @@ class NetworkUnitInterface(object):
layers (list): List of Layer objects to track network layers that should
be written to Tensors during training and inference.
"""
__metaclass__
=
abc
.
ABCMeta
# required for @abstractmethod
__metaclass__
=
abc
.
ABCMeta
# required for @
abc.
abstractmethod
def
__init__
(
self
,
component
,
init_layers
=
None
,
init_context_layers
=
None
):
"""Initializes parameters for embedding matrices.
...
...
@@ -692,8 +795,8 @@ class NetworkUnitInterface(object):
# Compute the cumulative dimension of all inputs. If any input has dynamic
# dimension, then the result is -1.
input_dims
=
(
self
.
_fixed_feature_dims
.
values
()
+
self
.
_linked_feature_dims
.
values
())
input_dims
=
(
self
.
_fixed_feature_dims
.
values
()
+
self
.
_linked_feature_dims
.
values
())
if
any
(
x
<
0
for
x
in
input_dims
):
self
.
_concatenated_input_dim
=
-
1
else
:
...
...
@@ -844,8 +947,7 @@ class NetworkUnitInterface(object):
tf
.
reduce_sum
(
tf
.
multiply
(
h_tensor
,
tf
.
reshape
(
p_vec
,
[
-
1
,
1
]),
name
=
'time_together2'
),
0
),
0
)
0
),
0
)
return
tf
.
matmul
(
r_vec
,
self
.
_component
.
get_variable
(
'attention_weights_pu'
),
...
...
@@ -908,6 +1010,7 @@ class FeedForwardNetwork(NetworkUnitInterface):
Parameters used to construct the network:
hidden_layer_sizes: comma-separated list of ints, indicating the
number of hidden units in each hidden layer.
omit_logits (False): Whether to elide the logits layer.
layer_norm_input (False): Whether or not to apply layer normalization
on the concatenated input to the network.
layer_norm_hidden (False): Whether or not to apply layer normalization
...
...
@@ -928,21 +1031,24 @@ class FeedForwardNetwork(NetworkUnitInterface):
when the |dropout_keep_prob| parameter is negative.
"""
self
.
_attrs
=
get_attrs_with_defaults
(
component
.
spec
.
network_unit
.
parameters
,
defaults
=
{
component
.
spec
.
network_unit
.
parameters
,
defaults
=
{
'hidden_layer_sizes'
:
''
,
'omit_logits'
:
False
,
'layer_norm_input'
:
False
,
'layer_norm_hidden'
:
False
,
'nonlinearity'
:
'relu'
,
'dropout_keep_prob'
:
-
1.0
,
'dropout_per_sequence'
:
False
,
'dropout_all_layers'
:
False
})
'dropout_all_layers'
:
False
})
# Initialize the hidden layer sizes before running the base initializer, as
# the base initializer may need to know the size of
of
the hidden layer for
# the base initializer may need to know the size of the hidden layer for
# recurrent connections.
self
.
_hidden_layer_sizes
=
(
map
(
int
,
self
.
_attrs
[
'hidden_layer_sizes'
].
split
(
','
))
if
self
.
_attrs
[
'hidden_layer_sizes'
]
else
[])
self
.
_hidden_layer_sizes
=
(
map
(
int
,
self
.
_attrs
[
'hidden_layer_sizes'
].
split
(
','
))
if
self
.
_attrs
[
'hidden_layer_sizes'
]
else
[])
super
(
FeedForwardNetwork
,
self
).
__init__
(
component
)
# Infer dropout rate from network parameters and grid hyperparameters.
...
...
@@ -960,9 +1066,8 @@ class FeedForwardNetwork(NetworkUnitInterface):
self
.
_params
.
extend
(
self
.
_layer_norm_input
.
params
)
if
self
.
_attrs
[
'layer_norm_hidden'
]:
self
.
_layer_norm_hidden
=
LayerNorm
(
self
.
_component
,
'layer_0'
,
self
.
_hidden_layer_sizes
[
0
],
tf
.
float32
)
self
.
_layer_norm_hidden
=
LayerNorm
(
self
.
_component
,
'layer_0'
,
self
.
_hidden_layer_sizes
[
0
],
tf
.
float32
)
self
.
_params
.
extend
(
self
.
_layer_norm_hidden
.
params
)
# Extract nonlinearity from |tf.nn|.
...
...
@@ -984,13 +1089,11 @@ class FeedForwardNetwork(NetworkUnitInterface):
self
.
_params
.
append
(
tf
.
get_variable
(
'bias_%d'
%
index
,
[
hidden_layer_size
],
initializer
=
tf
.
constant_initializer
(
0.2
,
dtype
=
tf
.
float32
)))
initializer
=
tf
.
constant_initializer
(
0.2
,
dtype
=
tf
.
float32
)))
self
.
_weights
.
append
(
weights
)
self
.
_layers
.
append
(
Layer
(
component
,
name
=
'layer_%d'
%
index
,
dim
=
hidden_layer_size
))
Layer
(
component
,
name
=
'layer_%d'
%
index
,
dim
=
hidden_layer_size
))
last_layer_dim
=
hidden_layer_size
# Add a convenience alias for the last hidden layer, if any.
...
...
@@ -1000,7 +1103,7 @@ class FeedForwardNetwork(NetworkUnitInterface):
# By default, regularize only the weights.
self
.
_regularized_weights
.
extend
(
self
.
_weights
)
if
component
.
num_actions
:
if
component
.
num_actions
and
not
self
.
_attrs
[
'omit_logits'
]
:
self
.
_params
.
append
(
tf
.
get_variable
(
'weights_softmax'
,
[
last_layer_dim
,
component
.
num_actions
],
...
...
@@ -1010,8 +1113,7 @@ class FeedForwardNetwork(NetworkUnitInterface):
'bias_softmax'
,
[
component
.
num_actions
],
initializer
=
tf
.
zeros_initializer
()))
self
.
_layers
.
append
(
Layer
(
component
,
name
=
'logits'
,
dim
=
component
.
num_actions
))
Layer
(
component
,
name
=
'logits'
,
dim
=
component
.
num_actions
))
def
create
(
self
,
fixed_embeddings
,
...
...
@@ -1078,10 +1180,8 @@ class FeedForwardNetwork(NetworkUnitInterface):
      return self._hidden_layer_sizes[-1]

    if not layer_name.startswith('layer_'):
      logging.fatal('Invalid layer name: "%s" Can only retrieve from "logits", '
                    '"last_layer", and "layer_*".', layer_name)
# NOTE(danielandor): Since get_layer_size is called before the
# model has been built, we compute the layer size directly from
...
...
@@ -1157,7 +1257,8 @@ class LSTMNetwork(NetworkUnitInterface):
    self._params.extend([
        self._x2i, self._h2i, self._c2i, self._bi, self._x2o, self._h2o,
        self._c2o, self._bo, self._x2c, self._h2c, self._bc
    ])

    lstm_h_layer = Layer(component, name='lstm_h', dim=self._hidden_layer_sizes)
    lstm_c_layer = Layer(component, name='lstm_c', dim=self._hidden_layer_sizes)
...
...
@@ -1168,20 +1269,20 @@ class LSTMNetwork(NetworkUnitInterface):
    self._layers.extend(self._context_layers)
    self._layers.append(
        Layer(component, name='layer_0', dim=self._hidden_layer_sizes))

    self.params.append(
        tf.get_variable(
            'weights_softmax',
            [self._hidden_layer_sizes, component.num_actions],
            initializer=tf.random_normal_initializer(stddev=1e-4)))
    self.params.append(
        tf.get_variable(
            'bias_softmax', [component.num_actions],
            initializer=tf.zeros_initializer()))
    self._layers.append(
        Layer(component, name='logits', dim=component.num_actions))

  def create(self,
             fixed_embeddings,
...
...
@@ -1215,6 +1316,13 @@ class LSTMNetwork(NetworkUnitInterface):
    i_h_tm1 = context_tensor_arrays[0].read(length - 1)
    i_c_tm1 = context_tensor_arrays[1].read(length - 1)

    # label c and h inputs
    i_c_tm1 = tf.identity(i_c_tm1, name='lstm_c_in')
    i_h_tm1 = tf.identity(i_h_tm1, name='lstm_h_in')

    # label the feature input (for debugging purposes)
    input_tensor = tf.identity(input_tensor, name='input_tensor')

    # apply dropout according to http://arxiv.org/pdf/1409.2329v5.pdf
    if during_training and self._input_dropout_rate < 1:
      input_tensor = tf.nn.dropout(input_tensor, self._input_dropout_rate)
...
...
@@ -1251,7 +1359,8 @@ class LSTMNetwork(NetworkUnitInterface):
    h = tf.identity(ht, name='layer_0')
    logits = tf.nn.xw_plus_b(ht,
                             tf.get_variable('weights_softmax'),
                             tf.get_variable('bias_softmax'))
    if self._component.spec.attention_component:
...
...
@@ -1284,7 +1393,7 @@ class ConvNetwork(NetworkUnitInterface):
widths: comma separated list of ints, number of steps input to the
convolutional kernel at every layer.
depths: comma separated list of ints, number of channels input to the
  convolutional kernel at every layer except the first.
output_embedding_dim: int, number of output channels for the convolutional
kernel of the last layer, which receives no ReLU activation and
therefore can be used in a softmax output. If zero, this final
...
...
@@ -1298,6 +1407,13 @@ class ConvNetwork(NetworkUnitInterface):
sequence, instead of once per step. See Gal and Ghahramani
(https://arxiv.org/abs/1512.05287).
Raises:
RuntimeError: if the number of widths is not equal to the number of
depths - 1.
The input depth of the first layer is inferred from the total concatenated
size of the input features.
Hyperparameters used:
dropout_rate: The probability that an input is not dropped. Only used
when the |dropout_keep_prob| parameter is negative.
...
...
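Aside: a sketch of the widths/depths bookkeeping described in the docstring above, with made-up dimensions (not DRAGNN code). The depths list starts from the inferred input dimension, optionally ends with output_embedding_dim, and must come out one longer than widths; each kernel maps depth[i] to depth[i+1] channels over a 1xW window:

def conv_kernel_shapes(input_dim, widths, depths, output_embedding_dim=0):
  all_depths = [input_dim] + list(depths)
  if output_embedding_dim:
    all_depths.append(output_embedding_dim)
  if len(widths) != len(all_depths) - 1:
    raise RuntimeError('depths (plus output) should equal widths + 1')
  # Each kernel is a 1xW convolution from depth[i] to depth[i+1] channels.
  return [[1, w, all_depths[i], all_depths[i + 1]]
          for i, w in enumerate(widths)]

# Two hidden conv layers plus a linear output layer of 45 channels.
print(conv_kernel_shapes(input_dim=48, widths=[7, 3, 1],
                         depths=[128, 64], output_embedding_dim=45))
# [[1, 7, 48, 128], [1, 3, 128, 64], [1, 1, 64, 45]]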
@@ -1305,21 +1421,34 @@ class ConvNetwork(NetworkUnitInterface):
    super(ConvNetwork, self).__init__(component)
    self._attrs = get_attrs_with_defaults(
        component.spec.network_unit.parameters,
        defaults={
            'widths': '',
            'depths': '',
            'output_embedding_dim': 0,
            'nonlinearity': 'relu',
            'dropout_keep_prob': -1.0,
            'dropout_per_sequence': False
        })

    self._weights = []
    self._biases = []
    self._widths = map(int, self._attrs['widths'].split(','))
    self._depths = [self._concatenated_input_dim]
    # Since we infer the input dimension, depths could be empty.
    if self._attrs['depths']:
      self._depths.extend(map(int, self._attrs['depths'].split(',')))
    self._output_dim = self._attrs['output_embedding_dim']
    if self._output_dim:
      self._depths.append(self._output_dim)
    if len(self._widths) != len(self._depths) - 1:
      raise RuntimeError(
          'Unmatched widths/depths: %d/%d (depths should equal widths + 1)' %
          (len(self._widths), len(self._depths)))
    self.kernel_shapes = []
    for i in range(len(self._depths) - 1):
      self.kernel_shapes.append(
...
...
@@ -1350,10 +1479,9 @@ class ConvNetwork(NetworkUnitInterface):
    self._params.extend(self._weights + self._biases)
    self._layers.append(
        Layer(component, name='conv_output', dim=self._depths[-1]))
    self._regularized_weights.extend(
        self._weights[:-1] if self._output_dim else self._weights)

  def create(self,
             fixed_embeddings,
...
...
@@ -1365,7 +1493,7 @@ class ConvNetwork(NetworkUnitInterface):
"""Requires |stride|; otherwise see base class."""
    if stride is None:
      raise RuntimeError("ConvNetwork needs 'stride' and must be called in the "
                         'bulk feature extractor component.')

    input_tensor = get_input_tensor_with_stride(fixed_embeddings,
                                                linked_embeddings, stride)
...
...
@@ -1388,8 +1516,253 @@ class ConvNetwork(NetworkUnitInterface):
        if i < (len(self._weights) - 1) or not self._output_dim:
          conv = self._nonlinearity(conv, name=scope.name)
    return [
        tf.reshape(conv, [-1, self._depths[-1]], name='reshape_activations')
    ]

  def _maybe_apply_dropout(self, inputs, stride):
    # The |inputs| are rank 4 (one 1xN "image" per sequence). Squeeze out and
    # restore the singleton image height, so dropout is applied to the normal
    # rank 3 batched input tensor.
    inputs = tf.squeeze(inputs, [1])
    inputs = maybe_apply_dropout(inputs, self._dropout_rate,
                                 self._attrs['dropout_per_sequence'], stride)
    inputs = tf.expand_dims(inputs, 1)
    return inputs


class ConvMultiNetwork(NetworkUnitInterface):
  """Implementation of a convolutional feed forward net with a side tower."""

  def __init__(self, component):
    """Initializes kernels and biases for this convolutional net.

    Args:
      component: parent ComponentBuilderBase object.

    Parameters used to construct the network:
      widths: comma separated list of ints, number of steps input to the
        convolutional kernel at every layer.
      depths: comma separated list of ints, number of channels input to the
        convolutional kernel at every layer except the first.
      output_embedding_dim: int, number of output channels for the
        convolutional kernel of the last layer, which receives no ReLU
        activation and therefore can be used in a softmax output. If zero,
        this final layer is disabled entirely.
      side_tower_index: An int representing the layer of the tower that the
        side tower will start from. 0 is the input data and 'num_layers'
        is the output.
      side_tower_widths: comma separated list of ints, number of steps input to
        the convolutional kernel at every layer of the side tower.
      side_tower_depths: comma separated list of ints, number of channels input
        to the convolutional kernel at every layer of the side tower save
        the first.
      side_tower_output_embedding_dim: int, number of output channels for the
        kernel of the last layer, which receives no ReLU activation and
        therefore can be used in a softmax output. If zero, this final
        layer is disabled entirely.
      nonlinearity ('relu'): Name of function from module "tf.nn" to apply to
        each hidden layer; e.g., "relu" or "elu".
      dropout_keep_prob (-1.0): The probability that an input is not dropped.
        If >= 1.0, disables dropout. If < 0.0, uses the global |dropout_rate|
        hyperparameter.
      dropout_per_sequence (False): If true, sample the dropout mask once per
        sequence, instead of once per step. See Gal and Ghahramani
        (https://arxiv.org/abs/1512.05287).

    Raises:
      RuntimeError: if the number of widths is not equal to the number of
        depths - 1.

    The input depth of the first layer is inferred from the total concatenated
    size of the input features.

    Hyperparameters used:
      dropout_rate: The probability that an input is not dropped. Only used
        when the |dropout_keep_prob| parameter is negative.
    """
    super(ConvMultiNetwork, self).__init__(component)
    self._attrs = get_attrs_with_defaults(
        component.spec.network_unit.parameters,
        defaults={
            'widths': '',
            'depths': '',
            'output_embedding_dim': 0,
            'side_tower_index': 0,
            'side_tower_widths': '',
            'side_tower_depths': '',
            'side_tower_output_embedding_dim': 0,
            'nonlinearity': 'relu',
            'dropout_keep_prob': -1.0,
            'dropout_per_sequence': False
        })

    # Examine the widths and depths for the primary tower.
    self._weights = []
    self._biases = []
    self._widths = map(int, self._attrs['widths'].split(','))
    self._depths = [self._concatenated_input_dim]
    # Since we infer the input dimension, depths could be empty.
    if self._attrs['depths']:
      self._depths.extend(map(int, self._attrs['depths'].split(',')))
    self._output_dim = self._attrs['output_embedding_dim']
    if self._output_dim:
      self._depths.append(self._output_dim)
    if len(self._widths) != len(self._depths) - 1:
      raise RuntimeError(
          'Unmatched widths/depths: %d/%d (depths should equal widths + 1)' %
          (len(self._widths), len(self._depths)))

    # Create the kernels for the primary tower.
    self.kernel_shapes = []
    for i in range(len(self._depths) - 1):
      self.kernel_shapes.append(
          [1, self._widths[i], self._depths[i], self._depths[i + 1]])
    for i in range(len(self._depths) - 1):
      with tf.variable_scope('conv%d' % i):
        self._weights.append(
            tf.get_variable(
                'weights',
                self.kernel_shapes[i],
                initializer=tf.random_normal_initializer(stddev=1e-4),
                dtype=tf.float32))
        bias_init = 0.0 if (i == len(self._widths) - 1) else 0.2
        self._biases.append(
            tf.get_variable(
                'biases',
                self.kernel_shapes[i][-1],
                initializer=tf.constant_initializer(bias_init),
                dtype=tf.float32))

    # Examine the widths and depths for the side tower.
    self._side_index = self._attrs['side_tower_index']
    self._side_weights = []
    self._side_biases = []
    self._side_widths = map(int, self._attrs['side_tower_widths'].split(','))
    self._side_depths = [self._depths[self._side_index]]
    # Since we infer the input dimension, depths could be empty.
    if self._attrs['side_tower_depths']:
      self._side_depths.extend(
          map(int, self._attrs['side_tower_depths'].split(',')))
    self._side_output_dim = self._attrs['side_tower_output_embedding_dim']
    if self._side_output_dim:
      self._depths.append(self._side_output_dim)
    if len(self._side_widths) != len(self._side_depths) - 1:
      raise RuntimeError(
          'Unmatched widths/depths: %d/%d (depths should equal widths + 1)' %
          (len(self._side_widths), len(self._side_depths)))

    # Create the kernels for the side tower, if there is more than one layer.
    self.side_kernel_shapes = []
    for i in range(len(self._side_depths) - 1):
      self.side_kernel_shapes.append([
          1, self._side_widths[i], self._side_depths[i],
          self._side_depths[i + 1]
      ])
    for i in range(len(self._side_depths) - 1):
      with tf.variable_scope('side_conv%d' % i):
        self._side_weights.append(
            tf.get_variable(
                'weights',
                self.side_kernel_shapes[i],
                initializer=tf.random_normal_initializer(stddev=1e-4),
                dtype=tf.float32))
        bias_init = 0.0 if (i == len(self._side_widths) - 1) else 0.2
        self._side_biases.append(
            tf.get_variable(
                'biases',
                self.side_kernel_shapes[i][-1],
                initializer=tf.constant_initializer(bias_init),
                dtype=tf.float32))

    # Extract nonlinearity from |tf.nn|.
    self._nonlinearity = getattr(tf.nn, self._attrs['nonlinearity'])

    # Infer dropout rate from network parameters and grid hyperparameters.
    self._dropout_rate = self._attrs['dropout_keep_prob']
    if self._dropout_rate < 0.0:
      self._dropout_rate = component.master.hyperparams.dropout_rate

    self._params.extend(
        self._weights + self._biases + self._side_weights + self._side_biases)

    # Append primary tower layers to the data structure.
    self._layers.append(
        Layer(component, name='conv_output', dim=self._depths[-1]))
    if self._output_dim:
      self._regularized_weights.extend(self._weights[:-1])
    else:
      self._regularized_weights.extend(self._weights)

    # Append side tower layers to the data structure.
    self._layers.append(
        Layer(component, name='conv_side_output', dim=self._side_depths[-1]))
    if self._side_output_dim:
      self._regularized_weights.extend(self._side_weights[:-1])
    else:
      self._regularized_weights.extend(self._side_weights)

  def create(self,
             fixed_embeddings,
             linked_embeddings,
             context_tensor_arrays,
             attention_tensor,
             during_training,
             stride=None):
    """Requires |stride|; otherwise see base class."""
    if stride is None:
      raise RuntimeError("ConvNetwork needs 'stride' and must be called in the "
                         'bulk feature extractor component.')

    input_tensor = get_input_tensor_with_stride(fixed_embeddings,
                                                linked_embeddings, stride)

    # TODO(googleuser): Add context and attention.
    del context_tensor_arrays, attention_tensor

    # On CPU, add a dimension so that the 'image' has shape
    # [stride, 1, num_steps, D].
    conv = tf.expand_dims(input_tensor, 1)
    for i in range(len(self._depths) - 1):
      if i == self._side_index:
        logging.info('Creating side tower at index %d', i)
        side_conv = conv
        for j in range(len(self._side_depths) - 1):
          with tf.variable_scope('side_conv%d' % j, reuse=True) as scope:
            if during_training:
              side_conv.set_shape([None, 1, None, self._side_depths[j]])
              side_conv = self._maybe_apply_dropout(side_conv, stride)
            side_conv = tf.nn.conv2d(
                side_conv,
                self._component.get_variable('weights'), [1, 1, 1, 1],
                padding='SAME')
            side_conv = tf.nn.bias_add(side_conv,
                                       self._component.get_variable('biases'))
            if j < (len(self._side_weights) - 1) or not self._side_output_dim:
              side_conv = self._nonlinearity(side_conv, name=scope.name)

      with tf.variable_scope('conv%d' % i, reuse=True) as scope:
        if during_training:
          conv.set_shape([None, 1, None, self._depths[i]])
          conv = self._maybe_apply_dropout(conv, stride)
        conv = tf.nn.conv2d(
            conv,
            self._component.get_variable('weights'), [1, 1, 1, 1],
            padding='SAME')
        conv = tf.nn.bias_add(conv, self._component.get_variable('biases'))
        if i < (len(self._weights) - 1) or not self._output_dim:
          conv = self._nonlinearity(conv, name=scope.name)
    return [
        tf.reshape(conv, [-1, self._depths[-1]], name='reshape_activations'),
        tf.reshape(
            side_conv, [-1, self._side_depths[-1]],
            name='reshape_side_activations'),
    ]

  def _maybe_apply_dropout(self, inputs, stride):
...
...
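Aside: the side tower in ConvMultiNetwork above branches off the primary tower's activations at side_tower_index and runs its own stack of convolutions. A toy sketch of that control flow with placeholder layer functions (not DRAGNN code):

def run_towers(x, main_layers, side_layers, side_index):
  """Applies main_layers to x; side_layers branch off at side_index."""
  side_out = None
  for i, layer in enumerate(main_layers):
    if i == side_index:
      # The side tower consumes the activations *before* main layer i.
      side_out = x
      for side_layer in side_layers:
        side_out = side_layer(side_out)
    x = layer(x)
  return x, side_out

main, side = run_towers(
    x=1.0,
    main_layers=[lambda v: v + 10, lambda v: v * 2],
    side_layers=[lambda v: v - 1],
    side_index=1)
assert (main, side) == (22.0, 10.0)  # side tower saw the output of layer 0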
@@ -1406,20 +1779,17 @@ class ConvNetwork(NetworkUnitInterface):
class PairwiseConvNetwork(NetworkUnitInterface):
  """Implementation of a pairwise 2D convolutional feed forward network.

  For two sequences of representations of N tokens, all N^2 pairs of
  concatenated input features are constructed. If each input vector is of
  length D, then the sequence is represented by an image of dimensions [N, N]
  with 2*D channels per pixel. I.e. pixel [i, j] has a representation that is
  the concatenation of the representations of the tokens at i and at j.

  To use this network for graph edge scoring, for instance by using the
  "heads_labels" transition system, the output layer needs to have dimensions
  [N, N*num_labels]. The network takes care of outputting an [N, N*last_dim]
  sized layer, but the user needs to ensure that the output depth equals the
  desired number of output labels.
  """

  def __init__(self, component):
...
...
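Aside: the pairwise "image" described in the docstring above can be pictured in NumPy; this sketch (not DRAGNN code) mirrors the tf.tile/tf.concat pattern used in create(): pixel [i, j] concatenates the source representation of token i with the target representation of token j.

import numpy as np

def pairwise_image(sources, targets):
  """sources: [N, S], targets: [N, T] -> [N, N, S+T] pairwise features."""
  n = sources.shape[0]
  src = np.tile(sources[:, np.newaxis, :], (1, n, 1))  # pixel [i, j] gets source i
  tgt = np.tile(targets[np.newaxis, :, :], (n, 1, 1))  # pixel [i, j] gets target j
  return np.concatenate([src, tgt], axis=-1)

sources = np.array([[1.0], [2.0]])
targets = np.array([[10.0], [20.0]])
image = pairwise_image(sources, targets)
assert image.shape == (2, 2, 2)
assert image[0, 1].tolist() == [1.0, 20.0]  # source 0 paired with target 1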
@@ -1430,62 +1800,98 @@ class PairwiseConvNetwork(NetworkUnitInterface):
      convolutional kernel at every layer.
    widths: comma separated list of ints, number of steps input to the
      convolutional kernel at every layer.
    dropout: comma separated list of floats, dropout keep probability for each
      layer.
    bias_init: comma separated list of floats, constant bias initializer for
      each layer.
    initialization: comma separated list of strings, initialization for each
      layer. See add_var_initialized() for available initialization schemes.
    activation_layers: comma separated list of ints, the id of layers after
      which to apply an activation. *By default, all but the final layer
      will have an activation applied.*
    activation: anything defined in tf.nn.

    To generate a network with M layers, 'depths', 'widths', 'dropout',
    'bias_init' and 'initialization' must be of length M. The input depth of
    the first layer is inferred from the total concatenated size of the input
    features.

    Args:
      component: parent ComponentBuilderBase object.

    Raises:
      RuntimeError: if the lists of dropout, bias_init, initialization, and
        widths do not have equal length, or the number of widths is not
        equal to the number of depths - 1.
    """
    parameters = component.spec.network_unit.parameters

    super(PairwiseConvNetwork, self).__init__(component)
    self._source_dim = self._linked_feature_dims['sources']
    self._target_dim = self._linked_feature_dims['targets']

    # Each input pixel will comprise the concatenation of two tokens, so the
    # input depth is double that for a single token.
    self._depths = [self._source_dim + self._target_dim]
    self._widths = map(int, parameters['widths'].split(','))
    self._num_layers = len(self._widths)

    self._dropout = map(float, parameters['dropout'].split(
        ',')) if parameters['dropout'] else [1.0] * self._num_layers
    self._bias_init = map(float, parameters['bias_init'].split(
        ',')) if parameters['bias_init'] else [0.01] * self._num_layers
    self._initialization = parameters['initialization'].split(
        ',') if parameters['initialization'] else ['xavier'] * self._num_layers

    param_lengths = map(
        len,
        [self._widths, self._dropout, self._bias_init, self._initialization])
    if not all(param_lengths[0] == param_len for param_len in param_lengths):
      raise RuntimeError('Unmatched widths/dropout/bias_init/initialization: ' +
                         '%d/%d/%d/%d' % (param_lengths[0], param_lengths[1],
                                          param_lengths[2], param_lengths[3]))

    self._depths.extend(map(int, parameters['depths'].split(',')))
    if len(self._depths) != len(self._widths) + 1:
      raise RuntimeError(
          'Unmatched widths/depths: %d/%d (depths should equal widths + 1)' %
          (len(self._widths), len(self._depths)))

    if parameters['activation']:
      self._activation = parameters['activation']
    else:
      self._activation = 'relu'
    self._activation_fn = getattr(tf.nn, self._activation)

    self._num_labels = self._depths[-1]

    if parameters['activation_layers']:
      self._activation_layers = set(
          map(int, parameters['activation_layers'].split(',')))
    else:
      self._activation_layers = set(range(self._num_layers - 1))

    self._kernel_shapes = []
    for i, width in enumerate(self._widths):
      if self._activation == 'glu' and i in self._activation_layers:
        self._kernel_shapes.append(
            [width, width, self._depths[i], 2 * self._depths[i + 1]])
      else:
        self._kernel_shapes.append(
            [width, width, self._depths[i], self._depths[i + 1]])

    self._weights = []
    self._biases = []
    for i, kernel_shape in enumerate(self._kernel_shapes):
      with tf.variable_scope('conv%d' % i):
        self._weights.append(
            add_var_initialized('weights', kernel_shape,
                                self._initialization[i]))
        self._biases.append(
            tf.get_variable(
                'biases',
                kernel_shape[-1],
                initializer=tf.constant_initializer(self._bias_init[i]),
                dtype=tf.float32))
    self._params.extend(self._weights + self._biases)
...
...
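Aside: the reason the 'glu' branch above doubles the kernel's output channels is that a gated linear unit splits its input channels into a value half and a gate half, so the post-activation depth is half the convolution's output depth. A minimal NumPy sketch of that halving (not DRAGNN code; the real activation is looked up from tf.nn as in the code above):

import numpy as np

def glu(x):
  """Gated linear unit over the last axis: first half * sigmoid(second half)."""
  d = x.shape[-1] // 2
  value, gate = x[..., :d], x[..., d:]
  return value * (1.0 / (1.0 + np.exp(-gate)))

x = np.zeros((4, 8))           # a conv layer emitting 2 * desired_depth channels
assert glu(x).shape == (4, 4)  # gating halves it back to desired_depth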
@@ -1500,34 +1906,46 @@ class PairwiseConvNetwork(NetworkUnitInterface):
             during_training,
             stride=None):
    """Requires |stride|; otherwise see base class."""
    del context_tensor_arrays, attention_tensor  # Unused.
    # TODO(googleuser): Normalize the arguments to create(). 'stride'
    # is unused by the recurrent network units, while 'context_tensor_arrays'
    # and 'attenion_tensor_array' is unused by bulk network units. b/33587044
    if stride is None:
      raise ValueError("PairwiseConvNetwork needs 'stride'")

    sources = lookup_named_tensor('sources', linked_embeddings).tensor
    targets = lookup_named_tensor('targets', linked_embeddings).tensor
    source_tokens = tf.reshape(sources, [stride, -1, 1, self._source_dim])
    target_tokens = tf.reshape(targets, [stride, 1, -1, self._target_dim])

    # sources and targets should have shapes [b, n, 1, s] and [b, 1, n, t],
    # respectively. Since we just reshaped them, we can check that all dims are
    # as expected by checking the one unknown dim, i.e. their num_steps (n) dim.
    sources_shape = tf.shape(source_tokens)
    targets_shape = tf.shape(target_tokens)
    num_steps = sources_shape[1]
    with tf.control_dependencies([
        tf.assert_equal(num_steps, targets_shape[2], name='num_steps_mismatch')
    ]):
      arg1 = tf.tile(source_tokens, tf.stack([1, 1, num_steps, 1]))
      arg2 = tf.tile(target_tokens, tf.stack([1, num_steps, 1, 1]))

    conv = tf.concat([arg1, arg2], 3)
    for i in xrange(self._num_layers):
      with tf.variable_scope('conv%d' % i, reuse=True) as scope:
        if during_training:
          conv = maybe_apply_dropout(conv, self._dropout[i], False)
        conv = tf.nn.conv2d(
            conv,
            self._component.get_variable('weights'), [1, 1, 1, 1],
            padding='SAME')
        conv = tf.nn.bias_add(conv, self._component.get_variable('biases'))
        if i in self._activation_layers:
          conv = self._activation_fn(conv, name=scope.name)
    return [
        tf.reshape(
            conv, [-1, num_steps * self._num_labels],
            name='reshape_activations')
    ]


class ExportFixedFeaturesNetwork(NetworkUnitInterface):
...
...
@@ -1593,7 +2011,7 @@ class SplitNetwork(NetworkUnitInterface):
    for slice_index in xrange(self._num_slices):
      self._layers.append(
          Layer(component, 'slice_%s' % slice_index, self._slice_dim))

  def create(self,
             fixed_embeddings,
...
@@ -1602,5 +2020,103 @@ class SplitNetwork(NetworkUnitInterface):
             attention_tensor,
             during_training,
             stride=None):
    """See base class."""
    input_bnxd = get_input_tensor(fixed_embeddings, linked_embeddings)
    return tf.split(input_bnxd, self._num_slices, axis=1)


class GatherNetwork(NetworkUnitInterface):
"""Network unit that gathers input according to specified step indices.
This can be used to implement a non-trivial linked feature (i.e., where the
link mapping is more complex than 'input.focus'). Extract the step indices
using a BulkFeatureIdExtractorComponentBuilder, and then gather activations
using this network.
Note that the step index -1 is special: gathering it will retrieve a padding
vector, which can be constant (zeros) or trainable.
Parameters:
trainable_padding (False): Whether the padding vector is trainable.
Features:
indices: [B * N, 1] The step indices to gather, local to each batch item.
These are local in the sense that, for each batch item, the step indices
are in the range [-1,N).
All other features are concatenated into a [B * N, D] matrix.
Layers:
outputs: [B * N, D] The first slice of the input.
"""
  def __init__(self, component):
    """Initializes weights and layers.

    Args:
      component: Parent ComponentBuilderBase object.
    """
    super(GatherNetwork, self).__init__(component)
    self._attrs = get_attrs_with_defaults(
        component.spec.network_unit.parameters, {'trainable_padding': False})

    check.In('indices', self._linked_feature_dims,
             'Missing required linked feature')
    check.Eq(self._linked_feature_dims['indices'], 1,
             'Wrong dimension for "indices" feature')

    self._dim = self._concatenated_input_dim - 1  # exclude 'indices'
    self._layers.append(Layer(component, 'outputs', self._dim))

    if self._attrs['trainable_padding']:
      self._params.append(
          tf.get_variable(
              'pre_padding', [1, 1, self._dim],
              initializer=tf.random_normal_initializer(stddev=1e-4),
              dtype=tf.float32))

  def create(self,
             fixed_embeddings,
             linked_embeddings,
             context_tensor_arrays,
             attention_tensor,
             during_training,
             stride=None):
    """Requires |stride|; otherwise see base class."""
    check.NotNone(stride,
                  'BulkBiLSTMNetwork requires "stride" and must be called '
                  'in the bulk feature extractor component.')

    # Extract the batched local step indices.
    local_indices = lookup_named_tensor('indices', linked_embeddings)
    local_indices_bxn = tf.reshape(local_indices.tensor, [stride, -1])
    local_indices_bxn = tf.to_int32(local_indices_bxn)
    num_steps = tf.shape(local_indices_bxn)[1]

    # Collect all other inputs as a batched tensor.
    linked_embeddings = [
        named_tensor for named_tensor in linked_embeddings
        if named_tensor.name != 'indices'
    ]
    inputs_bnxd = get_input_tensor(fixed_embeddings, linked_embeddings)

    # Prepend the padding vector, which may be trainable or constant.
    inputs_bxnxd = tf.reshape(inputs_bnxd, [stride, -1, self._dim])
    if self._attrs['trainable_padding']:
      padding_1x1xd = self._component.get_variable('pre_padding')
      padding_bx1xd = tf.tile(padding_1x1xd, [stride, 1, 1])
    else:
      padding_bx1xd = tf.zeros([stride, 1, self._dim], tf.float32)
    inputs_bxnxd = tf.concat([padding_bx1xd, inputs_bxnxd], 1)
    inputs_bnxd = tf.reshape(inputs_bxnxd, [-1, self._dim])

    # As mentioned above, for each batch item the local step indices are in the
    # range [-1,N). To compensate for batching and padding, the local indices
    # must be progressively offset into "global" indices such that batch item b
    # is in the range [b*(N+1),(b+1)*(N+1)).
    batch_indices_b = tf.range(stride)
    batch_indices_bx1 = tf.expand_dims(batch_indices_b, 1)
    local_to_global_offsets_bx1 = batch_indices_bx1 * (num_steps + 1) + 1
    global_indices_bxn = local_indices_bxn + local_to_global_offsets_bx1
    global_indices_bn = tf.reshape(global_indices_bxn, [-1])

    outputs_bnxd = tf.gather(inputs_bnxd, global_indices_bn)
    return [outputs_bnxd]
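Aside: the padding and index-offset arithmetic in GatherNetwork.create() above, replayed in NumPy for a batch of two items with two steps each (not DRAGNN code):

import numpy as np

features = np.array([[1., 1.], [2., 2.],    # batch item 0, steps 0-1
                     [3., 3.], [4., 4.]])   # batch item 1, steps 0-1
local_indices = np.array([[1, -1],          # item 0 gathers step 1, then padding
                          [0, 0]])          # item 1 gathers step 0 twice
stride, num_steps, dim = 2, 2, 2

# Prepend one padding row per batch item, so each item occupies N+1 rows.
padded = np.concatenate(
    [np.zeros((stride, 1, dim)), features.reshape(stride, num_steps, dim)],
    axis=1).reshape(-1, dim)

# Shift local indices in [-1, N) to global rows in [b*(N+1), (b+1)*(N+1)).
offsets = np.arange(stride)[:, None] * (num_steps + 1) + 1
gathered = padded[(local_indices + offsets).reshape(-1)]
print(gathered)  # [[2. 2.] [0. 0.] [3. 3.] [3. 3.]]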
research/syntaxnet/dragnn/python/network_units_test.py
View file @
4364390a
...
...
@@ -16,16 +16,16 @@
"""Tests for network_units."""
import numpy as np
import tensorflow as tf

from google.protobuf import text_format
from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest

from dragnn.protos import spec_pb2
from dragnn.python import network_units

import dragnn.python.load_dragnn_cc_impl
import syntaxnet.load_parser_ops

FLAGS = tf.app.flags.FLAGS
...
...
@@ -66,6 +66,9 @@ class MockComponent(object):
  def attr(self, name):
    return self._attrs[name]

  def get_variable(self, name):
    return tf.get_variable(name)


class MockMaster(object):
...
...
@@ -77,6 +80,15 @@ class MockMaster(object):
    }


class MockNetwork(object):

  def __init__(self, **dims):
    self._dims = dims

  def get_layer_size(self, name):
    return self._dims[name]


class NetworkUnitsLookupTest(test_util.TensorFlowTestCase):

  def setUp(self):
...
@@ -155,5 +167,256 @@ class GetAttrsWithDefaultsTest(test_util.TensorFlowTestCase):
    _assert_attr_is_true('TRUE')


class GatherNetworkTest(test_util.TensorFlowTestCase):

  def setUp(self):
    # Clear the graph and all existing variables.  Otherwise, variables created
    # in different tests may collide with each other.
    tf.reset_default_graph()

    self._master = MockMaster()
    self._master.spec = spec_pb2.MasterSpec()
    text_format.Parse("""
        component {
          name: 'test'
          backend { registered_name: 'TestComponent' }
          linked_feature {
            name: 'indices'
            fml: 'input.focus'
            size: 1
            embedding_dim: -1
            source_component: 'previous'
            source_translator: 'identity'
            source_layer: 'index_layer'
          }
          linked_feature {
            name: 'features'
            fml: 'input.focus'
            size: 1
            embedding_dim: -1
            source_component: 'previous'
            source_translator: 'identity'
            source_layer: 'feature_layer'
          }
          network_unit {
            registered_name: 'GatherNetwork'
          }
        }
        """, self._master.spec)

    self._component = MockComponent(self._master,
                                    self._master.spec.component[0])
    self._master.lookup_component['previous'].network = MockNetwork(
        index_layer=1, feature_layer=2)
  def testConstantPadding(self):
    with tf.Graph().as_default(), self.test_session():
      with tf.variable_scope('test_scope'):
        network = network_units.GatherNetwork(self._component)

      # Construct a batch of two items with 3 and 2 steps, respectively.
      indices = tf.constant(
          [[1], [2], [0],     # item 1
           [-1], [0], [-1]],  # item 2
          dtype=tf.int64)
      features = tf.constant(
          [[1.0, 1.5], [2.0, 2.5], [3.0, 3.5],   # item 1
           [4.0, 4.5], [5.0, 5.5], [6.0, 6.5]],  # item 2
          dtype=tf.float32)

      fixed_embeddings = []
      linked_embeddings = [
          network_units.NamedTensor(indices, 'indices', 1),
          network_units.NamedTensor(features, 'features', 2)
      ]

      with tf.variable_scope('test_scope', reuse=True):
        outputs = network.create(fixed_embeddings, linked_embeddings, None,
                                 None, True, 2)
      gathered = outputs[0]

      # Zeros will be substituted for index -1.
      self.assertAllEqual(
          gathered.eval(),
          [[2.0, 2.5],   # gathered from 1
           [3.0, 3.5],   # gathered from 2
           [1.0, 1.5],   # gathered from 0
           [0.0, 0.0],   # gathered from -1
           [4.0, 4.5],   # gathered from 0
           [0.0, 0.0]])  # gathered from -1
  def testTrainablePadding(self):
    self._component.spec.network_unit.parameters['trainable_padding'] = 'true'
    with tf.Graph().as_default(), self.test_session():
      with tf.variable_scope('test_scope'):
        network = network_units.GatherNetwork(self._component)

      # Construct a batch of two items with 3 and 2 steps, respectively.
      indices = tf.constant(
          [[1], [2], [0],     # item 1
           [-1], [0], [-1]],  # item 2
          dtype=tf.int64)
      features = tf.constant(
          [[1.0, 1.5], [2.0, 2.5], [3.0, 3.5],   # item 1
           [4.0, 4.5], [5.0, 5.5], [6.0, 6.5]],  # item 2
          dtype=tf.float32)

      fixed_embeddings = []
      linked_embeddings = [
          network_units.NamedTensor(indices, 'indices', 1),
          network_units.NamedTensor(features, 'features', 2)
      ]

      with tf.variable_scope('test_scope', reuse=True):
        outputs = network.create(fixed_embeddings, linked_embeddings, None,
                                 None, True, 2)
      gathered = outputs[0]

      # Ensure that the padding variable is initialized.
      tf.global_variables_initializer().run()

      # Randomly-initialized padding will be substituted for index -1.
      self.assertAllEqual(gathered[0].eval(), [2.0, 2.5])  # gathered from 1
      self.assertAllEqual(gathered[1].eval(), [3.0, 3.5])  # gathered from 2
      self.assertAllEqual(gathered[2].eval(), [1.0, 1.5])  # gathered from 0
      tf.logging.info('padding = %s', gathered[3].eval())  # gathered from -1
      self.assertAllEqual(gathered[4].eval(), [4.0, 4.5])  # gathered from 0
      tf.logging.info('padding = %s', gathered[5].eval())  # gathered from -1

      # Though random, the padding must be identical.
      self.assertAllEqual(gathered[3].eval(), gathered[5].eval())
class IdentityInitializerTest(test_util.TensorFlowTestCase):

  def IdentityInitializerHelper(self, shape, expected, divisor=1.0, std=1e-4):
"""Tests identity initialization by comparing expected to actual array.
Tests the given expected array against the result of calling
network_units.add_var_initialized() with the given params and
init_type='identity'.
Args:
shape: shape of the array
expected: expected contents of the array to initialize
divisor: numerator for identity initialization where the last two dims
of the array are not equal; should divide both of the last two dims
std: standard deviation for random normal samples
"""
    with tf.Graph().as_default(), self.test_session() as session:
      np.random.seed(4)
      tensor = network_units.add_var_initialized(
          'tensor', shape, 'identity', divisor=divisor, stddev=std)
      session.run(tf.global_variables_initializer())
      actual = session.run(tensor)
    self.assertAllClose(actual, expected, 1e-8, 1e-8)
  def IdentityInitializerSquareHelper(self, shape, middles):
"""Tests identity initialization when last two dims are equal.
When the last two dims of the array are equal, identity initialization
should simply set the center matrix in the last two dimensions to the
identity, with all other entries set to zero.
Args:
shape: shape of the array to initialize
middles: indices into the middle of all axes except the last two. It
must be the case that len(middles) == len(shape) - 2.
"""
    expected = np.zeros(shape, dtype='float32')
    expected[[[m] for m in middles]] = np.eye(shape[-1])
    self.IdentityInitializerHelper(shape, expected)
  def testIdentityInitializerSquareRank2(self):
    shape = (3, 3)
    expected = np.eye(shape[-1]).astype('float32')
    self.IdentityInitializerHelper(shape, expected)

  def testIdentityInitializerSquareRank3(self):
    shape = (2, 4, 4)
    middles = [1]
    self.IdentityInitializerSquareHelper(shape, middles)

  def testIdentityInitializerSquareRank4(self):
    shape = (2, 3, 4, 4)
    middles = [1, 1]
    self.IdentityInitializerSquareHelper(shape, middles)

  def testIdentityInitializerSquareRank5(self):
    shape = (2, 3, 4, 5, 5)
    middles = [1, 1, 2]
    self.IdentityInitializerSquareHelper(shape, middles)
  def testIdentityInitializerNonSquareRank2FirstDimLarger(self):
    divisor = 3.
    std = 1e-3
    shape = (6, 3)
    m = divisor / shape[-1]
    expected = [[m, 4.99951362e-04, -9.95908980e-04],
                [m, -4.18301526e-04, -1.58457726e-03],
                [-6.47706795e-04, m, 3.32250027e-04],
                [-1.14747661e-03, m, -8.79869258e-05],
                [4.25072387e-04, 3.32253141e-04, m],
                [3.50997143e-04, -6.06887275e-04, m]]
    self.IdentityInitializerHelper(shape, expected, divisor, std)

  def testIdentityInitializerNonSquareRank2FirstDimSmaller(self):
    divisor = 2.
    std = 1e-3
    shape = (2, 4)
    m = divisor / shape[-1]
    expected = [[m, m, -9.95908980e-04, 6.93598529e-04],
                [-4.18301526e-04, -1.58457726e-03, m, m]]
    self.IdentityInitializerHelper(shape, expected, divisor, std)

  def testIdentityInitializerNonSquareRank3(self):
    divisor = 2.
    std = 1e-3
    shape = (2, 2, 6)
    m = divisor / shape[-1]
    expected = [[[5.05617063e-05, 4.99951362e-04, -9.95908980e-04,
                  6.93598529e-04, -4.18301526e-04, -1.58457726e-03],
                 [-6.47706795e-04, 5.98575163e-04, 3.32250027e-04,
                  -1.14747661e-03, 6.18669670e-04, -8.79869258e-05]],
                [[m, m, m, 3.50997143e-04, -6.06887275e-04, 1.54697930e-03],
                 [7.23341596e-04, 4.61355667e-05, -9.82991653e-04, m, m, m]]]
    self.IdentityInitializerHelper(shape, expected, divisor, std)

  def testIdentityInitializerNonSquareRank4(self):
    divisor = 2.
    std = 1e-3
    shape = (2, 3, 2, 8)
    m = divisor / float(shape[-1])
    expected = [
        [[[5.05617063e-05, 4.99951362e-04, -9.95908980e-04, 6.93598529e-04,
           -4.18301526e-04, -1.58457726e-03, -6.47706795e-04, 5.98575163e-04],
          [3.32250027e-04, -1.14747661e-03, 6.18669670e-04, -8.79869258e-05,
           4.25072387e-04, 3.32253141e-04, -1.15681626e-03, 3.50997143e-04]],
         [[-6.06887275e-04, 1.54697930e-03, 7.23341596e-04, 4.61355667e-05,
           -9.82991653e-04, 5.44327377e-05, 1.59892938e-04, -1.20894820e-03],
          [2.22336012e-03, 3.94295203e-04, 1.69235771e-03, -1.11281220e-03,
           1.63574750e-03, -1.36096554e-03, -6.51225855e-04, 5.42451337e-04]],
         [[4.80062481e-05, -2.35807360e-03, -1.10558409e-03, 8.37836356e-04,
           2.08787085e-03, 9.14840959e-04, -2.76203355e-04, 7.96511886e-04],
          [-1.14379858e-03, 5.09919773e-04, -1.34746032e-03, -9.36010019e-06,
           -1.30704633e-04, 8.02086608e-04, -3.02963977e-04, 1.20200263e-03]]],
        [[[-1.96745284e-04, 8.36528721e-04, 7.86602264e-04, -1.84087583e-03,
           3.75474883e-05, 3.59280530e-05, -7.78739923e-04, 1.79410708e-04],
          [-1.45553437e-03, 5.56185201e-04, 5.09778853e-04, 3.00445536e-04,
           2.47658417e-03, 3.52343399e-04, 6.74710027e-05, -7.32264714e-04]],
         [[m, m, m, m, 1.58469542e-04, 1.99008291e-03, 1.16418756e-03,
           2.42660157e-04],
          [1.37992005e-03, -5.45587063e-05, 7.95233937e-04, 1.90899627e-05,
           m, m, m, m]],
         [[-1.09712186e-03, -5.28196048e-04, -2.37977528e-03, -6.07683673e-04,
           -1.07529014e-03, 2.02240516e-03, -5.64875314e-04, -1.54292909e-03],
          [8.70841788e-04, -1.75210531e-04, 4.86030076e-05, 1.88646198e-04,
           2.09313483e-04, -3.74444906e-04, 9.54698597e-04, 5.23247640e-04]]]
    ]
    self.IdentityInitializerHelper(shape, expected, divisor, std)


if __name__ == '__main__':
  googletest.main()
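Aside: the square-shape tests above pin down what identity initialization should produce when the last two dims are equal: the identity matrix in the "center" slice of the leading axes and zeros elsewhere. A NumPy sketch inferred from those expectations (not the add_var_initialized implementation itself):

import numpy as np

def identity_like(shape):
  out = np.zeros(shape, dtype='float32')
  middle = tuple(d // 2 for d in shape[:-2])  # center index of each leading axis
  out[middle] = np.eye(shape[-1])
  return out

w = identity_like((3, 4, 4))
assert np.array_equal(w[1], np.eye(4))    # center slice is the identity
assert not w[0].any() and not w[2].any()  # other slices stay zero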
research/syntaxnet/dragnn/python/perf_test_data/master-spec
0 → 100644
View file @
4364390a
component {
name: "convnet"
transition_system {
registered_name: "shift-only"
parameters {
key: "parser_skip_deterministic"
value: "false"
}
}
resource {
name: "lexifuse-repository"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/lexifuse.lexifuse-repository/repository"
file_format: "repository"
record_format: "entity"
}
}
resource {
name: "brain-parser-model"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.convnet.model-init/brain-parser-model"
file_format: "model"
record_format: ""
}
}
resource {
name: "transition-system-data"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.convnet.model-init/transition-system-data"
file_format: "model"
record_format: ""
}
}
resource {
name: "words-embedding-input"
part {
file_pattern: "/readahead/512M/cns/lg-d/home/saft/corpora/word-embeddings/en/word2vec/1billion/word2vec-embedding-bi-true-32.sst"
file_format: "sstable"
record_format: "dist_belief.TokenEmbedding"
}
}
resource {
name: "words-vocab-input"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.convnet.model-init/vocab"
file_format: "text"
record_format: ""
}
}
resource {
name: "component-builder-module"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.convnet.component-builder-module/module-spec"
file_format: "pbtxt"
record_format: ""
}
}
fixed_feature {
name: "char_ngram"
fml: "input.token.lexifuse-char-ngram"
embedding_dim: 16
vocabulary_size: 16500
size: 1
predicate_map: "hashed"
}
fixed_feature {
name: "words"
fml: "input.word"
embedding_dim: 32
vocabulary_size: 39395
size: 1
predicate_map: "hashed"
}
network_unit {
registered_name: "IdentityNetwork"
}
backend {
registered_name: "ParserComponent"
}
num_actions: 1
attention_component: ""
component_builder {
registered_name: "components.common.dragnn.python.conv_component.ConvComponentBuilder"
parameters {
key: "depths"
value: "48,128"
}
parameters {
key: "output_dims"
value: "45"
}
parameters {
key: "widths"
value: "7"
}
}
training_beam_size: 1
inference_beam_size: 1
}
component {
name: "tagger"
transition_system {
registered_name: "tagger"
parameters {
key: "parser_skip_deterministic"
value: "false"
}
}
resource {
name: "tag-map"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/lexifuse.lexicon/tag-map"
file_format: "text"
record_format: ""
}
}
resource {
name: "lexifuse-repository"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/lexifuse.lexifuse-repository/repository"
file_format: "repository"
record_format: "entity"
}
}
resource {
name: "brain-parser-model"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.tagger.model-init/brain-parser-model"
file_format: "model"
record_format: ""
}
}
resource {
name: "transition-system-data"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.tagger.model-init/transition-system-data"
file_format: "model"
record_format: ""
}
}
resource {
name: "component-builder-module"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.tagger.component-builder-module/module-spec"
file_format: "pbtxt"
record_format: ""
}
}
linked_feature {
name: "convnet"
fml: "input.focus"
embedding_dim: -1
size: 1
source_component: "convnet"
source_translator: "identity"
source_layer: "conv0_logits"
}
network_unit {
registered_name: "IdentityNetwork"
}
backend {
registered_name: "ParserComponent"
}
num_actions: 45
attention_component: ""
component_builder {
registered_name: "bulk_component.BulkAnnotatorComponentBuilder"
}
training_beam_size: 1
inference_beam_size: 1
}
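Aside: a master-spec file like the one above is ordinary protobuf text format; a hedged sketch of loading it into a MasterSpec proto (the relative path is a placeholder):

import tensorflow as tf
from google.protobuf import text_format
from dragnn.protos import spec_pb2

def load_master_spec(path):
  spec = spec_pb2.MasterSpec()
  with tf.gfile.GFile(path) as f:
    text_format.Parse(f.read(), spec)
  return spec

spec = load_master_spec('perf_test_data/master-spec')
print([component.name for component in spec.component])  # ['convnet', 'tagger']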
research/syntaxnet/dragnn/python/perf_test_data/params
0 → 100644
View file @
4364390a
File added
research/syntaxnet/dragnn/python/perf_test_data/sample_docs.pickle
0 → 100644
View file @
4364390a
File added
research/syntaxnet/dragnn/python/render_spec_with_graphviz_test.py
View file @
4364390a
...
...
@@ -28,7 +28,7 @@ from dragnn.python import spec_builder
def _make_basic_master_spec():
  """Constructs a simple spec.

  Modified version of dragnn/tools/parser_trainer.py
Returns:
spec_pb2.MasterSpec instance.
...
...
research/syntaxnet/dragnn/python/sentence_io.py
View file @
4364390a
...
...
@@ -18,21 +18,26 @@ import tensorflow as tf
from syntaxnet.ops import gen_parser_ops


class FormatSentenceReader(object):
  """A reader for formatted files, with optional projectivizing."""

  def __init__(self,
               filepath,
               record_format,
               batch_size=32,
               check_well_formed=False,
               projectivize=False,
               morph_to_pos=False):
    self._graph = tf.Graph()
    self._session = tf.Session(graph=self._graph)
    task_context_str = """
          input {
            name: 'documents'
            record_format: '%s'
            Part {
              file_pattern: '%s'
            }
          }""" % (record_format, filepath)
    if morph_to_pos:
      task_context_str += """
          Parameter {
...
...
@@ -51,7 +56,8 @@ class ConllSentenceReader(object):
    with self._graph.as_default():
      self._source, self._is_last = gen_parser_ops.document_source(
          task_context_str=task_context_str, batch_size=batch_size)
      if check_well_formed:
        self._source = gen_parser_ops.well_formed_filter(self._source)
      if projectivize:
        self._source = gen_parser_ops.projectivize_filter(self._source)
...
...
@@ -77,3 +83,20 @@ class ConllSentenceReader(object):
        break
    tf.logging.info('Read %d sentences.' % len(corpus))
    return corpus


class ConllSentenceReader(FormatSentenceReader):
  """A sentence reader that uses an underlying 'conll-sentence' reader."""

  def __init__(self, filepath, batch_size=32, projectivize=False,
               morph_to_pos=False):
    super(ConllSentenceReader, self).__init__(
        filepath,
        'conll-sentence',
        check_well_formed=True,
        batch_size=batch_size,
        projectivize=projectivize,
        morph_to_pos=morph_to_pos)
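Aside: a hedged usage sketch for the reader classes above (not part of this commit). The corpus path is a placeholder, and the full-corpus read method is assumed to be corpus(), whose tail is shown in the hunk above:

from dragnn.python import sentence_io

# Read a projectivized CoNLL corpus in batches of 32 sentences.
reader = sentence_io.ConllSentenceReader(
    '/tmp/dev.conll', batch_size=32, projectivize=True)
corpus = reader.corpus()  # assumed method name; returns serialized Sentence protos
print('Read %d sentences' % len(corpus))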
research/syntaxnet/dragnn/python/sentence_io_test.py
View file @
4364390a
...
...
@@ -19,16 +19,19 @@ import tensorflow as tf
from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest

from dragnn.python import dragnn_ops
from dragnn.python import sentence_io
from syntaxnet import sentence_pb2

import syntaxnet.load_parser_ops

FLAGS = tf.app.flags.FLAGS


def setUpModule():
  if not hasattr(FLAGS, 'test_srcdir'):
    FLAGS.test_srcdir = ''
  if not hasattr(FLAGS, 'test_tmpdir'):
    FLAGS.test_tmpdir = tf.test.get_temp_dir()


class ConllSentenceReaderTest(test_util.TensorFlowTestCase):
...
...