Unverified commit 80178fc6 authored by Mark Omernick, committed by GitHub

Merge pull request #4153 from terryykoo/master

Export @195097388.
parents a84e1ef9 edea2b67
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TensorFlow ops for maximum spanning tree problems."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import dragnn.python.load_mst_cc_impl
from dragnn.mst.ops import gen_mst_ops
from dragnn.python import digraph_ops
from syntaxnet.util import check
# Re-export the generated MST op.
maximum_spanning_tree = gen_mst_ops.maximum_spanning_tree
@tf.RegisterGradient("MaximumSpanningTree")
def maximum_spanning_tree_gradient(mst_op, d_loss_d_max_scores, *_):
"""Returns a subgradient of the MaximumSpanningTree op.
Note that MaximumSpanningTree is only differentiable w.r.t. its |scores| input
and its |max_scores| output.
Args:
mst_op: The MaximumSpanningTree op being differentiated.
d_loss_d_max_scores: [B] vector where entry b is the gradient of the network
loss w.r.t. entry b of the |max_scores| output of the
|mst_op|.
*_: The gradients w.r.t. the other outputs; ignored.
Returns:
1. None, since the op is not differentiable w.r.t. its |num_nodes| input.
2. [B,M,M] tensor where entry b,t,s is a subgradient of the network loss
w.r.t. entry b,t,s of the |scores| input, with the same dtype as
|d_loss_d_max_scores|.
"""
dtype = d_loss_d_max_scores.dtype.base_dtype
check.NotNone(dtype)
argmax_sources_bxm = mst_op.outputs[1]
input_dim = tf.shape(argmax_sources_bxm)[1] # M in the docstring
# The one-hot argmax is a subgradient of max. Convert the batch of maximal
# spanning trees into 0/1 indicators, then scale them by the relevant output
# gradients from |d_loss_d_max_scores|. Note that |d_loss_d_max_scores| must
# be reshaped in order for it to broadcast across the batch dimension.
indicators_bxmxm = tf.one_hot(argmax_sources_bxm, input_dim, dtype=dtype)
d_loss_d_max_scores_bx1 = tf.expand_dims(d_loss_d_max_scores, -1)
d_loss_d_max_scores_bx1x1 = tf.expand_dims(d_loss_d_max_scores_bx1, -1)
d_loss_d_scores_bxmxm = indicators_bxmxm * d_loss_d_max_scores_bx1x1
return None, d_loss_d_scores_bxmxm
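# Worked example (illustrative; the values match testMaximumSpanningTreeGradient
# in mst_ops_test.py): for a batch item with argmax_sources = [3, 0, 1, 3] and
# d_loss_d_max_scores = 3, the one-hot indicators select the arcs of the
# maximal tree, so the returned subgradient for that item is
#   [[0, 0, 0, 3],
#    [3, 0, 0, 0],
#    [0, 3, 0, 0],
#    [0, 0, 0, 3]]
# i.e., row t carries the upstream gradient in the column of node t's source.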
def log_partition_function(num_nodes,
scores,
forest=False,
max_dynamic_range=None):
r"""Returns the log of the sum-of-product of spanning trees or forests.
Computing the sum-of-product in the log domain reduces the chance of overflow
or underflow, and ML techniques (e.g., CRF loss functions) typically require
the log partition function anyway. For similar reasons, the scores input is
assumed to be specified in the log domain.
The partition function is calculated via application of the Matrix-Tree
theorem; see the following for details:
https://en.wikipedia.org/wiki/Kirchhoff%27s_theorem
http://www.aclweb.org/anthology/D/D07/D07-1015.pdf
Computing the gradient of the log partition function requires inverting the
Laplacian matrix. Numerical issues may occur if the Laplacian is singular or
nearly-so. (Intuitively, the Laplacian will be close to singular when the
input scores strongly favor invalid structures such as cycles). In the EMNLP
paper, we alleviated the numerical issues by clipping the difference between
the minimum and maximum score for each node to 20 (in the log domain). The
|max_dynamic_range| argument can be used for this purpose.
TODO(googleuser): Try improving the condition number of the Laplacian matrix
directly, instead of using the indirect approach above. For example, one
could add c*I to the Laplacian (i.e., Tikhonov regularization).
Args:
num_nodes: [B] vector of graph sizes per batch item.
scores: [B,M,M] tensor of padded batched arc and root scores, in the format
used by the maximum_spanning_tree() op. Padding values must be finite.
forest: If true, sum over spanning forests instead of trees.
max_dynamic_range: If specified, incoming scores for each node are clipped
to at most this far from the maximum such score (in the log domain).
Returns:
[B] vector Z of log partition function values, where
Z[b] = log(
\sum_{tree spanning batch item b}
score(root_of(tree)) \prod_{arc in tree} score(arc))
"""
orig_dtype = scores.dtype.base_dtype
scores_bxmxm = tf.to_double(scores) # use doubles to reduce under/overflow
shape_bxmxm = tf.shape(scores_bxmxm)
batch_size = shape_bxmxm[0]
max_nodes = shape_bxmxm[1]
total_nodes = batch_size * max_nodes
# To eliminate overflow, we locally normalize the scores. Specifically, for
# each node we divide its incoming arc scores and root selection score by the
# maximum such score. Since each node in a tree must select exactly one of
# these scores (i.e., it is either a root or has exactly one incoming arc),
# the local normalization factors are identical for all trees and can thus be
# factored out of the sum over trees.
#
# More concretely, we find the maximum per node, divide all scores for that
# node by the maximum, and then find the partition function of the normalized
# scores. Then we recover the un-normalized partition function by multiplying
# the per-node maxima back in. This final step is performed in the log domain
# to avoid overflow.
#
# Note that underflow is still possible, but unlikely as long as the scores
# are close to feasible (i.e., there is not too much mass on non-trees). The
# |max_dynamic_range| argument can be used to mitigate this.
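# Worked equation (illustrative): if m_t is the maximum incoming score of node
# t and src_T(t) is the source that tree T selects for node t, then
#   log score(T) = \sum_t s[t, src_T(t)]
#                = \sum_t m_t + \sum_t (s[t, src_T(t)] - m_t)
# so the first sum factors out of log \sum_T score(T), and the remaining sum
# involves only locally-normalized (non-positive) scores.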
# Finding the maximum incoming score is difficult, because the batch padding
# may contain arbitrary values. We restrict the maximization to valid arcs
# using tf.unsorted_segment_max() with a specially-constructed set of IDs.
_, valid_tokens_bxm = digraph_ops.ValidArcAndTokenMasks(
num_nodes, max_nodes, dtype=tf.int32)
# Create a tensor of "target IDs". In each row of each sub-matrix, the
# positions of valid source tokens are filled with the 1-origin index of that
# row in the entire batch, and zero elsewhere. For example, given a batch
# with num_nodes=[2, 3] we might have
# [[[1, 1, 0],
# [2, 2, 0],
# [3, 3, 0]],
# [[4, 4, 4],
# [5, 5, 5],
# [6, 6, 6]]]
#
# TODO(googleuser): The dynamic masking is pretty awkward. Find an op that does
# this (I looked, but maybe not hard enough), or write a custom op for this.
valid_tokens_bx1xm = tf.expand_dims(valid_tokens_bxm, 1)
valid_sources_bxmxm = tf.tile(valid_tokens_bx1xm, [1, max_nodes, 1])
sequence_bm = 1 + tf.range(total_nodes, dtype=tf.int32)
sequence_bxmx1 = tf.reshape(sequence_bm, [batch_size, max_nodes, 1])
target_ids_bxmxm = valid_sources_bxmxm * sequence_bxmx1
max_scores_bm1 = tf.unsorted_segment_max(scores_bxmxm, target_ids_bxmxm,
total_nodes + 1)
max_scores_bm = max_scores_bm1[1:] # ID 0 corresponds to padding
# Similar to above, we need to sum over the valid tokens. We analogously use
# tf.unsorted_segment_sum() with a specially-constructed set of "batch IDs".
sequence_b = 1 + tf.range(batch_size, dtype=tf.int32)
sequence_bx1 = tf.expand_dims(sequence_b, 1)
batch_ids_bxm = valid_tokens_bxm * sequence_bx1
batch_ids_bm = tf.reshape(batch_ids_bxm, [-1])
log_normalization_factor_b1 = tf.unsorted_segment_sum(
max_scores_bm, batch_ids_bm, batch_size + 1)
log_normalization_factor_b = log_normalization_factor_b1[1:]
# Locally-normalize and optionally clip the scores.
max_scores_bxmx1 = tf.reshape(max_scores_bm, [batch_size, max_nodes, 1])
scores_bxmxm -= max_scores_bxmx1
if max_dynamic_range is not None:
# After normalization, the scores are non-positive with max=0, so the
# |max_dynamic_range| can be applied directly.
#
# PyLint thinks "-max_dynamic_range" is invalid because it defaults to None.
scores_bxmxm = tf.maximum(scores_bxmxm, -max_dynamic_range)
scores_bxmxm = tf.exp(scores_bxmxm)
# Apply the Matrix-Tree theorem.
exp_normalized_laplacian_bxmxm = digraph_ops.LaplacianMatrix(
num_nodes, scores_bxmxm, forest=forest)
log_normalized_partition_function_b = tf.log(
tf.matrix_determinant(exp_normalized_laplacian_bxmxm))
# Reapply the normalization factor that was divided out.
log_partition_function_b = (
log_normalized_partition_function_b + log_normalization_factor_b)
return tf.cast(log_partition_function_b, orig_dtype)
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for maximum spanning tree ops."""
import math
import numpy as np
import tensorflow as tf
from dragnn.python import mst_ops
class MstOpsTest(tf.test.TestCase):
"""Testing rig."""
def testMaximumSpanningTree(self):
"""Tests that the MST op can recover a simple tree."""
with self.test_session() as session:
# The first batch element prefers 3 as root, then 3->0->1->2, for a total
# score of 4+2+1=7. The second batch element is smaller and has reversed
# scores, so 0 is root and 0->2->1.
num_nodes = tf.constant([4, 3], tf.int32)
scores = tf.constant([[[0, 0, 0, 0],
[1, 0, 0, 0],
[1, 2, 0, 0],
[1, 2, 3, 4]],
[[4, 3, 2, 9],
[0, 0, 2, 9],
[0, 0, 0, 9],
[9, 9, 9, 9]]], tf.int32) # pyformat: disable
mst_outputs = mst_ops.maximum_spanning_tree(
num_nodes, scores, forest=False)
max_scores, argmax_sources = session.run(mst_outputs)
tf.logging.info('\nmax_scores=%s\nargmax_sources=\n%s', max_scores,
argmax_sources)
self.assertAllEqual(max_scores, [7, 6])
self.assertAllEqual(argmax_sources, [[3, 0, 1, 3],
[0, 2, 0, -1]]) # pyformat: disable
def testMaximumSpanningTreeGradient(self):
"""Tests the MST max score gradient."""
with self.test_session() as session:
num_nodes = tf.constant([4, 3], tf.int32)
scores = tf.constant([[[0, 0, 0, 0],
[1, 0, 0, 0],
[1, 2, 0, 0],
[1, 2, 3, 4]],
[[4, 3, 2, 9],
[0, 0, 2, 9],
[0, 0, 0, 9],
[9, 9, 9, 9]]], tf.int32) # pyformat: disable
mst_ops.maximum_spanning_tree(num_nodes, scores, forest=False, name='MST')
mst_op = session.graph.get_operation_by_name('MST')
d_loss_d_max_scores = tf.constant([3, 7], tf.float32)
d_loss_d_num_nodes, d_loss_d_scores = (
mst_ops.maximum_spanning_tree_gradient(mst_op, d_loss_d_max_scores))
# The num_nodes input is non-differentiable.
self.assertTrue(d_loss_d_num_nodes is None)
tf.logging.info('\nd_loss_d_scores=\n%s', d_loss_d_scores.eval())
self.assertAllEqual(d_loss_d_scores.eval(),
[[[0, 0, 0, 3],
[3, 0, 0, 0],
[0, 3, 0, 0],
[0, 0, 0, 3]],
[[7, 0, 0, 0],
[0, 0, 7, 0],
[7, 0, 0, 0],
[0, 0, 0, 0]]]) # pyformat: disable
def testMaximumSpanningTreeGradientError(self):
"""Numerically validates the max score gradient."""
with self.test_session():
# The maximum-spanning-tree-score function, as a max of linear functions,
# is piecewise-linear (i.e., faceted). The numerical gradient estimate
# may be inaccurate if the epsilon ball used for the estimate crosses an
# edge from one facet to another. To avoid spurious errors, we manually
# set the sample point so the epsilon ball fits in a facet. Or in other
# words, we set the scores so there is a non-trivial margin between the
# best and second-best trees.
scores_raw = [[[0, 0, 0, 0],
[1, 0, 0, 0],
[1, 2, 0, 0],
[1, 2, 3, 4]],
[[4, 3, 2, 9],
[0, 0, 2, 9],
[0, 0, 0, 9],
[9, 9, 9, 9]]] # pyformat: disable
# Use 64-bit floats to reduce numerical error.
scores = tf.constant(scores_raw, tf.float64)
init_scores = np.array(scores_raw)
num_nodes = tf.constant([4, 3], tf.int32)
max_scores = mst_ops.maximum_spanning_tree(
num_nodes, scores, forest=False)[0]
gradient_error = tf.test.compute_gradient_error(
scores, [2, 4, 4], max_scores, [2], init_scores)
tf.logging.info('gradient_error=%s', gradient_error)
self.assertLessEqual(gradient_error, 1e-7)
def testLogPartitionFunctionOneTree(self):
"""Tests the log partition function with one feasible tree with score 1."""
with self.test_session():
for forest in [False, True]:
# Each score matrix supports exactly one tree with score=1*1*1, and
# the rest with score=0. Thus the log partition function will be 1.0
# in each case.
pad = 12345.6
scores = tf.constant([[[ 1, pad, pad],
[pad, pad, pad],
[pad, pad, pad]],
[[ 1, 0, pad],
[ 1, 0, pad],
[pad, pad, pad]],
[[ 1, 0, 0],
[ 1, 0, 0],
[ 0, 1, 0]]],
tf.float64) # pyformat: disable
scores = tf.log(scores)
num_nodes = tf.constant([1, 2, 3], tf.int32)
log_partition_functions = mst_ops.log_partition_function(
num_nodes, scores, forest=forest)
self.assertAlmostEqual(tf.exp(log_partition_functions[0]).eval(), 1.0)
self.assertAlmostEqual(tf.exp(log_partition_functions[1]).eval(), 1.0)
self.assertAlmostEqual(tf.exp(log_partition_functions[2]).eval(), 1.0)
def testLogPartitionFunctionOneTreeScaled(self):
"""Tests the log partition function with one feasible tree."""
with self.test_session():
for forest in [False, True]:
# Each score matrix supports exactly one tree with varying score, and
# the rest with score=0. Thus the log partition function will equal
# the score of that single tree in each case.
pad = 12345.6
scores = tf.constant([[[ 2, pad, pad],
[pad, pad, pad],
[pad, pad, pad]],
[[ 3, 0, pad],
[ 5, 0, pad],
[pad, pad, pad]],
[[ 7, 0, 0],
[ 11, 0, 0],
[ 0, 13, 0]]],
tf.float64) # pyformat: disable
scores = tf.log(scores)
num_nodes = tf.constant([1, 2, 3], tf.int32)
log_partition_functions = mst_ops.log_partition_function(
num_nodes, scores, forest=forest)
self.assertAlmostEqual(tf.exp(log_partition_functions[0]).eval(), 2.0)
self.assertAlmostEqual(
tf.exp(log_partition_functions[1]).eval(), 3.0 * 5.0)
self.assertAlmostEqual(
tf.exp(log_partition_functions[2]).eval(), 7.0 * 11.0 * 13.0)
def testLogPartitionFunctionTwoTreesScaled(self):
"""Tests the log partition function with two feasible trees."""
with self.test_session():
for forest in [False, True]:
# Each score matrix supports exactly two trees with varying score, and
# the rest with score=0. Thus the log partition function will equal
# the sum of scores of those two trees in each case.
pad = 12345.6
scores = tf.constant([[[ 2, 0, 0, pad],
[ 3, 0, 0, pad],
[ 5, 7, 0, pad],
[pad, pad, pad, pad]],
[[ 0, 11, 0, 13],
[ 0, 17, 0, 0],
[ 0, 19, 0, 0],
[ 0, 23, 0, 0]]],
tf.float64) # pyformat: disable
scores = tf.log(scores)
num_nodes = tf.constant([3, 4], tf.int32)
log_partition_functions = mst_ops.log_partition_function(
num_nodes, scores, forest=forest)
self.assertAlmostEqual(
tf.exp(log_partition_functions[0]).eval(),
2.0 * 3.0 * 5.0 + 2.0 * 3.0 * 7.0)
self.assertAlmostEqual(
tf.exp(log_partition_functions[1]).eval(),
11.0 * 17.0 * 19.0 * 23.0 + 13.0 * 17.0 * 19.0 * 23.0)
def testLogPartitionFunctionInfeasible(self):
"""Tests the log partition function on infeasible scores."""
with self.test_session():
for forest in [False, True]:
# The scores form cycles of various sizes. Note that one can compute
# the partition function for infeasible scores---it's the gradient that
# may be impacted by numerical error.
pad = 12345.6
scores = tf.constant([[[ 0, 1, pad, pad],
[ 1, 0, pad, pad],
[pad, pad, pad, pad],
[pad, pad, pad, pad]],
[[ 0, 1, 0, pad],
[ 0, 0, 1, pad],
[ 1, 0, 0, pad],
[pad, pad, pad, pad]],
[[ 0, 1, 0, 0],
[ 0, 0, 1, 0],
[ 0, 0, 0, 1],
[ 1, 0, 0, 0]]],
tf.float64) # pyformat: disable
scores = tf.log(scores)
num_nodes = tf.constant([2, 3, 4], tf.int32)
log_partition_functions = mst_ops.log_partition_function(
num_nodes, scores, forest=forest)
self.assertAlmostEqual(tf.exp(log_partition_functions[0]).eval(), 0.0)
self.assertAlmostEqual(tf.exp(log_partition_functions[1]).eval(), 0.0)
self.assertAlmostEqual(tf.exp(log_partition_functions[2]).eval(), 0.0)
def testLogPartitionFunctionAllTrees(self):
"""Tests the log partition function with all trees feasible."""
with self.test_session():
for forest in [False, True]:
# The scores allow all trees. Using Cayley's formula, the
# number of directed spanning trees and forests in a complete
# digraph of n nodes is n^{n-1} and (n+1)^{n-1}, respectively.
# https://en.wikipedia.org/wiki/Cayley%27s_formula
scores = tf.zeros([10, 10, 10], tf.float64) # = 1 in log domain
num_nodes = tf.range(1, 11, dtype=tf.int32)
log_partition_functions = mst_ops.log_partition_function(
num_nodes, scores, forest=forest)
base_offset = 1 if forest else 0 # n+1 for forest, n for tree
for size in range(1, 11):
self.assertAlmostEqual(log_partition_functions[size - 1].eval(),
(size - 1) * math.log(size + base_offset))
def testLogPartitionFunctionWithVeryHighValues(self):
"""Tests the overflow protection in the log partition function."""
with self.test_session():
for forest in [False, True]:
# Set the scores to very high values to test overflow protection.
scores = 1000 * tf.ones([10, 10, 10], tf.float64)
num_nodes = tf.range(1, 11, dtype=tf.int32)
log_partition_functions = mst_ops.log_partition_function(
num_nodes, scores, forest=forest)
base_offset = 1 if forest else 0 # n+1 for forest, n for tree
for size in range(1, 11):
self.assertAlmostEqual(
log_partition_functions[size - 1].eval(),
(size - 1) * math.log(size + base_offset) + size * 1000)
def testLogPartitionFunctionWithVeryLowValues(self):
"""Tests the underflow protection in the log partition function."""
with self.test_session():
for forest in [False, True]:
# Set the scores to very low values to test underflow protection.
scores = -1000 * tf.ones([10, 10, 10], tf.float64)
num_nodes = tf.range(1, 11, dtype=tf.int32)
log_partition_functions = mst_ops.log_partition_function(
num_nodes, scores, forest=forest)
base_offset = 1 if forest else 0 # n+1 for forest, n for tree
for size in range(1, 11):
self.assertAlmostEqual(
log_partition_functions[size - 1].eval(),
(size - 1) * math.log(size + base_offset) - size * 1000)
def testLogPartitionFunctionGradientError(self):
"""Validates the log partition function gradient."""
with self.test_session():
for forest in [False, True]:
# To avoid numerical issues, provide score matrices that are weighted
# towards feasible trees or forests.
scores_raw = [[[0, 0, 0, 0],
[1, 0, 0, 0],
[1, 2, 0, 0],
[1, 2, 3, 4]],
[[4, 3, 2, 9],
[0, 0, 2, 9],
[0, 0, 0, 9],
[9, 9, 9, 9]]] # pyformat: disable
scores = tf.constant(scores_raw, tf.float64)
init_scores = np.array(scores_raw)
num_nodes = tf.constant([4, 3], tf.int32)
log_partition_functions = mst_ops.log_partition_function(
num_nodes, scores, forest=forest)
gradient_error = tf.test.compute_gradient_error(
scores, [2, 4, 4], log_partition_functions, [2], init_scores)
tf.logging.info('forest=%s gradient_error=%s', forest, gradient_error)
self.assertLessEqual(gradient_error, 1e-7)
def testLogPartitionFunctionGradientErrorFailsIfInfeasible(self):
"""Tests that the partition function gradient fails on infeasible scores."""
with self.test_session():
for forest in [False, True]:
# The scores form cycles of various sizes.
pad = 12345.6
scores_raw = [[[ 0, 1, pad, pad],
[ 1, 0, pad, pad],
[pad, pad, pad, pad],
[pad, pad, pad, pad]],
[[ 0, 1, 0, pad],
[ 0, 0, 1, pad],
[ 1, 0, 0, pad],
[pad, pad, pad, pad]],
[[ 0, 1, 0, 0],
[ 0, 0, 1, 0],
[ 0, 0, 0, 1],
[ 1, 0, 0, 0]]] # pyformat: disable
scores = tf.log(scores_raw)
init_scores = np.log(np.array(scores_raw))
num_nodes = tf.constant([2, 3, 4], tf.int32)
log_partition_functions = mst_ops.log_partition_function(
num_nodes, scores, forest=forest)
with self.assertRaises(Exception):
tf.test.compute_gradient_error(
scores, [3, 4, 4], log_partition_functions, [3], init_scores)
def testLogPartitionFunctionGradientErrorOkIfInfeasibleWithClipping(self):
"""Tests that the log partition function gradient is OK after clipping."""
with self.test_session():
for forest in [False, True]:
# The scores form cycles of various sizes.
pad = 12345.6
scores_raw = [[[ 0, 1, pad, pad],
[ 1, 0, pad, pad],
[pad, pad, pad, pad],
[pad, pad, pad, pad]],
[[ 0, 1, 0, pad],
[ 0, 0, 1, pad],
[ 1, 0, 0, pad],
[pad, pad, pad, pad]],
[[ 0, 1, 0, 0],
[ 0, 0, 1, 0],
[ 0, 0, 0, 1],
[ 1, 0, 0, 0]]] # pyformat: disable
scores = tf.log(scores_raw)
init_scores = np.log(np.array(scores_raw))
num_nodes = tf.constant([2, 3, 4], tf.int32)
log_partition_functions = mst_ops.log_partition_function(
num_nodes, scores, forest=forest, max_dynamic_range=10)
gradient_error = tf.test.compute_gradient_error(
scores, [3, 4, 4], log_partition_functions, [3], init_scores)
tf.logging.info('forest=%s gradient_error=%s', forest, gradient_error)
# Clipping makes the gradient computable, but substantial numerical error
# remains, hence the loose tolerance.
self.assertLessEqual(gradient_error, 1e-3)
if __name__ == '__main__':
tf.test.main()
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""DRAGNN wrappers for the MST solver."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from dragnn.python import mst_ops
from dragnn.python import network_units
from syntaxnet.util import check
class MstSolverNetwork(network_units.NetworkUnitInterface):
"""Network unit that performs MST prediction with structured loss.
Parameters:
forest: If true, solve for a spanning forest instead of a spanning tree.
loss: The loss function for training. Select from
softmax: Default unstructured softmax (prediction is still structured).
m3n: Max-Margin Markov Networks loss.
crf_max_dynamic_range: Max dynamic range for the log partition function.
Links:
lengths: [B, 1] sequence lengths per batch item.
scores: [B * N, N] matrix of padded batched arc scores.
Layers:
lengths: [B] sequence lengths per batch item.
scores: [B, N, N] tensor of padded batched arc scores.
logits: [B * N, N] matrix of padded batched arc scores.
arcs: [B * N, N] matrix of padded batched 0/1 indicators for MST arcs.
"""
def __init__(self, component):
"""Initializes layers.
Args:
component: Parent ComponentBuilderBase object.
"""
layers = [
network_units.Layer(self, 'lengths', -1),
network_units.Layer(self, 'scores', -1),
network_units.Layer(self, 'logits', -1),
network_units.Layer(self, 'arcs', -1),
]
super(MstSolverNetwork, self).__init__(component, init_layers=layers)
self._attrs = network_units.get_attrs_with_defaults(
component.spec.network_unit.parameters,
defaults={
'forest': False,
'loss': 'softmax',
'crf_max_dynamic_range': 20,
})
check.Eq(
len(self._fixed_feature_dims.items()), 0, 'Expected no fixed features')
check.Eq(
len(self._linked_feature_dims.items()), 2,
'Expected two linked features')
check.In('lengths', self._linked_feature_dims,
'Missing required linked feature')
check.In('scores', self._linked_feature_dims,
'Missing required linked feature')
def create(self,
fixed_embeddings,
linked_embeddings,
context_tensor_arrays,
attention_tensor,
during_training,
stride=None):
"""Forwards the lengths and scores."""
check.NotNone(stride, 'MstSolverNetwork requires stride')
lengths = network_units.lookup_named_tensor('lengths', linked_embeddings)
lengths_b = tf.to_int32(tf.squeeze(lengths.tensor, [1]))
scores = network_units.lookup_named_tensor('scores', linked_embeddings)
scores_bnxn = scores.tensor
max_length = tf.shape(scores_bnxn)[1]
scores_bxnxn = tf.reshape(scores_bnxn, [stride, max_length, max_length])
_, argmax_sources_bxn = mst_ops.maximum_spanning_tree(
forest=self._attrs['forest'], num_nodes=lengths_b, scores=scores_bxnxn)
argmax_sources_bn = tf.reshape(argmax_sources_bxn, [-1])
arcs_bnxn = tf.one_hot(argmax_sources_bn, max_length, dtype=tf.float32)
return [lengths_b, scores_bxnxn, scores_bnxn, arcs_bnxn]
def get_logits(self, network_tensors):
return network_tensors[self.get_layer_index('logits')]
def get_bulk_predictions(self, stride, network_tensors):
return network_tensors[self.get_layer_index('arcs')]
def compute_bulk_loss(self, stride, network_tensors, gold):
"""See base class."""
if self._attrs['loss'] == 'softmax':
return (None, None, None) # fall back to default bulk softmax
lengths_b, scores_bxnxn, _, arcs_bnxn = network_tensors
max_length = tf.shape(scores_bxnxn)[2]
arcs_bxnxn = tf.reshape(arcs_bnxn, [stride, max_length, max_length])
gold_bxn = tf.reshape(gold, [stride, max_length])
gold_bxnxn = tf.one_hot(gold_bxn, max_length, dtype=tf.float32)
loss = self._compute_loss(lengths_b, scores_bxnxn, gold_bxnxn)
correct = tf.reduce_sum(tf.to_int32(arcs_bxnxn * gold_bxnxn))
total = tf.reduce_sum(lengths_b)
return loss, correct, total
def _compute_loss(self, lengths, scores, gold):
"""Computes the configured structured loss for a batch.
Args:
lengths: [B] sequence lengths per batch item.
scores: [B, N, N] tensor of padded batched arc scores.
gold: [B, N, N] tensor of 0/1 indicators for gold arcs.
Returns:
Scalar sum of losses across the batch.
"""
# Dispatch to one of the _compute_*_loss() methods.
method_name = '_compute_%s_loss' % self._attrs['loss']
loss_b = getattr(self, method_name)(lengths, scores, gold)
return tf.reduce_sum(loss_b)
def _compute_m3n_loss(self, lengths, scores, gold):
"""Computes the M3N-style structured hinge loss for a batch."""
# Perform Hamming-loss-augmented inference.
gold_scores_b = tf.reduce_sum(scores * gold, axis=[1, 2])
hamming_loss_bxnxn = 1 - gold
scores_bxnxn = scores + hamming_loss_bxnxn
max_scores_b, _ = mst_ops.maximum_spanning_tree(
num_nodes=lengths, scores=scores_bxnxn, forest=self._attrs['forest'])
return max_scores_b - gold_scores_b
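# Worked equation (illustrative): because |gold| is a 0/1 arc indicator and
# each node selects exactly one source, adding (1 - gold) to the scores adds
# +1 for every non-gold selection, so the value computed above is
#   loss_b = max_T [score(T) + Hamming(T, gold)] - score(gold)
# which is zero only when the gold tree beats every alternative by a margin of
# at least its Hamming distance.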
def _compute_crf_loss(self, lengths, scores, gold):
"""Computes the negative CRF log-probability for a batch."""
# The |scores| are assumed to be in the log domain.
log_gold_scores_b = tf.reduce_sum(scores * gold, axis=[1, 2])
log_partition_functions_b = mst_ops.log_partition_function(
num_nodes=lengths,
scores=scores,
forest=self._attrs['forest'],
max_dynamic_range=self._attrs['crf_max_dynamic_range'])
return log_partition_functions_b - log_gold_scores_b # negative log-prob
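# Worked equation (illustrative): the value above is the negative CRF
# log-likelihood of the gold tree,
#   loss_b = log Z_b - log score(gold_b) = -log P(gold_b | scores_b)
# where Z_b is computed by mst_ops.log_partition_function().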
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for DRAGNN wrappers for the MST solver."""
import math
import tensorflow as tf
from google.protobuf import text_format
from dragnn.protos import spec_pb2
from dragnn.python import mst_units
from dragnn.python import network_units
_MASTER_SPEC = r"""
component {
name: 'test'
linked_feature {
name: 'lengths'
size: 1
embedding_dim: -1
fml: 'input.focus'
source_translator: 'identity'
source_component: 'previous'
source_layer: 'lengths'
}
linked_feature {
name: 'scores'
size: 1
embedding_dim: -1
fml: 'input.focus'
source_translator: 'identity'
source_component: 'previous'
source_layer: 'scores'
}
}
"""
class MockNetwork(object):
def get_layer_size(self, unused_name):
return -1
class MockComponent(object):
def __init__(self, master, component_spec):
self.master = master
self.spec = component_spec
self.name = component_spec.name
self.beam_size = 1
self.num_actions = -1
self.network = MockNetwork()
class MockMaster(object):
def __init__(self, build_runtime_graph=False):
self.spec = spec_pb2.MasterSpec()
text_format.Parse(_MASTER_SPEC, self.spec)
self.hyperparams = spec_pb2.GridPoint()
self.lookup_component = {
'previous': MockComponent(self, spec_pb2.ComponentSpec())
}
self.build_runtime_graph = build_runtime_graph
class MstSolverNetworkTest(tf.test.TestCase):
def setUp(self):
# Clear the graph and all existing variables. Otherwise, variables created
# in different tests may collide with each other.
tf.reset_default_graph()
def testCreate(self):
with self.test_session():
master = MockMaster()
component = MockComponent(master, master.spec.component[0])
component.network = mst_units.MstSolverNetwork(component)
stride = 1
lengths = tf.constant([[3]], dtype=tf.int64)
scores = tf.constant([[1.0, 0.5, 0.5],
[2.0, 0.5, 0.5],
[0.5, 3.0, 0.5]],
dtype=tf.float32) # pyformat: disable
linked_embeddings = [
network_units.NamedTensor(lengths, 'lengths'),
network_units.NamedTensor(scores, 'scores')
]
network_tensors = component.network.create([], linked_embeddings, [],
None, False, stride)
self.assertAllEqual(network_tensors[0].eval(), [3])
self.assertAllEqual(network_tensors[1].eval(),
[[[1.0, 0.5, 0.5],
[2.0, 0.5, 0.5],
[0.5, 3.0, 0.5]]]) # pyformat: disable
self.assertAllEqual(network_tensors[2].eval(),
[[1.0, 0.5, 0.5],
[2.0, 0.5, 0.5],
[0.5, 3.0, 0.5]]) # pyformat: disable
self.assertAllEqual(network_tensors[3].eval(),
[[1.0, 0.0, 0.0],
[1.0, 0.0, 0.0],
[0.0, 1.0, 0.0]]) # pyformat: disable
def testGetBulkPredictions(self):
with self.test_session():
master = MockMaster()
component = MockComponent(master, master.spec.component[0])
component.network = mst_units.MstSolverNetwork(component)
stride = 2
lengths = tf.constant([[2], [3]], dtype=tf.int64)
pad = -12345.6
scores = tf.constant([[1.0, 2.0, pad],
[1.8, 2.0, pad],
[pad, pad, pad],
[3.8, 4.0, 3.9],
[3.9, 3.8, 4.0],
[3.8, 0.9, 4.0]],
dtype=tf.float32) # pyformat: disable
linked_embeddings = [
network_units.NamedTensor(lengths, 'lengths'),
network_units.NamedTensor(scores, 'scores')
]
network_tensors = component.network.create([], linked_embeddings, [],
None, False, stride)
predictions = component.network.get_bulk_predictions(
stride, network_tensors)
self.assertAllEqual(predictions.eval(),
[[0.0, 1.0, 0.0],
[0.0, 1.0, 0.0],
[0.0, 0.0, 0.0],
[0.0, 1.0, 0.0],
[0.0, 0.0, 1.0],
[0.0, 0.0, 1.0]]) # pyformat: disable
def testComputeBulkLossM3n(self):
with self.test_session():
master = MockMaster()
component = MockComponent(master, master.spec.component[0])
component.spec.network_unit.parameters['loss'] = 'm3n'
component.network = mst_units.MstSolverNetwork(component)
stride = 2
lengths = tf.constant([[2], [3]], dtype=tf.int64)
# Note that these scores are large enough to overcome the +1 hamming loss
# terms in the M3N loss. Therefore, the score matrix determines the tree
# that is used to compute the M3N loss.
pad = -12345.6
scores = tf.constant([[0.5, 2.0, pad],
[0.5, 2.0, pad],
[pad, pad, pad],
[2.5, 4.0, 2.5],
[2.5, 2.5, 4.0],
[2.5, 2.5, 4.0]],
dtype=tf.float32) # pyformat: disable
# For the first tree, the gold and scores agree on one arc (that index 1
# is a root), and for the second tree, the gold and scores agree on none
# of the arcs. Therefore, we expect +1 and +3 for the first and second
# trees in the M3N loss.
gold = tf.constant([0, 1, -1, 0, 0, 1], tf.int32)
first_gold_score = 0.5 + 2.0
second_gold_score = 2.5 + 2.5 + 2.5
first_tree_correct = 1
second_tree_correct = 0
first_tree_loss = 2 * 2.0 + 2 - first_tree_correct - first_gold_score
second_tree_loss = 3 * 4.0 + 3 - second_tree_correct - second_gold_score
linked_embeddings = [
network_units.NamedTensor(lengths, 'lengths'),
network_units.NamedTensor(scores, 'scores')
]
network_tensors = component.network.create([], linked_embeddings, [],
None, False, stride)
cost, correct, total = component.network.compute_bulk_loss(
stride, network_tensors, gold)
self.assertEqual(cost.eval(), first_tree_loss + second_tree_loss)
self.assertEqual(correct.eval(), first_tree_correct + second_tree_correct)
self.assertEqual(total.eval(), 2 + 3)
def testComputeBulkLossCrf(self):
with self.test_session():
master = MockMaster()
component = MockComponent(master, master.spec.component[0])
component.spec.network_unit.parameters['loss'] = 'crf'
component.network = mst_units.MstSolverNetwork(component)
stride = 2
lengths = tf.constant([[2], [3]], dtype=tf.int64)
# These scores have 2.0 (in the log domain) on the gold arcs and 1.0
# elsewhere.
pad = -12345.6
one = math.log(1.0)
two = math.log(2.0)
scores = tf.constant([[one, two, pad],
[one, two, pad],
[pad, pad, pad],
[one, two, one],
[one, one, two],
[one, one, two]],
dtype=tf.float32) # pyformat: disable
gold = tf.constant([1, 1, -1, 1, 2, 2], tf.int32)
first_partition_function = (
2.0 * 2.0 + # 0 -> 1 (gold)
1.0 * 1.0) # 1 -> 0
first_loss = -math.log(2.0 * 2.0 / first_partition_function)
second_partition_function = (
2.0 * 2.0 * 2.0 + # 0 -> 1 -> 2 (gold)
1.0 * 1.0 * 1.0 + # 2 -> 1 -> 0
1.0 * 1.0 * 1.0 + # 0 -> 2 -> 1
2.0 * 1.0 * 1.0 + # 1 -> 2 -> 0
2.0 * 1.0 * 1.0 + # 1 -> 0 -> 2
2.0 * 1.0 * 1.0 + # 2 -> 0 -> 1
2.0 * 2.0 * 1.0 + # {0, 1} -> 2
2.0 * 1.0 * 1.0 + # {0, 2} -> 1
1.0 * 1.0 * 1.0) # {1, 2} -> 0
second_loss = -math.log(2.0 * 2.0 * 2.0 / second_partition_function)
linked_embeddings = [
network_units.NamedTensor(lengths, 'lengths'),
network_units.NamedTensor(scores, 'scores')
]
network_tensors = component.network.create([], linked_embeddings, [],
None, False, stride)
cost, correct, total = component.network.compute_bulk_loss(
stride, network_tensors, gold)
self.assertAlmostEqual(cost.eval(), first_loss + second_loss)
self.assertEqual(correct.eval(), 2 + 3)
self.assertEqual(total.eval(), 2 + 3)
if __name__ == '__main__':
tf.test.main()
@@ -22,7 +22,6 @@ import abc
import numpy as np
from six.moves import xrange
import tensorflow as tf
from tensorflow.python.ops import nn
from tensorflow.python.ops import tensor_array_ops as ta
@@ -76,11 +75,13 @@ class StoredActivations(object):
check.NotNone(dim, 'Dim is required for bulk tensor')
self._bulk_tensor = tensor
with tf.name_scope('convert_to_dyn'):
tensor = tf.reshape(tensor, [stride, -1, dim])
tensor = tf.transpose(tensor, perm=[1, 0, 2])
pad = tf.zeros([1, stride, dim], dtype=tensor.dtype)
self._array_tensor = tf.concat([pad, tensor], 0)
if dim >= 0:
# These operations will fail if |dim| is negative.
with tf.name_scope('convert_to_dyn'):
tensor = tf.reshape(tensor, [stride, -1, dim])
tensor = tf.transpose(tensor, perm=[1, 0, 2])
pad = tf.zeros([1, stride, dim], dtype=tensor.dtype)
self._array_tensor = tf.concat([pad, tensor], 0)
if array is not None:
check.IsNone(tensor, 'Cannot initialize from both tensor and array')
@@ -130,7 +131,8 @@ def add_embeddings(channel_id, feature_spec, seed=None):
check.Gt(feature_spec.embedding_dim, 0,
'Embeddings requested for non-embedded feature: %s' % feature_spec)
name = fixed_embeddings_name(channel_id)
shape = [feature_spec.vocabulary_size + 1, feature_spec.embedding_dim]
row_num = feature_spec.vocabulary_size + 1
shape = [row_num, feature_spec.embedding_dim]
if feature_spec.HasField('pretrained_embedding_matrix'):
if len(feature_spec.pretrained_embedding_matrix.part) > 1:
raise RuntimeError('pretrained_embedding_matrix resource contains '
@@ -143,9 +145,9 @@ def add_embeddings(channel_id, feature_spec, seed=None):
embeddings = syntaxnet_ops.word_embedding_initializer(
vectors=feature_spec.pretrained_embedding_matrix.part[0].file_pattern,
vocabulary=feature_spec.vocab.part[0].file_pattern,
override_num_embeddings=row_num,
num_special_embeddings=1,
embedding_init=1.0,
embedding_init=0.0, # zero out rows with no pretrained values
seed=seed1,
seed2=seed2)
return tf.get_variable(
@@ -183,7 +185,57 @@ def embedding_lookup(embedding_matrix, indices, ids, weights, size):
return embeddings
def fixed_feature_lookup(component, state, channel_id, stride):
def apply_feature_id_dropout(ids, weights, channel):
"""Randomly perturbs a vector of feature IDs.
Args:
ids: Vector of feature IDs.
weights: Vector of feature weights.
channel: FixedFeatureChannel that extracted the |ids|.
Returns:
Copy of |ids| and |weights| where each ID is randomly replaced with
|channel.dropout_id|, according to the probabilities in
|channel.dropout_keep_probability|. The weights of dropped features are
set to zero if |channel.dropout_id| equals |channel.vocabulary_size|.
"""
check.Gt(
len(channel.dropout_keep_probability), 0,
'Channel {} dropout_keep_probability is empty'.format(channel.name))
check.Le(
len(channel.dropout_keep_probability), channel.vocabulary_size,
'Channel {} dropout_keep_probability is too long'.format(channel.name))
# Channel fields, converted from proto to constant tensor.
dropout_id = tf.constant(
channel.dropout_id, name='dropout_id', dtype=tf.int64)
dropout_keep_probabilities = tf.constant(
list(channel.dropout_keep_probability),
name='dropout_keep_probability',
dtype=tf.float32,
shape=[channel.vocabulary_size])
# The keep probabilities for the current batch of feature IDs.
keep_probabilities = tf.gather(dropout_keep_probabilities, ids)
# Draw random values and determine which IDs should be kept.
shape = tf.shape(ids)
noise = tf.random_uniform(shape) # \in [0,1)^d
should_keep = noise < keep_probabilities
# Replace dropped IDs with the specified replacement ID.
dropout_ids = tf.fill(shape, dropout_id)
new_ids = tf.where(should_keep, ids, dropout_ids)
if channel.dropout_id == channel.vocabulary_size:
# Replace weights of dropped IDs with 0.
zeros = tf.zeros(shape, dtype=tf.float32)
new_weights = tf.where(should_keep, weights, zeros)
else:
new_weights = weights
return new_ids, new_weights
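# Illustrative sketch (hypothetical numbers): with
# dropout_keep_probability = [0.5, 1.0], vocabulary_size = 2, and
# dropout_id = 2, an extracted ID of 0 is replaced by 2 with probability 0.5
# (and, since dropout_id == vocabulary_size, its weight is zeroed), while an
# extracted ID of 1 is always kept.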
def fixed_feature_lookup(component, state, channel_id, stride, during_training):
"""Looks up fixed features and passes them through embeddings.
Embedding vectors may be scaled by weights if the features specify it.
@@ -193,6 +245,8 @@ def fixed_feature_lookup(component, state, channel_id, stride):
state: MasterState object for the live ComputeSession.
channel_id: int id of the fixed feature to look up.
stride: int Tensor of current batch * beam size.
during_training: True if this is being called from a training code path.
This controls, e.g., the use of feature ID dropout.
Returns:
NamedTensor object containing the embedding vectors.
@@ -200,13 +254,35 @@ def fixed_feature_lookup(component, state, channel_id, stride):
feature_spec = component.spec.fixed_feature[channel_id]
check.Gt(feature_spec.embedding_dim, 0,
'Embeddings requested for non-embedded feature: %s' % feature_spec)
embedding_matrix = component.get_variable(fixed_embeddings_name(channel_id))
if feature_spec.is_constant:
embedding_matrix = tf.get_variable(fixed_embeddings_name(channel_id))
else:
embedding_matrix = component.get_variable(fixed_embeddings_name(channel_id))
with tf.op_scope([embedding_matrix], 'fixed_embedding_' + feature_spec.name):
indices, ids, weights = dragnn_ops.extract_fixed_features(
state.handle, component=component.name, channel_id=channel_id)
size = stride * feature_spec.size
embeddings = embedding_lookup(embedding_matrix, indices, ids, weights, size)
if during_training and feature_spec.dropout_id >= 0:
ids, weights = apply_feature_id_dropout(ids, weights, feature_spec)
if component.master.build_runtime_graph:
# To simplify integration with NN compilers, assume that each feature in
# the channel extracts exactly one ID and no weights.
# TODO(googleuser): Relax this restriction?
embeddings = []
for index in range(feature_spec.size):
feature_id = component.add_cell_input(
tf.int32, [1], 'fixed_channel_{}_index_{}_ids'.format(
channel_id, index))
embeddings.append(tf.gather(embedding_matrix, feature_id))
embeddings = tf.concat(embeddings, 1)
else:
size = stride * feature_spec.size
embeddings = embedding_lookup(embedding_matrix, indices, ids, weights,
size)
dim = feature_spec.size * feature_spec.embedding_dim
return NamedTensor(
tf.reshape(embeddings, [-1, dim]), feature_spec.name, dim=dim)
@@ -368,12 +444,16 @@ def convert_network_state_tensorarray(tensorarray):
return tf.reshape(tensor, [-1, tf.shape(tensor)[2]])
def pass_through_embedding_matrix(act_block, embedding_matrix, step_idx):
def pass_through_embedding_matrix(component, channel_id, size, act_block,
embedding_matrix, step_idx):
"""Passes the activations through the embedding_matrix.
Takes care to handle out of bounds lookups.
Args:
component: Component that produced the linked features.
channel_id: Channel that produced the linked features.
size: Number of linked embeddings in the channel.
act_block: matrix of activations.
embedding_matrix: matrix of weights.
step_idx: vector containing step indices, with -1 indicating out of bounds.
@@ -383,14 +463,36 @@ def pass_through_embedding_matrix(act_block, embedding_matrix, step_idx):
"""
# Indicator vector for out of bounds lookups.
step_idx_mask = tf.expand_dims(tf.equal(step_idx, -1), -1)
step_idx_mask = tf.to_float(step_idx_mask)
if component.master.build_runtime_graph:
step_idx_mask = component.add_cell_input(
step_idx_mask.dtype, [size, 1],
'linked_channel_{}_out_of_bounds'.format(channel_id))
# Pad the last column of the activation vectors with the indicator.
act_block = tf.concat([act_block, tf.to_float(step_idx_mask)], 1)
act_block = tf.concat([act_block, step_idx_mask], 1)
return tf.matmul(act_block, embedding_matrix)
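# Note (illustrative): concatenating the indicator column means
# |embedding_matrix| must have one more input row than the activation width;
# that extra row is added to the product exactly for out-of-bounds steps, so
# it acts as a learned "missing activation" embedding.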
def lookup_named_tensor_or_none(name, named_tensors):
"""Retrieves a NamedTensor by name, or None if it doesn't exist.
Args:
name: Name of the tensor to retrieve.
named_tensors: List of NamedTensor objects to search.
Returns:
The NamedTensor in |named_tensors| with the |name| or None.
"""
for named_tensor in named_tensors:
if named_tensor.name == name:
return named_tensor
return None
def lookup_named_tensor(name, named_tensors):
"""Retrieves a NamedTensor by name.
"""Retrieves a NamedTensor by name, raising KeyError if it doesn't exist.
Args:
name: Name of the tensor to retrieve.
@@ -402,11 +504,11 @@ def lookup_named_tensor(name, named_tensors):
Raises:
KeyError: If the |name| is not found among the |named_tensors|.
"""
for named_tensor in named_tensors:
if named_tensor.name == name:
return named_tensor
raise KeyError('Name "%s" not found in named tensors: %s' % (name,
named_tensors))
result = lookup_named_tensor_or_none(name, named_tensors)
if result is None:
raise KeyError('Name "%s" not found in named tensors: %s' % (name,
named_tensors))
return result
def activation_lookup_recurrent(component, state, channel_id, source_array,
......@@ -417,9 +519,9 @@ def activation_lookup_recurrent(component, state, channel_id, source_array,
not passed through (i.e. multiplied by) an embedding matrix.
Args:
component: Component object in which to look up the fixed features.
component: Component object in which to look up the linked features.
state: MasterState object for the live ComputeSession.
channel_id: int id of the fixed feature to look up.
channel_id: int id of the linked feature to look up.
source_array: TensorArray from which to fetch feature vectors, expected to
have size [steps + 1] elements of shape [stride, D] each.
source_layer_size: int length of feature vectors before embedding.
@@ -459,11 +561,17 @@
act_block = tf.gather(act_block, flat_idx)
act_block = tf.reshape(act_block, [-1, source_layer_size])
if component.master.build_runtime_graph:
act_block = component.add_cell_input(act_block.dtype, [
feature_spec.size, source_layer_size
], 'linked_channel_{}_activations'.format(channel_id))
if feature_spec.embedding_dim != -1:
embedding_matrix = component.get_variable(
linked_embeddings_name(channel_id))
act_block = pass_through_embedding_matrix(act_block, embedding_matrix,
step_idx)
act_block = pass_through_embedding_matrix(component, channel_id,
feature_spec.size, act_block,
embedding_matrix, step_idx)
dim = feature_spec.size * feature_spec.embedding_dim
else:
# If embedding_dim is -1, just output concatenation of activations.
@@ -481,9 +589,9 @@ def activation_lookup_other(component, state, channel_id, source_tensor,
not passed through (i.e. multiplied by) an embedding matrix.
Args:
component: Component object in which to look up the fixed features.
component: Component object in which to look up the linked features.
state: MasterState object for the live ComputeSession.
channel_id: int id of the fixed feature to look up.
channel_id: int id of the linked feature to look up.
source_tensor: Tensor from which to fetch feature vectors. Expected to
have shape [steps + 1, stride, D].
source_layer_size: int length of feature vectors before embedding (D). It
@@ -508,11 +616,17 @@
act_block = tf.gather_nd(source_tensor, indices)
act_block = tf.reshape(act_block, [-1, source_layer_size])
if component.master.build_runtime_graph:
act_block = component.add_cell_input(act_block.dtype, [
feature_spec.size, source_layer_size
], 'linked_channel_{}_activations'.format(channel_id))
if feature_spec.embedding_dim != -1:
embedding_matrix = component.get_variable(
linked_embeddings_name(channel_id))
act_block = pass_through_embedding_matrix(act_block, embedding_matrix,
step_idx)
act_block = pass_through_embedding_matrix(component, channel_id,
feature_spec.size, act_block,
embedding_matrix, step_idx)
dim = feature_spec.size * feature_spec.embedding_dim
else:
# If embedding_dim is -1, just output concatenation of activations.
@@ -629,7 +743,7 @@ class Layer(object):
Returns:
TensorArray object
"""
check.Gt(self.dim, 0, 'Cannot create array when dimension is dynamic')
check.Ge(self.dim, 0, 'Cannot create array when dimension is dynamic')
tensor_array = ta.TensorArray(
dtype=tf.float32,
size=0,
@@ -671,7 +785,19 @@ def get_attrs_with_defaults(parameters, defaults):
return attrs
def maybe_apply_dropout(inputs, keep_prob, per_sequence, stride=None):
def maybe_make_dropout_mask(shape, keep_prob):
"""Returns a reusable dropout mask, or None if dropout would not occur."""
if keep_prob >= 1.0:
return None
return tf.nn.dropout(tf.ones(shape, dtype=tf.float32), keep_prob)
def maybe_apply_dropout(inputs,
keep_prob,
per_sequence,
stride=None,
dropout_mask=None,
name=None):
"""Applies dropout, if so configured, to an input tensor.
The input may be rank 2 or 3 depending on whether the stride (i.e., batch
@@ -682,20 +808,27 @@ def maybe_apply_dropout(inputs, keep_prob, per_sequence, stride=None):
keep_prob: Scalar probability of keeping each input element. If >= 1.0, no
dropout is performed.
per_sequence: If true, sample the dropout mask once per sequence, instead of
once per step. Requires |stride| when true.
stride: Scalar batch size. Optional if |per_sequence| is false.
once per step. Either |stride| or |dropout_mask| must be set when true.
stride: Scalar batch size. Optional if |per_sequence| is false, or if
|dropout_mask| is provided.
dropout_mask: Precomputed dropout mask to apply to the |inputs|; must be
broadcastable to |inputs|. Optional if |per_sequence| is false, or if
|stride| is provided.
name: Optional name for the dropout operation, if dropout is applied.
Returns:
[stride * num_steps, dim] or [stride, num_steps, dim] tensor, matching the
shape of |inputs|, containing the masked or original inputs, depending on
whether dropout was actually performed.
"""
if keep_prob >= 1.0:
return inputs
if not per_sequence:
return tf.nn.dropout(inputs, keep_prob)
return tf.nn.dropout(inputs, keep_prob, name=name)
if dropout_mask is not None:
return tf.multiply(inputs, dropout_mask, name=name)
# We only check the dims if we are applying per-sequence dropout
check.Ge(inputs.get_shape().ndims, 2, 'inputs must be rank 2 or 3')
@@ -713,7 +846,7 @@ def maybe_apply_dropout(inputs, keep_prob, per_sequence, stride=None):
# Replace |num_steps| with 1 in |noise_shape|, so the dropout mask broadcasts
# to all steps for a particular sequence.
noise_shape = [stride, 1, dim]
masked_sxnxd = tf.nn.dropout(inputs_sxnxd, keep_prob, noise_shape)
masked_sxnxd = tf.nn.dropout(inputs_sxnxd, keep_prob, noise_shape, name=name)
# If needed, flatten out the batch dimension in the return value.
return tf.reshape(masked_sxnxd, [-1, dim]) if flat else masked_sxnxd
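# Illustrative usage sketch (hypothetical; the names are examples, following
# this file's conventions): a network can sample one mask per sequence in
# pre_create() and reuse it at every step of the transition loop:
#   def pre_create(self, stride):
#     self._dropout_mask = maybe_make_dropout_mask(
#         [stride, 1, self._hidden_dim], self._dropout_keep_prob)
#   ...
#   hidden = maybe_apply_dropout(hidden_bxnxd, self._dropout_keep_prob,
#                                per_sequence=True,
#                                dropout_mask=self._dropout_mask)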
@@ -749,6 +882,7 @@ class NetworkUnitInterface(object):
"""
self._component = component
self._params = []
self._derived_params = []
self._layers = init_layers if init_layers else []
self._regularized_weights = []
self._context_layers = init_context_layers if init_context_layers else []
@@ -764,7 +898,10 @@ class NetworkUnitInterface(object):
check.Gt(spec.size, 0, 'Invalid fixed feature size')
if spec.embedding_dim > 0:
fixed_dim = spec.embedding_dim
self._params.append(add_embeddings(channel_id, spec))
if spec.is_constant:
add_embeddings(channel_id, spec)
else:
self._params.append(add_embeddings(channel_id, spec))
else:
fixed_dim = 1 # assume feature ID extraction; only one ID per step
self._fixed_feature_dims[spec.name] = spec.size * fixed_dim
@@ -802,8 +939,8 @@ class NetworkUnitInterface(object):
self._concatenated_input_dim = -1
else:
self._concatenated_input_dim = sum(input_dims)
tf.logging.info('component %s concat_input_dim %s', component.name,
self._concatenated_input_dim)
tf.logging.debug('component %s concat_input_dim %s', component.name,
self._concatenated_input_dim)
# Allocate attention parameters.
if self._component.spec.attention_component:
@@ -845,6 +982,19 @@ class NetworkUnitInterface(object):
[attention_hidden_layer_size, component.num_actions],
initializer=tf.random_normal_initializer(stddev=1e-4)))
def pre_create(self, stride):
"""Prepares this network for inputs of the given stride.
This will be called before entering the main transition loop and calling
create(). Networks can use this to pre-compute values that are reused in
the main transition loop. Note that this may be called multiple times;
e.g., once for the training graph, and again for the inference graph.
Args:
stride: Scalar batch_size * beam_size.
"""
pass
@abc.abstractmethod
def create(self,
fixed_embeddings,
@@ -878,6 +1028,18 @@ class NetworkUnitInterface(object):
def params(self):
return self._params
@property
def derived_params(self):
"""Gets the list of derived parameters.
Derived parameters are similar to `params`, but reformatted slightly
(because doing so is easier in Python).
Returns:
List of zero-argument getters, each of which return a tensor when called.
"""
return self._derived_params
@property
def regularized_weights(self):
return self._regularized_weights
@@ -919,6 +1081,38 @@ class NetworkUnitInterface(object):
"""
raise NotImplementedError()
def get_bulk_predictions(self, stride, network_tensors):
"""Returns custom bulk predictions, if supported.
The returned predictions will be used to advance the batch of states, like
logits. For example, a network may perform structured prediction, and then
return 0/1 indicators of the jointly-predicted annotations. The difference
between this and get_logits() is that this is only used at inference time.
Args:
stride: Scalar stride for segmenting bulk tensors.
network_tensors: List of tensors as returned by create().
Returns:
[stride * steps, dim] matrix of predictions, or None if not supported.
"""
del stride, network_tensors
return None
def compute_bulk_loss(self, stride, network_tensors, gold):
"""Returns a custom bulk training loss, if supported.
Args:
stride: Scalar stride for segmenting bulk tensors.
network_tensors: List of tensors as returned by create().
gold: [stride * steps] vector of gold actions.
Returns:
Tuple of (loss, correct, total), or (None, None, None) if not supported.
"""
del stride, network_tensors, gold
return (None, None, None)
def get_l2_regularized_weights(self):
"""Gets the weights that need to be regularized."""
return self.regularized_weights
@@ -1026,6 +1220,12 @@ class FeedForwardNetwork(NetworkUnitInterface):
(https://arxiv.org/abs/1512.05287).
dropout_all_layers (False): If true, apply dropout to the input of all
hidden layers, instead of just applying it to the network input.
initialize_bias_zero (False): If true, initialize bias vectors to 0.
Otherwise, they are initialized to a small constant value.
initialize_softmax_zero (False): If true, initialize softmax weights to 0.
Otherwise, they are initialized to small random values.
initialize_hidden_orthogonal (False): If true, initialize hidden weights
orthogonally. Otherwise, they are initialized to small random values.
Hyperparameters used:
dropout_rate: The probability that an input is not dropped. Only used
@@ -1041,9 +1241,25 @@
'nonlinearity': 'relu',
'dropout_keep_prob': -1.0,
'dropout_per_sequence': False,
'dropout_all_layers': False
'dropout_all_layers': False,
'initialize_bias_zero': False,
'initialize_softmax_zero': False,
'initialize_hidden_orthogonal': False,
})
def _make_bias_initializer():
return (tf.zeros_initializer() if self._attrs['initialize_bias_zero'] else
tf.constant_initializer(0.2, dtype=tf.float32))
def _make_softmax_initializer():
return (tf.zeros_initializer() if self._attrs['initialize_softmax_zero']
else tf.random_normal_initializer(stddev=1e-4))
def _make_hidden_initializer():
return (tf.orthogonal_initializer()
if self._attrs['initialize_hidden_orthogonal'] else
tf.random_normal_initializer(stddev=1e-4))
# Initialize the hidden layer sizes before running the base initializer, as
# the base initializer may need to know the size of the hidden layer for
# recurrent connections.
@@ -1084,13 +1300,13 @@
for index, hidden_layer_size in enumerate(self._hidden_layer_sizes):
weights = tf.get_variable(
'weights_%d' % index, [last_layer_dim, hidden_layer_size],
initializer=tf.random_normal_initializer(stddev=1e-4))
initializer=_make_hidden_initializer())
self._params.append(weights)
if index > 0 or self._layer_norm_hidden is None:
self._params.append(
tf.get_variable(
'bias_%d' % index, [hidden_layer_size],
initializer=tf.constant_initializer(0.2, dtype=tf.float32)))
initializer=_make_bias_initializer()))
self._weights.append(weights)
self._layers.append(
@@ -1108,7 +1324,7 @@
self._params.append(
tf.get_variable(
'weights_softmax', [last_layer_dim, component.num_actions],
initializer=tf.random_normal_initializer(stddev=1e-4)))
initializer=_make_softmax_initializer()))
self._params.append(
tf.get_variable(
'bias_softmax', [component.num_actions],
@@ -1199,67 +1415,133 @@
class LSTMNetwork(NetworkUnitInterface):
"""Implementation of action LSTM style network."""
"""Implementation of action LSTM style network.
Note that this is not a vanilla LSTM: it adds peephole connections and couples
the input and forget gates.
This implementation treats linked features called lstm_h and lstm_c specially.
Instead of treating them as normal linked features, it uses them as the
previous LSTM states. This allows having a single LSTM component actually
consist of several LSTMs, or to have a tree-shaped LSTM.
"""
def __init__(self, component):
"""Initializes LSTM parameters.
Args:
component: parent ComponentBuilderBase object.
Parameters used to construct the network:
hidden_layer_sizes: In spite of its name, a single int indicating the
number of hidden units in each hidden layer.
factored_hidden_dim: If positive, the weight matrix is factored into a
product of two matrices with this inner dimension.
omit_logits (False): Whether to elide the logits layer.
initialize_bias_zero (False): If true, initialize bias vectors to 0.
Otherwise, they are initialized to small random values.
initialize_softmax_zero (False): If true, initialize softmax weights to 0.
Otherwise, they are initialized to small random values.
initialize_hidden_orthogonal (False): If true, initialize hidden weights
orthogonally. Otherwise, they are initialized to small random values.
input_dropout_rate (-1.0): Keep probability for inputs. If negative, fall
back to the |dropout_rate| hyperparameter.
recurrent_dropout_rate (-1.0): Keep probability for recurrences. If
negative, fall back to the |recurrent_dropout_rate| hyperparameter.
dropout_per_sequence (False): If true, sample the dropout mask once per
sequence, instead of once per step. See Gal and Ghahramani
(https://arxiv.org/abs/1512.05287).
"""
assert component.num_actions > 0, 'Component num actions must be positive.'
self._attrs = get_attrs_with_defaults(
component.spec.network_unit.parameters,
defaults={
'hidden_layer_sizes': -1, # NB: a single dim, not a list
'factored_hidden_dim': -1,
'omit_logits': False,
'initialize_bias_zero': False,
'initialize_softmax_zero': False,
'initialize_hidden_orthogonal': False,
'input_dropout_rate': -1.0,
'recurrent_dropout_rate': -1.0,
'dropout_per_sequence': False,
})
def _make_bias_initializer():
return (tf.zeros_initializer() if self._attrs['initialize_bias_zero'] else
tf.random_normal_initializer(stddev=1e-4))
def _make_softmax_initializer():
return (tf.zeros_initializer() if self._attrs['initialize_softmax_zero']
else tf.random_normal_initializer(stddev=1e-4))
self._hidden_layer_sizes = self._attrs['hidden_layer_sizes']
self._factored_hidden_dim = self._attrs['factored_hidden_dim']
self._compute_logits = not self._attrs['omit_logits']
self._dropout_per_sequence = self._attrs['dropout_per_sequence']
self._input_dropout_rate = self._attrs['input_dropout_rate']
if self._input_dropout_rate < 0.0:
self._input_dropout_rate = component.master.hyperparams.dropout_rate
self._recurrent_dropout_rate = self._attrs['recurrent_dropout_rate']
if self._recurrent_dropout_rate < 0.0:
self._recurrent_dropout_rate = (
component.master.hyperparams.recurrent_dropout_rate)
if self._recurrent_dropout_rate < 0.0:
self._recurrent_dropout_rate = component.master.hyperparams.dropout_rate
tf.logging.info('[%s] dropout: input=%s recurrent=%s per_sequence=%s',
component.name, self._input_dropout_rate,
self._recurrent_dropout_rate, self._dropout_per_sequence)
super(LSTMNetwork, self).__init__(component)
self._layer_input_dim = self._concatenated_input_dim
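    # lstm_h and lstm_c are consumed as recurrent state rather than as part of
    # the concatenated step input, so exclude their dimensions from the input
    # dimension computed below.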
if self._layer_input_dim > 1:
for skipped_link in ['lstm_h', 'lstm_c']:
if skipped_link in self._linked_feature_dims:
self._layer_input_dim -= self._linked_feature_dims[skipped_link]
self._input_dropout_mask = None
self._recurrent_dropout_mask = None
self._context_layers = []
self._create_hidden_weights(
'x2i', [self._layer_input_dim, self._hidden_layer_sizes])
self._create_hidden_weights(
'h2i', [self._hidden_layer_sizes, self._hidden_layer_sizes])
self._create_hidden_weights(
'c2i', [self._hidden_layer_sizes, self._hidden_layer_sizes])
self._params.append(
tf.get_variable(
'bi', [self._hidden_layer_sizes],
initializer=_make_bias_initializer()))
self._create_hidden_weights(
'x2o', [self._layer_input_dim, self._hidden_layer_sizes])
self._create_hidden_weights(
'h2o', [self._hidden_layer_sizes, self._hidden_layer_sizes])
self._create_hidden_weights(
'c2o', [self._hidden_layer_sizes, self._hidden_layer_sizes])
self._params.append(
tf.get_variable(
'bo', [self._hidden_layer_sizes],
initializer=_make_bias_initializer()))
self._create_hidden_weights(
'x2c', [self._layer_input_dim, self._hidden_layer_sizes])
self._create_hidden_weights(
'h2c', [self._hidden_layer_sizes, self._hidden_layer_sizes])
self._params.append(
tf.get_variable(
'bc', [self._hidden_layer_sizes],
initializer=_make_bias_initializer()))
# Add runtime hooks for combined matrices.
self._derived_params.append(self._get_x_to_ico)
self._derived_params.append(self._get_h_to_ico)
self._derived_params.append(self._get_ico_bias)
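    # With these concatenated parameters, a runtime can in principle compute
    # all three gate pre-activations with one matmul each for x and h (against
    # the [D, 3H] and [H, 3H] matrices) plus one [3H] bias add, instead of
    # issuing separate products per gate.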
lstm_h_layer = Layer(component, name='lstm_h', dim=self._hidden_layer_sizes)
lstm_c_layer = Layer(component, name='lstm_c', dim=self._hidden_layer_sizes)
......@@ -1272,18 +1554,92 @@ class LSTMNetwork(NetworkUnitInterface):
self._layers.append(
Layer(component, name='layer_0', dim=self._hidden_layer_sizes))
if self._compute_logits:
self.params.append(
tf.get_variable(
'weights_softmax',
[self._hidden_layer_sizes, component.num_actions],
initializer=_make_softmax_initializer()))
self.params.append(
tf.get_variable(
'bias_softmax', [component.num_actions],
initializer=tf.zeros_initializer()))
self._layers.append(
Layer(component, name='logits', dim=component.num_actions))
def _get_variable_name_prefix(self):
"""Returns the prefix for variable names."""
# The bias variables are always present; infer the prefix from one of them.
bi = self._component.get_variable('bi')
tokens = bi.op.name.split('/')
while tokens.pop() != 'bi':
pass # remove the last 'bi' and everything after it
return '/'.join(tokens) + '/'
def _get_x_to_ico(self):
# TODO(googleuser): Export the factored representation, if available.
x2i = self._multiply_hidden_weights(tf.eye(self._layer_input_dim), 'x2i')
x2c = self._multiply_hidden_weights(tf.eye(self._layer_input_dim), 'x2c')
x2o = self._multiply_hidden_weights(tf.eye(self._layer_input_dim), 'x2o')
prefix = self._get_variable_name_prefix()
with tf.name_scope(None):
return tf.concat([x2i, x2c, x2o], axis=1, name=prefix + 'x_to_ico')
def _get_h_to_ico(self):
# TODO(googleuser): Export the factored representation, if available.
h2i = self._multiply_hidden_weights(tf.eye(self._hidden_layer_sizes), 'h2i')
h2c = self._multiply_hidden_weights(tf.eye(self._hidden_layer_sizes), 'h2c')
h2o = self._multiply_hidden_weights(tf.eye(self._hidden_layer_sizes), 'h2o')
prefix = self._get_variable_name_prefix()
with tf.name_scope(None):
return tf.concat([h2i, h2c, h2o], axis=1, name=prefix + 'h_to_ico')
def _get_ico_bias(self):
bi = self._component.get_variable('bi')
bc = self._component.get_variable('bc')
bo = self._component.get_variable('bo')
prefix = self._get_variable_name_prefix()
with tf.name_scope(None):
return tf.concat([bi, bc, bo], axis=0, name=prefix + 'ico_bias')
def _create_hidden_weights(self, name, shape):
"""Creates params for hidden weight matrix of the given shape."""
check.Eq(len(shape), 2, 'Hidden weights %s must be a matrix' % name)
def _initializer():
return (tf.orthogonal_initializer()
if self._attrs['initialize_hidden_orthogonal'] else
tf.random_normal_initializer(stddev=1e-4))
if self._factored_hidden_dim > 0:
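      # Low-rank factorization: a [m, n] weight matrix is parameterized as the
      # product of [m, d] and [d, n] matrices, where d = factored_hidden_dim,
      # reducing the parameter count from m*n to d*(m+n).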
self._params.append(
tf.get_variable(
'%s_in' % name, [shape[0], self._factored_hidden_dim],
initializer=_initializer()))
self._params.append(
tf.get_variable(
'%s_out' % name, [self._factored_hidden_dim, shape[1]],
initializer=_initializer()))
else:
self._params.append(
tf.get_variable(name, shape, initializer=_initializer()))
def _multiply_hidden_weights(self, inputs, name):
"""Multiplies the inputs with the named hidden weight matrix."""
if self._factored_hidden_dim > 0:
inputs = tf.matmul(inputs, self._component.get_variable('%s_in' % name))
return tf.matmul(inputs, self._component.get_variable('%s_out' % name))
else:
return tf.matmul(inputs, self._component.get_variable(name))
def pre_create(self, stride):
"""Refreshes the dropout masks, if applicable."""
if self._dropout_per_sequence:
self._input_dropout_mask = maybe_make_dropout_mask(
[stride, self._layer_input_dim], self._input_dropout_rate)
self._recurrent_dropout_mask = maybe_make_dropout_mask(
[stride, self._hidden_layer_sizes], self._recurrent_dropout_rate)
def create(self,
fixed_embeddings,
......@@ -1293,51 +1649,84 @@ class LSTMNetwork(NetworkUnitInterface):
during_training,
stride=None):
"""See base class."""
# context_tensor_arrays[0] is lstm_h
# context_tensor_arrays[1] is lstm_c
assert len(context_tensor_arrays) == 2
length = context_tensor_arrays[0].size()
# Get the (possibly averaged) biases to execute the network.
bi = self._component.get_variable('bi')
bo = self._component.get_variable('bo')
bc = self._component.get_variable('bc')
if self._compute_logits:
weights_softmax = self._component.get_variable('weights_softmax')
bias_softmax = self._component.get_variable('bias_softmax')
i_h_tm1 = lookup_named_tensor_or_none('lstm_h', linked_embeddings)
h_from_linked = False
if i_h_tm1 is not None:
h_from_linked = True
i_h_tm1 = i_h_tm1.tensor
i_c_tm1 = lookup_named_tensor_or_none('lstm_c', linked_embeddings)
c_from_linked = False
if i_c_tm1 is not None:
c_from_linked = True
i_c_tm1 = i_c_tm1.tensor
    # i_h_tm1, i_c_tm1 = h_{t-1}, c_{t-1}
    if i_h_tm1 is None:
      i_h_tm1 = context_tensor_arrays[0].read(length - 1)
    if i_c_tm1 is None:
      i_c_tm1 = context_tensor_arrays[1].read(length - 1)
# Add hard-coded recurrent inputs to the exported cell.
if self._component.master.build_runtime_graph:
shape = [1, self._hidden_layer_sizes]
if not c_from_linked:
i_c_tm1 = self._component.add_cell_input(i_c_tm1.dtype, shape, 'lstm_c',
'TYPE_RECURRENT')
if not h_from_linked:
i_h_tm1 = self._component.add_cell_input(i_h_tm1.dtype, shape, 'lstm_h',
'TYPE_RECURRENT')
# Remove 'lstm_h' and 'lstm_c' from linked_embeddings, since they are used
# in a special way.
linked_embeddings = [
x for x in linked_embeddings if x.name not in ['lstm_h', 'lstm_c']
]
# label c and h inputs
i_c_tm1 = tf.identity(i_c_tm1, name='lstm_c_in')
i_h_tm1 = tf.identity(i_h_tm1, name='lstm_h_in')
input_tensor = get_input_tensor(fixed_embeddings, linked_embeddings)
# label the feature input (for debugging purposes)
input_tensor = tf.identity(input_tensor, name='input_tensor')
# apply dropout according to http://arxiv.org/pdf/1409.2329v5.pdf
if during_training:
input_tensor = maybe_apply_dropout(
input_tensor,
self._input_dropout_rate,
self._dropout_per_sequence,
dropout_mask=self._input_dropout_mask)
# input -- i_t = sigmoid(affine(x_t, h_{t-1}, c_{t-1}))
# Note peephole connection to previous cell state.
i_ait = (
self._multiply_hidden_weights(input_tensor, 'x2i') +
self._multiply_hidden_weights(i_h_tm1, 'h2i') +
self._multiply_hidden_weights(i_c_tm1, 'c2i') + bi)
i_it = tf.sigmoid(i_ait)
# forget -- f_t = 1 - i_t
# Note coupling with input gate.
i_ft = tf.ones([1, 1]) - i_it
# write memory cell -- tanh(affine(x_t, h_{t-1}))
i_awt = (
self._multiply_hidden_weights(input_tensor, 'x2c') +
self._multiply_hidden_weights(i_h_tm1, 'h2c') + bc)
i_wt = tf.tanh(i_awt)
# c_t = f_t \odot c_{t-1} + i_t \odot tanh(affine(x_t, h_{t-1}))
......@@ -1345,8 +1734,11 @@ class LSTMNetwork(NetworkUnitInterface):
    ct = tf.add(
        tf.multiply(i_it, i_wt), tf.multiply(i_ft, i_c_tm1), name='lstm_c')
# output -- o_t = sigmoid(affine(x_t, h_{t-1}, c_t))
# Note peephole connection to current cell state.
i_aot = (
self._multiply_hidden_weights(input_tensor, 'x2o') +
self._multiply_hidden_weights(ct, 'c2o') +
self._multiply_hidden_weights(i_h_tm1, 'h2o') + bo)
i_ot = tf.sigmoid(i_aot)
......@@ -1354,27 +1746,35 @@ class LSTMNetwork(NetworkUnitInterface):
ph_t = tf.tanh(ct)
ht = tf.multiply(i_ot, ph_t, name='lstm_h')
if during_training:
ht = maybe_apply_dropout(
ht,
self._recurrent_dropout_rate,
self._dropout_per_sequence,
dropout_mask=self._recurrent_dropout_mask,
name='lstm_h_dropout')
h = tf.identity(ht, name='layer_0')
    # tensors will be consistent with the layers:
    # [lstm_h, lstm_c, layer_0, (optional) logits]
    tensors = [ht, ct, h]
    if self._compute_logits:
      logits = tf.nn.xw_plus_b(ht, weights_softmax, bias_softmax)
      if self._component.spec.attention_component:
        logits += self.attention(ht, attention_tensor)
      logits = tf.identity(logits, name='logits')
      tensors.append(logits)
return tensors
def get_layer_size(self, layer_name):
assert layer_name in {
'layer_0', 'lstm_h', 'lstm_c'
}, 'Can only retrieve from first hidden layer, lstm_h or lstm_c.'
return self._hidden_layer_sizes
def get_logits(self, network_tensors):
......@@ -1846,10 +2246,9 @@ class PairwiseConvNetwork(NetworkUnitInterface):
self._widths, self._dropout, self._bias_init, self._initialization
])
if not all(param_lengths[0] == param_len for param_len in param_lengths):
raise RuntimeError('Unmatched widths/dropout/bias_init/initialization: ' +
'%d/%d/%d/%d' % (param_lengths[0], param_lengths[1],
param_lengths[2], param_lengths[3]))
self._depths.extend(map(int, parameters['depths'].split(',')))
if len(self._depths) != len(self._widths) + 1:
......@@ -1866,9 +2265,8 @@ class PairwiseConvNetwork(NetworkUnitInterface):
self._num_labels = self._depths[-1]
if parameters['activation_layers']:
self._activation_layers = set(
map(int, parameters['activation_layers'].split(',')))
else:
self._activation_layers = set(range(self._num_layers - 1))
......@@ -1876,7 +2274,7 @@ class PairwiseConvNetwork(NetworkUnitInterface):
for i, width in enumerate(self._widths):
if self._activation == 'glu' and i in self._activation_layers:
self._kernel_shapes.append(
[width, width, self._depths[i], 2 * self._depths[i + 1]])
else:
self._kernel_shapes.append(
[width, width, self._depths[i], self._depths[i + 1]])
......@@ -1910,7 +2308,8 @@ class PairwiseConvNetwork(NetworkUnitInterface):
del context_tensor_arrays, attention_tensor # Unused.
# TODO(googleuser): Normalize the arguments to create(). 'stride'
# is unused by the recurrent network units, while 'context_tensor_arrays'
    # and 'attention_tensor_array' is unused by bulk network units.
if stride is None:
raise ValueError("PairwiseConvNetwork needs 'stride'")
......@@ -1926,8 +2325,9 @@ class PairwiseConvNetwork(NetworkUnitInterface):
sources_shape = tf.shape(source_tokens)
targets_shape = tf.shape(target_tokens)
num_steps = sources_shape[1]
with tf.control_dependencies([
tf.assert_equal(num_steps, targets_shape[2], name='num_steps_mismatch')
]):
arg1 = tf.tile(source_tokens, tf.stack([1, 1, num_steps, 1]))
arg2 = tf.tile(target_tokens, tf.stack([1, num_steps, 1, 1]))
conv = tf.concat([arg1, arg2], 3)
......@@ -1935,10 +2335,10 @@ class PairwiseConvNetwork(NetworkUnitInterface):
with tf.variable_scope('conv%d' % i, reuse=True) as scope:
if during_training:
conv = maybe_apply_dropout(conv, self._dropout[i], False)
conv = tf.nn.conv2d(
conv,
self._component.get_variable('weights'), [1, 1, 1, 1],
padding='SAME')
conv = tf.nn.bias_add(conv, self._component.get_variable('biases'))
if i in self._activation_layers:
conv = self._activation_fn(conv, name=scope.name)
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for network_units."""
......@@ -26,8 +25,6 @@ from tensorflow.python.platform import googletest
from dragnn.protos import spec_pb2
from dragnn.python import network_units
FLAGS = tf.app.flags.FLAGS
class NetworkUnitsConverterTest(test_util.TensorFlowTestCase):
......@@ -61,6 +58,7 @@ class MockComponent(object):
self.spec = component_spec
self.name = component_spec.name
self.beam_size = 1
self.num_actions = 45
self._attrs = {}
def attr(self, name):
......@@ -72,12 +70,13 @@ class MockComponent(object):
class MockMaster(object):
def __init__(self, build_runtime_graph=False):
self.spec = spec_pb2.MasterSpec()
self.hyperparams = spec_pb2.GridPoint()
self.lookup_component = {
'previous': MockComponent(self, spec_pb2.ComponentSpec())
}
self.build_runtime_graph = build_runtime_graph
class MockNetwork(object):
......@@ -167,6 +166,164 @@ class GetAttrsWithDefaultsTest(test_util.TensorFlowTestCase):
_assert_attr_is_true('TRUE')
class LstmNetworkTest(test_util.TensorFlowTestCase):
test_spec_1 = """
component {
name: 'bi_lstm'
backend { registered_name: 'TestComponent' }
fixed_feature {
name: 'words'
fml: 'words'
size: 1
embedding_dim: 32
vocabulary_size: 1079813,
}
network_unit {
registered_name: 'LSTMNetwork'
parameters {
key: "hidden_layer_sizes"
value: "128"
}
}
}
"""
test_spec_linked = """
component {
name: 'bi_lstm'
backend { registered_name: 'TestComponent' }
fixed_feature {
name: 'words'
fml: 'words'
size: 1
embedding_dim: 32
vocabulary_size: 1079813,
}
linked_feature {
name: 'lstm_h'
fml: 'bias(0)'
embedding_dim: -1
size: 1
source_component: 'bi_lstm'
source_translator: 'history'
source_layer: 'lstm_h'
}
linked_feature {
name: 'lstm_c'
fml: 'bias(0)'
embedding_dim: -1
size: 1
source_component: 'bi_lstm'
source_translator: 'history'
source_layer: 'lstm_c'
}
network_unit {
registered_name: 'LSTMNetwork'
parameters {
key: "hidden_layer_sizes"
value: "128"
}
}
}
"""
def setUp(self):
# Clear the graph and all existing variables. Otherwise, variables created
# in different tests may collide with each other.
tf.reset_default_graph()
def construct_lstm_network_unit(self, master):
"""Helper to construct a LSTMNetwork. Doesn't call create() yet."""
component = MockComponent(master, master.spec.component[0])
with tf.variable_scope('bi_lstm'):
lstm_network_unit = network_units.LSTMNetwork(component)
return lstm_network_unit
def get_context_tensor_arrays(self, lstm_network_unit):
context_tensor_arrays = []
for context_layer in lstm_network_unit.context_layers:
context_tensor_arrays.append(context_layer.create_array(1))
return context_tensor_arrays
def fixed_word_embeddings(self):
"""Helper for returning fixed embeddings, for 1 word feature."""
words_tensor = tf.constant([[1.0] * 32], dtype=tf.float32)
return [network_units.NamedTensor(words_tensor, 'words')]
def testCanCreate(self):
"""Smoke test that the create() function doesn't raise errors."""
master = MockMaster()
master.spec = spec_pb2.MasterSpec()
text_format.Parse(self.test_spec_1, master.spec)
lstm_network_unit = self.construct_lstm_network_unit(master)
with tf.variable_scope('bi_lstm', reuse=True):
lstm_network_unit.create(
self.fixed_word_embeddings(), [],
self.get_context_tensor_arrays(lstm_network_unit), None, True)
def testCanCreateLinked(self):
"""Smoke test that the create() function doesn't raise errors."""
master = MockMaster()
master.spec = spec_pb2.MasterSpec()
text_format.Parse(self.test_spec_linked, master.spec)
lstm_network_unit = self.construct_lstm_network_unit(master)
with tf.variable_scope('bi_lstm', reuse=True):
lstm_network_unit.create(
self.fixed_word_embeddings(), [],
self.get_context_tensor_arrays(lstm_network_unit), None, True)
  def testRuntimeConcatenatedMatrices(self):
"""Test generation of concatenated matrices."""
# TODO(googleuser): Make MockComponent support runtime graph generation.
master = MockMaster(build_runtime_graph=False)
master.spec = spec_pb2.MasterSpec()
text_format.Parse(self.test_spec_1, master.spec)
lstm_network_unit = self.construct_lstm_network_unit(master)
with tf.variable_scope('bi_lstm', reuse=True):
lstm_network_unit.create(
self.fixed_word_embeddings(), [],
self.get_context_tensor_arrays(lstm_network_unit), None, False)
x_to_ico = lstm_network_unit.derived_params[0]()
h_to_ico = lstm_network_unit.derived_params[1]()
ico_bias = lstm_network_unit.derived_params[2]()
# Should be the word dimension (32) to 3x the hidden dimension (128).
self.assertEqual(x_to_ico.shape, (32, 384))
self.assertEqual(x_to_ico.op.name, 'bi_lstm/x_to_ico')
# Should be the hidden dimension (128) to 3x the hidden dimension (128).
self.assertEqual(h_to_ico.shape, (128, 384))
self.assertEqual(h_to_ico.op.name, 'bi_lstm/h_to_ico')
# Should be equal to the hidden dimension (128) times 3.
self.assertEqual(ico_bias.shape, (384,))
self.assertEqual(ico_bias.op.name, 'bi_lstm/ico_bias')
  def testRuntimeConcatenatedMatricesLinked(self):
"""Test generation of concatenated matrices."""
# TODO(googleuser): Make MockComponent support runtime graph generation.
master = MockMaster(build_runtime_graph=False)
master.spec = spec_pb2.MasterSpec()
text_format.Parse(self.test_spec_linked, master.spec)
lstm_network_unit = self.construct_lstm_network_unit(master)
with tf.variable_scope('bi_lstm', reuse=True):
lstm_network_unit.create(
self.fixed_word_embeddings(), [],
self.get_context_tensor_arrays(lstm_network_unit), None, False)
x_to_ico = lstm_network_unit.derived_params[0]()
h_to_ico = lstm_network_unit.derived_params[1]()
ico_bias = lstm_network_unit.derived_params[2]()
# Should be the word dimension (32) to 3x the hidden dimension (128).
self.assertEqual(x_to_ico.shape, (32, 384))
# Should be the hidden dimension (128) to 3x the hidden dimension (128).
self.assertEqual(h_to_ico.shape, (128, 384))
# Should be equal to the hidden dimension (128) times 3.
self.assertEqual(ico_bias.shape, (384,))
class GatherNetworkTest(test_util.TensorFlowTestCase):
def setUp(self):
......@@ -214,12 +371,30 @@ class GatherNetworkTest(test_util.TensorFlowTestCase):
network = network_units.GatherNetwork(self._component)
# Construct a batch of two items with 3 and 2 steps, respectively.
indices = tf.constant(
[
# item 1
[1],
[2],
[0],
# item 2
[-1],
[0],
[-1]
],
dtype=tf.int64)
features = tf.constant(
[
# item 1
[1.0, 1.5],
[2.0, 2.5],
[3.0, 3.5],
# item 2
[4.0, 4.5],
[5.0, 5.5],
[6.0, 6.5]
],
dtype=tf.float32)
fixed_embeddings = []
linked_embeddings = [
......@@ -233,13 +408,16 @@ class GatherNetworkTest(test_util.TensorFlowTestCase):
gathered = outputs[0]
# Zeros will be substituted for index -1.
self.assertAllEqual(
gathered.eval(),
[
[2.0, 2.5], # gathered from 1
[3.0, 3.5], # gathered from 2
[1.0, 1.5], # gathered from 0
[0.0, 0.0], # gathered from -1
[4.0, 4.5], # gathered from 0
[0.0, 0.0] # gathered from -1
])
def testTrainablePadding(self):
self._component.spec.network_unit.parameters['trainable_padding'] = 'true'
......@@ -248,12 +426,30 @@ class GatherNetworkTest(test_util.TensorFlowTestCase):
network = network_units.GatherNetwork(self._component)
# Construct a batch of two items with 3 and 2 steps, respectively.
indices = tf.constant(
[
# item 1
[1],
[2],
[0],
# item 2
[-1],
[0],
[-1]
],
dtype=tf.int64)
features = tf.constant(
[
# item 1
[1.0, 1.5],
[2.0, 2.5],
[3.0, 3.5],
# item 2
[4.0, 4.5],
[5.0, 5.5],
[6.0, 6.5]
],
dtype=tf.float32)
fixed_embeddings = []
linked_embeddings = [
......@@ -299,8 +495,8 @@ class IdentityInitializerTest(test_util.TensorFlowTestCase):
"""
with tf.Graph().as_default(), self.test_session() as session:
np.random.seed(4)
tensor = network_units.add_var_initialized(
'tensor', shape, 'identity', divisor=divisor, stddev=std)
session.run(tf.global_variables_initializer())
actual = session.run(tensor)
self.assertAllClose(actual, expected, 1e-8, 1e-8)
......@@ -345,13 +541,13 @@ class IdentityInitializerTest(test_util.TensorFlowTestCase):
divisor = 3.
std = 1e-3
shape = (6, 3)
    m = divisor / shape[-1]
    expected = [[m, 4.99951362e-04, -9.95908980e-04],
                [m, -4.18301526e-04, -1.58457726e-03],
                [-6.47706795e-04, m, 3.32250027e-04],
                [-1.14747661e-03, m, -8.79869258e-05],
                [4.25072387e-04, 3.32253141e-04, m],
                [3.50997143e-04, -6.06887275e-04, m]]
self.IdentityInitializerHelper(shape, expected, divisor, std)
def testIdentityInitializerNonSquareRank2FirstDimSmaller(self):
......@@ -368,14 +564,14 @@ class IdentityInitializerTest(test_util.TensorFlowTestCase):
std = 1e-3
shape = (2, 2, 6)
m = divisor / shape[-1]
    expected = [[[5.05617063e-05, 4.99951362e-04, -9.95908980e-04,
                  6.93598529e-04, -4.18301526e-04, -1.58457726e-03],
                 [-6.47706795e-04, 5.98575163e-04, 3.32250027e-04,
                  -1.14747661e-03, 6.18669670e-04, -8.79869258e-05]],
                [[m, m, m, 3.50997143e-04, -6.06887275e-04, 1.54697930e-03],
                 [7.23341596e-04, 4.61355667e-05, -9.82991653e-04, m, m, m]]]
self.IdentityInitializerHelper(shape, expected, divisor, std)
def testIdentityInitializerNonSquareRank4(self):
......@@ -383,40 +579,110 @@ class IdentityInitializerTest(test_util.TensorFlowTestCase):
std = 1e-3
shape = (2, 3, 2, 8)
m = divisor / float(shape[-1])
    expected = [
        [[[5.05617063e-05, 4.99951362e-04, -9.95908980e-04, 6.93598529e-04,
           -4.18301526e-04, -1.58457726e-03, -6.47706795e-04, 5.98575163e-04],
          [3.32250027e-04, -1.14747661e-03, 6.18669670e-04, -8.79869258e-05,
           4.25072387e-04, 3.32253141e-04, -1.15681626e-03, 3.50997143e-04]],
         [[-6.06887275e-04, 1.54697930e-03, 7.23341596e-04, 4.61355667e-05,
           -9.82991653e-04, 5.44327377e-05, 1.59892938e-04, -1.20894820e-03],
          [2.22336012e-03, 3.94295203e-04, 1.69235771e-03, -1.11281220e-03,
           1.63574750e-03, -1.36096554e-03, -6.51225855e-04, 5.42451337e-04]],
         [[4.80062481e-05, -2.35807360e-03, -1.10558409e-03, 8.37836356e-04,
           2.08787085e-03, 9.14840959e-04, -2.76203355e-04, 7.96511886e-04],
          [-1.14379858e-03, 5.09919773e-04, -1.34746032e-03, -9.36010019e-06,
           -1.30704633e-04, 8.02086608e-04, -3.02963977e-04, 1.20200263e-03]]],
        [[[-1.96745284e-04, 8.36528721e-04, 7.86602264e-04, -1.84087583e-03,
           3.75474883e-05, 3.59280530e-05, -7.78739923e-04, 1.79410708e-04],
          [-1.45553437e-03, 5.56185201e-04, 5.09778853e-04, 3.00445536e-04,
           2.47658417e-03, 3.52343399e-04, 6.74710027e-05, -7.32264714e-04]],
         [[m, m, m, m, 1.58469542e-04, 1.99008291e-03, 1.16418756e-03,
           2.42660157e-04],
          [1.37992005e-03, -5.45587063e-05, 7.95233937e-04, 1.90899627e-05,
           m, m, m, m]],
         [[-1.09712186e-03, -5.28196048e-04, -2.37977528e-03, -6.07683673e-04,
           -1.07529014e-03, 2.02240516e-03, -5.64875314e-04, -1.54292909e-03],
          [8.70841788e-04, -1.75210531e-04, 4.86030076e-05, 1.88646198e-04,
           2.09313483e-04, -3.74444906e-04, 9.54698597e-04, 5.23247640e-04]]]]
self.IdentityInitializerHelper(shape, expected, divisor, std)
class FeatureIdDropoutTest(test_util.TensorFlowTestCase):
def setUp(self):
# Clear the graph and all existing variables. Otherwise, variables created
# in different tests may collide with each other.
tf.reset_default_graph()
def testApplyFeatureIdDropout(self):
channel = spec_pb2.FixedFeatureChannel()
text_format.Parse("""
vocabulary_size: 10
dropout_id: 8
dropout_keep_probability: [0.0, 0.25, 0.5, 0.75, 1.0]
""", channel)
with tf.Graph().as_default(), self.test_session():
with tf.variable_scope('test_scope'):
ids = tf.constant([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=tf.int64)
weights = tf.constant([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=tf.float32)
tensors = network_units.apply_feature_id_dropout(ids, weights, channel)
perturbed_ids = tensors[0].eval()
tf.logging.info('perturbed_ids = %s', perturbed_ids)
# Given the dropout_keep_probability values specified above:
# * ID 0 is never kept.
# * IDs 1-3 are randomly kept with varying probability.
# * IDs 4-9 are always kept.
# To avoid non-determinism, we only check for specific feature IDs at
# the extremes (never/always kept). Behavior in between the extremes
# should interpolate between the two extremes.
self.assertEqual(perturbed_ids[0], channel.dropout_id)
self.assertTrue(perturbed_ids[1] in (1, channel.dropout_id))
self.assertTrue(perturbed_ids[2] in (2, channel.dropout_id))
self.assertTrue(perturbed_ids[3] in (3, channel.dropout_id))
self.assertAllEqual(perturbed_ids[4:], [4, 5, 6, 7, 8, 9])
def testApplyFeatureIdDropoutSkip(self):
channel = spec_pb2.FixedFeatureChannel()
text_format.Parse("""
vocabulary_size: 2
dropout_id: 2
dropout_keep_probability: [0.0, 1.0]
""", channel)
with tf.Graph().as_default(), self.test_session():
with tf.variable_scope('test_scope'):
ids = tf.constant([0, 1], dtype=tf.int64)
weights = tf.constant([1, 1], dtype=tf.float32)
tensors = network_units.apply_feature_id_dropout(ids, weights, channel)
perturbed_ids, perturbed_weights = tensors[0].eval(), tensors[1].eval()
tf.logging.info('perturbed_ids = %s', perturbed_ids)
tf.logging.info('perturbed_weights = %s', perturbed_weights)
# Given the dropout_keep_probability values specified above:
      # * ID 0 is never kept; its weight is set to 0.
      # * ID 1 is always kept.
# To avoid non-determinism, we only check for specific feature IDs at
# the extremes (never/always kept).
self.assertEqual(perturbed_ids[0], channel.dropout_id)
self.assertEqual(perturbed_weights[0], 0)
self.assertEqual(perturbed_ids[1], 1)
self.assertEqual(perturbed_weights[1], 1)
if __name__ == '__main__':
googletest.main()
component {
name: "convnet"
transition_system {
registered_name: "shift-only"
parameters {
key: "parser_skip_deterministic"
value: "false"
}
}
resource {
name: "lexifuse-repository"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/lexifuse.lexifuse-repository/repository"
file_format: "repository"
record_format: "entity"
}
}
resource {
name: "brain-parser-model"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.convnet.model-init/brain-parser-model"
file_format: "model"
record_format: ""
}
}
resource {
name: "transition-system-data"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.convnet.model-init/transition-system-data"
file_format: "model"
record_format: ""
}
}
resource {
name: "words-embedding-input"
part {
file_pattern: "/readahead/512M/cns/lg-d/home/saft/corpora/word-embeddings/en/word2vec/1billion/word2vec-embedding-bi-true-32.sst"
file_format: "sstable"
record_format: "dist_belief.TokenEmbedding"
}
}
resource {
name: "words-vocab-input"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.convnet.model-init/vocab"
file_format: "text"
record_format: ""
}
}
resource {
name: "component-builder-module"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.convnet.component-builder-module/module-spec"
file_format: "pbtxt"
record_format: ""
}
}
fixed_feature {
name: "char_ngram"
fml: "input.token.lexifuse-char-ngram"
embedding_dim: 16
vocabulary_size: 16500
size: 1
predicate_map: "hashed"
}
fixed_feature {
name: "words"
fml: "input.word"
embedding_dim: 32
vocabulary_size: 39395
size: 1
predicate_map: "hashed"
}
network_unit {
registered_name: "IdentityNetwork"
}
backend {
registered_name: "ParserComponent"
}
num_actions: 1
attention_component: ""
component_builder {
registered_name: "components.common.dragnn.python.conv_component.ConvComponentBuilder"
parameters {
key: "depths"
value: "48,128"
}
parameters {
key: "output_dims"
value: "45"
}
parameters {
key: "widths"
value: "7"
}
}
training_beam_size: 1
inference_beam_size: 1
}
component {
name: "tagger"
transition_system {
registered_name: "tagger"
parameters {
key: "parser_skip_deterministic"
value: "false"
}
}
resource {
name: "tag-map"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/lexifuse.lexicon/tag-map"
file_format: "text"
record_format: ""
}
}
resource {
name: "lexifuse-repository"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/lexifuse.lexifuse-repository/repository"
file_format: "repository"
record_format: "entity"
}
}
resource {
name: "brain-parser-model"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.tagger.model-init/brain-parser-model"
file_format: "model"
record_format: ""
}
}
resource {
name: "transition-system-data"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.tagger.model-init/transition-system-data"
file_format: "model"
record_format: ""
}
}
resource {
name: "component-builder-module"
part {
file_pattern: "/cns/lg-d/home/chrisalberti/e/conv/dragnn-parser.tagger.component-builder-module/module-spec"
file_format: "pbtxt"
record_format: ""
}
}
linked_feature {
name: "convnet"
fml: "input.focus"
embedding_dim: -1
size: 1
source_component: "convnet"
source_translator: "identity"
source_layer: "conv0_logits"
}
network_unit {
registered_name: "IdentityNetwork"
}
backend {
registered_name: "ParserComponent"
}
num_actions: 45
attention_component: ""
component_builder {
registered_name: "bulk_component.BulkAnnotatorComponentBuilder"
}
training_beam_size: 1
inference_beam_size: 1
}
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils for supporting the DRAGNN runtime from the TF side."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import re
import tensorflow as tf
from dragnn.python import network_units
from syntaxnet.util import check
def add_hooks(component, cell_subgraph_spec):
"""Adds "hook" nodes to the graph, for use by the runtime.
The runtime hook nodes are not on the path to any required output, and will
not be called when running TF-based DRAGNN. As long as the TF graph is not
pruned, however, the DRAGNN runtime can call them.
Runtime hook nodes can perform any TF computation. Possible uses include:
* Applying stable names to existing tensors (e.g., via tf.identity()).
* Converting variable data from a TF-friendly or training-friendly format
into a runtime-friendly format.
NB: There are several restrictions on the context in which this function is
called. In brief, call ComponentBuilderBase._add_runtime_hooks() at the top
of each ComponentBuilderSubclass.build_*() method. In detail, this:
* Must be called in the variable scope of the |component|, so variable
references in component.get_variable() work.
* Must be called, possibly transitively, from one of the |component|'s
build_*() methods, so MasterBuilder.read_from_avg is set properly for
component.get_variable().
* Must not be called from within a tf.while_loop(), or the hook nodes will
not work. In particular, NetworkUnitInterface.create() is called from a
tf.while_loop() in DynamicComponentBuilder.
Args:
component: Component for which to add hooks.
cell_subgraph_spec: CellSubgraphSpec for which to add hooks.
"""
for channel_id, feature_spec in enumerate(component.spec.linked_feature):
if feature_spec.embedding_dim != -1:
_add_hooks_for_linked_embedding_matrix(component, channel_id)
for channel_id, feature_spec in enumerate(component.spec.fixed_feature):
if feature_spec.embedding_dim != -1:
_add_hooks_for_fixed_embedding_matrix(component, channel_id)
for params in component.network.params:
_add_hooks_for_trainable_params(component, params)
for parameter_getter in component.network.derived_params:
_add_hooks_for_derived_parameter(parameter_getter)
_add_hook_node(
tf.constant(cell_subgraph_spec.SerializeToString(), tf.string),
'{}/EXPORT/CellSubgraphSpec'.format(component.name))
def _blocked_and_dtype_transformations(tensor):
"""Yields variants of a tensor, for standard blocking/dtype variants.
Args:
tensor (tf.Tensor): Input tensor.
Yields:
(modified_tensor, suffix) pairs, where `modified_tensor` is a transformed
version of the input, and `suffix` is a string like "/blocked32".
"""
for blocking_level in (32, 48):
blocked = make_padded_blocked_matrix(tensor, blocking_level)
bfloat16_blocked = tf.to_bfloat16(bfloat16_permutation(blocked))
yield blocked, '/blocked{}'.format(blocking_level)
yield bfloat16_blocked, '/blocked{}/bfloat16'.format(blocking_level)
def _add_hooks_for_linked_embedding_matrix(component, channel_id):
"""Adds runtime hooks for a linked embedding matrix.
The computation performed by network_units.pass_through_embedding_matrix() is
equivalent to the following:
for i in range(stride):
if step_idx[i] == -1:
outputs[i,:] = out_of_bounds_vector
else:
outputs[i,:] = tf.matmul(act_block[i,:], weight_matrix)
The implementation uses clever arithmetic to do this in one matmul per batch.
Specifically, the weight_matrix is extended with the out_of_bounds_vector and
each activation vector is extended with a 0/1 out-of-bounds indicator. Then,
multiplying the two suffices, assuming that act_block[i,:] is set to zero for
out-of-bounds links.
While this works well for training and high-throughput batched computation, it
isn't the best for the runtime:
* Appending a 0/1 indicator to the input activation vector requires a copy.
Ideally, we could use the input activation vector by reference alone.
    * In order to access the |out_of_bounds_vector| as a contiguous array,
the runtime must load the linked embedding matrix in row-major format,
which may not be the fastest format for arithmetic.
* The dimensions of the extended-by-1 matrix and vector are likely to be
pessimal. Most dimensions are specified as 2^n, and adding one element
produces maximal padding on the trailing elements, which in turn wastes
memory, reduces cache utilization, etc.
Therefore, in the runtime we split the linked embedding matrix into a separate
weight matrix and out-of-bounds vector.
Args:
component: Component for which to add hooks.
channel_id: Linked embedding channel for which to add hooks.
"""
var_name = network_units.linked_embeddings_name(channel_id)
extended_matrix = component.get_variable(var_name)
extended_num_rows = tf.shape(extended_matrix)[0]
matrix, vector = tf.split(extended_matrix, [extended_num_rows - 1, 1], 0)
transposed = tf.transpose(matrix)
hook_name = functools.partial(_get_hook_name, component, var_name)
_add_hook_node(matrix, hook_name('/weights'))
_add_hook_node(transposed, hook_name('/weights/transposed'))
# Add blocked versions of the matrix and its transpose.
for blocked, blocked_suffix in _blocked_and_dtype_transformations(matrix):
blocked_name = hook_name('/weights/matrix' + blocked_suffix)
_add_hook_node(blocked, blocked_name)
for blocked, blocked_suffix in _blocked_and_dtype_transformations(transposed):
blocked_name = hook_name('/weights/transposed' + blocked_suffix)
_add_hook_node(blocked, blocked_name)
# Add shape and out-of-bounds information.
_add_hook_node(tf.shape(transposed), hook_name('/weights/transposed/shape'))
_add_hook_node(vector, _get_hook_name(component, var_name, '/out_of_bounds'))
def _add_hooks_for_fixed_embedding_matrix(component, channel_id):
"""Adds runtime hooks for a fixed embedding matrix.
The hooks remove the last row from the embedding matrix. The extra row was
probably intended for out-of-vocabulary items, but those are handled in the
feature system and the extra row is never used.
Args:
component: Component for which to add hooks.
channel_id: Fixed embedding channel for which to add hooks.
"""
var_name = network_units.fixed_embeddings_name(channel_id)
extended_matrix = component.get_variable(var_name)
extended_num_rows = tf.shape(extended_matrix)[0]
matrix = tf.slice(extended_matrix, [0, 0], [extended_num_rows - 1, -1])
# TODO(googleuser): If the extra row is removed from the variable itself, remove
# the tf.slice() and point the hook directly at the variable.
_add_hook_node(matrix, _get_hook_name(component, var_name, '/trimmed'))
def _add_hooks_for_derived_parameter(getter):
"""Adds hooks for derived parameters.
Derived parameters are typically slight format modifications of regular
parameters, exposed because doing the computation in Python is more convenient
than as VariableStore wrappers.
Args:
getter: Function which, when called, will return the derived tensor.
"""
parameter = getter()
full_name = parameter.op.name
def _hook_name(base_name):
"""Returns a hook node name constructed from a base name."""
return full_name + base_name
if parameter.shape.ndims != 2:
tf.logging.info('Not adding matrix hooks for derived parameter %s',
full_name)
return
_add_hook_node(tf.transpose(parameter), _hook_name('/transposed'))
for blocked, blocked_suffix in _blocked_and_dtype_transformations(parameter):
_add_hook_node(blocked, _hook_name('/matrix' + blocked_suffix))
def _add_hooks_for_trainable_params(component, params):
"""Adds runtime hooks for a variable of trainable parameters.
Ignores parameters that are not statically-deducible as matrices.
Args:
component: Component for which to add hooks.
params: Variable for which to add hooks.
"""
full_name = params.op.name
matrix = component.get_variable(var_params=params)
# Only add hooks for tensors that are statically-deducible as matrices.
if params.shape.ndims != 2:
tf.logging.info('Not adding hooks for trainable params %s', full_name)
return
# Infer the suffix to append to variable names, if any, based on whether the
# possibly-averaged |matrix| is named differently than the |params|.
suffix = re.sub('^' + re.escape(full_name), '', matrix.op.name)
check.Ne(suffix, matrix.op.name,
'Failed to find suffix for params %s' % full_name)
def _hook_name(base_name):
"""Returns a hook node name constructed from a base name."""
return full_name + base_name + suffix
# Add the matrix and its transpose.
transposed = tf.transpose(matrix)
_add_hook_node(matrix, _hook_name('/matrix'))
_add_hook_node(transposed, _hook_name('/transposed'))
# Add blocked versions of the matrix and its transpose.
for blocked, blocked_suffix in _blocked_and_dtype_transformations(matrix):
_add_hook_node(blocked, _hook_name('/matrix' + blocked_suffix))
for blocked, blocked_suffix in _blocked_and_dtype_transformations(transposed):
_add_hook_node(blocked, _hook_name('/transposed' + blocked_suffix))
# Also add hooks for the original shapes, which are obscured by padding.
_add_hook_node(tf.shape(matrix), _hook_name('/matrix/shape'))
_add_hook_node(tf.shape(transposed), _hook_name('/transposed/shape'))
def make_padded_blocked_matrix(matrix, block_size):
"""Converts a matrix to padded column-blocked format.
For example, given a [64,127] matrix and block_size=16, this function returns
an [8,64,16] tensor where the 8 inner sub-matrices, when concatenated left to
right, re-constitute the original matrix. Note that the 8th sub-matrix has a
final column of padding.
Args:
matrix: The matrix to convert.
block_size: The number of columns per block.
Returns:
Padded column-blocked matrix.
"""
shape = tf.shape(matrix)
num_rows = shape[0]
num_columns = shape[1]
# Compute the amount of padding and resulting number of blocks.
last_block_size = num_columns % block_size
padding_size = (block_size - last_block_size) % block_size
num_blocks = (num_columns + padding_size) // block_size
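  # E.g., for the [64,127] matrix and block_size=16 from the docstring:
  # last_block_size = 15, padding_size = 1, and num_blocks = 8.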
# Somehow the obvious approach based on tf.split() and tf.stack() doesn't work
# (seems that the number of splits needs to be statically-known), but this
# alternative based on tf.transpose() and tf.reshape() does. Continuing the
# example from the docstring...
padded = tf.pad(matrix, [[0, 0], [0, padding_size]]) # [64,127] => [64,128]
transposed = tf.transpose(padded) # => [128,64]
blocked = tf.reshape(transposed, [num_blocks, block_size,
num_rows]) # => [8,16,64]
return tf.transpose(blocked, [0, 2, 1]) # => [8,64,16]
def bfloat16_permutation(tensor):
"""Permutes values in the last dimension of a tensor.
This permutation is used so that we can directly use unpacklo/unpackhi AVX2
instructions on the matrix coefficients. These unpacking instructions
effectively permute the data. See FastUnpackPermutation() and
AvxFloatVecArray::Load(const TruncatedFloat16 *) in avx_vector_array.h for
more details.
Args:
tensor: Blocked matrix, the result of make_padded_blocked_matrix(). Must
have its last dimension a multiple of 16.
Returns:
Permuted matrix, suitable for calling tf.to_bfloat16() on. For testing
convenience we don't do so in this method.
Raises:
ValueError: If the matrix's block dimension is not a multiple of 16.
"""
orig_shape = tensor.shape
if tensor.shape[-1] % 16 != 0:
raise ValueError('Bad block dimension, must be divisible by 16')
permutation = [0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15]
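  # E.g., with a last dimension of 16, the columns are reordered so that the
  # two 8-element halves are interleaved in groups of four, matching the
  # layout produced by the AVX2 unpack instructions mentioned above.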
indices = tf.constant(
[16 * (i // 16) + permutation[i % 16] for i in xrange(orig_shape[-1])])
return tf.gather(tensor, indices, axis=len(orig_shape) - 1)
def _get_hook_name(component, variable_name, suffix):
"""Builds the name of a hook node.
Specifically, the name of the hook node is:
<component.name>/<variable_name><suffix><remainder>
where <remainder> is whatever follows <variable_name> in the name of the op
that produces the named variable. Recall that component.get_variable() may
return either the original variable or its moving average. These might have
names like:
foo_component/bar_variable
foo_component/bar_variable/ExponentialMovingAverage
In the examples above, the <remainder> is "" for the original variable and
"/ExponentialMovingAverage" for its moving average. Calling this function
with suffix="/baz_suffix" in either case would add hook nodes named:
foo_component/bar_variable/baz_suffix
foo_component/bar_variable/baz_suffix/ExponentialMovingAverage
Note that the suffix is inserted after the variable name, not necessarily at
the end of the entire op name.
Args:
component: Component that the hook node belongs to.
variable_name: Variable that the hook node name is based on.
suffix: Suffix to append to the variable name.
Returns:
Name of the hook node.
"""
variable = component.get_variable(variable_name)
full_name = variable.op.name
prefix = component.name + '/' + variable_name
hook_name = re.sub('^' + re.escape(prefix), prefix + suffix, full_name)
# If re.sub() did not match anything, it returns the unmodified input (i.e.,
# |full_name|). Enforce that some change was made.
check.Ne(
full_name, hook_name,
'Failed to match expected variable prefix "{}" in variable "{}"'.format(
prefix, full_name))
return hook_name
def _add_hook_node(tensor, fully_qualified_name):
"""Adds a hook node that outputs a tensor with a fully-qualified name."""
# Since the name is fully-qualified, insert the hook node into the top-level
# name scope.
with tf.name_scope(None):
tf.identity(tensor, name=fully_qualified_name)
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the runtime support utils."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from dragnn.protos import export_pb2
from dragnn.protos import spec_pb2
from dragnn.python import network_units
from dragnn.python import runtime_support
class MockNetwork(object):
"""Mock for tests."""
def __init__(self):
self.params = [
tf.get_variable('rank2', [64, 127], tf.float32),
tf.get_variable('rank3', [64, 127, 250], tf.float32)
]
self.derived_params = [
self._fake_derived_vector, self._fake_derived_parameter
]
def _fake_derived_vector(self):
value = tf.constant([1, 2, 3], dtype=tf.float32)
with tf.name_scope(None):
return tf.identity(value, name='derived/vector')
def _fake_derived_parameter(self):
# Use absolute scoping to put the derived parameter in the same namespace.
base_name = self.params[0].op.name.rsplit('/', 1)[0]
with tf.name_scope(None):
return tf.concat(
[self.params[0], self.params[0]],
axis=0,
name='{}/derived'.format(base_name))
class MockComponent(object):
"""Mock for tests."""
def __init__(self):
self.name = 'test_component'
self.spec = spec_pb2.ComponentSpec()
with tf.variable_scope(self.name):
self.network = MockNetwork()
def get_variable(self, var_name=None, var_params=None):
if var_name:
return tf.get_variable(var_name)
else:
return var_params
class RuntimeSupportTest(tf.test.TestCase):
"""Testing rig."""
def testAddLinkedHooks(self):
component = MockComponent()
link0 = component.spec.linked_feature.add()
link1 = component.spec.linked_feature.add()
link0.embedding_dim = -1 # direct link
link1.embedding_dim = 32 # transformed link
link0_matrix_name = network_units.linked_embeddings_name(0)
link1_matrix_name = network_units.linked_embeddings_name(1)
with self.test_session() as session:
graph = session.graph
# Create linked embedding matrices. Only channel 1 uses one.
with tf.variable_scope(component.name):
tf.get_variable(link1_matrix_name, shape=[64 + 1, 32], dtype=tf.float32)
# Add hooks. This should ignore channel 0 and add hooks for channel 1.
with tf.variable_scope(component.name, reuse=True):
runtime_support.add_hooks(component, export_pb2.CellSubgraphSpec())
# Check that no hooks were added for channel 0.
with self.assertRaises(KeyError):
graph.get_tensor_by_name(
'{}/{}/weights:0'.format(component.name, link0_matrix_name))
with self.assertRaises(KeyError):
graph.get_tensor_by_name('{}/{}/weights/transposed:0'.format(
component.name, link0_matrix_name))
with self.assertRaises(KeyError):
graph.get_tensor_by_name('{}/{}/weights/transposed/shape:0'.format(
component.name, link0_matrix_name))
with self.assertRaises(KeyError):
graph.get_tensor_by_name('{}/{}/weights/transposed/blocked32:0'.format(
component.name, link0_matrix_name))
with self.assertRaises(KeyError):
graph.get_tensor_by_name('{}/{}/weights/transposed/blocked48:0'.format(
component.name, link0_matrix_name))
with self.assertRaises(KeyError):
graph.get_tensor_by_name(
'{}/{}/out_of_bounds:0'.format(component.name, link0_matrix_name))
# Get the hooks added for channel 1.
weights = graph.get_tensor_by_name(
'{}/{}/weights:0'.format(component.name, link1_matrix_name))
transposed = graph.get_tensor_by_name('{}/{}/weights/transposed:0'.format(
component.name, link1_matrix_name))
transposed_shape = graph.get_tensor_by_name(
'{}/{}/weights/transposed/shape:0'.format(component.name,
link1_matrix_name))
transposed32 = graph.get_tensor_by_name(
'{}/{}/weights/transposed/blocked32:0'.format(component.name,
link1_matrix_name))
transposed48 = graph.get_tensor_by_name(
'{}/{}/weights/transposed/blocked48:0'.format(component.name,
link1_matrix_name))
out_of_bounds = graph.get_tensor_by_name(
'{}/{}/out_of_bounds:0'.format(component.name, link1_matrix_name))
# Check dimensions of the hooks.
tf.global_variables_initializer().run()
self.assertAllEqual(tf.shape(weights).eval(), [64, 32])
self.assertAllEqual(tf.shape(transposed).eval(), [32, 64])
self.assertAllEqual(transposed_shape.eval(), [32, 64])
self.assertAllEqual(tf.shape(transposed32).eval(), [2, 32, 32])
self.assertAllEqual(tf.shape(transposed48).eval(), [2, 32, 48])
self.assertAllEqual(tf.shape(out_of_bounds).eval(), [1, 32])
def testAddFixedHooks(self):
component = MockComponent()
fixed0 = component.spec.fixed_feature.add()
fixed1 = component.spec.fixed_feature.add()
fixed0.embedding_dim = -1
fixed1.embedding_dim = 32
fixed0.vocabulary_size = 100
fixed1.vocabulary_size = 1000
fixed0_matrix_name = network_units.fixed_embeddings_name(0)
fixed1_matrix_name = network_units.fixed_embeddings_name(1)
with self.test_session() as session:
graph = session.graph
# Create fixed embedding matrices. Only channel 1 uses one.
with tf.variable_scope(component.name):
tf.get_variable(
fixed1_matrix_name, shape=[1000 + 1, 32], dtype=tf.float32)
# Add hooks. This should ignore channel 0 and add hooks for channel 1.
with tf.variable_scope(component.name, reuse=True):
runtime_support.add_hooks(component, export_pb2.CellSubgraphSpec())
# Check that no hooks were added for channel 0.
with self.assertRaises(KeyError):
graph.get_tensor_by_name(
'{}/{}/trimmed:0'.format(component.name, fixed0_matrix_name))
# Get the hooks added for channel 1.
trimmed = graph.get_tensor_by_name(
'{}/{}/trimmed:0'.format(component.name, fixed1_matrix_name))
# Check dimensions of the hooks.
tf.global_variables_initializer().run()
self.assertAllEqual(tf.shape(trimmed).eval(), [1000, 32])
def testAddParamsHooks(self):
component = MockComponent()
rank2_name = 'rank2'
rank3_name = 'rank3'
with self.test_session() as session:
graph = session.graph
# Add hooks. This should add hooks for all rank-2 params.
with tf.variable_scope(component.name, reuse=True):
runtime_support.add_hooks(component, export_pb2.CellSubgraphSpec())
# Check that no hooks were added for the rank-3 params.
with self.assertRaises(KeyError):
graph.get_tensor_by_name(
'{}/{}/matrix:0'.format(component.name, rank3_name))
with self.assertRaises(KeyError):
graph.get_tensor_by_name(
'{}/{}/transposed:0'.format(component.name, rank3_name))
with self.assertRaises(KeyError):
graph.get_tensor_by_name(
'{}/{}/matrix/blocked32:0'.format(component.name, rank3_name))
with self.assertRaises(KeyError):
graph.get_tensor_by_name(
'{}/{}/matrix/blocked48:0'.format(component.name, rank3_name))
with self.assertRaises(KeyError):
graph.get_tensor_by_name(
'{}/{}/transposed/blocked32:0'.format(component.name, rank3_name))
with self.assertRaises(KeyError):
graph.get_tensor_by_name(
'{}/{}/transposed/blocked48:0'.format(component.name, rank3_name))
with self.assertRaises(KeyError):
graph.get_tensor_by_name(
'{}/{}/matrix/shape:0'.format(component.name, rank3_name))
with self.assertRaises(KeyError):
graph.get_tensor_by_name(
'{}/{}/transposed/shape:0'.format(component.name, rank3_name))
# Get the hooks added for each variable.
matrix = graph.get_tensor_by_name(
'{}/{}/matrix:0'.format(component.name, rank2_name))
transposed = graph.get_tensor_by_name(
'{}/{}/transposed:0'.format(component.name, rank2_name))
matrix32 = graph.get_tensor_by_name(
'{}/{}/matrix/blocked32:0'.format(component.name, rank2_name))
matrix48 = graph.get_tensor_by_name(
'{}/{}/matrix/blocked48:0'.format(component.name, rank2_name))
transposed32 = graph.get_tensor_by_name(
'{}/{}/transposed/blocked32:0'.format(component.name, rank2_name))
transposed48 = graph.get_tensor_by_name(
'{}/{}/transposed/blocked48:0'.format(component.name, rank2_name))
matrix_shape = graph.get_tensor_by_name(
'{}/{}/matrix/shape:0'.format(component.name, rank2_name))
transposed_shape = graph.get_tensor_by_name(
'{}/{}/transposed/shape:0'.format(component.name, rank2_name))
# Check dimensions of the hooks.
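      # Blocked hooks split the column dimension into blocks of 32 or 48,
      # zero-padding the final block, so 127 columns become 4 blocks of 32
      # or 3 blocks of 48.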
tf.global_variables_initializer().run()
self.assertAllEqual(tf.shape(matrix).eval(), [64, 127])
self.assertAllEqual(tf.shape(transposed).eval(), [127, 64])
self.assertAllEqual(matrix_shape.eval(), [64, 127])
self.assertAllEqual(transposed_shape.eval(), [127, 64])
self.assertAllEqual(tf.shape(matrix32).eval(), [4, 64, 32])
self.assertAllEqual(tf.shape(matrix48).eval(), [3, 64, 48])
self.assertAllEqual(tf.shape(transposed32).eval(), [2, 127, 32])
self.assertAllEqual(tf.shape(transposed48).eval(), [2, 127, 48])
def testAddDerivedParamHooks(self):
component = MockComponent()
derived_name = 'derived'
with self.test_session() as session:
graph = session.graph
# Add hooks.
with tf.variable_scope(component.name, reuse=True):
runtime_support.add_hooks(component, export_pb2.CellSubgraphSpec())
session.run(tf.global_variables_initializer())
# Get hooks for the derived vector.
vector = graph.get_tensor_by_name('derived/vector:0')
self.assertEqual(vector.shape, (3,))
# Get the hooks for the derived variable.
matrix = graph.get_tensor_by_name(
'{}/{}/matrix/blocked32:0'.format(component.name, derived_name))
self.assertAllEqual(tf.shape(matrix).eval(), [4, 128, 32])
# Check the bfloat16 version. It should have the same shape.
bfloat16_matrix = graph.get_tensor_by_name(
'{}/{}/matrix/blocked32/bfloat16:0'.format(component.name,
derived_name))
self.assertAllEqual(tf.shape(bfloat16_matrix).eval(), [4, 128, 32])
def testMakePaddedBlockedMatrix(self):
with self.test_session():
matrix = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15],
[16, 17, 18, 19, 20]]
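      # With block size 2, the 4x5 matrix splits into three 4x2 column
      # blocks; the final block is padded with zeros to full width.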
expected_blocked = [[[1, 2], [6, 7], [11, 12],
[16, 17]], [[3, 4], [8, 9], [13, 14], [18, 19]],
[[5, 0], [10, 0], [15, 0], [20, 0]]]
matrix = tf.constant(matrix, tf.float32)
actual_blocked = runtime_support.make_padded_blocked_matrix(matrix, 2)
self.assertAllEqual(actual_blocked.eval(), expected_blocked)
def testBfloat16Permutation(self):
with self.test_session():
matrix = [list(range(16))]
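      # Within each run of 16 values, the permutation keeps the first and
      # last groups of four in place and swaps the two middle groups.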
expected_permuted = [[
0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
]]
matrix = tf.constant(matrix, tf.float32)
actual_permuted = runtime_support.bfloat16_permutation(matrix)
self.assertAllEqual(actual_permuted.eval(), expected_permuted)
def testLargerBfloat16Permutation(self):
with self.test_session() as session:
matrix = tf.random_uniform((3, 4, 32))
permuted = runtime_support.bfloat16_permutation(matrix)
matrix, actual_permuted = session.run([matrix, permuted])
      # Spot-check representative items: within each aligned run of 16 values,
      # the permutation fixes the first and last groups of four and swaps the
      # two middle groups.
self.assertEqual(matrix[0, 0, 0], actual_permuted[0, 0, 0])
self.assertEqual(matrix[0, 0, 1], actual_permuted[0, 0, 1])
self.assertEqual(matrix[1, 1, 16], actual_permuted[1, 1, 16])
self.assertEqual(matrix[2, 0, 4], actual_permuted[2, 0, 8])
self.assertEqual(matrix[2, 0, 5], actual_permuted[2, 0, 9])
self.assertEqual(matrix[2, 1, 8], actual_permuted[2, 1, 4])
self.assertEqual(matrix[2, 1, 8 + 16], actual_permuted[2, 1, 4 + 16])
def testAddCellSubgraphSpecHook(self):
component = MockComponent()
cell = export_pb2.CellSubgraphSpec()
cell.input.add(
name='feature',
tensor='feature_tensor',
type=export_pb2.CellSubgraphSpec.Input.TYPE_FEATURE)
cell.input.add(
name='recurrent',
tensor='recurrent_tensor',
type=export_pb2.CellSubgraphSpec.Input.TYPE_RECURRENT)
cell.output.add(name='layer_0', tensor='layer_0_tensor')
cell.output.add(name='logits', tensor='logits_tensor')
with self.test_session() as session:
graph = session.graph
# Add hooks for the cell constructed above.
with tf.variable_scope(component.name, reuse=True):
runtime_support.add_hooks(component, cell)
# Get the hook containing the wire-format proto.
cell_wire_format = graph.get_tensor_by_name(
'{}/EXPORT/CellSubgraphSpec:0'.format(component.name))
# Check that the hook matches the cell.
tf.global_variables_initializer().run()
self.assertEqual(cell_wire_format.eval(), cell.SerializeToString())
if __name__ == '__main__':
tf.test.main()
......@@ -16,30 +16,19 @@
import os
import tensorflow as tf
from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest
from dragnn.python import dragnn_ops
from dragnn.python import sentence_io
from syntaxnet import sentence_pb2
FLAGS = tf.app.flags.FLAGS
def setUpModule():
if not hasattr(FLAGS, 'test_srcdir'):
FLAGS.test_srcdir = ''
if not hasattr(FLAGS, 'test_tmpdir'):
FLAGS.test_tmpdir = tf.test.get_temp_dir()
from syntaxnet import test_flags
class ConllSentenceReaderTest(test_util.TensorFlowTestCase):
class ConllSentenceReaderTest(tf.test.TestCase):
def setUp(self):
# This dataset contains 54 sentences.
self.filepath = os.path.join(
FLAGS.test_srcdir,
test_flags.source_root(),
'syntaxnet/testdata/mini-training-set')
self.batch_size = 20
......@@ -82,4 +71,4 @@ class ConllSentenceReaderTest(test_util.TensorFlowTestCase):
if __name__ == '__main__':
googletest.main()
tf.test.main()
......@@ -15,7 +15,6 @@
"""Utils for building DRAGNN specs."""
from six.moves import xrange
import tensorflow as tf
from dragnn.protos import spec_pb2
......@@ -110,7 +109,9 @@ class ComponentSpecBuilder(object):
if transition_spec.registered_name == 'arc-standard':
return 'shift-reduce-step'
if transition_spec.registered_name in ('shift-only', 'tagger'):
if transition_spec.registered_name in ('shift-only', 'tagger', 'morpher',
'lm-transitions', 'dependency-label',
'category'):
if 'left_to_right' in transition_spec.parameters:
if transition_spec.parameters['left_to_right'] == 'false':
return 'reverse-token'
......
......@@ -27,15 +27,6 @@ from dragnn.python import spec_builder
from syntaxnet import parser_trainer
FLAGS = tf.app.flags.FLAGS
def setUpModule():
if not hasattr(FLAGS, 'test_srcdir'):
FLAGS.test_srcdir = ''
if not hasattr(FLAGS, 'test_tmpdir'):
FLAGS.test_tmpdir = tf.test.get_temp_dir()
class SpecBuilderTest(tf.test.TestCase):
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions to build DRAGNN MasterSpecs and schedule model training.
Provides functions to finish a MasterSpec, building required lexicons for it and
......@@ -23,13 +22,12 @@ import random
import tensorflow as tf
from six.moves import xrange
from tensorflow.core.framework.summary_pb2 import Summary
from tensorflow.python.framework import errors
from tensorflow.python.platform import gfile
flags = tf.app.flags
FLAGS = flags.FLAGS
from syntaxnet.util import check
def calculate_component_accuracies(eval_res_values):
......@@ -59,7 +57,9 @@ def annotate_dataset(sess, annotator, eval_corpus):
end = min(start + batch_size, len(eval_corpus))
serialized_annotations = sess.run(
annotator['annotations'],
feed_dict={annotator['input_batch']: eval_corpus[start:end]})
feed_dict={
annotator['input_batch']: eval_corpus[start:end]
})
assert len(serialized_annotations) == end - start
processed.extend(serialized_annotations)
tf.logging.info('Done. Produced %d annotations', len(processed))
......@@ -81,16 +81,60 @@ def get_summary_writer(tensorboard_dir):
return summary_writer
def generate_target_per_step_schedule(pretrain_steps, train_steps):
"""Generates a sampled training schedule.
Arguments:
    pretrain_steps: List of the number of pre-training steps for each target.
    train_steps: List of the number of sampled training steps for each target.
  Returns:
    Python list of length sum(pretrain_steps + train_steps) containing the
    target index to train at each step.
"""
check.Eq(len(pretrain_steps), len(train_steps))
  # Arbitrary seed to make sure the returned schedule is deterministic.
random.seed(0x31337)
tf.logging.info('Determining the training schedule...')
target_per_step = []
for target_idx in xrange(len(pretrain_steps)):
target_per_step += [target_idx] * pretrain_steps[target_idx]
train_steps = list(train_steps)
while sum(train_steps) > 0:
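    # Draw a step uniformly at random from the remaining steps, so each target
    # is sampled in proportion to its remaining step count.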
step = random.randint(0, sum(train_steps) - 1)
cumulative_steps = 0
for target_idx in xrange(len(train_steps)):
cumulative_steps += train_steps[target_idx]
if step < cumulative_steps:
break
assert train_steps[target_idx] > 0
train_steps[target_idx] -= 1
target_per_step.append(target_idx)
tf.logging.info('Training schedule defined!')
return target_per_step
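# For illustration: generate_target_per_step_schedule([1, 0], [2, 2]) first
# emits the pre-training step for target 0, then randomly interleaves the four
# remaining training steps between targets 0 and 1 (deterministically, given
# the fixed seed above).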
def run_training_step(sess, trainer, train_corpus, batch_size):
"""Runs a single iteration of train_op on a randomly sampled batch."""
batch = random.sample(train_corpus, batch_size)
sess.run(trainer['run'], feed_dict={trainer['input_batch']: batch})
def run_training(sess, trainers, annotator, evaluator, pretrain_steps,
train_steps, train_corpus, eval_corpus, eval_gold,
batch_size, summary_writer, report_every, saver,
checkpoint_filename, checkpoint_stats=None):
def run_training(sess,
trainers,
annotator,
evaluator,
pretrain_steps,
train_steps,
train_corpus,
eval_corpus,
eval_gold,
batch_size,
summary_writer,
report_every,
saver,
checkpoint_filename,
checkpoint_stats=None):
"""Runs multi-task DRAGNN training on a single corpus.
Arguments:
......@@ -117,30 +161,15 @@ def run_training(sess, trainers, annotator, evaluator, pretrain_steps,
checkpoint_filename: File to save checkpoints to.
checkpoint_stats: Stats of checkpoint.
"""
random.seed(0x31337)
if not checkpoint_stats:
checkpoint_stats = [0] * (len(train_steps) + 1)
tf.logging.info('Determining the training schedule...')
target_for_step = []
for target_idx in xrange(len(pretrain_steps)):
target_for_step += [target_idx] * pretrain_steps[target_idx]
while sum(train_steps) > 0:
step = random.randint(0, sum(train_steps) - 1)
cumulative_steps = 0
for target_idx in xrange(len(train_steps)):
cumulative_steps += train_steps[target_idx]
if step < cumulative_steps:
break
assert train_steps[target_idx] > 0
train_steps[target_idx] -= 1
target_for_step.append(target_idx)
tf.logging.info('Training schedule defined!')
target_per_step = generate_target_per_step_schedule(pretrain_steps,
train_steps)
best_eval_metric = -1.0
tf.logging.info('Starting training...')
actual_step = sum(checkpoint_stats[1:])
for step, target_idx in enumerate(target_for_step):
for step, target_idx in enumerate(target_per_step):
run_training_step(sess, trainers[target_idx], train_corpus, batch_size)
checkpoint_stats[target_idx + 1] += 1
if step % 100 == 0:
......
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for dragnn.python.trainer_lib."""
from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest
from dragnn.python import trainer_lib
class TrainerLibTest(test_util.TensorFlowTestCase):
def testImmutabilityOfArguments(self):
"""Tests that training schedule generation does not change its arguments."""
pretrain_steps = [1, 2, 3]
train_steps = [5, 5, 5]
trainer_lib.generate_target_per_step_schedule(pretrain_steps, train_steps)
self.assertEqual(pretrain_steps, [1, 2, 3])
self.assertEqual(train_steps, [5, 5, 5])
def testTrainingScheduleGenerationAndDeterminism(self):
"""Non-trivial schedule, check generation and determinism."""
pretrain_steps = [1, 2, 3]
train_steps = [5, 5, 5]
generated_schedule = trainer_lib.generate_target_per_step_schedule(
pretrain_steps, train_steps)
expected_schedule = [
0, 1, 1, 2, 2, 2, 1, 0, 2, 1, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2
]
self.assertEqual(generated_schedule, expected_schedule)
def testNoPretrainSteps(self):
"""Edge case, 1 target, no pretrain."""
generated_schedule = trainer_lib.generate_target_per_step_schedule([0],
[10])
expected_schedule = [0] * 10
self.assertEqual(generated_schedule, expected_schedule)
def testNoTrainSteps(self):
"""Edge case, 1 target, only pretrain."""
generated_schedule = trainer_lib.generate_target_per_step_schedule([10],
[0])
expected_schedule = [0] * 10
self.assertEqual(generated_schedule, expected_schedule)
if __name__ == '__main__':
googletest.main()
......@@ -330,7 +330,7 @@ class LayerNormBasicLSTMNetwork(BaseLSTMNetwork):
def _cell_closure(scope):
"""Applies the LSTM cell to the current inputs and state."""
return cell(input_tensor, state, scope)
return cell(input_tensor, state, scope=scope)
unused_h, state = self._apply_with_captured_variables(_cell_closure)
......
# -*- Python -*-
# Given a source file, generate a test name.
# e.g. "common_runtime/direct_session_test.cc" becomes
# "common_runtime_direct_session_test"
def src_to_test_name(src):
return src.replace("/", "_").split(".")[0]
# Return the options to use for a C++ library or binary build.
# Uses the ":optmode" config_setting to pick the options.
load(
"@org_tensorflow//tensorflow/core:platform/default/build_config_root.bzl",
"tf_cuda_tests_tags",
"tf_sycl_tests_tags",
)
load(
"@local_config_cuda//cuda:build_defs.bzl",
"if_cuda",
"cuda_default_copts"
)
# List of proto files for android builds
def tf_android_core_proto_sources(core_proto_sources_relative):
return ["@org_tensorflow//tensorflow/core:" + p
for p in core_proto_sources_relative]
# Returns the list of pb.h and proto.h headers that are generated for
# tf_android_core_proto_sources().
def tf_android_core_proto_headers(core_proto_sources_relative):
return (["@org_tensorflow//tensorflow/core/" + p.replace(".proto", ".pb.h")
for p in core_proto_sources_relative] +
["@org_tensorflow//tensorflow/core/" + p.replace(".proto", ".proto.h")
for p in core_proto_sources_relative])
def if_android_arm(a):
return select({
"@org_tensorflow//tensorflow:android_arm": a,
"//conditions:default": [],
})
def if_android_arm64(a):
return select({
"@org_tensorflow//tensorflow:android_arm64": a,
"//conditions:default": [],
})
def if_not_android(a):
return select({
"@org_tensorflow//tensorflow:android": [],
"//conditions:default": a,
})
def if_android(a):
return select({
"@org_tensorflow//tensorflow:android": a,
"//conditions:default": [],
})
def if_ios(a):
return select({
"@org_tensorflow//tensorflow:ios": a,
"//conditions:default": [],
})
def if_mobile(a):
return select({
"@org_tensorflow//tensorflow:android": a,
"@org_tensorflow//tensorflow:ios": a,
"//conditions:default": [],
})
def if_not_mobile(a):
return select({
"@org_tensorflow//tensorflow:android": [],
"@org_tensorflow//tensorflow:ios": [],
"//conditions:default": a,
})
def if_not_windows(a):
return select({
"@org_tensorflow//tensorflow:windows": [],
"//conditions:default": a,
})
def if_x86(a):
return select({
"@org_tensorflow//tensorflow:linux_x86_64": a,
"@org_tensorflow//tensorflow:windows": a,
"//conditions:default": [],
})
def tf_copts():
return (["-DEIGEN_AVOID_STL_ARRAY",
"-Iexternal/gemmlowp",
"-Wno-sign-compare",
"-fno-exceptions",] +
if_cuda(["-DGOOGLE_CUDA=1"]) +
if_android_arm(["-mfpu=neon"]) +
select({
"@org_tensorflow//tensorflow:android": [
"-std=c++11",
"-DTF_LEAN_BINARY",
"-O2",
],
"@org_tensorflow//tensorflow:darwin": [],
"@org_tensorflow//tensorflow:windows": [
"/DLANG_CXX11",
"/D__VERSION__=\\\"MSVC\\\"",
"/DPLATFORM_WINDOWS",
"/DEIGEN_HAS_C99_MATH",
"/DTENSORFLOW_USE_EIGEN_THREADPOOL",
],
"@org_tensorflow//tensorflow:ios": ["-std=c++11"],
"//conditions:default": ["-pthread"]}))
def tf_opts_nortti_if_android():
return if_android([
"-fno-rtti",
"-DGOOGLE_PROTOBUF_NO_RTTI",
"-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
])
# Given a list of "op_lib_names" (a list of files in the ops directory
# without their .cc extensions), generate a library for each file.
def tf_gen_op_libs(op_lib_names, deps=None):
# Make library out of each op so it can also be used to generate wrappers
# for various languages.
if not deps:
deps = []
for n in op_lib_names:
native.cc_library(name=n + "_op_lib",
copts=tf_copts(),
srcs=["ops/" + n + ".cc"],
deps=deps + ["@org_tensorflow//tensorflow/core:framework"],
visibility=["//visibility:public"],
alwayslink=1,
linkstatic=1,)
def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="",
op_gen="@org_tensorflow//tensorflow/cc:cc_op_gen_main",
deps=None,
override_file=None,
include_internal_ops=0):
# Construct an op generator binary for these ops.
tool = out_ops_file + "_gen_cc"
if deps == None:
deps = [pkg + ":" + name + "_op_lib"]
native.cc_binary(
name = tool,
copts = tf_copts(),
linkopts = ["-lm"],
linkstatic = 1, # Faster to link this one-time-use binary dynamically
deps = [op_gen] + deps
)
if override_file == None:
srcs = []
override_arg = ","
else:
srcs = [override_file]
override_arg = "$(location " + override_file + ")"
native.genrule(
name=name + "_genrule",
outs=[out_ops_file + ".h", out_ops_file + ".cc",
out_ops_file + "_internal.h", out_ops_file + "_internal.cc"],
srcs=srcs,
tools=[":" + tool],
cmd=("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " +
"$(location :" + out_ops_file + ".cc) " + override_arg + " " +
str(include_internal_ops)))
# Given a list of "op_lib_names" (a list of files in the ops directory
# without their .cc extensions), generate individual C++ .cc and .h
# files for each of the ops files mentioned, and then generate a
# single cc_library called "name" that combines all the
# generated C++ code.
#
# For example, for:
# tf_gen_op_wrappers_cc("tf_ops_lib", [ "array_ops", "math_ops" ])
#
# This will ultimately generate ops/* files and a library like:
#
# cc_library(name = "tf_ops_lib",
# srcs = [ "ops/array_ops.cc",
# "ops/math_ops.cc" ],
# hdrs = [ "ops/array_ops.h",
# "ops/math_ops.h" ],
# deps = [ ... ])
#
# Plus a private library for the "hidden" ops.
# cc_library(name = "tf_ops_lib_internal",
# srcs = [ "ops/array_ops_internal.cc",
# "ops/math_ops_internal.cc" ],
# hdrs = [ "ops/array_ops_internal.h",
# "ops/math_ops_internal.h" ],
# deps = [ ... ])
# TODO(googleuser): Cleaner approach for hidden ops.
def tf_gen_op_wrappers_cc(name,
op_lib_names=[],
other_srcs=[],
other_hdrs=[],
pkg="",
deps=[
"@org_tensorflow//tensorflow/cc:ops",
"@org_tensorflow//tensorflow/cc:scope",
"@org_tensorflow//tensorflow/cc:const_op",
],
op_gen="@org_tensorflow//tensorflow/cc:cc_op_gen_main",
override_file=None,
include_internal_ops=0,
visibility=None):
subsrcs = other_srcs
subhdrs = other_hdrs
internalsrcs = []
internalhdrs = []
for n in op_lib_names:
tf_gen_op_wrapper_cc(
n, "ops/" + n, pkg=pkg, op_gen=op_gen, override_file=override_file,
include_internal_ops=include_internal_ops)
subsrcs += ["ops/" + n + ".cc"]
subhdrs += ["ops/" + n + ".h"]
internalsrcs += ["ops/" + n + "_internal.cc"]
internalhdrs += ["ops/" + n + "_internal.h"]
native.cc_library(name=name,
srcs=subsrcs,
hdrs=subhdrs,
deps=deps + if_not_android([
"@org_tensorflow//tensorflow/core:core_cpu",
"@org_tensorflow//tensorflow/core:framework",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:protos_all_cc",
]) + if_android([
"@org_tensorflow//tensorflow/core:android_tensorflow_lib",
]),
copts=tf_copts(),
alwayslink=1,
visibility=visibility)
native.cc_library(name=name + "_internal",
srcs=internalsrcs,
hdrs=internalhdrs,
deps=deps + if_not_android([
"@org_tensorflow//tensorflow/core:core_cpu",
"@org_tensorflow//tensorflow/core:framework",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:protos_all_cc",
]) + if_android([
"@org_tensorflow//tensorflow/core:android_tensorflow_lib",
]),
copts=tf_copts(),
alwayslink=1,
visibility=["@org_tensorflow//tensorflow:internal"])
# Invoke this rule in .../tensorflow/python to build the wrapper library.
def tf_gen_op_wrapper_py(name, out=None, hidden=None, visibility=None, deps=[],
require_shape_functions=False, hidden_file=None,
generated_target_name=None):
# Construct a cc_binary containing the specified ops.
tool_name = "gen_" + name + "_py_wrappers_cc"
if not deps:
deps = ["@org_tensorflow//tensorflow/core:" + name + "_op_lib"]
native.cc_binary(
name = tool_name,
linkopts = ["-lm"],
copts = tf_copts(),
linkstatic = 1, # Faster to link this one-time-use binary dynamically
deps = (["@org_tensorflow//tensorflow/core:framework",
"@org_tensorflow//tensorflow/python:python_op_gen_main"] + deps),
visibility = ["@org_tensorflow//tensorflow:internal"],
)
# Invoke the previous cc_binary to generate a python file.
if not out:
out = "ops/gen_" + name + ".py"
if hidden:
# `hidden` is a list of op names to be hidden in the generated module.
native.genrule(
name=name + "_pygenrule",
outs=[out],
tools=[tool_name],
cmd=("$(location " + tool_name + ") " + ",".join(hidden)
+ " " + ("1" if require_shape_functions else "0") + " > $@"))
elif hidden_file:
# `hidden_file` is file containing a list of op names to be hidden in the
# generated module.
native.genrule(
name=name + "_pygenrule",
outs=[out],
srcs=[hidden_file],
tools=[tool_name],
cmd=("$(location " + tool_name + ") @$(location "
+ hidden_file + ") " + ("1" if require_shape_functions else "0")
+ " > $@"))
else:
# No ops should be hidden in the generated module.
native.genrule(
name=name + "_pygenrule",
outs=[out],
tools=[tool_name],
cmd=("$(location " + tool_name + ") "
+ ("1" if require_shape_functions else "0") + " > $@"))
# Make a py_library out of the generated python file.
if not generated_target_name:
generated_target_name = name
native.py_library(name=generated_target_name,
srcs=[out],
srcs_version="PY2AND3",
visibility=visibility,
deps=[
"@org_tensorflow//tensorflow/python:framework_for_generated_wrappers",
],)
# Define a bazel macro that creates cc_test for tensorflow.
# TODO(googleuser): we need to enable this to work around the hidden symbol
# __cudaRegisterFatBinary error. Needs more investigation.
def tf_cc_test(name, srcs, deps, linkstatic=0, tags=[], data=[], size="medium",
suffix="", args=None, linkopts=[]):
native.cc_test(name="%s%s" % (name, suffix),
srcs=srcs,
size=size,
args=args,
copts=tf_copts(),
data=data,
deps=deps,
linkopts=["-lpthread", "-lm"] + linkopts,
linkstatic=linkstatic,
tags=tags)
# Part of the testing process requires a distinguishable name for the build
# rules that involve a GPU, even if otherwise identical to the base rule.
def tf_cc_test_gpu(name, srcs, deps, linkstatic=0, tags=[], data=[],
size="medium", suffix="", args=None):
tf_cc_test(name, srcs, deps, linkstatic=linkstatic, tags=tags, data=data,
size=size, suffix=suffix, args=args)
def tf_cuda_cc_test(name, srcs=[], deps=[], tags=[], data=[], size="medium",
linkstatic=0, args=[], linkopts=[]):
tf_cc_test(name=name,
srcs=srcs,
deps=deps,
tags=tags + ["manual"],
data=data,
size=size,
linkstatic=linkstatic,
linkopts=linkopts,
args=args)
tf_cc_test(name=name,
srcs=srcs,
suffix="_gpu",
deps=deps + if_cuda(["@org_tensorflow//tensorflow/core:gpu_runtime"]),
linkstatic=if_cuda(1, 0),
tags=tags + tf_cuda_tests_tags(),
data=data,
size=size,
linkopts=linkopts,
args=args)
# Create a cc_test for each of the tensorflow tests listed in "tests"
def tf_cc_tests(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
args=None, linkopts=[]):
for src in srcs:
tf_cc_test(
name=src_to_test_name(src),
srcs=[src],
deps=deps,
linkstatic=linkstatic,
tags=tags,
size=size,
args=args,
linkopts=linkopts)
def tf_cc_tests_gpu(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
args=None):
  tf_cc_tests(srcs, deps, linkstatic=linkstatic, tags=tags, size=size,
              args=args)
def tf_cuda_cc_tests(srcs, deps, name='', tags=[], size="medium", linkstatic=0,
args=None, linkopts=[]):
for src in srcs:
tf_cuda_cc_test(
name=src_to_test_name(src),
srcs=[src],
deps=deps,
tags=tags,
size=size,
linkstatic=linkstatic,
args=args,
linkopts=linkopts)
def _cuda_copts():
"""Gets the appropriate set of copts for (maybe) CUDA compilation.
If we're doing CUDA compilation, returns copts for our particular CUDA
compiler. If we're not doing CUDA compilation, returns an empty list.
"""
return cuda_default_copts() + select({
"//conditions:default": [],
"@local_config_cuda//cuda:using_nvcc": (
[
"-nvcc_options=relaxed-constexpr",
"-nvcc_options=ftz=true",
]
),
"@local_config_cuda//cuda:using_clang": (
[
"-fcuda-flush-denormals-to-zero",
]
),
})
# Build defs for TensorFlow kernels
# When this target is built using --config=cuda, a cc_library is built
# that passes -DGOOGLE_CUDA=1 and '-x cuda', linking in additional
# libraries needed by GPU kernels.
def tf_gpu_kernel_library(srcs, copts=[], cuda_copts=[], deps=[], hdrs=[],
**kwargs):
copts = copts + _cuda_copts() + if_cuda(cuda_copts) + tf_copts()
native.cc_library(
srcs = srcs,
hdrs = hdrs,
copts = copts,
deps = deps + if_cuda([
"@org_tensorflow//tensorflow/core:cuda",
"@org_tensorflow//tensorflow/core:gpu_lib",
]),
alwayslink=1,
**kwargs)
def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs):
"""Generate a cc_library with a conditional set of CUDA dependencies.
When the library is built with --config=cuda:
- both deps and cuda_deps are used as dependencies
- the cuda runtime is added as a dependency (if necessary)
- The library additionally passes -DGOOGLE_CUDA=1 to the list of copts
Args:
- cuda_deps: BUILD dependencies which will be linked if and only if:
'--config=cuda' is passed to the bazel command line.
- deps: dependencies which will always be linked.
- copts: copts always passed to the cc_library.
- kwargs: Any other argument to cc_library.
"""
if not deps:
deps = []
if not cuda_deps:
cuda_deps = []
if not copts:
copts = []
native.cc_library(
deps = deps + if_cuda(cuda_deps + [
"@org_tensorflow//tensorflow/core:cuda",
"@local_config_cuda//cuda:cuda_headers"
]),
copts = copts + if_cuda(["-DGOOGLE_CUDA=1"]),
**kwargs)
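# For illustration only (hypothetical target names), a library whose CUDA
# dependencies are linked solely under --config=cuda might be declared as:
#
#   tf_cuda_library(
#       name = "my_kernel_lib",
#       srcs = ["my_kernel.cc"],
#       deps = ["@org_tensorflow//tensorflow/core:framework"],
#       cuda_deps = ["@org_tensorflow//tensorflow/core:gpu_lib"],
#   )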
def tf_kernel_library(name, prefix=None, srcs=None, gpu_srcs=None, hdrs=None,
deps=None, alwayslink=1, copts=tf_copts(), **kwargs):
"""A rule to build a TensorFlow OpKernel.
May either specify srcs/hdrs or prefix. Similar to tf_cuda_library,
but with alwayslink=1 by default. If prefix is specified:
* prefix*.cc (except *.cu.cc) is added to srcs
* prefix*.h (except *.cu.h) is added to hdrs
* prefix*.cu.cc and prefix*.h (including *.cu.h) are added to gpu_srcs.
  In all cases, test files are excluded.
For example, with prefix = "cast_op",
* srcs = ["cast_op.cc"]
* hdrs = ["cast_op.h"]
* gpu_srcs = ["cast_op_gpu.cu.cc", "cast_op.h"]
* "cast_op_test.cc" is excluded
With prefix = "cwise_op"
* srcs = ["cwise_op_abs.cc", ..., "cwise_op_tanh.cc"],
* hdrs = ["cwise_ops.h", "cwise_ops_common.h"],
* gpu_srcs = ["cwise_op_gpu_abs.cu.cc", ..., "cwise_op_gpu_tanh.cu.cc",
"cwise_ops.h", "cwise_ops_common.h",
"cwise_ops_gpu_common.cu.h"]
* "cwise_ops_test.cc" is excluded
"""
if not srcs:
srcs = []
if not hdrs:
hdrs = []
if not deps:
deps = []
if prefix:
if native.glob([prefix + "*.cu.cc"], exclude = ["*test*"]):
if not gpu_srcs:
gpu_srcs = []
gpu_srcs = gpu_srcs + native.glob([prefix + "*.cu.cc", prefix + "*.h"],
exclude = ["*test*"])
srcs = srcs + native.glob([prefix + "*.cc"],
exclude = ["*test*", "*.cu.cc"])
hdrs = hdrs + native.glob([prefix + "*.h"], exclude = ["*test*", "*.cu.h"])
cuda_deps = ["@org_tensorflow//tensorflow/core:gpu_lib"]
if gpu_srcs:
for gpu_src in gpu_srcs:
if gpu_src.endswith(".cc") and not gpu_src.endswith(".cu.cc"):
fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc".format(gpu_src))
tf_gpu_kernel_library(
name = name + "_gpu",
srcs = gpu_srcs,
deps = deps,
**kwargs)
cuda_deps.extend([":" + name + "_gpu"])
tf_cuda_library(
name = name,
srcs = srcs,
hdrs = hdrs,
copts = copts,
cuda_deps = cuda_deps,
linkstatic = 1, # Needed since alwayslink is broken in bazel b/27630669
alwayslink = alwayslink,
deps = deps,
**kwargs)
# Bazel rules for building swig files.
def _py_wrap_cc_impl(ctx):
srcs = ctx.files.srcs
if len(srcs) != 1:
fail("Exactly one SWIG source file label must be specified.", "srcs")
module_name = ctx.attr.module_name
src = ctx.files.srcs[0]
inputs = depset([src])
inputs += ctx.files.swig_includes
for dep in ctx.attr.deps:
inputs += dep.cc.transitive_headers
inputs += ctx.files._swiglib
inputs += ctx.files.toolchain_deps
swig_include_dirs = depset(_get_repository_roots(ctx, inputs))
swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib])
args = ["-c++",
"-python",
"-module", module_name,
"-o", ctx.outputs.cc_out.path,
"-outdir", ctx.outputs.py_out.dirname]
args += ["-l" + f.path for f in ctx.files.swig_includes]
args += ["-I" + i for i in swig_include_dirs]
args += [src.path]
outputs = [ctx.outputs.cc_out,
ctx.outputs.py_out]
ctx.action(executable=ctx.executable._swig,
arguments=args,
inputs=list(inputs),
outputs=outputs,
mnemonic="PythonSwig",
progress_message="SWIGing " + src.path)
return struct(files=depset(outputs))
_py_wrap_cc = rule(
attrs = {
"srcs": attr.label_list(
mandatory = True,
allow_files = True,
),
"swig_includes": attr.label_list(
cfg = "data",
allow_files = True,
),
"deps": attr.label_list(
allow_files = True,
providers = ["cc"],
),
"toolchain_deps": attr.label_list(
allow_files = True,
),
"module_name": attr.string(mandatory = True),
"py_module_name": attr.string(mandatory = True),
"_swig": attr.label(
default = Label("@swig//:swig"),
executable = True,
cfg = "host",
),
"_swiglib": attr.label(
default = Label("@swig//:templates"),
allow_files = True,
),
},
outputs = {
"cc_out": "%{module_name}.cc",
"py_out": "%{py_module_name}.py",
},
implementation = _py_wrap_cc_impl,
)
def _get_repository_roots(ctx, files):
"""Returns abnormal root directories under which files reside.
When running a ctx.action, source files within the main repository are all
relative to the current directory; however, files that are generated or exist
in remote repositories will have their root directory be a subdirectory,
e.g. bazel-out/local-fastbuild/genfiles/external/jpeg_archive. This function
returns the set of these devious directories, ranked and sorted by popularity
in order to hopefully minimize the number of I/O system calls within the
compiler, because includes have quadratic complexity.
"""
result = {}
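  # Popularity counts are stored negated so that the sort at the end ranks
  # the most frequently seen roots first.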
for f in files:
root = f.root.path
if root:
if root not in result:
result[root] = 0
result[root] -= 1
work = f.owner.workspace_root
if work:
if root:
root += "/"
root += work
if root:
if root not in result:
result[root] = 0
result[root] -= 1
return [k for v, k in sorted([(v, k) for k, v in result.items()])]
# Bazel rule for collecting the header files that a target depends on.
def _transitive_hdrs_impl(ctx):
outputs = depset()
for dep in ctx.attr.deps:
outputs += dep.cc.transitive_headers
return struct(files=outputs)
_transitive_hdrs = rule(
attrs = {
"deps": attr.label_list(
allow_files = True,
providers = ["cc"],
),
},
implementation = _transitive_hdrs_impl,
)
def transitive_hdrs(name, deps=[], **kwargs):
_transitive_hdrs(name=name + "_gather",
deps=deps)
native.filegroup(name=name,
srcs=[":" + name + "_gather"])
# Create a header only library that includes all the headers exported by
# the libraries in deps.
def cc_header_only_library(name, deps=[], **kwargs):
_transitive_hdrs(name=name + "_gather",
deps=deps)
native.cc_library(name=name,
hdrs=[":" + name + "_gather"],
**kwargs)
def tf_custom_op_library_additional_deps():
return [
"@protobuf_archive//:protobuf",
"//third_party/eigen3",
"@org_tensorflow//tensorflow/core:framework_headers_lib",
]
# Traverse the dependency graph along the "deps" attribute of the
# target and return a struct with one field called 'tf_collected_deps'.
# tf_collected_deps will be the union of the deps of the current target
# and the tf_collected_deps of the dependencies of this target.
def _collect_deps_aspect_impl(target, ctx):
alldeps = depset()
if hasattr(ctx.rule.attr, "deps"):
for dep in ctx.rule.attr.deps:
alldeps = alldeps | depset([dep.label])
if hasattr(dep, "tf_collected_deps"):
alldeps = alldeps | dep.tf_collected_deps
return struct(tf_collected_deps=alldeps)
collect_deps_aspect = aspect(
implementation=_collect_deps_aspect_impl,
attr_aspects=["deps"])
def _dep_label(dep):
label = dep.label
return label.package + ":" + label.name
# This rule checks that the transitive dependencies of targets listed
# in the 'deps' attribute don't depend on the targets listed in
# the 'disallowed_deps' attribute.
def _check_deps_impl(ctx):
disallowed_deps = ctx.attr.disallowed_deps
for input_dep in ctx.attr.deps:
if not hasattr(input_dep, "tf_collected_deps"):
continue
for dep in input_dep.tf_collected_deps:
for disallowed_dep in disallowed_deps:
if dep == disallowed_dep.label:
fail(_dep_label(input_dep) + " cannot depend on " +
_dep_label(disallowed_dep))
return struct()
check_deps = rule(
_check_deps_impl,
attrs = {
"deps": attr.label_list(
aspects=[collect_deps_aspect],
mandatory = True,
allow_files = True
),
"disallowed_deps": attr.label_list(
mandatory = True,
allow_files = True
)},
)
# Helper to build a dynamic library (.so) from the sources containing
# implementations of custom ops and kernels.
def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
cuda_deps = [
"@org_tensorflow//tensorflow/core:stream_executor_headers_lib",
"@local_config_cuda//cuda:cudart_static",
]
deps = deps + tf_custom_op_library_additional_deps()
if gpu_srcs:
basename = name.split(".")[0]
native.cc_library(
name = basename + "_gpu",
srcs = gpu_srcs,
copts = _cuda_copts(),
deps = deps + if_cuda(cuda_deps))
cuda_deps.extend([":" + basename + "_gpu"])
check_deps(name=name+"_check_deps",
deps=deps + if_cuda(cuda_deps),
disallowed_deps=["@org_tensorflow//tensorflow/core:framework",
"@org_tensorflow//tensorflow/core:lib"])
native.cc_binary(name=name,
srcs=srcs,
deps=deps + if_cuda(cuda_deps),
data=[name + "_check_deps"],
copts=tf_copts(),
linkshared=1,
linkopts = select({
"//conditions:default": [
"-lm",
],
"@org_tensorflow//tensorflow:darwin": [],
}),
)
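# For illustration only (hypothetical file names), a custom op shared library
# could be built as:
#
#   tf_custom_op_library(
#       name = "my_custom_op.so",
#       srcs = ["my_custom_op.cc"],
#       gpu_srcs = ["my_custom_op.cu.cc"],
#   )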
def tf_extension_linkopts():
return [] # No extension link opts
def tf_extension_copts():
return [] # No extension c opts
def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
module_name = name.split("/")[-1]
# Convert a rule name such as foo/bar/baz to foo/bar/_baz.so
# and use that as the name for the rule producing the .so file.
cc_library_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".so"])
cc_library_pyd_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".pyd"])
extra_deps = []
_py_wrap_cc(name=name + "_py_wrap",
srcs=srcs,
swig_includes=swig_includes,
deps=deps + extra_deps,
toolchain_deps=["//tools/defaults:crosstool"],
module_name=module_name,
py_module_name=name)
extra_linkopts = select({
"@local_config_cuda//cuda:darwin": [
"-Wl,-exported_symbols_list",
"@org_tensorflow//tensorflow:tf_exported_symbols.lds"
],
"@org_tensorflow//tensorflow:windows": [
],
"//conditions:default": [
"-Wl,--version-script",
"@org_tensorflow//tensorflow:tf_version_script.lds"
]})
extra_deps += select({
"@local_config_cuda//cuda:darwin": [
"@org_tensorflow//tensorflow:tf_exported_symbols.lds"
],
"@org_tensorflow//tensorflow:windows": [
],
"//conditions:default": [
"@org_tensorflow//tensorflow:tf_version_script.lds"
]
})
native.cc_binary(
name=cc_library_name,
srcs=[module_name + ".cc"],
copts=(copts + ["-Wno-self-assign",
"-Wno-sign-compare",
"-Wno-write-strings"]
+ tf_extension_copts()),
linkopts=tf_extension_linkopts() + extra_linkopts,
linkstatic=1,
linkshared=1,
deps=deps + extra_deps)
native.genrule(
name = "gen_" + cc_library_pyd_name,
srcs = [":" + cc_library_name],
outs = [cc_library_pyd_name],
cmd = "cp $< $@",
)
native.py_library(name=name,
srcs=[":" + name + ".py"],
srcs_version="PY2AND3",
data=select({
"@org_tensorflow//tensorflow:windows": [":" + cc_library_pyd_name],
"//conditions:default": [":" + cc_library_name],
}))
def py_test(deps=[], **kwargs):
native.py_test(
deps=select({
"//conditions:default" : deps,
"@org_tensorflow//tensorflow:no_tensorflow_py_deps" : []
}),
**kwargs)
def tf_py_test(name, srcs, size="medium", data=[], main=None, args=[],
tags=[], shard_count=1, additional_deps=[], flaky=0):
native.py_test(
name=name,
size=size,
srcs=srcs,
main=main,
args=args,
tags=tags,
visibility=["@org_tensorflow//tensorflow:internal"],
shard_count=shard_count,
data=data,
deps=select({
"//conditions:default" : [
"@org_tensorflow//tensorflow/python:extra_py_tests_deps",
"@org_tensorflow//tensorflow/python:gradient_checker",
] + additional_deps,
"@org_tensorflow//tensorflow:no_tensorflow_py_deps" : []
}),
flaky=flaky,
srcs_version="PY2AND3")
def cuda_py_test(name, srcs, size="medium", data=[], main=None, args=[],
shard_count=1, additional_deps=[], tags=[], flaky=0):
test_tags = tags + tf_cuda_tests_tags()
tf_py_test(name=name,
size=size,
srcs=srcs,
data=data,
main=main,
args=args,
tags=test_tags,
shard_count=shard_count,
additional_deps=additional_deps,
flaky=flaky)
def sycl_py_test(name, srcs, size="medium", data=[], main=None, args=[],
shard_count=1, additional_deps=[], tags=[], flaky=0):
test_tags = tags + tf_sycl_tests_tags()
tf_py_test(name=name,
size=size,
srcs=srcs,
data=data,
main=main,
args=args,
tags=test_tags,
shard_count=shard_count,
additional_deps=additional_deps,
flaky=flaky)
def py_tests(name,
srcs,
size="medium",
additional_deps=[],
data=[],
tags=[],
shard_count=1,
prefix=""):
for src in srcs:
test_name = src.split("/")[-1].split(".")[0]
if prefix:
test_name = "%s_%s" % (prefix, test_name)
tf_py_test(name=test_name,
size=size,
srcs=[src],
main=src,
tags=tags,
shard_count=shard_count,
data=data,
additional_deps=additional_deps)
def cuda_py_tests(name, srcs, size="medium", additional_deps=[], data=[],
shard_count=1, tags=[], prefix=""):
test_tags = tags + tf_cuda_tests_tags()
py_tests(name=name, size=size, srcs=srcs, additional_deps=additional_deps,
data=data, tags=test_tags, shard_count=shard_count,prefix=prefix)
# Creates a genrule named <name> for running tools/proto_text's generator to
# make the proto_text functions, for the protos passed in <srcs>.
#
# Return a struct with fields (hdrs, srcs) containing the names of the
# generated files.
def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs):
out_hdrs = ([p.replace(".proto", ".pb_text.h") for p in srcs] +
[p.replace(".proto", ".pb_text-impl.h") for p in srcs])
out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs]
native.genrule(
name = name,
srcs = srcs + ["@org_tensorflow//tensorflow/tools/proto_text:placeholder.txt"],
outs = out_hdrs + out_srcs,
cmd = "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) " +
"$(@D) " + srcs_relative_dir + " $(SRCS)",
tools = ["@org_tensorflow//tensorflow/tools/proto_text:gen_proto_text_functions"],
)
return struct(hdrs=out_hdrs, srcs=out_srcs)
def tf_genrule_cmd_append_to_srcs(to_append):
return ("cat $(SRCS) > $(@) && " +
"echo >> $(@) && " +
"echo " + to_append + " >> $(@)")
def tf_version_info_genrule():
native.genrule(
name = "version_info_gen",
srcs = [
"@org_tensorflow//tensorflow/tools/git:gen/spec.json",
"@org_tensorflow//tensorflow/tools/git:gen/head",
"@org_tensorflow//tensorflow/tools/git:gen/branch_ref",
],
outs = ["util/version_info.cc"],
cmd = "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"",
local = 1,
tools = ["@org_tensorflow//tensorflow/tools/git:gen_git_source.py"],
)
def cc_library_with_android_deps(deps, android_deps=[],
common_deps=[], **kwargs):
deps = if_not_android(deps) + if_android(android_deps) + common_deps
native.cc_library(deps=deps, **kwargs)
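# For illustration only (hypothetical targets), selecting platform-specific
# dependencies:
#
#   cc_library_with_android_deps(
#       name = "runtime",
#       srcs = ["runtime.cc"],
#       deps = ["//base:desktop_impl"],
#       android_deps = ["//base:android_impl"],
#       common_deps = ["//base:common"],
#   )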
......@@ -9,9 +9,10 @@ py_binary(
name = "conll_checkpoint_converter",
srcs = ["conll_checkpoint_converter.py"],
deps = [
"//dragnn/protos:spec_py_pb2",
"//dragnn/protos:spec_pb2_py",
"//dragnn/python:dragnn_model_saver_lib",
"//dragnn/python:spec_builder",
"@absl_py//absl/flags",
"@org_tensorflow//tensorflow:tensorflow_py",
"@org_tensorflow//tensorflow/core:protos_all_py",
],
......@@ -28,6 +29,7 @@ py_binary(
":components",
"//dragnn/python:evaluation",
"//dragnn/python:spec_builder",
"@absl_py//absl/flags",
],
)
......@@ -43,6 +45,7 @@ py_binary(
"//dragnn/python:dragnn_ops",
"//dragnn/python:evaluation",
"//dragnn/python:spec_builder",
"@absl_py//absl/flags",
],
)
......@@ -58,6 +61,7 @@ py_binary(
"//dragnn/python:dragnn_ops",
"//dragnn/python:evaluation",
"//dragnn/python:spec_builder",
"@absl_py//absl/flags",
],
)
......@@ -73,6 +77,7 @@ py_binary(
"//dragnn/python:dragnn_ops",
"//dragnn/python:evaluation",
"//dragnn/python:spec_builder",
"@absl_py//absl/flags",
],
)
......@@ -86,7 +91,8 @@ py_binary(
"//dragnn/python:lexicon",
"//dragnn/python:spec_builder",
"//dragnn/python:trainer_lib",
"//syntaxnet:task_spec_py_pb2",
"//syntaxnet:task_spec_pb2_py",
"@absl_py//absl/flags",
],
)
......@@ -100,7 +106,9 @@ py_binary(
"//dragnn/python:lexicon",
"//dragnn/python:spec_builder",
"//dragnn/python:trainer_lib",
"//syntaxnet:task_spec_py_pb2",
"//syntaxnet:task_spec_pb2_py",
"@absl_py//absl:app",
"@absl_py//absl/flags",
],
)
......@@ -110,13 +118,14 @@ py_binary(
deps = [
"//dragnn/core:dragnn_bulk_ops",
"//dragnn/core:dragnn_ops",
"//dragnn/protos:spec_py_pb2",
"//dragnn/protos:spec_pb2_py",
"//dragnn/python:evaluation",
"//dragnn/python:graph_builder",
"//dragnn/python:sentence_io",
"//dragnn/python:spec_builder",
"//dragnn/python:trainer_lib",
"//syntaxnet:parser_ops",
"@absl_py//absl/flags",
"@org_tensorflow//tensorflow:tensorflow_py",
"@org_tensorflow//tensorflow/core:protos_all_py",
],
......@@ -128,7 +137,7 @@ py_binary(
deps = [
"//dragnn/core:dragnn_bulk_ops",
"//dragnn/core:dragnn_ops",
"//dragnn/protos:spec_py_pb2",
"//dragnn/protos:spec_pb2_py",
"//dragnn/python:dragnn_ops",
"//dragnn/python:evaluation",
"//dragnn/python:graph_builder",
......@@ -136,9 +145,11 @@ py_binary(
"//dragnn/python:spec_builder",
"//dragnn/python:trainer_lib",
"//syntaxnet:parser_ops",
"//syntaxnet:sentence_py_pb2",
"//syntaxnet:task_spec_py_pb2",
"//syntaxnet:sentence_pb2_py",
"//syntaxnet:task_spec_pb2_py",
"//syntaxnet/util:check",
"@absl_py//absl:app",
"@absl_py//absl/flags",
"@org_tensorflow//tensorflow:tensorflow_py",
"@org_tensorflow//tensorflow/core:protos_all_py",
],
......@@ -190,11 +201,11 @@ py_library(
deps = [
"//dragnn/core:dragnn_bulk_ops",
"//dragnn/core:dragnn_ops",
"//dragnn/protos:spec_py_pb2",
"//dragnn/protos:spec_pb2_py",
"//dragnn/python:graph_builder",
"//dragnn/python:sentence_io",
"//syntaxnet:parser_ops",
"//syntaxnet:sentence_py_pb2",
"//syntaxnet:sentence_pb2_py",
"@org_tensorflow//tensorflow:tensorflow_py",
"@org_tensorflow//tensorflow/core:protos_all_py",
],
......@@ -215,6 +226,6 @@ py_library(
"//dragnn/python:spec_builder",
"//dragnn/python:trainer_lib",
"//dragnn/python:visualization",
"//syntaxnet:task_spec_py_pb2",
"//syntaxnet:task_spec_pb2_py",
],
)
......@@ -25,6 +25,7 @@ from __future__ import division
from __future__ import print_function
import os
from absl import flags
import tensorflow as tf
from google.protobuf import text_format
......@@ -32,7 +33,6 @@ from dragnn.protos import spec_pb2
from dragnn.python import dragnn_model_saver_lib as saver_lib
from dragnn.python import spec_builder
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('master_spec', None, 'Path to task context with '
......