Commit ea3fa4a3 authored by Ivan Bogatyy

Update DRAGNN, fix some macOS issues

parent b7523ee5
. PUNCT
CC CONJ
DT DET
NN NOUN
NNS NOUN
PRP PRON
VBP VERB
10
. 5
books 4
They 3
I 2
buy 2
have 2
no 2
sell 2
and 1
clue 1
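The two lexicon files above follow SyntaxNet's plain-text map formats: the tag map lists `treebank-tag universal-category` pairs, and the term-frequency map starts with the number of entries (10 here) followed by `term count` lines in descending-count order. A minimal reader for the frequency format, as an illustrative sketch (the function name is ours, not the repo's):

def read_term_frequency_map(path):
    """Parse a term-frequency map: a count line, then 'term count' lines."""
    with open(path) as f:
        num_terms = int(f.readline())
        term_counts = {}
        for _ in range(num_terms):
            term, count = f.readline().rsplit(None, 1)
            term_counts[term] = int(count)
    return term_counts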
target {
name: "joint"
component_weights: [0, # lengths
0, # bilstm
1, # tagger
0, # heads
0, # modifiers
0, # digraph
1, # parser
0, # parsed_head_tokens
0, # parsed_heads
0, # parsed_modifiers
0, # labels
1] # labeler
unroll_using_oracle: [true, true, true, true, true, true,
true, true, true, true, true, true]
}
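The `target` block above is a DRAGNN training target in protobuf text format: `component_weights` selects which components of the joint pipeline contribute to this target's loss (here the tagger, parser, and labeler, each with weight 1), and `unroll_using_oracle` makes every component unroll against the gold oracle during training. A sketch of how such a spec could be consumed, assuming DRAGNN's `spec_pb2.TrainTarget` message (the import path and file name are illustrative assumptions):

from google.protobuf import text_format
from dragnn.protos import spec_pb2  # assumed location of the DRAGNN spec protos

target = spec_pb2.TrainTarget()
with open('joint_target.pbtxt') as f:  # hypothetical file holding the block above
    text_format.Parse(f.read(), target)

# Only components with a non-zero weight contribute to the training loss.
trained = [i for i, w in enumerate(target.component_weights) if w > 0]
print(target.name, trained)  # -> joint [2, 6, 11]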
1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj _ _
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 ROOT _ _
3 books book NOUN NNS Number=Plur 2 obj _ SpaceAfter=No
4 . . PUNCT . _ 2 punct _ _

1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj _ _
2 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 0 ROOT _ _
3 books book NOUN NNS Number=Plur 2 obj _ SpaceAfter=No
4 . . PUNCT . _ 2 punct _ _

1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj _ _
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 ROOT _ _
3 and and CONJ CC _ 4 cc _ _
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj _ _
5 books book NOUN NNS Number=Plur 2 obj _ SpaceAfter=No
6 . . PUNCT . _ 2 punct _ _

1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _
2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 ROOT _ _
3 no no DET DT PronType=Neg 4 det _ _
4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No
5 . . PUNCT . _ 2 punct _ _

1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _
2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 ROOT _ _
3 no no DET DT PronType=Neg 4 det _ _
4 books book NOUN NNS Number=Plur 2 obj _ SpaceAfter=No
5 . . PUNCT . _ 2 punct _ _
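The five sentences above are the toy training corpus in 10-column CoNLL format: ID, form, lemma, coarse (universal) POS, fine (treebank) POS, morphological features, head index, dependency relation, and two unused columns, with blank lines separating sentences. An illustrative reader (not part of the repo) that recovers the fields used here:

def read_conll(path):
    """Group tab-separated CoNLL rows into sentences at blank lines."""
    sentences, current = [], []
    with open(path) as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:  # blank line ends the current sentence
                if current:
                    sentences.append(current)
                current = []
                continue
            cols = line.split('\t')
            current.append({'id': int(cols[0]), 'form': cols[1], 'lemma': cols[2],
                            'cpos': cols[3], 'pos': cols[4],
                            'head': int(cols[6]), 'rel': cols[7]})
    if current:
        sentences.append(current)
    return sentences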
#!/bin/bash
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This file builds all of the Javascript into a minified "hermetic" bundle.js
# file, which is written out into the same directory as this script.
#
......
#!/bin/bash
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This file starts up a development server, using webpack in development mode.
# It takes no arguments. See README.md for more information.
......
......@@ -178,9 +178,9 @@ cc_library(
srcs = ["char_ngram_string_extractor.cc"],
hdrs = ["char_ngram_string_extractor.h"],
deps = [
":base",
":segmenter_utils",
":task_context",
"@org_tensorflow//tensorflow/core:lib",
],
)
......@@ -365,7 +365,6 @@ cc_library(
":utils",
":whole_sentence_features",
":workspace",
"@org_tensorflow//tensorflow/core:lib",
],
alwayslink = 1,
)
......@@ -390,6 +389,7 @@ cc_library(
srcs = ["embedding_feature_extractor.cc"],
hdrs = ["embedding_feature_extractor.h"],
deps = [
":base",
":feature_extractor",
":parser_transitions",
":sentence_features",
......@@ -397,7 +397,6 @@ cc_library(
":task_context",
":utils",
":workspace",
"@org_tensorflow//tensorflow/core:lib",
],
)
......@@ -455,6 +454,7 @@ cc_library(
srcs = ["lexicon_builder.cc"],
deps = [
":affix",
":base",
":char_ngram_string_extractor",
":feature_extractor",
":parser_transitions",
......@@ -464,8 +464,6 @@ cc_library(
":term_frequency_map",
":text_formats",
":utils",
"@org_tensorflow//tensorflow/core:framework",
"@org_tensorflow//tensorflow/core:lib",
],
alwayslink = 1,
)
......@@ -484,11 +482,11 @@ cc_library(
name = "parser_ops_cc",
srcs = ["ops/parser_ops.cc"],
deps = [
":base",
":document_filters",
":lexicon_builder",
":reader_ops",
":unpack_sparse_features",
"@org_tensorflow//tensorflow/core:framework",
],
alwayslink = 1,
)
......
......@@ -478,11 +478,15 @@ class GreedyParser(object):
"""Embeddings at the given index will be set to pretrained values."""
def _Initializer(shape, dtype=tf.float32, partition_info=None):
"""Variable initializer that loads pretrained embeddings."""
unused_dtype = dtype
+ seed1, seed2 = tf.get_seed(self._seed)
t = gen_parser_ops.word_embedding_initializer(
    vectors=embeddings_path,
    task_context=task_context,
-   embedding_init=self._embedding_init)
+   embedding_init=self._embedding_init,
+   seed=seed1,
+   seed2=seed2)
t.set_shape(shape)
return t
......
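The graph_builder change above is the Python half of making embedding initialization reproducible: `tf.get_seed` folds the graph-level seed and the model's own `self._seed` into the standard TensorFlow `(seed, seed2)` pair, which is then passed down to the kernel. The same pattern in isolation, as a small TF 1.x sketch:

import tensorflow as tf

tf.set_random_seed(42)            # graph-level seed, set once per graph
seed1, seed2 = tf.get_seed(1234)  # fold in an op-level seed
# The pair is deterministic for a fixed graph seed and op seed, so any
# kernel seeded with (seed1, seed2) yields repeatable values.
print(seed1, seed2)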
......@@ -249,6 +249,8 @@ REGISTER_OP("WordEmbeddingInitializer")
.Attr("vectors: string")
.Attr("task_context: string")
.Attr("embedding_init: float = 1.0")
.Attr("seed: int = 0")
.Attr("seed2: int = 0")
.Doc(R"doc(
Reads word embeddings from an sstable of dist_belief.TokenEmbedding protos for
every word specified in a text vocabulary file.
......@@ -256,6 +258,13 @@ every word specified in a text vocabulary file.
word_embeddings: a tensor containing word embeddings from the specified sstable.
vectors: path to recordio of word embedding vectors.
task_context: file path at which to read the task context.
+ embedding_init: embedding vectors that are not found in the input sstable are
+   initialized randomly from a normal distribution with zero mean and
+   std dev = embedding_init / sqrt(embedding_size).
+ seed: If either `seed` or `seed2` are set to be non-zero, the random number
+   generator is seeded by the given seed. Otherwise, it is seeded by a random
+   seed.
+ seed2: A second seed to avoid seed collision.
)doc");
REGISTER_OP("DocumentSource")
......
......@@ -450,6 +450,13 @@ class WordEmbeddingInitializer : public OpKernel {
OP_REQUIRES_OK(context,
context->GetAttr("embedding_init", &embedding_init_));
+ // Convert the seeds into a single 64-bit seed. NB: seed=0,seed2=0 converts
+ // into seed_=0, which causes Eigen PRNGs to seed non-deterministically.
+ int seed, seed2;
+ OP_REQUIRES_OK(context, context->GetAttr("seed", &seed));
+ OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2));
+ seed_ = static_cast<uint64>(seed) | static_cast<uint64>(seed2) << 32;
// Sets up number and type of inputs and outputs.
OP_REQUIRES_OK(context, context->MatchSignature({}, {DT_FLOAT}));
}
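The kernel combines the two 32-bit attrs into one 64-bit value because Eigen's PRNG takes a single seed; as the comment warns, both seeds being zero produces `seed_ == 0`, which Eigen treats as "seed non-deterministically". The packing is plain bit arithmetic, shown here in Python for clarity:

def pack_seed(seed, seed2):
    """Mirror the C++ packing: seed in the low 32 bits, seed2 in the high 32."""
    return (seed & 0xFFFFFFFF) | ((seed2 & 0xFFFFFFFF) << 32)

assert pack_seed(1, 0) == 1
assert pack_seed(0, 1) == 1 << 32
assert pack_seed(0, 0) == 0  # the non-deterministic case called out above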
......@@ -479,11 +486,10 @@ class WordEmbeddingInitializer : public OpKernel {
context, context->allocate_output(
0, TensorShape({word_map->Size() + 3, embedding_size}),
&embedding_matrix));
embedding_matrix->matrix<float>()
.setRandom<Eigen::internal::NormalRandomGenerator<float>>();
embedding_matrix->matrix<float>() =
embedding_matrix->matrix<float>() * static_cast<float>(
embedding_init_ / sqrt(embedding_size));
auto matrix = embedding_matrix->matrix<float>();
Eigen::internal::NormalRandomGenerator<float> prng(seed_);
matrix =
matrix.random(prng) * (embedding_init_ / sqrtf(embedding_size));
}
if (vocab.find(embedding.token()) != vocab.end()) {
SetNormalizedRow(embedding.vector(), vocab[embedding.token()],
......@@ -544,6 +550,9 @@ class WordEmbeddingInitializer : public OpKernel {
// Task context used to configure this op.
TaskContext task_context_;
+ // Seed for random initialization.
+ uint64 seed_ = 0;
// Embedding vectors that are not found in the input sstable are initialized
// randomly from a normal distribution with zero mean and
// std dev = embedding_init_ / sqrt(embedding_size).
......
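With the seeded generator in place, words missing from the pretrained vectors are drawn from a zero-mean normal whose standard deviation is `embedding_init / sqrt(embedding_size)`. A NumPy sketch of the same initialization (illustrative only; NumPy's generator is not bit-compatible with the Eigen one used by the kernel):

import numpy as np

def random_embeddings(num_words, embedding_size, embedding_init=1.0, seed=1):
    rng = np.random.RandomState(seed)  # seeded, hence repeatable
    scale = embedding_init / np.sqrt(embedding_size)  # per-entry std dev
    return rng.normal(0.0, scale, size=(num_words, embedding_size))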
......@@ -167,19 +167,19 @@ class ParsingReaderOpsTest(test_util.TensorFlowTestCase):
logging.info('Result: %s', res)
self.assertEqual(res[0], 2)
- def testWordEmbeddingInitializer(self):
-   def _TokenEmbedding(token, embedding):
-     e = dictionary_pb2.TokenEmbedding()
-     e.token = token
-     e.vector.values.extend(embedding)
-     return e.SerializeToString()
+ def _token_embedding(self, token, embedding):
+   e = dictionary_pb2.TokenEmbedding()
+   e.token = token
+   e.vector.values.extend(embedding)
+   return e.SerializeToString()
+
+ def testWordEmbeddingInitializer(self):
# Provide embeddings for the first three words in the word map.
- records_path = os.path.join(FLAGS.test_tmpdir, 'sstable-00000-of-00001')
+ records_path = os.path.join(FLAGS.test_tmpdir, 'records1')
writer = tf.python_io.TFRecordWriter(records_path)
- writer.write(_TokenEmbedding('.', [1, 2]))
- writer.write(_TokenEmbedding(',', [3, 4]))
- writer.write(_TokenEmbedding('the', [5, 6]))
+ writer.write(self._token_embedding('.', [1, 2]))
+ writer.write(self._token_embedding(',', [3, 4]))
+ writer.write(self._token_embedding('the', [5, 6]))
del writer
with self.test_session():
......@@ -192,6 +192,34 @@ class ParsingReaderOpsTest(test_util.TensorFlowTestCase):
[5. / (25 + 36) ** .5, 6. / (25 + 36) ** .5]]),
embeddings[:3,])
+ def testWordEmbeddingInitializerRepeatability(self):
+   records_path = os.path.join(FLAGS.test_tmpdir, 'records2')
+   writer = tf.python_io.TFRecordWriter(records_path)
+   writer.write(self._token_embedding('.', [1, 2, 3]))  # 3 dims
+   del writer
+   # As long as there is one non-zero seed, the result should be repeatable.
+   for seed1, seed2 in [(0, 1), (1, 0), (123, 456)]:
+     with tf.Graph().as_default(), self.test_session():
+       embeddings1 = gen_parser_ops.word_embedding_initializer(
+           vectors=records_path,
+           task_context=self._task_context,
+           seed=seed1,
+           seed2=seed2)
+       embeddings2 = gen_parser_ops.word_embedding_initializer(
+           vectors=records_path,
+           task_context=self._task_context,
+           seed=seed1,
+           seed2=seed2)
+       # The number of terms is based on the word map, which may change if the
+       # test corpus is updated. Just assert that there are some terms.
+       self.assertGreater(tf.shape(embeddings1)[0].eval(), 0)
+       self.assertGreater(tf.shape(embeddings2)[0].eval(), 0)
+       self.assertEqual(tf.shape(embeddings1)[1].eval(), 3)
+       self.assertEqual(tf.shape(embeddings2)[1].eval(), 3)
+       self.assertAllEqual(embeddings1.eval(), embeddings2.eval())
if __name__ == '__main__':
googletest.main()
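The expected values asserted in testWordEmbeddingInitializer above (e.g. `[5/√61, 6/√61]` for the input vector `[5, 6]`) encode the row normalization applied to pretrained embeddings: each vector is divided by its L2 norm before being written into the matrix. The same computation in NumPy:

import numpy as np

def normalized_row(vector):
    v = np.asarray(vector, dtype=np.float32)
    return v / np.linalg.norm(v)  # L2-normalize, matching the test's expected rows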
Subproject commit f7ed0682f67a9a767ee30ad62233847e8a8cbb95
Subproject commit a7d6015d3759bee447c8103979a5ebc831ce23d1