Commit 51238b1b authored by Livio Soares, committed by calberti

Updates to syntaxnet: update the tensorflow sub-module, bump the bazel requirement, and fix a trainer crash (#479)

* syntaxnet: Cosmetic fixes recommended by python lint.

* syntaxnet: Fix crash in parser_trainer due to an inconsistency between
  LexiconBuilder::Compute() and the context.pbtxt definition (the
  'char-map' input declaration was missing).

* syntaxnet: reduce flakiness in GraphBuilderTest.

* syntaxnet: Update tensorflow submodule to version > 0.10.

* syntaxnet: Update to latest stable bazel (0.3.1).

This update is needed in part to allow the TensorFlow submodule to build
successfully. In this commit, I also update and simplify the WORKSPACE
to avoid declaring dependencies already present in tensorflow.

* syntaxnet: Update bazel version check to require version 0.3.0

* syntaxnet: Document pip requirement, along with python mock module.
parent 2390974a
@@ -300,7 +300,7 @@ class Word : public TermFrequencyMapFeature {
   Word() : TermFrequencyMapFeature("word-map") {}

   FeatureValue ComputeValue(const Token &token) const override {
-    string form = token.word();
+    const string &form = token.word();
     return term_map().LookupIndex(form, UnknownValue());
   }
 };
......
@@ -71,7 +71,7 @@ class SharedStore {
    int refcount;

    SharedObject(void *o, std::function<void()> d)
-       : object(o), delete_callback(d), refcount(1) {}
+       : object(o), delete_callback(std::move(d)), refcount(1) {}
  };

  // A map from keys to shared objects.
......
@@ -24,9 +24,9 @@ from tensorflow.python.ops import tensor_array_ops
 from syntaxnet import graph_builder
 from syntaxnet.ops import gen_parser_ops

-tf.NoGradient('BeamParseReader')
-tf.NoGradient('BeamParser')
-tf.NoGradient('BeamParserOutput')
+tf.NotDifferentiable('BeamParseReader')
+tf.NotDifferentiable('BeamParser')
+tf.NotDifferentiable('BeamParserOutput')


 def AddCrossEntropy(batch_size, n):
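The hunk above tracks an API rename in the updated TensorFlow submodule: tf.NoGradient became tf.NotDifferentiable. A minimal standalone sketch of what the registration does, reusing one of the op names from this diff:

import tensorflow as tf

# Mark the op as intentionally non-differentiable: tf.gradients() will
# propagate None through it rather than fail with a LookupError about a
# missing gradient registration.
tf.NotDifferentiable('BeamParser')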
@@ -122,6 +122,7 @@ class StructuredGraphBuilder(graph_builder.GreedyParser):
         KeepGoing,
         Advance,
         [state, step, scores_array, alive, alive_steps] + list(features),
+        shape_invariants=[tf.TensorShape(None)] * (len(features) + 5),
         parallel_iterations=100)

     # Link to the final nodes/values of ops that have passed through While:
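The new shape_invariants argument is what lets this tf.while_loop carry loop variables whose shapes change between iterations: tf.TensorShape(None) declares "any shape" for each variable. A minimal standalone sketch of the mechanism, assuming the 0.x-era tf.concat(axis, values) signature contemporary with this diff:

import tensorflow as tf

i = tf.constant(0)
acc = tf.zeros([1])  # grows by one element per iteration

_, final = tf.while_loop(
    lambda i, acc: i < 5,
    lambda i, acc: (i + 1, tf.concat(0, [acc, tf.ones([1])])),
    [i, acc],
    # Without these invariants, while_loop rejects acc because its static
    # shape changes from one iteration to the next.
    shape_invariants=[i.get_shape(), tf.TensorShape(None)],
    parallel_iterations=100)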
......
@@ -19,8 +19,8 @@ load("@protobuf//:protobuf.bzl", "py_proto_library")
 def if_cuda(if_true, if_false = []):
   """Shorthand for select()'ing on whether we're building with CUDA."""
   return select({
-      "@org_tensorflow//third_party/gpus/cuda:using_nvcc": if_true,
-      "@org_tensorflow//third_party/gpus/cuda:using_gcudacc": if_true,
+      "@local_config_cuda//cuda:using_nvcc": if_true,
+      "@local_config_cuda//cuda:using_clang": if_true,
       "//conditions:default": if_false
   })
......
@@ -20,7 +20,8 @@ limitations under the License.
 #include <limits>

 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
+#include "tensorflow/core/lib/io/random_inputstream.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
@@ -61,9 +62,10 @@ void TermFrequencyMap::Load(const string &filename, int min_frequency,
   std::unique_ptr<tensorflow::RandomAccessFile> file;
   TF_CHECK_OK(tensorflow::Env::Default()->NewRandomAccessFile(filename, &file));
   static const int kInputBufferSize = 1 * 1024 * 1024; /* bytes */
-  tensorflow::io::InputBuffer input(file.get(), kInputBufferSize);
+  tensorflow::io::RandomAccessInputStream stream(file.get());
+  tensorflow::io::BufferedInputStream buffer(&stream, kInputBufferSize);
   string line;
-  TF_CHECK_OK(input.ReadLine(&line));
+  TF_CHECK_OK(buffer.ReadLine(&line));
   int32 total = -1;
   CHECK(utils::ParseInt32(line.c_str(), &total));
   CHECK_GE(total, 0);
@@ -71,7 +73,7 @@ void TermFrequencyMap::Load(const string &filename, int min_frequency,
   // Read the mapping.
   int64 last_frequency = -1;
   for (int i = 0; i < total && i < max_num_terms; ++i) {
-    TF_CHECK_OK(input.ReadLine(&line));
+    TF_CHECK_OK(buffer.ReadLine(&line));
     vector<string> elements = utils::Split(line, ' ');
     CHECK_EQ(2, elements.size());
     CHECK(!elements[0].empty());
@@ -143,9 +145,10 @@ TagToCategoryMap::TagToCategoryMap(const string &filename) {
   std::unique_ptr<tensorflow::RandomAccessFile> file;
   TF_CHECK_OK(tensorflow::Env::Default()->NewRandomAccessFile(filename, &file));
   static const int kInputBufferSize = 1 * 1024 * 1024; /* bytes */
-  tensorflow::io::InputBuffer input(file.get(), kInputBufferSize);
+  tensorflow::io::RandomAccessInputStream stream(file.get());
+  tensorflow::io::BufferedInputStream buffer(&stream, kInputBufferSize);
   string line;
-  while (input.ReadLine(&line) == tensorflow::Status::OK()) {
+  while (buffer.ReadLine(&line) == tensorflow::Status::OK()) {
     vector<string> pair = utils::Split(line, '\t');
     CHECK(line.empty() || pair.size() == 2) << line;
     tag_to_category_[pair[0]] = pair[1];
......
@@ -18,10 +18,10 @@ limitations under the License.
 #include <vector>

 #include "syntaxnet/document_format.h"
-#include "syntaxnet/sentence.pb.h"
+#include "syntaxnet/segmenter_utils.h"
+#include "syntaxnet/sentence.pb.h"
 #include "syntaxnet/utils.h"
-#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/regexp.h"
@@ -70,7 +70,7 @@ class CoNLLSyntaxFormat : public DocumentFormat {
   }

   // Reads up to the first empty line and returns false if end of file is reached.
-  bool ReadRecord(tensorflow::io::InputBuffer *buffer,
+  bool ReadRecord(tensorflow::io::BufferedInputStream *buffer,
                   string *record) override {
     string line;
     record->clear();
@@ -284,6 +284,122 @@ class CoNLLSyntaxFormat : public DocumentFormat {
 REGISTER_DOCUMENT_FORMAT("conll-sentence", CoNLLSyntaxFormat);

+// Reader for the segmentation training data format. This reader assumes the
+// input format is similar to the CoNLL format, but with only two fields:
+//
+// Fields:
+// 1  FORM:        Word form or punctuation symbol.
+// 2  SPACE FLAG:  Either 'SPACE' or 'NO_SPACE', indicating whether there
+//                 should be a space between this word and the next one in
+//                 the raw text.
+//
+// Examples:
+// To create a training example for a sentence with the raw text:
+//   That's a good point.
+// and the corresponding gold segmentation:
+//   That 's a good point .
+// the correct input is:
+//   That NO_SPACE
+//   's SPACE
+//   a SPACE
+//   good SPACE
+//   point NO_SPACE
+//   . NO_SPACE
+//
+// Yet another example:
+// To create a training example for a sentence with the raw text:
+//   这是一个测试
+// and the corresponding gold segmentation:
+//   这 是 一 个 测试
+// the correct input is:
+//   这 NO_SPACE
+//   是 NO_SPACE
+//   一 NO_SPACE
+//   个 NO_SPACE
+//   测试 NO_SPACE
+class SegmentationTrainingDataFormat : public CoNLLSyntaxFormat {
+ public:
+  // Converts to segmentation training data by breaking the words in the input
+  // tokens into UTF-8 character-based tokens. Moreover, if a character is the
+  // first char of the word in the original token, then its break level is set
+  // to SPACE_BREAK to indicate that the corresponding gold transition for that
+  // character token is START. Otherwise it is set to NO_BREAK to indicate
+  // MERGE.
+  void ConvertFromString(const string &key, const string &value,
+                         vector<Sentence *> *sentences) override {
+    // Create new sentence.
+    Sentence *sentence = new Sentence();
+
+    // Each line corresponds to one token.
+    string text;
+    vector<string> lines = utils::Split(value, '\n');
+
+    // Add each token to the sentence.
+    vector<string> fields;
+    for (size_t i = 0; i < lines.size(); ++i) {
+      // Split line into tab-separated fields.
+      fields.clear();
+      fields = utils::Split(lines[i], '\t');
+      if (fields.empty()) continue;
+
+      // Skip comment lines.
+      if (fields[0][0] == '#') continue;
+
+      // Check that the line is valid.
+      CHECK_GE(fields.size(), 2)
+          << "Every line has to have at least 2 tab-separated fields.";
+
+      // Get relevant fields.
+      const string &word = fields[0];
+      CHECK(fields[1] == "SPACE" || fields[1] == "NO_SPACE")
+          << "The space field can only be either 'SPACE' or 'NO_SPACE'";
+      const bool space_after = fields[1] == "SPACE";
+
+      // Add token to sentence text.
+      int start = text.size();
+      text.append(word);
+      if (space_after && i != lines.size() - 1) {
+        text.append(" ");
+      }
+
+      // Add character-based tokens to the sentence.
+      vector<tensorflow::StringPiece> chars;
+      SegmenterUtils::GetUTF8Chars(word, &chars);
+      bool is_first_char = true;
+      for (auto utf8char : chars) {
+        Token *char_token = sentence->add_token();
+        char_token->set_word(utf8char.ToString());
+        char_token->set_start(start);
+        start += char_token->word().size();
+        char_token->set_end(start - 1);
+        char_token->set_break_level(
+            is_first_char ? Token::SPACE_BREAK : Token::NO_BREAK);
+        is_first_char = false;
+      }
+
+      // Add another space token.
+      if (space_after) {
+        Token *char_token = sentence->add_token();
+        char_token->set_word(" ");
+        char_token->set_start(start);
+        char_token->set_end(start);
+        char_token->set_break_level(Token::SPACE_BREAK);
+      }
+    }
+
+    if (sentence->token_size() > 0) {
+      sentence->set_docid(key);
+      sentence->set_text(text);
+      sentences->push_back(sentence);
+    } else {
+      // If the sentence was empty (e.g., blank lines at the beginning of a
+      // file), then don't save it.
+      delete sentence;
+    }
+  }
+};
+
+REGISTER_DOCUMENT_FORMAT("segment-train-data", SegmentationTrainingDataFormat);
 // Reader for tokenized text. This reader expects every sentence to be on a
 // single line and tokens on that line to be separated by single spaces.
 //
@@ -292,7 +408,7 @@ class TokenizedTextFormat : public DocumentFormat {
   TokenizedTextFormat() {}

   // Reads a line and returns false if end of file is reached.
-  bool ReadRecord(tensorflow::io::InputBuffer *buffer,
+  bool ReadRecord(tensorflow::io::BufferedInputStream *buffer,
                   string *record) override {
     return buffer->ReadLine(record).ok();
   }
......
@@ -19,7 +19,6 @@
 # disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
 import os.path
 import tensorflow as tf
 import syntaxnet.load_parser_ops
@@ -51,6 +50,11 @@ class TextFormatsTest(test_util.TensorFlowTestCase):
     inp.record_format.append(record_format)
     inp.part.add().file_pattern = file_pattern

+  def AddParameter(self, name, value, context):
+    param = context.parameter.add()
+    param.name = name
+    param.value = value
+
   def WriteContext(self, corpus_format):
     context = task_spec_pb2.TaskSpec()
     self.AddInput('documents', self.corpus_file, corpus_format, context)
@@ -106,6 +110,49 @@ class TextFormatsTest(test_util.TensorFlowTestCase):
     self.CheckUntokenizedDoc('Hello ', ['H', 'e', 'l', 'l', 'o', ' '],
                              [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5])

+  def testSegmentationTrainingData(self):
+    doc1_lines = ['测试\tNO_SPACE\n',
+                  '的\tNO_SPACE\n',
+                  '句子\tNO_SPACE']
+    doc1_text = '测试的句子'
+    doc1_tokens = ['测', '试', '的', '句', '子']
+    doc1_break_levels = [1, 0, 1, 1, 0]
+    doc2_lines = ['That\tNO_SPACE\n',
+                  '\'s\tSPACE\n',
+                  'a\tSPACE\n',
+                  'good\tSPACE\n',
+                  'point\tNO_SPACE\n',
+                  '.\tNO_SPACE']
+    doc2_text = 'That\'s a good point.'
+    doc2_tokens = ['T', 'h', 'a', 't', '\'', 's', ' ', 'a', ' ', 'g', 'o', 'o',
+                   'd', ' ', 'p', 'o', 'i', 'n', 't', '.']
+    doc2_break_levels = [1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
+                         0, 1]
+    self.CheckSegmentationTrainingData(doc1_lines, doc1_text, doc1_tokens,
+                                       doc1_break_levels)
+    self.CheckSegmentationTrainingData(doc2_lines, doc2_text, doc2_tokens,
+                                       doc2_break_levels)
+
+  def CheckSegmentationTrainingData(self, doc_lines, doc_text, doc_words,
+                                    break_levels):
+    # Prepare context.
+    self.WriteContext('segment-train-data')
+
+    # Prepare test sentence.
+    with open(self.corpus_file, 'w') as f:
+      f.write(''.join(doc_lines))
+
+    # Test converted sentence.
+    sentence, _ = gen_parser_ops.document_source(
+        self.context_file, batch_size=1)
+    with self.test_session() as sess:
+      sentence_doc = self.ReadNextDocument(sess, sentence)
+      self.assertEqual(doc_text.decode('utf-8'), sentence_doc.text)
+      self.assertEqual([t.decode('utf-8') for t in doc_words],
+                       [t.word for t in sentence_doc.token])
+      self.assertEqual(break_levels,
+                       [t.break_level for t in sentence_doc.token])
+
   def testSimple(self):
     self.CheckTokenization('Hello, world!', 'Hello , world !')
     self.CheckTokenization('"Hello"', "`` Hello ''")
......
-Subproject commit 861644c0bcae5d56f7b3f439696eefa6df8580ec
+Subproject commit 8ed00233c0cd530fec78cfad5b34f54b6f902e31