Commit 51238b1b authored by Livio Soares, committed by calberti

Updates to syntaxnet: update the tensorflow sub-module, bump the bazel requirement, and fix a trainer crash (#479)

* syntaxnet: Cosmetic fixes recommended by python lint.

* syntaxnet: Fix crash in parser_trainer due to an inconsistency between
  LexiconBuilder::Compute() and the context.pbtxt definition (the
  'char-map' input declaration was missing).

* syntaxnet: reduce flakiness in GraphBuilderTest.

* syntaxnet: Update tensorflow submodule to version > 0.10.

* syntaxnet: Update to latest stable bazel (0.3.1).

This update is needed in part to allow the TensorFlow submodule to build
successfully. In this commit, I also update and simplify the WORKSPACE
to avoid declaring dependencies already present in tensorflow.

* syntaxnet: Update bazel version check to require version 0.3.0

* syntaxnet: Document pip requirement, along with python mock module.
parent 2390974a
@@ -300,7 +300,7 @@ class Word : public TermFrequencyMapFeature {
   Word() : TermFrequencyMapFeature("word-map") {}

   FeatureValue ComputeValue(const Token &token) const override {
-    string form = token.word();
+    const string &form = token.word();
     return term_map().LookupIndex(form, UnknownValue());
   }
 };
......
@@ -71,7 +71,7 @@ class SharedStore {
    int refcount;

    SharedObject(void *o, std::function<void()> d)
-       : object(o), delete_callback(d), refcount(1) {}
+       : object(o), delete_callback(std::move(d)), refcount(1) {}
  };

  // A map from keys to shared objects.
......
@@ -24,9 +24,9 @@ from tensorflow.python.ops import tensor_array_ops
 from syntaxnet import graph_builder
 from syntaxnet.ops import gen_parser_ops

-tf.NoGradient('BeamParseReader')
-tf.NoGradient('BeamParser')
-tf.NoGradient('BeamParserOutput')
+tf.NotDifferentiable('BeamParseReader')
+tf.NotDifferentiable('BeamParser')
+tf.NotDifferentiable('BeamParserOutput')


 def AddCrossEntropy(batch_size, n):
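The hunk above tracks an API rename in the updated TensorFlow submodule: tf.NoGradient became tf.NotDifferentiable. A minimal standalone sketch of what the registration does, reusing one of the op names from this diff:

import tensorflow as tf

# Mark the op as intentionally non-differentiable: tf.gradients() will
# propagate None through it rather than fail with a LookupError about a
# missing gradient registration.
tf.NotDifferentiable('BeamParser')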
@@ -122,6 +122,7 @@ class StructuredGraphBuilder(graph_builder.GreedyParser):
         KeepGoing,
         Advance,
         [state, step, scores_array, alive, alive_steps] + list(features),
+        shape_invariants=[tf.TensorShape(None)] * (len(features) + 5),
         parallel_iterations=100)

     # Link to the final nodes/values of ops that have passed through While:
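The new shape_invariants argument is what lets this tf.while_loop carry loop variables whose shapes change between iterations: tf.TensorShape(None) declares "any shape" for each variable. A minimal standalone sketch of the mechanism, assuming the 0.x-era tf.concat(axis, values) signature contemporary with this diff:

import tensorflow as tf

i = tf.constant(0)
acc = tf.zeros([1])  # grows by one element per iteration

_, final = tf.while_loop(
    lambda i, acc: i < 5,
    lambda i, acc: (i + 1, tf.concat(0, [acc, tf.ones([1])])),
    [i, acc],
    # Without these invariants, while_loop rejects acc because its static
    # shape changes from one iteration to the next.
    shape_invariants=[i.get_shape(), tf.TensorShape(None)],
    parallel_iterations=100)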
......
@@ -19,8 +19,8 @@ load("@protobuf//:protobuf.bzl", "py_proto_library")
 def if_cuda(if_true, if_false = []):
   """Shorthand for select()'ing on whether we're building with CUDA."""
   return select({
-      "@org_tensorflow//third_party/gpus/cuda:using_nvcc": if_true,
-      "@org_tensorflow//third_party/gpus/cuda:using_gcudacc": if_true,
+      "@local_config_cuda//cuda:using_nvcc": if_true,
+      "@local_config_cuda//cuda:using_clang": if_true,
       "//conditions:default": if_false
   })
......
@@ -20,7 +20,8 @@ limitations under the License.
 #include <limits>

 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
+#include "tensorflow/core/lib/io/random_inputstream.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
@@ -61,9 +62,10 @@ void TermFrequencyMap::Load(const string &filename, int min_frequency,
   std::unique_ptr<tensorflow::RandomAccessFile> file;
   TF_CHECK_OK(tensorflow::Env::Default()->NewRandomAccessFile(filename, &file));
   static const int kInputBufferSize = 1 * 1024 * 1024; /* bytes */
-  tensorflow::io::InputBuffer input(file.get(), kInputBufferSize);
+  tensorflow::io::RandomAccessInputStream stream(file.get());
+  tensorflow::io::BufferedInputStream buffer(&stream, kInputBufferSize);
   string line;
-  TF_CHECK_OK(input.ReadLine(&line));
+  TF_CHECK_OK(buffer.ReadLine(&line));
   int32 total = -1;
   CHECK(utils::ParseInt32(line.c_str(), &total));
   CHECK_GE(total, 0);
@@ -71,7 +73,7 @@ void TermFrequencyMap::Load(const string &filename, int min_frequency,
   // Read the mapping.
   int64 last_frequency = -1;
   for (int i = 0; i < total && i < max_num_terms; ++i) {
-    TF_CHECK_OK(input.ReadLine(&line));
+    TF_CHECK_OK(buffer.ReadLine(&line));
     vector<string> elements = utils::Split(line, ' ');
     CHECK_EQ(2, elements.size());
     CHECK(!elements[0].empty());
@@ -143,9 +145,10 @@ TagToCategoryMap::TagToCategoryMap(const string &filename) {
   std::unique_ptr<tensorflow::RandomAccessFile> file;
   TF_CHECK_OK(tensorflow::Env::Default()->NewRandomAccessFile(filename, &file));
   static const int kInputBufferSize = 1 * 1024 * 1024; /* bytes */
-  tensorflow::io::InputBuffer input(file.get(), kInputBufferSize);
+  tensorflow::io::RandomAccessInputStream stream(file.get());
+  tensorflow::io::BufferedInputStream buffer(&stream, kInputBufferSize);
   string line;
-  while (input.ReadLine(&line) == tensorflow::Status::OK()) {
+  while (buffer.ReadLine(&line) == tensorflow::Status::OK()) {
     vector<string> pair = utils::Split(line, '\t');
     CHECK(line.empty() || pair.size() == 2) << line;
     tag_to_category_[pair[0]] = pair[1];
......
@@ -18,10 +18,10 @@ limitations under the License.
 #include <vector>

 #include "syntaxnet/document_format.h"
-#include "syntaxnet/sentence.pb.h"
+#include "syntaxnet/segmenter_utils.h"
+#include "syntaxnet/sentence.pb.h"
 #include "syntaxnet/utils.h"
-#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/regexp.h"
@@ -70,7 +70,7 @@ class CoNLLSyntaxFormat : public DocumentFormat {
   }

   // Reads up to the first empty line and returns false if end of file is reached.
-  bool ReadRecord(tensorflow::io::InputBuffer *buffer,
+  bool ReadRecord(tensorflow::io::BufferedInputStream *buffer,
                   string *record) override {
     string line;
     record->clear();
@@ -284,6 +284,122 @@ class CoNLLSyntaxFormat : public DocumentFormat {
 REGISTER_DOCUMENT_FORMAT("conll-sentence", CoNLLSyntaxFormat);

+// Reader for the segmentation training data format. This reader assumes the
+// input format is similar to the CoNLL format, but with only two fields:
+//
+// Fields:
+// 1  FORM:        Word form or punctuation symbol.
+// 2  SPACE FLAG:  Either 'SPACE' or 'NO_SPACE', indicating whether there
+//                 should be a space between this word and the next one in
+//                 the raw text.
+//
+// Examples:
+// To create a training example for a sentence with the raw text:
+//   That's a good point.
+// and the corresponding gold segmentation:
+//   That 's a good point .
+// the correct input is:
+//   That NO_SPACE
+//   's SPACE
+//   a SPACE
+//   good SPACE
+//   point NO_SPACE
+//   . NO_SPACE
+//
+// Yet another example:
+// To create a training example for a sentence with the raw text:
+//   这是一个测试
+// and the corresponding gold segmentation:
+//   这 是 一 个 测试
+// the correct input is:
+//   这 NO_SPACE
+//   是 NO_SPACE
+//   一 NO_SPACE
+//   个 NO_SPACE
+//   测试 NO_SPACE
+class SegmentationTrainingDataFormat : public CoNLLSyntaxFormat {
+ public:
+  // Converts to segmentation training data by breaking the words in the input
+  // tokens into UTF-8 character-based tokens. Moreover, if a character is the
+  // first char of the word in the original token, then its break level is set
+  // to SPACE_BREAK to indicate that the corresponding gold transition for that
+  // character token is START. Otherwise it is set to NO_BREAK to indicate
+  // MERGE.
+  void ConvertFromString(const string &key, const string &value,
+                         vector<Sentence *> *sentences) override {
+    // Create new sentence.
+    Sentence *sentence = new Sentence();
+
+    // Each line corresponds to one token.
+    string text;
+    vector<string> lines = utils::Split(value, '\n');
+
+    // Add each token to the sentence.
+    vector<string> fields;
+    for (size_t i = 0; i < lines.size(); ++i) {
+      // Split line into tab-separated fields.
+      fields.clear();
+      fields = utils::Split(lines[i], '\t');
+      if (fields.empty()) continue;
+
+      // Skip comment lines.
+      if (fields[0][0] == '#') continue;
+
+      // Check that the line is valid.
+      CHECK_GE(fields.size(), 2)
+          << "Every line has to have at least 2 tab-separated fields.";
+
+      // Get relevant fields.
+      const string &word = fields[0];
+      CHECK(fields[1] == "SPACE" || fields[1] == "NO_SPACE")
+          << "The space field can only be either 'SPACE' or 'NO_SPACE'";
+      const bool space_after = fields[1] == "SPACE";
+
+      // Add token to sentence text.
+      int start = text.size();
+      text.append(word);
+      if (space_after && i != lines.size() - 1) {
+        text.append(" ");
+      }
+
+      // Add character-based tokens to the sentence.
+      vector<tensorflow::StringPiece> chars;
+      SegmenterUtils::GetUTF8Chars(word, &chars);
+      bool is_first_char = true;
+      for (auto utf8char : chars) {
+        Token *char_token = sentence->add_token();
+        char_token->set_word(utf8char.ToString());
+        char_token->set_start(start);
+        start += char_token->word().size();
+        char_token->set_end(start - 1);
+        char_token->set_break_level(
+            is_first_char ? Token::SPACE_BREAK : Token::NO_BREAK);
+        is_first_char = false;
+      }
+
+      // Add another space token.
+      if (space_after) {
+        Token *char_token = sentence->add_token();
+        char_token->set_word(" ");
+        char_token->set_start(start);
+        char_token->set_end(start);
+        char_token->set_break_level(Token::SPACE_BREAK);
+      }
+    }
+
+    if (sentence->token_size() > 0) {
+      sentence->set_docid(key);
+      sentence->set_text(text);
+      sentences->push_back(sentence);
+    } else {
+      // If the sentence was empty (e.g., blank lines at the beginning of a
+      // file), then don't save it.
+      delete sentence;
+    }
+  }
+};
+
+REGISTER_DOCUMENT_FORMAT("segment-train-data", SegmentationTrainingDataFormat);
 // Reader for tokenized text. This reader expects every sentence to be on a
 // single line and tokens on that line to be separated by single spaces.
 //
@@ -292,7 +408,7 @@ class TokenizedTextFormat : public DocumentFormat {
   TokenizedTextFormat() {}

   // Reads a line and returns false if end of file is reached.
-  bool ReadRecord(tensorflow::io::InputBuffer *buffer,
+  bool ReadRecord(tensorflow::io::BufferedInputStream *buffer,
                   string *record) override {
     return buffer->ReadLine(record).ok();
   }
......
@@ -19,7 +19,6 @@
 # disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
 import os.path
 import tensorflow as tf
 import syntaxnet.load_parser_ops
@@ -51,6 +50,11 @@ class TextFormatsTest(test_util.TensorFlowTestCase):
     inp.record_format.append(record_format)
     inp.part.add().file_pattern = file_pattern

+  def AddParameter(self, name, value, context):
+    param = context.parameter.add()
+    param.name = name
+    param.value = value
+
   def WriteContext(self, corpus_format):
     context = task_spec_pb2.TaskSpec()
     self.AddInput('documents', self.corpus_file, corpus_format, context)
@@ -106,6 +110,49 @@ class TextFormatsTest(test_util.TensorFlowTestCase):
     self.CheckUntokenizedDoc('Hello ', ['H', 'e', 'l', 'l', 'o', ' '],
                              [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5])

+  def testSegmentationTrainingData(self):
+    doc1_lines = ['测试\tNO_SPACE\n',
+                  '的\tNO_SPACE\n',
+                  '句子\tNO_SPACE']
+    doc1_text = '测试的句子'
+    doc1_tokens = ['测', '试', '的', '句', '子']
+    doc1_break_levels = [1, 0, 1, 1, 0]
+    doc2_lines = ['That\tNO_SPACE\n',
+                  '\'s\tSPACE\n',
+                  'a\tSPACE\n',
+                  'good\tSPACE\n',
+                  'point\tNO_SPACE\n',
+                  '.\tNO_SPACE']
+    doc2_text = 'That\'s a good point.'
+    doc2_tokens = ['T', 'h', 'a', 't', '\'', 's', ' ', 'a', ' ', 'g', 'o', 'o',
+                   'd', ' ', 'p', 'o', 'i', 'n', 't', '.']
+    doc2_break_levels = [1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
+                         0, 1]
+    self.CheckSegmentationTrainingData(doc1_lines, doc1_text, doc1_tokens,
+                                       doc1_break_levels)
+    self.CheckSegmentationTrainingData(doc2_lines, doc2_text, doc2_tokens,
+                                       doc2_break_levels)
+
+  def CheckSegmentationTrainingData(self, doc_lines, doc_text, doc_words,
+                                    break_levels):
+    # Prepare context.
+    self.WriteContext('segment-train-data')
+
+    # Prepare test sentence.
+    with open(self.corpus_file, 'w') as f:
+      f.write(''.join(doc_lines))
+
+    # Test converted sentence.
+    sentence, _ = gen_parser_ops.document_source(
+        self.context_file, batch_size=1)
+    with self.test_session() as sess:
+      sentence_doc = self.ReadNextDocument(sess, sentence)
+      self.assertEqual(doc_text.decode('utf-8'), sentence_doc.text)
+      self.assertEqual([t.decode('utf-8') for t in doc_words],
+                       [t.word for t in sentence_doc.token])
+      self.assertEqual(break_levels,
+                       [t.break_level for t in sentence_doc.token])
+
   def testSimple(self):
     self.CheckTokenization('Hello, world!', 'Hello , world !')
     self.CheckTokenization('"Hello"', "`` Hello ''")
......
-Subproject commit 861644c0bcae5d56f7b3f439696eefa6df8580ec
+Subproject commit 8ed00233c0cd530fec78cfad5b34f54b6f902e31