Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
32ab5a58
Commit
32ab5a58
authored
May 12, 2016
by
calberti
Committed by
Martin Wicke
May 12, 2016
Browse files
Adding SyntaxNet to tensorflow/models (#63)
parent
148a15fb
Changes
131
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3505 additions
and
0 deletions
+3505
-0
syntaxnet/syntaxnet/task_context.h
syntaxnet/syntaxnet/task_context.h
+80
-0
syntaxnet/syntaxnet/task_spec.proto
syntaxnet/syntaxnet/task_spec.proto
+82
-0
syntaxnet/syntaxnet/term_frequency_map.cc
syntaxnet/syntaxnet/term_frequency_map.cc
+188
-0
syntaxnet/syntaxnet/term_frequency_map.h
syntaxnet/syntaxnet/term_frequency_map.h
+117
-0
syntaxnet/syntaxnet/test_main.cc
syntaxnet/syntaxnet/test_main.cc
+45
-0
syntaxnet/syntaxnet/testdata/context.pbtxt
syntaxnet/syntaxnet/testdata/context.pbtxt
+87
-0
syntaxnet/syntaxnet/testdata/document
syntaxnet/syntaxnet/testdata/document
+145
-0
syntaxnet/syntaxnet/testdata/mini-training-set
syntaxnet/syntaxnet/testdata/mini-training-set
+1017
-0
syntaxnet/syntaxnet/text_formats.cc
syntaxnet/syntaxnet/text_formats.cc
+399
-0
syntaxnet/syntaxnet/text_formats_test.py
syntaxnet/syntaxnet/text_formats_test.py
+108
-0
syntaxnet/syntaxnet/unpack_sparse_features.cc
syntaxnet/syntaxnet/unpack_sparse_features.cc
+111
-0
syntaxnet/syntaxnet/utils.cc
syntaxnet/syntaxnet/utils.cc
+260
-0
syntaxnet/syntaxnet/utils.h
syntaxnet/syntaxnet/utils.h
+171
-0
syntaxnet/syntaxnet/workspace.cc
syntaxnet/syntaxnet/workspace.cc
+50
-0
syntaxnet/syntaxnet/workspace.h
syntaxnet/syntaxnet/workspace.h
+215
-0
syntaxnet/tensorflow
syntaxnet/tensorflow
+1
-0
syntaxnet/third_party/utf/BUILD
syntaxnet/third_party/utf/BUILD
+34
-0
syntaxnet/third_party/utf/README
syntaxnet/third_party/utf/README
+13
-0
syntaxnet/third_party/utf/rune.c
syntaxnet/third_party/utf/rune.c
+357
-0
syntaxnet/third_party/utf/runestrcat.c
syntaxnet/third_party/utf/runestrcat.c
+25
-0
No files found.
syntaxnet/syntaxnet/task_context.h
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef $TARGETDIR_TASK_CONTEXT_H_
#define $TARGETDIR_TASK_CONTEXT_H_
#include <string>
#include <vector>
#include "syntaxnet/task_spec.pb.h"
#include "syntaxnet/utils.h"
namespace
syntaxnet
{
// A task context holds configuration information for a task. It is basically a
// wrapper around a TaskSpec protocol buffer.
class TaskContext {
 public:
  // Returns the underlying task specification protocol buffer for the context.
  const TaskSpec &spec() const { return spec_; }
  // Returns a mutable pointer to the underlying task specification.
  TaskSpec *mutable_spec() { return &spec_; }

  // Returns a named input descriptor for the task. A new input is created if
  // the task context does not already have an input with that name.
  TaskInput *GetInput(const string &name);
  // Same as above, but also registers the given file and record formats on the
  // input descriptor.
  TaskInput *GetInput(const string &name, const string &file_format,
                      const string &record_format);

  // Sets task parameter.
  void SetParameter(const string &name, const string &value);

  // Returns task parameter. If the parameter is not in the task configuration
  // the (default) value of the corresponding command line flag is returned.
  string GetParameter(const string &name) const;
  int GetIntParameter(const string &name) const;
  int64 GetInt64Parameter(const string &name) const;
  bool GetBoolParameter(const string &name) const;
  double GetFloatParameter(const string &name) const;

  // Returns task parameter. If the parameter is not in the task configuration
  // the default value is returned. Parameters retrieved using these methods
  // don't need to be defined with a DEFINE_*() macro.
  string Get(const string &name, const string &defval) const;
  string Get(const string &name, const char *defval) const;
  int Get(const string &name, int defval) const;
  int64 Get(const string &name, int64 defval) const;
  double Get(const string &name, double defval) const;
  bool Get(const string &name, bool defval) const;

  // Returns input file name for a single-file task input.
  static string InputFile(const TaskInput &input);

  // Returns true if task input supports the file and record format.
  static bool Supports(const TaskInput &input, const string &file_format,
                       const string &record_format);

 private:
  // Underlying task specification protocol buffer.
  TaskSpec spec_;

  // Vector of parameters required by this task. These must be specified in the
  // task rather than relying on default values.
  vector<string> required_parameters_;
};
}
// namespace syntaxnet
#endif // $TARGETDIR_TASK_CONTEXT_H_
syntaxnet/syntaxnet/task_spec.proto
0 → 100644
View file @
32ab5a58
// LINT: ALLOW_GROUPS
// Protocol buffer specifications for task configuration.
syntax
=
"proto2"
;
package
syntaxnet
;
// Task input descriptor.
message TaskInput {
  // Name of input resource.
  required string name = 1;

  // Name of stage responsible of creating this resource.
  optional string creator = 2;

  // File format for resource.
  repeated string file_format = 3;

  // Record format for resource.
  repeated string record_format = 4;

  // Is this resource multi-file?
  optional bool multi_file = 5 [default = false];

  // An input can consist of multiple file sets.
  repeated group Part = 6 {
    // File pattern for file set.
    optional string file_pattern = 7;

    // File format for file set.
    optional string file_format = 8;

    // Record format for file set.
    optional string record_format = 9;
  }
}
// Task output descriptor.
message TaskOutput {
  // Name of output resource.
  required string name = 1;

  // File format for output resource.
  optional string file_format = 2;

  // Record format for output resource.
  optional string record_format = 3;

  // Number of shards in output. If it is different from zero this output is
  // sharded. If the number of shards is set to -1 this means that the output is
  // sharded, but the number of shard is unknown. The files are then named
  // 'base-*-of-*'.
  optional int32 shards = 4 [default = 0];

  // Base file name for output resource. If this is not set by the task
  // component it is set to a default value by the workflow engine.
  optional string file_base = 5;

  // Optional extension added to the file name.
  optional string file_extension = 6;
}
// A task specification is used for describing executing parameters.
message TaskSpec {
  // Name of task.
  optional string task_name = 1;

  // Workflow task type.
  optional string task_type = 2;

  // Task parameters (name/value pairs).
  repeated group Parameter = 3 {
    required string name = 4;
    optional string value = 5;
  }

  // Task inputs.
  repeated TaskInput input = 6;

  // Task outputs.
  repeated TaskOutput output = 7;
}
syntaxnet/syntaxnet/term_frequency_map.cc
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/term_frequency_map.h"
#include <stddef.h>
#include <algorithm>
#include <limits>
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/io/inputbuffer.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
namespace
syntaxnet
{
// Increases the frequency of |term| by one, inserting it with frequency 1 if
// it is not already present, and returns the term's index.
int TermFrequencyMap::Increment(const string &term) {
  CHECK_EQ(term_index_.size(), term_data_.size());
  const TermIndex::const_iterator it = term_index_.find(term);
  // Fix: reuse the iterator from the lookup above instead of performing a
  // second, redundant term_index_.find(term) in the condition.
  if (it != term_index_.end()) {
    // Increment the existing term.
    pair<string, int64> &data = term_data_[it->second];
    CHECK_EQ(term, data.first);
    ++(data.second);
    return it->second;
  } else {
    // Add a new term.
    const int index = term_index_.size();
    CHECK_LT(index, std::numeric_limits<int32>::max());  // overflow
    term_index_[term] = index;
    term_data_.push_back(pair<string, int64>(term, 1));
    return index;
  }
}
// Removes all terms and frequencies, leaving the map empty.
void TermFrequencyMap::Clear() {
  term_index_.clear();
  term_data_.clear();
}
void
TermFrequencyMap
::
Load
(
const
string
&
filename
,
int
min_frequency
,
int
max_num_terms
)
{
Clear
();
// If max_num_terms is non-positive, replace it with INT_MAX.
if
(
max_num_terms
<=
0
)
max_num_terms
=
std
::
numeric_limits
<
int
>::
max
();
// Read the first line (total # of terms in the mapping).
tensorflow
::
RandomAccessFile
*
file
;
TF_CHECK_OK
(
tensorflow
::
Env
::
Default
()
->
NewRandomAccessFile
(
filename
,
&
file
));
static
const
int
kInputBufferSize
=
1
*
1024
*
1024
;
/* bytes */
tensorflow
::
io
::
InputBuffer
input
(
file
,
kInputBufferSize
);
string
line
;
TF_CHECK_OK
(
input
.
ReadLine
(
&
line
));
int32
total
=
-
1
;
CHECK
(
utils
::
ParseInt32
(
line
.
c_str
(),
&
total
));
CHECK_GE
(
total
,
0
);
// Read the mapping.
int64
last_frequency
=
-
1
;
for
(
int
i
=
0
;
i
<
total
&&
i
<
max_num_terms
;
++
i
)
{
TF_CHECK_OK
(
input
.
ReadLine
(
&
line
));
vector
<
string
>
elements
=
utils
::
Split
(
line
,
' '
);
CHECK_EQ
(
2
,
elements
.
size
());
CHECK
(
!
elements
[
0
].
empty
());
CHECK
(
!
elements
[
1
].
empty
());
int64
frequency
=
0
;
CHECK
(
utils
::
ParseInt64
(
elements
[
1
].
c_str
(),
&
frequency
));
CHECK_GT
(
frequency
,
0
);
const
string
&
term
=
elements
[
0
];
// Check frequency sorting (descending order).
if
(
i
>
0
)
CHECK_GE
(
last_frequency
,
frequency
);
last_frequency
=
frequency
;
// Ignore low-frequency items.
if
(
frequency
<
min_frequency
)
continue
;
// Check uniqueness of the mapped terms.
CHECK
(
term_index_
.
find
(
term
)
==
term_index_
.
end
())
<<
"File "
<<
filename
<<
" has duplicate term: "
<<
term
;
// Assign the next available index.
const
int
index
=
term_index_
.
size
();
term_index_
[
term
]
=
index
;
term_data_
.
push_back
(
pair
<
string
,
int64
>
(
term
,
frequency
));
}
CHECK_EQ
(
term_index_
.
size
(),
term_data_
.
size
());
LOG
(
INFO
)
<<
"Loaded "
<<
term_index_
.
size
()
<<
" terms from "
<<
filename
<<
"."
;
}
// Comparison functor: orders term/frequency pairs by descending frequency,
// breaking ties by ascending (lexicographic) term.
struct TermFrequencyMap::SortByFrequencyThenTerm {
  bool operator()(const pair<string, int64> &lhs,
                  const pair<string, int64> &rhs) const {
    // Higher frequency sorts first; equal frequencies fall back to the term.
    if (lhs.second != rhs.second) return lhs.second > rhs.second;
    return lhs.first < rhs.first;
  }
};
// Writes the mapping to |filename| in the format read back by Load(): a line
// with the number of terms, then one "term frequency" line per term, sorted
// by descending frequency (ties broken by term).
void TermFrequencyMap::Save(const string &filename) const {
  CHECK_EQ(term_index_.size(), term_data_.size());

  // Copy and sort the term data.
  vector<pair<string, int64>> sorted_data(term_data_);
  std::sort(sorted_data.begin(), sorted_data.end(), SortByFrequencyThenTerm());

  // Write the number of terms.
  tensorflow::WritableFile *file;
  TF_CHECK_OK(tensorflow::Env::Default()->NewWritableFile(filename, &file));
  CHECK_LE(term_index_.size(), std::numeric_limits<int32>::max());  // overflow
  const int32 num_terms = term_index_.size();
  const string header = tensorflow::strings::StrCat(num_terms, "\n");
  TF_CHECK_OK(file->Append(header));

  // Write each term and frequency, verifying the descending-frequency order
  // as we go.
  for (size_t i = 0; i < sorted_data.size(); ++i) {
    if (i > 0) CHECK_GE(sorted_data[i - 1].second, sorted_data[i].second);
    const string line = tensorflow::strings::StrCat(sorted_data[i].first, " ",
                                                    sorted_data[i].second,
                                                    "\n");
    TF_CHECK_OK(file->Append(line));
  }
  TF_CHECK_OK(file->Close()) << "for file " << filename;
  LOG(INFO) << "Saved " << term_index_.size() << " terms to " << filename
            << ".";
  delete file;
}
// Loads a tag-to-category mapping from |filename|; each non-empty line is
// expected to be "tag<TAB>category".
TagToCategoryMap::TagToCategoryMap(const string &filename) {
  // Load the mapping.
  tensorflow::RandomAccessFile *file;
  TF_CHECK_OK(tensorflow::Env::Default()->NewRandomAccessFile(filename, &file));
  static const int kInputBufferSize = 1 * 1024 * 1024; /* bytes */
  tensorflow::io::InputBuffer input(file, kInputBufferSize);
  string line;
  while (input.ReadLine(&line) == tensorflow::Status::OK()) {
    vector<string> pair = utils::Split(line, '\t');
    CHECK(line.empty() || pair.size() == 2) << line;

    // Fix: an empty line passes the CHECK above, but indexing pair[0] and
    // pair[1] on it would read out of bounds; only store well-formed pairs.
    if (pair.size() == 2) tag_to_category_[pair[0]] = pair[1];
  }

  // Fix: release the file (the original leaked it). NOTE(review): confirm
  // io::InputBuffer does not take ownership in this tensorflow version.
  delete file;
}
// Returns the category associated with the given tag; check-fails when the
// tag has no mapping.
const string &TagToCategoryMap::GetCategory(const string &tag) const {
  const auto position = tag_to_category_.find(tag);
  CHECK(position != tag_to_category_.end()) << "No category found for tag "
                                            << tag;
  return position->second;
}
// Records |category| for |tag|. A tag may only ever map to one category;
// re-setting an existing tag to a different category is a fatal error.
void TagToCategoryMap::SetCategory(const string &tag, const string &category) {
  const auto existing = tag_to_category_.find(tag);
  if (existing == tag_to_category_.end()) {
    // First time we see this tag: just record the mapping.
    tag_to_category_[tag] = category;
    return;
  }
  // Tag already mapped: the category must agree with the stored one.
  CHECK_EQ(category, existing->second)
      << "POS tag cannot be mapped to multiple coarse POS tags. "
      << "'" << tag << "' is mapped to: '" << category << "' and '"
      << existing->second << "'";
}
// Writes the mapping to |filename|, one "tag<TAB>category" line per entry, in
// the format read back by the loading constructor.
void TagToCategoryMap::Save(const string &filename) const {
  // Write tag and category on each line.
  tensorflow::WritableFile *file;
  TF_CHECK_OK(tensorflow::Env::Default()->NewWritableFile(filename, &file));
  for (const auto &pair : tag_to_category_) {
    const string line =
        tensorflow::strings::StrCat(pair.first, "\t", pair.second, "\n");
    TF_CHECK_OK(file->Append(line));
  }
  TF_CHECK_OK(file->Close()) << "for file " << filename;
  delete file;
}
}
// namespace syntaxnet
syntaxnet/syntaxnet/term_frequency_map.h
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef $TARGETDIR_TERM_FREQUENCY_MAP_H_
#define $TARGETDIR_TERM_FREQUENCY_MAP_H_
#include <stddef.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "syntaxnet/utils.h"
namespace
syntaxnet
{
// A mapping from strings to frequencies with save and load functionality.
class TermFrequencyMap {
 public:
  // Creates an empty frequency map.
  TermFrequencyMap() {}

  // Creates a term frequency map by calling Load.
  TermFrequencyMap(const string &file, int min_frequency, int max_num_terms) {
    Load(file, min_frequency, max_num_terms);
  }

  // Returns the number of terms with positive frequency.
  int Size() const { return term_index_.size(); }

  // Returns the index associated with the given term. If the term does not
  // exist, the unknown index is returned instead.
  int LookupIndex(const string &term, int unknown) const {
    const TermIndex::const_iterator it = term_index_.find(term);
    return (it != term_index_.end() ? it->second : unknown);
  }

  // Returns the term associated with the given index. No bounds check is
  // performed; |index| must be in [0, Size()).
  const string &GetTerm(int index) const { return term_data_[index].first; }

  // Increases the frequency of the given term by 1, creating a new entry if
  // necessary, and returns the index of the term.
  int Increment(const string &term);

  // Clears all frequencies.
  void Clear();

  // Loads a frequency mapping from the given file, which must have been created
  // by an earlier call to Save(). After loading, the term indices are
  // guaranteed to be ordered in descending order of frequency (breaking ties
  // arbitrarily). However, any new terms inserted after loading do not
  // maintain this sorting invariant.
  //
  // Only loads terms with frequency >= min_frequency. If max_num_terms <= 0,
  // then all qualifying terms are loaded; otherwise, max_num_terms terms with
  // maximal frequency are loaded (breaking ties arbitrarily).
  void Load(const string &filename, int min_frequency, int max_num_terms);

  // Saves a frequency mapping to the given file.
  void Save(const string &filename) const;

 private:
  // Hashtable for term-to-index mapping.
  typedef std::unordered_map<string, int> TermIndex;

  // Sorting functor for term data.
  struct SortByFrequencyThenTerm;

  // Mapping from terms to indices.
  TermIndex term_index_;

  // Mapping from indices to term and frequency.
  vector<pair<string, int64>> term_data_;

  TF_DISALLOW_COPY_AND_ASSIGN(TermFrequencyMap);
};
// A mapping from tags to categories.
class TagToCategoryMap {
 public:
  TagToCategoryMap() {}
  ~TagToCategoryMap() {}

  // Loads a tag to category map from a text file.
  explicit TagToCategoryMap(const string &filename);

  // Sets the category for the given tag.
  void SetCategory(const string &tag, const string &category);

  // Returns the category associated with the given tag.
  const string &GetCategory(const string &tag) const;

  // Saves a tag to category map to the given file.
  void Save(const string &filename) const;

 private:
  // Ordered tag -> category mapping; iteration order determines the order of
  // lines written by Save().
  map<string, string> tag_to_category_;

  TF_DISALLOW_COPY_AND_ASSIGN(TagToCategoryMap);
};
}
// namespace syntaxnet
#endif // $TARGETDIR_TERM_FREQUENCY_MAP_H_
syntaxnet/syntaxnet/test_main.cc
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// A program with a main that is suitable for unittests, including those
// that also define microbenchmarks. Based on whether the user specified
// the --benchmark_filter flag which specifies which benchmarks to run,
// we will either run benchmarks or run the gtest tests in the program.
#include "tensorflow/core/platform/platform.h"
#include "tensorflow/core/platform/types.h"
#if defined(PLATFORM_GOOGLE) || defined(__ANDROID__)
// main() is supplied by gunit_main
#else
#include "gtest/gtest.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/platform/test_benchmark.h"
// Test entry point: runs microbenchmarks when a --benchmarks= flag is given,
// otherwise runs all gtest tests in the binary.
GTEST_API_ int main(int argc, char **argv) {
  std::cout << "Running main() from test_main.cc\n";
  testing::InitGoogleTest(&argc, argv);
  // Scan the remaining (non-gtest) flags for a benchmark filter; the first
  // match switches the binary into benchmark mode and skips the tests.
  for (int i = 1; i < argc; i++) {
    if (tensorflow::StringPiece(argv[i]).starts_with("--benchmarks=")) {
      const char *pattern = argv[i] + strlen("--benchmarks=");
      tensorflow::testing::Benchmark::Run(pattern);
      return 0;
    }
  }
  return RUN_ALL_TESTS();
}
#endif
syntaxnet/syntaxnet/testdata/context.pbtxt
0 → 100644
View file @
32ab5a58
Parameter {
name: 'brain_parser_embedding_dims'
value: '8;8;8'
}
Parameter {
name: 'brain_parser_features'
value: 'input.token.word input(1).token.word input(2).token.word stack.token.word stack(1).token.word stack(2).token.word;input.tag input(1).tag input(2).tag stack.tag stack(1).tag stack(2).tag;stack.child(1).label stack.child(1).sibling(-1).label stack.child(-1).label stack.child(-1).sibling(1).label'
}
Parameter {
name: 'brain_parser_embedding_names'
value: 'words;tags;labels'
}
input {
name: 'training-corpus'
record_format: 'conll-sentence'
Part {
file_pattern: 'syntaxnet/testdata/mini-training-set'
}
}
input {
name: 'tuning-corpus'
record_format: 'conll-sentence'
Part {
file_pattern: 'syntaxnet/testdata/mini-training-set'
}
}
input {
name: 'parsed-tuning-corpus'
creator: 'brain_parser/greedy'
record_format: 'conll-sentence'
}
input {
name: 'label-map'
file_format: 'text'
Part {
file_pattern: 'OUTPATH/label-map'
}
}
input {
name: 'word-map'
Part {
file_pattern: 'OUTPATH/word-map'
}
}
input {
name: 'lcword-map'
Part {
file_pattern: 'OUTPATH/lcword-map'
}
}
input {
name: 'tag-map'
Part {
file_pattern: 'OUTPATH/tag-map'
}
}
input {
name: 'category-map'
Part {
file_pattern: 'OUTPATH/category-map'
}
}
input {
name: 'prefix-table'
Part {
file_pattern: 'OUTPATH/prefix-table'
}
}
input {
name: 'suffix-table'
Part {
file_pattern: 'OUTPATH/suffix-table'
}
}
input {
name: 'tag-to-category'
Part {
file_pattern: 'OUTPATH/tag-to-category'
}
}
input {
name: 'stdout'
record_format: 'conll-sentence'
Part {
file_pattern: '-'
}
}
syntaxnet/syntaxnet/testdata/document
0 → 100644
View file @
32ab5a58
text : "I can not recall any disorder in currency markets since the 1974 guidelines were adopted ."
token: {
word : "I"
start : 0
end : 0
head : 3
tag : "PRP"
category: "PRON"
label : "nsubj"
break_level : SENTENCE_BREAK
}
token: {
word : "can"
start : 2
end : 4
head : 3
tag : "MD"
category: "VERB"
label : "aux"
}
token: {
word : "not"
start : 6
end : 8
head : 3
tag : "RB"
category: "ADV"
label : "neg"
}
token: {
word : "recall"
start : 10
end : 15
tag : "VB"
category: "VERB"
label : "ROOT"
}
token: {
word : "any"
start : 17
end : 19
head : 5
tag : "DT"
category: "DET"
label : "det"
}
token: {
word : "disorder"
start : 21
end : 28
head : 3
tag : "NN"
category: "NOUN"
label : "dobj"
}
token: {
word : "in"
start : 30
end : 31
head : 5
tag : "IN"
category: "ADP"
label : "prep"
}
token: {
word : "currency"
start : 33
end : 40
head : 8
tag : "NN"
category: "NOUN"
label : "nn"
}
token: {
word : "markets"
start : 42
end : 48
head : 6
tag : "NNS"
category: "NOUN"
label : "pobj"
}
token: {
word : "since"
start : 50
end : 54
head : 14
tag : "IN"
category: "ADP"
label : "mark"
}
token: {
word : "the"
start : 56
end : 58
head : 12
tag : "DT"
category: "DET"
label : "det"
}
token: {
word : "1974"
start : 60
end : 63
head : 12
tag : "CD"
category: "NUM"
label : "num"
}
token: {
word : "guidelines"
start : 65
end : 74
head : 14
tag : "NNS"
category: "NOUN"
label : "nsubjpass"
}
token: {
word : "were"
start : 76
end : 79
head : 14
tag : "VBD"
category: "VERB"
label : "auxpass"
}
token: {
word : "adopted"
start : 81
end : 87
head : 3
tag : "VBN"
category: "VERB"
label : "advcl"
}
token: {
word : "."
start : 89
end : 89
head : 3
tag : "."
category: "."
label : "p"
}
syntaxnet/syntaxnet/testdata/mini-training-set
0 → 100644
View file @
32ab5a58
1 I _ PRP PRP _ 2 nsubj _ _
2 knew _ VBD VBD _ 0 ROOT _ _
3 I _ PRP PRP _ 5 nsubj _ _
4 could _ MD MD _ 5 aux _ _
5 do _ VB VB _ 2 ccomp _ _
6 it _ PRP PRP _ 5 dobj _ _
7 properly _ RB RB _ 5 advmod _ _
8 if _ IN IN _ 9 mark _ _
9 given _ VBN VBN _ 5 advcl _ _
10 the _ DT DT _ 12 det _ _
11 right _ JJ JJ _ 12 amod _ _
12 kind _ NN NN _ 9 dobj _ _
13 of _ IN IN _ 12 prep _ _
14 support _ NN NN _ 13 pobj _ _
15 . _ . . _ 2 punct _ _
1 The _ DT DT _ 2 det _ _
2 journey _ NN NN _ 8 nsubj _ _
3 through _ IN IN _ 2 prep _ _
4 deserts _ NNS NNS _ 3 pobj _ _
5 and _ CC CC _ 4 cc _ _
6 mountains _ NNS NNS _ 4 conj _ _
7 can _ MD MD _ 8 aux _ _
8 take _ VB VB _ 0 ROOT _ _
9 a _ DT DT _ 10 det _ _
10 month _ NN NN _ 8 tmod _ _
11 . _ . . _ 8 punct _ _
1 You _ PRP PRP _ 2 nsubj _ _
2 say _ VBP VBP _ 0 ROOT _ _
3 they _ PRP PRP _ 4 nsubj _ _
4 're _ VBP VBP _ 2 ccomp _ _
5 in _ IN IN _ 4 prep _ _
6 the _ DT DT _ 7 det _ _
7 pipeline _ NN NN _ 5 pobj _ _
8 ? _ . . _ 2 punct _ _
1 Border _ NNP NNP _ 5 nn _ _
2 police _ NN NN _ 5 nn _ _
3 commander _ NN NN _ 5 nn _ _
4 Abdul _ NNP NNP _ 5 nn _ _
5 Raziq _ NNP NNP _ 6 nsubj _ _
6 says _ VBZ VBZ _ 0 ROOT _ _
7 the _ DT DT _ 8 det _ _
8 drugs _ NNS NNS _ 10 nsubjpass _ _
9 were _ VBD VBD _ 10 auxpass _ _
10 found _ VBN VBN _ 6 ccomp _ _
11 in _ IN IN _ 10 prep _ _
12 the _ DT DT _ 13 det _ _
13 basement _ NN NN _ 11 pobj _ _
14 of _ IN IN _ 13 prep _ _
15 a _ DT DT _ 16 det _ _
16 compound _ NN NN _ 14 pobj _ _
17 in _ IN IN _ 16 prep _ _
18 Nawa _ NNP NNP _ 20 nn _ _
19 Kili _ NNP NNP _ 20 nn _ _
20 village _ NN NN _ 17 pobj _ _
21 . _ . . _ 6 punct _ _
1 Fourth _ JJ JJ _ 3 amod _ _
2 quarter _ NN NN _ 3 nn _ _
3 production _ NN NN _ 5 nsubjpass _ _
4 is _ VBZ VBZ _ 5 auxpass _ _
5 expected _ VBN VBN _ 0 ROOT _ _
6 to _ TO TO _ 7 aux _ _
7 increase _ VB VB _ 5 xcomp _ _
8 to _ TO TO _ 7 prep _ _
9 130,000 _ CD CD _ 10 num _ _
10 ounces _ NNS NNS _ 8 pobj _ _
11 . _ . . _ 5 punct _ _
1 Minor _ NNP NNP _ 2 nn _ _
2 scuffling _ NN NN _ 3 nsubj _ _
3 broke _ VBD VBD _ 0 ROOT _ _
4 out _ RP RP _ 3 prt _ _
5 as _ IN IN _ 7 mark _ _
6 officials _ NNS NNS _ 7 nsubj _ _
7 sought _ VBD VBD _ 3 advcl _ _
8 to _ TO TO _ 9 aux _ _
9 separate _ VB VB _ 7 xcomp _ _
10 the _ DT DT _ 11 det _ _
11 groups _ NNS NNS _ 9 dobj _ _
12 . _ . . _ 3 punct _ _
1 According _ VBG VBG _ 18 prep _ _
2 to _ TO TO _ 1 pcomp _ _
3 Facebook _ NNP NNP _ 2 pobj _ _
4 , _ , , _ 3 punct _ _
5 which _ WDT WDT _ 7 nsubjpass _ _
6 is _ VBZ VBZ _ 7 auxpass _ _
7 based _ VBN VBN _ 3 rcmod _ _
8 in _ IN IN _ 7 prep _ _
9 Palo _ NNP NNP _ 10 nn _ _
10 Alto _ NNP NNP _ 8 pobj _ _
11 , _ , , _ 10 punct _ _
12 Calif _ NNP NNP _ 10 appos _ _
13 . _ . . _ 12 punct _ _
14 , _ , , _ 18 punct _ _
15 the _ DT DT _ 17 det _ _
16 Web _ NNP NNP _ 17 nn _ _
17 site _ NN NN _ 18 nsubj _ _
18 has _ VBZ VBZ _ 0 ROOT _ _
19 about _ IN IN _ 21 quantmod _ _
20 47 _ CD CD _ 21 number _ _
21 million _ CD CD _ 23 num _ _
22 active _ JJ JJ _ 23 amod _ _
23 users _ NNS NNS _ 18 dobj _ _
24 . _ . . _ 18 punct _ _
1 Among _ IN IN _ 10 prep _ _
2 those _ DT DT _ 1 pobj _ _
3 leaning _ VBG VBG _ 2 partmod _ _
4 toward _ IN IN _ 3 prep _ _
5 McDonnell _ NNP NNP _ 4 pobj _ _
6 , _ , , _ 10 punct _ _
7 however _ RB RB _ 10 advmod _ _
8 , _ , , _ 10 punct _ _
9 some _ DT DT _ 10 nsubj _ _
10 took _ VBD VBD _ 0 ROOT _ _
11 a _ DT DT _ 14 det _ _
12 more _ RBR RBR _ 13 advmod _ _
13 nuanced _ JJ JJ _ 14 amod _ _
14 view _ NN NN _ 10 dobj _ _
15 , _ , , _ 10 punct _ _
16 allowing _ VBG VBG _ 10 partmod _ _
17 for _ IN IN _ 16 prep _ _
18 the _ DT DT _ 19 det _ _
19 possibility _ NN NN _ 17 pobj _ _
20 that _ IN IN _ 24 mark _ _
21 McDonnell _ NNP NNP _ 24 nsubj _ _
22 could _ MD MD _ 24 aux _ _
23 have _ VB VB _ 24 aux _ _
24 changed _ VBN VBN _ 19 ccomp _ _
25 his _ PRP$ PRP$ _ 26 poss _ _
26 mind _ NN NN _ 24 dobj _ _
27 in _ IN IN _ 24 prep _ _
28 the _ DT DT _ 31 det _ _
29 intervening _ VBG VBG _ 31 amod _ _
30 20 _ CD CD _ 31 num _ _
31 years _ NNS NNS _ 27 pobj _ _
32 or _ CC CC _ 24 cc _ _
33 that _ IN IN _ 39 mark _ _
34 his _ PRP$ PRP$ _ 36 poss _ _
35 personal _ JJ JJ _ 36 amod _ _
36 convictions _ NNS NNS _ 39 nsubj _ _
37 would _ MD MD _ 39 aux _ _
38 not _ RB RB _ 39 neg _ _
39 interfere _ VB VB _ 24 conj _ _
40 with _ IN IN _ 39 prep _ _
41 his _ PRP$ PRP$ _ 42 poss _ _
42 governing _ NN NN _ 40 pobj _ _
43 . _ . . _ 10 punct _ _
1 Both _ DT DT _ 2 det _ _
2 teams _ NNS NNS _ 3 nsubj _ _
3 have _ VBP VBP _ 0 ROOT _ _
4 97 _ CD CD _ 5 num _ _
5 points _ NNS NNS _ 3 dobj _ _
6 . _ . . _ 3 punct _ _
1 Star-Banner _ NNP NNP _ 2 nsubj _ _
2 reported _ VBD VBD _ 0 ROOT _ _
3 Tuesday _ NNP NNP _ 2 tmod _ _
4 . _ . . _ 2 punct _ _
1 Harry _ NNP NNP _ 2 nn _ _
2 Redknapp _ NNP NNP _ 9 nsubj _ _
3 , _ , , _ 2 punct _ _
4 the _ DT DT _ 6 det _ _
5 Tottenham _ NNP NNP _ 6 nn _ _
6 manager _ NN NN _ 2 appos _ _
7 , _ , , _ 2 punct _ _
8 was _ VBD VBD _ 9 aux _ _
9 disbelieving _ VBG VBG _ 0 ROOT _ _
10 that _ IN IN _ 18 mark _ _
11 Lennon _ NNP NNP _ 13 poss _ _
12 's _ POS POS _ 11 possessive _ _
13 delivery _ NN NN _ 18 nsubj _ _
14 could _ MD MD _ 18 aux _ _
15 be _ VB VB _ 18 cop _ _
16 so _ RB RB _ 18 advmod _ _
17 radically _ RB RB _ 18 advmod _ _
18 different _ JJ JJ _ 9 ccomp _ _
19 . _ . . _ 9 punct _ _
1 The _ DT DT _ 3 det _ _
2 US _ NNP NNP _ 3 nn _ _
3 uptick _ NN NN _ 4 nsubj _ _
4 mirrors _ VBZ VBZ _ 0 ROOT _ _
5 an _ DT DT _ 6 det _ _
6 improvement _ NN NN _ 4 dobj _ _
7 in _ IN IN _ 6 prep _ _
8 many _ JJ JJ _ 10 amod _ _
9 other _ JJ JJ _ 10 amod _ _
10 parts _ NNS NNS _ 7 pobj _ _
11 of _ IN IN _ 10 prep _ _
12 the _ DT DT _ 13 det _ _
13 world _ NN NN _ 11 pobj _ _
14 . _ . . _ 4 punct _ _
1 Although _ IN IN _ 4 mark _ _
2 satellite _ NN NN _ 3 nn _ _
3 television _ NN NN _ 4 nsubj _ _
4 has _ VBZ VBZ _ 17 advcl _ _
5 the _ DT DT _ 6 det _ _
6 capacity _ NN NN _ 4 dobj _ _
7 for _ IN IN _ 6 prep _ _
8 hundreds _ NNS NNS _ 7 pobj _ _
9 of _ IN IN _ 8 prep _ _
10 conventional _ JJ JJ _ 12 amod _ _
11 television _ NN NN _ 12 nn _ _
12 channels _ NNS NNS _ 9 pobj _ _
13 , _ , , _ 17 punct _ _
14 it _ PRP PRP _ 17 nsubj _ _
15 is _ VBZ VBZ _ 17 cop _ _
16 less _ RBR RBR _ 17 advmod _ _
17 able _ JJ JJ _ 0 ROOT _ _
18 to _ TO TO _ 19 aux _ _
19 provide _ VB VB _ 17 xcomp _ _
20 video-on-demand _ NN NN _ 19 dobj _ _
21 . _ . . _ 17 punct _ _
1 Our _ PRP$ PRP$ _ 3 poss _ _
2 comfortable _ JJ JJ _ 3 amod _ _
3 room _ NN NN _ 4 nsubj _ _
4 feels _ VBZ VBZ _ 0 ROOT _ _
5 on _ IN IN _ 4 prep _ _
6 the _ DT DT _ 8 det _ _
7 small _ JJ JJ _ 8 amod _ _
8 side _ NN NN _ 5 pobj _ _
9 , _ , , _ 4 punct _ _
10 mainly _ RB RB _ 17 advmod _ _
11 because _ IN IN _ 17 mark _ _
12 too _ RB RB _ 13 advmod _ _
13 much _ JJ JJ _ 14 amod _ _
14 furniture _ NN NN _ 17 nsubjpass _ _
15 has _ VBZ VBZ _ 17 aux _ _
16 been _ VBN VBN _ 17 auxpass _ _
17 shoehorned _ VBN VBN _ 4 advcl _ _
18 into _ IN IN _ 17 prep _ _
19 it _ PRP PRP _ 18 pobj _ _
20 . _ . . _ 4 punct _ _
1 They _ PRP PRP _ 3 nsubj _ _
2 also _ RB RB _ 3 advmod _ _
3 require _ VBP VBP _ 0 ROOT _ _
4 a _ DT DT _ 6 det _ _
5 slower _ JJR JJR _ 6 amod _ _
6 inhale _ NN NN _ 3 dobj _ _
7 . _ . . _ 3 punct _ _
1 Her _ PRP$ PRP$ _ 2 poss _ _
2 ring _ NN NN _ 4 nsubjpass _ _
3 was _ VBD VBD _ 4 auxpass _ _
4 found _ VBN VBN _ 0 ROOT _ _
5 in _ IN IN _ 4 prep _ _
6 the _ DT DT _ 7 det _ _
7 car _ NN NN _ 5 pobj _ _
8 . _ . . _ 4 punct _ _
1 In _ IN IN _ 12 prep _ _
2 the _ DT DT _ 4 det _ _
3 past _ JJ JJ _ 4 amod _ _
4 year _ NN NN _ 1 pobj _ _
5 , _ , , _ 7 punct _ _
6 Forsythe _ NNP NNP _ 7 nsubj _ _
7 said _ VBD VBD _ 12 parataxis _ _
8 , _ , , _ 7 punct _ _
9 the _ DT DT _ 11 det _ _
10 Salvation _ NNP NNP _ 11 nn _ _
11 Army _ NNP NNP _ 12 nsubj _ _
12 provided _ VBD VBD _ 0 ROOT _ _
13 rental _ JJ JJ _ 14 amod _ _
14 subsidies _ NNS NNS _ 12 dobj _ _
15 that _ WDT WDT _ 16 nsubj _ _
16 prevented _ VBD VBD _ 14 rcmod _ _
17 1,172 _ CD CD _ 18 num _ _
18 evictions _ NNS NNS _ 16 dobj _ _
19 . _ . . _ 12 punct _ _
1 A _ DT DT _ 3 det _ _
2 23-year-old _ JJ JJ _ 3 amod _ _
3 man _ NN NN _ 6 nsubjpass _ _
4 has _ VBZ VBZ _ 6 aux _ _
5 been _ VBN VBN _ 6 auxpass _ _
6 jailed _ VBN VBN _ 0 ROOT _ _
7 for _ IN IN _ 6 prep _ _
8 two _ CD CD _ 9 num _ _
9 years _ NNS NNS _ 7 pobj _ _
10 after _ IN IN _ 6 prep _ _
11 pleading _ VBG VBG _ 10 pcomp _ _
12 guilty _ JJ JJ _ 11 acomp _ _
13 to _ TO TO _ 12 prep _ _
14 the _ DT DT _ 15 det _ _
15 manslaughter _ NN NN _ 13 pobj _ _
16 of _ IN IN _ 15 prep _ _
17 a _ DT DT _ 18 det _ _
18 man _ NN NN _ 16 pobj _ _
19 in _ IN IN _ 18 prep _ _
20 Hertfordshire _ NNP NNP _ 19 pobj _ _
21 . _ . . _ 6 punct _ _
1 But _ CC CC _ 10 cc _ _
2 the _ DT DT _ 3 det _ _
3 sustainability _ NN NN _ 10 nsubj _ _
4 of _ IN IN _ 3 prep _ _
5 any _ DT DT _ 7 det _ _
6 post-bubble _ JJ JJ _ 7 amod _ _
7 recovery _ NN NN _ 4 pobj _ _
8 is _ VBZ VBZ _ 10 cop _ _
9 always _ RB RB _ 10 advmod _ _
10 dubious _ JJ JJ _ 0 ROOT _ _
11 . _ . . _ 10 punct _ _
1 They _ PRP PRP _ 2 nsubj _ _
2 spoke _ VBD VBD _ 0 ROOT _ _
3 to _ TO TO _ 2 prep _ _
4 the _ DT DT _ 5 det _ _
5 BBC _ NNP NNP _ 8 poss _ _
6 's _ POS POS _ 5 possessive _ _
7 Artyom _ NNP NNP _ 8 nn _ _
8 Liss _ NNP NNP _ 3 pobj _ _
9 . _ . . _ 2 punct _ _
1 That _ DT DT _ 2 nsubj _ _
2 includes _ VBZ VBZ _ 0 ROOT _ _
3 me _ PRP PRP _ 2 dobj _ _
4 , _ , , _ 2 punct _ _
5 too _ RB RB _ 2 advmod _ _
6 . _ . . _ 2 punct _ _
1 The _ DT DT _ 2 det _ _
2 name _ NN NN _ 9 nsubj _ _
3 of _ IN IN _ 2 prep _ _
4 Rachel _ NNP NNP _ 5 nn _ _
5 Harris _ NNP NNP _ 8 poss _ _
6 ' _ POS POS _ 5 possessive _ _
7 Web _ NNP NNP _ 8 nn _ _
8 site _ NN NN _ 3 pobj _ _
9 says _ VBZ VBZ _ 0 ROOT _ _
10 it _ PRP PRP _ 9 dobj _ _
11 all _ DT DT _ 10 det _ _
12 . _ . . _ 9 punct _ _
1 If _ IN IN _ 3 mark _ _
2 you _ PRP PRP _ 3 nsubj _ _
3 prefer _ VBP VBP _ 19 advcl _ _
4 to _ TO TO _ 5 aux _ _
5 maximize _ VB VB _ 3 xcomp _ _
6 your _ PRP$ PRP$ _ 7 poss _ _
7 travel _ NN NN _ 5 dobj _ _
8 with _ IN IN _ 5 prep _ _
9 shorter _ JJR JJR _ 10 amod _ _
10 stays _ NNS NNS _ 8 pobj _ _
11 in _ IN IN _ 10 prep _ _
12 more _ JJR JJR _ 13 mwe _ _
13 than _ IN IN _ 14 quantmod _ _
14 one _ CD CD _ 15 num _ _
15 destination _ NN NN _ 11 pobj _ _
16 , _ , , _ 19 punct _ _
17 you _ PRP PRP _ 19 nsubj _ _
18 may _ MD MD _ 19 aux _ _
19 like _ VB VB _ 0 ROOT _ _
20 this _ DT DT _ 22 det _ _
21 multi-country _ JJ JJ _ 22 amod _ _
22 jaunt _ NN NN _ 19 dobj _ _
23 from _ IN IN _ 22 prep _ _
24 Virgin _ NNP NNP _ 25 nn _ _
25 Vacations _ NNPS NNPS _ 23 pobj _ _
26 . _ . . _ 19 punct _ _
1 The _ DT DT _ 3 det _ _
2 Afghan _ JJ JJ _ 3 amod _ _
3 government _ NN NN _ 6 nsubj _ _
4 also _ RB RB _ 6 advmod _ _
5 is _ VBZ VBZ _ 6 aux _ _
6 trying _ VBG VBG _ 0 ROOT _ _
7 to _ TO TO _ 8 aux _ _
8 persuade _ VB VB _ 6 xcomp _ _
9 farmers _ NNS NNS _ 8 dobj _ _
10 to _ TO TO _ 11 aux _ _
11 stop _ VB VB _ 8 xcomp _ _
12 growing _ VBG VBG _ 13 amod _ _
13 poppy _ NN NN _ 11 dobj _ _
14 and _ CC CC _ 11 cc _ _
15 shift _ VB VB _ 11 conj _ _
16 to _ TO TO _ 15 prep _ _
17 other _ JJ JJ _ 18 amod _ _
18 crops _ NNS NNS _ 16 pobj _ _
19 , _ , , _ 18 punct _ _
20 particularly _ RB RB _ 18 advmod _ _
21 wheat _ NN NN _ 18 dep _ _
22 . _ . . _ 6 punct _ _
1 The _ DT DT _ 3 det _ _
2 most _ RBS RBS _ 3 advmod _ _
3 striking _ JJ JJ _ 6 nsubj _ _
4 is _ VBZ VBZ _ 6 cop _ _
5 the _ DT DT _ 6 det _ _
6 differences _ NNS NNS _ 0 ROOT _ _
7 over _ IN IN _ 6 prep _ _
8 what _ WP WP _ 10 nsubj _ _
9 to _ TO TO _ 10 aux _ _
10 do _ VB VB _ 7 pcomp _ _
11 with _ IN IN _ 10 prep _ _
12 the _ DT DT _ 13 det _ _
13 banks _ NNS NNS _ 11 pobj _ _
14 . _ . . _ 6 punct _ _
1 Philo _ NNP NNP _ 4 nsubj _ _
2 did _ VBD VBD _ 4 aux _ _
3 not _ RB RB _ 4 neg _ _
4 mention _ VB VB _ 0 ROOT _ _
5 any _ DT DT _ 6 det _ _
6 name _ NN NN _ 4 dobj _ _
7 , _ , , _ 6 punct _ _
8 place _ NN NN _ 6 conj _ _
9 , _ , , _ 6 punct _ _
10 date _ NN NN _ 6 conj _ _
11 , _ , , _ 6 punct _ _
12 or _ CC CC _ 6 cc _ _
13 historical _ JJ JJ _ 14 amod _ _
14 circumstances _ NNS NNS _ 6 conj _ _
15 , _ , , _ 6 punct _ _
16 or _ CC CC _ 6 cc _ _
17 any _ DT DT _ 18 det _ _
18 background _ NN NN _ 6 conj _ _
19 to _ TO TO _ 18 prep _ _
20 the _ DT DT _ 21 det _ _
21 consolidation _ NN NN _ 19 pobj _ _
22 of _ IN IN _ 21 prep _ _
23 this _ DT DT _ 24 det _ _
24 group _ NN NN _ 22 pobj _ _
25 . _ . . _ 4 punct _ _
1 Created _ VBN VBN _ 8 partmod _ _
2 in _ IN IN _ 1 prep _ _
3 1996 _ CD CD _ 2 pobj _ _
4 , _ , , _ 8 punct _ _
5 the _ DT DT _ 6 det _ _
6 payments _ NNS NNS _ 8 nsubjpass _ _
7 are _ VBP VBP _ 8 auxpass _ _
8 based _ VBN VBN _ 0 ROOT _ _
9 on _ IN IN _ 8 prep _ _
10 a _ DT DT _ 11 det _ _
11 farm _ NN NN _ 14 poss _ _
12 's _ POS POS _ 11 possessive _ _
13 past _ JJ JJ _ 14 amod _ _
14 production _ NN NN _ 9 pobj _ _
15 and _ CC CC _ 8 cc _ _
16 are _ VBP VBP _ 17 auxpass _ _
17 issued _ VBN VBN _ 8 conj _ _
18 regardless _ RB RB _ 17 advmod _ _
19 of _ IN IN _ 18 prep _ _
20 current _ JJ JJ _ 21 amod _ _
21 production _ NN NN _ 19 pobj _ _
22 or _ CC CC _ 21 cc _ _
23 market _ NN NN _ 24 nn _ _
24 prices _ NNS NNS _ 21 conj _ _
25 . _ . . _ 8 punct _ _
1 Prosecutors _ NNS NNS _ 2 nsubj _ _
2 said _ VBD VBD _ 0 ROOT _ _
3 some _ DT DT _ 16 nsubjpass _ _
4 of _ IN IN _ 3 prep _ _
5 the _ DT DT _ 6 det _ _
6 billions _ NNS NNS _ 4 pobj _ _
7 of _ IN IN _ 6 prep _ _
8 dollars _ NNS NNS _ 7 pobj _ _
9 transferred _ VBN VBN _ 8 partmod _ _
10 from _ IN IN _ 9 prep _ _
11 Mexican _ JJ JJ _ 14 amod _ _
12 money _ NN NN _ 14 nn _ _
13 exchange _ NN NN _ 14 nn _ _
14 houses _ NNS NNS _ 10 pobj _ _
15 was _ VBD VBD _ 16 auxpass _ _
16 used _ VBN VBN _ 2 ccomp _ _
17 to _ TO TO _ 18 aux _ _
18 buy _ VB VB _ 16 xcomp _ _
19 planes _ NNS NNS _ 18 dobj _ _
20 for _ IN IN _ 18 prep _ _
21 drug _ NN NN _ 22 nn _ _
22 traffickers _ NNS NNS _ 20 pobj _ _
23 . _ . . _ 2 punct _ _
1 Margaret _ NNP NNP _ 2 nn _ _
2 Rutherford _ NNP NNP _ 11 nsubj _ _
3 , _ , , _ 2 punct _ _
4 chairwoman _ NN NN _ 2 appos _ _
5 of _ IN IN _ 4 prep _ _
6 Loxton _ NNP NNP _ 9 poss _ _
7 's _ POS POS _ 6 possessive _ _
8 parish _ JJ JJ _ 9 amod _ _
9 council _ NN NN _ 5 pobj _ _
10 , _ , , _ 2 punct _ _
11 told _ VBD VBD _ 0 ROOT _ _
12 BBC _ NNP NNP _ 13 nn _ _
13 Somerset _ NNP NNP _ 11 dobj _ _
14 that _ IN IN _ 16 mark _ _
15 she _ PRP PRP _ 16 nsubj _ _
16 hoped _ VBD VBD _ 11 ccomp _ _
17 the _ DT DT _ 18 det _ _
18 lines _ NNS NNS _ 21 nsubjpass _ _
19 could _ MD MD _ 21 aux _ _
20 be _ VB VB _ 21 auxpass _ _
21 sited _ VBN VBN _ 16 ccomp _ _
22 underground _ RB RB _ 21 advmod _ _
23 . _ . . _ 11 punct _ _
1 Amid _ IN IN _ 3 mark _ _
2 US _ PRP PRP _ 3 nsubj _ _
3 fears _ VBZ VBZ _ 16 advcl _ _
4 that _ IN IN _ 7 mark _ _
5 they _ PRP PRP _ 7 nsubj _ _
6 could _ MD MD _ 7 aux _ _
7 face _ VB VB _ 3 ccomp _ _
8 torture _ VB VB _ 7 dobj _ _
9 if _ IN IN _ 10 mark _ _
10 returned _ VBN VBN _ 7 advcl _ _
11 to _ TO TO _ 10 prep _ _
12 China _ NNP NNP _ 11 pobj _ _
13 , _ , , _ 16 punct _ _
14 five _ CD CD _ 16 nsubjpass _ _
15 were _ VBD VBD _ 16 auxpass _ _
16 released _ VBN VBN _ 0 ROOT _ _
17 to _ TO TO _ 16 prep _ _
18 Albania _ NNP NNP _ 17 pobj _ _
19 in _ IN IN _ 16 prep _ _
20 2006 _ CD CD _ 19 pobj _ _
21 , _ , , _ 16 punct _ _
22 and _ CC CC _ 16 cc _ _
23 four _ CD CD _ 25 nsubjpass _ _
24 were _ VBD VBD _ 25 auxpass _ _
25 resettled _ VBN VBN _ 16 conj _ _
26 in _ IN IN _ 25 prep _ _
27 Bermuda _ NNP NNP _ 26 pobj _ _
28 this _ DT DT _ 29 det _ _
29 year _ NN NN _ 25 tmod _ _
30 . _ . . _ 16 punct _ _
1 He _ PRP PRP _ 3 nsubj _ _
2 then _ RB RB _ 3 advmod _ _
3 provided _ VBD VBD _ 0 ROOT _ _
4 Marshal _ NNP NNP _ 5 nn _ _
5 McAvoy _ NNP NNP _ 8 poss _ _
6 's _ POS POS _ 5 possessive _ _
7 phone _ NN NN _ 8 nn _ _
8 number _ NN NN _ 3 dobj _ _
9 . _ . . _ 3 punct _ _
1 Tech _ NNP NNP _ 2 nn _ _
2 credits _ NNS NNS _ 5 nsubj _ _
3 are _ VBP VBP _ 5 cop _ _
4 just _ RB RB _ 5 advmod _ _
5 fine _ JJ JJ _ 0 ROOT _ _
6 for _ IN IN _ 5 prep _ _
7 what _ WP WP _ 12 nsubj _ _
8 essentially _ RB RB _ 12 advmod _ _
9 is _ VBZ VBZ _ 12 cop _ _
10 an _ DT DT _ 12 det _ _
11 un-reality _ JJ JJ _ 12 amod _ _
12 show _ NN NN _ 6 pcomp _ _
13 . _ . . _ 5 punct _ _
1 But _ CC CC _ 8 cc _ _
2 my _ PRP$ PRP$ _ 4 poss _ _
3 eldest _ JJS JJS _ 4 amod _ _
4 daughter _ NN NN _ 8 nsubj _ _
5 , _ , , _ 4 punct _ _
6 Donna _ NNP NNP _ 4 appos _ _
7 , _ , , _ 4 punct _ _
8 did _ VBD VBD _ 0 ROOT _ _
9 . _ . . _ 8 punct _ _
1 The _ DT DT _ 2 det _ _
2 department _ NN NN _ 4 nsubj _ _
3 has _ VBZ VBZ _ 4 aux _ _
4 spent _ VBN VBN _ 0 ROOT _ _
5 $ _ $ $ _ 4 dobj _ _
6 2.9 _ CD CD _ 7 number _ _
7 million _ CD CD _ 5 num _ _
8 on _ IN IN _ 4 prep _ _
9 the _ DT DT _ 11 det _ _
10 hot _ JJ JJ _ 11 amod _ _
11 line _ NN NN _ 8 pobj _ _
12 thus _ RB RB _ 13 advmod _ _
13 far _ RB RB _ 4 advmod _ _
14 . _ . . _ 4 punct _ _
1 Picoplatin _ NNP NNP _ 3 nsubjpass _ _
2 is _ VBZ VBZ _ 3 auxpass _ _
3 designed _ VBN VBN _ 0 ROOT _ _
4 to _ TO TO _ 5 aux _ _
5 overcome _ VB VB _ 3 xcomp _ _
6 platinum _ NN NN _ 7 nn _ _
7 resistance _ NN NN _ 5 dobj _ _
8 associated _ VBN VBN _ 7 partmod _ _
9 with _ IN IN _ 8 prep _ _
10 chemotherapy _ NN NN _ 9 pobj _ _
11 in _ IN IN _ 10 prep _ _
12 solid _ JJ JJ _ 13 amod _ _
13 tumors _ NNS NNS _ 11 pobj _ _
14 , _ , , _ 3 punct _ _
15 and _ CC CC _ 3 cc _ _
16 is _ VBZ VBZ _ 18 aux _ _
17 being _ VBG VBG _ 18 auxpass _ _
18 studied _ VBN VBN _ 3 conj _ _
19 in _ IN IN _ 18 prep _ _
20 multiple _ JJ JJ _ 22 amod _ _
21 cancer _ NN NN _ 22 nn _ _
22 indications _ NNS NNS _ 19 pobj _ _
23 , _ , , _ 22 punct _ _
24 combinations _ NNS NNS _ 22 conj _ _
25 and _ CC CC _ 22 cc _ _
26 formulations _ NNS NNS _ 22 conj _ _
27 . _ . . _ 3 punct _ _
1 Only _ RB RB _ 4 advmod _ _
2 you _ PRP PRP _ 4 nsubj _ _
3 can _ MD MD _ 4 aux _ _
4 decide _ VB VB _ 0 ROOT _ _
5 what _ WP WP _ 7 nsubj _ _
6 's _ VBZ VBZ _ 7 cop _ _
7 important _ JJ JJ _ 4 ccomp _ _
8 . _ . . _ 4 punct _ _
1 Lt. _ NNP NNP _ 4 nn _ _
2 Col. _ NNP NNP _ 4 nn _ _
3 David _ NNP NNP _ 4 nn _ _
4 Accetta _ NNP NNP _ 14 nsubj _ _
5 , _ , , _ 4 punct _ _
6 the _ DT DT _ 10 det _ _
7 top _ JJ JJ _ 10 amod _ _
8 U.S. _ NNP NNP _ 10 nn _ _
9 military _ JJ JJ _ 10 amod _ _
10 spokesman _ NN NN _ 4 appos _ _
11 in _ IN IN _ 10 prep _ _
12 Afghanistan _ NNP NNP _ 11 pobj _ _
13 , _ , , _ 4 punct _ _
14 said _ VBD VBD _ 0 ROOT _ _
15 he _ PRP PRP _ 18 nsubj _ _
16 could _ MD MD _ 18 aux _ _
17 not _ RB RB _ 18 neg _ _
18 confirm _ VB VB _ 14 ccomp _ _
19 the _ DT DT _ 20 det _ _
20 report _ NN NN _ 18 dobj _ _
21 . _ . . _ 14 punct _ _
1 The _ DT DT _ 3 det _ _
2 four _ CD CD _ 3 num _ _
3 teams _ NNS NNS _ 14 nsubj _ _
4 that _ WDT WDT _ 6 nsubj _ _
5 will _ MD MD _ 6 aux _ _
6 play _ VB VB _ 3 rcmod _ _
7 in _ IN IN _ 6 prep _ _
8 the _ DT DT _ 9 det _ _
9 women _ NNS NNS _ 11 poss _ _
10 's _ POS POS _ 9 possessive _ _
11 tournament _ NN NN _ 7 pobj _ _
12 are _ VBP VBP _ 14 cop _ _
13 Alaska _ NNP NNP _ 14 nn _ _
14 Anchorage _ NNP NNP _ 0 ROOT _ _
15 , _ , , _ 14 punct _ _
16 Cincinnati _ NNP NNP _ 14 conj _ _
17 , _ , , _ 14 punct _ _
18 Coastal _ NNP NNP _ 19 nn _ _
19 Carolina _ NNP NNP _ 14 conj _ _
20 and _ CC CC _ 14 cc _ _
21 Western _ NNP NNP _ 22 nn _ _
22 Carolina _ NNP NNP _ 14 conj _ _
23 . _ . . _ 14 punct _ _
1 Speaking _ VBG VBG _ 8 partmod _ _
2 to _ TO TO _ 1 prep _ _
3 reporters _ NNS NNS _ 2 pobj _ _
4 , _ , , _ 8 punct _ _
5 she _ PRP PRP _ 8 nsubj _ _
6 did _ VBD VBD _ 8 aux _ _
7 not _ RB RB _ 8 neg _ _
8 repeat _ VB VB _ 0 ROOT _ _
9 her _ PRP$ PRP$ _ 10 poss _ _
10 demand _ NN NN _ 8 dobj _ _
11 that _ IN IN _ 17 mark _ _
12 a _ DT DT _ 15 det _ _
13 new _ JJ JJ _ 15 amod _ _
14 government-run _ JJ JJ _ 15 amod _ _
15 plan _ NN NN _ 17 nsubj _ _
16 be _ VB VB _ 17 cop _ _
17 part _ NN NN _ 10 ccomp _ _
18 of _ IN IN _ 17 prep _ _
19 the _ DT DT _ 21 det _ _
20 final _ JJ JJ _ 21 amod _ _
21 legislation _ NN NN _ 18 pobj _ _
22 . _ . . _ 8 punct _ _
1 ' _ '' '' _ 10 punct _ _
2 But _ CC CC _ 10 cc _ _
3 with _ IN IN _ 10 prep _ _
4 the _ DT DT _ 5 det _ _
5 help _ NN NN _ 3 pobj _ _
6 of _ IN IN _ 5 prep _ _
7 English _ NNP NNP _ 8 nn _ _
8 Heritage _ NNP NNP _ 6 pobj _ _
9 we _ PRP PRP _ 10 nsubj _ _
10 restored _ VBD VBD _ 0 ROOT _ _
11 them _ PRP PRP _ 10 dobj _ _
12 . _ . . _ 10 punct _ _
13 ' _ '' '' _ 10 punct _ _
1 Mr _ NNP NNP _ 2 nn _ _
2 Oubridge _ NNP NNP _ 3 nsubj _ _
3 said _ VBD VBD _ 0 ROOT _ _
4 when _ WRB WRB _ 8 advmod _ _
5 the _ DT DT _ 7 det _ _
6 festival _ NN NN _ 7 nn _ _
7 team _ NN NN _ 8 nsubj _ _
8 met _ VBD VBD _ 20 advcl _ _
9 council _ NN NN _ 10 nn _ _
10 officials _ NNS NNS _ 8 dobj _ _
11 and _ CC CC _ 10 cc _ _
12 the _ DT DT _ 13 det _ _
13 police _ NN NN _ 10 conj _ _
14 on _ IN IN _ 8 prep _ _
15 Thursday _ NNP NNP _ 14 pobj _ _
16 there _ EX EX _ 20 expl _ _
17 had _ VBD VBD _ 20 aux _ _
18 been _ VBN VBN _ 20 cop _ _
19 no _ DT DT _ 20 det _ _
20 mention _ NN NN _ 3 ccomp _ _
21 of _ IN IN _ 20 prep _ _
22 a _ DT DT _ 24 det _ _
23 potential _ JJ JJ _ 24 amod _ _
24 injunction _ NN NN _ 21 pobj _ _
25 . _ . . _ 3 punct _ _
1 A _ DT DT _ 2 det _ _
2 number _ NN NN _ 6 nsubj _ _
3 of _ IN IN _ 2 prep _ _
4 ministers _ NNS NNS _ 3 pobj _ _
5 have _ VBP VBP _ 6 aux _ _
6 left _ VBN VBN _ 0 ROOT _ _
7 the _ DT DT _ 8 det _ _
8 government _ NN NN _ 9 nsubj _ _
9 facing _ VBG VBG _ 6 dep _ _
10 questions _ NNS NNS _ 9 dobj _ _
11 over _ IN IN _ 10 prep _ _
12 their _ PRP$ PRP$ _ 13 poss _ _
13 expenses _ NNS NNS _ 11 pobj _ _
14 , _ , , _ 13 punct _ _
15 including _ VBG VBG _ 13 prep _ _
16 Hazel _ NNP NNP _ 17 nn _ _
17 Blears _ NNP NNP _ 15 pobj _ _
18 , _ , , _ 17 punct _ _
19 the _ DT DT _ 22 det _ _
20 former _ JJ JJ _ 22 amod _ _
21 communities _ NNS NNS _ 22 nn _ _
22 secretary _ NN NN _ 17 appos _ _
23 ; _ : : _ 17 punct _ _
24 Jacqui _ NNP NNP _ 25 nn _ _
25 Smith _ NNP NNP _ 17 conj _ _
26 , _ , , _ 25 punct _ _
27 the _ DT DT _ 30 det _ _
28 former _ JJ JJ _ 30 amod _ _
29 home _ NN NN _ 30 nn _ _
30 secretary _ NN NN _ 25 appos _ _
31 ; _ : : _ 17 punct _ _
32 and _ CC CC _ 17 cc _ _
33 Tony _ NNP NNP _ 34 nn _ _
34 McNulty _ NNP NNP _ 17 conj _ _
35 , _ , , _ 34 punct _ _
36 the _ DT DT _ 39 det _ _
37 former _ JJ JJ _ 39 amod _ _
38 employment _ NN NN _ 39 nn _ _
39 minister _ NN NN _ 34 appos _ _
40 . _ . . _ 6 punct _ _
1 An _ DT DT _ 4 det _ _
2 enticingly _ RB RB _ 3 advmod _ _
3 big _ JJ JJ _ 4 amod _ _
4 button _ NN NN _ 10 nsubj _ _
5 that _ WDT WDT _ 6 nsubj _ _
6 looked _ VBD VBD _ 4 rcmod _ _
7 like _ IN IN _ 6 prep _ _
8 a _ DT DT _ 9 det _ _
9 latch _ NN NN _ 7 pobj _ _
10 turned _ VBD VBD _ 0 ROOT _ _
11 out _ RP RP _ 10 prt _ _
12 to _ TO TO _ 15 aux _ _
13 be _ VB VB _ 15 cop _ _
14 a _ DT DT _ 15 det _ _
15 hinge _ NN NN _ 10 xcomp _ _
16 . _ . . _ 10 punct _ _
1 After _ IN IN _ 8 prep _ _
2 an _ DT DT _ 5 det _ _
3 oustanding _ JJ JJ _ 5 amod _ _
4 opening _ NN NN _ 5 nn _ _
5 round _ NN NN _ 1 pobj _ _
6 , _ , , _ 8 punct _ _
7 Garcia _ NNP NNP _ 8 nsubj _ _
8 found _ VBD VBD _ 0 ROOT _ _
9 himself _ PRP PRP _ 10 nsubj _ _
10 tied _ VBD VBD _ 8 ccomp _ _
11 with _ IN IN _ 10 prep _ _
12 the _ DT DT _ 14 det _ _
13 50-year-old _ JJ JJ _ 14 amod _ _
14 Langer _ NNP NNP _ 11 pobj _ _
15 , _ , , _ 14 punct _ _
16 who _ WP WP _ 17 nsubj _ _
17 fired _ VBD VBD _ 14 rcmod _ _
18 a _ DT DT _ 20 det _ _
19 five-under _ JJ JJ _ 20 amod _ _
20 67 _ NN NN _ 17 dobj _ _
21 following _ VBG VBG _ 17 prep _ _
22 his _ PRP$ PRP$ _ 24 poss _ _
23 first-round _ JJ JJ _ 24 amod _ _
24 72 _ CD CD _ 21 pobj _ _
25 . _ . . _ 8 punct _ _
1 We _ PRP PRP _ 2 nsubj _ _
2 made _ VBD VBD _ 0 ROOT _ _
3 mistakes _ NNS NNS _ 2 dobj _ _
4 in _ IN IN _ 2 prep _ _
5 those _ DT DT _ 6 det _ _
6 games _ NNS NNS _ 4 pobj _ _
7 in _ IN IN _ 2 prep _ _
8 the _ DT DT _ 10 det _ _
9 last _ JJ JJ _ 10 amod _ _
10 minute _ NN NN _ 7 pobj _ _
11 , _ , , _ 2 punct _ _
12 so _ IN IN _ 16 mark _ _
13 it _ PRP PRP _ 16 nsubj _ _
14 's _ VBZ VBZ _ 16 cop _ _
15 our _ PRP$ PRP$ _ 16 poss _ _
16 fault _ NN NN _ 2 advcl _ _
17 in _ IN IN _ 16 prep _ _
18 the _ DT DT _ 19 det _ _
19 end _ NN NN _ 17 pobj _ _
20 . _ . . _ 2 punct _ _
1 This _ DT DT _ 3 det _ _
2 latest _ JJS JJS _ 3 amod _ _
3 incident _ NN NN _ 7 nsubj _ _
4 is _ VBZ VBZ _ 7 cop _ _
5 the _ DT DT _ 7 det _ _
6 second _ JJ JJ _ 7 amod _ _
7 time _ NN NN _ 0 ROOT _ _
8 in _ IN IN _ 7 prep _ _
9 four _ CD CD _ 10 num _ _
10 weeks _ NNS NNS _ 8 pobj _ _
11 the _ DT DT _ 12 det _ _
12 Revenue _ NN NN _ 14 nsubj _ _
13 has _ VBZ VBZ _ 14 aux _ _
14 admitted _ VBN VBN _ 7 rcmod _ _
15 losing _ VBG VBG _ 14 xcomp _ _
16 taxpayers _ NNS NNS _ 18 poss _ _
17 ' _ POS POS _ 16 possessive _ _
18 details _ NNS NNS _ 15 dobj _ _
19 . _ . . _ 7 punct _ _
1 NebuAd _ NNP NNP _ 2 nsubj _ _
2 confirmed _ VBD VBD _ 0 ROOT _ _
3 Friday _ NNP NNP _ 2 tmod _ _
4 that _ IN IN _ 7 mark _ _
5 it _ PRP PRP _ 7 nsubj _ _
6 is _ VBZ VBZ _ 7 aux _ _
7 partnering _ VBG VBG _ 2 ccomp _ _
8 with _ IN IN _ 7 prep _ _
9 Charter _ NNP NNP _ 8 pobj _ _
10 but _ CC CC _ 2 cc _ _
11 declined _ VBD VBD _ 2 conj _ _
12 further _ JJ JJ _ 13 amod _ _
13 comment _ NN NN _ 11 dobj _ _
14 . _ . . _ 2 punct _ _
1 Needless _ JJ JJ _ 6 ccomp _ _
2 to _ TO TO _ 3 aux _ _
3 say _ VB VB _ 1 xcomp _ _
4 , _ , , _ 6 punct _ _
5 it _ PRP PRP _ 6 nsubj _ _
6 wasn _ VBP VBP _ 0 ROOT _ _
7 't _ NN NN _ 6 dobj _ _
8 long _ RB RB _ 11 advmod _ _
9 before _ IN IN _ 11 mark _ _
10 he _ PRP PRP _ 11 nsubj _ _
11 sat _ VBD VBD _ 6 advcl _ _
12 down _ RP RP _ 11 prt _ _
13 . _ . . _ 6 punct _ _
1 For _ IN IN _ 18 prep _ _
2 Judy _ NNP NNP _ 3 nn _ _
3 John-Baptiste _ NNP NNP _ 1 pobj _ _
4 , _ , , _ 3 punct _ _
5 who _ WP WP _ 6 nsubj _ _
6 runs _ VBZ VBZ _ 3 rcmod _ _
7 the _ DT DT _ 10 det _ _
8 Basement _ NNP NNP _ 10 nn _ _
9 Dance _ NNP NNP _ 10 nn _ _
10 Studio _ NNP NNP _ 6 dobj _ _
11 in _ IN IN _ 10 prep _ _
12 London _ NNP NNP _ 11 pobj _ _
13 , _ , , _ 18 punct _ _
14 ballet _ NN NN _ 18 nsubj _ _
15 is _ VBZ VBZ _ 18 cop _ _
16 the _ DT DT _ 18 det _ _
17 most _ RBS RBS _ 18 advmod _ _
18 popular _ JJ JJ _ 0 ROOT _ _
19 of _ IN IN _ 18 prep _ _
20 all _ PDT PDT _ 22 predet _ _
21 the _ DT DT _ 22 det _ _
22 classes _ NNS NNS _ 19 pobj _ _
23 she _ PRP PRP _ 24 nsubj _ _
24 offers _ VBZ VBZ _ 22 rcmod _ _
25 . _ . . _ 18 punct _ _
1 Russ _ NNP NNP _ 2 nn _ _
2 Dixon _ NNP NNP _ 7 nsubj _ _
3 , _ , , _ 2 punct _ _
4 an _ DT DT _ 5 det _ _
5 infielder _ NN NN _ 2 appos _ _
6 , _ , , _ 2 punct _ _
7 homered _ VBD VBD _ 0 ROOT _ _
8 to _ TO TO _ 7 prep _ _
9 right _ NN NN _ 8 pobj _ _
10 , _ , , _ 7 punct _ _
11 then _ RB RB _ 7 advmod _ _
12 sheepishly _ RB RB _ 13 advmod _ _
13 put _ VBD VBD _ 7 dep _ _
14 his _ PRP$ PRP$ _ 15 poss _ _
15 head _ NN NN _ 13 dobj _ _
16 down _ RP RP _ 13 prt _ _
17 to _ TO TO _ 18 aux _ _
18 avoid _ VB VB _ 13 xcomp _ _
19 eye _ NN NN _ 20 nn _ _
20 contact _ NN NN _ 18 dobj _ _
21 with _ IN IN _ 20 prep _ _
22 the _ DT DT _ 23 det _ _
23 pitcher _ NN NN _ 21 pobj _ _
24 . _ . . _ 7 punct _ _
1 Mr. _ NNP NNP _ 2 nn _ _
2 Gore _ NNP NNP _ 3 nsubj _ _
3 was _ VBD VBD _ 0 ROOT _ _
4 not _ RB RB _ 3 neg _ _
5 here _ RB RB _ 3 advmod _ _
6 , _ , , _ 3 punct _ _
7 but _ CC CC _ 3 cc _ _
8 his _ PRP$ PRP$ _ 9 poss _ _
9 name _ NN NN _ 10 nsubj _ _
10 came _ VBD VBD _ 3 conj _ _
11 up _ RP RP _ 10 prt _ _
12 frequently _ RB RB _ 10 advmod _ _
13 . _ . . _ 3 punct _ _
1 The _ DT DT _ 2 det _ _
2 lawsuit _ NN NN _ 4 nsubj _ _
3 also _ RB RB _ 4 advmod _ _
4 names _ VBD VBD _ 0 ROOT _ _
5 the _ DT DT _ 7 det _ _
6 shopping _ NN NN _ 7 nn _ _
7 mall _ NN NN _ 4 dobj _ _
8 where _ WRB WRB _ 11 advmod _ _
9 the _ DT DT _ 10 det _ _
10 incident _ NN NN _ 11 nsubj _ _
11 occurred _ VBD VBD _ 7 rcmod _ _
12 and _ CC CC _ 7 cc _ _
13 the _ DT DT _ 15 det _ _
14 security _ NN NN _ 15 nn _ _
15 company _ NN NN _ 7 conj _ _
16 employed _ VBN VBN _ 15 partmod _ _
17 by _ IN IN _ 16 prep _ _
18 Wal-Mart _ NNP NNP _ 17 pobj _ _
19 . _ . . _ 4 punct _ _
1 Rudy _ NNP NNP _ 2 nn _ _
2 Crutchfield _ NNP NNP _ 9 nsubj _ _
3 and _ CC CC _ 2 cc _ _
4 Steve _ NNP NNP _ 5 nn _ _
5 Hadeed _ NNP NNP _ 2 conj _ _
6 have _ VBP VBP _ 9 aux _ _
7 been _ VBN VBN _ 9 cop _ _
8 close _ JJ JJ _ 9 amod _ _
9 friends _ NNS NNS _ 0 ROOT _ _
10 since _ IN IN _ 9 prep _ _
11 their _ PRP$ PRP$ _ 12 poss _ _
12 days _ NNS NNS _ 10 pobj _ _
13 at _ IN IN _ 12 prep _ _
14 Wheaton _ NNP NNP _ 16 nn _ _
15 High _ NNP NNP _ 16 nn _ _
16 School _ NNP NNP _ 13 pobj _ _
17 . _ . . _ 9 punct _ _
1 Earlier _ RBR RBR _ 3 advmod _ _
2 this _ DT DT _ 3 det _ _
3 month _ NN NN _ 6 tmod _ _
4 , _ , , _ 6 punct _ _
5 GM _ NNP NNP _ 6 nsubj _ _
6 announced _ VBD VBD _ 0 ROOT _ _
7 plans _ NNS NNS _ 6 dobj _ _
8 to _ TO TO _ 9 aux _ _
9 sell _ VB VB _ 7 infmod _ _
10 Hummer _ NNP NNP _ 9 dobj _ _
11 to _ TO TO _ 9 prep _ _
12 a _ DT DT _ 14 det _ _
13 Chinese _ JJ JJ _ 14 amod _ _
14 manufacturer _ NN NN _ 11 pobj _ _
15 and _ CC CC _ 14 cc _ _
16 Saturn _ NNP NNP _ 14 conj _ _
17 to _ TO TO _ 9 prep _ _
18 Michigan-based _ JJ JJ _ 24 amod _ _
19 dealership _ NN NN _ 24 nn _ _
20 chain _ NN NN _ 24 nn _ _
21 Penske _ NNP NNP _ 24 nn _ _
22 Automotive _ NNP NNP _ 24 nn _ _
23 Group _ NNP NNP _ 24 nn _ _
24 Inc _ NNP NNP _ 17 pobj _ _
25 . _ . . _ 6 punct _ _
syntaxnet/syntaxnet/text_formats.cc
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <memory>
#include <string>
#include <vector>
#include "syntaxnet/document_format.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/utils.h"
#include "tensorflow/core/lib/io/inputbuffer.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/regexp.h"
namespace
syntaxnet
{
// CoNLL document format reader for dependency annotated corpora.
// The expected format is described e.g. at http://ilk.uvt.nl/conll/#dataformat
//
// Data should adhere to the following rules:
// - Data files contain sentences separated by a blank line.
// - A sentence consists of one or more tokens, each one starting on a new line.
// - A token consists of ten fields described in the table below.
// - Fields are separated by a single tab character.
// - All data files will contain these ten fields, although only the ID
// column is required to contain non-dummy (i.e. non-underscore) values.
// Data files should be UTF-8 encoded (Unicode).
//
// Fields:
// 1 ID: Token counter, starting at 1 for each new sentence and increasing
// by 1 for every new token.
// 2 FORM: Word form or punctuation symbol.
// 3 LEMMA: Lemma or stem.
// 4 CPOSTAG: Coarse-grained part-of-speech tag or category.
// 5 POSTAG: Fine-grained part-of-speech tag. Note that the same POS tag
// cannot appear with multiple coarse-grained POS tags.
// 6 FEATS: Unordered set of syntactic and/or morphological features.
// 7 HEAD: Head of the current token, which is either a value of ID or '0'.
// 8 DEPREL: Dependency relation to the HEAD.
// 9 PHEAD: Projective head of current token.
// 10 PDEPREL: Dependency relation to the PHEAD.
//
// This CoNLL reader is compatible with the CoNLL-U format described at
// http://universaldependencies.org/format.html
// Note that this reader skips CoNLL-U multiword tokens and ignores the last two
// fields of every line, which are PHEAD and PDEPREL in CoNLL format, but are
// replaced by DEPS and MISC in CoNLL-U.
//
class
CoNLLSyntaxFormat
:
public
DocumentFormat
{
public:
CoNLLSyntaxFormat
()
{}
// Reads up to the first empty line and returns false end of file is reached.
bool
ReadRecord
(
tensorflow
::
io
::
InputBuffer
*
buffer
,
string
*
record
)
override
{
string
line
;
record
->
clear
();
tensorflow
::
Status
status
=
buffer
->
ReadLine
(
&
line
);
while
(
!
line
.
empty
()
&&
status
.
ok
())
{
tensorflow
::
strings
::
StrAppend
(
record
,
line
,
"
\n
"
);
status
=
buffer
->
ReadLine
(
&
line
);
}
return
status
.
ok
()
||
!
record
->
empty
();
}
void
ConvertFromString
(
const
string
&
key
,
const
string
&
value
,
vector
<
Sentence
*>
*
sentences
)
override
{
// Create new sentence.
Sentence
*
sentence
=
new
Sentence
();
// Each line corresponds to one token.
string
text
;
vector
<
string
>
lines
=
utils
::
Split
(
value
,
'\n'
);
// Add each token to the sentence.
vector
<
string
>
fields
;
int
expected_id
=
1
;
for
(
size_t
i
=
0
;
i
<
lines
.
size
();
++
i
)
{
// Split line into tab-separated fields.
fields
.
clear
();
fields
=
utils
::
Split
(
lines
[
i
],
'\t'
);
if
(
fields
.
size
()
==
0
)
continue
;
// Skip comment lines.
if
(
fields
[
0
][
0
]
==
'#'
)
continue
;
// Skip CoNLLU lines for multiword tokens which are indicated by
// hyphenated line numbers, e.g., "2-4".
// http://universaldependencies.github.io/docs/format.html
if
(
RE2
::
FullMatch
(
fields
[
0
],
"[0-9]+-[0-9]+"
))
continue
;
// Clear all optional fields equal to '_'.
for
(
size_t
j
=
2
;
j
<
fields
.
size
();
++
j
)
{
if
(
fields
[
j
].
length
()
==
1
&&
fields
[
j
][
0
]
==
'_'
)
fields
[
j
].
clear
();
}
// Check that the line is valid.
CHECK_GE
(
fields
.
size
(),
8
)
<<
"Every line has to have at least 8 tab separated fields."
;
// Check that the ids follow the expected format.
const
int
id
=
utils
::
ParseUsing
<
int
>
(
fields
[
0
],
0
,
utils
::
ParseInt32
);
CHECK_EQ
(
expected_id
++
,
id
)
<<
"Token ids start at 1 for each new sentence and increase by 1 "
<<
"on each new token. Sentences are separated by an empty line."
;
// Get relevant fields.
const
string
&
word
=
fields
[
1
];
const
string
&
cpostag
=
fields
[
3
];
const
string
&
tag
=
fields
[
4
];
const
int
head
=
utils
::
ParseUsing
<
int
>
(
fields
[
6
],
0
,
utils
::
ParseInt32
);
const
string
&
label
=
fields
[
7
];
// Add token to sentence text.
if
(
!
text
.
empty
())
text
.
append
(
" "
);
const
int
start
=
text
.
size
();
const
int
end
=
start
+
word
.
size
()
-
1
;
text
.
append
(
word
);
// Add token to sentence.
Token
*
token
=
sentence
->
add_token
();
token
->
set_word
(
word
);
token
->
set_start
(
start
);
token
->
set_end
(
end
);
if
(
head
>
0
)
token
->
set_head
(
head
-
1
);
if
(
!
tag
.
empty
())
token
->
set_tag
(
tag
);
if
(
!
cpostag
.
empty
())
token
->
set_category
(
cpostag
);
if
(
!
label
.
empty
())
token
->
set_label
(
label
);
}
if
(
sentence
->
token_size
()
>
0
)
{
sentence
->
set_docid
(
key
);
sentence
->
set_text
(
text
);
sentences
->
push_back
(
sentence
);
}
else
{
// If the sentence was empty (e.g., blank lines at the beginning of a
// file), then don't save it.
delete
sentence
;
}
}
// Converts a sentence to a key/value pair.
void
ConvertToString
(
const
Sentence
&
sentence
,
string
*
key
,
string
*
value
)
override
{
*
key
=
sentence
.
docid
();
vector
<
string
>
lines
;
for
(
int
i
=
0
;
i
<
sentence
.
token_size
();
++
i
)
{
vector
<
string
>
fields
(
10
);
fields
[
0
]
=
tensorflow
::
strings
::
Printf
(
"%d"
,
i
+
1
);
fields
[
1
]
=
sentence
.
token
(
i
).
word
();
fields
[
2
]
=
"_"
;
fields
[
3
]
=
sentence
.
token
(
i
).
category
();
fields
[
4
]
=
sentence
.
token
(
i
).
tag
();
fields
[
5
]
=
"_"
;
fields
[
6
]
=
tensorflow
::
strings
::
Printf
(
"%d"
,
sentence
.
token
(
i
).
head
()
+
1
);
fields
[
7
]
=
sentence
.
token
(
i
).
label
();
fields
[
8
]
=
"_"
;
fields
[
9
]
=
"_"
;
lines
.
push_back
(
utils
::
Join
(
fields
,
"
\t
"
));
}
*
value
=
tensorflow
::
strings
::
StrCat
(
utils
::
Join
(
lines
,
"
\n
"
),
"
\n\n
"
);
}
private:
TF_DISALLOW_COPY_AND_ASSIGN
(
CoNLLSyntaxFormat
);
};
REGISTER_DOCUMENT_FORMAT
(
"conll-sentence"
,
CoNLLSyntaxFormat
);
// Reader for tokenized text. This reader expects every sentence to be on a
// single line and tokens on that line to be separated by single spaces.
//
class TokenizedTextFormat : public DocumentFormat {
 public:
  TokenizedTextFormat() {}

  // Reads one line; returns false if end of file is reached.
  bool ReadRecord(tensorflow::io::InputBuffer *buffer,
                  string *record) override {
    return buffer->ReadLine(record).ok();
  }

  // Converts one space-separated line (`value`) into a Sentence with one
  // token per non-empty word. Lines with no words produce no sentence.
  void ConvertFromString(const string &key, const string &value,
                         vector<Sentence *> *sentences) override {
    Sentence *sentence = new Sentence();
    string text;
    for (const string &word : utils::Split(value, ' ')) {
      if (word.empty()) continue;
      // Append the separator BEFORE computing byte offsets so start/end point
      // at the word itself. (Previously the offsets were computed first,
      // which shifted every token after the first one byte to the left;
      // CoNLLSyntaxFormat already used this corrected order.)
      if (!text.empty()) text.append(" ");
      const int start = text.size();
      const int end = start + word.size() - 1;
      text.append(word);
      Token *token = sentence->add_token();
      token->set_word(word);
      token->set_start(start);
      token->set_end(end);
    }

    if (sentence->token_size() > 0) {
      sentence->set_docid(key);
      sentence->set_text(text);
      sentences->push_back(sentence);
    } else {
      // If the sentence was empty (e.g., blank lines at the beginning of a
      // file), then don't save it.
      delete sentence;
    }
  }

  // Serializes `sentence` as space-separated entries of the form "word",
  // "word_tag", or "word_tag_head", terminated by a newline.
  void ConvertToString(const Sentence &sentence, string *key,
                       string *value) override {
    *key = sentence.docid();
    value->clear();
    for (const Token &token : sentence.token()) {
      if (!value->empty()) value->append(" ");
      value->append(token.word());
      if (token.has_tag()) {
        value->append("_");
        value->append(token.tag());
      }
      if (token.has_head()) {
        value->append("_");
        value->append(tensorflow::strings::StrCat(token.head()));
      }
    }
    value->append("\n");
  }

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(TokenizedTextFormat);
};

REGISTER_DOCUMENT_FORMAT("tokenized-text", TokenizedTextFormat);
// Text reader that attempts to perform Penn Treebank tokenization on arbitrary
// raw text. Adapted from https://www.cis.upenn.edu/~treebank/tokenizer.sed
// by Robert MacIntyre, University of Pennsylvania, late 1995.
// Expected input: raw text with one sentence per line.
//
class
EnglishTextFormat
:
public
TokenizedTextFormat
{
public:
EnglishTextFormat
()
{}
void
ConvertFromString
(
const
string
&
key
,
const
string
&
value
,
vector
<
Sentence
*>
*
sentences
)
override
{
vector
<
pair
<
string
,
string
>>
preproc_rules
=
{
// Punctuation.
{
"’"
,
"'"
},
{
"…"
,
"..."
},
{
"---"
,
"--"
},
{
"—"
,
"--"
},
{
"–"
,
"--"
},
{
","
,
","
},
{
"。"
,
"."
},
{
"!"
,
"!"
},
{
"?"
,
"?"
},
{
":"
,
":"
},
{
";"
,
";"
},
{
"&"
,
"&"
},
// Brackets.
{
"
\\
["
,
"("
},
{
"]"
,
")"
},
{
"{"
,
"("
},
{
"}"
,
")"
},
{
"【"
,
"("
},
{
"】"
,
")"
},
{
"("
,
"("
},
{
")"
,
")"
},
// Quotation marks.
{
"
\"
"
,
"
\"
"
},
{
"″"
,
"
\"
"
},
{
"“"
,
"
\"
"
},
{
"„"
,
"
\"
"
},
{
"‵‵"
,
"
\"
"
},
{
"”"
,
"
\"
"
},
{
"’"
,
"
\"
"
},
{
"‘"
,
"
\"
"
},
{
"′′"
,
"
\"
"
},
{
"‹"
,
"
\"
"
},
{
"›"
,
"
\"
"
},
{
"«"
,
"
\"
"
},
{
"»"
,
"
\"
"
},
// Discarded punctuation that breaks sentences.
{
"|"
,
""
},
{
"·"
,
""
},
{
"•"
,
""
},
{
"●"
,
""
},
{
"▪"
,
""
},
{
"■"
,
""
},
{
"□"
,
""
},
{
"❑"
,
""
},
{
"◆"
,
""
},
{
"★"
,
""
},
{
"*"
,
""
},
{
"♦"
,
""
},
};
vector
<
pair
<
string
,
string
>>
rules
=
{
// attempt to get correct directional quotes
{
R"re(^")re"
,
"`` "
},
{
R"re(([ \([{<])")re"
,
"
\\
1 `` "
},
// close quotes handled at end
{
R"re(\.\.\.)re"
,
" ... "
},
{
"[,;:@#$%&]"
,
"
\\
0 "
},
// Assume sentence tokenization has been done first, so split FINAL
// periods only.
{
R"re(([^.])(\.)([\]\)}>"']*)[ ]*$)re"
,
"
\\
1
\\
2
\\
3 "
},
// however, we may as well split ALL question marks and exclamation
// points, since they shouldn't have the abbrev.-marker ambiguity
// problem
{
"[?!]"
,
"
\\
0 "
},
// parentheses, brackets, etc.
{
R"re([\]\[\(\){}<>])re"
,
"
\\
0 "
},
// Like Adwait Ratnaparkhi's MXPOST, we use the parsed-file version of
// these symbols.
{
"
\\
("
,
"-LRB-"
},
{
"
\\
)"
,
"-RRB-"
},
{
"
\\
]"
,
"-LSB-"
},
{
"
\\
]"
,
"-RSB-"
},
{
"{"
,
"-LCB-"
},
{
"}"
,
"-RCB-"
},
{
"--"
,
" -- "
},
// First off, add a space to the beginning and end of each line, to
// reduce necessary number of regexps.
{
"$"
,
" "
},
{
"^"
,
" "
},
{
"
\"
"
,
" '' "
},
// possessive or close-single-quote
{
"([^'])' "
,
"
\\
1 ' "
},
// as in it's, I'm, we'd
{
"'([sSmMdD]) "
,
" '
\\
1 "
},
{
"'ll "
,
" 'll "
},
{
"'re "
,
" 're "
},
{
"'ve "
,
" 've "
},
{
"n't "
,
" n't "
},
{
"'LL "
,
" 'LL "
},
{
"'RE "
,
" 'RE "
},
{
"'VE "
,
" 'VE "
},
{
"N'T "
,
" N'T "
},
{
" ([Cc])annot "
,
"
\\
1an not "
},
{
" ([Dd])'ye "
,
"
\\
1' ye "
},
{
" ([Gg])imme "
,
"
\\
1im me "
},
{
" ([Gg])onna "
,
"
\\
1on na "
},
{
" ([Gg])otta "
,
"
\\
1ot ta "
},
{
" ([Ll])emme "
,
"
\\
1em me "
},
{
" ([Mm])ore'n "
,
"
\\
1ore 'n "
},
{
" '([Tt])is "
,
" '
\\
1 is "
},
{
" '([Tt])was "
,
" '
\\
1 was "
},
{
" ([Ww])anna "
,
"
\\
1an na "
},
{
" ([Ww])haddya "
,
"
\\
1ha dd ya "
},
{
" ([Ww])hatcha "
,
"
\\
1ha t cha "
},
// clean out extra spaces
{
" *"
,
" "
},
{
"^ *"
,
""
},
};
string
rewritten
=
value
;
for
(
const
pair
<
string
,
string
>
&
rule
:
preproc_rules
)
{
RE2
::
GlobalReplace
(
&
rewritten
,
rule
.
first
,
rule
.
second
);
}
for
(
const
pair
<
string
,
string
>
&
rule
:
rules
)
{
RE2
::
GlobalReplace
(
&
rewritten
,
rule
.
first
,
rule
.
second
);
}
TokenizedTextFormat
::
ConvertFromString
(
key
,
rewritten
,
sentences
);
}
private:
TF_DISALLOW_COPY_AND_ASSIGN
(
EnglishTextFormat
);
};
REGISTER_DOCUMENT_FORMAT
(
"english-text"
,
EnglishTextFormat
);
}
// namespace syntaxnet
syntaxnet/syntaxnet/text_formats_test.py
0 → 100644
View file @
32ab5a58
# coding=utf-8
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for english_tokenizer."""
# disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
import
os.path
import
tensorflow
as
tf
import
syntaxnet.load_parser_ops
from
tensorflow.python.framework
import
test_util
from
tensorflow.python.platform
import
googletest
from
tensorflow.python.platform
import
logging
from
syntaxnet
import
sentence_pb2
from
syntaxnet
import
task_spec_pb2
from
syntaxnet.ops
import
gen_parser_ops
FLAGS
=
tf
.
app
.
flags
.
FLAGS
class TextFormatsTest(test_util.TensorFlowTestCase):
  """Exercises the 'english-text' document format via the document_source op."""

  def setUp(self):
    # Older test runners may not define these flags; provide fallbacks.
    if not hasattr(FLAGS, 'test_srcdir'):
      FLAGS.test_srcdir = ''
    if not hasattr(FLAGS, 'test_tmpdir'):
      FLAGS.test_tmpdir = tf.test.get_temp_dir()
    self.corpus_file = os.path.join(FLAGS.test_tmpdir, 'documents.conll')
    self.context_file = os.path.join(FLAGS.test_tmpdir, 'context.pbtxt')

  def AddInput(self, name, file_pattern, record_format, context):
    # Appends one named input (with a single file part) to `context`.
    new_input = context.input.add()
    new_input.name = name
    new_input.record_format.append(record_format)
    new_input.part.add().file_pattern = file_pattern

  def WriteContext(self, corpus_format):
    # Builds a TaskSpec whose 'documents' input points at self.corpus_file in
    # the given format, plus empty placeholder resources, and writes it as
    # text to self.context_file.
    context = task_spec_pb2.TaskSpec()
    self.AddInput('documents', self.corpus_file, corpus_format, context)
    resources = ('word-map', 'lcword-map', 'tag-map', 'category-map',
                 'label-map', 'prefix-table', 'suffix-table',
                 'tag-to-category')
    for name in resources:
      self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
    logging.info('Writing context to: %s', self.context_file)
    with open(self.context_file, 'w') as f:
      f.write(str(context))

  def ReadNextDocument(self, sess, sentence):
    # Runs the `sentence` tensor and parses the result into a Sentence proto;
    # returns None when the source is exhausted.
    sentence_str, = sess.run([sentence])
    if not sentence_str:
      return None
    doc = sentence_pb2.Sentence()
    doc.ParseFromString(sentence_str[0])
    return doc

  def CheckTokenization(self, sentence, tokenization):
    # Asserts that tokenizing `sentence` yields the space-joined token words
    # given in `tokenization`.
    self.WriteContext('english-text')
    logging.info('Writing text file to: %s', self.corpus_file)
    with open(self.corpus_file, 'w') as f:
      f.write(sentence)
    sentence, _ = gen_parser_ops.document_source(self.context_file,
                                                 batch_size=1)
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(' '.join(t.word for t in sentence_doc.token),
                       tokenization)

  def testSimple(self):
    cases = [
        ('Hello, world!', 'Hello , world !'),
        ('"Hello"', "`` Hello ''"),
        ('{"Hello@#$', '-LRB- `` Hello @ # $'),
        ('"Hello..."', "`` Hello ... ''"),
        ('()[]{}<>', '-LRB- -RRB- -LRB- -RRB- -LRB- -RRB- < >'),
        ('Hello--world', 'Hello -- world'),
        ("Isn't", "Is n't"),
        ("n't", "n't"),
        ('Hello Mr. Smith.', 'Hello Mr. Smith .'),
        ("It's Mr. Smith's.", "It 's Mr. Smith 's ."),
        ("It's the Smiths'.", "It 's the Smiths ' ."),
        ('Gotta go', 'Got ta go'),
        ('50-year-old', '50-year-old'),
    ]
    for text, expected in cases:
      self.CheckTokenization(text, expected)

  def testUrl(self):
    self.CheckTokenization('http://www.google.com/news is down',
                           'http : //www.google.com/news is down')
# Run the test cases under the TensorFlow test runner when executed directly.
if __name__ == '__main__':
  googletest.main()
syntaxnet/syntaxnet/unpack_sparse_features.cc
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#define EIGEN_USE_THREADS
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "syntaxnet/utils.h"
#include "syntaxnet/sparse.pb.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
using
tensorflow
::
DEVICE_CPU
;
using
tensorflow
::
DT_FLOAT
;
using
tensorflow
::
DT_INT32
;
using
tensorflow
::
DT_INT64
;
using
tensorflow
::
DT_STRING
;
using
tensorflow
::
OpKernel
;
using
tensorflow
::
OpKernelConstruction
;
using
tensorflow
::
OpKernelContext
;
using
tensorflow
::
Tensor
;
using
tensorflow
::
TensorShape
;
using
tensorflow
::
errors
::
InvalidArgument
;
namespace
syntaxnet
{
// Operator to unpack ids and weights stored in SparseFeatures proto.
// Operator to unpack ids and weights stored in SparseFeatures protos.
// Input: a string vector of serialized SparseFeatures. Outputs: parallel
// vectors of (input index, id, weight) triples, one per id across all
// records; records without weights get weight 1.0.
class UnpackSparseFeatures : public OpKernel {
 public:
  explicit UnpackSparseFeatures(OpKernelConstruction *context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->MatchSignature(
                                {DT_STRING}, {DT_INT32, DT_INT64, DT_FLOAT}));
  }

  void Compute(OpKernelContext *context) override {
    const Tensor &input = context->input(0);
    OP_REQUIRES(context, IsLegacyVector(input.shape()),
                InvalidArgument("input should be a vector."));
    const int64 n = input.NumElements();
    const auto input_vec = input.flat<string>();

    // First pass: parse every record, remember how many ids each one
    // contributed, and collect all (id, weight) pairs in order.
    SparseFeatures sf;
    int output_size = 0;
    std::vector<std::pair<int64, float> > id_and_weight;
    // Guess that we'll be averaging a handful of ids per SparseFeatures
    // record.
    id_and_weight.reserve(n * 4);
    std::vector<int> num_ids(n);
    for (int64 i = 0; i < n; ++i) {
      OP_REQUIRES(context, sf.ParseFromString(input_vec(i)),
                  InvalidArgument("Couldn't parse as SparseFeature"));
      OP_REQUIRES(context,
                  sf.weight_size() == 0 || sf.weight_size() == sf.id_size(),
                  InvalidArgument(tensorflow::strings::StrCat(
                      "Incorrect number of weights", sf.DebugString())));
      const int n_ids = sf.id_size();
      num_ids[i] = n_ids;
      output_size += n_ids;
      for (int j = 0; j < n_ids; ++j) {
        const float w = (sf.weight_size() > 0) ? sf.weight(j) : 1.0f;
        id_and_weight.push_back(std::make_pair(sf.id(j), w));
      }
    }

    // Allocate the three flat outputs, all of length output_size.
    Tensor *indices_t;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, TensorShape({output_size}), &indices_t));
    Tensor *ids_t;
    OP_REQUIRES_OK(context, context->allocate_output(
                                1, TensorShape({output_size}), &ids_t));
    Tensor *weights_t;
    OP_REQUIRES_OK(context, context->allocate_output(
                                2, TensorShape({output_size}), &weights_t));

    // Second pass: flatten the collected pairs, tagging each with the index
    // of the record it came from.
    auto indices = indices_t->vec<int32>();
    auto ids = ids_t->vec<int64>();
    auto weights = weights_t->vec<float>();
    int out_pos = 0;
    for (int64 i = 0; i < n; ++i) {
      for (int j = 0; j < num_ids[i]; ++j) {
        indices(out_pos) = i;
        ids(out_pos) = id_and_weight[out_pos].first;
        weights(out_pos) = id_and_weight[out_pos].second;
        ++out_pos;
      }
    }
  }
};

REGISTER_KERNEL_BUILDER(Name("UnpackSparseFeatures").Device(DEVICE_CPU),
                        UnpackSparseFeatures);
}
// namespace syntaxnet
syntaxnet/syntaxnet/utils.cc
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/utils.h"

#include <cerrno>
#include <cstdlib>
#include <limits>

#include "tensorflow/core/platform/macros.h"
namespace
syntaxnet
{
namespace
utils
{
// Parses an integer from the whole of `c_str` (base prefixes "0x"/"0" are
// honored by strtol's base-0 mode). Returns true iff the entire string was
// consumed and the value fits in an int; on success *value holds the result.
bool ParseInt32(const char *c_str, int *value) {
  char *temp;
  errno = 0;
  const long parsed = strtol(c_str, &temp, 0);  // NOLINT
  *value = static_cast<int>(parsed);
  // Reject trailing garbage, out-of-range input (ERANGE), and values that
  // fit in `long` but not in `int` — the original silently truncated those.
  return (*temp == '\0') && errno == 0 &&
         parsed >= std::numeric_limits<int>::min() &&
         parsed <= std::numeric_limits<int>::max();
}
// Parses a 64-bit integer from the whole of `c_str` (base prefixes honored).
// Returns true iff the entire string was consumed and the value is in range.
bool ParseInt64(const char *c_str, int64 *value) {
  char *temp;
  errno = 0;
  // strtoll, not strtol: on platforms where `long` is 32 bits (LP32/LLP64)
  // the original strtol call silently truncated 64-bit values.
  *value = strtoll(c_str, &temp, 0);  // NOLINT
  return (*temp == '\0') && errno == 0;
}
// Parses a double from the whole of `c_str`; returns true iff the entire
// string was consumed. On success *value holds the parsed number.
bool ParseDouble(const char *c_str, double *value) {
  char *remainder = nullptr;
  *value = strtod(c_str, &remainder);
  return *remainder == '\0';
}
// Digit table used for the octal escapes below (only indices 0-7 are ever
// produced by the / and % arithmetic in CEscape).
static char hex_char[] = "0123456789abcdef";

// Returns a C-escaped copy of `src`: newline/tab/CR/quote/backslash become
// their symbolic escapes; any other non-printable or non-ASCII byte becomes
// a three-digit octal escape (e.g. "\200").
string CEscape(const string &src) {
  string escaped;
  for (unsigned char c : src) {
    if (c == '\n') {
      escaped.append("\\n");
    } else if (c == '\r') {
      escaped.append("\\r");
    } else if (c == '\t') {
      escaped.append("\\t");
    } else if (c == '\"') {
      escaped.append("\\\"");
    } else if (c == '\'') {
      escaped.append("\\'");
    } else if (c == '\\') {
      escaped.append("\\\\");
    } else if ((c >= 0x80) || !isprint(c)) {
      // Emit an octal escape. Three digits are always produced, so a
      // following literal digit cannot be misread as part of the escape.
      escaped.push_back('\\');
      escaped.push_back(hex_char[c / 64]);
      escaped.push_back(hex_char[(c % 64) / 8]);
      escaped.push_back(hex_char[c % 8]);
    } else {
      escaped.push_back(c);
    }
  }
  return escaped;
}
// Splits `text` on `delim`. An empty input yields an empty vector; otherwise
// adjacent or trailing delimiters produce empty pieces (so "a," -> {"a",""}).
std::vector<string> Split(const string &text, char delim) {
  std::vector<string> pieces;
  if (text.empty()) return pieces;
  size_t piece_start = 0;
  for (size_t pos = 0; pos <= text.size(); ++pos) {
    // Treat the end of the string as an implicit final delimiter.
    if (pos == text.size() || text[pos] == delim) {
      pieces.emplace_back(text, piece_start, pos - piece_start);
      piece_start = pos + 1;
    }
  }
  return pieces;
}
// Returns true if `path` is non-empty and begins with '/'.
bool IsAbsolutePath(tensorflow::StringPiece path) {
  if (path.empty()) return false;
  return path[0] == '/';
}
// Joins the given path components into one path, inserting exactly one '/'
// between adjacent components and never doubling one up. Empty components
// are skipped.
string JoinPath(std::initializer_list<tensorflow::StringPiece> paths) {
  string joined;
  for (tensorflow::StringPiece path : paths) {
    if (path.empty()) continue;
    if (joined.empty()) {
      joined = path.ToString();
      continue;
    }
    // Exactly one of the two sides may contribute the separator.
    const bool left_has_sep = joined[joined.size() - 1] == '/';
    const bool right_has_sep = IsAbsolutePath(path);
    if (left_has_sep && right_has_sep) {
      tensorflow::strings::StrAppend(&joined, path.substr(1));
    } else if (left_has_sep || right_has_sep) {
      tensorflow::strings::StrAppend(&joined, path);
    } else {
      tensorflow::strings::StrAppend(&joined, "/", path);
    }
  }
  return joined;
}
// Strips leading whitespace from *text in place; returns the number of bytes
// removed.
size_t RemoveLeadingWhitespace(tensorflow::StringPiece *text) {
  const size_t length = text->size();
  size_t skipped = 0;
  while (skipped < length && isspace(text->data()[skipped])) {
    ++skipped;
  }
  text->remove_prefix(skipped);
  return skipped;
}
// Strips trailing whitespace from *text in place; returns the number of
// bytes removed.
size_t RemoveTrailingWhitespace(tensorflow::StringPiece *text) {
  size_t count = 0;
  // Guard the pointer setup: for an empty piece the original computed
  // data() + size() - 1 == data() - 1, an out-of-range pointer (UB) even
  // though it was never dereferenced.
  if (!text->empty()) {
    const char *ptr = text->data() + text->size() - 1;
    while (count < text->size() && isspace(*ptr)) {
      ++count;
      --ptr;
    }
  }
  text->remove_suffix(count);
  return count;
}
// Strips whitespace from both ends of *text in place; returns the total
// number of bytes removed. (Sequenced explicitly: leading first, then
// trailing — the two calls commute, so the result is the same either way.)
size_t RemoveWhitespaceContext(tensorflow::StringPiece *text) {
  const size_t leading = RemoveLeadingWhitespace(text);
  const size_t trailing = RemoveTrailingWhitespace(text);
  return leading + trailing;
}
namespace {

// Lower-level versions of Get... that read directly from a character buffer
// without any bounds checking. Decodes 4 bytes as a 32-bit little-endian
// integer; the explicit shifts make the result independent of host byte
// order.
inline uint32 DecodeFixed32(const char *ptr) {
  return ((static_cast<uint32>(static_cast<unsigned char>(ptr[0]))) |
          (static_cast<uint32>(static_cast<unsigned char>(ptr[1])) << 8) |
          (static_cast<uint32>(static_cast<unsigned char>(ptr[2])) << 16) |
          (static_cast<uint32>(static_cast<unsigned char>(ptr[3])) << 24));
}

// Returns the byte value of c as a uint32. The & 0xff is in case char is
// signed (sign extension would otherwise set high bits).
static inline uint32 ByteAs32(char c) { return static_cast<uint32>(c) & 0xff; }

}  // namespace
// Hashes the `n` bytes at `data` into a 32-bit value, mixed with `seed`.
// The multiply/shift structure and constants (0x5bd1e995, r = 24) match
// MurmurHash2; DecodeFixed32's little-endian read keeps the result identical
// across host byte orders.
uint32 Hash32(const char *data, size_t n, uint32 seed) {
  // 'm' and 'r' are mixing constants generated offline.
  // They're not really 'magic', they just happen to work well.
  const uint32 m = 0x5bd1e995;
  const int r = 24;

  // Initialize the hash to a 'random' value
  uint32 h = seed ^ n;

  // Mix 4 bytes at a time into the hash
  while (n >= 4) {
    uint32 k = DecodeFixed32(data);
    k *= m;
    k ^= k >> r;
    k *= m;
    h *= m;
    h ^= k;
    data += 4;
    n -= 4;
  }

  // Handle the last few bytes of the input array
  switch (n) {
    case 3:
      h ^= ByteAs32(data[2]) << 16;
      TF_FALLTHROUGH_INTENDED;
    case 2:
      h ^= ByteAs32(data[1]) << 8;
      TF_FALLTHROUGH_INTENDED;
    case 1:
      h ^= ByteAs32(data[0]);
      h *= m;
  }

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.
  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;
  return h;
}
// Returns a lower-cased copy of `s` (byte-wise, ASCII/locale semantics of
// tolower; multi-byte UTF-8 sequences are left untouched since their bytes
// are >= 0x80).
string Lowercase(tensorflow::StringPiece s) {
  string result(s.data(), s.size());
  for (char &c : result) {
    // Cast to unsigned char first: passing a negative char (any byte >= 0x80
    // on platforms with signed char) to tolower is undefined behavior.
    c = tolower(static_cast<unsigned char>(c));
  }
  return result;
}
// Inclusive Unicode code-point ranges treated as punctuation for CoNLL
// scoring. The table is sorted ascending and terminated by {-1, -1};
// PunctuationUtil::IsPunctuation relies on both the ordering (for its early
// return) and the negative sentinel (as the loop bound).
PunctuationUtil::CharacterRange PunctuationUtil::kPunctuation[] = {
    {33, 35},       {37, 42},       {44, 47},       {58, 59},
    {63, 64},       {91, 93},       {95, 95},       {123, 123},
    {125, 125},     {161, 161},     {171, 171},     {183, 183},
    {187, 187},     {191, 191},     {894, 894},     {903, 903},
    {1370, 1375},   {1417, 1418},   {1470, 1470},   {1472, 1472},
    {1475, 1475},   {1478, 1478},   {1523, 1524},   {1548, 1549},
    {1563, 1563},   {1566, 1567},   {1642, 1645},   {1748, 1748},
    {1792, 1805},   {2404, 2405},   {2416, 2416},   {3572, 3572},
    {3663, 3663},   {3674, 3675},   {3844, 3858},   {3898, 3901},
    {3973, 3973},   {4048, 4049},   {4170, 4175},   {4347, 4347},
    {4961, 4968},   {5741, 5742},   {5787, 5788},   {5867, 5869},
    {5941, 5942},   {6100, 6102},   {6104, 6106},   {6144, 6154},
    {6468, 6469},   {6622, 6623},   {6686, 6687},   {8208, 8231},
    {8240, 8259},   {8261, 8273},   {8275, 8286},   {8317, 8318},
    {8333, 8334},   {9001, 9002},   {9140, 9142},   {10088, 10101},
    {10181, 10182}, {10214, 10219}, {10627, 10648}, {10712, 10715},
    {10748, 10749}, {11513, 11516}, {11518, 11519}, {11776, 11799},
    {11804, 11805}, {12289, 12291}, {12296, 12305}, {12308, 12319},
    {12336, 12336}, {12349, 12349}, {12448, 12448}, {12539, 12539},
    {64830, 64831}, {65040, 65049}, {65072, 65106}, {65108, 65121},
    {65123, 65123}, {65128, 65128}, {65130, 65131}, {65281, 65283},
    {65285, 65290}, {65292, 65295}, {65306, 65307}, {65311, 65312},
    {65339, 65341}, {65343, 65343}, {65371, 65371}, {65373, 65373},
    {65375, 65381}, {65792, 65793}, {66463, 66463}, {68176, 68184},
    {-1, -1}};
// Replaces every ASCII digit in *form with '9', in place. Used to collapse
// all numbers onto one surface form.
void NormalizeDigits(string *form) {
  for (char &c : *form) {
    if (c >= '0' && c <= '9') c = '9';
  }
}
}
// namespace utils
}
// namespace syntaxnet
syntaxnet/syntaxnet/utils.h
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef $TARGETDIR_UTILS_H_
#define $TARGETDIR_UTILS_H_
#include <functional>
#include <string>
#include <vector>
#include <unordered_set>
#include "syntaxnet/base.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/default/integral_types.h"
#include "tensorflow/core/platform/mutex.h"
#include "util/utf8/unicodetext.h"
namespace
syntaxnet
{
namespace
utils
{
// ASCII number parsing. Each function parses the whole of `c_str` (strtol
// base-0 semantics: "0x" hex and leading-"0" octal prefixes are honored by
// the integer variants) and returns true iff the entire string was consumed.
bool ParseInt32(const char *c_str, int *value);
bool ParseInt64(const char *c_str, int64 *value);
bool ParseDouble(const char *c_str, double *value);
// Parses `str` with the conversion function `func`, CHECK-failing if the
// conversion does not succeed.
template <typename T>
T ParseUsing(const string &str, std::function<bool(const char *, T *)> func) {
  T parsed;
  CHECK(func(str.c_str(), &parsed)) << "Failed to convert: " << str;
  return parsed;
}

// Same as above, but an empty `str` yields `defval` instead of invoking the
// conversion function.
template <typename T>
T ParseUsing(const string &str, T defval,
             std::function<bool(const char *, T *)> func) {
  if (str.empty()) return defval;
  return ParseUsing<T>(str, func);
}
// Returns a C-escaped copy of `src`: \n, \r, \t, quotes and backslash are
// escaped symbolically; other non-printable bytes become octal escapes.
string CEscape(const string &src);

// Splits `text` on `delim`. An empty input yields an empty vector; adjacent
// or trailing delimiters produce empty pieces.
std::vector<string> Split(const string &text, char delim);
// Joins the elements of `s` into a single string, inserting `sep` between
// consecutive elements. An empty vector yields the empty string.
template <typename T>
string Join(const std::vector<T> &s, const char *sep) {
  string joined;
  for (size_t i = 0; i < s.size(); ++i) {
    if (i > 0) tensorflow::strings::StrAppend(&joined, sep);
    tensorflow::strings::StrAppend(&joined, s[i]);
  }
  return joined;
}
// Joins path components, inserting '/' separators as needed (see utils.cc
// for the exact separator-collapsing rules).
string JoinPath(std::initializer_list<StringPiece> paths);

// In-place whitespace stripping; each returns the number of bytes removed.
size_t RemoveLeadingWhitespace(tensorflow::StringPiece *text);
size_t RemoveTrailingWhitespace(tensorflow::StringPiece *text);
size_t RemoveWhitespaceContext(tensorflow::StringPiece *text);

// Hashes the `n` bytes at `data` to a 32-bit value mixed with `seed`.
uint32 Hash32(const char *data, size_t n, uint32 seed);
// Deletes all the elements in an STL container and clears the container.
// Suitable for a vector, set, hash_set, or any other STL container of
// pointers that defines sensible begin(), end(), and clear() methods.
// A null container pointer makes this a no-op.
template <typename T>
void STLDeleteElements(T *container) {
  if (container == nullptr) return;
  // Deleting the pointed-to objects does not touch the container itself, so
  // a plain traversal is safe; the dangling pointers are dropped by clear().
  for (auto *element : *container) {
    delete element;
  }
  container->clear();
}
// Returns lower-cased version of s.
string
Lowercase
(
tensorflow
::
StringPiece
s
);
// Helpers for deciding whether tokens and POS tags count as punctuation when
// scoring parses.
class PunctuationUtil {
 public:
  // Unicode character ranges for punctuation characters according to CoNLL.
  struct CharacterRange {
    int first;
    int last;
  };

  // Ascending table of inclusive ranges terminated by {-1, -1}; defined in
  // utils.cc.
  static CharacterRange kPunctuation[];

  // Returns true if the Unicode code point `u` lies in one of the
  // punctuation ranges. Relies on kPunctuation being sorted ascending.
  static bool IsPunctuation(int u) {
    for (int i = 0; kPunctuation[i].first > 0; ++i) {
      if (u < kPunctuation[i].first) return false;
      if (u <= kPunctuation[i].last) return true;
    }
    return false;
  }

  // Determines if `tag` is a punctuation POS tag, i.e. consists solely of
  // the characters , : . ' and `. NOTE: an empty tag vacuously returns true
  // (callers such as ScoreToken check for emptiness separately).
  static bool IsPunctuationTag(const string &tag) {
    for (size_t i = 0; i < tag.length(); ++i) {
      switch (tag[i]) {
        case ',':
        case ':':
        case '.':
        case '\'':
        case '`':
          break;
        default:
          return false;
      }
    }
    return true;
  }

  // Returns true if `word` (UTF-8) consists entirely of punctuation code
  // points. An empty word vacuously returns true.
  static bool IsPunctuationToken(const string &word) {
    UnicodeText text;
    text.PointToUTF8(word.c_str(), word.length());
    for (UnicodeText::const_iterator it = text.begin(); it != text.end();
         ++it) {
      if (!IsPunctuation(*it)) return false;
    }
    return true;
  }

  // Returns true if `tag` is non-empty and contains only punctuation or
  // paren symbols.
  static bool IsPunctuationTagOrParens(const string &tag) {
    if (tag.empty()) return false;
    for (size_t i = 0; i < tag.length(); ++i) {
      switch (tag[i]) {
        case '(':
        case ')':
        case ',':
        case ':':
        case '.':
        case '\'':
        case '`':
          break;
        default:
          return false;
      }
    }
    return true;
  }

  // Decides whether to score a token given its word, POS tag, and the
  // scoring strategy: "default" skips punctuation-only (non-empty) tags,
  // "conllx" skips punctuation-only words, "ignore_parens" also skips paren
  // tags, "" scores everything; any other value CHECK-fails.
  static bool ScoreToken(const string &word, const string &tag,
                         const string &scoring_type) {
    if (scoring_type == "default") {
      return tag.empty() || !IsPunctuationTag(tag);
    } else if (scoring_type == "conllx") {
      return !IsPunctuationToken(word);
    } else if (scoring_type == "ignore_parens") {
      return !IsPunctuationTagOrParens(tag);
    }
    CHECK(scoring_type.empty()) << "Unknown scoring strategy " << scoring_type;
    return true;
  }
};
// Normalizes digits in *form; takes a mutable string, so presumably rewrites
// it in place — see utils.cc for the definition.
void NormalizeDigits(string *form);

}  // namespace utils
}  // namespace syntaxnet

#endif  // $TARGETDIR_UTILS_H_
syntaxnet/syntaxnet/workspace.cc
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/workspace.h"
#include "tensorflow/core/lib/strings/strcat.h"
namespace
syntaxnet
{
// Renders the registry contents as one "<type> :: <name>" entry per line,
// for debugging/logging.
string WorkspaceRegistry::DebugString() const {
  string debug;
  for (const auto &entry : workspace_names_) {
    const string &type_name = workspace_types_.at(entry.first);
    for (const string &workspace_name : entry.second) {
      tensorflow::strings::StrAppend(&debug, "\n  ", type_name, " :: ",
                                     workspace_name);
    }
  }
  return debug;
}
// Creates a vector workspace with `size` value-initialized (zero) elements.
VectorIntWorkspace::VectorIntWorkspace(int size) : elements_(size) {}

// Creates a vector workspace with `size` elements, each set to `value`.
VectorIntWorkspace::VectorIntWorkspace(int size, int value)
    : elements_(size, value) {}

// Creates a vector workspace holding a copy of `elements`.
VectorIntWorkspace::VectorIntWorkspace(const vector<int> &elements)
    : elements_(elements) {}

// Human-readable type name recorded by WorkspaceRegistry::Request().
string VectorIntWorkspace::TypeName() { return "Vector"; }
// Creates a workspace of `size` empty int vectors.
VectorVectorIntWorkspace::VectorVectorIntWorkspace(int size)
    : elements_(size) {}

// Human-readable type name recorded by WorkspaceRegistry::Request().
string VectorVectorIntWorkspace::TypeName() { return "VectorVector"; }
}
// namespace syntaxnet
syntaxnet/syntaxnet/workspace.h
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Notes on thread-safety: All of the classes here are thread-compatible. More
// specifically, the registry machinery is thread-safe, as long as each thread
// performs feature extraction on a different Sentence object.
#ifndef $TARGETDIR_WORKSPACE_H_
#define $TARGETDIR_WORKSPACE_H_
#include <string>
#include <typeindex>
#include <unordered_map>
#include <utility>
#include <vector>
#include "syntaxnet/utils.h"
namespace
syntaxnet
{
// A base class for shared workspaces. Derived classes implement a static member
// function TypeName() which returns a human readable string name for the class.
// Instances are stored and deleted through Workspace pointers (see
// WorkspaceSet), hence the virtual destructor.
class Workspace {
 public:
  // Polymorphic destructor.
  virtual ~Workspace() {}

 protected:
  // Create an empty workspace. Protected: only derived classes are
  // constructible.
  Workspace() {}

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(Workspace);
};
// A registry that keeps track of workspaces. It maps each workspace type W
// (via typeid) to the ordered list of workspace names requested for that
// type; the position in that list is the workspace's index.
class WorkspaceRegistry {
 public:
  // Create an empty registry.
  WorkspaceRegistry() {}

  // Returns the index of a named workspace, adding it to the registry first
  // if necessary. The index is stable: repeated requests for the same
  // (type, name) pair return the same value.
  template <class W>
  int Request(const string &name) {
    const std::type_index id = std::type_index(typeid(W));
    workspace_types_[id] = W::TypeName();
    vector<string> &names = workspace_names_[id];
    // Fixed: use size_t for the index to avoid a signed/unsigned comparison
    // with names.size(), and cast explicitly on return instead of narrowing
    // implicitly.
    for (size_t i = 0; i < names.size(); ++i) {
      if (names[i] == name) return static_cast<int>(i);
    }
    names.push_back(name);
    return static_cast<int>(names.size()) - 1;
  }

  // Read-only access to the per-type workspace name lists, e.g. for
  // WorkspaceSet::Reset().
  const std::unordered_map<std::type_index, vector<string> > &WorkspaceNames()
      const {
    return workspace_names_;
  }

  // Returns a string describing the registered workspaces.
  string DebugString() const;

 private:
  // Workspace type names, indexed as workspace_types_[typeid].
  std::unordered_map<std::type_index, string> workspace_types_;

  // Workspace names, indexed as workspace_names_[typeid][workspace].
  std::unordered_map<std::type_index, vector<string> > workspace_names_;

  TF_DISALLOW_COPY_AND_ASSIGN(WorkspaceRegistry);
};
// A typed collection of workspaces. The workspaces are indexed according to an
// external WorkspaceRegistry. If the WorkspaceSet is const, the contents are
// also immutable. The set OWNS the workspaces given to Set() and deletes them
// on Reset()/destruction.
class WorkspaceSet {
 public:
  // Resetting against a default-constructed (empty) registry deletes all
  // owned workspaces and leaves the set empty.
  ~WorkspaceSet() { Reset(WorkspaceRegistry()); }

  // Returns true if a workspace has been set.
  // Precondition (DCHECKed): Reset() was called with a registry in which
  // index was requested for type W.
  template <class W>
  bool Has(int index) const {
    const std::type_index id = std::type_index(typeid(W));
    DCHECK(workspaces_.find(id) != workspaces_.end());
    DCHECK_LT(index, workspaces_.find(id)->second.size());
    return workspaces_.find(id)->second[index] != nullptr;
  }

  // Returns an indexed workspace; the workspace must have been set.
  template <class W>
  const W &Get(int index) const {
    DCHECK(Has<W>(index));
    // The cast is only safe because the slot for typeid(W) was filled by
    // Set<W>() with a pointer to an actual W.
    const Workspace *w =
        workspaces_.find(std::type_index(typeid(W)))->second[index];
    return reinterpret_cast<const W &>(*w);
  }

  // Sets an indexed workspace; this takes ownership of the workspace, which
  // must have been new-allocated. It is an error to set a workspace twice.
  template <class W>
  void Set(int index, W *workspace) {
    const std::type_index id = std::type_index(typeid(W));
    DCHECK(workspaces_.find(id) != workspaces_.end());
    DCHECK_LT(index, workspaces_[id].size());
    DCHECK(workspaces_[id][index] == nullptr);  // error to set twice
    DCHECK(workspace != nullptr);
    workspaces_[id][index] = workspace;
  }

  // Deletes all currently owned workspaces, then re-sizes the slot tables to
  // match the given registry; every slot starts out unset (null).
  void Reset(const WorkspaceRegistry &registry) {
    // Deallocate current workspaces.
    for (auto &it : workspaces_) {
      for (size_t index = 0; index < it.second.size(); ++index) {
        delete it.second[index];
      }
    }
    workspaces_.clear();
    // Allocate space for new workspaces. vector::resize value-initializes
    // the new Workspace* slots to nullptr.
    for (auto &it : registry.WorkspaceNames()) {
      workspaces_[it.first].resize(it.second.size());
    }
  }

 private:
  // The set of workspaces, indexed as workspaces_[typeid][index].
  std::unordered_map<std::type_index, vector<Workspace *> > workspaces_;
};
// A workspace that wraps around a single int.
class SingletonIntWorkspace : public Workspace {
 public:
  // Default-initializes the int value (to 0, via the member initializer).
  SingletonIntWorkspace() {}

  // Initializes the int with the given value.
  explicit SingletonIntWorkspace(int value) : value_(value) {}

  // Returns the name of this type of workspace, as recorded by
  // WorkspaceRegistry::Request().
  static string TypeName() { return "SingletonInt"; }

  // Returns the int value.
  int get() const { return value_; }

  // Sets the int value.
  void set(int value) { value_ = value; }

 private:
  // The enclosed int.
  int value_ = 0;
};
// A workspace that wraps around a vector of int.
class VectorIntWorkspace : public Workspace {
 public:
  // Creates a vector of the given size (elements zero-initialized).
  explicit VectorIntWorkspace(int size);

  // Creates a vector initialized with the given array.
  explicit VectorIntWorkspace(const vector<int> &elements);

  // Creates a vector of the given size, with each element initialized to the
  // given value.
  VectorIntWorkspace(int size, int value);

  // Returns the name of this type of workspace.
  static string TypeName();

  // Returns the i'th element. No bounds checking (vector::operator[]).
  int element(int i) const { return elements_[i]; }

  // Sets the i'th element. No bounds checking.
  void set_element(int i, int value) { elements_[i] = value; }

 private:
  // The enclosed vector.
  vector<int> elements_;
};
// A workspace that wraps around a vector of vector of int.
class VectorVectorIntWorkspace : public Workspace {
 public:
  // Creates a vector of empty vectors of the given size.
  explicit VectorVectorIntWorkspace(int size);

  // Returns the name of this type of workspace.
  static string TypeName();

  // Returns the i'th vector of elements. No bounds checking
  // (vector::operator[]).
  const vector<int> &elements(int i) const { return elements_[i]; }

  // Mutable access to the i'th vector of elements. No bounds checking.
  vector<int> *mutable_elements(int i) { return &(elements_[i]); }

 private:
  // The enclosed vector of vector of elements.
  vector<vector<int> > elements_;
};
}
// namespace syntaxnet
#endif // $TARGETDIR_WORKSPACE_H_
tensorflow
@
3402f51e
Subproject commit 3402f51ecd11a26d0c071b1d06b4edab1b0ef351
syntaxnet/third_party/utf/BUILD
0 → 100644
View file @
32ab5a58
# Plan 9 / Lucent UTF-8 rune library (see README in this directory for the
# license notice).
licenses(["notice"])

cc_library(
    name = "utf",
    srcs = [
        "rune.c",
        "runestrcat.c",
        "runestrchr.c",
        "runestrcmp.c",
        "runestrcpy.c",
        "runestrdup.c",
        "runestrecpy.c",
        "runestrlen.c",
        "runestrncat.c",
        "runestrncmp.c",
        "runestrncpy.c",
        "runestrrchr.c",
        "runestrstr.c",
        "runetype.c",
        "utfecpy.c",
        "utflen.c",
        "utfnlen.c",
        "utfrrune.c",
        "utfrune.c",
        "utfutf.c",
    ],
    # runetypebody.c is listed in hdrs because it is textually included by
    # runetype.c rather than compiled on its own.
    hdrs = [
        "runetypebody.c",
        "utf.h",
        "utfdef.h",
    ],
    includes = ["."],
    visibility = ["//visibility:public"],
)
syntaxnet/third_party/utf/README
0 → 100644
View file @
32ab5a58
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 1998-2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
syntaxnet/third_party/utf/rune.c
0 → 100644
View file @
32ab5a58
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "third_party/utf/utf.h"
#include "third_party/utf/utfdef.h"
/*
 * UTF-8 encoding constants.  BitN is the number of payload bits carried by
 * the lead byte of an N-byte sequence (Bitx for continuation bytes); TN is
 * the tag prefix of that lead byte; RuneN is the largest code point
 * representable in N bytes.
 */
enum
{
	Bit1	= 7,	/* payload bits in a 1-byte sequence (0xxxxxxx) */
	Bitx	= 6,	/* payload bits in a continuation byte (10xxxxxx) */
	Bit2	= 5,
	Bit3	= 4,
	Bit4	= 3,
	Bit5	= 2,

	T1	= ((1 << (Bit1 + 1)) - 1) ^ 0xFF,	/* 0000 0000 */
	Tx	= ((1 << (Bitx + 1)) - 1) ^ 0xFF,	/* 1000 0000 */
	T2	= ((1 << (Bit2 + 1)) - 1) ^ 0xFF,	/* 1100 0000 */
	T3	= ((1 << (Bit3 + 1)) - 1) ^ 0xFF,	/* 1110 0000 */
	T4	= ((1 << (Bit4 + 1)) - 1) ^ 0xFF,	/* 1111 0000 */
	T5	= ((1 << (Bit5 + 1)) - 1) ^ 0xFF,	/* 1111 1000 */

	Rune1	= (1 << (Bit1 + 0 * Bitx)) - 1,	/* 0000 0000 0111 1111 */
	Rune2	= (1 << (Bit2 + 1 * Bitx)) - 1,	/* 0000 0111 1111 1111 */
	Rune3	= (1 << (Bit3 + 2 * Bitx)) - 1,	/* 1111 1111 1111 1111 */
	Rune4	= (1 << (Bit4 + 3 * Bitx)) - 1,	/* 0001 1111 1111 1111 1111 1111 */

	Maskx	= (1 << Bitx) - 1,	/* 0011 1111 */
	Testx	= Maskx ^ 0xFF,		/* 1100 0000 */

	Bad	= Runeerror,	/* decode failures yield the replacement rune */
};
/*
 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
 * This is a slower but "safe" version of the old chartorune
 * that works on strings that are not necessarily null-terminated.
 *
 * If you know for sure that your string is null-terminated,
 * chartorune will be a bit faster.
 *
 * It is guaranteed not to attempt to access "length"
 * past the incoming pointer.  This is to avoid
 * possible access violations.  If the string appears to be
 * well-formed but incomplete (i.e., to get the whole Rune
 * we'd need to read past str+length) then we'll set the Rune
 * to Bad and return 0.
 *
 * Note that if we have decoding problems for other
 * reasons, we return 1 instead of 0.
 */
int
charntorune(Rune *rune, const char *str, int length)
{
	int c, c1, c2, c3;
	long l;

	/* When we're not allowed to read anything */
	if(length <= 0) {
		goto badlen;
	}

	/*
	 * one character sequence (7-bit value)
	 *	00000-0007F => T1
	 */
	c = *(uchar*)str;
	if(c < Tx) {
		*rune = c;
		return 1;
	}

	// If we can't read more than one character we must stop
	if(length <= 1) {
		goto badlen;
	}

	/*
	 * two character sequence (11-bit value)
	 *	0080-07FF => T2 Tx
	 */
	c1 = *(uchar*)(str+1) ^ Tx;
	if(c1 & Testx)
		goto bad;	/* not a continuation byte */
	if(c < T3) {
		if(c < T2)
			goto bad;	/* lone continuation byte as lead */
		l = ((c << Bitx) | c1) & Rune2;
		if(l <= Rune1)
			goto bad;	/* overlong encoding */
		*rune = l;
		return 2;
	}

	// If we can't read more than two characters we must stop
	if(length <= 2) {
		goto badlen;
	}

	/*
	 * three character sequence (16-bit value)
	 *	0800-FFFF => T3 Tx Tx
	 */
	c2 = *(uchar*)(str+2) ^ Tx;
	if(c2 & Testx)
		goto bad;
	if(c < T4) {
		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
		if(l <= Rune2)
			goto bad;	/* overlong encoding */
		*rune = l;
		return 3;
	}

	if(length <= 3)
		goto badlen;

	/*
	 * four character sequence (21-bit value)
	 *	10000-1FFFFF => T4 Tx Tx Tx
	 */
	c3 = *(uchar*)(str+3) ^ Tx;
	if(c3 & Testx)
		goto bad;
	if(c < T5) {
		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
		if(l <= Rune3)
			goto bad;	/* overlong encoding */
		if(l > Runemax)
			goto bad;	/* beyond the valid rune range */
		*rune = l;
		return 4;
	}

	// Support for 5-byte or longer UTF-8 would go here, but
	// since we don't have that, we'll just fall through to bad.

	/*
	 * bad decoding: malformed sequence, consume one byte
	 */
bad:
	*rune = Bad;
	return 1;

	/*
	 * incomplete but possibly well-formed sequence: consume nothing
	 */
badlen:
	*rune = Bad;
	return 0;
}
/*
 * This is the older "unsafe" version, which works fine on
 * null-terminated strings.
 *
 * Decodes one UTF-8 sequence at str into *rune and returns the number of
 * bytes consumed (1-4).  On a malformed sequence, sets *rune to Bad
 * (Runeerror) and returns 1.  It reads bytes str[1..3] before fully
 * validating the sequence, so the input must be NUL-terminated; use
 * charntorune for length-bounded buffers.
 */
int
chartorune(Rune *rune, const char *str)
{
	int c, c1, c2, c3;
	long l;

	/*
	 * one character sequence
	 *	00000-0007F => T1
	 */
	c = *(uchar*)str;
	if(c < Tx) {
		*rune = c;
		return 1;
	}

	/*
	 * two character sequence
	 *	0080-07FF => T2 Tx
	 */
	c1 = *(uchar*)(str+1) ^ Tx;
	if(c1 & Testx)
		goto bad;	/* not a continuation byte */
	if(c < T3) {
		if(c < T2)
			goto bad;	/* lone continuation byte as lead */
		l = ((c << Bitx) | c1) & Rune2;
		if(l <= Rune1)
			goto bad;	/* overlong encoding */
		*rune = l;
		return 2;
	}

	/*
	 * three character sequence
	 *	0800-FFFF => T3 Tx Tx
	 */
	c2 = *(uchar*)(str+2) ^ Tx;
	if(c2 & Testx)
		goto bad;
	if(c < T4) {
		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
		if(l <= Rune2)
			goto bad;	/* overlong encoding */
		*rune = l;
		return 3;
	}

	/*
	 * four character sequence (21-bit value)
	 *	10000-1FFFFF => T4 Tx Tx Tx
	 */
	c3 = *(uchar*)(str+3) ^ Tx;
	if(c3 & Testx)
		goto bad;
	if(c < T5) {
		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
		if(l <= Rune3)
			goto bad;	/* overlong encoding */
		if(l > Runemax)
			goto bad;	/* beyond the valid rune range */
		*rune = l;
		return 4;
	}

	/*
	 * Support for 5-byte or longer UTF-8 would go here, but
	 * since we don't have that, we'll just fall through to bad.
	 */

	/*
	 * bad decoding
	 */
bad:
	*rune = Bad;
	return 1;
}
/*
 * Decodes one rune from str (reading at most `length` bytes) into *rune and
 * stores the number of bytes consumed in *consumed.  Returns non-zero if the
 * decode was valid.  A result of Runeerror is still considered valid when 3
 * bytes were consumed, because that is the legitimate 3-byte encoding of
 * U+FFFD itself; charntorune signals actual failures by consuming 0
 * (truncated input) or 1 (malformed byte) instead.
 */
int
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
	*consumed = charntorune(rune, str, length);
	return *rune != Runeerror || *consumed == 3;
}
/*
 * Encodes *rune as UTF-8 into str (which must have room for at least 4
 * bytes) and returns the number of bytes written (1-4).  Runes above
 * Runemax are replaced by Runeerror before encoding.
 */
int
runetochar(char *str, const Rune *rune)
{
	/* Runes are signed, so convert to unsigned for range check. */
	unsigned long c;

	/*
	 * one character sequence
	 *	00000-0007F => 00-7F
	 */
	c = *rune;
	if(c <= Rune1) {
		str[0] = c;
		return 1;
	}

	/*
	 * two character sequence
	 *	0080-07FF => T2 Tx
	 */
	if(c <= Rune2) {
		str[0] = T2 | (c >> 1*Bitx);
		str[1] = Tx | (c & Maskx);
		return 2;
	}

	/*
	 * If the Rune is out of range, convert it to the error rune.
	 * Do this test here because the error rune encodes to three bytes.
	 * Doing it earlier would duplicate work, since an out of range
	 * Rune wouldn't have fit in one or two bytes.
	 */
	if(c > Runemax)
		c = Runeerror;

	/*
	 * three character sequence
	 *	0800-FFFF => T3 Tx Tx
	 */
	if(c <= Rune3) {
		str[0] = T3 | (c >> 2*Bitx);
		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
		str[2] = Tx | (c & Maskx);
		return 3;
	}

	/*
	 * four character sequence (21-bit value)
	 *	10000-1FFFFF => T4 Tx Tx Tx
	 */
	str[0] = T4 | (c >> 3*Bitx);
	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
	str[3] = Tx | (c & Maskx);
	return 4;
}
/*
 * Returns the number of bytes needed to encode `rune` in UTF-8, by encoding
 * it into a scratch buffer and reporting how many bytes were used.
 */
int
runelen(Rune rune)
{
	char str[10];	/* UTF-8 needs at most 4 bytes here; 10 is ample */

	return runetochar(str, &rune);
}
/*
 * Returns the total number of bytes needed to encode the first nrune
 * runes of r as UTF-8.
 */
int
runenlen(const Rune *r, int nrune)
{
	int total;
	ulong c;	/* Rune is signed, so use unsigned for range check. */

	total = 0;
	while(nrune--) {
		c = *r++;
		if(c <= Rune1) {
			total += 1;
		} else if(c <= Rune2) {
			total += 2;
		} else if(c <= Rune3) {
			total += 3;
		} else if(c <= Runemax) {
			total += 4;
		} else {
			/* out-of-range runes encode as Runeerror (3 bytes),
			 * see runetochar */
			total += 3;
		}
	}
	return total;
}
/*
 * Reports whether the first n bytes of str hold at least one complete
 * UTF-8 sequence.  Returns 1 if complete, 0 otherwise.
 */
int
fullrune(const char *str, int n)
{
	int c;

	if(n <= 0)
		return 0;	/* nothing to inspect */
	c = *(uchar*)str;
	if(c < Tx)
		return 1;	/* one-byte sequence */
	if(n <= 1)
		return 0;
	if(c < T3)
		return 1;	/* two-byte lead with one byte available */
	if(n <= 2)
		return 0;
	if(c < T4 || n > 3)
		return 1;	/* three bytes suffice, or four are available */
	return 0;
}
syntaxnet/third_party/utf/runestrcat.c
0 → 100644
View file @
32ab5a58
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "third_party/utf/utf.h"
#include "third_party/utf/utfdef.h"
/*
 * Appends rune string s2 to the end of rune string s1 (both 0-terminated)
 * and returns s1.  s1 must have room for the combined result.
 */
Rune*
runestrcat(Rune *s1, const Rune *s2)
{
	/* Locate s1's terminating 0 and copy s2 (with its terminator) there.
	 * The cast drops the const that runestrchr's return type carries. */
	runestrcpy((Rune*)runestrchr(s1, 0), s2);
	return s1;
}
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment