Commit 7d30a017 authored by Ivan Bogatyy's avatar Ivan Bogatyy Committed by calberti
Browse files

Release DRAGNN (#1177)

* Release DRAGNN

* Update CoNLL evaluation table & evaluator.py
parent c774cc95
49
NN 285194
IN 228165
DT 179147
NNP 175147
JJ 125667
NNS 115732
, 97481
. 85938
RB 78513
VB 63952
CC 57554
VBD 56635
CD 55674
PRP 55244
VBZ 48126
VBN 44458
VBG 34524
VBP 33669
TO 28772
MD 22364
PRP$ 20706
HYPH 18526
POS 14905
`` 12193
'' 12154
WDT 10267
: 8713
$ 7993
WP 7336
RP 7335
WRB 6634
JJR 6295
NNPS 5917
-RRB- 3904
-LRB- 3840
JJS 3596
RBR 3186
EX 2733
UH 1521
RBS 1467
PDT 1271
FW 928
NFP 844
SYM 652
ADD 476
LS 392
WP$ 332
GW 184
AFX 42
# All targets in this package are visible workspace-wide.
package(default_visibility = ["//visibility:public"])

# Header-only helper wrapping allocator callbacks + padding configuration
# for the bulk fixed-feature extraction op.
cc_library(
    name = "bulk_feature_extractor",
    hdrs = ["bulk_feature_extractor.h"],
    deps = [
        # TensorFlow core utility library (platform integer types).
        "@org_tensorflow//tensorflow/core:lib",
    ],
)
#ifndef NLP_SAFT_OPENSOURCE_DRAGNN_COMPONENTS_UTIL_BULK_FEATURE_EXTRACTOR_H_
#define NLP_SAFT_OPENSOURCE_DRAGNN_COMPONENTS_UTIL_BULK_FEATURE_EXTRACTOR_H_
#include <functional>
#include <utility>
#include "tensorflow/core/platform/types.h"
namespace syntaxnet {
namespace dragnn {
// Provides a wrapper for allocator functions and padding data for the Bulk
// ExtractFixedFeatures operation.
class BulkFeatureExtractor {
 public:
  // Creates a BulkFeatureExtractor with the given allocator functions and
  // padding configuration. Each allocator function takes a channel and an
  // element count and returns a contiguous block of memory that is associated
  // with that channel (the caller can decide what that means). If
  // `use_padding` is true, `pad_to_step` and `pad_to_element` are used in
  // place of the actual counts when computing indices in GetIndex().
  BulkFeatureExtractor(
      std::function<tensorflow::int32 *(int channel, int num_elements)>
          allocate_indices_by_channel,
      std::function<tensorflow::int64 *(int channel, int num_elements)>
          allocate_ids_by_channel,
      std::function<float *(int channel, int num_elements)>
          allocate_weights_by_channel,
      bool use_padding, int pad_to_step, int pad_to_element)
      : use_padding_(use_padding),
        pad_to_step_(pad_to_step),
        pad_to_element_(pad_to_element),
        allocate_indices_by_channel_(std::move(allocate_indices_by_channel)),
        allocate_ids_by_channel_(std::move(allocate_ids_by_channel)),
        allocate_weights_by_channel_(std::move(allocate_weights_by_channel)) {}

  // Creates a BulkFeatureExtractor with allocator functions as above, but
  // with padding disabled. Useful when you know your caller will never need
  // to pad. Delegates to the main constructor so the member-init list is not
  // duplicated.
  BulkFeatureExtractor(
      std::function<tensorflow::int32 *(int channel, int num_elements)>
          allocate_indices_by_channel,
      std::function<tensorflow::int64 *(int channel, int num_elements)>
          allocate_ids_by_channel,
      std::function<float *(int channel, int num_elements)>
          allocate_weights_by_channel)
      : BulkFeatureExtractor(std::move(allocate_indices_by_channel),
                             std::move(allocate_ids_by_channel),
                             std::move(allocate_weights_by_channel),
                             /*use_padding=*/false, /*pad_to_step=*/-1,
                             /*pad_to_element=*/-1) {}

  // Invokes the index memory allocator for `channel`.
  tensorflow::int32 *AllocateIndexMemory(int channel, int num_elements) const {
    return allocate_indices_by_channel_(channel, num_elements);
  }

  // Invokes the ID memory allocator for `channel`.
  tensorflow::int64 *AllocateIdMemory(int channel, int num_elements) const {
    return allocate_ids_by_channel_(channel, num_elements);
  }

  // Invokes the weight memory allocator for `channel`.
  float *AllocateWeightMemory(int channel, int num_elements) const {
    return allocate_weights_by_channel_(channel, num_elements);
  }

  // Given the total number of steps and total number of elements for a given
  // feature, calculates the flat index (not ID) of that feature. Depending on
  // how the BulkFeatureExtractor was constructed, it uses either the given
  // step/element counts or the padded counts supplied at construction.
  // Layout: feature-major, then element, then step.
  int GetIndex(int total_steps, int num_elements, int feature_idx,
               int element_idx, int step_idx) const {
    const int steps = use_padding_ ? pad_to_step_ : total_steps;
    const int elements = use_padding_ ? pad_to_element_ : num_elements;
    const int feature_offset = elements * steps;
    const int element_offset = steps;
    return (feature_idx * feature_offset) + (element_idx * element_offset) +
           step_idx;
  }

 private:
  const bool use_padding_;  // Whether GetIndex() uses the padded sizes.
  const int pad_to_step_;   // Padded step count (-1 when padding disabled).
  const int pad_to_element_;  // Padded element count (-1 when disabled).
  const std::function<tensorflow::int32 *(int, int)>
      allocate_indices_by_channel_;
  const std::function<tensorflow::int64 *(int, int)> allocate_ids_by_channel_;
  const std::function<float *(int, int)> allocate_weights_by_channel_;
};
} // namespace dragnn
} // namespace syntaxnet
#endif // NLP_SAFT_OPENSOURCE_DRAGNN_COMPONENTS_UTIL_BULK_FEATURE_EXTRACTOR_H_
# Generator that writes the CoNLL2017 parser MasterSpec textproto.
py_binary(
    name = "make_parser_spec",
    srcs = ["make_parser_spec.py"],
    deps = [
        "//dragnn/protos:spec_py_pb2",
        "//dragnn/python:spec_builder",
        "@org_tensorflow//tensorflow:tensorflow_py",
    ],
)
#!/bin/sh
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# A script to train the CONLL2017 baseline.
#
# Usage: train.sh TRAINING_CORPUS DEV_CORPUS

set -e

# Fail fast with a usage message instead of silently training on empty
# corpus paths when arguments are missing.
if [ "$#" -ne 2 ]; then
  echo "Usage: $0 training_corpus dev_corpus" >&2
  exit 1
fi

language=English
output_dir=./trained-"$language"
training_corpus="$1"
dev_corpus="$2"

# Build the trainer and the spec generator.
bazel build -c opt //dragnn/tools:trainer //dragnn/conll2017:make_parser_spec

# Quoted so an output path containing spaces does not word-split.
mkdir -p "$output_dir"

# Generate the model spec.
bazel-bin/dragnn/conll2017/make_parser_spec \
  --spec_file="$output_dir/parser_spec.textproto"

# Train; checkpoints and TensorBoard logs are written under $output_dir.
bazel-bin/dragnn/tools/trainer \
  --logtostderr \
  --compute_lexicon \
  --dragnn_spec="$output_dir/parser_spec.textproto" \
  --resource_path="$output_dir/resources" \
  --training_corpus_path="$training_corpus" \
  --tune_corpus_path="$dev_corpus" \
  --tensorboard_dir="$output_dir/tensorboard" \
  --checkpoint_filename="$output_dir/checkpoint.model"
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Construct the spec for the CONLL2017 Parser baseline."""
import tensorflow as tf
from tensorflow.python.platform import gfile
from dragnn.protos import spec_pb2
from dragnn.python import spec_builder
flags = tf.app.flags
FLAGS = flags.FLAGS
# Output path for the generated MasterSpec textproto.
flags.DEFINE_string('spec_file', 'parser_spec.textproto',
                    'Filename to save the spec to.')
def main(unused_argv):
  """Constructs the CoNLL2017 parser baseline spec and writes it to disk.

  Chains four DRAGNN components (char LSTM -> lookahead LSTM -> tagger ->
  arc-standard parser), assembles them into a MasterSpec proto, and
  serializes the spec as text to FLAGS.spec_file.
  """
  # Left-to-right, character-based LSTM; its states are consumed by the
  # lookahead component as word representations.
  char_lstm = spec_builder.ComponentSpecBuilder('char_lstm')
  char_lstm.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork',
      hidden_layer_sizes='256')
  char_lstm.set_transition_system(name='char-shift-only', left_to_right='true')
  char_lstm.add_fixed_feature(name='chars', fml='char-input.text-char',
                              embedding_dim=16)

  # Lookahead LSTM reads right-to-left so each word representation carries
  # its rightmost context. Word embeddings come from the char model.
  lookahead_lstm = spec_builder.ComponentSpecBuilder('lookahead')
  lookahead_lstm.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork',
      hidden_layer_sizes='256')
  lookahead_lstm.set_transition_system(name='shift-only', left_to_right='false')
  lookahead_lstm.add_link(source=char_lstm, fml='input.last-char-focus',
                          embedding_dim=64)

  # Simple left-to-right LSTM sequence tagger over the lookahead states.
  pos_tagger = spec_builder.ComponentSpecBuilder('tagger')
  pos_tagger.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork',
      hidden_layer_sizes='256')
  pos_tagger.set_transition_system(name='tagger')
  pos_tagger.add_token_link(source=lookahead_lstm, fml='input.focus',
                            embedding_dim=64)

  # Arc-standard dependency parser with a feed-forward decision network.
  dep_parser = spec_builder.ComponentSpecBuilder('parser')
  dep_parser.set_network_unit(name='FeedForwardNetwork',
                              hidden_layer_sizes='256',
                              layer_norm_hidden='true')
  dep_parser.set_transition_system(name='arc-standard')
  dep_parser.add_token_link(source=lookahead_lstm, fml='input.focus',
                            embedding_dim=64)
  dep_parser.add_token_link(
      source=pos_tagger, fml='input.focus stack.focus stack(1).focus',
      embedding_dim=64)

  # Discrete features over the partial parse tree built so far, like in
  # Parsey McParseface.
  label_features = [
      'stack.child(1).label',
      'stack.child(1).sibling(-1).label',
      'stack.child(-1).label',
      'stack.child(-1).sibling(1).label',
      'stack(1).child(1).label',
      'stack(1).child(1).sibling(-1).label',
      'stack(1).child(-1).label',
      'stack(1).child(-1).sibling(1).label',
      'stack.child(2).label',
      'stack.child(-2).label',
      'stack(1).child(2).label',
      'stack(1).child(-2).label',
  ]
  dep_parser.add_fixed_feature(name='labels', embedding_dim=16,
                               fml=' '.join(label_features))

  # Recurrent connection for the arc-standard parser: for both stack tokens,
  # link to the last step that SHIFTed or REDUCEd that token, so the parser
  # can build compositional phrase representations.
  dep_parser.add_link(
      source=dep_parser,  # recurrent connection
      name='rnn-stack',  # unique identifier
      fml='stack.focus stack(1).focus',  # look for both stack tokens
      source_translator='shift-reduce-step',  # maps token indices -> step
      embedding_dim=64)  # project down to 64 dims

  spec = spec_pb2.MasterSpec()
  spec.component.extend(
      [char_lstm.spec, lookahead_lstm.spec, pos_tagger.spec, dep_parser.spec])
  with gfile.FastGFile(FLAGS.spec_file, 'w') as spec_file:
    spec_file.write(str(spec).encode('utf-8'))
# Parse flags and dispatch to main() via the TF app runner.
if __name__ == '__main__':
  tf.app.run()
15
NOUN 25758
VERB 14242
PUNCT 12945
PART 9977
PROPN 8280
NUM 5082
ADV 4323
ADP 4165
ADJ 2318
AUX 2024
PRON 1343
CCONJ 1329
DET 994
X 948
SYM 25
42
punct 12965
nmod 11147
nsubj 7134
obj 6016
nummod 4732
case:suff 4179
acl 4163
root 3797
mark 3445
det 3434
advmod 2962
case 2739
case:dec 2517
conj 2421
obl 2000
dep 1998
mark:relcl 1833
clf 1722
ccomp 1655
amod 1525
xcomp 1382
acl:relcl 1356
cop 1349
cc 1334
nmod:tmod 1199
appos 1089
case:aspect 718
aux 675
case:pref 569
aux:pass 324
csubj 280
flat:foreign 250
nsubj:pass 211
discourse 151
aux:caus 149
advcl 125
mark:advb 79
iobj 61
dislocated 45
mark:comp 17
csubj:pass 5
vocative 1
42
NN 21794
VV 13177
NNP 8280
, 5824
CD 5082
DEC 4350
RB 4323
SFN 4229
IN 4165
NNB 3963
. 3807
JJ 2318
VC 1935
CC 1329
PRP 996
DT 994
EC 942
FW 778
AS 718
MD 681
( 641
) 641
PFA 555
BB 472
'' 331
`` 329
PRD 324
/ 202
: 165
UH 150
DEV 96
HYPH 76
WP 23
SFV 18
XX 17
ADD 8
SFA 7
... 4
PFN 4
LS 3
" 1
VERB 1
" PUNCT
'' PUNCT
( PUNCT
) PUNCT
, PUNCT
. PUNCT
... PUNCT
/ PUNCT
: PUNCT
ADD NOUN
AS PART
BB VERB
CC CCONJ
CD NUM
DEC PART
DEV PART
DT DET
EC PUNCT
FW X
HYPH PUNCT
IN ADP
JJ ADJ
LS X
MD AUX
NN NOUN
NNB NOUN
NNP PROPN
PFA PART
PFN PART
PRD PRON
PRP PRON
RB ADV
SFA PART
SFN PART
SFV PART
UH X
VC VERB
VERB VERB
VV AUX
WP PRON
XX X
`` PUNCT
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment