Commit 7d30a017 authored by Ivan Bogatyy's avatar Ivan Bogatyy Committed by calberti
Browse files

Release DRAGNN (#1177)

* Release DRAGNN

* Update CoNLL evaluation table & evaluator.py
parent c774cc95
49
NN 285194
IN 228165
DT 179147
NNP 175147
JJ 125667
NNS 115732
, 97481
. 85938
RB 78513
VB 63952
CC 57554
VBD 56635
CD 55674
PRP 55244
VBZ 48126
VBN 44458
VBG 34524
VBP 33669
TO 28772
MD 22364
PRP$ 20706
HYPH 18526
POS 14905
`` 12193
'' 12154
WDT 10267
: 8713
$ 7993
WP 7336
RP 7335
WRB 6634
JJR 6295
NNPS 5917
-RRB- 3904
-LRB- 3840
JJS 3596
RBR 3186
EX 2733
UH 1521
RBS 1467
PDT 1271
FW 928
NFP 844
SYM 652
ADD 476
LS 392
WP$ 332
GW 184
AFX 42
# All targets in this package are visible workspace-wide.
package(default_visibility = ["//visibility:public"])

# Header-only helper wrapping allocator callbacks + padding configuration
# for the bulk fixed-feature extraction op.
cc_library(
    name = "bulk_feature_extractor",
    hdrs = ["bulk_feature_extractor.h"],
    deps = [
        # TensorFlow core utility library (platform integer types).
        "@org_tensorflow//tensorflow/core:lib",
    ],
)
#ifndef NLP_SAFT_OPENSOURCE_DRAGNN_COMPONENTS_UTIL_BULK_FEATURE_EXTRACTOR_H_
#define NLP_SAFT_OPENSOURCE_DRAGNN_COMPONENTS_UTIL_BULK_FEATURE_EXTRACTOR_H_
#include <functional>
#include <utility>
#include "tensorflow/core/platform/types.h"
namespace syntaxnet {
namespace dragnn {
// Provides a wrapper for allocator functions and padding data for the Bulk
// ExtractFixedFeatures operation.
class BulkFeatureExtractor {
 public:
  // Creates a BulkFeatureExtractor with the given allocator functions and
  // padding configuration. Each allocator function takes a channel and an
  // element count and returns a contiguous block of memory that is associated
  // with that channel (the caller can decide what that means). If
  // `use_padding` is true, `pad_to_step` and `pad_to_element` are used in
  // place of the actual counts when computing indices in GetIndex().
  BulkFeatureExtractor(
      std::function<tensorflow::int32 *(int channel, int num_elements)>
          allocate_indices_by_channel,
      std::function<tensorflow::int64 *(int channel, int num_elements)>
          allocate_ids_by_channel,
      std::function<float *(int channel, int num_elements)>
          allocate_weights_by_channel,
      bool use_padding, int pad_to_step, int pad_to_element)
      : use_padding_(use_padding),
        pad_to_step_(pad_to_step),
        pad_to_element_(pad_to_element),
        allocate_indices_by_channel_(std::move(allocate_indices_by_channel)),
        allocate_ids_by_channel_(std::move(allocate_ids_by_channel)),
        allocate_weights_by_channel_(std::move(allocate_weights_by_channel)) {}

  // Creates a BulkFeatureExtractor with allocator functions as above, but
  // with padding disabled. Useful when you know your caller will never need
  // to pad. Delegates to the main constructor so the member-init list is not
  // duplicated.
  BulkFeatureExtractor(
      std::function<tensorflow::int32 *(int channel, int num_elements)>
          allocate_indices_by_channel,
      std::function<tensorflow::int64 *(int channel, int num_elements)>
          allocate_ids_by_channel,
      std::function<float *(int channel, int num_elements)>
          allocate_weights_by_channel)
      : BulkFeatureExtractor(std::move(allocate_indices_by_channel),
                             std::move(allocate_ids_by_channel),
                             std::move(allocate_weights_by_channel),
                             /*use_padding=*/false, /*pad_to_step=*/-1,
                             /*pad_to_element=*/-1) {}

  // Invokes the index memory allocator for `channel`.
  tensorflow::int32 *AllocateIndexMemory(int channel, int num_elements) const {
    return allocate_indices_by_channel_(channel, num_elements);
  }

  // Invokes the ID memory allocator for `channel`.
  tensorflow::int64 *AllocateIdMemory(int channel, int num_elements) const {
    return allocate_ids_by_channel_(channel, num_elements);
  }

  // Invokes the weight memory allocator for `channel`.
  float *AllocateWeightMemory(int channel, int num_elements) const {
    return allocate_weights_by_channel_(channel, num_elements);
  }

  // Given the total number of steps and total number of elements for a given
  // feature, calculates the flat index (not ID) of that feature. Depending on
  // how the BulkFeatureExtractor was constructed, it uses either the given
  // step/element counts or the padded counts supplied at construction.
  // Layout: feature-major, then element, then step.
  int GetIndex(int total_steps, int num_elements, int feature_idx,
               int element_idx, int step_idx) const {
    const int steps = use_padding_ ? pad_to_step_ : total_steps;
    const int elements = use_padding_ ? pad_to_element_ : num_elements;
    const int feature_offset = elements * steps;
    const int element_offset = steps;
    return (feature_idx * feature_offset) + (element_idx * element_offset) +
           step_idx;
  }

 private:
  const bool use_padding_;  // Whether GetIndex() uses the padded sizes.
  const int pad_to_step_;   // Padded step count (-1 when padding disabled).
  const int pad_to_element_;  // Padded element count (-1 when disabled).
  const std::function<tensorflow::int32 *(int, int)>
      allocate_indices_by_channel_;
  const std::function<tensorflow::int64 *(int, int)> allocate_ids_by_channel_;
  const std::function<float *(int, int)> allocate_weights_by_channel_;
};
} // namespace dragnn
} // namespace syntaxnet
#endif // NLP_SAFT_OPENSOURCE_DRAGNN_COMPONENTS_UTIL_BULK_FEATURE_EXTRACTOR_H_
# Generator that writes the CoNLL2017 parser MasterSpec textproto.
py_binary(
    name = "make_parser_spec",
    srcs = ["make_parser_spec.py"],
    deps = [
        "//dragnn/protos:spec_py_pb2",
        "//dragnn/python:spec_builder",
        "@org_tensorflow//tensorflow:tensorflow_py",
    ],
)
#!/bin/sh
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# A script to train the CONLL2017 baseline.
#
# Usage: train.sh TRAINING_CORPUS DEV_CORPUS

set -e

# Fail fast with a usage message instead of silently training on empty
# corpus paths when arguments are missing.
if [ "$#" -ne 2 ]; then
  echo "Usage: $0 training_corpus dev_corpus" >&2
  exit 1
fi

language=English
output_dir=./trained-"$language"
training_corpus="$1"
dev_corpus="$2"

# Build the trainer and the spec generator.
bazel build -c opt //dragnn/tools:trainer //dragnn/conll2017:make_parser_spec

# Quoted so an output path containing spaces does not word-split.
mkdir -p "$output_dir"

# Generate the model spec.
bazel-bin/dragnn/conll2017/make_parser_spec \
  --spec_file="$output_dir/parser_spec.textproto"

# Train; checkpoints and TensorBoard logs are written under $output_dir.
bazel-bin/dragnn/tools/trainer \
  --logtostderr \
  --compute_lexicon \
  --dragnn_spec="$output_dir/parser_spec.textproto" \
  --resource_path="$output_dir/resources" \
  --training_corpus_path="$training_corpus" \
  --tune_corpus_path="$dev_corpus" \
  --tensorboard_dir="$output_dir/tensorboard" \
  --checkpoint_filename="$output_dir/checkpoint.model"
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Construct the spec for the CONLL2017 Parser baseline."""
import tensorflow as tf
from tensorflow.python.platform import gfile
from dragnn.protos import spec_pb2
from dragnn.python import spec_builder
flags = tf.app.flags
FLAGS = flags.FLAGS
# Output path for the generated MasterSpec textproto.
flags.DEFINE_string('spec_file', 'parser_spec.textproto',
                    'Filename to save the spec to.')
def main(unused_argv):
  """Constructs the CoNLL2017 parser baseline spec and writes it to disk.

  Chains four DRAGNN components (char LSTM -> lookahead LSTM -> tagger ->
  arc-standard parser), assembles them into a MasterSpec proto, and
  serializes the spec as text to FLAGS.spec_file.
  """
  # Left-to-right, character-based LSTM; its states are consumed by the
  # lookahead component as word representations.
  char_lstm = spec_builder.ComponentSpecBuilder('char_lstm')
  char_lstm.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork',
      hidden_layer_sizes='256')
  char_lstm.set_transition_system(name='char-shift-only', left_to_right='true')
  char_lstm.add_fixed_feature(name='chars', fml='char-input.text-char',
                              embedding_dim=16)

  # Lookahead LSTM reads right-to-left so each word representation carries
  # its rightmost context. Word embeddings come from the char model.
  lookahead_lstm = spec_builder.ComponentSpecBuilder('lookahead')
  lookahead_lstm.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork',
      hidden_layer_sizes='256')
  lookahead_lstm.set_transition_system(name='shift-only', left_to_right='false')
  lookahead_lstm.add_link(source=char_lstm, fml='input.last-char-focus',
                          embedding_dim=64)

  # Simple left-to-right LSTM sequence tagger over the lookahead states.
  pos_tagger = spec_builder.ComponentSpecBuilder('tagger')
  pos_tagger.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork',
      hidden_layer_sizes='256')
  pos_tagger.set_transition_system(name='tagger')
  pos_tagger.add_token_link(source=lookahead_lstm, fml='input.focus',
                            embedding_dim=64)

  # Arc-standard dependency parser with a feed-forward decision network.
  dep_parser = spec_builder.ComponentSpecBuilder('parser')
  dep_parser.set_network_unit(name='FeedForwardNetwork',
                              hidden_layer_sizes='256',
                              layer_norm_hidden='true')
  dep_parser.set_transition_system(name='arc-standard')
  dep_parser.add_token_link(source=lookahead_lstm, fml='input.focus',
                            embedding_dim=64)
  dep_parser.add_token_link(
      source=pos_tagger, fml='input.focus stack.focus stack(1).focus',
      embedding_dim=64)

  # Discrete features over the partial parse tree built so far, like in
  # Parsey McParseface.
  label_features = [
      'stack.child(1).label',
      'stack.child(1).sibling(-1).label',
      'stack.child(-1).label',
      'stack.child(-1).sibling(1).label',
      'stack(1).child(1).label',
      'stack(1).child(1).sibling(-1).label',
      'stack(1).child(-1).label',
      'stack(1).child(-1).sibling(1).label',
      'stack.child(2).label',
      'stack.child(-2).label',
      'stack(1).child(2).label',
      'stack(1).child(-2).label',
  ]
  dep_parser.add_fixed_feature(name='labels', embedding_dim=16,
                               fml=' '.join(label_features))

  # Recurrent connection for the arc-standard parser: for both stack tokens,
  # link to the last step that SHIFTed or REDUCEd that token, so the parser
  # can build compositional phrase representations.
  dep_parser.add_link(
      source=dep_parser,  # recurrent connection
      name='rnn-stack',  # unique identifier
      fml='stack.focus stack(1).focus',  # look for both stack tokens
      source_translator='shift-reduce-step',  # maps token indices -> step
      embedding_dim=64)  # project down to 64 dims

  spec = spec_pb2.MasterSpec()
  spec.component.extend(
      [char_lstm.spec, lookahead_lstm.spec, pos_tagger.spec, dep_parser.spec])
  with gfile.FastGFile(FLAGS.spec_file, 'w') as spec_file:
    spec_file.write(str(spec).encode('utf-8'))
# Parse flags and dispatch to main() via the TF app runner.
if __name__ == '__main__':
  tf.app.run()
15
NOUN 25758
VERB 14242
PUNCT 12945
PART 9977
PROPN 8280
NUM 5082
ADV 4323
ADP 4165
ADJ 2318
AUX 2024
PRON 1343
CCONJ 1329
DET 994
X 948
SYM 25
42
punct 12965
nmod 11147
nsubj 7134
obj 6016
nummod 4732
case:suff 4179
acl 4163
root 3797
mark 3445
det 3434
advmod 2962
case 2739
case:dec 2517
conj 2421
obl 2000
dep 1998
mark:relcl 1833
clf 1722
ccomp 1655
amod 1525
xcomp 1382
acl:relcl 1356
cop 1349
cc 1334
nmod:tmod 1199
appos 1089
case:aspect 718
aux 675
case:pref 569
aux:pass 324
csubj 280
flat:foreign 250
nsubj:pass 211
discourse 151
aux:caus 149
advcl 125
mark:advb 79
iobj 61
dislocated 45
mark:comp 17
csubj:pass 5
vocative 1
42
NN 21794
VV 13177
NNP 8280
, 5824
CD 5082
DEC 4350
RB 4323
SFN 4229
IN 4165
NNB 3963
. 3807
JJ 2318
VC 1935
CC 1329
PRP 996
DT 994
EC 942
FW 778
AS 718
MD 681
( 641
) 641
PFA 555
BB 472
'' 331
`` 329
PRD 324
/ 202
: 165
UH 150
DEV 96
HYPH 76
WP 23
SFV 18
XX 17
ADD 8
SFA 7
... 4
PFN 4
LS 3
" 1
VERB 1
" PUNCT
'' PUNCT
( PUNCT
) PUNCT
, PUNCT
. PUNCT
... PUNCT
/ PUNCT
: PUNCT
ADD NOUN
AS PART
BB VERB
CC CCONJ
CD NUM
DEC PART
DEV PART
DT DET
EC PUNCT
FW X
HYPH PUNCT
IN ADP
JJ ADJ
LS X
MD AUX
NN NOUN
NNB NOUN
NNP PROPN
PFA PART
PFN PART
PRD PRON
PRP PRON
RB ADV
SFA PART
SFN PART
SFV PART
UH X
VC VERB
VERB VERB
VV AUX
WP PRON
XX X
`` PUNCT
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment