Commit 32ab5a58 in ModelZoo / ResNet50_tensorflow
Authored May 12, 2016 by calberti; committed by Martin Wicke, May 12, 2016
Adding SyntaxNet to tensorflow/models (#63)

Parent: 148a15fb
Changes: 131
Showing 20 changed files with 3802 additions and 0 deletions (+3802, -0).
Changed files:
  syntaxnet/syntaxnet/dictionary.proto                                  +57   -0
  syntaxnet/syntaxnet/document_filters.cc                               +335  -0
  syntaxnet/syntaxnet/document_format.cc                                +23   -0
  syntaxnet/syntaxnet/document_format.h                                 +63   -0
  syntaxnet/syntaxnet/embedding_feature_extractor.cc                    +80   -0
  syntaxnet/syntaxnet/embedding_feature_extractor.h                     +222  -0
  syntaxnet/syntaxnet/feature_extractor.cc                              +122  -0
  syntaxnet/syntaxnet/feature_extractor.h                               +624  -0
  syntaxnet/syntaxnet/feature_extractor.proto                           +34   -0
  syntaxnet/syntaxnet/feature_types.h                                   +176  -0
  syntaxnet/syntaxnet/fml_parser.cc                                     +291  -0
  syntaxnet/syntaxnet/fml_parser.h                                      +113  -0
  syntaxnet/syntaxnet/graph_builder.py                                  +569  -0
  syntaxnet/syntaxnet/graph_builder_test.py                             +325  -0
  syntaxnet/syntaxnet/kbest_syntax.proto                                +82   -0
  syntaxnet/syntaxnet/lexicon_builder.cc                                +248  -0
  syntaxnet/syntaxnet/lexicon_builder_test.py                           +174  -0
  syntaxnet/syntaxnet/load_parser_ops.py                                +23   -0
  syntaxnet/syntaxnet/models/parsey_mcparseface/context.pbtxt           +189  -0
  syntaxnet/syntaxnet/models/parsey_mcparseface/fine-to-universal.map   +52   -0
syntaxnet/syntaxnet/dictionary.proto  (new file, mode 100644)

// Protocol buffers for serializing string<=>index dictionaries.

syntax = "proto2";

package syntaxnet;

// Serializable representation of a string=>string pair.
message StringToStringPair {
  // String representing the key.
  required string key = 1;

  // String representing the value.
  required string value = 2;
}

// Serializable representation of a string=>string mapping.
message StringToStringMap {
  // Key=>value pairs.
  repeated StringToStringPair pair = 1;
}

// Affix table entry, for serialization of the affix tables.
message AffixTableEntry {
  // Nested message for serializing a single affix.
  message AffixEntry {
    // The affix as a string.
    required string form = 1;

    // The length of the affix (this is non-trivial to compute due to UTF-8).
    required int32 length = 2;

    // The ID of the affix that is one character shorter, or -1 if none exists.
    required int32 shorter_id = 3;
  }

  // The type of affix table, as a string.
  required string type = 1;

  // The maximum affix length.
  required int32 max_length = 2;

  // The list of affixes, in order of affix ID.
  repeated AffixEntry affix = 3;
}

// A light-weight proto to store vectors in binary format.
message TokenEmbedding {
  required bytes token = 1;  // can be word or phrase, or URL, etc.

  // If available, raw count of this token in the training corpus.
  optional int64 count = 3;

  message Vector {
    repeated float values = 1 [packed = true];
  }
  optional Vector vector = 2;
};
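
As a quick illustration of the TokenEmbedding message above, here is a minimal sketch of filling one in through the proto2-generated C++ API (set_token, set_count, mutable_vector, and add_values are the standard generated accessors for the fields declared above; the helper function itself is hypothetical):

#include <cstdint>
#include <string>
#include <vector>

#include "syntaxnet/dictionary.pb.h"

// Builds a TokenEmbedding for a single token and its embedding vector.
syntaxnet::TokenEmbedding MakeTokenEmbedding(const std::string &token,
                                             int64_t count,
                                             const std::vector<float> &values) {
  syntaxnet::TokenEmbedding embedding;
  embedding.set_token(token);  // word, phrase, URL, etc.
  embedding.set_count(count);  // raw count in the training corpus
  for (const float value : values) {
    embedding.mutable_vector()->add_values(value);
  }
  return embedding;
}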
syntaxnet/syntaxnet/document_filters.cc  (new file, mode 100644)

/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Various utilities for handling documents.

#include <stddef.h>
#include <algorithm>
#include <memory>
#include <string>
#include <vector>

#include "syntaxnet/base.h"
#include "syntaxnet/feature_extractor.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/utils.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/lib/core/status.h"

using tensorflow::DEVICE_CPU;
using tensorflow::OpKernel;
using tensorflow::OpKernelConstruction;
using tensorflow::OpKernelContext;
using tensorflow::Tensor;
using tensorflow::TensorShape;
using tensorflow::errors::InvalidArgument;

namespace syntaxnet {

namespace {

void GetTaskContext(OpKernelConstruction *context, TaskContext *task_context) {
  string file_path, data;
  OP_REQUIRES_OK(context, context->GetAttr("task_context", &file_path));
  OP_REQUIRES_OK(context, ReadFileToString(tensorflow::Env::Default(),
                                           file_path, &data));
  OP_REQUIRES(context,
              TextFormat::ParseFromString(data, task_context->mutable_spec()),
              InvalidArgument("Could not parse task context at ", file_path));
}

// Outputs the given batch of sentences as a tensor and deletes them.
void OutputDocuments(OpKernelContext *context,
                     vector<Sentence *> *document_batch) {
  const int64 size = document_batch->size();
  Tensor *output;
  OP_REQUIRES_OK(context,
                 context->allocate_output(0, TensorShape({size}), &output));
  for (int64 i = 0; i < size; ++i) {
    output->vec<string>()(i) = (*document_batch)[i]->SerializeAsString();
  }
  utils::STLDeleteElements(document_batch);
}

}  // namespace

class DocumentSource : public OpKernel {
 public:
  explicit DocumentSource(OpKernelConstruction *context) : OpKernel(context) {
    GetTaskContext(context, &task_context_);
    string corpus_name;
    OP_REQUIRES_OK(context, context->GetAttr("corpus_name", &corpus_name));
    OP_REQUIRES_OK(context, context->GetAttr("batch_size", &batch_size_));
    OP_REQUIRES(context, batch_size_ > 0,
                InvalidArgument("invalid batch_size provided"));
    corpus_.reset(new TextReader(*task_context_.GetInput(corpus_name)));
  }

  void Compute(OpKernelContext *context) override {
    mutex_lock lock(mu_);
    Sentence *document;
    vector<Sentence *> document_batch;
    while ((document = corpus_->Read()) != NULL) {
      document_batch.push_back(document);
      if (static_cast<int>(document_batch.size()) == batch_size_) {
        OutputDocuments(context, &document_batch);
        OutputLast(context, false);
        return;
      }
    }
    OutputDocuments(context, &document_batch);
    OutputLast(context, true);
  }

 private:
  void OutputLast(OpKernelContext *context, bool last) {
    Tensor *output;
    OP_REQUIRES_OK(context,
                   context->allocate_output(1, TensorShape({}), &output));
    output->scalar<bool>()() = last;
  }

  // Task context used to configure this op.
  TaskContext task_context_;

  // mutex to synchronize access to Compute.
  mutex mu_;

  std::unique_ptr<TextReader> corpus_;
  string documents_path_;
  int batch_size_;
};

REGISTER_KERNEL_BUILDER(Name("DocumentSource").Device(DEVICE_CPU),
                        DocumentSource);

class DocumentSink : public OpKernel {
 public:
  explicit DocumentSink(OpKernelConstruction *context) : OpKernel(context) {
    GetTaskContext(context, &task_context_);
    string corpus_name;
    OP_REQUIRES_OK(context, context->GetAttr("corpus_name", &corpus_name));
    writer_.reset(new TextWriter(*task_context_.GetInput(corpus_name)));
  }

  void Compute(OpKernelContext *context) override {
    mutex_lock lock(mu_);
    auto documents = context->input(0).vec<string>();
    for (int i = 0; i < documents.size(); ++i) {
      Sentence document;
      OP_REQUIRES(context, document.ParseFromString(documents(i)),
                  InvalidArgument("failed to parse sentence"));
      writer_->Write(document);
    }
  }

 private:
  // Task context used to configure this op.
  TaskContext task_context_;

  // mutex to synchronize access to Compute.
  mutex mu_;

  string documents_path_;
  std::unique_ptr<TextWriter> writer_;
};

REGISTER_KERNEL_BUILDER(Name("DocumentSink").Device(DEVICE_CPU), DocumentSink);

// Sentence filter for filtering out documents where the parse trees are not
// well-formed, i.e. they contain cycles.
class WellFormedFilter : public OpKernel {
 public:
  explicit WellFormedFilter(OpKernelConstruction *context)
      : OpKernel(context) {
    GetTaskContext(context, &task_context_);
    OP_REQUIRES_OK(context, context->GetAttr("keep_malformed_documents",
                                             &keep_malformed_));
  }

  void Compute(OpKernelContext *context) override {
    auto documents = context->input(0).vec<string>();
    vector<Sentence *> output_documents;
    for (int i = 0; i < documents.size(); ++i) {
      Sentence *document = new Sentence;
      OP_REQUIRES(context, document->ParseFromString(documents(i)),
                  InvalidArgument("failed to parse sentence"));
      if (ShouldKeep(*document)) {
        output_documents.push_back(document);
      } else {
        delete document;
      }
    }
    OutputDocuments(context, &output_documents);
  }

 private:
  bool ShouldKeep(const Sentence &doc) {
    vector<int> visited(doc.token_size(), -1);
    for (int i = 0; i < doc.token_size(); ++i) {
      // Already visited node.
      if (visited[i] != -1) continue;
      int t = i;
      while (t != -1) {
        if (visited[t] == -1) {
          // If it is not visited yet, mark it.
          visited[t] = i;
        } else if (visited[t] < i) {
          // If the index number is smaller than index and not -1, the token
          // has already been visited.
          break;
        } else {
          // Loop detected.
          LOG(ERROR) << "Loop detected in document " << doc.DebugString();
          return keep_malformed_;
        }
        t = doc.token(t).head();
      }
    }
    return true;
  }

 private:
  // Task context used to configure this op.
  TaskContext task_context_;

  bool keep_malformed_;
};

REGISTER_KERNEL_BUILDER(Name("WellFormedFilter").Device(DEVICE_CPU),
                        WellFormedFilter);

// Sentence filter that modifies dependency trees to make them projective. This
// could be made more efficient by looping over sentences instead of the entire
// document. Assumes that the document is well-formed in the sense of having
// no looping dependencies.
//
// Task arguments:
//   bool discard_non_projective (false) : If true, discards documents with
//     non-projective trees instead of projectivizing them.
class ProjectivizeFilter : public OpKernel {
 public:
  explicit ProjectivizeFilter(OpKernelConstruction *context)
      : OpKernel(context) {
    GetTaskContext(context, &task_context_);
    OP_REQUIRES_OK(context, context->GetAttr("discard_non_projective",
                                             &discard_non_projective_));
  }

  void Compute(OpKernelContext *context) override {
    auto documents = context->input(0).vec<string>();
    vector<Sentence *> output_documents;
    for (int i = 0; i < documents.size(); ++i) {
      Sentence *document = new Sentence;
      OP_REQUIRES(context, document->ParseFromString(documents(i)),
                  InvalidArgument("failed to parse sentence"));
      if (Process(document)) {
        output_documents.push_back(document);
      } else {
        delete document;
      }
    }
    OutputDocuments(context, &output_documents);
  }

  bool Process(Sentence *doc) {
    const int num_tokens = doc->token_size();

    // Left and right boundaries for arcs. The left and right ends of an arc
    // are bounded by the arcs that pass over it. If an arc exceeds these
    // bounds it will cross an arc passing over it, making it a non-projective
    // arc.
    vector<int> left(num_tokens);
    vector<int> right(num_tokens);

    // Lift the shortest non-projective arc until the document is projective.
    while (true) {
      // Initialize boundaries to the whole document for all arcs.
      for (int i = 0; i < num_tokens; ++i) {
        left[i] = -1;
        right[i] = num_tokens - 1;
      }

      // Find left and right bounds for each token.
      for (int i = 0; i < num_tokens; ++i) {
        int head_index = doc->token(i).head();

        // Find left and right end of arc.
        int l = std::min(i, head_index);
        int r = std::max(i, head_index);

        // Bound all tokens under the arc.
        for (int j = l + 1; j < r; ++j) {
          if (left[j] < l) left[j] = l;
          if (right[j] > r) right[j] = r;
        }
      }

      // Find deepest non-projective arc.
      int deepest_arc = -1;
      int max_depth = -1;

      // The non-projective arcs are those that exceed their bounds.
      for (int i = 0; i < num_tokens; ++i) {
        int head_index = doc->token(i).head();
        if (head_index == -1) continue;  // any crossing arc must be deeper

        int l = std::min(i, head_index);
        int r = std::max(i, head_index);

        int left_bound = std::max(left[l], left[r]);
        int right_bound = std::min(right[l], right[r]);

        if (l < left_bound || r > right_bound) {
          // Found non-projective arc.
          if (discard_non_projective_) return false;

          // Pick the deepest as the best candidate for lifting.
          int depth = 0;
          int j = i;
          while (j != -1) {
            ++depth;
            j = doc->token(j).head();
          }
          if (depth > max_depth) {
            deepest_arc = i;
            max_depth = depth;
          }
        }
      }

      // If there are no more non-projective arcs we are done.
      if (deepest_arc == -1) return true;

      // Lift non-projective arc.
      int lifted_head = doc->token(doc->token(deepest_arc).head()).head();
      doc->mutable_token(deepest_arc)->set_head(lifted_head);
    }
  }

 private:
  // Task context used to configure this op.
  TaskContext task_context_;

  // Whether or not to throw away non-projective documents.
  bool discard_non_projective_;
};

REGISTER_KERNEL_BUILDER(Name("ProjectivizeFilter").Device(DEVICE_CPU),
                        ProjectivizeFilter);

}  // namespace syntaxnet
syntaxnet/syntaxnet/document_format.cc  (new file, mode 100644)

/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "syntaxnet/document_format.h"

namespace syntaxnet {

// Component registry for document formatters.
REGISTER_CLASS_REGISTRY("document format", DocumentFormat);

}  // namespace syntaxnet
syntaxnet/syntaxnet/document_format.h  (new file, mode 100644)

/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// An interface for document formats.

#ifndef $TARGETDIR_DOCUMENT_FORMAT_H__
#define $TARGETDIR_DOCUMENT_FORMAT_H__

#include <string>
#include <vector>

#include "syntaxnet/utils.h"
#include "syntaxnet/registry.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/task_context.h"
#include "tensorflow/core/lib/io/inputbuffer.h"

namespace syntaxnet {

// A document format component converts a key/value pair from a record to one
// or more documents. The record format is used for selecting the document
// format component. A document format component can be registered with the
// REGISTER_DOCUMENT_FORMAT macro.
class DocumentFormat : public RegisterableClass<DocumentFormat> {
 public:
  DocumentFormat() {}
  virtual ~DocumentFormat() {}

  // Reads a record from the given input buffer with format specific logic.
  // Returns false if no record could be read because we reached end of file.
  virtual bool ReadRecord(tensorflow::io::InputBuffer *buffer,
                          string *record) = 0;

  // Converts a key/value pair to one or more documents.
  virtual void ConvertFromString(const string &key, const string &value,
                                 vector<Sentence *> *documents) = 0;

  // Converts a document to a key/value pair.
  virtual void ConvertToString(const Sentence &document, string *key,
                               string *value) = 0;

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(DocumentFormat);
};

#define REGISTER_DOCUMENT_FORMAT(type, component) \
  REGISTER_CLASS_COMPONENT(DocumentFormat, type, component)

}  // namespace syntaxnet

#endif  // $TARGETDIR_DOCUMENT_FORMAT_H__
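
To make the registration hook above concrete, here is a minimal sketch of a custom format component. The class, its one-sentence-per-line behavior, and the Sentence text accessor are illustrative assumptions (sentence.pb.h is not part of this excerpt), but the overridden methods and the REGISTER_DOCUMENT_FORMAT call follow the interface declared in this header:

#include "syntaxnet/document_format.h"

namespace syntaxnet {

// Hypothetical format that treats every input line as one sentence.
class PlainLineFormat : public DocumentFormat {
 public:
  // Reads one line from the buffer; returns false at end of file.
  bool ReadRecord(tensorflow::io::InputBuffer *buffer,
                  string *record) override {
    return buffer->ReadLine(record).ok();
  }

  // Converts one record into a single Sentence carrying the raw text.
  void ConvertFromString(const string &key, const string &value,
                         vector<Sentence *> *documents) override {
    Sentence *sentence = new Sentence;
    sentence->set_text(value);  // assumes a 'text' field on Sentence
    documents->push_back(sentence);
  }

  // Converts a Sentence back into its raw text.
  void ConvertToString(const Sentence &document, string *key,
                       string *value) override {
    *key = "";
    *value = document.text();
  }
};

REGISTER_DOCUMENT_FORMAT("plain-line", PlainLineFormat);

}  // namespace syntaxnet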
syntaxnet/syntaxnet/embedding_feature_extractor.cc  (new file, mode 100644)

/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "syntaxnet/embedding_feature_extractor.h"

#include <vector>

#include "syntaxnet/feature_extractor.h"
#include "syntaxnet/parser_features.h"
#include "syntaxnet/task_context.h"
#include "syntaxnet/utils.h"

namespace syntaxnet {

void GenericEmbeddingFeatureExtractor::Setup(TaskContext *context) {
  // Don't use version to determine how to get feature FML.
  const string features = context->Get(
      tensorflow::strings::StrCat(ArgPrefix(), "_", "features"), "");
  const string embedding_names =
      context->Get(GetParamName("embedding_names"), "");
  const string embedding_dims =
      context->Get(GetParamName("embedding_dims"), "");
  LOG(INFO) << "Features: " << features;
  LOG(INFO) << "Embedding names: " << embedding_names;
  LOG(INFO) << "Embedding dims: " << embedding_dims;
  embedding_fml_ = utils::Split(features, ';');
  add_strings_ = context->Get(GetParamName("add_varlen_strings"), false);
  embedding_names_ = utils::Split(embedding_names, ';');
  for (const string &dim : utils::Split(embedding_dims, ';')) {
    embedding_dims_.push_back(utils::ParseUsing<int>(dim, utils::ParseInt32));
  }
}

void GenericEmbeddingFeatureExtractor::Init(TaskContext *context) {
}

vector<vector<SparseFeatures>> GenericEmbeddingFeatureExtractor::ConvertExample(
    const vector<FeatureVector> &feature_vectors) const {
  // Extract the features.
  vector<vector<SparseFeatures>> sparse_features(feature_vectors.size());
  for (size_t i = 0; i < feature_vectors.size(); ++i) {
    // Convert the nlp_parser::FeatureVector to dist belief format.
    sparse_features[i] =
        vector<SparseFeatures>(generic_feature_extractor(i).feature_types());

    for (int j = 0; j < feature_vectors[i].size(); ++j) {
      const FeatureType &feature_type = *feature_vectors[i].type(j);
      const FeatureValue value = feature_vectors[i].value(j);
      const bool is_continuous = feature_type.name().find("continuous") == 0;
      const int64 id = is_continuous ? FloatFeatureValue(value).id : value;
      const int base = feature_type.base();
      if (id >= 0) {
        sparse_features[i][base].add_id(id);
        if (is_continuous) {
          sparse_features[i][base].add_weight(FloatFeatureValue(value).weight);
        }
        if (add_strings_) {
          sparse_features[i][base].add_description(tensorflow::strings::StrCat(
              feature_type.name(), "=", feature_type.GetFeatureValueName(id)));
        }
      }
    }
  }

  return sparse_features;
}

}  // namespace syntaxnet
syntaxnet/syntaxnet/embedding_feature_extractor.h  (new file, mode 100644)

/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef $TARGETDIR_EMBEDDING_FEATURE_EXTRACTOR_H_
#define $TARGETDIR_EMBEDDING_FEATURE_EXTRACTOR_H_

#include <functional>
#include <memory>
#include <string>
#include <vector>

#include "syntaxnet/utils.h"
#include "syntaxnet/feature_extractor.h"
#include "syntaxnet/feature_types.h"
#include "syntaxnet/parser_features.h"
#include "syntaxnet/sentence_features.h"
#include "syntaxnet/sparse.pb.h"
#include "syntaxnet/task_context.h"
#include "syntaxnet/workspace.h"
#include "tensorflow/core/lib/strings/strcat.h"

namespace syntaxnet {

// An EmbeddingFeatureExtractor manages the extraction of features for
// embedding-based models. It wraps a sequence of underlying classes of feature
// extractors, along with associated predicate maps. Each class of feature
// extractors is associated with a name, e.g., "words", "labels", "tags".
//
// The class is split between a generic abstract version,
// GenericEmbeddingFeatureExtractor (that can be initialized without knowing
// the signature of the ExtractFeatures method) and a typed version.
//
// The predicate maps must be initialized before use: they can be loaded using
// Read() or updated via UpdateMapsForExample.
class GenericEmbeddingFeatureExtractor {
 public:
  virtual ~GenericEmbeddingFeatureExtractor() {}

  // Get the prefix string to put in front of all arguments, so they don't
  // conflict with other embedding models.
  virtual const string ArgPrefix() const = 0;

  // Sets up predicate maps and embedding space names that are common for all
  // embedding based feature extractors.
  virtual void Setup(TaskContext *context);
  virtual void Init(TaskContext *context);

  // Requests workspace for the underlying feature extractors. This is
  // implemented in the typed class.
  virtual void RequestWorkspaces(WorkspaceRegistry *registry) = 0;

  // Number of predicates for the embedding at a given index (vocabulary size.)
  int EmbeddingSize(int index) const {
    return generic_feature_extractor(index).GetDomainSize();
  }

  // Returns number of embedding spaces.
  int NumEmbeddings() const { return embedding_dims_.size(); }

  // Returns the number of features in the embedding space.
  const int FeatureSize(int idx) const {
    return generic_feature_extractor(idx).feature_types();
  }

  // Returns the dimensionality of the embedding space.
  int EmbeddingDims(int index) const { return embedding_dims_[index]; }

  // Accessor for embedding dims (dimensions of the embedding spaces).
  const vector<int> &embedding_dims() const { return embedding_dims_; }

  const vector<string> &embedding_fml() const { return embedding_fml_; }

  // Get parameter name by concatenating the prefix and the original name.
  string GetParamName(const string &param_name) const {
    return tensorflow::strings::StrCat(ArgPrefix(), "_", param_name);
  }

 protected:
  // Provides the generic class with access to the templated extractors. This
  // is used to get the type information out of the feature extractor without
  // knowing the specific calling arguments of the extractor itself.
  virtual const GenericFeatureExtractor &generic_feature_extractor(
      int idx) const = 0;

  // Converts a vector of extracted features into
  // dist_belief::SparseFeatures. Each feature in each feature vector becomes a
  // single SparseFeatures. The predicates are mapped through map_fn which
  // should point to either mutable_map_fn or const_map_fn depending on whether
  // or not the predicate maps should be updated.
  vector<vector<SparseFeatures>> ConvertExample(
      const vector<FeatureVector> &feature_vectors) const;

 private:
  // Embedding space names for parameter sharing.
  vector<string> embedding_names_;

  // FML strings for each feature extractor.
  vector<string> embedding_fml_;

  // Size of each of the embedding spaces (maximum predicate id).
  vector<int> embedding_sizes_;

  // Embedding dimensions of the embedding spaces (i.e. 32, 64 etc.)
  vector<int> embedding_dims_;

  // Whether or not to add string descriptions to converted examples.
  bool add_strings_;
};

// Templated, object-specific implementation of the
// EmbeddingFeatureExtractor. EXTRACTOR should be a FeatureExtractor<OBJ,
// ARGS...> class that has the appropriate FeatureTraits() to ensure that
// locator type features work.
//
// Note: for backwards compatibility purposes, this always reads the FML spec
// from "<prefix>_features".
template <class EXTRACTOR, class OBJ, class... ARGS>
class EmbeddingFeatureExtractor : public GenericEmbeddingFeatureExtractor {
 public:
  // Sets up all predicate maps, feature extractors, and flags.
  void Setup(TaskContext *context) override {
    GenericEmbeddingFeatureExtractor::Setup(context);
    feature_extractors_.resize(embedding_fml().size());
    for (int i = 0; i < embedding_fml().size(); ++i) {
      feature_extractors_[i].Parse(embedding_fml()[i]);
      feature_extractors_[i].Setup(context);
    }
  }

  // Initializes resources needed by the feature extractors.
  void Init(TaskContext *context) override {
    GenericEmbeddingFeatureExtractor::Init(context);
    for (auto &feature_extractor : feature_extractors_) {
      feature_extractor.Init(context);
    }
  }

  // Requests workspaces from the registry. Must be called after Init(), and
  // before Preprocess().
  void RequestWorkspaces(WorkspaceRegistry *registry) override {
    for (auto &feature_extractor : feature_extractors_) {
      feature_extractor.RequestWorkspaces(registry);
    }
  }

  // Must be called on the object one state for each sentence, before any
  // feature extraction (e.g., UpdateMapsForExample, ExtractSparseFeatures).
  void Preprocess(WorkspaceSet *workspaces, OBJ *obj) const {
    for (auto &feature_extractor : feature_extractors_) {
      feature_extractor.Preprocess(workspaces, obj);
    }
  }

  // Returns a ragged array of SparseFeatures, for 1) each feature extractor
  // class e, and 2) each feature f extracted by e. Underlying predicate maps
  // will not be updated and so unrecognized predicates may occur. In such a
  // case the SparseFeatures object associated with a given extractor class and
  // feature will be empty.
  vector<vector<SparseFeatures>> ExtractSparseFeatures(
      const WorkspaceSet &workspaces, const OBJ &obj, ARGS... args) const {
    vector<FeatureVector> features(feature_extractors_.size());
    ExtractFeatures(workspaces, obj, args..., &features);
    return ConvertExample(features);
  }

  // Extracts features using the extractors. Note that features must already
  // be initialized to the correct number of feature extractors. No predicate
  // mapping is applied.
  void ExtractFeatures(const WorkspaceSet &workspaces, const OBJ &obj,
                       ARGS... args, vector<FeatureVector> *features) const {
    DCHECK(features != nullptr);
    DCHECK_EQ(features->size(), feature_extractors_.size());
    for (int i = 0; i < feature_extractors_.size(); ++i) {
      (*features)[i].clear();
      feature_extractors_[i].ExtractFeatures(workspaces, obj, args...,
                                             &(*features)[i]);
    }
  }

 protected:
  // Provides generic access to the feature extractors.
  const GenericFeatureExtractor &generic_feature_extractor(
      int idx) const override {
    DCHECK_LT(idx, feature_extractors_.size());
    DCHECK_GE(idx, 0);
    return feature_extractors_[idx];
  }

 private:
  // Templated feature extractor class.
  vector<EXTRACTOR> feature_extractors_;
};

class ParserEmbeddingFeatureExtractor
    : public EmbeddingFeatureExtractor<ParserFeatureExtractor, ParserState> {
 public:
  explicit ParserEmbeddingFeatureExtractor(const string &arg_prefix)
      : arg_prefix_(arg_prefix) {}

 private:
  const string ArgPrefix() const override { return arg_prefix_; }

  // Prefix for context parameters.
  string arg_prefix_;
};

}  // namespace syntaxnet

#endif  // $TARGETDIR_EMBEDDING_FEATURE_EXTRACTOR_H_
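
The comments above pin down a call order for the typed extractor: Setup(), then Init(), then RequestWorkspaces(), then per-sentence Preprocess() and ExtractSparseFeatures(). A rough sketch of that sequence for ParserEmbeddingFeatureExtractor follows; the TaskContext, ParserState, and the exact WorkspaceSet initialization are assumed from the rest of SyntaxNet and are shown schematically only:

namespace syntaxnet {

// Schematic driver: extracts sparse features for one parser state.
void ExtractForState(TaskContext *task_context, ParserState *state) {
  // "parser" makes the extractor read parser_features, parser_embedding_names,
  // parser_embedding_dims, etc. from the task context.
  ParserEmbeddingFeatureExtractor features("parser");
  features.Setup(task_context);
  features.Init(task_context);

  WorkspaceRegistry registry;
  features.RequestWorkspaces(&registry);

  WorkspaceSet workspaces;  // assumed to be bound to 'registry' before use
  features.Preprocess(&workspaces, state);

  // One vector<SparseFeatures> per embedding space ("words", "tags", ...).
  const vector<vector<SparseFeatures>> sparse =
      features.ExtractSparseFeatures(workspaces, *state);
}

}  // namespace syntaxnet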
syntaxnet/syntaxnet/feature_extractor.cc  (new file, mode 100644)

/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "syntaxnet/feature_extractor.h"

#include "syntaxnet/feature_types.h"
#include "syntaxnet/fml_parser.h"

namespace syntaxnet {

constexpr FeatureValue GenericFeatureFunction::kNone;

GenericFeatureExtractor::GenericFeatureExtractor() {}

GenericFeatureExtractor::~GenericFeatureExtractor() {}

void GenericFeatureExtractor::Parse(const string &source) {
  // Parse feature specification into descriptor.
  FMLParser parser;
  parser.Parse(source, mutable_descriptor());

  // Initialize feature extractor from descriptor.
  InitializeFeatureFunctions();
}

void GenericFeatureExtractor::InitializeFeatureTypes() {
  // Register all feature types.
  GetFeatureTypes(&feature_types_);
  for (size_t i = 0; i < feature_types_.size(); ++i) {
    FeatureType *ft = feature_types_[i];
    ft->set_base(i);

    // Check for feature space overflow.
    double domain_size = ft->GetDomainSize();
    if (domain_size < 0) {
      LOG(FATAL) << "Illegal domain size for feature " << ft->name()
                 << domain_size;
    }
  }

  vector<string> types_names;
  GetFeatureTypeNames(&types_names);
  CHECK_EQ(feature_types_.size(), types_names.size());
}

void GenericFeatureExtractor::GetFeatureTypeNames(
    vector<string> *type_names) const {
  for (size_t i = 0; i < feature_types_.size(); ++i) {
    FeatureType *ft = feature_types_[i];
    type_names->push_back(ft->name());
  }
}

FeatureValue GenericFeatureExtractor::GetDomainSize() const {
  // Domain size of the set of features is equal to:
  //   [largest domain size of any feature types] * [number of feature types]
  FeatureValue max_feature_type_dsize = 0;
  for (size_t i = 0; i < feature_types_.size(); ++i) {
    FeatureType *ft = feature_types_[i];
    const FeatureValue feature_type_dsize = ft->GetDomainSize();
    if (feature_type_dsize > max_feature_type_dsize) {
      max_feature_type_dsize = feature_type_dsize;
    }
  }

  return max_feature_type_dsize;
}

string GenericFeatureFunction::GetParameter(const string &name) const {
  // Find named parameter in feature descriptor.
  for (int i = 0; i < descriptor_->parameter_size(); ++i) {
    if (name == descriptor_->parameter(i).name()) {
      return descriptor_->parameter(i).value();
    }
  }
  return "";
}

GenericFeatureFunction::GenericFeatureFunction() {}

GenericFeatureFunction::~GenericFeatureFunction() {
  delete feature_type_;
}

int GenericFeatureFunction::GetIntParameter(const string &name,
                                            int default_value) const {
  string value = GetParameter(name);
  return utils::ParseUsing<int>(value, default_value,
                                tensorflow::strings::safe_strto32);
}

void GenericFeatureFunction::GetFeatureTypes(
    vector<FeatureType *> *types) const {
  if (feature_type_ != nullptr) types->push_back(feature_type_);
}

FeatureType *GenericFeatureFunction::GetFeatureType() const {
  // If a single feature type has been registered return it.
  if (feature_type_ != nullptr) return feature_type_;

  // Get feature types for function.
  vector<FeatureType *> types;
  GetFeatureTypes(&types);

  // If there is exactly one feature type return this, else return null.
  if (types.size() == 1) return types[0];
  return nullptr;
}

}  // namespace syntaxnet
syntaxnet/syntaxnet/feature_extractor.h
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Generic feature extractor for extracting features from objects. The feature
// extractor can be used for extracting features from any object. The feature
// extractor and feature function classes are template classes that have to
// be instantiated for extracting feature from a specific object type.
//
// A feature extractor consists of a hierarchy of feature functions. Each
// feature function extracts one or more feature type and value pairs from the
// object.
//
// The feature extractor has a modular design where new feature functions can be
// registered as components. The feature extractor is initialized from a
// descriptor represented by a protocol buffer. The feature extractor can also
// be initialized from a text-based source specification of the feature
// extractor. Feature specification parsers can be added as components. By
// default the feature extractor can be read from an ASCII protocol buffer or in
// a simple feature modeling language (fml).
// A feature function is invoked with a focus. Nested feature function can be
// invoked with another focus determined by the parent feature function.
#ifndef $TARGETDIR_FEATURE_EXTRACTOR_H_
#define $TARGETDIR_FEATURE_EXTRACTOR_H_
#include <memory>
#include <string>
#include <vector>
#include "syntaxnet/feature_extractor.pb.h"
#include "syntaxnet/feature_types.h"
#include "syntaxnet/proto_io.h"
#include "syntaxnet/registry.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/task_context.h"
#include "syntaxnet/utils.h"
#include "syntaxnet/workspace.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/io/inputbuffer.h"
#include "tensorflow/core/lib/io/record_reader.h"
#include "tensorflow/core/lib/io/record_writer.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
namespace
syntaxnet
{
// Use the same type for feature values as is used for predicated.
typedef
int64
Predicate
;
typedef
Predicate
FeatureValue
;
// Output feature model in FML format.
void
ToFMLFunction
(
const
FeatureFunctionDescriptor
&
function
,
string
*
output
);
void
ToFML
(
const
FeatureFunctionDescriptor
&
function
,
string
*
output
);
// A feature vector contains feature type and value pairs.
class
FeatureVector
{
public:
FeatureVector
()
{}
// Adds feature type and value pair to feature vector.
void
add
(
FeatureType
*
type
,
FeatureValue
value
)
{
features_
.
emplace_back
(
type
,
value
);
}
// Removes all elements from the feature vector.
void
clear
()
{
features_
.
clear
();
}
// Returns the number of elements in the feature vector.
int
size
()
const
{
return
features_
.
size
();
}
// Reserves space in the underlying feature vector.
void
reserve
(
int
n
)
{
features_
.
reserve
(
n
);
}
// Returns feature type for an element in the feature vector.
FeatureType
*
type
(
int
index
)
const
{
return
features_
[
index
].
type
;
}
// Returns feature value for an element in the feature vector.
FeatureValue
value
(
int
index
)
const
{
return
features_
[
index
].
value
;
}
private:
// Structure for holding feature type and value pairs.
struct
Element
{
Element
()
:
type
(
NULL
),
value
(
-
1
)
{}
Element
(
FeatureType
*
t
,
FeatureValue
v
)
:
type
(
t
),
value
(
v
)
{}
FeatureType
*
type
;
FeatureValue
value
;
};
// Array for storing feature vector elements.
vector
<
Element
>
features_
;
TF_DISALLOW_COPY_AND_ASSIGN
(
FeatureVector
);
};
// The generic feature extractor is the type-independent part of a feature
// extractor. This holds the descriptor for the feature extractor and the
// collection of feature types used in the feature extractor. The feature
// types are not available until FeatureExtractor<>::Init() has been called.
class
GenericFeatureExtractor
{
public:
GenericFeatureExtractor
();
virtual
~
GenericFeatureExtractor
();
// Initializes the feature extractor from a source representation of the
// feature extractor. The first line is used for determining the feature
// specification language. If the first line starts with #! followed by a name
// then this name is used for instantiating a feature specification parser
// with that name. If the language cannot be detected this way it falls back
// to using the default language supplied.
void
Parse
(
const
string
&
source
);
// Returns the feature extractor descriptor.
const
FeatureExtractorDescriptor
&
descriptor
()
const
{
return
descriptor_
;
}
FeatureExtractorDescriptor
*
mutable_descriptor
()
{
return
&
descriptor_
;
}
// Returns the number of feature types in the feature extractor. Invalid
// before Init() has been called.
int
feature_types
()
const
{
return
feature_types_
.
size
();
}
// Returns all feature types names used by the extractor. The names are
// added to the types_names array. Invalid before Init() has been called.
void
GetFeatureTypeNames
(
vector
<
string
>
*
type_names
)
const
;
// Returns a feature type used in the extractor. Invalid before Init() has
// been called.
const
FeatureType
*
feature_type
(
int
index
)
const
{
return
feature_types_
[
index
];
}
// Returns the feature domain size of this feature extractor.
// NOTE: The way that domain size is calculated is, for some, unintuitive. It
// is the largest domain size of any feature type.
FeatureValue
GetDomainSize
()
const
;
protected:
// Initializes the feature types used by the extractor. Called from
// FeatureExtractor<>::Init().
void
InitializeFeatureTypes
();
private:
// Initializes the top-level feature functions.
virtual
void
InitializeFeatureFunctions
()
=
0
;
// Returns all feature types used by the extractor. The feature types are
// added to the result array.
virtual
void
GetFeatureTypes
(
vector
<
FeatureType
*>
*
types
)
const
=
0
;
// Descriptor for the feature extractor. This is a protocol buffer that
// contains all the information about the feature extractor. The feature
// functions are initialized from the information in the descriptor.
FeatureExtractorDescriptor
descriptor_
;
// All feature types used by the feature extractor. The collection of all the
// feature types describes the feature space of the feature set produced by
// the feature extractor. Not owned.
vector
<
FeatureType
*>
feature_types_
;
};
// The generic feature function is the type-independent part of a feature
// function. Each feature function is associated with the descriptor that it is
// instantiated from. The feature types associated with this feature function
// will be established by the time FeatureExtractor<>::Init() completes.
class
GenericFeatureFunction
{
public:
// A feature value that represents the absence of a value.
static
constexpr
FeatureValue
kNone
=
-
1
;
GenericFeatureFunction
();
virtual
~
GenericFeatureFunction
();
// Sets up the feature function. NB: FeatureTypes of nested functions are not
// guaranteed to be available until Init().
virtual
void
Setup
(
TaskContext
*
context
)
{}
// Initializes the feature function. NB: The FeatureType of this function must
// be established when this method completes.
virtual
void
Init
(
TaskContext
*
context
)
{}
// Requests workspaces from a registry to obtain indices into a WorkspaceSet
// for any Workspace objects used by this feature function. NB: This will be
// called after Init(), so it can depend on resources and arguments.
virtual
void
RequestWorkspaces
(
WorkspaceRegistry
*
registry
)
{}
// Appends the feature types produced by the feature function to types. The
// default implementation appends feature_type(), if non-null. Invalid
// before Init() has been called.
virtual
void
GetFeatureTypes
(
vector
<
FeatureType
*>
*
types
)
const
;
// Returns the feature type for feature produced by this feature function. If
// the feature function produces features of different types this returns
// null. Invalid before Init() has been called.
virtual
FeatureType
*
GetFeatureType
()
const
;
// Returns the name of the registry used for creating the feature function.
// This can be used for checking if two feature functions are of the same
// kind.
virtual
const
char
*
RegistryName
()
const
=
0
;
// Returns the value of a named parameter in the feature functions descriptor.
// If the named parameter is not found the global parameters are searched.
string
GetParameter
(
const
string
&
name
)
const
;
int
GetIntParameter
(
const
string
&
name
,
int
default_value
)
const
;
// Returns the FML function description for the feature function, i.e. the
// name and parameters without the nested features.
string
FunctionName
()
const
{
string
output
;
ToFMLFunction
(
*
descriptor_
,
&
output
);
return
output
;
}
// Returns the prefix for nested feature functions. This is the prefix of this
// feature function concatenated with the feature function name.
string
SubPrefix
()
const
{
return
prefix_
.
empty
()
?
FunctionName
()
:
prefix_
+
"."
+
FunctionName
();
}
// Returns/sets the feature extractor this function belongs to.
GenericFeatureExtractor
*
extractor
()
const
{
return
extractor_
;
}
void
set_extractor
(
GenericFeatureExtractor
*
extractor
)
{
extractor_
=
extractor
;
}
// Returns/sets the feature function descriptor.
FeatureFunctionDescriptor
*
descriptor
()
const
{
return
descriptor_
;
}
void
set_descriptor
(
FeatureFunctionDescriptor
*
descriptor
)
{
descriptor_
=
descriptor
;
}
// Returns a descriptive name for the feature function. The name is taken from
// the descriptor for the feature function. If the name is empty or the
// feature function is a variable the name is the FML representation of the
// feature, including the prefix.
string
name
()
const
{
string
output
;
if
(
descriptor_
->
name
().
empty
())
{
if
(
!
prefix_
.
empty
())
{
output
.
append
(
prefix_
);
output
.
append
(
"."
);
}
ToFML
(
*
descriptor_
,
&
output
);
}
else
{
output
=
descriptor_
->
name
();
}
tensorflow
::
StringPiece
stripped
(
output
);
utils
::
RemoveWhitespaceContext
(
&
stripped
);
return
stripped
.
ToString
();
}
// Returns the argument from the feature function descriptor. It defaults to
// 0 if the argument has not been specified.
int
argument
()
const
{
return
descriptor_
->
has_argument
()
?
descriptor_
->
argument
()
:
0
;
}
// Returns/sets/clears function name prefix.
const
string
&
prefix
()
const
{
return
prefix_
;
}
void
set_prefix
(
const
string
&
prefix
)
{
prefix_
=
prefix
;
}
protected:
// Returns the feature type for single-type feature functions.
FeatureType
*
feature_type
()
const
{
return
feature_type_
;
}
// Sets the feature type for single-type feature functions. This takes
// ownership of feature_type. Can only be called once.
void
set_feature_type
(
FeatureType
*
feature_type
)
{
CHECK
(
feature_type_
==
nullptr
);
feature_type_
=
feature_type
;
}
private:
// Feature extractor this feature function belongs to. Not owned.
GenericFeatureExtractor
*
extractor_
=
nullptr
;
// Descriptor for feature function. Not owned.
FeatureFunctionDescriptor
*
descriptor_
=
nullptr
;
// Feature type for features produced by this feature function. If the
// feature function produces features of multiple feature types this is null
// and the feature function must return it's feature types in
// GetFeatureTypes(). Owned.
FeatureType
*
feature_type_
=
nullptr
;
// Prefix used for sub-feature types of this function.
string
prefix_
;
};
// Feature function that can extract features from an object. Templated on
// two type arguments:
//
// OBJ: The "object" from which features are extracted; e.g., a sentence. This
// should be a plain type, rather than a reference or pointer.
//
// ARGS: A set of 0 or more types that are used to "index" into some part of the
// object that should be extracted, e.g. an int token index for a sentence
// object. This should not be a reference type.
template
<
class
OBJ
,
class
...
ARGS
>
class
FeatureFunction
:
public
GenericFeatureFunction
,
public
RegisterableClass
<
FeatureFunction
<
OBJ
,
ARGS
...
>
>
{
public:
using
Self
=
FeatureFunction
<
OBJ
,
ARGS
...
>
;
// Preprocesses the object. This will be called prior to calling Evaluate()
// or Compute() on that object.
virtual
void
Preprocess
(
WorkspaceSet
*
workspaces
,
OBJ
*
object
)
const
{}
// Appends features computed from the object and focus to the result. The
// default implementation delegates to Compute(), adding a single value if
// available. Multi-valued feature functions must override this method.
virtual
void
Evaluate
(
const
WorkspaceSet
&
workspaces
,
const
OBJ
&
object
,
ARGS
...
args
,
FeatureVector
*
result
)
const
{
FeatureValue
value
=
Compute
(
workspaces
,
object
,
args
...,
result
);
if
(
value
!=
kNone
)
result
->
add
(
feature_type
(),
value
);
}
// Returns a feature value computed from the object and focus, or kNone if no
// value is computed. Single-valued feature functions only need to override
// this method.
virtual
FeatureValue
Compute
(
const
WorkspaceSet
&
workspaces
,
const
OBJ
&
object
,
ARGS
...
args
,
const
FeatureVector
*
fv
)
const
{
return
kNone
;
}
// Instantiates a new feature function in a feature extractor from a feature
// descriptor.
static
Self
*
Instantiate
(
GenericFeatureExtractor
*
extractor
,
FeatureFunctionDescriptor
*
fd
,
const
string
&
prefix
)
{
Self
*
f
=
Self
::
Create
(
fd
->
type
());
f
->
set_extractor
(
extractor
);
f
->
set_descriptor
(
fd
);
f
->
set_prefix
(
prefix
);
return
f
;
}
// Returns the name of the registry for the feature function.
const
char
*
RegistryName
()
const
override
{
return
Self
::
registry
()
->
name
;
}
private:
// Special feature function class for resolving variable references. The type
// of the feature function is used for resolving the variable reference. When
// evaluated it will either get the feature value(s) from the variable portion
// of the feature vector, if present, or otherwise it will call the referenced
// feature extractor function directly to extract the feature(s).
class
Reference
;
};
// Base class for features with nested feature functions. The nested functions
// are of type NES, which may be different from the type of the parent function.
// NB: NestedFeatureFunction will ensure that all initialization of nested
// functions takes place during Setup() and Init() -- after the nested features
// are initialized, the parent feature is initialized via SetupNested() and
// InitNested(). Alternatively, a derived classes that overrides Setup() and
// Init() directly should call Parent::Setup(), Parent::Init(), etc. first.
//
// Note: NestedFeatureFunction cannot know how to call Preprocess, Evaluate, or
// Compute, since the nested functions may be of a different type.
template
<
class
NES
,
class
OBJ
,
class
...
ARGS
>
class
NestedFeatureFunction
:
public
FeatureFunction
<
OBJ
,
ARGS
...
>
{
public:
using
Parent
=
NestedFeatureFunction
<
NES
,
OBJ
,
ARGS
...
>
;
// Clean up nested functions.
~
NestedFeatureFunction
()
override
{
utils
::
STLDeleteElements
(
&
nested_
);
}
// By default, just appends the nested feature types.
void
GetFeatureTypes
(
vector
<
FeatureType
*>
*
types
)
const
override
{
CHECK
(
!
this
->
nested
().
empty
())
<<
"Nested features require nested features to be defined."
;
for
(
auto
*
function
:
nested_
)
function
->
GetFeatureTypes
(
types
);
}
// Sets up the nested features.
void
Setup
(
TaskContext
*
context
)
override
{
CreateNested
(
this
->
extractor
(),
this
->
descriptor
(),
&
nested_
,
this
->
SubPrefix
());
for
(
auto
*
function
:
nested_
)
function
->
Setup
(
context
);
SetupNested
(
context
);
}
// Sets up this NestedFeatureFunction specifically.
virtual
void
SetupNested
(
TaskContext
*
context
)
{}
// Initializes the nested features.
void
Init
(
TaskContext
*
context
)
override
{
for
(
auto
*
function
:
nested_
)
function
->
Init
(
context
);
InitNested
(
context
);
}
// Initializes this NestedFeatureFunction specifically.
virtual
void
InitNested
(
TaskContext
*
context
)
{}
// Gets all the workspaces needed for the nested functions.
void
RequestWorkspaces
(
WorkspaceRegistry
*
registry
)
override
{
for
(
auto
*
function
:
nested_
)
function
->
RequestWorkspaces
(
registry
);
}
// Returns the list of nested feature functions.
const
vector
<
NES
*>
&
nested
()
const
{
return
nested_
;
}
// Instantiates nested feature functions for a feature function. Creates and
// initializes one feature function for each sub-descriptor in the feature
// descriptor.
static
void
CreateNested
(
GenericFeatureExtractor
*
extractor
,
FeatureFunctionDescriptor
*
fd
,
vector
<
NES
*>
*
functions
,
const
string
&
prefix
)
{
for
(
int
i
=
0
;
i
<
fd
->
feature_size
();
++
i
)
{
FeatureFunctionDescriptor
*
sub
=
fd
->
mutable_feature
(
i
);
NES
*
f
=
NES
::
Instantiate
(
extractor
,
sub
,
prefix
);
functions
->
push_back
(
f
);
}
}
protected:
// The nested feature functions, if any, in order of declaration in the
// feature descriptor. Owned.
vector
<
NES
*>
nested_
;
};
// Base class for a nested feature function that takes nested features with the
// same signature as these features, i.e. a meta feature. For this class, we can
// provide preprocessing of the nested features.
template
<
class
OBJ
,
class
...
ARGS
>
class
MetaFeatureFunction
:
public
NestedFeatureFunction
<
FeatureFunction
<
OBJ
,
ARGS
...
>
,
OBJ
,
ARGS
...
>
{
public:
// Preprocesses using the nested features.
void
Preprocess
(
WorkspaceSet
*
workspaces
,
OBJ
*
object
)
const
override
{
for
(
auto
*
function
:
this
->
nested_
)
{
function
->
Preprocess
(
workspaces
,
object
);
}
}
};
// Template for a special type of locator: The locator of type
// FeatureFunction<OBJ, ARGS...> calls nested functions of type
// FeatureFunction<OBJ, IDX, ARGS...>, where the derived class DER is
// responsible for translating by providing the following:
//
// // Gets the new additional focus.
// IDX GetFocus(const WorkspaceSet &workspaces, const OBJ &object);
//
// This is useful to e.g. add a token focus to a parser state based on some
// desired property of that state.
template
<
class
DER
,
class
OBJ
,
class
IDX
,
class
...
ARGS
>
class
FeatureAddFocusLocator
:
public
NestedFeatureFunction
<
FeatureFunction
<
OBJ
,
IDX
,
ARGS
...
>
,
OBJ
,
ARGS
...
>
{
public:
void
Preprocess
(
WorkspaceSet
*
workspaces
,
OBJ
*
object
)
const
override
{
for
(
auto
*
function
:
this
->
nested_
)
{
function
->
Preprocess
(
workspaces
,
object
);
}
}
void
Evaluate
(
const
WorkspaceSet
&
workspaces
,
const
OBJ
&
object
,
ARGS
...
args
,
FeatureVector
*
result
)
const
override
{
IDX
focus
=
static_cast
<
const
DER
*>
(
this
)
->
GetFocus
(
workspaces
,
object
,
args
...);
for
(
auto
*
function
:
this
->
nested
())
{
function
->
Evaluate
(
workspaces
,
object
,
focus
,
args
...,
result
);
}
}
// Returns the first nested feature's computed value.
FeatureValue
Compute
(
const
WorkspaceSet
&
workspaces
,
const
OBJ
&
object
,
ARGS
...
args
,
const
FeatureVector
*
result
)
const
override
{
IDX
focus
=
static_cast
<
const
DER
*>
(
this
)
->
GetFocus
(
workspaces
,
object
,
args
...);
return
this
->
nested
()[
0
]
->
Compute
(
workspaces
,
object
,
focus
,
args
...,
result
);
}
};
// CRTP feature locator class. This is a meta feature that modifies ARGS and
// then calls the nested feature functions with the modified ARGS. Note that in
// order for this template to work correctly, all of ARGS must be types for
// which the reference operator & can be interpreted as a pointer to the
// argument. The derived class DER must implement the UpdateFocus method which
// takes pointers to the ARGS arguments:
//
// // Updates the current arguments.
// void UpdateArgs(const OBJ &object, ARGS *...args) const;
template
<
class
DER
,
class
OBJ
,
class
...
ARGS
>
class
FeatureLocator
:
public
MetaFeatureFunction
<
OBJ
,
ARGS
...
>
{
public:
// Feature locators have an additional check that there is no intrinsic type.
void
GetFeatureTypes
(
vector
<
FeatureType
*>
*
types
)
const
override
{
CHECK
(
this
->
feature_type
()
==
nullptr
)
<<
"FeatureLocators should not have an intrinsic type."
;
MetaFeatureFunction
<
OBJ
,
ARGS
...
>::
GetFeatureTypes
(
types
);
}
// Evaluates the locator.
void
Evaluate
(
const
WorkspaceSet
&
workspaces
,
const
OBJ
&
object
,
ARGS
...
args
,
FeatureVector
*
result
)
const
override
{
static_cast
<
const
DER
*>
(
this
)
->
UpdateArgs
(
workspaces
,
object
,
&
args
...);
for
(
auto
*
function
:
this
->
nested
())
{
function
->
Evaluate
(
workspaces
,
object
,
args
...,
result
);
}
}
// Returns the first nested feature's computed value.
FeatureValue
Compute
(
const
WorkspaceSet
&
workspaces
,
const
OBJ
&
object
,
ARGS
...
args
,
const
FeatureVector
*
result
)
const
override
{
static_cast
<
const
DER
*>
(
this
)
->
UpdateArgs
(
workspaces
,
object
,
&
args
...);
return
this
->
nested
()[
0
]
->
Compute
(
workspaces
,
object
,
args
...,
result
);
}
};
// Feature extractor for extracting features from objects of a certain class.
// Template type parameters are as defined for FeatureFunction.
template <class OBJ, class... ARGS>
class FeatureExtractor : public GenericFeatureExtractor {
 public:
  // Feature function type for top-level functions in the feature extractor.
  typedef FeatureFunction<OBJ, ARGS...> Function;
  typedef FeatureExtractor<OBJ, ARGS...> Self;

  // Feature locator type for the feature extractor.
  template <class DER>
  using Locator = FeatureLocator<DER, OBJ, ARGS...>;

  // Initializes feature extractor.
  FeatureExtractor() {}

  ~FeatureExtractor() override { utils::STLDeleteElements(&functions_); }

  // Sets up the feature extractor. Note that only top-level functions exist
  // until Setup() is called. This does not take ownership over the context,
  // which must outlive this.
  void Setup(TaskContext *context) {
    for (Function *function : functions_) function->Setup(context);
  }

  // Initializes the feature extractor. Must be called after Setup(). This
  // does not take ownership over the context, which must outlive this.
  void Init(TaskContext *context) {
    for (Function *function : functions_) function->Init(context);
    this->InitializeFeatureTypes();
  }

  // Requests workspaces from the registry. Must be called after Init(), and
  // before Preprocess(). Does not take ownership over registry. This should be
  // the same registry used to initialize the WorkspaceSet used in Preprocess()
  // and ExtractFeatures(). NB: This is a different ordering from that used in
  // SentenceFeatureRepresentation style feature computation.
  void RequestWorkspaces(WorkspaceRegistry *registry) {
    for (auto *function : functions_) function->RequestWorkspaces(registry);
  }

  // Preprocesses the object using feature functions for the phase. Must be
  // called before any calls to ExtractFeatures() on that object and phase.
  void Preprocess(WorkspaceSet *workspaces, OBJ *object) const {
    for (Function *function : functions_) {
      function->Preprocess(workspaces, object);
    }
  }

  // Extracts features from an object with a focus. This invokes all the
  // top-level feature functions in the feature extractor. Only feature
  // functions belonging to the specified phase are invoked.
  void ExtractFeatures(const WorkspaceSet &workspaces, const OBJ &object,
                       ARGS... args, FeatureVector *result) const {
    result->reserve(this->feature_types());

    // Extract features.
    for (int i = 0; i < functions_.size(); ++i) {
      functions_[i]->Evaluate(workspaces, object, args..., result);
    }
  }

 private:
  // Creates and initializes all feature functions in the feature extractor.
  void InitializeFeatureFunctions() override {
    // Create all top-level feature functions.
    for (int i = 0; i < descriptor().feature_size(); ++i) {
      FeatureFunctionDescriptor *fd = mutable_descriptor()->mutable_feature(i);
      Function *function = Function::Instantiate(this, fd, "");
      functions_.push_back(function);
    }
  }

  // Collect all feature types used in the feature extractor.
  void GetFeatureTypes(vector<FeatureType *> *types) const override {
    for (int i = 0; i < functions_.size(); ++i) {
      functions_[i]->GetFeatureTypes(types);
    }
  }

  // Top-level feature functions (and variables) in the feature extractor.
  // Owned.
  vector<Function *> functions_;
};
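// Illustrative wiring only (not part of this header): the comments above
// imply a strict call order -- Setup(), then Init(), then RequestWorkspaces(),
// then Preprocess() once per object, then ExtractFeatures() per focus. The
// concrete names below (Sentence as OBJ, an int focus, WorkspaceSet::Reset(),
// and the inherited Parse() entry point) are assumptions for this sketch.
inline void ExtractAllFeaturesExample(const string &fml_spec,
                                      TaskContext *context,
                                      Sentence *sentence) {
  FeatureExtractor<Sentence, int> extractor;
  extractor.Parse(fml_spec);               // parse the FML feature spec
  extractor.Setup(context);                // 1. resolve task parameters
  extractor.Init(context);                 // 2. after Setup(): feature types
  WorkspaceRegistry registry;
  extractor.RequestWorkspaces(&registry);  // 3. after Init()
  WorkspaceSet workspaces;
  workspaces.Reset(registry);
  extractor.Preprocess(&workspaces, sentence);  // 4. once per object
  for (int focus = 0; focus < sentence->token_size(); ++focus) {
    FeatureVector features;
    extractor.ExtractFeatures(workspaces, *sentence, focus, &features);  // 5.
  }
}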
#define REGISTER_FEATURE_FUNCTION(base, name, component) \
REGISTER_CLASS_COMPONENT(base, name, component)
}  // namespace syntaxnet
#endif // $TARGETDIR_FEATURE_EXTRACTOR_H_
syntaxnet/syntaxnet/feature_extractor.proto
0 → 100644
// Protocol buffers for feature extractor.

syntax = "proto2";

package syntaxnet;

message Parameter {
  optional string name = 1;
  optional string value = 2;
}

// Descriptor for feature function.
message FeatureFunctionDescriptor {
  // Feature function type.
  required string type = 1;

  // Feature function name.
  optional string name = 2;

  // Default argument for feature function.
  optional int32 argument = 3 [default = 0];

  // Named parameters for feature descriptor.
  repeated Parameter parameter = 4;

  // Nested sub-feature function descriptors.
  repeated FeatureFunctionDescriptor feature = 7;
};

// Descriptor for feature extractor.
message FeatureExtractorDescriptor {
  // Top-level feature function for extractor.
  repeated FeatureFunctionDescriptor feature = 1;
};
syntaxnet/syntaxnet/feature_types.h
0 → 100644
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Common feature types for parser components.
#ifndef $TARGETDIR_FEATURE_TYPES_H_
#define $TARGETDIR_FEATURE_TYPES_H_
#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include "syntaxnet/utils.h"
namespace syntaxnet {
// Use the same type for feature values as is used for predicates.
typedef int64 Predicate;
typedef Predicate FeatureValue;
// Each feature value in a feature vector has a feature type. The feature type
// is used for converting feature type and value pairs to predicate values. The
// feature type can also return names for feature values and calculate the size
// of the feature value domain. The FeatureType class is abstract and must be
// specialized for the concrete feature types.
class FeatureType {
 public:
  // Initializes a feature type.
  explicit FeatureType(const string &name) : name_(name), base_(0) {}

  virtual ~FeatureType() {}

  // Converts a feature value to a name.
  virtual string GetFeatureValueName(FeatureValue value) const = 0;

  // Returns the size of the feature values domain.
  virtual int64 GetDomainSize() const = 0;

  // Returns the feature type name.
  const string &name() const { return name_; }

  Predicate base() const { return base_; }
  void set_base(Predicate base) { base_ = base; }

 private:
  // Feature type name.
  string name_;

  // "Base" feature value: i.e. a "slot" in a global ordering of features.
  Predicate base_;
};
// Templated generic resource based feature type. This feature type delegates
// look up of feature value names to an unknown resource class, which is not
// owned. Optionally, this type can also store a mapping of extra values which
// are not in the resource.
//
// Note: this class assumes that Resource->GetFeatureValueName() will return
// successfully for values ONLY in the range [0, Resource->NumValues()). Any
// feature value not in the extra value map and not in the above range of
// Resource will result in an ERROR and return of "<INVALID>".
template <class Resource>
class ResourceBasedFeatureType : public FeatureType {
 public:
  // Creates a new type with given name, resource object, and a mapping of
  // special values. The values must be greater or equal to
  // resource->NumValues() so as to avoid collisions; this is verified with
  // CHECK at creation.
  ResourceBasedFeatureType(const string &name, const Resource *resource,
                           const map<FeatureValue, string> &values)
      : FeatureType(name), resource_(resource), values_(values) {
    max_value_ = resource->NumValues() - 1;
    for (const auto &pair : values) {
      CHECK_GE(pair.first, resource->NumValues())
          << "Invalid extra value: " << pair.first << "," << pair.second;
      max_value_ = pair.first > max_value_ ? pair.first : max_value_;
    }
  }

  // Creates a new type with no special values.
  ResourceBasedFeatureType(const string &name, const Resource *resource)
      : ResourceBasedFeatureType(name, resource, {}) {}

  // Returns the feature name for a given feature value. First checks the
  // values map, then checks the resource to look up the name.
  string GetFeatureValueName(FeatureValue value) const override {
    if (values_.find(value) != values_.end()) {
      return values_.find(value)->second;
    }
    if (value >= 0 && value < resource_->NumValues()) {
      return resource_->GetFeatureValueName(value);
    } else {
      LOG(ERROR) << "Invalid feature value " << value << " for " << name();
      return "<INVALID>";
    }
  }

  // Returns the number of possible values for this feature type. This is
  // based on the largest value that was observed in the extra values.
  FeatureValue GetDomainSize() const override { return max_value_ + 1; }

 protected:
  // Shared resource. Not owned.
  const Resource *resource_ = nullptr;

  // Maximum possible value this feature could take.
  FeatureValue max_value_;

  // Mapping for extra feature values not in the resource.
  map<FeatureValue, string> values_;
};
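// A minimal sketch of the contract above, using a made-up resource class
// (nothing below is defined by this library): names are valid only for
// values in [0, NumValues()), and extra values must start at NumValues()
// or above to avoid collisions.
class TagNameResourceExample {
 public:
  int64 NumValues() const { return names_.size(); }
  string GetFeatureValueName(FeatureValue value) const { return names_[value]; }

 private:
  vector<string> names_ = {"NOUN", "VERB", "ADJ"};
};

inline void ResourceBasedFeatureTypeExample(
    const TagNameResourceExample *resource) {
  // One extra value placed at resource->NumValues() for unknowns; the domain
  // size becomes 4, value 3 maps to "<UNKNOWN>", and 0..2 map to the names.
  ResourceBasedFeatureType<TagNameResourceExample> type(
      "tag", resource, {{resource->NumValues(), "<UNKNOWN>"}});
}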
// Feature type that is defined using an explicit map from FeatureValue to
// string values. This can reduce some of the boilerplate when defining
// features that generate enum values. Example usage:
//
//   class BeverageSizeFeature : public FeatureFunction<Beverage> {
//     enum FeatureValue { SMALL, MEDIUM, LARGE };  // values for this feature
//     void Init(TaskContext *context) override {
//       set_feature_type(new EnumFeatureType("beverage_size",
//           {{SMALL, "SMALL"}, {MEDIUM, "MEDIUM"}, {LARGE, "LARGE"}}));
//     }
//     [...]
//   };
class EnumFeatureType : public FeatureType {
 public:
  EnumFeatureType(const string &name,
                  const map<FeatureValue, string> &value_names)
      : FeatureType(name), value_names_(value_names) {
    for (const auto &pair : value_names) {
      CHECK_GE(pair.first, 0) << "Invalid feature value: " << pair.first
                              << ", " << pair.second;
      domain_size_ = std::max(domain_size_, pair.first + 1);
    }
  }

  // Returns the feature name for a given feature value.
  string GetFeatureValueName(FeatureValue value) const override {
    auto it = value_names_.find(value);
    if (it == value_names_.end()) {
      LOG(ERROR) << "Invalid feature value " << value << " for " << name();
      return "<INVALID>";
    }
    return it->second;
  }

  // Returns the number of possible values for this feature type. This is one
  // greater than the largest value in the value_names map.
  FeatureValue GetDomainSize() const override { return domain_size_; }

 protected:
  // Maximum possible value this feature could take.
  FeatureValue domain_size_ = 0;

  // Names of feature values.
  map<FeatureValue, string> value_names_;
};
}  // namespace syntaxnet
#endif // $TARGETDIR_FEATURE_TYPES_H_
syntaxnet/syntaxnet/fml_parser.cc
0 → 100644
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/fml_parser.h"
#include <ctype.h>
#include <string>
#include "syntaxnet/utils.h"
#include "tensorflow/core/lib/strings/strcat.h"
namespace syntaxnet {

void FMLParser::Initialize(const string &source) {
  // Initialize parser state.
  source_ = source;
  current_ = source_.begin();
  item_start_ = line_start_ = current_;
  line_number_ = item_line_number_ = 1;

  // Read first input item.
  NextItem();
}

void FMLParser::Error(const string &error_message) {
  LOG(FATAL) << "Error in feature model, line " << item_line_number_
             << ", position " << (item_start_ - line_start_ + 1)
             << ": " << error_message << "\n"
             << string(line_start_, current_) << " <--HERE";
}

void FMLParser::Next() {
  // Move to the next input character. If we are at a line break update line
  // number and line start position.
  if (*current_ == '\n') {
    ++line_number_;
    ++current_;
    line_start_ = current_;
  } else {
    ++current_;
  }
}
void FMLParser::NextItem() {
  // Skip white space and comments.
  while (!eos()) {
    if (*current_ == '#') {
      // Skip comment.
      while (!eos() && *current_ != '\n') Next();
    } else if (isspace(*current_)) {
      // Skip whitespace.
      while (!eos() && isspace(*current_)) Next();
    } else {
      break;
    }
  }

  // Record start position for next item.
  item_start_ = current_;
  item_line_number_ = line_number_;

  // Check for end of input.
  if (eos()) {
    item_type_ = END;
    return;
  }

  // Parse number.
  if (isdigit(*current_) || *current_ == '+' || *current_ == '-') {
    string::iterator start = current_;
    Next();
    while (isdigit(*current_) || *current_ == '.') Next();
    item_text_.assign(start, current_);
    item_type_ = NUMBER;
    return;
  }

  // Parse string.
  if (*current_ == '"') {
    Next();
    string::iterator start = current_;
    while (*current_ != '"') {
      if (eos()) Error("Unterminated string");
      Next();
    }
    item_text_.assign(start, current_);
    item_type_ = STRING;
    Next();
    return;
  }

  // Parse identifier name.
  if (isalpha(*current_) || *current_ == '_' || *current_ == '/') {
    string::iterator start = current_;
    while (isalnum(*current_) || *current_ == '_' || *current_ == '-' ||
           *current_ == '/')
      Next();
    item_text_.assign(start, current_);
    item_type_ = NAME;
    return;
  }

  // Single character item.
  item_type_ = *current_;
  Next();
}
void FMLParser::Parse(const string &source,
                      FeatureExtractorDescriptor *result) {
  // Initialize parser.
  Initialize(source);

  while (item_type_ != END) {
    // Parse either a parameter name or a feature.
    if (item_type_ != NAME) Error("Feature type name expected");
    string name = item_text_;
    NextItem();

    if (item_type_ == '=') {
      Error("Invalid syntax: feature expected");
    } else {
      // Parse feature.
      FeatureFunctionDescriptor *descriptor = result->add_feature();
      descriptor->set_type(name);
      ParseFeature(descriptor);
    }
  }
}

void FMLParser::ParseFeature(FeatureFunctionDescriptor *result) {
  // Parse argument and parameters.
  if (item_type_ == '(') {
    NextItem();
    ParseParameter(result);
    while (item_type_ == ',') {
      NextItem();
      ParseParameter(result);
    }

    if (item_type_ != ')') Error(") expected");
    NextItem();
  }

  // Parse feature name.
  if (item_type_ == ':') {
    NextItem();
    if (item_type_ != NAME && item_type_ != STRING) {
      Error("Feature name expected");
    }
    string name = item_text_;
    NextItem();

    // Set feature name.
    result->set_name(name);
  }

  // Parse sub-features.
  if (item_type_ == '.') {
    // Parse dotted sub-feature.
    NextItem();
    if (item_type_ != NAME) Error("Feature type name expected");
    string type = item_text_;
    NextItem();

    // Parse sub-feature.
    FeatureFunctionDescriptor *subfeature = result->add_feature();
    subfeature->set_type(type);
    ParseFeature(subfeature);
  } else if (item_type_ == '{') {
    // Parse sub-feature block.
    NextItem();
    while (item_type_ != '}') {
      if (item_type_ != NAME) Error("Feature type name expected");
      string type = item_text_;
      NextItem();

      // Parse sub-feature.
      FeatureFunctionDescriptor *subfeature = result->add_feature();
      subfeature->set_type(type);
      ParseFeature(subfeature);
    }
    NextItem();
  }
}
void FMLParser::ParseParameter(FeatureFunctionDescriptor *result) {
  if (item_type_ == NUMBER) {
    int argument =
        utils::ParseUsing<int>(item_text_, tensorflow::strings::safe_strto32);
    NextItem();

    // Set default argument for feature.
    result->set_argument(argument);
  } else if (item_type_ == NAME) {
    string name = item_text_;
    NextItem();
    if (item_type_ != '=') Error("= expected");
    NextItem();
    if (item_type_ >= END) Error("Parameter value expected");
    string value = item_text_;
    NextItem();

    // Add parameter to feature.
    Parameter *parameter;
    parameter = result->add_parameter();
    parameter->set_name(name);
    parameter->set_value(value);
  } else {
    Error("Syntax error in parameter list");
  }
}
void ToFMLFunction(const FeatureFunctionDescriptor &function, string *output) {
  output->append(function.type());
  if (function.argument() != 0 || function.parameter_size() > 0) {
    output->append("(");
    bool first = true;
    if (function.argument() != 0) {
      tensorflow::strings::StrAppend(output, function.argument());
      first = false;
    }
    for (int i = 0; i < function.parameter_size(); ++i) {
      if (!first) output->append(",");
      output->append(function.parameter(i).name());
      output->append("=");
      output->append("\"");
      output->append(function.parameter(i).value());
      output->append("\"");
      first = false;
    }
    output->append(")");
  }
}
void ToFML(const FeatureFunctionDescriptor &function, string *output) {
  ToFMLFunction(function, output);
  if (function.feature_size() == 1) {
    output->append(".");
    ToFML(function.feature(0), output);
  } else if (function.feature_size() > 1) {
    output->append(" { ");
    for (int i = 0; i < function.feature_size(); ++i) {
      if (i > 0) output->append(" ");
      ToFML(function.feature(i), output);
    }
    output->append(" } ");
  }
}

void ToFML(const FeatureExtractorDescriptor &extractor, string *output) {
  for (int i = 0; i < extractor.feature_size(); ++i) {
    ToFML(extractor.feature(i), output);
    output->append("\n");
  }
}
string AsFML(const FeatureFunctionDescriptor &function) {
  string str;
  ToFML(function, &str);
  return str;
}

string AsFML(const FeatureExtractorDescriptor &extractor) {
  string str;
  ToFML(extractor, &str);
  return str;
}

void StripFML(string *fml_string) {
  auto it = fml_string->begin();
  while (it != fml_string->end()) {
    if (*it == '"') {
      it = fml_string->erase(it);
    } else {
      ++it;
    }
  }
}

}  // namespace syntaxnet
syntaxnet/syntaxnet/fml_parser.h
0 → 100644
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Feature modeling language (fml) parser.
//
// BNF grammar for fml:
//
// <feature model> ::= { <feature extractor> }
//
// <feature extractor> ::= <extractor spec> |
// <extractor spec> '.' <feature extractor> |
// <extractor spec> '{' { <feature extractor> } '}'
//
// <extractor spec> ::= <extractor type>
// [ '(' <parameter list> ')' ]
// [ ':' <extractor name> ]
//
// <parameter list> = ( <parameter> | <argument> ) { ',' <parameter> }
//
// <parameter> ::= <parameter name> '=' <parameter value>
//
// <extractor type> ::= NAME
// <extractor name> ::= NAME | STRING
// <argument> ::= NUMBER
// <parameter name> ::= NAME
// <parameter value> ::= NUMBER | STRING | NAME
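//
// For example (the feature type and parameter names here are purely
// illustrative and are not names defined by this package), the following
// strings are all valid under the grammar above:
//
//   stack(1).word
//   input(2, min-freq=5) { word tag }
//   input.tag:"focus-tag"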
#ifndef $TARGETDIR_FML_PARSER_H_
#define $TARGETDIR_FML_PARSER_H_
#include <string>
#include "syntaxnet/utils.h"
#include "syntaxnet/feature_extractor.pb.h"
namespace syntaxnet {

class FMLParser {
 public:
  // Parses fml specification into feature extractor descriptor.
  void Parse(const string &source, FeatureExtractorDescriptor *result);

 private:
  // Initializes the parser with the source text.
  void Initialize(const string &source);

  // Outputs error message and exits.
  void Error(const string &error_message);

  // Moves to the next input character.
  void Next();

  // Moves to the next input item.
  void NextItem();

  // Parses a feature descriptor.
  void ParseFeature(FeatureFunctionDescriptor *result);

  // Parses a parameter specification.
  void ParseParameter(FeatureFunctionDescriptor *result);

  // Returns true if end of source input has been reached.
  bool eos() { return current_ == source_.end(); }

  // Item types.
  enum ItemTypes {
    END = 0,
    NAME = -1,
    NUMBER = -2,
    STRING = -3,
  };

  // Source text.
  string source_;

  // Current input position.
  string::iterator current_;

  // Line number for current input position.
  int line_number_;

  // Start position for current item.
  string::iterator item_start_;

  // Start position for current line.
  string::iterator line_start_;

  // Line number for current item.
  int item_line_number_;

  // Item type for current item. If this is positive it is interpreted as a
  // character. If it is negative it is interpreted as an item type.
  int item_type_;

  // Text for current item.
  string item_text_;
};

}  // namespace syntaxnet

#endif  // $TARGETDIR_FML_PARSER_H_
syntaxnet/syntaxnet/graph_builder.py
0 → 100644
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Builds parser models."""
import tensorflow as tf

import syntaxnet.load_parser_ops

from tensorflow.python.ops import control_flow_ops as cf
from tensorflow.python.ops import state_ops
from tensorflow.python.platform import logging
from syntaxnet.ops import gen_parser_ops
def BatchedSparseToDense(sparse_indices, output_size):
  """Batch compatible sparse to dense conversion.

  This is useful for one-hot coded target labels.

  Args:
    sparse_indices: [batch_size] tensor containing one index per batch
    output_size: needed in order to generate the correct dense output

  Returns:
    A [batch_size, output_size] dense tensor.
  """
  eye = tf.diag(tf.fill([output_size], tf.constant(1, tf.float32)))
  return tf.nn.embedding_lookup(eye, sparse_indices)


def EmbeddingLookupFeatures(params, sparse_features, allow_weights):
  """Computes embeddings for each entry of sparse features sparse_features.

  Args:
    params: list of 2D tensors containing vector embeddings
    sparse_features: 1D tensor of strings. Each entry is a string encoding of
      dist_belief.SparseFeatures, and represents a variable length list of
      feature ids, and optionally, corresponding weights values.
    allow_weights: boolean to control whether the weights returned from the
      SparseFeatures are used to multiply the embeddings.

  Returns:
    A tensor representing the combined embeddings for the sparse features.
    For each entry s in sparse_features, the function looks up the embeddings
    for each id and sums them into a single tensor weighing them by the
    weight of each id. It returns a tensor with each entry of sparse_features
    replaced by this combined embedding.
  """
  if not isinstance(params, list):
    params = [params]
  # Lookup embeddings.
  sparse_features = tf.convert_to_tensor(sparse_features)
  indices, ids, weights = gen_parser_ops.unpack_sparse_features(sparse_features)
  embeddings = tf.nn.embedding_lookup(params, ids)

  if allow_weights:
    # Multiply by weights, reshaping to allow broadcast.
    broadcast_weights_shape = tf.concat(0, [tf.shape(weights), [1]])
    embeddings *= tf.reshape(weights, broadcast_weights_shape)

  # Sum embeddings by index.
  return tf.unsorted_segment_sum(embeddings, indices, tf.size(sparse_features))
class GreedyParser(object):
  """Builds a Chen & Manning style greedy neural net parser.

  Builds a graph with an optional reader op connected at one end and
  operations needed to train the network on the other. Supports multiple
  network instantiations sharing the same parameters and network topology.

  The following named nodes are added to the training and eval networks:
    epochs: a tensor containing the current epoch number
    cost: a tensor containing the current training step cost
    gold_actions: a tensor containing actions from gold decoding
    feature_endpoints: a list of sparse feature vectors
    logits: output of the final layer before computing softmax
  The training network also contains:
    train_op: an op that executes a single training step

  Typical usage:

    parser = graph_builder.GreedyParser(num_actions, num_features,
                                        num_feature_ids, embedding_sizes,
                                        hidden_layer_sizes)
    parser.AddTraining(task_context, batch_size=5)
    with tf.Session('local') as sess:
      # This works because the session uses the same default graph as the
      # GraphBuilder did.
      sess.run(parser.inits.values())
      while True:
        tf_epoch, _ = sess.run([parser.training['epochs'],
                                parser.training['train_op']])
        if tf_epoch[0] > 0:
          break
  """

  def __init__(self,
               num_actions,
               num_features,
               num_feature_ids,
               embedding_sizes,
               hidden_layer_sizes,
               seed=None,
               gate_gradients=False,
               use_locking=False,
               embedding_init=1.0,
               relu_init=1e-4,
               bias_init=0.2,
               softmax_init=1e-4,
               averaging_decay=0.9999,
               use_averaging=True,
               check_parameters=True,
               check_every=1,
               allow_feature_weights=False,
               only_train='',
               arg_prefix=None,
               **unused_kwargs):
    """Initialize the graph builder with parameters defining the network.

    Args:
      num_actions: int size of the set of parser actions
      num_features: int list of dimensions of the feature vectors
      num_feature_ids: int list of same length as num_features corresponding to
        the sizes of the input feature spaces
      embedding_sizes: int list of same length as num_features of the desired
        embedding layer sizes
      hidden_layer_sizes: int list of desired relu layer sizes; may be empty
      seed: optional random initializer seed to enable reproducibility
      gate_gradients: if True, gradient updates are computed synchronously,
        ensuring consistency and reproducibility
      use_locking: if True, use locking to avoid read-write contention when
        updating Variables
      embedding_init: sets the std dev of normal initializer of embeddings to
        embedding_init / embedding_size ** .5
      relu_init: sets the std dev of normal initializer of relu weights
        to relu_init
      bias_init: sets constant initializer of relu bias to bias_init
      softmax_init: sets the std dev of normal initializer of softmax weights
        to softmax_init
      averaging_decay: decay for exponential moving average when computing
        averaged parameters, set to 1 to do vanilla averaging
      use_averaging: whether to use moving averages of parameters during evals
      check_parameters: whether to check for NaN/Inf parameters during
        training
      check_every: checks numerics every check_every steps.
      allow_feature_weights: whether feature weights are allowed.
      only_train: the comma separated set of parameter names to train. If
        empty, all model parameters will be trained.
      arg_prefix: prefix for context parameters.
    """
    self._num_actions = num_actions
    self._num_features = num_features
    self._num_feature_ids = num_feature_ids
    self._embedding_sizes = embedding_sizes
    self._hidden_layer_sizes = hidden_layer_sizes
    self._seed = seed
    self._gate_gradients = gate_gradients
    self._use_locking = use_locking
    self._use_averaging = use_averaging
    self._check_parameters = check_parameters
    self._check_every = check_every
    self._allow_feature_weights = allow_feature_weights
    self._only_train = set(only_train.split(',')) if only_train else None
    self._feature_size = len(embedding_sizes)
    self._embedding_init = embedding_init
    self._relu_init = relu_init
    self._softmax_init = softmax_init
    self._arg_prefix = arg_prefix
    # Parameters of the network with respect to which training is done.
    self.params = {}
    # Other variables, with respect to which no training is done, but which we
    # nonetheless need to save in order to capture the state of the graph.
    self.variables = {}
    # Operations to initialize any nodes that require initialization.
    self.inits = {}
    # Training- and eval-related nodes.
    self.training = {}
    self.evaluation = {}
    self.saver = None
    # Nodes to compute moving averages of parameters, called every train step.
    self._averaging = {}
    self._averaging_decay = averaging_decay
    # Pretrained embeddings that can be used instead of constant initializers.
    self._pretrained_embeddings = {}
    # After the following 'with' statement, we'll be able to re-enter the
    # 'params' scope by re-using the self._param_scope member variable. See for
    # instance _AddParam.
    with tf.name_scope('params') as self._param_scope:
      self._relu_bias_init = tf.constant_initializer(bias_init)
  @property
  def embedding_size(self):
    size = 0
    for i in range(self._feature_size):
      size += self._num_features[i] * self._embedding_sizes[i]
    return size

  def _AddParam(self,
                shape,
                dtype,
                name,
                initializer=None,
                return_average=False):
    """Add a model parameter w.r.t. which we expect to compute gradients.

    _AddParam creates both regular parameters (usually for training) and
    averaged nodes (usually for inference). It returns one or the other based
    on the 'return_average' arg.

    Args:
      shape: int list, tensor shape of the parameter to create
      dtype: tf.DataType, data type of the parameter
      name: string, name of the parameter in the TF graph
      initializer: optional initializer for the parameter
      return_average: if False, return parameter; otherwise return moving
        average

    Returns:
      parameter or averaged parameter
    """
    if name not in self.params:
      step = tf.cast(self.GetStep(), tf.float32)
      # Put all parameters and their initializing ops in their own scope
      # irrespective of the current scope (training or eval).
      with tf.name_scope(self._param_scope):
        self.params[name] = tf.get_variable(name, shape, dtype, initializer)
        param = self.params[name]
        if initializer is not None:
          self.inits[name] = state_ops.init_variable(param, initializer)
        if self._averaging_decay == 1:
          logging.info('Using vanilla averaging of parameters.')
          ema = tf.train.ExponentialMovingAverage(decay=(step / (step + 1.0)),
                                                  num_updates=None)
        else:
          ema = tf.train.ExponentialMovingAverage(decay=self._averaging_decay,
                                                  num_updates=step)
        self._averaging[name + '_avg_update'] = ema.apply([param])
        self.variables[name + '_avg_var'] = ema.average(param)
        self.inits[name + '_avg_init'] = state_ops.init_variable(
            ema.average(param), tf.zeros_initializer)
    return (self.variables[name + '_avg_var'] if return_average else
            self.params[name])
  def GetStep(self):
    def OnesInitializer(shape, dtype=tf.float32):
      return tf.ones(shape, dtype)
    return self._AddVariable([], tf.int32, 'step', OnesInitializer)

  def _AddVariable(self, shape, dtype, name, initializer=None):
    if name in self.variables:
      return self.variables[name]
    self.variables[name] = tf.get_variable(name, shape, dtype, initializer)
    if initializer is not None:
      self.inits[name] = state_ops.init_variable(self.variables[name],
                                                 initializer)
    return self.variables[name]

  def _ReluWeightInitializer(self):
    with tf.name_scope(self._param_scope):
      return tf.random_normal_initializer(stddev=self._relu_init,
                                          seed=self._seed)

  def _EmbeddingMatrixInitializer(self, index, embedding_size):
    if index in self._pretrained_embeddings:
      return self._pretrained_embeddings[index]
    else:
      return tf.random_normal_initializer(
          stddev=self._embedding_init / embedding_size**.5,
          seed=self._seed)

  def _AddEmbedding(self,
                    features,
                    num_features,
                    num_ids,
                    embedding_size,
                    index,
                    return_average=False):
    """Adds an embedding matrix and passes the `features` vector through it."""
    embedding_matrix = self._AddParam(
        [num_ids, embedding_size],
        tf.float32,
        'embedding_matrix_%d' % index,
        self._EmbeddingMatrixInitializer(index, embedding_size),
        return_average=return_average)
    embedding = EmbeddingLookupFeatures(embedding_matrix,
                                        tf.reshape(features,
                                                   [-1],
                                                   name='feature_%d' % index),
                                        self._allow_feature_weights)
    return tf.reshape(embedding, [-1, num_features * embedding_size])
  def _BuildNetwork(self, feature_endpoints, return_average=False):
    """Builds a feed-forward part of the net given features as input.

    The network topology is already defined in the constructor, so multiple
    calls to BuildForward build multiple networks whose parameters are all
    shared. It is the source of the input features and the use of the output
    that distinguishes each network.

    Args:
      feature_endpoints: tensors with input features to the network
      return_average: whether to use moving averages as model parameters

    Returns:
      logits: output of the final layer before computing softmax
    """
    assert len(feature_endpoints) == self._feature_size

    # Create embedding layer.
    embeddings = []
    for i in range(self._feature_size):
      embeddings.append(self._AddEmbedding(feature_endpoints[i],
                                           self._num_features[i],
                                           self._num_feature_ids[i],
                                           self._embedding_sizes[i],
                                           i,
                                           return_average=return_average))

    last_layer = tf.concat(1, embeddings)
    last_layer_size = self.embedding_size

    # Create ReLU layers.
    for i, hidden_layer_size in enumerate(self._hidden_layer_sizes):
      weights = self._AddParam(
          [last_layer_size, hidden_layer_size],
          tf.float32,
          'weights_%d' % i,
          self._ReluWeightInitializer(),
          return_average=return_average)
      bias = self._AddParam([hidden_layer_size],
                            tf.float32,
                            'bias_%d' % i,
                            self._relu_bias_init,
                            return_average=return_average)
      last_layer = tf.nn.relu_layer(last_layer,
                                    weights,
                                    bias,
                                    name='layer_%d' % i)
      last_layer_size = hidden_layer_size

    # Create softmax layer.
    softmax_weight = self._AddParam(
        [last_layer_size, self._num_actions],
        tf.float32,
        'softmax_weight',
        tf.random_normal_initializer(stddev=self._softmax_init,
                                     seed=self._seed),
        return_average=return_average)
    softmax_bias = self._AddParam(
        [self._num_actions],
        tf.float32,
        'softmax_bias',
        tf.zeros_initializer,
        return_average=return_average)
    logits = tf.nn.xw_plus_b(last_layer,
                             softmax_weight,
                             softmax_bias,
                             name='logits')
    return {'logits': logits}
  def _AddGoldReader(self, task_context, batch_size, corpus_name):
    features, epochs, gold_actions = (
        gen_parser_ops.gold_parse_reader(task_context,
                                         self._feature_size,
                                         batch_size,
                                         corpus_name=corpus_name,
                                         arg_prefix=self._arg_prefix))
    return {'gold_actions': tf.identity(gold_actions,
                                        name='gold_actions'),
            'epochs': tf.identity(epochs,
                                  name='epochs'),
            'feature_endpoints': features}

  def _AddDecodedReader(self, task_context, batch_size, transition_scores,
                        corpus_name):
    features, epochs, eval_metrics, documents = (
        gen_parser_ops.decoded_parse_reader(transition_scores,
                                            task_context,
                                            self._feature_size,
                                            batch_size,
                                            corpus_name=corpus_name,
                                            arg_prefix=self._arg_prefix))
    return {'eval_metrics': eval_metrics,
            'epochs': tf.identity(epochs,
                                  name='epochs'),
            'feature_endpoints': features,
            'documents': documents}

  def _AddCostFunction(self, batch_size, gold_actions, logits):
    """Cross entropy plus L2 loss on weights and biases of the hidden layers."""
    dense_golden = BatchedSparseToDense(gold_actions, self._num_actions)
    cross_entropy = tf.div(
        tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(
            logits, dense_golden)), batch_size)
    regularized_params = [tf.nn.l2_loss(p)
                          for k, p in self.params.items()
                          if k.startswith('weights') or k.startswith('bias')]
    l2_loss = 1e-4 * tf.add_n(regularized_params) if regularized_params else 0
    return {'cost': tf.add(cross_entropy, l2_loss, name='cost')}
  def AddEvaluation(self,
                    task_context,
                    batch_size,
                    evaluation_max_steps=300,
                    corpus_name='documents'):
    """Builds the forward network only without the training operation.

    Args:
      task_context: file path from which to read the task context.
      batch_size: batch size to request from reader op.
      evaluation_max_steps: max number of parsing actions during evaluation,
        only used in beam parsing.
      corpus_name: name of the task input to read parses from.

    Returns:
      Dictionary of named eval nodes.
    """
    def _AssignTransitionScores():
      return tf.assign(nodes['transition_scores'],
                       nodes['logits'], validate_shape=False)

    def _Pass():
      return tf.constant(-1.0)

    unused_evaluation_max_steps = evaluation_max_steps
    with tf.name_scope('evaluation'):
      nodes = self.evaluation
      nodes['transition_scores'] = self._AddVariable(
          [batch_size, self._num_actions], tf.float32, 'transition_scores',
          tf.constant_initializer(-1.0))
      nodes.update(self._AddDecodedReader(task_context, batch_size,
                                          nodes['transition_scores'],
                                          corpus_name))
      nodes.update(self._BuildNetwork(nodes['feature_endpoints'],
                                      return_average=self._use_averaging))
      nodes['eval_metrics'] = cf.with_dependencies(
          [tf.cond(tf.greater(tf.size(nodes['logits']), 0),
                   _AssignTransitionScores, _Pass)],
          nodes['eval_metrics'], name='eval_metrics')
    return nodes

  def _IncrementCounter(self, counter):
    return state_ops.assign_add(counter, 1, use_locking=True)

  def _AddLearningRate(self, initial_learning_rate, decay_steps):
    """Returns a learning rate that decays by 0.96 every decay_steps.

    Args:
      initial_learning_rate: initial value of the learning rate
      decay_steps: decay by 0.96 every this many steps

    Returns:
      learning rate variable.
    """
    step = self.GetStep()
    return cf.with_dependencies(
        [self._IncrementCounter(step)],
        tf.train.exponential_decay(initial_learning_rate,
                                   step,
                                   decay_steps,
                                   0.96,
                                   staircase=True))
  def AddPretrainedEmbeddings(self, index, embeddings_path, task_context):
    """Embeddings at the given index will be set to pretrained values."""

    def _Initializer(shape, dtype=tf.float32):
      unused_dtype = dtype
      t = gen_parser_ops.word_embedding_initializer(
          vectors=embeddings_path,
          task_context=task_context,
          embedding_init=self._embedding_init)
      t.set_shape(shape)
      return t

    self._pretrained_embeddings[index] = _Initializer

  def AddTraining(self,
                  task_context,
                  batch_size,
                  learning_rate=0.1,
                  decay_steps=4000,
                  momentum=0.9,
                  corpus_name='documents'):
    """Builds a trainer to minimize the cross entropy cost function.

    Args:
      task_context: file path from which to read the task context
      batch_size: batch size to request from reader op
      learning_rate: initial value of the learning rate
      decay_steps: decay learning rate by 0.96 every this many steps
      momentum: momentum parameter used when training with momentum
      corpus_name: name of the task input to read parses from

    Returns:
      Dictionary of named training nodes.
    """
    with tf.name_scope('training'):
      nodes = self.training
      nodes.update(self._AddGoldReader(task_context, batch_size, corpus_name))
      nodes.update(self._BuildNetwork(nodes['feature_endpoints'],
                                      return_average=False))
      nodes.update(self._AddCostFunction(batch_size, nodes['gold_actions'],
                                         nodes['logits']))
      # Add the optimizer
      if self._only_train:
        trainable_params = [v
                            for k, v in self.params.iteritems()
                            if k in self._only_train]
      else:
        trainable_params = self.params.values()
      lr = self._AddLearningRate(learning_rate, decay_steps)
      optimizer = tf.train.MomentumOptimizer(lr,
                                             momentum,
                                             use_locking=self._use_locking)
      train_op = optimizer.minimize(nodes['cost'], var_list=trainable_params)
      for param in trainable_params:
        slot = optimizer.get_slot(param, 'momentum')
        self.inits[slot.name] = state_ops.init_variable(slot,
                                                        tf.zeros_initializer)
        self.variables[slot.name] = slot
      numerical_checks = [
          tf.check_numerics(param,
                            message='Parameter is not finite.')
          for param in trainable_params
          if param.dtype.base_dtype in [tf.float32, tf.float64]
      ]
      check_op = tf.group(*numerical_checks)
      avg_update_op = tf.group(*self._averaging.values())
      train_ops = [train_op]
      if self._check_parameters:
        train_ops.append(check_op)
      if self._use_averaging:
        train_ops.append(avg_update_op)
      nodes['train_op'] = tf.group(*train_ops, name='train_op')
    return nodes

  def AddSaver(self, slim_model=False):
    """Adds ops to save and restore model parameters.

    Args:
      slim_model: whether only averaged variables are saved.

    Returns:
      the saver object.
    """
    # We have to put the save op in the root scope otherwise running
    # "save/restore_all" won't find the "save/Const" node it expects.
    with tf.name_scope(None):
      variables_to_save = self.params.copy()
      variables_to_save.update(self.variables)
      if slim_model:
        for key in variables_to_save.keys():
          if not key.endswith('avg_var'):
            del variables_to_save[key]
      self.saver = tf.train.Saver(variables_to_save)
    return self.saver
syntaxnet/syntaxnet/graph_builder_test.py
0 → 100644
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for graph_builder."""
# pylint: disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
import os.path

import tensorflow as tf

from tensorflow.python.framework import test_util
from tensorflow.python.ops import variables
from tensorflow.python.platform import googletest

from syntaxnet import graph_builder
from syntaxnet import sparse_pb2
from syntaxnet.ops import gen_parser_ops

FLAGS = tf.app.flags.FLAGS

if not hasattr(FLAGS, 'test_srcdir'):
  FLAGS.test_srcdir = ''
if not hasattr(FLAGS, 'test_tmpdir'):
  FLAGS.test_tmpdir = tf.test.get_temp_dir()


class GraphBuilderTest(test_util.TensorFlowTestCase):
  def setUp(self):
    # Creates a task context with the correct testing paths.
    initial_task_context = os.path.join(FLAGS.test_srcdir,
                                        'syntaxnet/'
                                        'testdata/context.pbtxt')
    self._task_context = os.path.join(FLAGS.test_tmpdir, 'context.pbtxt')
    with open(initial_task_context, 'r') as fin:
      with open(self._task_context, 'w') as fout:
        fout.write(fin.read().replace('SRCDIR', FLAGS.test_srcdir)
                   .replace('OUTPATH', FLAGS.test_tmpdir))

    # Creates necessary term maps.
    with self.test_session() as sess:
      gen_parser_ops.lexicon_builder(task_context=self._task_context,
                                     corpus_name='training-corpus').run()
      self._num_features, self._num_feature_ids, _, self._num_actions = (
          sess.run(gen_parser_ops.feature_size(
              task_context=self._task_context, arg_prefix='brain_parser')))

  def MakeBuilder(self, use_averaging=True, **kw_args):
    # Set the seed and gate_gradients to ensure reproducibility.
    return graph_builder.GreedyParser(
        self._num_actions, self._num_features, self._num_feature_ids,
        embedding_sizes=[8, 8, 8], hidden_layer_sizes=[32, 32], seed=42,
        gate_gradients=True, use_averaging=use_averaging, **kw_args)

  def FindNode(self, name):
    for node in tf.get_default_graph().as_graph_def().node:
      if node.name == name:
        return node
    return None

  def NodeFound(self, name):
    return self.FindNode(name) is not None
  def testScope(self):
    # Set up the network topology
    graph = tf.Graph()
    with graph.as_default():
      parser = self.MakeBuilder()
      parser.AddTraining(self._task_context,
                         batch_size=10,
                         corpus_name='training-corpus')
      parser.AddEvaluation(self._task_context,
                           batch_size=2,
                           corpus_name='tuning-corpus')
      parser.AddSaver()

      # Check that the node ids we may rely on are there with the expected
      # names.
      self.assertEqual(parser.training['logits'].name, 'training/logits:0')
      self.assertTrue(self.NodeFound('training/logits'))
      self.assertTrue(self.NodeFound('training/feature_0'))
      self.assertTrue(self.NodeFound('training/feature_1'))
      self.assertTrue(self.NodeFound('training/feature_2'))
      self.assertFalse(self.NodeFound('training/feature_3'))

      self.assertEqual(parser.evaluation['logits'].name, 'evaluation/logits:0')
      self.assertTrue(self.NodeFound('evaluation/logits'))

      # The saver node is expected to be in the root scope.
      self.assertTrue(self.NodeFound('save/restore_all'))

      # Also check that the parameters have the scope we expect.
      self.assertTrue(self.NodeFound('embedding_matrix_0'))
      self.assertTrue(self.NodeFound('embedding_matrix_1'))
      self.assertTrue(self.NodeFound('embedding_matrix_2'))
      self.assertFalse(self.NodeFound('embedding_matrix_3'))

  def testNestedScope(self):
    # It's OK to put the whole graph in a scope of its own.
    graph = tf.Graph()
    with graph.as_default():
      with graph.name_scope('top'):
        parser = self.MakeBuilder()
        parser.AddTraining(self._task_context,
                           batch_size=10,
                           corpus_name='training-corpus')
        parser.AddSaver()

        self.assertTrue(self.NodeFound('top/training/logits'))
        self.assertTrue(self.NodeFound('top/training/feature_0'))

        # The saver node is expected to be in the root scope no matter what.
        self.assertFalse(self.NodeFound('top/save/restore_all'))
        self.assertTrue(self.NodeFound('save/restore_all'))
  def testUseCustomGraphs(self):
    batch_size = 10

    # Use separate custom graphs.
    custom_train_graph = tf.Graph()
    with custom_train_graph.as_default():
      train_parser = self.MakeBuilder()
      train_parser.AddTraining(self._task_context,
                               batch_size,
                               corpus_name='training-corpus')

    custom_eval_graph = tf.Graph()
    with custom_eval_graph.as_default():
      eval_parser = self.MakeBuilder()
      eval_parser.AddEvaluation(self._task_context,
                                batch_size,
                                corpus_name='tuning-corpus')

    # The following session runs should not fail.
    with self.test_session(graph=custom_train_graph) as sess:
      self.assertTrue(self.NodeFound('training/logits'))
      sess.run(train_parser.inits.values())
      sess.run(['training/logits:0'])

    with self.test_session(graph=custom_eval_graph) as sess:
      self.assertFalse(self.NodeFound('training/logits'))
      self.assertTrue(self.NodeFound('evaluation/logits'))
      sess.run(eval_parser.inits.values())
      sess.run(['evaluation/logits:0'])

  def testTrainingAndEvalAreIndependent(self):
    batch_size = 10
    graph = tf.Graph()
    with graph.as_default():
      parser = self.MakeBuilder(use_averaging=False)
      parser.AddTraining(self._task_context,
                         batch_size,
                         corpus_name='training-corpus')
      parser.AddEvaluation(self._task_context,
                           batch_size,
                           corpus_name='tuning-corpus')
    with self.test_session(graph=graph) as sess:
      sess.run(parser.inits.values())
      # Before any training updates are performed, both training and eval nets
      # should return the same computations.
      eval_logits, = sess.run([parser.evaluation['logits']])
      training_logits, = sess.run([parser.training['logits']])
      self.assertNear(abs((eval_logits - training_logits).sum()), 0, 1e-6)

      # After training, activations should differ.
      for _ in range(5):
        eval_logits = parser.evaluation['logits'].eval()
      for _ in range(5):
        training_logits, _ = sess.run([parser.training['logits'],
                                       parser.training['train_op']])
      self.assertGreater(abs((eval_logits - training_logits).sum()), 0, 1e-3)
  def testReproducibility(self):
    batch_size = 10

    def ComputeACost(graph):
      with graph.as_default():
        parser = self.MakeBuilder(use_averaging=False)
        parser.AddTraining(self._task_context,
                           batch_size,
                           corpus_name='training-corpus')
        parser.AddEvaluation(self._task_context,
                             batch_size,
                             corpus_name='tuning-corpus')
      with self.test_session(graph=graph) as sess:
        sess.run(parser.inits.values())
        for _ in range(5):
          cost, _ = sess.run([parser.training['cost'],
                              parser.training['train_op']])
      return cost

    cost1 = ComputeACost(tf.Graph())
    cost2 = ComputeACost(tf.Graph())
    self.assertNear(cost1, cost2, 1e-8)

  def testAddTrainingAndEvalOrderIndependent(self):
    batch_size = 10

    graph1 = tf.Graph()
    with graph1.as_default():
      parser = self.MakeBuilder(use_averaging=False)
      parser.AddTraining(self._task_context,
                         batch_size,
                         corpus_name='training-corpus')
      parser.AddEvaluation(self._task_context,
                           batch_size,
                           corpus_name='tuning-corpus')
    with self.test_session(graph=graph1) as sess:
      sess.run(parser.inits.values())
      metrics1 = None
      for _ in range(500):
        cost1, _ = sess.run([parser.training['cost'],
                             parser.training['train_op']])
        em1 = parser.evaluation['eval_metrics'].eval()
        metrics1 = metrics1 + em1 if metrics1 is not None else em1

    # Reverse the order in which Training and Eval stacks are added.
    graph2 = tf.Graph()
    with graph2.as_default():
      parser = self.MakeBuilder(use_averaging=False)
      parser.AddEvaluation(self._task_context,
                           batch_size,
                           corpus_name='tuning-corpus')
      parser.AddTraining(self._task_context,
                         batch_size,
                         corpus_name='training-corpus')
    with self.test_session(graph=graph2) as sess:
      sess.run(parser.inits.values())
      metrics2 = None
      for _ in range(500):
        cost2, _ = sess.run([parser.training['cost'],
                             parser.training['train_op']])
        em2 = parser.evaluation['eval_metrics'].eval()
        metrics2 = metrics2 + em2 if metrics2 is not None else em2

    self.assertNear(cost1, cost2, 1e-8)
    self.assertEqual(abs(metrics1 - metrics2).sum(), 0)
  def testEvalMetrics(self):
    batch_size = 10
    graph = tf.Graph()
    with graph.as_default():
      parser = self.MakeBuilder()
      parser.AddEvaluation(self._task_context,
                           batch_size,
                           corpus_name='tuning-corpus')
    with self.test_session(graph=graph) as sess:
      sess.run(parser.inits.values())
      tokens = 0
      correct_heads = 0
      for _ in range(100):
        eval_metrics = sess.run(parser.evaluation['eval_metrics'])
        tokens += eval_metrics[0]
        correct_heads += eval_metrics[1]
      self.assertGreater(tokens, 0)
      self.assertGreaterEqual(tokens, correct_heads)
      self.assertGreaterEqual(correct_heads, 0)

  def MakeSparseFeatures(self, ids, weights):
    f = sparse_pb2.SparseFeatures()
    for i, w in zip(ids, weights):
      f.id.append(i)
      f.weight.append(w)
    return f.SerializeToString()

  def testEmbeddingOp(self):
    graph = tf.Graph()
    with self.test_session(graph=graph):
      params = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], tf.float32)

      var = variables.Variable([self.MakeSparseFeatures([1, 2], [1.0, 1.0]),
                                self.MakeSparseFeatures([], [])])
      var.initializer.run()
      embeddings = graph_builder.EmbeddingLookupFeatures(params, var,
                                                         True).eval()
      self.assertAllClose([[8.0, 10.0], [0.0, 0.0]], embeddings)

      var = variables.Variable([self.MakeSparseFeatures([], []),
                                self.MakeSparseFeatures([0, 2], [0.5, 2.0])])
      var.initializer.run()
      embeddings = graph_builder.EmbeddingLookupFeatures(params, var,
                                                         True).eval()
      self.assertAllClose([[0.0, 0.0], [10.5, 13.0]], embeddings)
  def testOnlyTrainSomeParameters(self):
    batch_size = 10
    graph = tf.Graph()
    with graph.as_default():
      parser = self.MakeBuilder(use_averaging=False, only_train='softmax_bias')
      parser.AddTraining(self._task_context,
                         batch_size,
                         corpus_name='training-corpus')
    with self.test_session(graph=graph) as sess:
      sess.run(parser.inits.values())
      # Before training, save the state of two of the parameters.
      bias0, weight0 = sess.run([parser.params['softmax_bias'],
                                 parser.params['softmax_weight']])
      for _ in range(5):
        bias, weight, _ = sess.run([parser.params['softmax_bias'],
                                    parser.params['softmax_weight'],
                                    parser.training['train_op']])
      # After training, only one of the parameters should have changed.
      self.assertAllEqual(weight, weight0)
      self.assertGreater(abs(bias - bias0).sum(), 0, 1e-5)


if __name__ == '__main__':
  googletest.main()
syntaxnet/syntaxnet/kbest_syntax.proto
0 → 100644
// K-best part-of-speech and dependency annotations for tokens.
syntax = "proto2";

import "syntaxnet/sentence.proto";

package syntaxnet;

// A list of alternative (k-best) syntax analyses, grouped by sentences.
message KBestSyntaxAnalyses {
  extend Sentence {
    optional KBestSyntaxAnalyses extension = 60366242;
  }

  // Alternative analyses for each sentence. Sentences are listed in the
  // order visited by a SentenceIterator.
  repeated KBestSyntaxAnalysesForSentence sentence = 1;

  // Alternative analyses for each token.
  repeated KBestSyntaxAnalysesForToken token = 2;
}

// A list of alternative (k-best) analyses for a sentence spanning from a start
// token index to an end token index. The alternative analyses are ordered by
// decreasing model score from best to worst. The first analysis is the 1-best
// analysis, which is typically also stored in the document tokens.
message KBestSyntaxAnalysesForSentence {
  // First token of sentence.
  optional int32 start = 1 [default = -1];

  // Last token of sentence.
  optional int32 end = 2 [default = -1];

  // K-best analyses for the tokens in this sentence. All of the analyses in
  // the list have the same "type"; e.g., k-best taggings,
  // k-best {tagging+parse}s, etc.
  // Note also that the type of analysis stored in this list can change
  // depending on where we are in the document processing pipeline; e.g.,
  // may initially be taggings, and then switch to parses. The first
  // token_analysis would be the 1-best analysis, which is typically also
  // stored in the document. Note: some post-processors will update the
  // document's syntax trees, but will leave these unchanged.
  repeated AlternativeTokenAnalysis token_analysis = 3;
}

// A list of scored alternative (k-best) analyses for a particular token. These
// are all distinct from each other and ordered by decreasing model score. The
// first is the 1-best analysis, which may or may not match the document tokens
// depending on how the k-best analyses are selected.
message KBestSyntaxAnalysesForToken {
  // All token analyses in this repeated field refer to the same token.
  // Each alternative analysis will contain a single entry for repeated fields
  // such as head, tag, category and label.
  repeated AlternativeTokenAnalysis token_analysis = 3;
}

// An alternative analysis of tokens in the document. The repeated fields
// are indexed relative to the beginning of a sentence. Fields not
// represented in the alternative analysis are assumed to be unchanged.
// Currently only alternatives for tags, categories and (labeled) dependency
// heads are supported.
// Each repeated field should either have length=0 or length=number of tokens.
message AlternativeTokenAnalysis {
  // Head of this token in the dependency tree: the id of the token which has
  // an arc going to this one. If it is the root token of a sentence, then it
  // is set to -1.
  repeated int32 head = 1;

  // Part-of-speech tag for token.
  repeated string tag = 2;

  // Coarse-grained word category for token.
  repeated string category = 3;

  // Label for dependency relation between this token and its head.
  repeated string label = 4;

  // The score of this analysis, where bigger values typically indicate better
  // quality, but there are no guarantees and there is also no pre-defined
  // range.
  optional double score = 5;
}
syntaxnet/syntaxnet/lexicon_builder.cc
0 → 100644
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <stddef.h>
#include <string>
#include "syntaxnet/utils.h"
#include "syntaxnet/affix.h"
#include "syntaxnet/dictionary.pb.h"
#include "syntaxnet/feature_extractor.h"
#include "syntaxnet/sentence_batch.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/term_frequency_map.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/env.h"
// A task that collects term statistics over a corpus and saves a set of
// term maps; these saved mappings are used to map strings to ints in both the
// chunker trainer and the chunker processors.
using tensorflow::DEVICE_CPU;
using tensorflow::DT_INT32;
using tensorflow::OpKernel;
using tensorflow::OpKernelConstruction;
using tensorflow::OpKernelContext;
using tensorflow::Tensor;
using tensorflow::TensorShape;
using tensorflow::errors::InvalidArgument;

namespace syntaxnet {
// A workflow task that creates term maps (e.g., word, tag, etc.).
//
// Non-flag task parameters:
// int lexicon_max_prefix_length (3):
// The maximum prefix length for lexicon words.
// int lexicon_max_suffix_length (3):
// The maximum suffix length for lexicon words.
class LexiconBuilder : public OpKernel {
 public:
  explicit LexiconBuilder(OpKernelConstruction *context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("corpus_name", &corpus_name_));
    OP_REQUIRES_OK(context, context->GetAttr("lexicon_max_prefix_length",
                                             &max_prefix_length_));
    OP_REQUIRES_OK(context, context->GetAttr("lexicon_max_suffix_length",
                                             &max_suffix_length_));

    string file_path, data;
    OP_REQUIRES_OK(context, context->GetAttr("task_context", &file_path));
    OP_REQUIRES_OK(context, ReadFileToString(tensorflow::Env::Default(),
                                             file_path, &data));
    OP_REQUIRES(context,
                TextFormat::ParseFromString(data, task_context_.mutable_spec()),
                InvalidArgument("Could not parse task context at ", file_path));
  }
// Counts term frequencies.
void
Compute
(
OpKernelContext
*
context
)
override
{
// Term frequency maps to be populated by the corpus.
TermFrequencyMap
words
;
TermFrequencyMap
lcwords
;
TermFrequencyMap
tags
;
TermFrequencyMap
categories
;
TermFrequencyMap
labels
;
// Affix tables to be populated by the corpus.
AffixTable
prefixes
(
AffixTable
::
PREFIX
,
max_prefix_length_
);
AffixTable
suffixes
(
AffixTable
::
SUFFIX
,
max_suffix_length_
);
// Tag-to-category mapping.
TagToCategoryMap
tag_to_category
;
// Make a pass over the corpus.
int64
num_tokens
=
0
;
int64
num_documents
=
0
;
Sentence
*
document
;
TextReader
corpus
(
*
task_context_
.
GetInput
(
corpus_name_
));
while
((
document
=
corpus
.
Read
())
!=
NULL
)
{
// Gather token information.
for
(
int
t
=
0
;
t
<
document
->
token_size
();
++
t
)
{
// Get token and lowercased word.
const
Token
&
token
=
document
->
token
(
t
);
string
word
=
token
.
word
();
utils
::
NormalizeDigits
(
&
word
);
string
lcword
=
tensorflow
::
str_util
::
Lowercase
(
word
);
// Make sure the token does not contain a newline.
CHECK
(
lcword
.
find
(
'\n'
)
==
string
::
npos
);
// Increment frequencies (only for terms that exist).
if
(
!
word
.
empty
()
&&
!
HasSpaces
(
word
))
words
.
Increment
(
word
);
if
(
!
lcword
.
empty
()
&&
!
HasSpaces
(
lcword
))
lcwords
.
Increment
(
lcword
);
if
(
!
token
.
tag
().
empty
())
tags
.
Increment
(
token
.
tag
());
if
(
!
token
.
category
().
empty
())
categories
.
Increment
(
token
.
category
());
if
(
!
token
.
label
().
empty
())
labels
.
Increment
(
token
.
label
());
// Add prefixes/suffixes for the current word.
prefixes
.
AddAffixesForWord
(
word
.
c_str
(),
word
.
size
());
suffixes
.
AddAffixesForWord
(
word
.
c_str
(),
word
.
size
());
// Add mapping from tag to category.
tag_to_category
.
SetCategory
(
token
.
tag
(),
token
.
category
());
// Update the number of processed tokens.
++
num_tokens
;
}
delete
document
;
++
num_documents
;
}
LOG
(
INFO
)
<<
"Term maps collected over "
<<
num_tokens
<<
" tokens from "
<<
num_documents
<<
" documents"
;
// Write mappings to disk.
words
.
Save
(
TaskContext
::
InputFile
(
*
task_context_
.
GetInput
(
"word-map"
)));
lcwords
.
Save
(
TaskContext
::
InputFile
(
*
task_context_
.
GetInput
(
"lcword-map"
)));
tags
.
Save
(
TaskContext
::
InputFile
(
*
task_context_
.
GetInput
(
"tag-map"
)));
categories
.
Save
(
TaskContext
::
InputFile
(
*
task_context_
.
GetInput
(
"category-map"
)));
labels
.
Save
(
TaskContext
::
InputFile
(
*
task_context_
.
GetInput
(
"label-map"
)));
// Write affixes to disk.
WriteAffixTable
(
prefixes
,
TaskContext
::
InputFile
(
*
task_context_
.
GetInput
(
"prefix-table"
)));
WriteAffixTable
(
suffixes
,
TaskContext
::
InputFile
(
*
task_context_
.
GetInput
(
"suffix-table"
)));
// Write tag-to-category mapping to disk.
tag_to_category
.
Save
(
TaskContext
::
InputFile
(
*
task_context_
.
GetInput
(
"tag-to-category"
)));
}
private:
// Returns true if the word contains spaces.
static
bool
HasSpaces
(
const
string
&
word
)
{
for
(
char
c
:
word
)
{
if
(
c
==
' '
)
return
true
;
}
return
false
;
}
// Writes an affix table to a task output.
static
void
WriteAffixTable
(
const
AffixTable
&
affixes
,
const
string
&
output_file
)
{
ProtoRecordWriter
writer
(
output_file
);
affixes
.
Write
(
&
writer
);
}
// Name of the context input to compute lexicons.
string
corpus_name_
;
// Max length for prefix table.
int
max_prefix_length_
;
// Max length for suffix table.
int
max_suffix_length_
;
// Task context used to configure this op.
TaskContext
task_context_
;
};
REGISTER_KERNEL_BUILDER
(
Name
(
"LexiconBuilder"
).
Device
(
DEVICE_CPU
),
LexiconBuilder
);
class
FeatureSize
:
public
OpKernel
{
public:
explicit
FeatureSize
(
OpKernelConstruction
*
context
)
:
OpKernel
(
context
)
{
string
task_context_path
;
OP_REQUIRES_OK
(
context
,
context
->
GetAttr
(
"task_context"
,
&
task_context_path
));
OP_REQUIRES_OK
(
context
,
context
->
GetAttr
(
"arg_prefix"
,
&
arg_prefix_
));
OP_REQUIRES_OK
(
context
,
context
->
MatchSignature
(
{},
{
DT_INT32
,
DT_INT32
,
DT_INT32
,
DT_INT32
}));
string
data
;
OP_REQUIRES_OK
(
context
,
ReadFileToString
(
tensorflow
::
Env
::
Default
(),
task_context_path
,
&
data
));
OP_REQUIRES
(
context
,
TextFormat
::
ParseFromString
(
data
,
task_context_
.
mutable_spec
()),
InvalidArgument
(
"Could not parse task context at "
,
task_context_path
));
string
label_map_path
=
TaskContext
::
InputFile
(
*
task_context_
.
GetInput
(
"label-map"
));
label_map_
=
SharedStoreUtils
::
GetWithDefaultName
<
TermFrequencyMap
>
(
label_map_path
,
0
,
0
);
}
~
FeatureSize
()
override
{
SharedStore
::
Release
(
label_map_
);
}
void
Compute
(
OpKernelContext
*
context
)
override
{
// Computes feature sizes.
ParserEmbeddingFeatureExtractor
features
(
arg_prefix_
);
features
.
Setup
(
&
task_context_
);
features
.
Init
(
&
task_context_
);
const
int
num_embeddings
=
features
.
NumEmbeddings
();
Tensor
*
feature_sizes
=
nullptr
;
Tensor
*
domain_sizes
=
nullptr
;
Tensor
*
embedding_dims
=
nullptr
;
Tensor
*
num_actions
=
nullptr
;
TF_CHECK_OK
(
context
->
allocate_output
(
0
,
TensorShape
({
num_embeddings
}),
&
feature_sizes
));
TF_CHECK_OK
(
context
->
allocate_output
(
1
,
TensorShape
({
num_embeddings
}),
&
domain_sizes
));
TF_CHECK_OK
(
context
->
allocate_output
(
2
,
TensorShape
({
num_embeddings
}),
&
embedding_dims
));
TF_CHECK_OK
(
context
->
allocate_output
(
3
,
TensorShape
({}),
&
num_actions
));
for
(
int
i
=
0
;
i
<
num_embeddings
;
++
i
)
{
feature_sizes
->
vec
<
int32
>
()(
i
)
=
features
.
FeatureSize
(
i
);
domain_sizes
->
vec
<
int32
>
()(
i
)
=
features
.
EmbeddingSize
(
i
);
embedding_dims
->
vec
<
int32
>
()(
i
)
=
features
.
EmbeddingDims
(
i
);
}
// Computes number of actions in the transition system.
std
::
unique_ptr
<
ParserTransitionSystem
>
transition_system
(
ParserTransitionSystem
::
Create
(
task_context_
.
Get
(
features
.
GetParamName
(
"transition_system"
),
"arc-standard"
)));
transition_system
->
Setup
(
&
task_context_
);
transition_system
->
Init
(
&
task_context_
);
num_actions
->
scalar
<
int32
>
()()
=
transition_system
->
NumActions
(
label_map_
->
Size
());
}
private:
// Task context used to configure this op.
TaskContext
task_context_
;
// Dependency label map used in transition system.
const
TermFrequencyMap
*
label_map_
;
// Prefix for context parameters.
string
arg_prefix_
;
};
REGISTER_KERNEL_BUILDER
(
Name
(
"FeatureSize"
).
Device
(
DEVICE_CPU
),
FeatureSize
);
}
// namespace syntaxnet
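For orientation, a minimal sketch of driving these two kernels from Python via the generated op wrappers; the wrapper keyword arguments are inferred from the attrs read above and from the test below, and 'context.pbtxt' plus the 'brain_parser' prefix are illustrative placeholders rather than verified values:

# Sketch only: wrapper signatures are assumed, not verified against gen_parser_ops.
import tensorflow as tf
import syntaxnet.load_parser_ops  # loads parser_ops.so so the kernels are registered
from syntaxnet.ops import gen_parser_ops

with tf.Session() as sess:
  # Build term maps and affix tables for the corpus named 'documents' in the context.
  gen_parser_ops.lexicon_builder(task_context='context.pbtxt',
                                 corpus_name='documents').run()
  # Read back per-embedding feature counts, domain sizes, dims, and the action count.
  feature_sizes, domain_sizes, embedding_dims, num_actions = sess.run(
      gen_parser_ops.feature_size(task_context='context.pbtxt',
                                  arg_prefix='brain_parser'))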
syntaxnet/syntaxnet/lexicon_builder_test.py
0 → 100644
View file @
32ab5a58
# coding=utf-8
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for lexicon_builder."""
# disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
import os.path

import tensorflow as tf

import syntaxnet.load_parser_ops

from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest
from tensorflow.python.platform import logging

from syntaxnet import sentence_pb2
from syntaxnet import task_spec_pb2
from syntaxnet.ops import gen_parser_ops

FLAGS = tf.app.flags.FLAGS

CONLL_DOC1 = u'''1 बात _ n NN _ _ _ _ _
2 गलत _ adj JJ _ _ _ _ _
3 हो _ v VM _ _ _ _ _
4 तो _ avy CC _ _ _ _ _
5 गुस्सा _ n NN _ _ _ _ _
6 सेलेब्रिटिज _ n NN _ _ _ _ _
7 को _ psp PSP _ _ _ _ _
8 भी _ avy RP _ _ _ _ _
9 आना _ v VM _ _ _ _ _
10 लाजमी _ adj JJ _ _ _ _ _
11 है _ v VM _ _ _ _ _
12 । _ punc SYM _ _ _ _ _'''

CONLL_DOC2 = u'''1 लेकिन _ avy CC _ _ _ _ _
2 अभिनेत्री _ n NN _ _ _ _ _
3 के _ psp PSP _ _ _ _ _
4 इस _ pn DEM _ _ _ _ _
5 कदम _ n NN _ _ _ _ _
6 से _ psp PSP _ _ _ _ _
7 वहां _ pn PRP _ _ _ _ _
8 रंग _ n NN _ _ _ _ _
9 में _ psp PSP _ _ _ _ _
10 भंग _ adj JJ _ _ _ _ _
11 पड़ _ v VM _ _ _ _ _
12 गया _ v VAUX _ _ _ _ _
13 । _ punc SYM _ _ _ _ _'''

TAGS = ['NN', 'JJ', 'VM', 'CC', 'PSP', 'RP', 'JJ', 'SYM', 'DEM', 'PRP', 'VAUX']

CATEGORIES = ['n', 'adj', 'v', 'avy', 'n', 'psp', 'punc', 'pn']

TOKENIZED_DOCS = u'''बात गलत हो तो गुस्सा सेलेब्रिटिज को भी आना लाजमी है ।
लेकिन अभिनेत्री के इस कदम से वहां रंग में भंग पड़ गया ।
'''

COMMENTS = u'# Line with fake comments.'


class LexiconBuilderTest(test_util.TensorFlowTestCase):

  def setUp(self):
    if not hasattr(FLAGS, 'test_srcdir'):
      FLAGS.test_srcdir = ''
    if not hasattr(FLAGS, 'test_tmpdir'):
      FLAGS.test_tmpdir = tf.test.get_temp_dir()
    self.corpus_file = os.path.join(FLAGS.test_tmpdir, 'documents.conll')
    self.context_file = os.path.join(FLAGS.test_tmpdir, 'context.pbtxt')

  def AddInput(self, name, file_pattern, record_format, context):
    inp = context.input.add()
    inp.name = name
    inp.record_format.append(record_format)
    inp.part.add().file_pattern = file_pattern

  def WriteContext(self, corpus_format):
    context = task_spec_pb2.TaskSpec()
    self.AddInput('documents', self.corpus_file, corpus_format, context)
    for name in ('word-map', 'lcword-map', 'tag-map', 'category-map',
                 'label-map', 'prefix-table', 'suffix-table',
                 'tag-to-category'):
      self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
    logging.info('Writing context to: %s', self.context_file)
    with open(self.context_file, 'w') as f:
      f.write(str(context))

  def ReadNextDocument(self, sess, doc_source):
    doc_str, last = sess.run(doc_source)
    if doc_str:
      doc = sentence_pb2.Sentence()
      doc.ParseFromString(doc_str[0])
    else:
      doc = None
    return doc, last

  def ValidateDocuments(self):
    doc_source = gen_parser_ops.document_source(self.context_file,
                                                batch_size=1)
    with self.test_session() as sess:
      logging.info('Reading document1')
      doc, last = self.ReadNextDocument(sess, doc_source)
      self.assertEqual(len(doc.token), 12)
      self.assertEqual(u'लाजमी', doc.token[9].word)
      self.assertFalse(last)
      logging.info('Reading document2')
      doc, last = self.ReadNextDocument(sess, doc_source)
      self.assertEqual(len(doc.token), 13)
      self.assertEqual(u'भंग', doc.token[9].word)
      self.assertFalse(last)
      logging.info('Hitting end of the dataset')
      doc, last = self.ReadNextDocument(sess, doc_source)
      self.assertTrue(doc is None)
      self.assertTrue(last)

  def ValidateTagToCategoryMap(self):
    with file(os.path.join(FLAGS.test_tmpdir, 'tag-to-category'), 'r') as f:
      entries = [line.strip().split('\t') for line in f.readlines()]
    for tag, category in entries:
      self.assertIn(tag, TAGS)
      self.assertIn(category, CATEGORIES)

  def BuildLexicon(self):
    with self.test_session():
      gen_parser_ops.lexicon_builder(task_context=self.context_file).run()

  def testCoNLLFormat(self):
    self.WriteContext('conll-sentence')
    logging.info('Writing conll file to: %s', self.corpus_file)
    with open(self.corpus_file, 'w') as f:
      f.write((CONLL_DOC1 + u'\n\n' + CONLL_DOC2 + u'\n')
              .replace(' ', '\t').encode('utf-8'))
    self.ValidateDocuments()
    self.BuildLexicon()
    self.ValidateTagToCategoryMap()

  def testCoNLLFormatExtraNewlinesAndComments(self):
    self.WriteContext('conll-sentence')
    with open(self.corpus_file, 'w') as f:
      f.write((u'\n\n\n' + CONLL_DOC1 + u'\n\n\n' + COMMENTS + u'\n\n' +
               CONLL_DOC2).replace(' ', '\t').encode('utf-8'))
    self.ValidateDocuments()
    self.BuildLexicon()
    self.ValidateTagToCategoryMap()

  def testTokenizedTextFormat(self):
    self.WriteContext('tokenized-text')
    with open(self.corpus_file, 'w') as f:
      f.write(TOKENIZED_DOCS.encode('utf-8'))
    self.ValidateDocuments()
    self.BuildLexicon()

  def testTokenizedTextFormatExtraNewlines(self):
    self.WriteContext('tokenized-text')
    with open(self.corpus_file, 'w') as f:
      f.write((u'\n\n\n' + TOKENIZED_DOCS + u'\n\n\n').encode('utf-8'))
    self.ValidateDocuments()
    self.BuildLexicon()


if __name__ == '__main__':
  googletest.main()
syntaxnet/syntaxnet/load_parser_ops.py
0 → 100644
View file @
32ab5a58
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Loads parser_ops shared library."""
import os.path
import tensorflow as tf

tf.load_op_library(os.path.join(tf.resource_loader.get_data_files_path(),
                                'parser_ops.so'))
syntaxnet/syntaxnet/models/parsey_mcparseface/context.pbtxt
0 → 100644
View file @
32ab5a58
Parameter {
name: "brain_parser_embedding_dims"
value: "32;32;64"
}
Parameter {
name: "brain_parser_embedding_names"
value: "labels;tags;words"
}
Parameter {
name: 'brain_parser_scoring'
value: 'default'
}
Parameter {
name: "brain_parser_features"
value:
'stack.child(1).label '
'stack.child(1).sibling(-1).label '
'stack.child(-1).label '
'stack.child(-1).sibling(1).label '
'stack.child(2).label '
'stack.child(-2).label '
'stack(1).child(1).label '
'stack(1).child(1).sibling(-1).label '
'stack(1).child(-1).label '
'stack(1).child(-1).sibling(1).label '
'stack(1).child(2).label '
'stack(1).child(-2).label; '
'input.token.tag '
'input(1).token.tag '
'input(2).token.tag '
'input(3).token.tag '
'stack.token.tag '
'stack.child(1).token.tag '
'stack.child(1).sibling(-1).token.tag '
'stack.child(-1).token.tag '
'stack.child(-1).sibling(1).token.tag '
'stack.child(2).token.tag '
'stack.child(-2).token.tag '
'stack(1).token.tag '
'stack(1).child(1).token.tag '
'stack(1).child(1).sibling(-1).token.tag '
'stack(1).child(-1).token.tag '
'stack(1).child(-1).sibling(1).token.tag '
'stack(1).child(2).token.tag '
'stack(1).child(-2).token.tag '
'stack(2).token.tag '
'stack(3).token.tag; '
'input.token.word '
'input(1).token.word '
'input(2).token.word '
'input(3).token.word '
'stack.token.word '
'stack.child(1).token.word '
'stack.child(1).sibling(-1).token.word '
'stack.child(-1).token.word '
'stack.child(-1).sibling(1).token.word '
'stack.child(2).token.word '
'stack.child(-2).token.word '
'stack(1).token.word '
'stack(1).child(1).token.word '
'stack(1).child(1).sibling(-1).token.word '
'stack(1).child(-1).token.word '
'stack(1).child(-1).sibling(1).token.word '
'stack(1).child(2).token.word '
'stack(1).child(-2).token.word '
'stack(2).token.word '
'stack(3).token.word '
}
Parameter {
name: "brain_parser_transition_system"
value: "arc-standard"
}
Parameter {
name: "brain_tagger_embedding_dims"
value: "8;16;16;16;16;64"
}
Parameter {
name: "brain_tagger_embedding_names"
value: "other;prefix2;prefix3;suffix2;suffix3;words"
}
Parameter {
name: "brain_tagger_features"
value:
'input.digit '
'input.hyphen; '
'input.prefix(length="2") '
'input(1).prefix(length="2") '
'input(2).prefix(length="2") '
'input(3).prefix(length="2") '
'input(-1).prefix(length="2") '
'input(-2).prefix(length="2") '
'input(-3).prefix(length="2") '
'input(-4).prefix(length="2"); '
'input.prefix(length="3") '
'input(1).prefix(length="3") '
'input(2).prefix(length="3") '
'input(3).prefix(length="3") '
'input(-1).prefix(length="3") '
'input(-2).prefix(length="3") '
'input(-3).prefix(length="3") '
'input(-4).prefix(length="3"); '
'input.suffix(length="2") '
'input(1).suffix(length="2") '
'input(2).suffix(length="2") '
'input(3).suffix(length="2") '
'input(-1).suffix(length="2") '
'input(-2).suffix(length="2") '
'input(-3).suffix(length="2") '
'input(-4).suffix(length="2"); '
'input.suffix(length="3") '
'input(1).suffix(length="3") '
'input(2).suffix(length="3") '
'input(3).suffix(length="3") '
'input(-1).suffix(length="3") '
'input(-2).suffix(length="3") '
'input(-3).suffix(length="3") '
'input(-4).suffix(length="3"); '
'input.token.word '
'input(1).token.word '
'input(2).token.word '
'input(3).token.word '
'input(-1).token.word '
'input(-2).token.word '
'input(-3).token.word '
'input(-4).token.word '
}
Parameter {
name: "brain_tagger_transition_system"
value: "tagger"
}
input {
name: "tag-map"
Part {
file_pattern: "syntaxnet/models/parsey_mcparseface/tag-map"
}
}
input {
name: "tag-to-category"
Part {
file_pattern: "syntaxnet/models/parsey_mcparseface/fine-to-universal.map"
}
}
input {
name: "word-map"
Part {
file_pattern: "syntaxnet/models/parsey_mcparseface/word-map"
}
}
input {
name: "label-map"
Part {
file_pattern: "syntaxnet/models/parsey_mcparseface/label-map"
}
}
input {
name: "prefix-table"
Part {
file_pattern: "syntaxnet/models/parsey_mcparseface/prefix-table"
}
}
input {
name: "suffix-table"
Part {
file_pattern: "syntaxnet/models/parsey_mcparseface/suffix-table"
}
}
input {
name: 'stdin'
record_format: 'english-text'
Part {
file_pattern: '-'
}
}
input {
name: 'stdin-conll'
record_format: 'conll-sentence'
Part {
file_pattern: '-'
}
}
input {
name: 'stdout-conll'
record_format: 'conll-sentence'
Part {
file_pattern: '-'
}
}
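A small sketch of loading and inspecting this context from Python, mirroring the TaskSpec handling in lexicon_builder_test.py above; the path is a placeholder, and the `parameter`/`part` field names are inferred from the text format shown here together with the test's use of `context.input`:

# Sketch only: path and field-name assumptions as noted above.
from google.protobuf import text_format
from syntaxnet import task_spec_pb2

spec = task_spec_pb2.TaskSpec()
with open('syntaxnet/models/parsey_mcparseface/context.pbtxt') as f:
  text_format.Merge(f.read(), spec)

# Parameters are plain name/value pairs, e.g. the transition system to use.
params = {p.name: p.value for p in spec.parameter}
print(params['brain_parser_transition_system'])  # 'arc-standard'

# Inputs map logical resource names to file patterns.
inputs = {inp.name: inp.part[0].file_pattern for inp in spec.input}
print(inputs['tag-map'])  # 'syntaxnet/models/parsey_mcparseface/tag-map'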
syntaxnet/syntaxnet/models/parsey_mcparseface/fine-to-universal.map
0 → 100644
View file @
32ab5a58
# .
$ .
'' .
-LRB- .
-RRB- .
, .
. .
: .
ADD X
AFX PRT
CC CONJ
CD NUM
DT DET
EX DET
FW X
GW X
HYPH .
IN ADP
JJ ADJ
JJR ADJ
JJS ADJ
LS X
MD VERB
NFP .
NN NOUN
NNP NOUN
NNPS NOUN
NNS NOUN
PDT DET
POS PRT
PRP PRON
PRP$ PRON
RB ADV
RBR ADV
RBS ADV
RP PRT
SYM X
TO PRT
UH X
VB VERB
VBD VERB
VBG VERB
VBN VERB
VBP VERB
VBZ VERB
WDT DET
WP PRON
WP$ PRON
WRB ADV
`` .
X X
XX X
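For illustration, a tiny sketch of reading this fine-to-universal mapping into a dict; the whitespace delimiter and the path are assumptions, and every row above has exactly two fields, a fine-grained tag and its universal category:

# Sketch only: delimiter and path are illustrative assumptions.
fine_to_universal = {}
with open('syntaxnet/models/parsey_mcparseface/fine-to-universal.map') as f:
  for line in f:
    fields = line.split()
    if len(fields) == 2:
      fine_tag, universal_category = fields
      fine_to_universal[fine_tag] = universal_category

print(fine_to_universal['NNS'])  # 'NOUN'
print(fine_to_universal['MD'])   # 'VERB'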