Commit 32ab5a58 authored by calberti, committed by Martin Wicke

Adding SyntaxNet to tensorflow/models (#63)

parent 148a15fb
// Protocol buffers for serializing string<=>index dictionaries.
syntax = "proto2";
package syntaxnet;
// Serializable representation of a string=>string pair.
message StringToStringPair {
// String representing the key.
required string key = 1;
// String representing the value.
required string value = 2;
}
// Serializable representation of a string=>string mapping.
message StringToStringMap {
// Key=>value pairs.
repeated StringToStringPair pair = 1;
}
// Affix table entry, for serialization of the affix tables.
message AffixTableEntry {
// Nested message for serializing a single affix.
message AffixEntry {
// The affix as a string.
required string form = 1;
// The length of the affix (this is non-trivial to compute due to UTF-8).
required int32 length = 2;
// The ID of the affix that is one character shorter, or -1 if none exists.
required int32 shorter_id = 3;
}
// The type of affix table, as a string.
required string type = 1;
// The maximum affix length.
required int32 max_length = 2;
// The list of affixes, in order of affix ID.
repeated AffixEntry affix = 3;
}
// A light-weight proto to store vectors in binary format.
message TokenEmbedding {
required bytes token = 1; // Can be a word, phrase, URL, etc.
// If available, raw count of this token in the training corpus.
optional int64 count = 3;
message Vector {
repeated float values = 1 [packed = true];
}
optional Vector vector = 2;
};
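For reference, a minimal C++ sketch of populating and serializing a TokenEmbedding as defined above; the token, count, and vector values are made up, and the name of the generated header is assumed.

#include <string>
// Assumed name of the header protoc generates for the messages above.
#include "syntaxnet/dictionary.pb.h"

std::string MakeTokenEmbedding() {
  syntaxnet::TokenEmbedding embedding;
  embedding.set_token("hello");  // hypothetical token
  embedding.set_count(42);       // raw corpus count, if available
  for (float value : {0.1f, -0.2f, 0.3f}) {
    embedding.mutable_vector()->add_values(value);  // packed float vector
  }
  return embedding.SerializeAsString();
}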
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Various utilities for handling documents.
#include <stddef.h>
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "syntaxnet/base.h"
#include "syntaxnet/feature_extractor.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/utils.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/lib/core/status.h"
using tensorflow::DEVICE_CPU;
using tensorflow::OpKernel;
using tensorflow::OpKernelConstruction;
using tensorflow::OpKernelContext;
using tensorflow::Tensor;
using tensorflow::TensorShape;
using tensorflow::errors::InvalidArgument;
namespace syntaxnet {
namespace {
void GetTaskContext(OpKernelConstruction *context, TaskContext *task_context) {
string file_path, data;
OP_REQUIRES_OK(context, context->GetAttr("task_context", &file_path));
OP_REQUIRES_OK(
context, ReadFileToString(tensorflow::Env::Default(), file_path, &data));
OP_REQUIRES(context,
TextFormat::ParseFromString(data, task_context->mutable_spec()),
InvalidArgument("Could not parse task context at ", file_path));
}
// Outputs the given batch of sentences as a tensor and deletes them.
void OutputDocuments(OpKernelContext *context,
vector<Sentence *> *document_batch) {
const int64 size = document_batch->size();
Tensor *output;
OP_REQUIRES_OK(context,
context->allocate_output(0, TensorShape({size}), &output));
for (int64 i = 0; i < size; ++i) {
output->vec<string>()(i) = (*document_batch)[i]->SerializeAsString();
}
utils::STLDeleteElements(document_batch);
}
} // namespace
class DocumentSource : public OpKernel {
public:
explicit DocumentSource(OpKernelConstruction *context) : OpKernel(context) {
GetTaskContext(context, &task_context_);
string corpus_name;
OP_REQUIRES_OK(context, context->GetAttr("corpus_name", &corpus_name));
OP_REQUIRES_OK(context, context->GetAttr("batch_size", &batch_size_));
OP_REQUIRES(context, batch_size_ > 0,
InvalidArgument("invalid batch_size provided"));
corpus_.reset(new TextReader(*task_context_.GetInput(corpus_name)));
}
void Compute(OpKernelContext *context) override {
mutex_lock lock(mu_);
Sentence *document;
vector<Sentence *> document_batch;
while ((document = corpus_->Read()) != NULL) {
document_batch.push_back(document);
if (static_cast<int>(document_batch.size()) == batch_size_) {
OutputDocuments(context, &document_batch);
OutputLast(context, false);
return;
}
}
OutputDocuments(context, &document_batch);
OutputLast(context, true);
}
private:
void OutputLast(OpKernelContext *context, bool last) {
Tensor *output;
OP_REQUIRES_OK(context,
context->allocate_output(1, TensorShape({}), &output));
output->scalar<bool>()() = last;
}
// Task context used to configure this op.
TaskContext task_context_;
// mutex to synchronize access to Compute.
mutex mu_;
std::unique_ptr<TextReader> corpus_;
string documents_path_;
int batch_size_;
};
REGISTER_KERNEL_BUILDER(Name("DocumentSource").Device(DEVICE_CPU),
DocumentSource);
class DocumentSink : public OpKernel {
public:
explicit DocumentSink(OpKernelConstruction *context) : OpKernel(context) {
GetTaskContext(context, &task_context_);
string corpus_name;
OP_REQUIRES_OK(context, context->GetAttr("corpus_name", &corpus_name));
writer_.reset(new TextWriter(*task_context_.GetInput(corpus_name)));
}
void Compute(OpKernelContext *context) override {
mutex_lock lock(mu_);
auto documents = context->input(0).vec<string>();
for (int i = 0; i < documents.size(); ++i) {
Sentence document;
OP_REQUIRES(context, document.ParseFromString(documents(i)),
InvalidArgument("failed to parse sentence"));
writer_->Write(document);
}
}
private:
// Task context used to configure this op.
TaskContext task_context_;
// mutex to synchronize access to Compute.
mutex mu_;
string documents_path_;
std::unique_ptr<TextWriter> writer_;
};
REGISTER_KERNEL_BUILDER(Name("DocumentSink").Device(DEVICE_CPU),
DocumentSink);
// Sentence filter for filtering out documents where the parse trees are not
// well-formed, i.e. they contain cycles.
class WellFormedFilter : public OpKernel {
public:
explicit WellFormedFilter(OpKernelConstruction *context) : OpKernel(context) {
GetTaskContext(context, &task_context_);
OP_REQUIRES_OK(context, context->GetAttr("keep_malformed_documents",
&keep_malformed_));
}
void Compute(OpKernelContext *context) override {
auto documents = context->input(0).vec<string>();
vector<Sentence *> output_documents;
for (int i = 0; i < documents.size(); ++i) {
Sentence *document = new Sentence;
OP_REQUIRES(context, document->ParseFromString(documents(i)),
InvalidArgument("failed to parse sentence"));
if (ShouldKeep(*document)) {
output_documents.push_back(document);
} else {
delete document;
}
}
OutputDocuments(context, &output_documents);
}
private:
bool ShouldKeep(const Sentence &doc) {
vector<int> visited(doc.token_size(), -1);
for (int i = 0; i < doc.token_size(); ++i) {
// Already visited node.
if (visited[i] != -1) continue;
int t = i;
while (t != -1) {
if (visited[t] == -1) {
// If it is not visited yet, mark it.
visited[t] = i;
} else if (visited[t] < i) {
// A value smaller than i (and not -1) means this token was already cleared
// on an earlier pass, so the chain from here is known to be loop-free.
break;
} else {
// Loop detected.
LOG(ERROR) << "Loop detected in document " << doc.DebugString();
return keep_malformed_;
}
t = doc.token(t).head();
}
}
return true;
}
private:
// Task context used to configure this op.
TaskContext task_context_;
bool keep_malformed_;
};
REGISTER_KERNEL_BUILDER(Name("WellFormedFilter").Device(DEVICE_CPU),
WellFormedFilter);
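To make the head-chain walk in ShouldKeep() above concrete, here is a standalone sketch of the same marking scheme over a plain head array; the trees in main() are made up, and -1 marks a root.

#include <iostream>
#include <vector>

// Returns true if following head pointers never revisits a token marked in
// the current pass, i.e. the head assignments contain no cycle.
bool IsLoopFree(const std::vector<int> &heads) {
  std::vector<int> visited(heads.size(), -1);
  for (int i = 0; i < static_cast<int>(heads.size()); ++i) {
    int t = i;
    while (t != -1) {
      if (visited[t] == -1) {
        visited[t] = i;  // first time the current pass reaches t
      } else if (visited[t] < i) {
        break;           // cleared on an earlier pass; chain is loop-free
      } else {
        return false;    // reached twice in the same pass: a cycle
      }
      t = heads[t];
    }
  }
  return true;
}

int main() {
  std::cout << IsLoopFree({-1, 0, 1}) << "\n";  // 1: a well-formed chain
  std::cout << IsLoopFree({1, 2, 0}) << "\n";   // 0: 0 -> 1 -> 2 -> 0 cycles
  return 0;
}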
// Sentence filter that modifies dependency trees to make them projective. This
// could be made more efficient by looping over sentences instead of the entire
// document. Assumes that the document is well-formed in the sense of having
// no looping dependencies.
//
// Task arguments:
// bool discard_non_projective (false) : If true, discards documents with
// non-projective trees instead of projectivizing them.
class ProjectivizeFilter : public OpKernel {
public:
explicit ProjectivizeFilter(OpKernelConstruction *context)
: OpKernel(context) {
GetTaskContext(context, &task_context_);
OP_REQUIRES_OK(context, context->GetAttr("discard_non_projective",
&discard_non_projective_));
}
void Compute(OpKernelContext *context) override {
auto documents = context->input(0).vec<string>();
vector<Sentence *> output_documents;
for (int i = 0; i < documents.size(); ++i) {
Sentence *document = new Sentence;
OP_REQUIRES(context, document->ParseFromString(documents(i)),
InvalidArgument("failed to parse sentence"));
if (Process(document)) {
output_documents.push_back(document);
} else {
delete document;
}
}
OutputDocuments(context, &output_documents);
}
bool Process(Sentence *doc) {
const int num_tokens = doc->token_size();
// Left and right boundaries for arcs. The left and right ends of an arc are
// bounded by the arcs that pass over it. If an arc exceeds these bounds it
// will cross an arc passing over it, making it a non-projective arc.
vector<int> left(num_tokens);
vector<int> right(num_tokens);
// Lift the shortest non-projective arc until the document is projective.
while (true) {
// Initialize boundaries to the whole document for all arcs.
for (int i = 0; i < num_tokens; ++i) {
left[i] = -1;
right[i] = num_tokens - 1;
}
// Find left and right bounds for each token.
for (int i = 0; i < num_tokens; ++i) {
int head_index = doc->token(i).head();
// Find left and right end of arc.
int l = std::min(i, head_index);
int r = std::max(i, head_index);
// Bound all tokens under the arc.
for (int j = l + 1; j < r; ++j) {
if (left[j] < l) left[j] = l;
if (right[j] > r) right[j] = r;
}
}
// Find deepest non-projective arc.
int deepest_arc = -1;
int max_depth = -1;
// The non-projective arcs are those that exceed their bounds.
for (int i = 0; i < num_tokens; ++i) {
int head_index = doc->token(i).head();
if (head_index == -1) continue; // any crossing arc must be deeper
int l = std::min(i, head_index);
int r = std::max(i, head_index);
int left_bound = std::max(left[l], left[r]);
int right_bound = std::min(right[l], right[r]);
if (l < left_bound || r > right_bound) {
// Found non-projective arc.
if (discard_non_projective_) return false;
// Pick the deepest as the best candidate for lifting.
int depth = 0;
int j = i;
while (j != -1) {
++depth;
j = doc->token(j).head();
}
if (depth > max_depth) {
deepest_arc = i;
max_depth = depth;
}
}
}
// If there are no more non-projective arcs we are done.
if (deepest_arc == -1) return true;
// Lift non-projective arc.
int lifted_head = doc->token(doc->token(deepest_arc).head()).head();
doc->mutable_token(deepest_arc)->set_head(lifted_head);
}
}
private:
// Task context used to configure this op.
TaskContext task_context_;
// Whether or not to throw away non-projective documents.
bool discard_non_projective_;
};
REGISTER_KERNEL_BUILDER(Name("ProjectivizeFilter").Device(DEVICE_CPU),
ProjectivizeFilter);
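As a worked example of the lifting step in Process() above (the four-token tree is made up): with heads {2, 3, -1, 2}, the arc 1->3 crosses the arc 0->2, and the deeper of the two (token 1, depth 3) is lifted to its grandparent, giving the projective tree {2, 2, -1, 2}. The sketch below verifies the before/after states with a simplified pairwise crossing test rather than the bound-based test used above.

#include <algorithm>
#include <iostream>
#include <vector>

// Simplified O(n^2) pairwise crossing test; equivalent in outcome to the
// bound-based non-projectivity test used by ProjectivizeFilter::Process().
bool IsProjective(const std::vector<int> &heads) {
  const int n = static_cast<int>(heads.size());
  for (int i = 0; i < n; ++i) {
    if (heads[i] == -1) continue;
    const int l1 = std::min(i, heads[i]), r1 = std::max(i, heads[i]);
    for (int j = 0; j < n; ++j) {
      if (heads[j] == -1) continue;
      const int l2 = std::min(j, heads[j]), r2 = std::max(j, heads[j]);
      // Two arcs cross if exactly one endpoint of one lies strictly inside
      // the span of the other.
      if ((l1 < l2 && l2 < r1 && r1 < r2) || (l2 < l1 && l1 < r2 && r2 < r1)) {
        return false;
      }
    }
  }
  return true;
}

int main() {
  std::vector<int> heads = {2, 3, -1, 2};    // token 1 hangs off token 3
  std::cout << IsProjective(heads) << "\n";  // 0: arc 1->3 crosses arc 0->2
  heads[1] = heads[heads[1]];                // lift token 1 to its grandparent (2)
  std::cout << IsProjective(heads) << "\n";  // 1: now projective
  return 0;
}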
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/document_format.h"
namespace syntaxnet {
// Component registry for document formatters.
REGISTER_CLASS_REGISTRY("document format", DocumentFormat);
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// An interface for document formats.
#ifndef SYNTAXNET_DOCUMENT_FORMAT_H__
#define SYNTAXNET_DOCUMENT_FORMAT_H__
#include <string>
#include <vector>
#include "syntaxnet/utils.h"
#include "syntaxnet/registry.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/task_context.h"
#include "tensorflow/core/lib/io/inputbuffer.h"
namespace syntaxnet {
// A document format component converts a key/value pair from a record to one or
// more documents. The record format is used for selecting the document format
// component. A document format component can be registered with the
// REGISTER_DOCUMENT_FORMAT macro.
class DocumentFormat : public RegisterableClass<DocumentFormat> {
public:
DocumentFormat() {}
virtual ~DocumentFormat() {}
// Reads a record from the given input buffer with format specific logic.
// Returns false if no record could be read because we reached end of file.
virtual bool ReadRecord(tensorflow::io::InputBuffer *buffer,
string *record) = 0;
// Converts a key/value pair to one or more documents.
virtual void ConvertFromString(const string &key, const string &value,
vector<Sentence *> *documents) = 0;
// Converts a document to a key/value pair.
virtual void ConvertToString(const Sentence &document,
string *key, string *value) = 0;
private:
TF_DISALLOW_COPY_AND_ASSIGN(DocumentFormat);
};
#define REGISTER_DOCUMENT_FORMAT(type, component) \
REGISTER_CLASS_COMPONENT(DocumentFormat, type, component)
} // namespace syntaxnet
#endif // SYNTAXNET_DOCUMENT_FORMAT_H__
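For orientation, a hedged sketch of what a concrete format registered against this interface could look like. The "one raw sentence per line" format below is hypothetical and not part of this commit; it assumes the Sentence proto exposes docid/text accessors and that tensorflow::io::InputBuffer::ReadLine() is available.

#include <string>
#include <vector>
#include "syntaxnet/document_format.h"
#include "syntaxnet/sentence.pb.h"

namespace syntaxnet {

class PlainTextFormat : public DocumentFormat {
 public:
  // One record per input line.
  bool ReadRecord(tensorflow::io::InputBuffer *buffer,
                  string *record) override {
    return buffer->ReadLine(record).ok();
  }

  // Uses the key as the document id and the value as the raw sentence text.
  void ConvertFromString(const string &key, const string &value,
                         vector<Sentence *> *documents) override {
    Sentence *sentence = new Sentence;
    sentence->set_docid(key);
    sentence->set_text(value);
    documents->push_back(sentence);
  }

  void ConvertToString(const Sentence &document,
                       string *key, string *value) override {
    *key = document.docid();
    *value = document.text();
  }
};

REGISTER_DOCUMENT_FORMAT("plain-text", PlainTextFormat);

}  // namespace syntaxnet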
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/embedding_feature_extractor.h"
#include <vector>
#include "syntaxnet/feature_extractor.h"
#include "syntaxnet/parser_features.h"
#include "syntaxnet/task_context.h"
#include "syntaxnet/utils.h"
namespace syntaxnet {
void GenericEmbeddingFeatureExtractor::Setup(TaskContext *context) {
// Don't use version to determine how to get feature FML.
const string features = context->Get(
tensorflow::strings::StrCat(ArgPrefix(), "_", "features"), "");
const string embedding_names =
context->Get(GetParamName("embedding_names"), "");
const string embedding_dims =
context->Get(GetParamName("embedding_dims"), "");
LOG(INFO) << "Features: " << features;
LOG(INFO) << "Embedding names: " << embedding_names;
LOG(INFO) << "Embedding dims: " << embedding_dims;
embedding_fml_ = utils::Split(features, ';');
add_strings_ = context->Get(GetParamName("add_varlen_strings"), false);
embedding_names_ = utils::Split(embedding_names, ';');
for (const string &dim : utils::Split(embedding_dims, ';')) {
embedding_dims_.push_back(utils::ParseUsing<int>(dim, utils::ParseInt32));
}
}
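For concreteness, a sketch of how the three semicolon-separated parameters line up after Setup(). The values are hypothetical (not taken from any released task context), and the snippet only assumes utils::Split() as used above.

#include <iostream>
#include <string>
#include <vector>
#include "syntaxnet/utils.h"

// Hypothetical values a task context might supply for the prefix "parser":
//   parser_features        = "input.word;input.tag"
//   parser_embedding_names = "words;tags"
//   parser_embedding_dims  = "64;32"
// Setup() splits each string on ';', so index i of every list describes the
// same embedding space: FML group i is embedded under name i with dimension i.
int main() {
  const std::vector<std::string> fml =
      syntaxnet::utils::Split("input.word;input.tag", ';');
  const std::vector<std::string> dims = syntaxnet::utils::Split("64;32", ';');
  std::cout << fml[0] << " -> " << dims[0] << " dims\n";  // input.word -> 64
  std::cout << fml[1] << " -> " << dims[1] << " dims\n";  // input.tag  -> 32
  return 0;
}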
void GenericEmbeddingFeatureExtractor::Init(TaskContext *context) {
}
vector<vector<SparseFeatures>> GenericEmbeddingFeatureExtractor::ConvertExample(
const vector<FeatureVector> &feature_vectors) const {
// Extract the features.
vector<vector<SparseFeatures>> sparse_features(feature_vectors.size());
for (size_t i = 0; i < feature_vectors.size(); ++i) {
// Convert each FeatureVector to the DistBelief-style SparseFeatures format.
sparse_features[i] =
vector<SparseFeatures>(generic_feature_extractor(i).feature_types());
for (int j = 0; j < feature_vectors[i].size(); ++j) {
const FeatureType &feature_type = *feature_vectors[i].type(j);
const FeatureValue value = feature_vectors[i].value(j);
const bool is_continuous = feature_type.name().find("continuous") == 0;
const int64 id = is_continuous ? FloatFeatureValue(value).id : value;
const int base = feature_type.base();
if (id >= 0) {
sparse_features[i][base].add_id(id);
if (is_continuous) {
sparse_features[i][base].add_weight(FloatFeatureValue(value).weight);
}
if (add_strings_) {
sparse_features[i][base].add_description(tensorflow::strings::StrCat(
feature_type.name(), "=", feature_type.GetFeatureValueName(id)));
}
}
}
}
return sparse_features;
}
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef SYNTAXNET_EMBEDDING_FEATURE_EXTRACTOR_H_
#define SYNTAXNET_EMBEDDING_FEATURE_EXTRACTOR_H_
#include <functional>
#include <memory>
#include <string>
#include <vector>
#include "syntaxnet/utils.h"
#include "syntaxnet/feature_extractor.h"
#include "syntaxnet/feature_types.h"
#include "syntaxnet/parser_features.h"
#include "syntaxnet/sentence_features.h"
#include "syntaxnet/sparse.pb.h"
#include "syntaxnet/task_context.h"
#include "syntaxnet/workspace.h"
#include "tensorflow/core/lib/strings/strcat.h"
namespace syntaxnet {
// An EmbeddingFeatureExtractor manages the extraction of features for
// embedding-based models. It wraps a sequence of underlying classes of feature
// extractors, along with associated predicate maps. Each class of feature
// extractors is associated with a name, e.g., "words", "labels", "tags".
//
// The class is split between a generic abstract version,
// GenericEmbeddingFeatureExtractor (that can be initialized without knowing the
// signature of the ExtractFeatures method) and a typed version.
//
// The predicate maps must be initialized before use: they can be loaded using
// Read() or updated via UpdateMapsForExample.
class GenericEmbeddingFeatureExtractor {
public:
virtual ~GenericEmbeddingFeatureExtractor() {}
// Get the prefix string to put in front of all arguments, so they don't
// conflict with other embedding models.
virtual const string ArgPrefix() const = 0;
// Sets up predicate maps and embedding space names that are common for all
// embedding based feature extractors.
virtual void Setup(TaskContext *context);
virtual void Init(TaskContext *context);
// Requests workspace for the underlying feature extractors. This is
// implemented in the typed class.
virtual void RequestWorkspaces(WorkspaceRegistry *registry) = 0;
// Number of predicates for the embedding at a given index (vocabulary size).
int EmbeddingSize(int index) const {
return generic_feature_extractor(index).GetDomainSize();
}
// Returns number of embedding spaces.
int NumEmbeddings() const { return embedding_dims_.size(); }
// Returns the number of features in the embedding space.
const int FeatureSize(int idx) const {
return generic_feature_extractor(idx).feature_types();
}
// Returns the dimensionality of the embedding space.
int EmbeddingDims(int index) const { return embedding_dims_[index]; }
// Accessor for embedding dims (dimensions of the embedding spaces).
const vector<int> &embedding_dims() const { return embedding_dims_; }
const vector<string> &embedding_fml() const { return embedding_fml_; }
// Get parameter name by concatenating the prefix and the original name.
string GetParamName(const string &param_name) const {
return tensorflow::strings::StrCat(ArgPrefix(), "_", param_name);
}
protected:
// Provides the generic class with access to the templated extractors. This is
// used to get the type information out of the feature extractor without
// knowing the specific calling arguments of the extractor itself.
virtual const GenericFeatureExtractor &generic_feature_extractor(
int idx) const = 0;
// Converts a vector of extracted features into
// dist_belief::SparseFeatures. Each feature in each feature vector becomes a
// single SparseFeatures. The predicates are mapped through map_fn which
// should point to either mutable_map_fn or const_map_fn depending on whether
// or not the predicate maps should be updated.
vector<vector<SparseFeatures>> ConvertExample(
const vector<FeatureVector> &feature_vectors) const;
private:
// Embedding space names for parameter sharing.
vector<string> embedding_names_;
// FML strings for each feature extractor.
vector<string> embedding_fml_;
// Size of each of the embedding spaces (maximum predicate id).
vector<int> embedding_sizes_;
// Embedding dimensions of the embedding spaces (i.e. 32, 64 etc.)
vector<int> embedding_dims_;
// Whether or not to add string descriptions to converted examples.
bool add_strings_;
};
// Templated, object-specific implementation of the
// EmbeddingFeatureExtractor. EXTRACTOR should be a FeatureExtractor<OBJ,
// ARGS...> class that has the appropriate FeatureTraits() to ensure that
// locator type features work.
//
// Note: for backwards compatibility purposes, this always reads the FML spec
// from "<prefix>_features".
template <class EXTRACTOR, class OBJ, class... ARGS>
class EmbeddingFeatureExtractor : public GenericEmbeddingFeatureExtractor {
public:
// Sets up all predicate maps, feature extractors, and flags.
void Setup(TaskContext *context) override {
GenericEmbeddingFeatureExtractor::Setup(context);
feature_extractors_.resize(embedding_fml().size());
for (int i = 0; i < embedding_fml().size(); ++i) {
feature_extractors_[i].Parse(embedding_fml()[i]);
feature_extractors_[i].Setup(context);
}
}
// Initializes resources needed by the feature extractors.
void Init(TaskContext *context) override {
GenericEmbeddingFeatureExtractor::Init(context);
for (auto &feature_extractor : feature_extractors_) {
feature_extractor.Init(context);
}
}
// Requests workspaces from the registry. Must be called after Init(), and
// before Preprocess().
void RequestWorkspaces(WorkspaceRegistry *registry) override {
for (auto &feature_extractor : feature_extractors_) {
feature_extractor.RequestWorkspaces(registry);
}
}
// Must be called once on each object (e.g., once per parser state for a
// sentence), before any feature extraction (e.g., UpdateMapsForExample,
// ExtractSparseFeatures).
void Preprocess(WorkspaceSet *workspaces, OBJ *obj) const {
for (auto &feature_extractor : feature_extractors_) {
feature_extractor.Preprocess(workspaces, obj);
}
}
// Returns a ragged array of SparseFeatures, for 1) each feature extractor
// class e, and 2) each feature f extracted by e. Underlying predicate maps
// will not be updated and so unrecognized predicates may occur. In such a
// case the SparseFeatures object associated with a given extractor class and
// feature will be empty.
vector<vector<SparseFeatures>> ExtractSparseFeatures(
const WorkspaceSet &workspaces, const OBJ &obj, ARGS... args) const {
vector<FeatureVector> features(feature_extractors_.size());
ExtractFeatures(workspaces, obj, args..., &features);
return ConvertExample(features);
}
// Extracts features using the extractors. Note that features must already
// be initialized to the correct number of feature extractors. No predicate
// mapping is applied.
void ExtractFeatures(const WorkspaceSet &workspaces, const OBJ &obj,
ARGS... args,
vector<FeatureVector> *features) const {
DCHECK(features != nullptr);
DCHECK_EQ(features->size(), feature_extractors_.size());
for (int i = 0; i < feature_extractors_.size(); ++i) {
(*features)[i].clear();
feature_extractors_[i].ExtractFeatures(workspaces, obj, args...,
&(*features)[i]);
}
}
protected:
// Provides generic access to the feature extractors.
const GenericFeatureExtractor &generic_feature_extractor(
int idx) const override {
DCHECK_LT(idx, feature_extractors_.size());
DCHECK_GE(idx, 0);
return feature_extractors_[idx];
}
private:
// Templated feature extractor class.
vector<EXTRACTOR> feature_extractors_;
};
class ParserEmbeddingFeatureExtractor
: public EmbeddingFeatureExtractor<ParserFeatureExtractor, ParserState> {
public:
explicit ParserEmbeddingFeatureExtractor(const string &arg_prefix)
: arg_prefix_(arg_prefix) {}
private:
const string ArgPrefix() const override { return arg_prefix_; }
// Prefix for context parameters.
string arg_prefix_;
};
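A hedged sketch of the intended call order for the parser extractor declared above, given a populated TaskContext and a ParserState; the "parser" prefix and the WorkspaceSet::Reset() call reflect assumptions about the surrounding APIs rather than code in this commit.

void ExtractForState(TaskContext *task_context, ParserState *state) {
  // "parser" is an illustrative prefix; the extractor reads the
  // parser_features / parser_embedding_names / parser_embedding_dims params.
  ParserEmbeddingFeatureExtractor features("parser");
  features.Setup(task_context);
  features.Init(task_context);

  WorkspaceRegistry registry;
  features.RequestWorkspaces(&registry);    // after Init(), before Preprocess()

  WorkspaceSet workspaces;
  workspaces.Reset(registry);               // assumed WorkspaceSet API
  features.Preprocess(&workspaces, state);  // once per state

  vector<vector<SparseFeatures>> sparse =
      features.ExtractSparseFeatures(workspaces, *state);
  LOG(INFO) << "Extracted " << sparse.size() << " feature groups";
}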
} // namespace syntaxnet
#endif // SYNTAXNET_EMBEDDING_FEATURE_EXTRACTOR_H_
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/feature_extractor.h"
#include "syntaxnet/feature_types.h"
#include "syntaxnet/fml_parser.h"
namespace syntaxnet {
constexpr FeatureValue GenericFeatureFunction::kNone;
GenericFeatureExtractor::GenericFeatureExtractor() {}
GenericFeatureExtractor::~GenericFeatureExtractor() {}
void GenericFeatureExtractor::Parse(const string &source) {
// Parse feature specification into descriptor.
FMLParser parser;
parser.Parse(source, mutable_descriptor());
// Initialize feature extractor from descriptor.
InitializeFeatureFunctions();
}
void GenericFeatureExtractor::InitializeFeatureTypes() {
// Register all feature types.
GetFeatureTypes(&feature_types_);
for (size_t i = 0; i < feature_types_.size(); ++i) {
FeatureType *ft = feature_types_[i];
ft->set_base(i);
// Check for feature space overflow.
double domain_size = ft->GetDomainSize();
if (domain_size < 0) {
LOG(FATAL) << "Illegal domain size for feature " << ft->name()
<< domain_size;
}
}
vector<string> types_names;
GetFeatureTypeNames(&types_names);
CHECK_EQ(feature_types_.size(), types_names.size());
}
void GenericFeatureExtractor::GetFeatureTypeNames(
vector<string> *type_names) const {
for (size_t i = 0; i < feature_types_.size(); ++i) {
FeatureType *ft = feature_types_[i];
type_names->push_back(ft->name());
}
}
FeatureValue GenericFeatureExtractor::GetDomainSize() const {
// Domain size of the set of features is equal to:
// [largest domain size of any feature types] * [number of feature types]
FeatureValue max_feature_type_dsize = 0;
for (size_t i = 0; i < feature_types_.size(); ++i) {
FeatureType *ft = feature_types_[i];
const FeatureValue feature_type_dsize = ft->GetDomainSize();
if (feature_type_dsize > max_feature_type_dsize) {
max_feature_type_dsize = feature_type_dsize;
}
}
return max_feature_type_dsize;
}
string GenericFeatureFunction::GetParameter(const string &name) const {
// Find named parameter in feature descriptor.
for (int i = 0; i < descriptor_->parameter_size(); ++i) {
if (name == descriptor_->parameter(i).name()) {
return descriptor_->parameter(i).value();
}
}
return "";
}
GenericFeatureFunction::GenericFeatureFunction() {}
GenericFeatureFunction::~GenericFeatureFunction() {
delete feature_type_;
}
int GenericFeatureFunction::GetIntParameter(const string &name,
int default_value) const {
string value = GetParameter(name);
return utils::ParseUsing<int>(value, default_value,
tensorflow::strings::safe_strto32);
}
void GenericFeatureFunction::GetFeatureTypes(
vector<FeatureType *> *types) const {
if (feature_type_ != nullptr) types->push_back(feature_type_);
}
FeatureType *GenericFeatureFunction::GetFeatureType() const {
// If a single feature type has been registered return it.
if (feature_type_ != nullptr) return feature_type_;
// Get feature types for function.
vector<FeatureType *> types;
GetFeatureTypes(&types);
// If there is exactly one feature type return this, else return null.
if (types.size() == 1) return types[0];
return nullptr;
}
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Generic feature extractor for extracting features from objects. The feature
// extractor can be used for extracting features from any object. The feature
// extractor and feature function classes are template classes that have to
// be instantiated for extracting feature from a specific object type.
//
// A feature extractor consists of a hierarchy of feature functions. Each
// feature function extracts one or more feature type and value pairs from the
// object.
//
// The feature extractor has a modular design where new feature functions can be
// registered as components. The feature extractor is initialized from a
// descriptor represented by a protocol buffer. The feature extractor can also
// be initialized from a text-based source specification of the feature
// extractor. Feature specification parsers can be added as components. By
// default the feature extractor can be read from an ASCII protocol buffer or in
// a simple feature modeling language (fml).
// A feature function is invoked with a focus. Nested feature functions can be
// invoked with another focus determined by the parent feature function.
#ifndef SYNTAXNET_FEATURE_EXTRACTOR_H_
#define SYNTAXNET_FEATURE_EXTRACTOR_H_
#include <memory>
#include <string>
#include <vector>
#include "syntaxnet/feature_extractor.pb.h"
#include "syntaxnet/feature_types.h"
#include "syntaxnet/proto_io.h"
#include "syntaxnet/registry.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/task_context.h"
#include "syntaxnet/utils.h"
#include "syntaxnet/workspace.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/io/inputbuffer.h"
#include "tensorflow/core/lib/io/record_reader.h"
#include "tensorflow/core/lib/io/record_writer.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
namespace syntaxnet {
// Use the same type for feature values as is used for predicates.
typedef int64 Predicate;
typedef Predicate FeatureValue;
// Output feature model in FML format.
void ToFMLFunction(const FeatureFunctionDescriptor &function, string *output);
void ToFML(const FeatureFunctionDescriptor &function, string *output);
// A feature vector contains feature type and value pairs.
class FeatureVector {
public:
FeatureVector() {}
// Adds feature type and value pair to feature vector.
void add(FeatureType *type, FeatureValue value) {
features_.emplace_back(type, value);
}
// Removes all elements from the feature vector.
void clear() { features_.clear(); }
// Returns the number of elements in the feature vector.
int size() const { return features_.size(); }
// Reserves space in the underlying feature vector.
void reserve(int n) { features_.reserve(n); }
// Returns feature type for an element in the feature vector.
FeatureType *type(int index) const { return features_[index].type; }
// Returns feature value for an element in the feature vector.
FeatureValue value(int index) const { return features_[index].value; }
private:
// Structure for holding feature type and value pairs.
struct Element {
Element() : type(NULL), value(-1) {}
Element(FeatureType *t, FeatureValue v) : type(t), value(v) {}
FeatureType *type;
FeatureValue value;
};
// Array for storing feature vector elements.
vector<Element> features_;
TF_DISALLOW_COPY_AND_ASSIGN(FeatureVector);
};
// The generic feature extractor is the type-independent part of a feature
// extractor. This holds the descriptor for the feature extractor and the
// collection of feature types used in the feature extractor. The feature
// types are not available until FeatureExtractor<>::Init() has been called.
class GenericFeatureExtractor {
public:
GenericFeatureExtractor();
virtual ~GenericFeatureExtractor();
// Initializes the feature extractor from a source representation of the
// feature extractor. The first line is used for determining the feature
// specification language. If the first line starts with #! followed by a name
// then this name is used for instantiating a feature specification parser
// with that name. If the language cannot be detected this way it falls back
// to using the default language supplied.
void Parse(const string &source);
// Returns the feature extractor descriptor.
const FeatureExtractorDescriptor &descriptor() const { return descriptor_; }
FeatureExtractorDescriptor *mutable_descriptor() { return &descriptor_; }
// Returns the number of feature types in the feature extractor. Invalid
// before Init() has been called.
int feature_types() const { return feature_types_.size(); }
// Returns all feature types names used by the extractor. The names are
// added to the types_names array. Invalid before Init() has been called.
void GetFeatureTypeNames(vector<string> *type_names) const;
// Returns a feature type used in the extractor. Invalid before Init() has
// been called.
const FeatureType *feature_type(int index) const {
return feature_types_[index];
}
// Returns the feature domain size of this feature extractor.
// NOTE: The way that domain size is calculated is, for some, unintuitive. It
// is the largest domain size of any feature type.
FeatureValue GetDomainSize() const;
protected:
// Initializes the feature types used by the extractor. Called from
// FeatureExtractor<>::Init().
void InitializeFeatureTypes();
private:
// Initializes the top-level feature functions.
virtual void InitializeFeatureFunctions() = 0;
// Returns all feature types used by the extractor. The feature types are
// added to the result array.
virtual void GetFeatureTypes(vector<FeatureType *> *types) const = 0;
// Descriptor for the feature extractor. This is a protocol buffer that
// contains all the information about the feature extractor. The feature
// functions are initialized from the information in the descriptor.
FeatureExtractorDescriptor descriptor_;
// All feature types used by the feature extractor. The collection of all the
// feature types describes the feature space of the feature set produced by
// the feature extractor. Not owned.
vector<FeatureType *> feature_types_;
};
// The generic feature function is the type-independent part of a feature
// function. Each feature function is associated with the descriptor that it is
// instantiated from. The feature types associated with this feature function
// will be established by the time FeatureExtractor<>::Init() completes.
class GenericFeatureFunction {
public:
// A feature value that represents the absence of a value.
static constexpr FeatureValue kNone = -1;
GenericFeatureFunction();
virtual ~GenericFeatureFunction();
// Sets up the feature function. NB: FeatureTypes of nested functions are not
// guaranteed to be available until Init().
virtual void Setup(TaskContext *context) {}
// Initializes the feature function. NB: The FeatureType of this function must
// be established when this method completes.
virtual void Init(TaskContext *context) {}
// Requests workspaces from a registry to obtain indices into a WorkspaceSet
// for any Workspace objects used by this feature function. NB: This will be
// called after Init(), so it can depend on resources and arguments.
virtual void RequestWorkspaces(WorkspaceRegistry *registry) {}
// Appends the feature types produced by the feature function to types. The
// default implementation appends feature_type(), if non-null. Invalid
// before Init() has been called.
virtual void GetFeatureTypes(vector<FeatureType *> *types) const;
// Returns the feature type for features produced by this feature function. If
// the feature function produces features of different types this returns
// null. Invalid before Init() has been called.
virtual FeatureType *GetFeatureType() const;
// Returns the name of the registry used for creating the feature function.
// This can be used for checking if two feature functions are of the same
// kind.
virtual const char *RegistryName() const = 0;
// Returns the value of a named parameter in the feature function's descriptor.
// If the named parameter is not found the global parameters are searched.
string GetParameter(const string &name) const;
int GetIntParameter(const string &name, int default_value) const;
// Returns the FML function description for the feature function, i.e. the
// name and parameters without the nested features.
string FunctionName() const {
string output;
ToFMLFunction(*descriptor_, &output);
return output;
}
// Returns the prefix for nested feature functions. This is the prefix of this
// feature function concatenated with the feature function name.
string SubPrefix() const {
return prefix_.empty() ? FunctionName() : prefix_ + "." + FunctionName();
}
// Returns/sets the feature extractor this function belongs to.
GenericFeatureExtractor *extractor() const { return extractor_; }
void set_extractor(GenericFeatureExtractor *extractor) {
extractor_ = extractor;
}
// Returns/sets the feature function descriptor.
FeatureFunctionDescriptor *descriptor() const { return descriptor_; }
void set_descriptor(FeatureFunctionDescriptor *descriptor) {
descriptor_ = descriptor;
}
// Returns a descriptive name for the feature function. The name is taken from
// the descriptor for the feature function. If the name is empty or the
// feature function is a variable the name is the FML representation of the
// feature, including the prefix.
string name() const {
string output;
if (descriptor_->name().empty()) {
if (!prefix_.empty()) {
output.append(prefix_);
output.append(".");
}
ToFML(*descriptor_, &output);
} else {
output = descriptor_->name();
}
tensorflow::StringPiece stripped(output);
utils::RemoveWhitespaceContext(&stripped);
return stripped.ToString();
}
// Returns the argument from the feature function descriptor. It defaults to
// 0 if the argument has not been specified.
int argument() const {
return descriptor_->has_argument() ? descriptor_->argument() : 0;
}
// Returns/sets/clears function name prefix.
const string &prefix() const { return prefix_; }
void set_prefix(const string &prefix) { prefix_ = prefix; }
protected:
// Returns the feature type for single-type feature functions.
FeatureType *feature_type() const { return feature_type_; }
// Sets the feature type for single-type feature functions. This takes
// ownership of feature_type. Can only be called once.
void set_feature_type(FeatureType *feature_type) {
CHECK(feature_type_ == nullptr);
feature_type_ = feature_type;
}
private:
// Feature extractor this feature function belongs to. Not owned.
GenericFeatureExtractor *extractor_ = nullptr;
// Descriptor for feature function. Not owned.
FeatureFunctionDescriptor *descriptor_ = nullptr;
// Feature type for features produced by this feature function. If the
// feature function produces features of multiple feature types this is null
// and the feature function must return its feature types in
// GetFeatureTypes(). Owned.
FeatureType *feature_type_ = nullptr;
// Prefix used for sub-feature types of this function.
string prefix_;
};
// Feature function that can extract features from an object. Templated on
// two type arguments:
//
// OBJ: The "object" from which features are extracted; e.g., a sentence. This
// should be a plain type, rather than a reference or pointer.
//
// ARGS: A set of 0 or more types that are used to "index" into some part of the
// object that should be extracted, e.g. an int token index for a sentence
// object. This should not be a reference type.
template<class OBJ, class ...ARGS>
class FeatureFunction
: public GenericFeatureFunction,
public RegisterableClass< FeatureFunction<OBJ, ARGS...> > {
public:
using Self = FeatureFunction<OBJ, ARGS...>;
// Preprocesses the object. This will be called prior to calling Evaluate()
// or Compute() on that object.
virtual void Preprocess(WorkspaceSet *workspaces, OBJ *object) const {}
// Appends features computed from the object and focus to the result. The
// default implementation delegates to Compute(), adding a single value if
// available. Multi-valued feature functions must override this method.
virtual void Evaluate(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, FeatureVector *result) const {
FeatureValue value = Compute(workspaces, object, args..., result);
if (value != kNone) result->add(feature_type(), value);
}
// Returns a feature value computed from the object and focus, or kNone if no
// value is computed. Single-valued feature functions only need to override
// this method.
virtual FeatureValue Compute(const WorkspaceSet &workspaces,
const OBJ &object,
ARGS... args,
const FeatureVector *fv) const {
return kNone;
}
// Instantiates a new feature function in a feature extractor from a feature
// descriptor.
static Self *Instantiate(GenericFeatureExtractor *extractor,
FeatureFunctionDescriptor *fd,
const string &prefix) {
Self *f = Self::Create(fd->type());
f->set_extractor(extractor);
f->set_descriptor(fd);
f->set_prefix(prefix);
return f;
}
// Returns the name of the registry for the feature function.
const char *RegistryName() const override {
return Self::registry()->name;
}
private:
// Special feature function class for resolving variable references. The type
// of the feature function is used for resolving the variable reference. When
// evaluated it will either get the feature value(s) from the variable portion
// of the feature vector, if present, or otherwise it will call the referenced
// feature extractor function directly to extract the feature(s).
class Reference;
};
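For illustration, a minimal single-valued feature function. The Document object and the capitalization semantics below are hypothetical (they are not part of this commit), and real use would additionally declare and register a class registry for FeatureFunction<Document, int>.

// Hypothetical object type used only for this sketch.
struct Document {
  vector<string> words;
};

class CapitalizationFeature : public FeatureFunction<Document, int> {
 public:
  enum Value { LOWER = 0, UPPER = 1 };

  // The feature type must be established by the time Init() completes.
  void Init(TaskContext *context) override {
    set_feature_type(new EnumFeatureType(
        "capitalization", {{LOWER, "LOWER"}, {UPPER, "UPPER"}}));
  }

  // Single-valued feature functions only need to override Compute().
  FeatureValue Compute(const WorkspaceSet &workspaces, const Document &doc,
                       int focus, const FeatureVector *fv) const override {
    if (focus < 0 || focus >= static_cast<int>(doc.words.size())) return kNone;
    const string &word = doc.words[focus];
    return (!word.empty() && word[0] >= 'A' && word[0] <= 'Z') ? UPPER : LOWER;
  }
};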
// Base class for features with nested feature functions. The nested functions
// are of type NES, which may be different from the type of the parent function.
// NB: NestedFeatureFunction will ensure that all initialization of nested
// functions takes place during Setup() and Init() -- after the nested features
// are initialized, the parent feature is initialized via SetupNested() and
// InitNested(). Alternatively, a derived class that overrides Setup() and
// Init() directly should call Parent::Setup(), Parent::Init(), etc. first.
//
// Note: NestedFeatureFunction cannot know how to call Preprocess, Evaluate, or
// Compute, since the nested functions may be of a different type.
template<class NES, class OBJ, class ...ARGS>
class NestedFeatureFunction : public FeatureFunction<OBJ, ARGS...> {
public:
using Parent = NestedFeatureFunction<NES, OBJ, ARGS...>;
// Clean up nested functions.
~NestedFeatureFunction() override { utils::STLDeleteElements(&nested_); }
// By default, just appends the nested feature types.
void GetFeatureTypes(vector<FeatureType *> *types) const override {
CHECK(!this->nested().empty())
<< "Nested features require nested features to be defined.";
for (auto *function : nested_) function->GetFeatureTypes(types);
}
// Sets up the nested features.
void Setup(TaskContext *context) override {
CreateNested(this->extractor(), this->descriptor(), &nested_,
this->SubPrefix());
for (auto *function : nested_) function->Setup(context);
SetupNested(context);
}
// Sets up this NestedFeatureFunction specifically.
virtual void SetupNested(TaskContext *context) {}
// Initializes the nested features.
void Init(TaskContext *context) override {
for (auto *function : nested_) function->Init(context);
InitNested(context);
}
// Initializes this NestedFeatureFunction specifically.
virtual void InitNested(TaskContext *context) {}
// Gets all the workspaces needed for the nested functions.
void RequestWorkspaces(WorkspaceRegistry *registry) override {
for (auto *function : nested_) function->RequestWorkspaces(registry);
}
// Returns the list of nested feature functions.
const vector<NES *> &nested() const { return nested_; }
// Instantiates nested feature functions for a feature function. Creates and
// initializes one feature function for each sub-descriptor in the feature
// descriptor.
static void CreateNested(GenericFeatureExtractor *extractor,
FeatureFunctionDescriptor *fd,
vector<NES *> *functions,
const string &prefix) {
for (int i = 0; i < fd->feature_size(); ++i) {
FeatureFunctionDescriptor *sub = fd->mutable_feature(i);
NES *f = NES::Instantiate(extractor, sub, prefix);
functions->push_back(f);
}
}
protected:
// The nested feature functions, if any, in order of declaration in the
// feature descriptor. Owned.
vector<NES *> nested_;
};
// Base class for a nested feature function that takes nested features with the
// same signature as these features, i.e. a meta feature. For this class, we can
// provide preprocessing of the nested features.
template<class OBJ, class ...ARGS>
class MetaFeatureFunction : public NestedFeatureFunction<
FeatureFunction<OBJ, ARGS...>, OBJ, ARGS...> {
public:
// Preprocesses using the nested features.
void Preprocess(WorkspaceSet *workspaces, OBJ *object) const override {
for (auto *function : this->nested_) {
function->Preprocess(workspaces, object);
}
}
};
// Template for a special type of locator: The locator of type
// FeatureFunction<OBJ, ARGS...> calls nested functions of type
// FeatureFunction<OBJ, IDX, ARGS...>, where the derived class DER is
// responsible for translating by providing the following:
//
// // Gets the new additional focus.
// IDX GetFocus(const WorkspaceSet &workspaces, const OBJ &object, ARGS... args);
//
// This is useful to e.g. add a token focus to a parser state based on some
// desired property of that state.
template<class DER, class OBJ, class IDX, class ...ARGS>
class FeatureAddFocusLocator : public NestedFeatureFunction<
FeatureFunction<OBJ, IDX, ARGS...>, OBJ, ARGS...> {
public:
void Preprocess(WorkspaceSet *workspaces, OBJ *object) const override {
for (auto *function : this->nested_) {
function->Preprocess(workspaces, object);
}
}
void Evaluate(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, FeatureVector *result) const override {
IDX focus = static_cast<const DER *>(this)->GetFocus(
workspaces, object, args...);
for (auto *function : this->nested()) {
function->Evaluate(workspaces, object, focus, args..., result);
}
}
// Returns the first nested feature's computed value.
FeatureValue Compute(const WorkspaceSet &workspaces,
const OBJ &object,
ARGS... args,
const FeatureVector *result) const override {
IDX focus = static_cast<const DER *>(this)->GetFocus(
workspaces, object, args...);
return this->nested()[0]->Compute(
workspaces, object, focus, args..., result);
}
};
// CRTP feature locator class. This is a meta feature that modifies ARGS and
// then calls the nested feature functions with the modified ARGS. Note that in
// order for this template to work correctly, all of ARGS must be types for
// which the reference operator & can be interpreted as a pointer to the
// argument. The derived class DER must implement the UpdateArgs method, which
// takes pointers to the ARGS arguments:
//
// // Updates the current arguments.
// void UpdateArgs(const WorkspaceSet &workspaces, const OBJ &object,
// ARGS *...args) const;
template<class DER, class OBJ, class ...ARGS>
class FeatureLocator : public MetaFeatureFunction<OBJ, ARGS...> {
public:
// Feature locators have an additional check that there is no intrinsic type.
void GetFeatureTypes(vector<FeatureType *> *types) const override {
CHECK(this->feature_type() == nullptr)
<< "FeatureLocators should not have an intrinsic type.";
MetaFeatureFunction<OBJ, ARGS...>::GetFeatureTypes(types);
}
// Evaluates the locator.
void Evaluate(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, FeatureVector *result) const override {
static_cast<const DER *>(this)->UpdateArgs(workspaces, object, &args...);
for (auto *function : this->nested()) {
function->Evaluate(workspaces, object, args..., result);
}
}
// Returns the first nested feature's computed value.
FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args,
const FeatureVector *result) const override {
static_cast<const DER *>(this)->UpdateArgs(workspaces, object, &args...);
return this->nested()[0]->Compute(workspaces, object, args..., result);
}
};
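A minimal sketch of a concrete locator following the CRTP contract above; the offset semantics and the "offset" parameter name are hypothetical, not part of this commit.

// Shifts the focus index by a fixed offset before evaluating nested features.
template <class OBJ>
class OffsetLocator : public FeatureLocator<OffsetLocator<OBJ>, OBJ, int> {
 public:
  // Reads the (hypothetical) "offset" parameter from the feature descriptor.
  void SetupNested(TaskContext *context) override {
    offset_ = this->GetIntParameter("offset", 0);
  }

  // Called by FeatureLocator::Evaluate()/Compute() before delegating to the
  // nested functions; receives a pointer to each of the ARGS arguments.
  void UpdateArgs(const WorkspaceSet &workspaces, const OBJ &object,
                  int *focus) const {
    *focus += offset_;
  }

 private:
  int offset_ = 0;
};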
// Feature extractor for extracting features from objects of a certain class.
// Template type parameters are as defined for FeatureFunction.
template<class OBJ, class ...ARGS>
class FeatureExtractor : public GenericFeatureExtractor {
public:
// Feature function type for top-level functions in the feature extractor.
typedef FeatureFunction<OBJ, ARGS...> Function;
typedef FeatureExtractor<OBJ, ARGS...> Self;
// Feature locator type for the feature extractor.
template<class DER>
using Locator = FeatureLocator<DER, OBJ, ARGS...>;
// Initializes feature extractor.
FeatureExtractor() {}
~FeatureExtractor() override { utils::STLDeleteElements(&functions_); }
// Sets up the feature extractor. Note that only top-level functions exist
// until Setup() is called. This does not take ownership over the context,
// which must outlive this.
void Setup(TaskContext *context) {
for (Function *function : functions_) function->Setup(context);
}
// Initializes the feature extractor. Must be called after Setup(). This
// does not take ownership over the context, which must outlive this.
void Init(TaskContext *context) {
for (Function *function : functions_) function->Init(context);
this->InitializeFeatureTypes();
}
// Requests workspaces from the registry. Must be called after Init(), and
// before Preprocess(). Does not take ownership over registry. This should be
// the same registry used to initialize the WorkspaceSet used in Preprocess()
// and ExtractFeatures(). NB: This is a different ordering from that used in
// SentenceFeatureRepresentation style feature computation.
void RequestWorkspaces(WorkspaceRegistry *registry) {
for (auto *function : functions_) function->RequestWorkspaces(registry);
}
// Preprocesses the object using feature functions for the phase. Must be
// called before any calls to ExtractFeatures() on that object and phase.
void Preprocess(WorkspaceSet *workspaces, OBJ *object) const {
for (Function *function : functions_) {
function->Preprocess(workspaces, object);
}
}
// Extracts features from an object with a focus. This invokes all the
// top-level feature functions in the feature extractor. Only feature
// functions belonging to the specified phase are invoked.
void ExtractFeatures(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, FeatureVector *result) const {
result->reserve(this->feature_types());
// Extract features.
for (int i = 0; i < functions_.size(); ++i) {
functions_[i]->Evaluate(workspaces, object, args..., result);
}
}
private:
// Creates and initializes all feature functions in the feature extractor.
void InitializeFeatureFunctions() override {
// Create all top-level feature functions.
for (int i = 0; i < descriptor().feature_size(); ++i) {
FeatureFunctionDescriptor *fd = mutable_descriptor()->mutable_feature(i);
Function *function = Function::Instantiate(this, fd, "");
functions_.push_back(function);
}
}
// Collect all feature types used in the feature extractor.
void GetFeatureTypes(vector<FeatureType *> *types) const override {
for (int i = 0; i < functions_.size(); ++i) {
functions_[i]->GetFeatureTypes(types);
}
}
// Top-level feature functions (and variables) in the feature extractor.
// Owned.
vector<Function *> functions_;
};
#define REGISTER_FEATURE_FUNCTION(base, name, component) \
REGISTER_CLASS_COMPONENT(base, name, component)
} // namespace syntaxnet
#endif // SYNTAXNET_FEATURE_EXTRACTOR_H_
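Putting the pieces together, a hedged sketch of the life cycle of a typed extractor. It reuses the hypothetical Document type and "capitalization" function from the earlier sketch, assumes the corresponding registry has been declared, and treats the FML string and the WorkspaceSet::Reset() call as illustrative.

void RunExtractor(TaskContext *context, Document *doc) {
  FeatureExtractor<Document, int> extractor;
  extractor.Parse("capitalization offset(1).capitalization");  // illustrative FML
  extractor.Setup(context);
  extractor.Init(context);                  // feature types become valid here

  WorkspaceRegistry registry;
  extractor.RequestWorkspaces(&registry);
  WorkspaceSet workspaces;
  workspaces.Reset(registry);               // assumed WorkspaceSet API
  extractor.Preprocess(&workspaces, doc);

  FeatureVector features;
  extractor.ExtractFeatures(workspaces, *doc, /*focus=*/0, &features);
  for (int i = 0; i < features.size(); ++i) {
    LOG(INFO) << features.type(i)->name() << " = " << features.value(i);
  }
}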
// Protocol buffers for feature extractor.
syntax = "proto2";
package syntaxnet;
message Parameter {
optional string name = 1;
optional string value = 2;
}
// Descriptor for feature function.
message FeatureFunctionDescriptor {
// Feature function type.
required string type = 1;
// Feature function name.
optional string name = 2;
// Default argument for feature function.
optional int32 argument = 3 [default = 0];
// Named parameters for feature descriptor.
repeated Parameter parameter = 4;
// Nested sub-feature function descriptors.
repeated FeatureFunctionDescriptor feature = 7;
};
// Descriptor for feature extractor.
message FeatureExtractorDescriptor {
// Top-level feature function for extractor.
repeated FeatureFunctionDescriptor feature = 1;
};
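For reference, a small C++ sketch of building a descriptor roughly equivalent to the FML fragment "input(1).word", using only the fields defined above: a top-level "input" function with argument 1 and a nested "word" function. The correspondence is illustrative; FMLParser is the authoritative translation.

#include "google/protobuf/text_format.h"
#include "syntaxnet/base.h"  // assumed source of the CHECK macro in this codebase
#include "syntaxnet/feature_extractor.pb.h"

syntaxnet::FeatureExtractorDescriptor MakeDescriptor() {
  syntaxnet::FeatureExtractorDescriptor descriptor;
  const char *kSpec =
      "feature {"
      "  type: 'input' argument: 1"
      "  feature { type: 'word' }"
      "}";
  CHECK(google::protobuf::TextFormat::ParseFromString(kSpec, &descriptor));
  return descriptor;
}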
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Common feature types for parser components.
#ifndef SYNTAXNET_FEATURE_TYPES_H_
#define SYNTAXNET_FEATURE_TYPES_H_
#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include "syntaxnet/utils.h"
namespace syntaxnet {
// Use the same type for feature values as is used for predicates.
typedef int64 Predicate;
typedef Predicate FeatureValue;
// Each feature value in a feature vector has a feature type. The feature type
// is used for converting feature type and value pairs to predicate values. The
// feature type can also return names for feature values and calculate the size
// of the feature value domain. The FeatureType class is abstract and must be
// specialized for the concrete feature types.
class FeatureType {
public:
// Initializes a feature type.
explicit FeatureType(const string &name)
: name_(name), base_(0) {}
virtual ~FeatureType() {}
// Converts a feature value to a name.
virtual string GetFeatureValueName(FeatureValue value) const = 0;
// Returns the size of the feature values domain.
virtual int64 GetDomainSize() const = 0;
// Returns the feature type name.
const string &name() const { return name_; }
Predicate base() const { return base_; }
void set_base(Predicate base) { base_ = base; }
private:
// Feature type name.
string name_;
// "Base" feature value: i.e. a "slot" in a global ordering of features.
Predicate base_;
};
// Templated generic resource based feature type. This feature type delegates
// look up of feature value names to an unknown resource class, which is not
// owned. Optionally, this type can also store a mapping of extra values which
// are not in the resource.
//
// Note: this class assumes that Resource->GetFeatureValueName() will return
// successfully ONLY for values in the range [0, Resource->NumValues()). Any
// feature value not in the extra value map and not in the above range will
// result in an ERROR being logged and "<INVALID>" being returned.
template<class Resource>
class ResourceBasedFeatureType : public FeatureType {
public:
// Creates a new type with given name, resource object, and a mapping of
// special values. The values must be greater than or equal to
// resource->NumValues() so as to avoid collisions; this is verified with
// CHECK at creation.
ResourceBasedFeatureType(const string &name, const Resource *resource,
const map<FeatureValue, string> &values)
: FeatureType(name), resource_(resource), values_(values) {
max_value_ = resource->NumValues() - 1;
for (const auto &pair : values) {
CHECK_GE(pair.first, resource->NumValues()) << "Invalid extra value: "
<< pair.first << "," << pair.second;
max_value_ = pair.first > max_value_ ? pair.first : max_value_;
}
}
// Creates a new type with no special values.
ResourceBasedFeatureType(const string &name, const Resource *resource)
: ResourceBasedFeatureType(name, resource, {}) {}
// Returns the feature name for a given feature value. First checks the values
// map, then checks the resource to look up the name.
string GetFeatureValueName(FeatureValue value) const override {
if (values_.find(value) != values_.end()) {
return values_.find(value)->second;
}
if (value >= 0 && value < resource_->NumValues()) {
return resource_->GetFeatureValueName(value);
} else {
LOG(ERROR) << "Invalid feature value " << value << " for " << name();
return "<INVALID>";
}
}
// Returns the number of possible values for this feature type. This is one
// greater than the largest value observed in the resource or extra values.
FeatureValue GetDomainSize() const override { return max_value_ + 1; }
protected:
// Shared resource. Not owned.
const Resource *resource_ = nullptr;
// Maximum possible value this feature could take.
FeatureValue max_value_;
// Mapping for extra feature values not in the resource.
map<FeatureValue, string> values_;
};
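// A minimal usage sketch of ResourceBasedFeatureType (illustrative only;
// "TagMap" stands for any hypothetical resource providing NumValues() and
// GetFeatureValueName()):
//
//   const TagMap *tags = ...;  // values 0 .. tags->NumValues() - 1
//   ResourceBasedFeatureType<TagMap> type(
//       "tag", tags, {{tags->NumValues(), "<ROOT>"}});
//   // type.GetDomainSize() == tags->NumValues() + 1
//   // type.GetFeatureValueName(tags->NumValues()) == "<ROOT>"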
// Feature type that is defined using an explicit map from FeatureValue to
// string values. This can reduce some of the boilerplate when defining
// features that generate enum values. Example usage:
//
//   class BeverageSizeFeature : public FeatureFunction<Beverage> {
// enum FeatureValue { SMALL, MEDIUM, LARGE }; // values for this feature
// void Init(TaskContext *context) override {
// set_feature_type(new EnumFeatureType("beverage_size",
//         {{SMALL, "SMALL"}, {MEDIUM, "MEDIUM"}, {LARGE, "LARGE"}}));
// }
// [...]
// };
class EnumFeatureType : public FeatureType {
public:
EnumFeatureType(const string &name,
const map<FeatureValue, string> &value_names)
: FeatureType(name), value_names_(value_names) {
for (const auto &pair : value_names) {
CHECK_GE(pair.first, 0)
<< "Invalid feature value: " << pair.first << ", " << pair.second;
domain_size_ = std::max(domain_size_, pair.first + 1);
}
}
// Returns the feature name for a given feature value.
string GetFeatureValueName(FeatureValue value) const override {
auto it = value_names_.find(value);
if (it == value_names_.end()) {
LOG(ERROR)
<< "Invalid feature value " << value << " for " << name();
return "<INVALID>";
}
return it->second;
}
// Returns the number of possible values for this feature type. This is one
// greater than the largest value in the value_names map.
FeatureValue GetDomainSize() const override { return domain_size_; }
protected:
// Maximum possible value this feature could take.
FeatureValue domain_size_ = 0;
// Names of feature values.
map<FeatureValue, string> value_names_;
};
} // namespace syntaxnet
#endif // $TARGETDIR_FEATURE_TYPES_H_
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/fml_parser.h"
#include <ctype.h>
#include <string>
#include "syntaxnet/utils.h"
#include "tensorflow/core/lib/strings/strcat.h"
namespace syntaxnet {
void FMLParser::Initialize(const string &source) {
// Initialize parser state.
source_ = source;
current_ = source_.begin();
item_start_ = line_start_ = current_;
line_number_ = item_line_number_ = 1;
// Read first input item.
NextItem();
}
void FMLParser::Error(const string &error_message) {
LOG(FATAL) << "Error in feature model, line " << item_line_number_
<< ", position " << (item_start_ - line_start_ + 1)
<< ": " << error_message
<< "\n " << string(line_start_, current_) << " <--HERE";
}
void FMLParser::Next() {
// Move to the next input character. If we are at a line break update line
// number and line start position.
if (*current_ == '\n') {
++line_number_;
++current_;
line_start_ = current_;
} else {
++current_;
}
}
void FMLParser::NextItem() {
// Skip white space and comments.
while (!eos()) {
if (*current_ == '#') {
// Skip comment.
while (!eos() && *current_ != '\n') Next();
} else if (isspace(*current_)) {
// Skip whitespace.
while (!eos() && isspace(*current_)) Next();
} else {
break;
}
}
// Record start position for next item.
item_start_ = current_;
item_line_number_ = line_number_;
// Check for end of input.
if (eos()) {
item_type_ = END;
return;
}
// Parse number.
if (isdigit(*current_) || *current_ == '+' || *current_ == '-') {
string::iterator start = current_;
Next();
while (isdigit(*current_) || *current_ == '.') Next();
item_text_.assign(start, current_);
item_type_ = NUMBER;
return;
}
// Parse string.
if (*current_ == '"') {
Next();
string::iterator start = current_;
while (*current_ != '"') {
if (eos()) Error("Unterminated string");
Next();
}
item_text_.assign(start, current_);
item_type_ = STRING;
Next();
return;
}
// Parse identifier name.
if (isalpha(*current_) || *current_ == '_' || *current_ == '/') {
string::iterator start = current_;
while (isalnum(*current_) || *current_ == '_' || *current_ == '-' ||
*current_ == '/') Next();
item_text_.assign(start, current_);
item_type_ = NAME;
return;
}
// Single character item.
item_type_ = *current_;
Next();
}
void FMLParser::Parse(const string &source,
FeatureExtractorDescriptor *result) {
// Initialize parser.
Initialize(source);
while (item_type_ != END) {
// Parse either a parameter name or a feature.
if (item_type_ != NAME) Error("Feature type name expected");
string name = item_text_;
NextItem();
if (item_type_ == '=') {
Error("Invalid syntax: feature expected");
} else {
// Parse feature.
FeatureFunctionDescriptor *descriptor = result->add_feature();
descriptor->set_type(name);
ParseFeature(descriptor);
}
}
}
void FMLParser::ParseFeature(FeatureFunctionDescriptor *result) {
// Parse argument and parameters.
if (item_type_ == '(') {
NextItem();
ParseParameter(result);
while (item_type_ == ',') {
NextItem();
ParseParameter(result);
}
if (item_type_ != ')') Error(") expected");
NextItem();
}
// Parse feature name.
if (item_type_ == ':') {
NextItem();
if (item_type_ != NAME && item_type_ != STRING) {
Error("Feature name expected");
}
string name = item_text_;
NextItem();
// Set feature name.
result->set_name(name);
}
// Parse sub-features.
if (item_type_ == '.') {
// Parse dotted sub-feature.
NextItem();
if (item_type_ != NAME) Error("Feature type name expected");
string type = item_text_;
NextItem();
// Parse sub-feature.
FeatureFunctionDescriptor *subfeature = result->add_feature();
subfeature->set_type(type);
ParseFeature(subfeature);
} else if (item_type_ == '{') {
// Parse sub-feature block.
NextItem();
while (item_type_ != '}') {
if (item_type_ != NAME) Error("Feature type name expected");
string type = item_text_;
NextItem();
// Parse sub-feature.
FeatureFunctionDescriptor *subfeature = result->add_feature();
subfeature->set_type(type);
ParseFeature(subfeature);
}
NextItem();
}
}
void FMLParser::ParseParameter(FeatureFunctionDescriptor *result) {
if (item_type_ == NUMBER) {
int argument =
utils::ParseUsing<int>(item_text_, tensorflow::strings::safe_strto32);
NextItem();
// Set default argument for feature.
result->set_argument(argument);
} else if (item_type_ == NAME) {
string name = item_text_;
NextItem();
if (item_type_ != '=') Error("= expected");
NextItem();
if (item_type_ >= END) Error("Parameter value expected");
string value = item_text_;
NextItem();
// Add parameter to feature.
Parameter *parameter;
parameter = result->add_parameter();
parameter->set_name(name);
parameter->set_value(value);
} else {
Error("Syntax error in parameter list");
}
}
void ToFMLFunction(const FeatureFunctionDescriptor &function, string *output) {
output->append(function.type());
if (function.argument() != 0 || function.parameter_size() > 0) {
output->append("(");
bool first = true;
if (function.argument() != 0) {
tensorflow::strings::StrAppend(output, function.argument());
first = false;
}
for (int i = 0; i < function.parameter_size(); ++i) {
if (!first) output->append(",");
output->append(function.parameter(i).name());
output->append("=");
output->append("\"");
output->append(function.parameter(i).value());
output->append("\"");
first = false;
}
output->append(")");
}
}
void ToFML(const FeatureFunctionDescriptor &function, string *output) {
ToFMLFunction(function, output);
if (function.feature_size() == 1) {
output->append(".");
ToFML(function.feature(0), output);
} else if (function.feature_size() > 1) {
output->append(" { ");
for (int i = 0; i < function.feature_size(); ++i) {
if (i > 0) output->append(" ");
ToFML(function.feature(i), output);
}
output->append(" } ");
}
}
void ToFML(const FeatureExtractorDescriptor &extractor, string *output) {
for (int i = 0; i < extractor.feature_size(); ++i) {
ToFML(extractor.feature(i), output);
output->append("\n");
}
}
string AsFML(const FeatureFunctionDescriptor &function) {
string str;
ToFML(function, &str);
return str;
}
string AsFML(const FeatureExtractorDescriptor &extractor) {
string str;
ToFML(extractor, &str);
return str;
}
void StripFML(string *fml_string) {
auto it = fml_string->begin();
while (it != fml_string->end()) {
if (*it == '"') {
it = fml_string->erase(it);
} else {
++it;
}
}
}
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Feature modeling language (fml) parser.
//
// BNF grammar for fml:
//
// <feature model> ::= { <feature extractor> }
//
// <feature extractor> ::= <extractor spec> |
// <extractor spec> '.' <feature extractor> |
// <extractor spec> '{' { <feature extractor> } '}'
//
// <extractor spec> ::= <extractor type>
// [ '(' <parameter list> ')' ]
// [ ':' <extractor name> ]
//
// <parameter list> ::= ( <parameter> | <argument> ) { ',' <parameter> }
//
// <parameter> ::= <parameter name> '=' <parameter value>
//
// <extractor type> ::= NAME
// <extractor name> ::= NAME | STRING
// <argument> ::= NUMBER
// <parameter name> ::= NAME
// <parameter value> ::= NUMBER | STRING | NAME
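//
// Example feature models accepted by this grammar (the first two also appear
// in the parser feature specs elsewhere in this package; the last line is
// purely illustrative, with hypothetical extractor and parameter names):
//
//   stack.child(1).label
//   stack.child(1).sibling(-1).label
//   input(1).token.word(min-freq=5):rare-word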
#ifndef $TARGETDIR_FML_PARSER_H_
#define $TARGETDIR_FML_PARSER_H_
#include <string>
#include "syntaxnet/utils.h"
#include "syntaxnet/feature_extractor.pb.h"
namespace syntaxnet {
class FMLParser {
public:
// Parses fml specification into feature extractor descriptor.
void Parse(const string &source, FeatureExtractorDescriptor *result);
private:
// Initializes the parser with the source text.
void Initialize(const string &source);
// Outputs error message and exits.
void Error(const string &error_message);
// Moves to the next input character.
void Next();
// Moves to the next input item.
void NextItem();
// Parses a feature descriptor.
void ParseFeature(FeatureFunctionDescriptor *result);
// Parses a parameter specification.
void ParseParameter(FeatureFunctionDescriptor *result);
// Returns true if end of source input has been reached.
bool eos() { return current_ == source_.end(); }
// Item types.
enum ItemTypes {
END = 0,
NAME = -1,
NUMBER = -2,
STRING = -3,
};
// Source text.
string source_;
// Current input position.
string::iterator current_;
// Line number for current input position.
int line_number_;
// Start position for current item.
string::iterator item_start_;
// Start position for current line.
string::iterator line_start_;
// Line number for current item.
int item_line_number_;
// Item type for current item. If this is positive it is interpreted as a
// character. If it is negative it is interpreted as an item type.
int item_type_;
// Text for current item.
string item_text_;
};
} // namespace syntaxnet
#endif // $TARGETDIR_FML_PARSER_H_
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Builds parser models."""
import tensorflow as tf
import syntaxnet.load_parser_ops
from tensorflow.python.ops import control_flow_ops as cf
from tensorflow.python.ops import state_ops
from tensorflow.python.platform import logging
from syntaxnet.ops import gen_parser_ops
def BatchedSparseToDense(sparse_indices, output_size):
"""Batch compatible sparse to dense conversion.
This is useful for one-hot coded target labels.
Args:
sparse_indices: [batch_size] tensor containing one index per batch
output_size: needed in order to generate the correct dense output
Returns:
A [batch_size, output_size] dense tensor.
"""
eye = tf.diag(tf.fill([output_size], tf.constant(1, tf.float32)))
return tf.nn.embedding_lookup(eye, sparse_indices)
def EmbeddingLookupFeatures(params, sparse_features, allow_weights):
"""Computes embeddings for each entry of sparse features sparse_features.
Args:
params: list of 2D tensors containing vector embeddings
sparse_features: 1D tensor of strings. Each entry is a string encoding of
dist_belief.SparseFeatures, and represents a variable length list of
feature ids, and optionally, corresponding weights values.
allow_weights: boolean to control whether the weights returned from the
SparseFeatures are used to multiply the embeddings.
Returns:
A tensor representing the combined embeddings for the sparse features.
For each entry s in sparse_features, the function looks up the embeddings
    for each id and sums them into a single tensor, weighting them by the
weight of each id. It returns a tensor with each entry of sparse_features
replaced by this combined embedding.
"""
if not isinstance(params, list):
params = [params]
# Lookup embeddings.
sparse_features = tf.convert_to_tensor(sparse_features)
indices, ids, weights = gen_parser_ops.unpack_sparse_features(sparse_features)
embeddings = tf.nn.embedding_lookup(params, ids)
if allow_weights:
# Multiply by weights, reshaping to allow broadcast.
broadcast_weights_shape = tf.concat(0, [tf.shape(weights), [1]])
embeddings *= tf.reshape(weights, broadcast_weights_shape)
# Sum embeddings by index.
return tf.unsorted_segment_sum(embeddings, indices, tf.size(sparse_features))
class GreedyParser(object):
"""Builds a Chen & Manning style greedy neural net parser.
Builds a graph with an optional reader op connected at one end and
operations needed to train the network on the other. Supports multiple
network instantiations sharing the same parameters and network topology.
The following named nodes are added to the training and eval networks:
epochs: a tensor containing the current epoch number
cost: a tensor containing the current training step cost
gold_actions: a tensor containing actions from gold decoding
feature_endpoints: a list of sparse feature vectors
logits: output of the final layer before computing softmax
The training network also contains:
train_op: an op that executes a single training step
Typical usage:
parser = graph_builder.GreedyParser(num_actions, num_features,
num_feature_ids, embedding_sizes,
hidden_layer_sizes)
parser.AddTraining(task_context, batch_size=5)
with tf.Session('local') as sess:
# This works because the session uses the same default graph as the
# GraphBuilder did.
sess.run(parser.inits.values())
while True:
tf_epoch, _ = sess.run([parser.training['epoch'],
parser.training['train_op']])
if tf_epoch[0] > 0:
break
"""
def __init__(self,
num_actions,
num_features,
num_feature_ids,
embedding_sizes,
hidden_layer_sizes,
seed=None,
gate_gradients=False,
use_locking=False,
embedding_init=1.0,
relu_init=1e-4,
bias_init=0.2,
softmax_init=1e-4,
averaging_decay=0.9999,
use_averaging=True,
check_parameters=True,
check_every=1,
allow_feature_weights=False,
only_train='',
arg_prefix=None,
**unused_kwargs):
"""Initialize the graph builder with parameters defining the network.
Args:
num_actions: int size of the set of parser actions
num_features: int list of dimensions of the feature vectors
num_feature_ids: int list of same length as num_features corresponding to
the sizes of the input feature spaces
embedding_sizes: int list of same length as num_features of the desired
embedding layer sizes
hidden_layer_sizes: int list of desired relu layer sizes; may be empty
seed: optional random initializer seed to enable reproducibility
gate_gradients: if True, gradient updates are computed synchronously,
ensuring consistency and reproducibility
use_locking: if True, use locking to avoid read-write contention when
updating Variables
embedding_init: sets the std dev of normal initializer of embeddings to
embedding_init / embedding_size ** .5
relu_init: sets the std dev of normal initializer of relu weights
to relu_init
bias_init: sets constant initializer of relu bias to bias_init
      softmax_init: sets the std dev of normal initializer of softmax weights
to softmax_init
averaging_decay: decay for exponential moving average when computing
averaged parameters, set to 1 to do vanilla averaging
use_averaging: whether to use moving averages of parameters during evals
check_parameters: whether to check for NaN/Inf parameters during
training
check_every: checks numerics every check_every steps.
allow_feature_weights: whether feature weights are allowed.
only_train: the comma separated set of parameter names to train. If empty,
all model parameters will be trained.
arg_prefix: prefix for context parameters.
"""
self._num_actions = num_actions
self._num_features = num_features
self._num_feature_ids = num_feature_ids
self._embedding_sizes = embedding_sizes
self._hidden_layer_sizes = hidden_layer_sizes
self._seed = seed
self._gate_gradients = gate_gradients
self._use_locking = use_locking
self._use_averaging = use_averaging
self._check_parameters = check_parameters
self._check_every = check_every
self._allow_feature_weights = allow_feature_weights
self._only_train = set(only_train.split(',')) if only_train else None
self._feature_size = len(embedding_sizes)
self._embedding_init = embedding_init
self._relu_init = relu_init
self._softmax_init = softmax_init
self._arg_prefix = arg_prefix
# Parameters of the network with respect to which training is done.
self.params = {}
# Other variables, with respect to which no training is done, but which we
# nonetheless need to save in order to capture the state of the graph.
self.variables = {}
# Operations to initialize any nodes that require initialization.
self.inits = {}
# Training- and eval-related nodes.
self.training = {}
self.evaluation = {}
self.saver = None
# Nodes to compute moving averages of parameters, called every train step.
self._averaging = {}
self._averaging_decay = averaging_decay
# Pretrained embeddings that can be used instead of constant initializers.
self._pretrained_embeddings = {}
# After the following 'with' statement, we'll be able to re-enter the
# 'params' scope by re-using the self._param_scope member variable. See for
# instance _AddParam.
with tf.name_scope('params') as self._param_scope:
self._relu_bias_init = tf.constant_initializer(bias_init)
@property
def embedding_size(self):
size = 0
for i in range(self._feature_size):
size += self._num_features[i] * self._embedding_sizes[i]
return size
def _AddParam(self,
shape,
dtype,
name,
initializer=None,
return_average=False):
"""Add a model parameter w.r.t. we expect to compute gradients.
_AddParam creates both regular parameters (usually for training) and
averaged nodes (usually for inference). It returns one or the other based
on the 'return_average' arg.
Args:
shape: int list, tensor shape of the parameter to create
dtype: tf.DataType, data type of the parameter
name: string, name of the parameter in the TF graph
      initializer: optional initializer for the parameter
      return_average: if False, return the parameter; otherwise return its
        moving average
Returns:
parameter or averaged parameter
"""
if name not in self.params:
step = tf.cast(self.GetStep(), tf.float32)
# Put all parameters and their initializing ops in their own scope
# irrespective of the current scope (training or eval).
with tf.name_scope(self._param_scope):
self.params[name] = tf.get_variable(name, shape, dtype, initializer)
param = self.params[name]
if initializer is not None:
self.inits[name] = state_ops.init_variable(param, initializer)
if self._averaging_decay == 1:
logging.info('Using vanilla averaging of parameters.')
ema = tf.train.ExponentialMovingAverage(decay=(step / (step + 1.0)),
num_updates=None)
else:
ema = tf.train.ExponentialMovingAverage(decay=self._averaging_decay,
num_updates=step)
self._averaging[name + '_avg_update'] = ema.apply([param])
self.variables[name + '_avg_var'] = ema.average(param)
self.inits[name + '_avg_init'] = state_ops.init_variable(
ema.average(param), tf.zeros_initializer)
return (self.variables[name + '_avg_var'] if return_average else
self.params[name])
def GetStep(self):
def OnesInitializer(shape, dtype=tf.float32):
return tf.ones(shape, dtype)
return self._AddVariable([], tf.int32, 'step', OnesInitializer)
def _AddVariable(self, shape, dtype, name, initializer=None):
if name in self.variables:
return self.variables[name]
self.variables[name] = tf.get_variable(name, shape, dtype, initializer)
if initializer is not None:
self.inits[name] = state_ops.init_variable(self.variables[name],
initializer)
return self.variables[name]
def _ReluWeightInitializer(self):
with tf.name_scope(self._param_scope):
return tf.random_normal_initializer(stddev=self._relu_init,
seed=self._seed)
def _EmbeddingMatrixInitializer(self, index, embedding_size):
if index in self._pretrained_embeddings:
return self._pretrained_embeddings[index]
else:
return tf.random_normal_initializer(
stddev=self._embedding_init / embedding_size**.5,
seed=self._seed)
def _AddEmbedding(self,
features,
num_features,
num_ids,
embedding_size,
index,
return_average=False):
"""Adds an embedding matrix and passes the `features` vector through it."""
embedding_matrix = self._AddParam(
[num_ids, embedding_size],
tf.float32,
'embedding_matrix_%d' % index,
self._EmbeddingMatrixInitializer(index, embedding_size),
return_average=return_average)
embedding = EmbeddingLookupFeatures(embedding_matrix,
tf.reshape(features,
[-1],
name='feature_%d' % index),
self._allow_feature_weights)
return tf.reshape(embedding, [-1, num_features * embedding_size])
def _BuildNetwork(self, feature_endpoints, return_average=False):
"""Builds a feed-forward part of the net given features as input.
    The network topology is already defined in the constructor, so multiple
    calls to _BuildNetwork build multiple networks whose parameters are all
shared. It is the source of the input features and the use of the output
that distinguishes each network.
Args:
feature_endpoints: tensors with input features to the network
return_average: whether to use moving averages as model parameters
Returns:
logits: output of the final layer before computing softmax
"""
assert len(feature_endpoints) == self._feature_size
# Create embedding layer.
embeddings = []
for i in range(self._feature_size):
embeddings.append(self._AddEmbedding(feature_endpoints[i],
self._num_features[i],
self._num_feature_ids[i],
self._embedding_sizes[i],
i,
return_average=return_average))
last_layer = tf.concat(1, embeddings)
last_layer_size = self.embedding_size
# Create ReLU layers.
for i, hidden_layer_size in enumerate(self._hidden_layer_sizes):
weights = self._AddParam(
[last_layer_size, hidden_layer_size],
tf.float32,
'weights_%d' % i,
self._ReluWeightInitializer(),
return_average=return_average)
bias = self._AddParam([hidden_layer_size],
tf.float32,
'bias_%d' % i,
self._relu_bias_init,
return_average=return_average)
last_layer = tf.nn.relu_layer(last_layer,
weights,
bias,
name='layer_%d' % i)
last_layer_size = hidden_layer_size
# Create softmax layer.
softmax_weight = self._AddParam(
[last_layer_size, self._num_actions],
tf.float32,
'softmax_weight',
tf.random_normal_initializer(stddev=self._softmax_init,
seed=self._seed),
return_average=return_average)
softmax_bias = self._AddParam(
[self._num_actions],
tf.float32,
'softmax_bias',
tf.zeros_initializer,
return_average=return_average)
logits = tf.nn.xw_plus_b(last_layer,
softmax_weight,
softmax_bias,
name='logits')
return {'logits': logits}
def _AddGoldReader(self, task_context, batch_size, corpus_name):
features, epochs, gold_actions = (
gen_parser_ops.gold_parse_reader(task_context,
self._feature_size,
batch_size,
corpus_name=corpus_name,
arg_prefix=self._arg_prefix))
return {'gold_actions': tf.identity(gold_actions,
name='gold_actions'),
'epochs': tf.identity(epochs,
name='epochs'),
'feature_endpoints': features}
def _AddDecodedReader(self, task_context, batch_size, transition_scores,
corpus_name):
features, epochs, eval_metrics, documents = (
gen_parser_ops.decoded_parse_reader(transition_scores,
task_context,
self._feature_size,
batch_size,
corpus_name=corpus_name,
arg_prefix=self._arg_prefix))
return {'eval_metrics': eval_metrics,
'epochs': tf.identity(epochs,
name='epochs'),
'feature_endpoints': features,
'documents': documents}
def _AddCostFunction(self, batch_size, gold_actions, logits):
"""Cross entropy plus L2 loss on weights and biases of the hidden layers."""
dense_golden = BatchedSparseToDense(gold_actions, self._num_actions)
cross_entropy = tf.div(
tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(
logits, dense_golden)), batch_size)
regularized_params = [tf.nn.l2_loss(p)
for k, p in self.params.items()
if k.startswith('weights') or k.startswith('bias')]
l2_loss = 1e-4 * tf.add_n(regularized_params) if regularized_params else 0
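    # Spelled out (a sketch of what the graph above computes), for batch size
    # B, gold one-hot targets y_i and logits x_i:
    #   cost = (1/B) * sum_i softmax_cross_entropy(x_i, y_i)
    #          + 1e-4 * sum_p tf.nn.l2_loss(p)
    # where p ranges over the hidden-layer 'weights_*' and 'bias_*' params
    # and tf.nn.l2_loss(p) = sum(p ** 2) / 2.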
return {'cost': tf.add(cross_entropy, l2_loss, name='cost')}
def AddEvaluation(self,
task_context,
batch_size,
evaluation_max_steps=300,
corpus_name='documents'):
"""Builds the forward network only without the training operation.
Args:
task_context: file path from which to read the task context.
batch_size: batch size to request from reader op.
evaluation_max_steps: max number of parsing actions during evaluation,
only used in beam parsing.
corpus_name: name of the task input to read parses from.
Returns:
Dictionary of named eval nodes.
"""
def _AssignTransitionScores():
return tf.assign(nodes['transition_scores'],
nodes['logits'], validate_shape=False)
def _Pass():
return tf.constant(-1.0)
unused_evaluation_max_steps = evaluation_max_steps
with tf.name_scope('evaluation'):
nodes = self.evaluation
nodes['transition_scores'] = self._AddVariable(
[batch_size, self._num_actions], tf.float32, 'transition_scores',
tf.constant_initializer(-1.0))
nodes.update(self._AddDecodedReader(task_context, batch_size, nodes[
'transition_scores'], corpus_name))
nodes.update(self._BuildNetwork(nodes['feature_endpoints'],
return_average=self._use_averaging))
nodes['eval_metrics'] = cf.with_dependencies(
[tf.cond(tf.greater(tf.size(nodes['logits']), 0),
_AssignTransitionScores, _Pass)],
nodes['eval_metrics'], name='eval_metrics')
return nodes
def _IncrementCounter(self, counter):
return state_ops.assign_add(counter, 1, use_locking=True)
def _AddLearningRate(self, initial_learning_rate, decay_steps):
"""Returns a learning rate that decays by 0.96 every decay_steps.
Args:
initial_learning_rate: initial value of the learning rate
decay_steps: decay by 0.96 every this many steps
Returns:
learning rate variable.
"""
step = self.GetStep()
return cf.with_dependencies(
[self._IncrementCounter(step)],
tf.train.exponential_decay(initial_learning_rate,
step,
decay_steps,
0.96,
staircase=True))
def AddPretrainedEmbeddings(self, index, embeddings_path, task_context):
"""Embeddings at the given index will be set to pretrained values."""
def _Initializer(shape, dtype=tf.float32):
unused_dtype = dtype
t = gen_parser_ops.word_embedding_initializer(
vectors=embeddings_path,
task_context=task_context,
embedding_init=self._embedding_init)
t.set_shape(shape)
return t
self._pretrained_embeddings[index] = _Initializer
def AddTraining(self,
task_context,
batch_size,
learning_rate=0.1,
decay_steps=4000,
momentum=0.9,
corpus_name='documents'):
"""Builds a trainer to minimize the cross entropy cost function.
Args:
task_context: file path from which to read the task context
batch_size: batch size to request from reader op
learning_rate: initial value of the learning rate
decay_steps: decay learning rate by 0.96 every this many steps
momentum: momentum parameter used when training with momentum
corpus_name: name of the task input to read parses from
Returns:
Dictionary of named training nodes.
"""
with tf.name_scope('training'):
nodes = self.training
nodes.update(self._AddGoldReader(task_context, batch_size, corpus_name))
nodes.update(self._BuildNetwork(nodes['feature_endpoints'],
return_average=False))
nodes.update(self._AddCostFunction(batch_size, nodes['gold_actions'],
nodes['logits']))
# Add the optimizer
if self._only_train:
trainable_params = [v
for k, v in self.params.iteritems()
if k in self._only_train]
else:
trainable_params = self.params.values()
lr = self._AddLearningRate(learning_rate, decay_steps)
optimizer = tf.train.MomentumOptimizer(lr,
momentum,
use_locking=self._use_locking)
train_op = optimizer.minimize(nodes['cost'], var_list=trainable_params)
for param in trainable_params:
slot = optimizer.get_slot(param, 'momentum')
self.inits[slot.name] = state_ops.init_variable(slot,
tf.zeros_initializer)
self.variables[slot.name] = slot
numerical_checks = [
tf.check_numerics(param,
message='Parameter is not finite.')
for param in trainable_params
if param.dtype.base_dtype in [tf.float32, tf.float64]
]
check_op = tf.group(*numerical_checks)
avg_update_op = tf.group(*self._averaging.values())
train_ops = [train_op]
if self._check_parameters:
train_ops.append(check_op)
if self._use_averaging:
train_ops.append(avg_update_op)
nodes['train_op'] = tf.group(*train_ops, name='train_op')
return nodes
def AddSaver(self, slim_model=False):
"""Adds ops to save and restore model parameters.
Args:
slim_model: whether only averaged variables are saved.
Returns:
the saver object.
"""
# We have to put the save op in the root scope otherwise running
# "save/restore_all" won't find the "save/Const" node it expects.
with tf.name_scope(None):
variables_to_save = self.params.copy()
variables_to_save.update(self.variables)
if slim_model:
for key in variables_to_save.keys():
if not key.endswith('avg_var'):
del variables_to_save[key]
self.saver = tf.train.Saver(variables_to_save)
return self.saver
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for graph_builder."""
# pylint: disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
import os.path
import tensorflow as tf
from tensorflow.python.framework import test_util
from tensorflow.python.ops import variables
from tensorflow.python.platform import googletest
from syntaxnet import graph_builder
from syntaxnet import sparse_pb2
from syntaxnet.ops import gen_parser_ops
FLAGS = tf.app.flags.FLAGS
if not hasattr(FLAGS, 'test_srcdir'):
FLAGS.test_srcdir = ''
if not hasattr(FLAGS, 'test_tmpdir'):
FLAGS.test_tmpdir = tf.test.get_temp_dir()
class GraphBuilderTest(test_util.TensorFlowTestCase):
def setUp(self):
# Creates a task context with the correct testing paths.
initial_task_context = os.path.join(
FLAGS.test_srcdir,
'syntaxnet/'
'testdata/context.pbtxt')
self._task_context = os.path.join(FLAGS.test_tmpdir, 'context.pbtxt')
with open(initial_task_context, 'r') as fin:
with open(self._task_context, 'w') as fout:
fout.write(fin.read().replace('SRCDIR', FLAGS.test_srcdir)
.replace('OUTPATH', FLAGS.test_tmpdir))
# Creates necessary term maps.
with self.test_session() as sess:
gen_parser_ops.lexicon_builder(task_context=self._task_context,
corpus_name='training-corpus').run()
self._num_features, self._num_feature_ids, _, self._num_actions = (
sess.run(gen_parser_ops.feature_size(task_context=self._task_context,
arg_prefix='brain_parser')))
def MakeBuilder(self, use_averaging=True, **kw_args):
# Set the seed and gate_gradients to ensure reproducibility.
return graph_builder.GreedyParser(
self._num_actions, self._num_features, self._num_feature_ids,
embedding_sizes=[8, 8, 8], hidden_layer_sizes=[32, 32], seed=42,
gate_gradients=True, use_averaging=use_averaging, **kw_args)
def FindNode(self, name):
for node in tf.get_default_graph().as_graph_def().node:
if node.name == name:
return node
return None
def NodeFound(self, name):
return self.FindNode(name) is not None
def testScope(self):
# Set up the network topology
graph = tf.Graph()
with graph.as_default():
parser = self.MakeBuilder()
parser.AddTraining(self._task_context,
batch_size=10,
corpus_name='training-corpus')
parser.AddEvaluation(self._task_context,
batch_size=2,
corpus_name='tuning-corpus')
parser.AddSaver()
# Check that the node ids we may rely on are there with the expected
# names.
self.assertEqual(parser.training['logits'].name, 'training/logits:0')
self.assertTrue(self.NodeFound('training/logits'))
self.assertTrue(self.NodeFound('training/feature_0'))
self.assertTrue(self.NodeFound('training/feature_1'))
self.assertTrue(self.NodeFound('training/feature_2'))
self.assertFalse(self.NodeFound('training/feature_3'))
self.assertEqual(parser.evaluation['logits'].name, 'evaluation/logits:0')
self.assertTrue(self.NodeFound('evaluation/logits'))
# The saver node is expected to be in the root scope.
self.assertTrue(self.NodeFound('save/restore_all'))
# Also check that the parameters have the scope we expect.
self.assertTrue(self.NodeFound('embedding_matrix_0'))
self.assertTrue(self.NodeFound('embedding_matrix_1'))
self.assertTrue(self.NodeFound('embedding_matrix_2'))
self.assertFalse(self.NodeFound('embedding_matrix_3'))
def testNestedScope(self):
# It's OK to put the whole graph in a scope of its own.
graph = tf.Graph()
with graph.as_default():
with graph.name_scope('top'):
parser = self.MakeBuilder()
parser.AddTraining(self._task_context,
batch_size=10,
corpus_name='training-corpus')
parser.AddSaver()
self.assertTrue(self.NodeFound('top/training/logits'))
self.assertTrue(self.NodeFound('top/training/feature_0'))
# The saver node is expected to be in the root scope no matter what.
self.assertFalse(self.NodeFound('top/save/restore_all'))
self.assertTrue(self.NodeFound('save/restore_all'))
def testUseCustomGraphs(self):
batch_size = 10
# Use separate custom graphs.
custom_train_graph = tf.Graph()
with custom_train_graph.as_default():
train_parser = self.MakeBuilder()
train_parser.AddTraining(self._task_context,
batch_size,
corpus_name='training-corpus')
custom_eval_graph = tf.Graph()
with custom_eval_graph.as_default():
eval_parser = self.MakeBuilder()
eval_parser.AddEvaluation(self._task_context,
batch_size,
corpus_name='tuning-corpus')
# The following session runs should not fail.
with self.test_session(graph=custom_train_graph) as sess:
self.assertTrue(self.NodeFound('training/logits'))
sess.run(train_parser.inits.values())
sess.run(['training/logits:0'])
with self.test_session(graph=custom_eval_graph) as sess:
self.assertFalse(self.NodeFound('training/logits'))
self.assertTrue(self.NodeFound('evaluation/logits'))
sess.run(eval_parser.inits.values())
sess.run(['evaluation/logits:0'])
def testTrainingAndEvalAreIndependent(self):
batch_size = 10
graph = tf.Graph()
with graph.as_default():
parser = self.MakeBuilder(use_averaging=False)
parser.AddTraining(self._task_context,
batch_size,
corpus_name='training-corpus')
parser.AddEvaluation(self._task_context,
batch_size,
corpus_name='tuning-corpus')
with self.test_session(graph=graph) as sess:
sess.run(parser.inits.values())
# Before any training updates are performed, both training and eval nets
# should return the same computations.
eval_logits, = sess.run([parser.evaluation['logits']])
training_logits, = sess.run([parser.training['logits']])
self.assertNear(abs((eval_logits - training_logits).sum()), 0, 1e-6)
# After training, activations should differ.
for _ in range(5):
eval_logits = parser.evaluation['logits'].eval()
for _ in range(5):
training_logits, _ = sess.run([parser.training['logits'],
parser.training['train_op']])
self.assertGreater(abs((eval_logits - training_logits).sum()), 0, 1e-3)
def testReproducibility(self):
batch_size = 10
def ComputeACost(graph):
with graph.as_default():
parser = self.MakeBuilder(use_averaging=False)
parser.AddTraining(self._task_context,
batch_size,
corpus_name='training-corpus')
parser.AddEvaluation(self._task_context,
batch_size,
corpus_name='tuning-corpus')
with self.test_session(graph=graph) as sess:
sess.run(parser.inits.values())
for _ in range(5):
cost, _ = sess.run([parser.training['cost'],
parser.training['train_op']])
return cost
cost1 = ComputeACost(tf.Graph())
cost2 = ComputeACost(tf.Graph())
self.assertNear(cost1, cost2, 1e-8)
def testAddTrainingAndEvalOrderIndependent(self):
batch_size = 10
graph1 = tf.Graph()
with graph1.as_default():
parser = self.MakeBuilder(use_averaging=False)
parser.AddTraining(self._task_context,
batch_size,
corpus_name='training-corpus')
parser.AddEvaluation(self._task_context,
batch_size,
corpus_name='tuning-corpus')
with self.test_session(graph=graph1) as sess:
sess.run(parser.inits.values())
metrics1 = None
for _ in range(500):
cost1, _ = sess.run([parser.training['cost'],
parser.training['train_op']])
em1 = parser.evaluation['eval_metrics'].eval()
metrics1 = metrics1 + em1 if metrics1 is not None else em1
# Reverse the order in which Training and Eval stacks are added.
graph2 = tf.Graph()
with graph2.as_default():
parser = self.MakeBuilder(use_averaging=False)
parser.AddEvaluation(self._task_context,
batch_size,
corpus_name='tuning-corpus')
parser.AddTraining(self._task_context,
batch_size,
corpus_name='training-corpus')
with self.test_session(graph=graph2) as sess:
sess.run(parser.inits.values())
metrics2 = None
for _ in range(500):
cost2, _ = sess.run([parser.training['cost'],
parser.training['train_op']])
em2 = parser.evaluation['eval_metrics'].eval()
metrics2 = metrics2 + em2 if metrics2 is not None else em2
self.assertNear(cost1, cost2, 1e-8)
self.assertEqual(abs(metrics1 - metrics2).sum(), 0)
def testEvalMetrics(self):
batch_size = 10
graph = tf.Graph()
with graph.as_default():
parser = self.MakeBuilder()
parser.AddEvaluation(self._task_context,
batch_size,
corpus_name='tuning-corpus')
with self.test_session(graph=graph) as sess:
sess.run(parser.inits.values())
tokens = 0
correct_heads = 0
for _ in range(100):
eval_metrics = sess.run(parser.evaluation['eval_metrics'])
tokens += eval_metrics[0]
correct_heads += eval_metrics[1]
self.assertGreater(tokens, 0)
self.assertGreaterEqual(tokens, correct_heads)
self.assertGreaterEqual(correct_heads, 0)
def MakeSparseFeatures(self, ids, weights):
f = sparse_pb2.SparseFeatures()
for i, w in zip(ids, weights):
f.id.append(i)
f.weight.append(w)
return f.SerializeToString()
def testEmbeddingOp(self):
graph = tf.Graph()
with self.test_session(graph=graph):
params = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
tf.float32)
var = variables.Variable([self.MakeSparseFeatures([1, 2], [1.0, 1.0]),
self.MakeSparseFeatures([], [])])
var.initializer.run()
embeddings = graph_builder.EmbeddingLookupFeatures(params, var,
True).eval()
self.assertAllClose([[8.0, 10.0], [0.0, 0.0]], embeddings)
var = variables.Variable([self.MakeSparseFeatures([], []),
self.MakeSparseFeatures([0, 2],
[0.5, 2.0])])
var.initializer.run()
embeddings = graph_builder.EmbeddingLookupFeatures(params, var,
True).eval()
self.assertAllClose([[0.0, 0.0], [10.5, 13.0]], embeddings)
def testOnlyTrainSomeParameters(self):
batch_size = 10
graph = tf.Graph()
with graph.as_default():
parser = self.MakeBuilder(use_averaging=False, only_train='softmax_bias')
parser.AddTraining(self._task_context,
batch_size,
corpus_name='training-corpus')
with self.test_session(graph=graph) as sess:
sess.run(parser.inits.values())
# Before training, save the state of two of the parameters.
bias0, weight0 = sess.run([parser.params['softmax_bias'],
parser.params['softmax_weight']])
for _ in range(5):
bias, weight, _ = sess.run([parser.params['softmax_bias'],
parser.params['softmax_weight'],
parser.training['train_op']])
# After training, only one of the parameters should have changed.
self.assertAllEqual(weight, weight0)
self.assertGreater(abs(bias - bias0).sum(), 0, 1e-5)
if __name__ == '__main__':
googletest.main()
// K-best part-of-speech and dependency annotations for tokens.
syntax = "proto2";
import "syntaxnet/sentence.proto";
package syntaxnet;
// A list of alternative (k-best) syntax analyses, grouped by sentences.
message KBestSyntaxAnalyses {
extend Sentence {
optional KBestSyntaxAnalyses extension = 60366242;
}
// Alternative analyses for each sentence. Sentences are listed in the
// order visited by a SentenceIterator.
repeated KBestSyntaxAnalysesForSentence sentence = 1;
// Alternative analyses for each token.
repeated KBestSyntaxAnalysesForToken token = 2;
}
// A list of alternative (k-best) analyses for a sentence spanning from a start
// token index to an end token index. The alternative analyses are ordered by
// decreasing model score from best to worst. The first analysis is the 1-best
// analysis, which is typically also stored in the document tokens.
message KBestSyntaxAnalysesForSentence {
// First token of sentence.
optional int32 start = 1 [default = -1];
// Last token of sentence.
optional int32 end = 2 [default = -1];
// K-best analyses for the tokens in this sentence. All of the analyses in
// the list have the same "type"; e.g., k-best taggings,
// k-best {tagging+parse}s, etc.
// Note also that the type of analysis stored in this list can change
// depending on where we are in the document processing pipeline; e.g.,
// may initially be taggings, and then switch to parses. The first
// token_analysis would be the 1-best analysis, which is typically also stored
// in the document. Note: some post-processors will update the document's
// syntax trees, but will leave these unchanged.
repeated AlternativeTokenAnalysis token_analysis = 3;
}
// A list of scored alternative (k-best) analyses for a particular token. These
// are all distinct from each other and ordered by decreasing model score. The
// first is the 1-best analysis, which may or may not match the document tokens
// depending on how the k-best analyses are selected.
message KBestSyntaxAnalysesForToken {
// All token analyses in this repeated field refer to the same token.
// Each alternative analysis will contain a single entry for repeated fields
// such as head, tag, category and label.
repeated AlternativeTokenAnalysis token_analysis = 3;
}
// An alternative analysis of tokens in the document. The repeated fields
// are indexed relative to the beginning of a sentence. Fields not
// represented in the alternative analysis are assumed to be unchanged.
// Currently only alternatives for tags, categories and (labeled) dependency
// heads are supported.
// Each repeated field should either have length=0 or length=number of tokens.
message AlternativeTokenAnalysis {
// Head of this token in the dependency tree: the id of the token which has
// an arc going to this one. If it is the root token of a sentence, then it
// is set to -1.
repeated int32 head = 1;
// Part-of-speech tag for token.
repeated string tag = 2;
// Coarse-grained word category for token.
repeated string category = 3;
// Label for dependency relation between this token and its head.
repeated string label = 4;
// The score of this analysis, where bigger values typically indicate better
// quality, but there are no guarantees and there is also no pre-defined
// range.
optional double score = 5;
}
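// An illustrative AlternativeTokenAnalysis in text format (all values are
// made up) for a three-token sentence whose second token is the root:
//   head: 1 head: -1 head: 1
//   tag: "NN" tag: "VB" tag: "NN"
//   label: "nsubj" label: "ROOT" label: "dobj"
//   score: -12.3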
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <stddef.h>
#include <string>
#include "syntaxnet/utils.h"
#include "syntaxnet/affix.h"
#include "syntaxnet/dictionary.pb.h"
#include "syntaxnet/feature_extractor.h"
#include "syntaxnet/sentence_batch.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/term_frequency_map.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/env.h"
// A task that collects term statistics over a corpus and saves a set of
// term maps; these saved mappings are used to map strings to ints in both the
// chunker trainer and the chunker processors.
using tensorflow::DEVICE_CPU;
using tensorflow::DT_INT32;
using tensorflow::OpKernel;
using tensorflow::OpKernelConstruction;
using tensorflow::OpKernelContext;
using tensorflow::Tensor;
using tensorflow::TensorShape;
using tensorflow::errors::InvalidArgument;
namespace syntaxnet {
// A workflow task that creates term maps (e.g., word, tag, etc.).
//
// Non-flag task parameters:
// int lexicon_max_prefix_length (3):
// The maximum prefix length for lexicon words.
// int lexicon_max_suffix_length (3):
// The maximum suffix length for lexicon words.
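//
// From Python, this op is typically invoked via the generated wrapper (see
// lexicon_builder_test.py):
//   gen_parser_ops.lexicon_builder(task_context=context_file,
//                                  corpus_name='training-corpus').run()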
class LexiconBuilder : public OpKernel {
public:
explicit LexiconBuilder(OpKernelConstruction *context) : OpKernel(context) {
OP_REQUIRES_OK(context, context->GetAttr("corpus_name", &corpus_name_));
OP_REQUIRES_OK(context, context->GetAttr("lexicon_max_prefix_length",
&max_prefix_length_));
OP_REQUIRES_OK(context, context->GetAttr("lexicon_max_suffix_length",
&max_suffix_length_));
string file_path, data;
OP_REQUIRES_OK(context, context->GetAttr("task_context", &file_path));
OP_REQUIRES_OK(context, ReadFileToString(tensorflow::Env::Default(),
file_path, &data));
OP_REQUIRES(context,
TextFormat::ParseFromString(data, task_context_.mutable_spec()),
InvalidArgument("Could not parse task context at ", file_path));
}
// Counts term frequencies.
void Compute(OpKernelContext *context) override {
// Term frequency maps to be populated by the corpus.
TermFrequencyMap words;
TermFrequencyMap lcwords;
TermFrequencyMap tags;
TermFrequencyMap categories;
TermFrequencyMap labels;
// Affix tables to be populated by the corpus.
AffixTable prefixes(AffixTable::PREFIX, max_prefix_length_);
AffixTable suffixes(AffixTable::SUFFIX, max_suffix_length_);
// Tag-to-category mapping.
TagToCategoryMap tag_to_category;
// Make a pass over the corpus.
int64 num_tokens = 0;
int64 num_documents = 0;
Sentence *document;
TextReader corpus(*task_context_.GetInput(corpus_name_));
while ((document = corpus.Read()) != NULL) {
// Gather token information.
for (int t = 0; t < document->token_size(); ++t) {
// Get token and lowercased word.
const Token &token = document->token(t);
string word = token.word();
utils::NormalizeDigits(&word);
string lcword = tensorflow::str_util::Lowercase(word);
// Make sure the token does not contain a newline.
CHECK(lcword.find('\n') == string::npos);
// Increment frequencies (only for terms that exist).
if (!word.empty() && !HasSpaces(word)) words.Increment(word);
if (!lcword.empty() && !HasSpaces(lcword)) lcwords.Increment(lcword);
if (!token.tag().empty()) tags.Increment(token.tag());
if (!token.category().empty()) categories.Increment(token.category());
if (!token.label().empty()) labels.Increment(token.label());
// Add prefixes/suffixes for the current word.
prefixes.AddAffixesForWord(word.c_str(), word.size());
suffixes.AddAffixesForWord(word.c_str(), word.size());
// Add mapping from tag to category.
tag_to_category.SetCategory(token.tag(), token.category());
// Update the number of processed tokens.
++num_tokens;
}
delete document;
++num_documents;
}
LOG(INFO) << "Term maps collected over " << num_tokens << " tokens from "
<< num_documents << " documents";
// Write mappings to disk.
words.Save(TaskContext::InputFile(*task_context_.GetInput("word-map")));
lcwords.Save(TaskContext::InputFile(*task_context_.GetInput("lcword-map")));
tags.Save(TaskContext::InputFile(*task_context_.GetInput("tag-map")));
categories.Save(
TaskContext::InputFile(*task_context_.GetInput("category-map")));
labels.Save(TaskContext::InputFile(*task_context_.GetInput("label-map")));
// Write affixes to disk.
WriteAffixTable(prefixes, TaskContext::InputFile(
*task_context_.GetInput("prefix-table")));
WriteAffixTable(suffixes, TaskContext::InputFile(
*task_context_.GetInput("suffix-table")));
// Write tag-to-category mapping to disk.
tag_to_category.Save(
TaskContext::InputFile(*task_context_.GetInput("tag-to-category")));
}
private:
// Returns true if the word contains spaces.
static bool HasSpaces(const string &word) {
for (char c : word) {
if (c == ' ') return true;
}
return false;
}
// Writes an affix table to a task output.
static void WriteAffixTable(const AffixTable &affixes,
const string &output_file) {
ProtoRecordWriter writer(output_file);
affixes.Write(&writer);
}
// Name of the context input to compute lexicons.
string corpus_name_;
// Max length for prefix table.
int max_prefix_length_;
// Max length for suffix table.
int max_suffix_length_;
// Task context used to configure this op.
TaskContext task_context_;
};
REGISTER_KERNEL_BUILDER(Name("LexiconBuilder").Device(DEVICE_CPU),
LexiconBuilder);
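// Op that computes, for each embedding group of the parser feature extractor
// configured by the given task context and arg_prefix, the number of
// features, the feature domain size, and the embedding dimension, plus the
// number of actions in the transition system. From Python it is typically
// invoked as (see graph_builder_test.py):
//   feature_sizes, domain_sizes, embedding_dims, num_actions = sess.run(
//       gen_parser_ops.feature_size(task_context=...,
//                                   arg_prefix='brain_parser'))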
class FeatureSize : public OpKernel {
public:
explicit FeatureSize(OpKernelConstruction *context) : OpKernel(context) {
string task_context_path;
OP_REQUIRES_OK(context,
context->GetAttr("task_context", &task_context_path));
OP_REQUIRES_OK(context, context->GetAttr("arg_prefix", &arg_prefix_));
OP_REQUIRES_OK(context, context->MatchSignature(
{}, {DT_INT32, DT_INT32, DT_INT32, DT_INT32}));
string data;
OP_REQUIRES_OK(context, ReadFileToString(tensorflow::Env::Default(),
task_context_path, &data));
OP_REQUIRES(
context,
TextFormat::ParseFromString(data, task_context_.mutable_spec()),
InvalidArgument("Could not parse task context at ", task_context_path));
string label_map_path =
TaskContext::InputFile(*task_context_.GetInput("label-map"));
label_map_ = SharedStoreUtils::GetWithDefaultName<TermFrequencyMap>(
label_map_path, 0, 0);
}
~FeatureSize() override { SharedStore::Release(label_map_); }
void Compute(OpKernelContext *context) override {
// Computes feature sizes.
ParserEmbeddingFeatureExtractor features(arg_prefix_);
features.Setup(&task_context_);
features.Init(&task_context_);
const int num_embeddings = features.NumEmbeddings();
Tensor *feature_sizes = nullptr;
Tensor *domain_sizes = nullptr;
Tensor *embedding_dims = nullptr;
Tensor *num_actions = nullptr;
TF_CHECK_OK(context->allocate_output(0, TensorShape({num_embeddings}),
&feature_sizes));
TF_CHECK_OK(context->allocate_output(1, TensorShape({num_embeddings}),
&domain_sizes));
TF_CHECK_OK(context->allocate_output(2, TensorShape({num_embeddings}),
&embedding_dims));
TF_CHECK_OK(context->allocate_output(3, TensorShape({}), &num_actions));
for (int i = 0; i < num_embeddings; ++i) {
feature_sizes->vec<int32>()(i) = features.FeatureSize(i);
domain_sizes->vec<int32>()(i) = features.EmbeddingSize(i);
embedding_dims->vec<int32>()(i) = features.EmbeddingDims(i);
}
// Computes number of actions in the transition system.
std::unique_ptr<ParserTransitionSystem> transition_system(
ParserTransitionSystem::Create(task_context_.Get(
features.GetParamName("transition_system"), "arc-standard")));
transition_system->Setup(&task_context_);
transition_system->Init(&task_context_);
num_actions->scalar<int32>()() =
transition_system->NumActions(label_map_->Size());
}
private:
// Task context used to configure this op.
TaskContext task_context_;
// Dependency label map used in transition system.
const TermFrequencyMap *label_map_;
// Prefix for context parameters.
string arg_prefix_;
};
REGISTER_KERNEL_BUILDER(Name("FeatureSize").Device(DEVICE_CPU), FeatureSize);
} // namespace syntaxnet
# coding=utf-8
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for lexicon_builder."""
# pylint: disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
import os.path
import tensorflow as tf
import syntaxnet.load_parser_ops
from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest
from tensorflow.python.platform import logging
from syntaxnet import sentence_pb2
from syntaxnet import task_spec_pb2
from syntaxnet.ops import gen_parser_ops
FLAGS = tf.app.flags.FLAGS
CONLL_DOC1 = u'''1 बात _ n NN _ _ _ _ _
2 गलत _ adj JJ _ _ _ _ _
3 हो _ v VM _ _ _ _ _
4 तो _ avy CC _ _ _ _ _
5 गुस्सा _ n NN _ _ _ _ _
6 सेलेब्रिटिज _ n NN _ _ _ _ _
7 को _ psp PSP _ _ _ _ _
8 भी _ avy RP _ _ _ _ _
9 आना _ v VM _ _ _ _ _
10 लाजमी _ adj JJ _ _ _ _ _
11 है _ v VM _ _ _ _ _
12 । _ punc SYM _ _ _ _ _'''
CONLL_DOC2 = u'''1 लेकिन _ avy CC _ _ _ _ _
2 अभिनेत्री _ n NN _ _ _ _ _
3 के _ psp PSP _ _ _ _ _
4 इस _ pn DEM _ _ _ _ _
5 कदम _ n NN _ _ _ _ _
6 से _ psp PSP _ _ _ _ _
7 वहां _ pn PRP _ _ _ _ _
8 रंग _ n NN _ _ _ _ _
9 में _ psp PSP _ _ _ _ _
10 भंग _ adj JJ _ _ _ _ _
11 पड़ _ v VM _ _ _ _ _
12 गया _ v VAUX _ _ _ _ _
13 । _ punc SYM _ _ _ _ _'''
TAGS = ['NN', 'JJ', 'VM', 'CC', 'PSP', 'RP', 'JJ', 'SYM', 'DEM', 'PRP', 'VAUX']
CATEGORIES = ['n', 'adj', 'v', 'avy', 'n', 'psp', 'punc', 'pn']
TOKENIZED_DOCS = u'''बात गलत हो तो गुस्सा सेलेब्रिटिज को भी आना लाजमी है ।
लेकिन अभिनेत्री के इस कदम से वहां रंग में भंग पड़ गया ।
'''
COMMENTS = u'# Line with fake comments.'
class LexiconBuilderTest(test_util.TensorFlowTestCase):
def setUp(self):
if not hasattr(FLAGS, 'test_srcdir'):
FLAGS.test_srcdir = ''
if not hasattr(FLAGS, 'test_tmpdir'):
FLAGS.test_tmpdir = tf.test.get_temp_dir()
self.corpus_file = os.path.join(FLAGS.test_tmpdir, 'documents.conll')
self.context_file = os.path.join(FLAGS.test_tmpdir, 'context.pbtxt')
def AddInput(self, name, file_pattern, record_format, context):
inp = context.input.add()
inp.name = name
inp.record_format.append(record_format)
inp.part.add().file_pattern = file_pattern
def WriteContext(self, corpus_format):
context = task_spec_pb2.TaskSpec()
self.AddInput('documents', self.corpus_file, corpus_format, context)
for name in ('word-map', 'lcword-map', 'tag-map',
'category-map', 'label-map', 'prefix-table',
'suffix-table', 'tag-to-category'):
self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
logging.info('Writing context to: %s', self.context_file)
with open(self.context_file, 'w') as f:
f.write(str(context))
def ReadNextDocument(self, sess, doc_source):
doc_str, last = sess.run(doc_source)
if doc_str:
doc = sentence_pb2.Sentence()
doc.ParseFromString(doc_str[0])
else:
doc = None
return doc, last
def ValidateDocuments(self):
doc_source = gen_parser_ops.document_source(self.context_file, batch_size=1)
with self.test_session() as sess:
logging.info('Reading document1')
doc, last = self.ReadNextDocument(sess, doc_source)
self.assertEqual(len(doc.token), 12)
self.assertEqual(u'लाजमी', doc.token[9].word)
self.assertFalse(last)
logging.info('Reading document2')
doc, last = self.ReadNextDocument(sess, doc_source)
self.assertEqual(len(doc.token), 13)
self.assertEqual(u'भंग', doc.token[9].word)
self.assertFalse(last)
logging.info('Hitting end of the dataset')
doc, last = self.ReadNextDocument(sess, doc_source)
self.assertTrue(doc is None)
self.assertTrue(last)
def ValidateTagToCategoryMap(self):
    with open(os.path.join(FLAGS.test_tmpdir, 'tag-to-category'), 'r') as f:
entries = [line.strip().split('\t') for line in f.readlines()]
for tag, category in entries:
self.assertIn(tag, TAGS)
self.assertIn(category, CATEGORIES)
def BuildLexicon(self):
with self.test_session():
gen_parser_ops.lexicon_builder(task_context=self.context_file).run()
def testCoNLLFormat(self):
self.WriteContext('conll-sentence')
logging.info('Writing conll file to: %s', self.corpus_file)
with open(self.corpus_file, 'w') as f:
f.write((CONLL_DOC1 + u'\n\n' + CONLL_DOC2 + u'\n')
.replace(' ', '\t').encode('utf-8'))
self.ValidateDocuments()
self.BuildLexicon()
self.ValidateTagToCategoryMap()
def testCoNLLFormatExtraNewlinesAndComments(self):
self.WriteContext('conll-sentence')
with open(self.corpus_file, 'w') as f:
f.write((u'\n\n\n' + CONLL_DOC1 + u'\n\n\n' + COMMENTS +
u'\n\n' + CONLL_DOC2).replace(' ', '\t').encode('utf-8'))
self.ValidateDocuments()
self.BuildLexicon()
self.ValidateTagToCategoryMap()
def testTokenizedTextFormat(self):
self.WriteContext('tokenized-text')
with open(self.corpus_file, 'w') as f:
f.write(TOKENIZED_DOCS.encode('utf-8'))
self.ValidateDocuments()
self.BuildLexicon()
def testTokenizedTextFormatExtraNewlines(self):
self.WriteContext('tokenized-text')
with open(self.corpus_file, 'w') as f:
f.write((u'\n\n\n' + TOKENIZED_DOCS + u'\n\n\n').encode('utf-8'))
self.ValidateDocuments()
self.BuildLexicon()
if __name__ == '__main__':
googletest.main()
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Loads parser_ops shared library."""
import os.path
import tensorflow as tf
tf.load_op_library(
os.path.join(tf.resource_loader.get_data_files_path(),
'parser_ops.so'))
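The lexicon builder test above relies on importing this module purely for its side effect; a minimal sketch of that pattern, with a placeholder task context path:

import syntaxnet.load_parser_ops  # side effect only: loads parser_ops.so
from syntaxnet.ops import gen_parser_ops

# Once the shared library is loaded, the generated wrappers are available,
# e.g. the document reader exercised in lexicon_builder_test:
doc_source = gen_parser_ops.document_source('context.pbtxt', batch_size=1)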
Parameter {
name: "brain_parser_embedding_dims"
value: "32;32;64"
}
Parameter {
name: "brain_parser_embedding_names"
value: "labels;tags;words"
}
Parameter {
name: 'brain_parser_scoring'
value: 'default'
}
Parameter {
name: "brain_parser_features"
value:
'stack.child(1).label '
'stack.child(1).sibling(-1).label '
'stack.child(-1).label '
'stack.child(-1).sibling(1).label '
'stack.child(2).label '
'stack.child(-2).label '
'stack(1).child(1).label '
'stack(1).child(1).sibling(-1).label '
'stack(1).child(-1).label '
'stack(1).child(-1).sibling(1).label '
'stack(1).child(2).label '
'stack(1).child(-2).label; '
'input.token.tag '
'input(1).token.tag '
'input(2).token.tag '
'input(3).token.tag '
'stack.token.tag '
'stack.child(1).token.tag '
'stack.child(1).sibling(-1).token.tag '
'stack.child(-1).token.tag '
'stack.child(-1).sibling(1).token.tag '
'stack.child(2).token.tag '
'stack.child(-2).token.tag '
'stack(1).token.tag '
'stack(1).child(1).token.tag '
'stack(1).child(1).sibling(-1).token.tag '
'stack(1).child(-1).token.tag '
'stack(1).child(-1).sibling(1).token.tag '
'stack(1).child(2).token.tag '
'stack(1).child(-2).token.tag '
'stack(2).token.tag '
'stack(3).token.tag; '
'input.token.word '
'input(1).token.word '
'input(2).token.word '
'input(3).token.word '
'stack.token.word '
'stack.child(1).token.word '
'stack.child(1).sibling(-1).token.word '
'stack.child(-1).token.word '
'stack.child(-1).sibling(1).token.word '
'stack.child(2).token.word '
'stack.child(-2).token.word '
'stack(1).token.word '
'stack(1).child(1).token.word '
'stack(1).child(1).sibling(-1).token.word '
'stack(1).child(-1).token.word '
'stack(1).child(-1).sibling(1).token.word '
'stack(1).child(2).token.word '
'stack(1).child(-2).token.word '
'stack(2).token.word '
'stack(3).token.word '
}
Parameter {
name: "brain_parser_transition_system"
value: "arc-standard"
}
Parameter {
name: "brain_tagger_embedding_dims"
value: "8;16;16;16;16;64"
}
Parameter {
name: "brain_tagger_embedding_names"
value: "other;prefix2;prefix3;suffix2;suffix3;words"
}
Parameter {
name: "brain_tagger_features"
value:
'input.digit '
'input.hyphen; '
'input.prefix(length="2") '
'input(1).prefix(length="2") '
'input(2).prefix(length="2") '
'input(3).prefix(length="2") '
'input(-1).prefix(length="2") '
'input(-2).prefix(length="2") '
'input(-3).prefix(length="2") '
'input(-4).prefix(length="2"); '
'input.prefix(length="3") '
'input(1).prefix(length="3") '
'input(2).prefix(length="3") '
'input(3).prefix(length="3") '
'input(-1).prefix(length="3") '
'input(-2).prefix(length="3") '
'input(-3).prefix(length="3") '
'input(-4).prefix(length="3"); '
'input.suffix(length="2") '
'input(1).suffix(length="2") '
'input(2).suffix(length="2") '
'input(3).suffix(length="2") '
'input(-1).suffix(length="2") '
'input(-2).suffix(length="2") '
'input(-3).suffix(length="2") '
'input(-4).suffix(length="2"); '
'input.suffix(length="3") '
'input(1).suffix(length="3") '
'input(2).suffix(length="3") '
'input(3).suffix(length="3") '
'input(-1).suffix(length="3") '
'input(-2).suffix(length="3") '
'input(-3).suffix(length="3") '
'input(-4).suffix(length="3"); '
'input.token.word '
'input(1).token.word '
'input(2).token.word '
'input(3).token.word '
'input(-1).token.word '
'input(-2).token.word '
'input(-3).token.word '
'input(-4).token.word '
}
Parameter {
name: "brain_tagger_transition_system"
value: "tagger"
}
input {
name: "tag-map"
Part {
file_pattern: "syntaxnet/models/parsey_mcparseface/tag-map"
}
}
input {
name: "tag-to-category"
Part {
file_pattern: "syntaxnet/models/parsey_mcparseface/fine-to-universal.map"
}
}
input {
name: "word-map"
Part {
file_pattern: "syntaxnet/models/parsey_mcparseface/word-map"
}
}
input {
name: "label-map"
Part {
file_pattern: "syntaxnet/models/parsey_mcparseface/label-map"
}
}
input {
name: "prefix-table"
Part {
file_pattern: "syntaxnet/models/parsey_mcparseface/prefix-table"
}
}
input {
name: "suffix-table"
Part {
file_pattern: "syntaxnet/models/parsey_mcparseface/suffix-table"
}
}
input {
name: 'stdin'
record_format: 'english-text'
Part {
file_pattern: '-'
}
}
input {
name: 'stdin-conll'
record_format: 'conll-sentence'
Part {
file_pattern: '-'
}
}
input {
name: 'stdout-conll'
record_format: 'conll-sentence'
Part {
file_pattern: '-'
}
}
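The three '-' file patterns above bind standard input and output to named corpora. A sketch of selecting one of them from Python, under the assumption (not shown in this commit) that the document_source op accepts a corpus_name attribute naming an input in this spec; the context path is a placeholder for wherever this file is installed:

import tensorflow as tf

import syntaxnet.load_parser_ops  # registers the parser ops
from syntaxnet.ops import gen_parser_ops

with tf.Session() as sess:
  # Read CoNLL-formatted sentences from stdin via the 'stdin-conll' input.
  documents, last = sess.run(gen_parser_ops.document_source(
      'context.pbtxt', corpus_name='stdin-conll', batch_size=32))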
# .
$ .
'' .
-LRB- .
-RRB- .
, .
. .
: .
ADD X
AFX PRT
CC CONJ
CD NUM
DT DET
EX DET
FW X
GW X
HYPH .
IN ADP
JJ ADJ
JJR ADJ
JJS ADJ
LS X
MD VERB
NFP .
NN NOUN
NNP NOUN
NNPS NOUN
NNS NOUN
PDT DET
POS PRT
PRP PRON
PRP$ PRON
RB ADV
RBR ADV
RBS ADV
RP PRT
SYM X
TO PRT
UH X
VB VERB
VBD VERB
VBG VERB
VBN VERB
VBP VERB
VBZ VERB
WDT DET
WP PRON
WP$ PRON
WRB ADV
`` .
X X
XX X
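The two-column mapping above takes fine-grained Penn Treebank tags to universal categories; the lexicon builder test's ValidateTagToCategoryMap reads a file of the same layout (splitting on tabs). A small reader sketch, with a placeholder local path and generic whitespace splitting so it also covers this listing:

# Parse the fine-to-universal tag map into a dict.
tag_to_category = {}
with open('fine-to-universal.map') as f:
  for line in f:
    # Each non-empty line is "<fine tag> <universal category>".
    fields = line.split()
    if len(fields) == 2:
      tag, category = fields
      tag_to_category[tag] = category
# e.g. tag_to_category.get('NN') == 'NOUN'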