Commit 4364390a authored by Ivan Bogatyy, committed by calberti

Release DRAGNN bulk networks (#2785)

* Release DRAGNN bulk networks
parent 638fd759
......@@ -60,6 +60,9 @@ class DocumentFormat : public RegisterableClass<DocumentFormat> {
#define REGISTER_SYNTAXNET_DOCUMENT_FORMAT(type, component) \
REGISTER_SYNTAXNET_CLASS_COMPONENT(DocumentFormat, type, component)
// Component registry for document formatters.
DECLARE_SYNTAXNET_CLASS_REGISTRY("document format", DocumentFormat);
} // namespace syntaxnet
#endif // SYNTAXNET_DOCUMENT_FORMAT_H__
......@@ -94,7 +94,7 @@ GenericEmbeddingFeatureExtractor::ConvertExample(
for (int j = 0; j < feature_vectors[i].size(); ++j) {
const FeatureType &feature_type = *feature_vectors[i].type(j);
const FeatureValue value = feature_vectors[i].value(j);
- const bool is_continuous = feature_type.name().find("continuous") == 0;
+ const bool is_continuous = feature_type.is_continuous();
const int64 id = is_continuous ? FloatFeatureValue(value).id : value;
const int base = feature_type.base();
if (id >= 0) {
......
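A note on the continuous-feature path in the hunk above: FloatFeatureValue(value).id assumes a continuous feature packs an integer id and a float weight into a single 64-bit FeatureValue. A minimal standalone sketch of that idea, assuming (hypothetically, for illustration only) that the id occupies the low 32 bits and the weight bits the high 32 bits:
#include <cstdint>
#include <cstring>
#include <iostream>

// Hypothetical packing: id in the low 32 bits, float weight bits in the
// high 32 bits of a 64-bit feature value.
int64_t Pack(uint32_t id, float weight) {
  uint32_t weight_bits;
  std::memcpy(&weight_bits, &weight, sizeof(weight_bits));
  return (static_cast<int64_t>(weight_bits) << 32) | id;
}

int main() {
  const int64_t value = Pack(/*id=*/7, /*weight=*/0.5f);
  const uint32_t id = static_cast<uint32_t>(value);  // low 32 bits
  uint32_t weight_bits = static_cast<uint32_t>(value >> 32);
  float weight;
  std::memcpy(&weight, &weight_bits, sizeof(weight));
  std::cout << id << " " << weight << "\n";  // prints: 7 0.5
}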
......@@ -80,6 +80,42 @@ class FeatureVector {
// Returns the number of elements in the feature vector.
int size() const { return features_.size(); }
// Truncates the feature vector. Requires that new_size <= size().
void Truncate(int new_size) {
DCHECK_GE(new_size, 0);
DCHECK_LE(new_size, size());
features_.resize(new_size);
}
// Returns string representation of feature vector.
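// For example, a vector holding feature f with value 5 and feature f(1) with
// value 3 renders as "[f=5,f(1)=3]".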
string ToString() const {
string str;
str.append("[");
for (int i = 0; i < size(); ++i) {
if (i > 0) str.append(",");
if (!type(i)->name().empty()) {
// Get the name and erase any quotation characters.
string name_str = type(i)->name();
auto it = name_str.begin();
while (it != name_str.end()) {
if (*it == '"') {
it = name_str.erase(it);
} else {
++it;
}
}
str.append(name_str);
str.append("=");
}
str.append(type(i)->GetFeatureValueName(value(i)));
}
str.append("]");
return str;
}
// Reserves space in the underlying feature vector.
void reserve(int n) { features_.reserve(n); }
......
......@@ -40,9 +40,14 @@ class FeatureType {
public:
// Initializes a feature type.
explicit FeatureType(const string &name)
- : name_(name), base_(0) {}
+ : name_(name),
+   base_(0),
+   is_continuous_(name.find("continuous") != string::npos) {
+   // TODO(googleuser): Switch to explicitly setting is_continuous.
+   VLOG(2) << "Feature: " << name << ":" << is_continuous_;
+ }
- virtual ~FeatureType() {}
+ virtual ~FeatureType() = default;
// Converts a feature value to a name.
virtual string GetFeatureValueName(FeatureValue value) const = 0;
......@@ -56,12 +61,21 @@ class FeatureType {
Predicate base() const { return base_; }
void set_base(Predicate base) { base_ = base; }
// True if the underlying feature is continuous.
bool is_continuous() const { return is_continuous_; }
// Sets whether the underlying feature should be represented as continuous.
void set_is_continuous(bool is_continuous) { is_continuous_ = is_continuous; }
private:
// Feature type name.
string name_;
// "Base" feature value: i.e. a "slot" in a global ordering of features.
Predicate base_;
// True if this feature is continuous.
bool is_continuous_;
};
// Templated generic resource based feature type. This feature type delegates
......@@ -73,7 +87,7 @@ class FeatureType {
// successfully for values ONLY in the range [0, Resource->NumValues()). Any
// feature value not in the extra value map and not in the above range of
// Resource will result in an ERROR and a return of "<INVALID>".
- template<class Resource>
+ template <class Resource>
class ResourceBasedFeatureType : public FeatureType {
public:
// Creates a new type with given name, resource object, and a mapping of
......@@ -85,8 +99,8 @@ class ResourceBasedFeatureType : public FeatureType {
: FeatureType(name), resource_(resource), values_(values) {
max_value_ = resource->NumValues() - 1;
for (const auto &pair : values) {
- CHECK_GE(pair.first, resource->NumValues()) << "Invalid extra value: "
- << pair.first << "," << pair.second;
+ CHECK_GE(pair.first, resource->NumValues())
+     << "Invalid extra value: " << pair.first << "," << pair.second;
max_value_ = pair.first > max_value_ ? pair.first : max_value_;
}
}
......@@ -152,8 +166,7 @@ class EnumFeatureType : public FeatureType {
string GetFeatureValueName(FeatureValue value) const override {
auto it = value_names_.find(value);
if (it == value_names_.end()) {
- LOG(ERROR)
-     << "Invalid feature value " << value << " for " << name();
+ LOG(ERROR) << "Invalid feature value " << value << " for " << name();
return "<INVALID>";
}
return it->second;
......
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/generic_features.h"
#include <limits>
#include <string>
#include "syntaxnet/base.h"
using tensorflow::strings::StrAppend;
using tensorflow::strings::StrCat;
namespace syntaxnet {
GenericFeatureTypes::TupleFeatureTypeBase::TupleFeatureTypeBase(
const string &prefix, const std::vector<FeatureType *> &sub_types)
: FeatureType(CreateTypeName(prefix, sub_types)),
types_(sub_types.begin(), sub_types.end()) {
CHECK(!types_.empty());
}
string GenericFeatureTypes::TupleFeatureTypeBase::GetFeatureValueName(
FeatureValue value) const {
if (value < 0 || value >= size_) return "<INVALID>";
string name = "(";
for (uint32 i = 0; i < types_.size(); ++i) {
const FeatureType *sub_type = types_[i];
const FeatureValue sub_size = sub_type->GetDomainSize();
const FeatureValue sub_value = value % sub_size;
const string sub_name = sub_type->GetFeatureValueName(sub_value);
const string delimiter = i + 1 < types_.size() ? "," : ")";
StrAppend(&name, sub_name, delimiter);
value /= sub_size;
}
return name;
}
FeatureValue GenericFeatureTypes::TupleFeatureTypeBase::GetDomainSize() const {
return size_;
}
void GenericFeatureTypes::TupleFeatureTypeBase::InitDomainSizes(
vector<FeatureValue> *sizes) {
CHECK_EQ(sizes->size(), types_.size());
// Populate sub-sizes.
for (uint32 i = 0; i < types_.size(); ++i) {
sizes->at(i) = types_[i]->GetDomainSize();
}
// Compute the cardinality of the tuple.
size_ = 1;
double real_size = 1.0; // for overflow detection
for (const FeatureValue sub_size : *sizes) {
size_ *= sub_size;
real_size *= static_cast<double>(sub_size);
}
// Check for overflow.
if (real_size > std::numeric_limits<FeatureValue>::max()) {
string message;
for (uint32 i = 0; i < types_.size(); ++i) {
StrAppend(&message, "\n  size(", types_[i]->name(), ")=", sizes->at(i));
}
LOG(FATAL) << "Feature space overflow in feature " << name() << message;
}
}
string GenericFeatureTypes::TupleFeatureTypeBase::CreateTypeName(
const string &prefix, const std::vector<FeatureType *> &sub_types) {
string prefix_to_strip = prefix.empty() ? "" : StrCat(prefix, ".");
string name = StrCat(prefix, " {");
for (const FeatureType *type : sub_types) {
string stripped_name = type->name();
if (stripped_name.find(prefix_to_strip) == 0) {
stripped_name = stripped_name.substr(prefix_to_strip.length());
}
StrAppend(&name, " ", stripped_name);
}
StrAppend(&name, " }");
return name;
}
GenericFeatureTypes::DynamicTupleFeatureType::DynamicTupleFeatureType(
const string &prefix, const std::vector<FeatureType *> &sub_types)
: TupleFeatureTypeBase(prefix, sub_types), sizes_(sub_types.size()) {
CHECK_GE(sizes_.size(), 2);
InitDomainSizes(&sizes_);
}
} // namespace syntaxnet
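The Conjoin() methods in the header (below) and GetFeatureValueName() above are inverses: a tuple value is a mixed-radix number whose digits are the sub-feature values and whose radices are the sub-domain sizes. A minimal standalone sketch, assuming domain sizes {3, 5}:
#include <iostream>
#include <vector>

// Encodes sub-values as a mixed-radix number, mirroring Conjoin().
int64_t Conjoin(const std::vector<int64_t> &values,
                const std::vector<int64_t> &sizes) {
  int64_t conjoined = values.back();
  for (int i = static_cast<int>(sizes.size()) - 2; i >= 0; --i) {
    conjoined = values[i] + conjoined * sizes[i];
  }
  return conjoined;
}

int main() {
  const std::vector<int64_t> sizes = {3, 5};
  int64_t value = Conjoin({2, 4}, sizes);  // 2 + 4 * 3 = 14
  // Decoding, as in TupleFeatureTypeBase::GetFeatureValueName():
  for (const int64_t size : sizes) {
    std::cout << value % size << " ";  // prints: 2 4
    value /= size;
  }
  std::cout << "\n";
}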
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Generic feature functions. These feature functions are independent of the
// feature function template types.
//
// The generic features should be instantiated and registered using the
// REGISTER_SYNTAXNET_GENERIC_FEATURES() macro:
//
// typedef GenericFeatures<Foo, int> GenericFooFeatures;
// REGISTER_SYNTAXNET_GENERIC_FEATURES(GenericFooFeatures);
//
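// Once registered, the generic features can be referenced in feature specs
// such as "bias", "pair { f(0) f(1) }", or "conjoin(unary=true) { f(0) f(1) }"
// (see generic_features_test.cc).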
#ifndef SYNTAXNET_GENERIC_FEATURES_H_
#define SYNTAXNET_GENERIC_FEATURES_H_
#include <string>
#include <utility>
#include <vector>
#include "syntaxnet/base.h"
#include "syntaxnet/feature_extractor.h"
namespace syntaxnet {
class TaskContext;
class WorkspaceSet;
// A class encapsulating all generic feature types.
class GenericFeatureTypes {
public:
// Base class for tuple feature types.
class TupleFeatureTypeBase : public FeatureType {
public:
// Creates a tuple whose elements are defined by the sub-types. This does
// not take ownership of the sub-types, which must remain live while this
// is in use.
TupleFeatureTypeBase(const string &prefix,
const std::vector<FeatureType *> &sub_types);
// Returns a string representation of the tuple value.
string GetFeatureValueName(FeatureValue value) const override;
// Returns the domain size of this feature.
FeatureValue GetDomainSize() const override;
protected:
// Sets the feature domain sizes and computes the total domain size of the
// tuple. Derived classes should call this method from their constructor.
void InitDomainSizes(vector<FeatureValue> *sizes);
private:
// Returns a string name for a type using the prefix and sub-types.
static string CreateTypeName(const string &prefix,
const std::vector<FeatureType *> &sub_types);
// The types of the sub-features. Not owned.
const std::vector<const FeatureType *> types_;
// The domain size of the tuple.
FeatureValue size_ = 0;
};
// Feature type for tuples of fixed size.
template <int kNumElements>
class StaticTupleFeatureType : public TupleFeatureTypeBase {
public:
static_assert(kNumElements >= 2, "At least two elements required");
// Creates a fixed-size tuple of sub-types. This does not take ownership
// of the sub-types, which must remain live while this is in use.
StaticTupleFeatureType(const string &prefix,
const std::vector<FeatureType *> &sub_types)
: TupleFeatureTypeBase(prefix, sub_types) {
CHECK_EQ(sub_types.size(), kNumElements);
sizes_.resize(kNumElements);
InitDomainSizes(&sizes_);
}
// Returns the conjoined tuple value for a list of sub-values. The range
// values[0,kNumElements) must be valid and non-absent.
FeatureValue Conjoin(const FeatureValue *values) const {
DCHECK_GE(values[kNumElements - 1], 0);
DCHECK_LT(values[kNumElements - 1], sizes_[kNumElements - 1]);
DCHECK_NE(values[kNumElements - 1], GenericFeatureFunction::kNone);
FeatureValue conjoined = values[kNumElements - 1];
for (int i = kNumElements - 2; i >= 0; --i) {
DCHECK_GE(values[i], 0);
DCHECK_LT(values[i], sizes_[i]);
DCHECK_NE(values[i], GenericFeatureFunction::kNone);
conjoined = values[i] + conjoined * sizes_[i];
}
return conjoined;
}
private:
// The domain sizes of the sub-types.
vector<FeatureValue> sizes_;
};
// Feature type for tuples of dynamic size.
class DynamicTupleFeatureType : public TupleFeatureTypeBase {
public:
// Creates a tuple of sub-types. This does not take ownership of the
// sub-types, which must remain live while this is in use.
DynamicTupleFeatureType(const string &prefix,
const std::vector<FeatureType *> &sub_types);
// Returns the conjoined tuple value for a list of sub-values, which must
// be the same size as the number of elements and non-absent.
FeatureValue Conjoin(const std::vector<FeatureValue> &values) const {
DCHECK_EQ(values.size(), sizes_.size());
DCHECK_GE(values.back(), 0);
DCHECK_LT(values.back(), sizes_.back());
DCHECK_NE(values.back(), GenericFeatureFunction::kNone);
FeatureValue conjoined = values.back();
for (int i = static_cast<int>(sizes_.size()) - 2; i >= 0; --i) {
DCHECK_GE(values[i], 0);
DCHECK_LT(values[i], sizes_[i]);
DCHECK_NE(values[i], GenericFeatureFunction::kNone);
conjoined = values[i] + conjoined * sizes_[i];
}
return conjoined;
}
private:
// The domain sizes of the sub-types.
std::vector<FeatureValue> sizes_;
};
// A wrapper which simply delegates to the sub-type. This does not take
// ownership of the sub-type, which must remain live while this is in use.
class WrappedFeatureType : public FeatureType {
public:
explicit WrappedFeatureType(FeatureType *sub_type)
: FeatureType(sub_type->name()), sub_type_(sub_type) {}
string GetFeatureValueName(FeatureValue value) const override {
return sub_type_->GetFeatureValueName(value);
}
FeatureValue GetDomainSize() const override {
return sub_type_->GetDomainSize();
}
private:
FeatureType *sub_type_;
};
};
// A class encapsulating all generic feature functions.
template <class OBJ, class... ARGS>
class GenericFeatures {
public:
// Base class for feature functions.
typedef FeatureFunction<OBJ, ARGS...> Base;
// Base class for nested feature functions: these still have their own feature
// type, so take care not to delegate feature-type requests to the nested ones.
class MetaBase : public MetaFeatureFunction<OBJ, ARGS...> {
public:
// Don't use the nested logic for feature types by default.
void GetFeatureTypes(std::vector<FeatureType *> *types) const override {
GenericFeatureFunction::GetFeatureTypes(types);
}
};
// Feature function that adds a bias value to the feature vector.
class Bias : public Base {
enum BiasFeatureValue { ON };
public:
// Initializes the feature.
void Init(TaskContext *context) override {
this->set_feature_type(
new EnumFeatureType(this->name(), {{BiasFeatureValue::ON, "ON"}}));
}
// Returns the bias value.
FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, const FeatureVector *fv) const override {
return 0;
}
};
// Feature function that returns a constant value.
class Constant : public Base {
public:
// Initializes the feature.
void Init(TaskContext *context) override {
value_ = this->GetIntParameter("value", 0);
this->set_feature_type(new NumericFeatureType(this->name(), value_ + 1));
}
// Returns the constant's value.
FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, const FeatureVector *fv) const override {
return value_;
}
private:
int value_ = 0;
};
// A feature function that tests equality between two nested features. This
// can be used, for example, to check morphological agreement.
class Equals : public MetaBase {
enum EqualsFeatureValue { DIFFERENT, EQUAL };
public:
// Initializes the feature.
void InitNested(TaskContext *context) override {
const auto &nested = this->nested();
CHECK_EQ(nested.size(), 2)
<< "The 'equals' feature requires two nested features.";
this->set_feature_type(new EnumFeatureType(
this->name(), {{EqualsFeatureValue::DIFFERENT, "DIFFERENT"},
{EqualsFeatureValue::EQUAL, "EQUAL"}}));
}
// Returns the equality value, or kNone if either value is absent.
FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, const FeatureVector *fv) const override {
const auto &nested = this->nested();
const FeatureValue a =
nested[0]->Compute(workspaces, object, args..., fv);
if (a == Base::kNone) return Base::kNone;
const FeatureValue b =
nested[1]->Compute(workspaces, object, args..., fv);
if (b == Base::kNone) return Base::kNone;
return a == b ? 1 : 0;
}
};
// Abstract base class for features that compare a nested feature's value
// to a target value (specified via the 'value' parameter).
//
// Subclasses must implement InitTypes() and ComputeValue().
class CompareValue : public MetaBase {
public:
// Initialize the type information.
virtual void InitTypes() = 0;
// Compute the feature value given the nested feature value and the target
// value (i.e., what was passed as the 'value' parameter).
virtual FeatureValue ComputeValue(FeatureValue nested_feature_value,
FeatureValue target_value) const = 0;
// Initializes the feature.
void InitNested(TaskContext *context) override {
string value_str = this->GetParameter("value");
CHECK_GT(value_str.size(), 0)
<< "The '" << this->FunctionName()
<< "' feature requires a 'value' parameter.";
const auto &nested = this->nested();
CHECK_EQ(nested.size(), 1) << "The '" << this->FunctionName()
<< "' feature requires one nested feature.";
// Only allow nested features with exactly one feature type.
FeatureType *nested_feature_type =
CHECK_NOTNULL(nested.front()->GetFeatureType());
for (int i = 0; i < nested_feature_type->GetDomainSize(); ++i) {
if (nested_feature_type->GetFeatureValueName(i) == value_str) {
value_ = i;
break;
}
}
CHECK_NE(value_, -1) << "Unknown feature value specified: " << value_str
<< ".";
InitTypes();
}
// Extracts the nested feature value, and delegates computation of the
// final feature value to ComputeValue().
// Returns kNone if the nested feature value is absent.
FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, const FeatureVector *fv) const override {
const auto &nested = this->nested();
FeatureValue feature_value =
nested.front()->Compute(workspaces, object, args..., fv);
if (feature_value == Base::kNone) return Base::kNone;
return ComputeValue(feature_value, value_);
}
private:
// The value to compare the feature against.
int value_ = -1;
};
// A feature function that fires if and only if the nested feature has the
// given value.
class Filter : public CompareValue {
enum FilterFeatureValue { ON };
public:
void InitTypes() override {
this->set_feature_type(
new EnumFeatureType(this->name(), {{FilterFeatureValue::ON, "ON"}}));
}
FeatureValue ComputeValue(FeatureValue nested_feature_value,
FeatureValue target_value) const override {
return nested_feature_value == target_value ? 0 : Base::kNone;
}
};
// A feature function that tests equality between a feature and a value.
class Is : public CompareValue {
enum IsFeatureValue { FALSE, TRUE };
public:
void InitTypes() override {
this->set_feature_type(new EnumFeatureType(
this->name(),
{{IsFeatureValue::FALSE, "FALSE"}, {IsFeatureValue::TRUE, "TRUE"}}));
}
FeatureValue ComputeValue(FeatureValue nested_feature_value,
FeatureValue target_value) const override {
return nested_feature_value == target_value;
}
};
// A feature function that forwards the nested feature value, unless it equals
// the target value (in which case, the feature doesn't fire).
class Ignore : public CompareValue {
public:
void InitTypes() override {
this->set_feature_type(new GenericFeatureTypes::WrappedFeatureType(
this->nested().front()->GetFeatureType()));
}
FeatureValue ComputeValue(FeatureValue nested_feature_value,
FeatureValue target_value) const override {
return nested_feature_value == target_value
? GenericFeatureFunction::kNone
: nested_feature_value;
}
};
// Abstract base class for features that reduce several binary values to a
// single binary value.
//
// Subclasses must implement Compute().
class BinaryReduce : public MetaBase {
enum BinaryReduceFeatureValue { FALSE, TRUE };
public:
// Initializes the feature.
// Checks that all the nested features are binary, and sets the output
// feature type to binary.
void InitNested(TaskContext *context) override {
for (const Base *function : this->nested()) {
FeatureType *nested_type = CHECK_NOTNULL(function->GetFeatureType());
CHECK_EQ(nested_type->GetDomainSize(), 2)
<< this->name() << " requires nested binary feature types only.";
}
this->set_feature_type(new EnumFeatureType(
this->name(), {{BinaryReduceFeatureValue::FALSE, "FALSE"},
{BinaryReduceFeatureValue::TRUE, "TRUE"}}));
}
};
// A feature function that takes any number of binary nested features, and
// returns whether they all evaluate to 1.
class All : public BinaryReduce {
public:
// Returns whether all nested feature values are 1, or kNone if any of them
// are unavailable.
FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, const FeatureVector *fv) const override {
for (const Base *function : this->nested()) {
const FeatureValue value =
function->Compute(workspaces, object, args..., fv);
if (value == Base::kNone) return Base::kNone;
if (value == 0) return 0;
}
return 1;
}
};
// A feature function that takes any number of binary nested features, and
// returns whether any of them evaluate to 1.
class Any : public BinaryReduce {
public:
// Returns whether any nested feature values are 1, or kNone if any of them
// are unavailable.
FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, const FeatureVector *fv) const override {
for (const Base *function : this->nested()) {
const FeatureValue value =
function->Compute(workspaces, object, args..., fv);
if (value == Base::kNone) return Base::kNone;
if (value == 1) return 1;
}
return 0;
}
};
// A feature function that computes a fixed-size tuple.
template <int kNumElements>
class StaticTuple : public MetaBase {
public:
// The associated fixed-size tuple type.
typedef GenericFeatureTypes::StaticTupleFeatureType<kNumElements> Type;
// Initializes the feature.
void InitNested(TaskContext *context) override {
std::vector<FeatureType *> sub_types;
for (const Base *function : this->nested()) {
sub_types.push_back(CHECK_NOTNULL(function->GetFeatureType()));
}
this->set_feature_type(new Type(this->SubPrefix(), sub_types));
}
// Returns the tuple value, or kNone if any sub-value is unavailable.
FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, const FeatureVector *fv) const override {
const auto &nested = this->nested();
FeatureValue values[kNumElements];
for (int i = 0; i < kNumElements; ++i) {
const FeatureValue value =
nested[i]->Compute(workspaces, object, args..., fv);
if (value == Base::kNone) return Base::kNone;
values[i] = value;
}
return static_cast<Type *>(this->feature_type())->Conjoin(values);
}
};
// Convenience aliases for common fixed-size tuples.
typedef StaticTuple<2> Pair;
typedef StaticTuple<3> Triple;
typedef StaticTuple<4> Quad;
typedef StaticTuple<5> Quint;
// A feature function that computes a dynamically-sized tuple.
class Tuple : public MetaBase {
public:
// The associated tuple type.
typedef GenericFeatureTypes::DynamicTupleFeatureType Type;
// Initializes the feature.
void InitNested(TaskContext *context) override {
std::vector<FeatureType *> sub_types;
for (const Base *function : this->nested()) {
sub_types.push_back(CHECK_NOTNULL(function->GetFeatureType()));
}
this->set_feature_type(new Type(this->SubPrefix(), sub_types));
}
// Returns the tuple value, or kNone if any sub-value is unavailable.
FeatureValue Compute(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, const FeatureVector *fv) const override {
std::vector<FeatureValue> values;
for (const Base *function : this->nested()) {
const FeatureValue value =
function->Compute(workspaces, object, args..., fv);
if (value == Base::kNone) return Base::kNone;
values.push_back(value);
}
return static_cast<Type *>(this->feature_type())->Conjoin(values);
}
};
// A feature function that creates all pairs of the features extracted by the
// nested feature functions. All the nested feature functions must return
// single valued features.
//
// Parameters:
// bool unary (false):
// If true, then unary features are also emitted.
class Pairs : public MetaBase {
public:
// The pair feature type.
typedef GenericFeatureTypes::StaticTupleFeatureType<2> Type;
// Discards the pair types.
~Pairs() override {
for (Type *type : pairs_) delete type;
}
// Initializes the feature.
void InitNested(TaskContext *context) override {
unary_ = this->GetParameter("unary") == "true";
const auto &nested = this->nested();
CHECK_GE(nested.size(), 2)
<< "The 'pairs' feature requires at least two sub-features.";
// Get the types of all nested features.
types_.clear();
for (const Base *function : nested) {
types_.push_back(CHECK_NOTNULL(function->GetFeatureType()));
}
// Initialize the pair types for all features.
pairs_.resize(NumPairs(nested.size()));
for (int right = 1; right < nested.size(); ++right) {
for (int left = 0; left < right; ++left) {
pairs_[PairIndex(left, right)] =
new Type(this->SubPrefix(), {types_[left], types_[right]});
}
}
}
// Produces all feature types.
void GetFeatureTypes(std::vector<FeatureType *> *types) const override {
if (unary_) types->insert(types->end(), types_.begin(), types_.end());
types->insert(types->end(), pairs_.begin(), pairs_.end());
}
// Evaluates the feature.
void Evaluate(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, FeatureVector *result) const override {
const auto &nested = this->nested();
// Collect all active feature sub-values.
std::vector<FeatureValue> values(nested.size());
std::vector<int> active_indices;
active_indices.reserve(nested.size());
for (int i = 0; i < nested.size(); ++i) {
values[i] = nested[i]->Compute(workspaces, object, args..., result);
if (values[i] != Base::kNone) active_indices.push_back(i);
}
// Optionally generate unary features.
if (unary_) {
for (int index : active_indices) {
result->add(types_[index], values[index]);
}
}
// Generate all feature pairs.
FeatureValue pair_values[2];
for (int right = 1; right < active_indices.size(); ++right) {
int right_index = active_indices[right];
pair_values[1] = values[right_index];
for (int left = 0; left < right; ++left) {
int left_index = active_indices[left];
pair_values[0] = values[left_index];
Type *type = pairs_[PairIndex(left_index, right_index)];
result->add(type, type->Conjoin(pair_values));
}
}
}
private:
// Returns the number of pairs (i,j) where 0 <= i < j < size.
static int NumPairs(int size) {
DCHECK_GE(size, 0);
return (size * (size - 1)) / 2;
}
// Returns the index for a pair (left,right) where left < right. The
// indices are suitable for densely linearizing pairs into an array.
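// For example, for size 4 the layout is (0,1)->0, (0,2)->1, (1,2)->2,
// (0,3)->3, (1,3)->4, (2,3)->5, i.e. ordered by increasing right, then left.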
static int PairIndex(int left, int right) {
DCHECK_LE(0, left);
DCHECK_LT(left, right);
return left + NumPairs(right);
}
// Whether to also emit unary features.
bool unary_ = false;
// Feature types for all nested features. Not owned.
std::vector<FeatureType *> types_;
// Feature types for all pairs. Indexed according to PairIndex(). Owned.
std::vector<Type *> pairs_;
};
// Feature function for conjoining the first sub-feature with each of the
// rest of the sub-features.
//
// Parameters:
// bool unary (false):
// If true, then unary features are also emitted.
class Conjoin : public MetaBase {
public:
// The pair feature type.
typedef GenericFeatureTypes::StaticTupleFeatureType<2> Type;
// Discards the pair types.
~Conjoin() override {
for (Type *type : pairs_) delete type;
}
// Initializes the feature.
void InitNested(TaskContext *context) override {
unary_ = this->GetParameter("unary") == "true";
const auto &nested = this->nested();
CHECK_GE(nested.size(), 2)
<< "The 'conjoin' feature requires at least two sub-features.";
// Get the types of the rest of the nested features.
types_.clear();
for (const Base *function : nested) {
types_.push_back(CHECK_NOTNULL(function->GetFeatureType()));
}
// Initialize the pair types.
pairs_.assign(1, nullptr);
for (int i = 1; i < types_.size(); ++i) {
pairs_.push_back(new Type(this->SubPrefix(), {types_[0], types_[i]}));
}
}
// Produces all feature types.
void GetFeatureTypes(std::vector<FeatureType *> *types) const override {
if (unary_) types->insert(types->end(), types_.begin() + 1, types_.end());
types->insert(types->end(), pairs_.begin() + 1, pairs_.end());
}
// Evaluates the feature.
void Evaluate(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, FeatureVector *result) const override {
const auto &nested = this->nested();
FeatureValue values[2];
values[0] = nested[0]->Compute(workspaces, object, args..., result);
// Stop early if the first feature is absent.
if (values[0] == Base::kNone) {
if (unary_) {
for (int i = 1; i < nested.size(); ++i) {
values[1] = nested[i]->Compute(workspaces, object, args..., result);
if (values[1] == Base::kNone) continue;
result->add(types_[i], values[1]);
}
}
return;
}
// Otherwise, the first feature exists; conjoin it with the rest.
for (int i = 1; i < nested.size(); ++i) {
values[1] = nested[i]->Compute(workspaces, object, args..., result);
if (values[1] == Base::kNone) continue;
if (unary_) result->add(types_[i], values[1]);
result->add(pairs_[i], pairs_[i]->Conjoin(values));
}
}
private:
// Whether to also emit unary features.
bool unary_ = false;
// Feature types for all nested features. Not owned.
std::vector<FeatureType *> types_;
// Feature types for all pairs. The first element is null, in order to
// align this list with types_. Owned.
std::vector<Type *> pairs_;
};
// Feature function for creating pairs of multi-valued features. By default,
// the feature computes the Cartesian product of the extracted sub-features,
// but a parallel product can be specified via the options.
//
// Parameters:
// bool parallel (false):
// If true, output features for parallel pairs, like a dot product. The
// two sub-features must produce identical numbers of features.
class MultiPair : public MetaBase {
public:
// The pair feature type.
typedef GenericFeatureTypes::StaticTupleFeatureType<2> Type;
// Initializes the feature.
void InitNested(TaskContext *context) override {
parallel_ = this->GetParameter("parallel") == "true";
std::vector<FeatureType *> sub_types;
for (const Base *function : this->nested()) {
sub_types.push_back(CHECK_NOTNULL(function->GetFeatureType()));
}
this->set_feature_type(new Type(this->SubPrefix(), sub_types));
}
// Evaluates the feature.
void Evaluate(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, FeatureVector *result) const override {
const auto &nested = this->nested();
const int orig_size = result->size();
// Extract features from left half. Values are extracted directly into
// the result so that optimized variable references are handled properly.
nested[0]->Evaluate(workspaces, object, args..., result);
if (orig_size == result->size()) return; // no left features
std::vector<FeatureValue> left;
for (int i = orig_size; i < result->size(); ++i) {
left.push_back(result->value(i));
}
result->Truncate(orig_size);
// Extract features from right half.
nested[1]->Evaluate(workspaces, object, args..., result);
if (orig_size == result->size()) return; // no right features
std::vector<FeatureValue> right;
for (int i = orig_size; i < result->size(); ++i) {
right.push_back(result->value(i));
}
result->Truncate(orig_size);
// Compute the pair values.
FeatureValue values[2];
Type *type = static_cast<Type *>(this->feature_type());
if (parallel_) {
// Produce parallel pairs.
CHECK_EQ(left.size(), right.size());
for (int i = 0; i < left.size(); ++i) {
values[0] = left[i];
values[1] = right[i];
result->add(type, type->Conjoin(values));
}
} else {
// Produce all pairs.
for (const FeatureValue left_value : left) {
values[0] = left_value;
for (const FeatureValue right_value : right) {
values[1] = right_value;
result->add(type, type->Conjoin(values));
}
}
}
}
private:
// Whether to do a parallel product instead of a Cartesian product.
bool parallel_ = false;
};
// Feature function for conjoining the first multi-valued sub-feature with
// each of the rest of the multi-valued sub-features.
class MultiConjoin : public MetaBase {
public:
// The pair feature type.
typedef GenericFeatureTypes::StaticTupleFeatureType<2> Type;
// Discards the pair types.
~MultiConjoin() override {
for (Type *type : pairs_) delete type;
}
// Initializes the feature.
void InitNested(TaskContext *context) override {
const auto &nested = this->nested();
CHECK_GE(nested.size(), 2)
<< "The 'multiconjoin' feature requires at least two sub-features.";
// Get the types of the rest of the nested features.
std::vector<FeatureType *> types;
types.reserve(nested.size());
for (const Base *function : nested) {
types.push_back(CHECK_NOTNULL(function->GetFeatureType()));
}
// Initialize the pair types.
pairs_.clear();
for (int i = 1; i < types.size(); ++i) {
pairs_.push_back(new Type(this->SubPrefix(), {types[0], types[i]}));
}
}
// Produces all feature types.
void GetFeatureTypes(std::vector<FeatureType *> *types) const override {
types->insert(types->end(), pairs_.begin(), pairs_.end());
}
// Evaluates the feature.
void Evaluate(const WorkspaceSet &workspaces, const OBJ &object,
ARGS... args, FeatureVector *result) const override {
const auto &nested = this->nested();
const int orig_size = result->size();
// Gather the lists of sub-values for each nested feature. Sub-values
// are extracted directly into the result so that optimized variable
// references are handled properly.
std::vector<std::vector<FeatureValue> > sub_values(nested.size());
for (int i = 0; i < nested.size(); ++i) {
nested[i]->Evaluate(workspaces, object, args..., result);
if (orig_size == result->size()) {
if (i == 0) {
return; // no first values; nothing will be extracted
} else {
continue; // no non-first values; skip to next feature
}
}
std::vector<FeatureValue> &values = sub_values[i];
for (int j = orig_size; j < result->size(); ++j) {
values.push_back(result->value(j));
}
result->Truncate(orig_size);
}
// Produce conjoined features.
const std::vector<FeatureValue> &first_values = sub_values[0];
FeatureValue values[2];
for (int i = 1; i < sub_values.size(); ++i) {
const std::vector<FeatureValue> &other_values = sub_values[i];
if (other_values.empty()) continue;
Type *type = pairs_[i - 1];
for (const FeatureValue first_value : first_values) {
values[0] = first_value;
for (const FeatureValue other_value : other_values) {
values[1] = other_value;
result->add(type, type->Conjoin(values));
}
}
}
}
private:
// Feature types for all pairs. Owned.
std::vector<Type *> pairs_;
};
};
#define REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, name, type) \
typedef generics::type __##type##generics; \
REGISTER_SYNTAXNET_FEATURE_FUNCTION(generics::Base, name, __##type##generics)
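// For example, REGISTER_SYNTAXNET_GENERIC_FEATURE(GenericFooFeatures, "bias",
// Bias) defines a typedef for GenericFooFeatures::Bias and registers it under
// the name "bias".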
#define REGISTER_SYNTAXNET_GENERIC_FEATURES(generics) \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "bias", Bias); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "constant", Constant); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "equals", Equals); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "filter", Filter); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "is", Is); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "all", All); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "any", Any); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "pair", Pair); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "triple", Triple); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "quad", Quad); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "quint", Quint); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "tuple", Tuple); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "pairs", Pairs); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "conjoin", Conjoin); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "multipair", MultiPair); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "ignore", Ignore); \
REGISTER_SYNTAXNET_GENERIC_FEATURE(generics, "multiconjoin", MultiConjoin)
} // namespace syntaxnet
#endif // SYNTAXNET_GENERIC_FEATURES_H_
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/generic_features.h"
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "syntaxnet/registry.h"
#include "syntaxnet/task_context.h"
#include <gmock/gmock.h>
namespace syntaxnet {
// Test feature extractor.
class TestFeatureExtractor : public FeatureExtractor<std::vector<int>, int> {};
// Registration macro.
#define REGISTER_TEST_FEATURE_FUNCTION(name, component) \
REGISTER_SYNTAXNET_FEATURE_FUNCTION(TestFeatureExtractor::Function, name, \
component)
// The registry must be declared in the global namespace.
REGISTER_SYNTAXNET_CLASS_REGISTRY("syntaxnet test feature function",
syntaxnet::TestFeatureExtractor::Function);
typedef GenericFeatures<std::vector<int>, int> GenericTestFeatures;
REGISTER_SYNTAXNET_GENERIC_FEATURES(GenericTestFeatures);
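// Test feature that extracts object values at focus+k for each decimal digit
// k of its argument; e.g. "f(12)" extracts the values at focus+1 and focus+2,
// making it multi-valued. With no argument it extracts the value at focus.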
class TestVectorFeatureFunction : public TestFeatureExtractor::Function {
public:
// Initializes the feature.
void Init(TaskContext *context) override {
int arg = argument();
while (arg > 0) {
offsets_.push_back(arg % 10);
arg /= 10;
}
std::reverse(offsets_.begin(), offsets_.end());
if (offsets_.empty()) offsets_.push_back(0);
set_feature_type(new NumericFeatureType(name(), 10));
}
// Evaluates the feature.
void Evaluate(const WorkspaceSet &workspace, const std::vector<int> &object,
int focus, FeatureVector *features) const override {
for (const uint32 offset : offsets_) {
const uint32 index = focus + offset;
if (index >= object.size()) continue;
features->add(feature_type(), object[index]);
}
}
// Returns the first extracted feature, if available.
FeatureValue Compute(const WorkspaceSet &workspace,
const std::vector<int> &object, int focus,
const FeatureVector *fv) const override {
CHECK_EQ(1, offsets_.size());
FeatureVector features;
Evaluate(workspace, object, focus, &features);
return features.size() == 0 ? kNone : features.value(0);
}
private:
// A list of offsets extracted from the feature's argument.
std::vector<uint32> offsets_;
};
REGISTER_TEST_FEATURE_FUNCTION("f", TestVectorFeatureFunction);
class TestParityFeatureFunction : public TestFeatureExtractor::Function {
public:
// Initializes the feature.
void Init(TaskContext *context) override {
// "even" corresponds to feature value 0, "odd" to 1.
enum ParityFeatureValue { EVEN, ODD };
set_feature_type(
new EnumFeatureType(name(), {{EVEN, "even"}, {ODD, "odd"}}));
// Check the "offset" parameter.
for (const auto &param : this->descriptor()->parameter()) {
if (param.name() == "offset") {
offset_ = std::stoi(param.value());
}
}
}
// Evaluates the feature.
void Evaluate(const WorkspaceSet &workspace, const std::vector<int> &object,
int focus, FeatureVector *features) const override {
const uint32 offset_focus = focus + offset_;
if (offset_focus < object.size()) {
features->add(feature_type(), object[offset_focus] & 1);
}
}
// Returns the first extracted feature, if available.
FeatureValue Compute(const WorkspaceSet &workspace,
const std::vector<int> &object, int focus,
const FeatureVector *fv) const override {
FeatureVector features;
Evaluate(workspace, object, focus, &features);
return features.size() == 0 ? kNone : features.value(0);
}
private:
int offset_ = 0;
};
REGISTER_TEST_FEATURE_FUNCTION("parity", TestParityFeatureFunction);
// Testing rig.
class GenericFeaturesTest : public ::testing::Test {
public:
// Deallocates test state.
void TearDown() override {
object_.reset();
extractor_.reset();
context_.reset();
}
// Initializes the test.
void Init(const string &spec, const std::vector<int> &object) {
context_.reset(new TaskContext());
extractor_.reset(new TestFeatureExtractor());
extractor_->Parse(spec);
extractor_->Setup(context_.get());
extractor_->Init(context_.get());
object_.reset(new std::vector<int>(object));
}
// Tests extraction on the current object.
void TestExtract(int focus, const string &feature_string) const {
FeatureVector features;
WorkspaceSet workspace;
extractor_->Preprocess(&workspace, object_.get());
extractor_->ExtractFeatures(workspace, *object_, focus, &features);
EXPECT_EQ(feature_string, features.ToString());
}
private:
// The task context for tests.
std::unique_ptr<TaskContext> context_;
// Feature extractor for tests.
std::unique_ptr<TestFeatureExtractor> extractor_;
// Object for tests.
std::unique_ptr<std::vector<int> > object_;
};
TEST_F(GenericFeaturesTest, Singleton) {
Init("f", {5, 3, 2, 4, 6});
TestExtract(0, "[f=5]");
TestExtract(1, "[f=3]");
TestExtract(4, "[f=6]");
TestExtract(5, "[]");
}
TEST_F(GenericFeaturesTest, TwoFeatures) {
Init("f(0) f(1)", {5, 3, 2, 4, 6});
TestExtract(0, "[f=5,f(1)=3]");
}
TEST_F(GenericFeaturesTest, Bias) {
Init("bias", {0, 1});
TestExtract(0, "[bias=ON]");
}
TEST_F(GenericFeaturesTest, Constant) {
Init("constant(value=2)", {0, 1});
TestExtract(0, "[constant(value=2)=2]");
}
TEST_F(GenericFeaturesTest, Equals) {
Init("equals { f(0) f(1) }", {0, 1, 0});
TestExtract(0, "[equals { f f(1) }=DIFFERENT]");
Init("equals { f(0) f(2) }", {0, 1, 0});
TestExtract(0, "[equals { f f(2) }=EQUAL]");
}
TEST_F(GenericFeaturesTest, Filter) {
Init("filter(value=5).f", {3, 5});
TestExtract(0, "[]");
TestExtract(1, "[filter(value=5).f=ON]");
// Check that we are actually parsing feature value names.
Init("filter(value=odd).parity", {3, 4});
TestExtract(0, "[filter(value=odd).parity=ON]");
TestExtract(1, "[]");
Init("filter(value=even).parity", {3, 4});
TestExtract(0, "[]");
TestExtract(1, "[filter(value=even).parity=ON]");
}
TEST_F(GenericFeaturesTest, Is) {
Init("is(value=5).f", {3, 5});
TestExtract(0, "[is(value=5).f=FALSE]");
TestExtract(1, "[is(value=5).f=TRUE]");
// Check that we are actually parsing feature value names.
Init("is(value=odd).parity", {3, 4});
TestExtract(0, "[is(value=odd).parity=TRUE]");
TestExtract(1, "[is(value=odd).parity=FALSE]");
Init("is(value=even).parity", {3, 4});
TestExtract(0, "[is(value=even).parity=FALSE]");
TestExtract(1, "[is(value=even).parity=TRUE]");
}
TEST_F(GenericFeaturesTest, Ignore) {
Init("ignore(value=5).f", {3, 5});
TestExtract(0, "[ignore(value=5).f=3]");
TestExtract(1, "[]");
// Check that we are actually parsing feature value names.
Init("ignore(value=odd).parity", {3, 4});
TestExtract(0, "[]");
TestExtract(1, "[ignore(value=odd).parity=even]");
Init("ignore(value=even).parity", {3, 4});
TestExtract(0, "[ignore(value=even).parity=odd]");
TestExtract(1, "[]");
}
TEST_F(GenericFeaturesTest, All) {
Init("all { parity parity(offset=1) }", {2, 2});
TestExtract(0, "[all { parity parity(offset=1) }=FALSE]");
Init("all { parity parity(offset=1) }", {2, 3});
TestExtract(0, "[all { parity parity(offset=1) }=FALSE]");
Init("all { parity parity(offset=1) }", {3, 2});
TestExtract(0, "[all { parity parity(offset=1) }=FALSE]");
Init("all { parity parity(offset=1) }", {3, 3});
TestExtract(0, "[all { parity parity(offset=1) }=TRUE]");
}
TEST_F(GenericFeaturesTest, Any) {
Init("any { parity parity(offset=1) }", {2, 2});
TestExtract(0, "[any { parity parity(offset=1) }=FALSE]");
Init("any { parity parity(offset=1) }", {2, 3});
TestExtract(0, "[any { parity parity(offset=1) }=TRUE]");
Init("any { parity parity(offset=1) }", {3, 2});
TestExtract(0, "[any { parity parity(offset=1) }=TRUE]");
Init("any { parity parity(offset=1) }", {3, 3});
TestExtract(0, "[any { parity parity(offset=1) }=TRUE]");
}
TEST_F(GenericFeaturesTest, Pair) {
Init("pair { f(0) f(1) }", {5, 3, 2, 4, 6});
TestExtract(0, "[pair { f f(1) }=(5,3)]");
}
TEST_F(GenericFeaturesTest, NestedPair) {
Init("pair { pair { f(0) f(1) } pair { f(2) f(3) } }", {5, 3, 2, 4, 6});
TestExtract(0, "[pair { pair { f f(1) } pair { f(2) f(3) } }=((5,3),(2,4))]");
}
TEST_F(GenericFeaturesTest, Triple) {
Init("triple { f(0) f(1) f(2) }", {5, 3, 2, 4, 6});
TestExtract(0, "[triple { f f(1) f(2) }=(5,3,2)]");
}
TEST_F(GenericFeaturesTest, Quad) {
Init("quad { f(0) f(1) f(2) f(3) }", {5, 3, 2, 4, 6});
TestExtract(0, "[quad { f f(1) f(2) f(3) }=(5,3,2,4)]");
}
TEST_F(GenericFeaturesTest, Quint) {
Init("quint { f(0) f(1) f(2) f(3) f(4) }", {5, 3, 2, 4, 6});
TestExtract(0, "[quint { f f(1) f(2) f(3) f(4) }=(5,3,2,4,6)]");
}
TEST_F(GenericFeaturesTest, Tuple) {
Init("tuple { f(0) f(1) f(2) f(3) f(4) }", {5, 3, 2, 4, 6});
TestExtract(0, "[tuple { f f(1) f(2) f(3) f(4) }=(5,3,2,4,6)]");
}
TEST_F(GenericFeaturesTest, Pairs) {
Init("pairs { f(0) f(1) f(2) f(3) }", {0, 1, 2, 3, 4});
TestExtract(0,
"[pairs { f f(1) }=(0,1)"
",pairs { f f(2) }=(0,2)"
",pairs { f(1) f(2) }=(1,2)"
",pairs { f f(3) }=(0,3)"
",pairs { f(1) f(3) }=(1,3)"
",pairs { f(2) f(3) }=(2,3)]");
}
TEST_F(GenericFeaturesTest, PairsWithUnary) {
Init("pairs(unary=true) { f(0) f(1) f(2) }", {0, 1, 2, 3, 4});
TestExtract(0,
"[pairs(unary=true).f=0"
",pairs(unary=true).f(1)=1"
",pairs(unary=true).f(2)=2"
",pairs(unary=true) { f f(1) }=(0,1)"
",pairs(unary=true) { f f(2) }=(0,2)"
",pairs(unary=true) { f(1) f(2) }=(1,2)]");
}
TEST_F(GenericFeaturesTest, Conjoin) {
Init("conjoin { f(0) f(1) f(2) f(3) }", {0, 1, 2, 3, 4});
TestExtract(0,
"[conjoin { f f(1) }=(0,1)"
",conjoin { f f(2) }=(0,2)"
",conjoin { f f(3) }=(0,3)]");
}
TEST_F(GenericFeaturesTest, ConjoinWithUnary) {
Init("conjoin(unary=true) { f(0) f(1) f(2) f(3) }", {0, 1, 2, 3, 4});
TestExtract(0,
"[conjoin(unary=true).f(1)=1"
",conjoin(unary=true) { f f(1) }=(0,1)"
",conjoin(unary=true).f(2)=2"
",conjoin(unary=true) { f f(2) }=(0,2)"
",conjoin(unary=true).f(3)=3"
",conjoin(unary=true) { f f(3) }=(0,3)]");
}
TEST_F(GenericFeaturesTest, SingletonMultiValue) {
Init("f(12)", {0, 1, 2, 3, 4});
TestExtract(0, "[f(12)=1,f(12)=2]");
}
TEST_F(GenericFeaturesTest, MultiPairOneSided) {
Init("multipair { f(12) f(3) }", {0, 1, 2, 3, 4});
TestExtract(0,
"[multipair { f(12) f(3) }=(1,3)"
",multipair { f(12) f(3) }=(2,3)]");
}
TEST_F(GenericFeaturesTest, MultiPairTwoSided) {
Init("multipair { f(12) f(34) }", {0, 1, 2, 3, 4});
TestExtract(0,
"[multipair { f(12) f(34) }=(1,3)"
",multipair { f(12) f(34) }=(1,4)"
",multipair { f(12) f(34) }=(2,3)"
",multipair { f(12) f(34) }=(2,4)]");
}
TEST_F(GenericFeaturesTest, MultiPairParallel) {
Init("multipair(parallel=true) { f(12) f(34) }", {0, 1, 2, 3, 4});
TestExtract(0,
"[multipair(parallel=true) { f(12) f(34) }=(1,3)"
",multipair(parallel=true) { f(12) f(34) }=(2,4)]");
}
TEST_F(GenericFeaturesTest, MultiConjoinFirstOnly) {
Init("multiconjoin { f(12) f(3) f(0) }", {0, 1, 2, 3, 4});
TestExtract(0,
"[multiconjoin { f(12) f(3) }=(1,3)"
",multiconjoin { f(12) f(3) }=(2,3)"
",multiconjoin { f(12) f }=(1,0)"
",multiconjoin { f(12) f }=(2,0)]");
}
TEST_F(GenericFeaturesTest, MultiConjoinFirstAndRest) {
Init("multiconjoin { f(12) f(34) f(0) }", {0, 1, 2, 3, 4});
TestExtract(0,
"[multiconjoin { f(12) f(34) }=(1,3)"
",multiconjoin { f(12) f(34) }=(1,4)"
",multiconjoin { f(12) f(34) }=(2,3)"
",multiconjoin { f(12) f(34) }=(2,4)"
",multiconjoin { f(12) f }=(1,0)"
",multiconjoin { f(12) f }=(2,0)]");
}
} // namespace syntaxnet
......@@ -485,6 +485,7 @@ class GreedyParser(object):
vectors=embeddings_path,
task_context=task_context,
embedding_init=self._embedding_init,
+ cache_vectors_locally=False,
seed=seed1,
seed2=seed2)
......
/* Copyright 2017 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/head_label_transitions.h"
#include "syntaxnet/base.h"
using tensorflow::strings::StrAppend;
using tensorflow::strings::StrCat;
namespace syntaxnet {
// Parser transition state for head & label transitions.
class HeadLabelTransitionSystem::State : public ParserTransitionState {
public:
// Returns a copy of this state.
State *Clone() const override { return new State(*this); }
// Does nothing; no need for additional initialization.
void Init(ParserState *state) override {}
// Copies the selected heads to the |sentence|.
void AddParseToDocument(const ParserState &state, bool rewrite_root_labels,
Sentence *sentence) const override {
for (int i = 0; i < state.NumTokens(); ++i) {
Token *token = sentence->mutable_token(i);
token->set_head(state.Head(i));
token->set_label(state.LabelAsString(state.Label(i)));
if (rewrite_root_labels && state.Head(i) == -1) {
token->set_label(state.LabelAsString(state.RootLabel()));
}
}
}
// Returns true if the head and gold head match.
bool IsTokenCorrect(const ParserState &state, int index) const override {
return state.GoldHead(index) == state.Head(index);
}
// Returns a string representation of the |state|.
string ToString(const ParserState &state) const override {
string str = "[";
for (int i = 0; i < state.NumTokens(); ++i) {
StrAppend(&str, i == 0 ? "" : " ", state.Head(i));
}
StrAppend(&str, "]");
return str;
}
};
ParserAction HeadLabelTransitionSystem::GetDefaultAction(
const ParserState &state) const {
const int default_head = state.Next();
const int default_label = state.RootLabel();
return EncodeActionWithState(default_head, default_label, state);
}
ParserAction HeadLabelTransitionSystem::GetNextGoldAction(
const ParserState &state) const {
if (state.EndOfInput()) {
LOG(ERROR) << "Oracle called on invalid state: " << state.ToString();
return 0;
}
const int current = state.Next();
int head = state.GoldHead(current);
const int label = state.GoldLabel(current);
// In syntaxnet.Sentence, root arcs are token.head() == -1, whereas
// here, we use a self-loop to represent roots. So we need to convert here.
head = head == -1 ? current : head;
return EncodeActionWithState(head, label, state);
}
void HeadLabelTransitionSystem::PerformActionWithoutHistory(
ParserAction action, ParserState *state) const {
CHECK(IsAllowedAction(action, *state))
<< "Illegal action " << action << " at state: " << state->ToString();
const int current = state->Next();
int head, label;
DecodeActionWithState(action, *state, &head, &label);
VLOG(2) << "Adding arc: " << label << " (" << current << " <- " << head
<< ")";
state->AddArc(current, head == current ? -1 : head, label);
state->Advance();
}
bool HeadLabelTransitionSystem::IsAllowedAction(
ParserAction action, const ParserState &state) const {
if (state.EndOfInput()) return false;
// Unlike the labels transition system, we allow root tokens to receive
// non-root dependency labels and vice versa.
return action >= 0 && action < state.NumTokens() * state.NumLabels();
}
bool HeadLabelTransitionSystem::IsFinalState(const ParserState &state) const {
return state.EndOfInput();
}
string HeadLabelTransitionSystem::ActionAsString(
ParserAction action, const ParserState &state) const {
if (!IsAllowedAction(action, state)) return StrCat("INVALID:", action);
const auto &sentence = state.sentence();
const int current = state.Next();
int head, label;
DecodeActionWithState(action, state, &head, &label);
return StrCat(state.LabelAsString(label), "(",
sentence.token(current).word(), "<-",
head == current ? "ROOT" : sentence.token(head).word(), ")");
}
ParserTransitionState *HeadLabelTransitionSystem::NewTransitionState(
bool training_mode) const {
return new State();
}
void HeadLabelTransitionSystem::DecodeActionWithState(ParserAction action,
const ParserState &state,
ParserAction *base_action,
int *label) const {
const int num_labels = state.NumLabels();
*base_action = action / num_labels;
*label = action % num_labels;
}
ParserAction HeadLabelTransitionSystem::EncodeActionWithState(
ParserAction base_action, int label, const ParserState &state) const {
return base_action * state.NumLabels() + label;
}
REGISTER_TRANSITION_SYSTEM("heads_labels", HeadLabelTransitionSystem);
} // namespace syntaxnet
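A worked example of the action encoding shared by EncodeActionWithState() and DecodeActionWithState() above: action = head * num_labels + label, decoded by integer division and remainder. A minimal sketch, assuming num_labels = 3:
#include <iostream>

int main() {
  const int num_labels = 3;
  const int head = 2, label = 1;
  const int action = head * num_labels + label;  // 2 * 3 + 1 = 7
  std::cout << "head=" << action / num_labels    // head=2
            << " label=" << action % num_labels  // label=1
            << "\n";
}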
/* Copyright 2017 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef SYNTAXNET_HEAD_LABEL_TRANSITIONS_H_
#define SYNTAXNET_HEAD_LABEL_TRANSITIONS_H_
#include "syntaxnet/parser_state.h"
#include "syntaxnet/parser_transitions.h"
namespace syntaxnet {
// Heads and labels transition system. Predicts the syntactic heads and labels
// of a sentence directly.
//
// In this transition system actions encode heads and their labels, so the
// space of actions is num_labels*N (for a sentence with N tokens). A token
// that points to itself is interpreted as a root. Unlike the heads transition
// system followed by labels, we allow root arcs to receive non-root
// dependency labels and vice versa since, unlike in the labels transition
// system, it is unclear whether the arc or label prediction should take
// precedence.
//
// Actions are interpreted as follows:
//
// For input pointer at position i:
// head = A / num_labels
// label = A % num_labels
// if head == i : Add a root arc to token i (with given label)
// if head != i : Add an arc head -> i (with given label)
//
// Note that in syntaxnet.Sentence, root arcs are token.head() == -1, whereas
// here, we use a self-loop to represent roots.
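// For example, with num_labels = 3 and the input pointer at i = 2, action
// A = 7 decodes to head = 7 / 3 = 2 and label = 7 % 3 = 1; since head == i,
// this adds a root arc to token 2 with label 1.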
class HeadLabelTransitionSystem : public ParserTransitionSystem {
public:
class State; // defined in the .cc file
int NumActionTypes() const override { return 1; }
int NumActions(int num_labels) const override { return kDynamicNumActions; }
// The default action makes the current token a root (with the root label).
ParserAction GetDefaultAction(const ParserState &state) const override;
// Returns the next gold action for a given state according to the
// underlying annotated sentence.
ParserAction GetNextGoldAction(const ParserState &state) const override;
// Checks if the action is allowed in a given parser state.
bool IsAllowedAction(ParserAction action,
const ParserState &state) const override;
// Performs the specified action on a given parser state, without adding the
// action to the state's history.
void PerformActionWithoutHistory(ParserAction action,
ParserState *state) const override;
// Returns true if the state is at the end of the input.
bool IsFinalState(const ParserState &state) const override;
// Returns a string representation of a parser action.
string ActionAsString(ParserAction action,
const ParserState &state) const override;
// Returns a new transition state to be used to enhance the parser state.
ParserTransitionState *NewTransitionState(bool training_mode) const override;
// Returns false, since no states are deterministic.
bool IsDeterministicState(const ParserState &state) const override {
return false;
}
private:
// Given a ParserState, decodes an action into a base action and a label.
void DecodeActionWithState(ParserAction action, const ParserState &state,
ParserAction *base_action, int *label) const;
// Given a ParserState, encodes a base action and a label into a single
// action value.
ParserAction EncodeActionWithState(ParserAction base_action, int label,
const ParserState &state) const;
};
} // namespace syntaxnet
#endif // SYNTAXNET_HEAD_LABEL_TRANSITIONS_H_
/* Copyright 2017 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <memory>
#include "syntaxnet/base.h"
#include "syntaxnet/parser_state.h"
#include "syntaxnet/parser_transitions.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/task_context.h"
#include "syntaxnet/term_frequency_map.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace {
const char kSentence[] = R"(
text: 'I saw a man with a telescope.'
token { word: 'I' start: 0 end: 0 tag: 'PRP' category: 'PRON'
head: 1 label: 'nsubj' break_level: NO_BREAK }
token { word: 'saw' start: 2 end: 4 tag: 'VBD' category: 'VERB'
label: 'ROOT' break_level: SPACE_BREAK }
token { word: 'a' start: 6 end: 6 tag: 'DT' category: 'DET'
head: 3 label: 'det' break_level: SPACE_BREAK }
token { word: 'man' start: 8 end: 10 tag: 'NN' category: 'NOUN'
head: 1 label: 'dobj' break_level: SPACE_BREAK }
token { word: 'with' start: 12 end: 15 tag: 'IN' category: 'ADP'
head: 1 label: 'prep' break_level: SPACE_BREAK }
token { word: 'a' start: 17 end: 17 tag: 'DT' category: 'DET'
head: 6 label: 'det' break_level: SPACE_BREAK }
token { word: 'telescope' start: 19 end: 27 tag: 'NN' category: 'NOUN'
head: 4 label: 'pobj' break_level: SPACE_BREAK }
token { word: '.' start: 28 end: 28 tag: '.' category: '.'
head: 1 label: 'p' break_level: NO_BREAK }
)";
class HeadLabelTransitionTest : public ::testing::Test {
public:
HeadLabelTransitionTest() {
transition_system_->Setup(&context_);
transition_system_->Init(&context_);
CHECK(TextFormat::ParseFromString(kSentence, &sentence_));
for (auto &token : sentence_.token()) label_map_.Increment(token.label());
state_.reset(new ParserState(
&sentence_, transition_system_->NewTransitionState(true), &label_map_));
}
protected:
TermFrequencyMap label_map_;
TaskContext context_;
std::unique_ptr<ParserTransitionSystem> transition_system_{
ParserTransitionSystem::Create("heads_labels")};
Sentence sentence_;
std::unique_ptr<ParserState> state_;
};
TEST_F(HeadLabelTransitionTest, TestPerformActionSelfRoot) {
const int current = state_->Next();
const int head = current;
const int label = state_->RootLabel();
const int action = head * state_->NumLabels() + label;
transition_system_->PerformActionWithoutHistory(action, state_.get());
EXPECT_EQ(state_->Head(current), -1);
EXPECT_EQ(state_->Label(current), label);
}
TEST_F(HeadLabelTransitionTest, TestPerformActionAssignRootOtherLabel) {
const int label = label_map_.LookupIndex("det", -1);
const int current = state_->Next();
const int head = current;
const int action = head * state_->NumLabels() + label;
transition_system_->PerformActionWithoutHistory(action, state_.get());
EXPECT_EQ(state_->Head(current), -1);
EXPECT_EQ(state_->Label(current), label);
}
TEST_F(HeadLabelTransitionTest, GoldParsesCorrectly) {
LOG(INFO) << "Initial parser state: " << state_->ToString();
while (!transition_system_->IsFinalState(*state_)) {
ParserAction action = transition_system_->GetNextGoldAction(*state_);
EXPECT_TRUE(transition_system_->IsAllowedAction(action, *state_));
LOG(INFO) << "Performing action " << action << ": "
<< transition_system_->ActionAsString(action, *state_);
transition_system_->PerformActionWithoutHistory(action, state_.get());
LOG(INFO) << "Parser state: " << state_->ToString();
}
for (int i = 0; i < state_->NumTokens(); ++i) {
EXPECT_EQ(state_->GoldHead(i), state_->Head(i));
EXPECT_EQ(state_->GoldLabel(i), state_->Label(i));
}
}
} // namespace
} // namespace syntaxnet
......@@ -30,17 +30,14 @@ namespace syntaxnet {
// Action A == i : Add a root arc to token i.
// Action A != i : Add an arc A -> i.
//
// Note that in nlp_saft.Document, root arcs are token.head() == -1, whereas
// Note that in the Sentence proto, root arcs are token.head() == -1, whereas
// here, we use a self-loop to represent roots.
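//
// Illustrative example (hypothetical values): with the input pointer at
// token 2, action 2 adds a root arc to token 2, while action 0 adds the
// arc 0 -> 2.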
class HeadTransitionSystem : public ParserTransitionSystem {
public:
class State; // defined in the .cc file
// Returns 1 for number of actions. This is because each action should be
// scored separately; e.g. instead of a fixed output set, we have a single
// scoring function.
int NumActionTypes() const override { return 1; }
int NumActions(int num_labels) const override { return 1; }
int NumActions(int num_labels) const override { return kDynamicNumActions; }
// Returns the default action, which is to assign itself as root.
ParserAction GetDefaultAction(const ParserState &state) const override;
......
......@@ -68,7 +68,8 @@ class HeadTransitionSystemTest : public ::testing::Test {
TEST_F(HeadTransitionSystemTest, Characteristics) {
EXPECT_EQ(1, transition_system_->NumActionTypes());
EXPECT_EQ(1, transition_system_->NumActions(10));
EXPECT_EQ(ParserTransitionSystem::kDynamicNumActions,
transition_system_->NumActions(10));
}
TEST_F(HeadTransitionSystemTest, GoldParsesCorrectly) {
......
......@@ -26,6 +26,7 @@ limitations under the License.
#include "syntaxnet/utils.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/env.h"
// A task that collects term statistics over a corpus and saves a set of
......
Parameter {
name: "brain_tokenizer_zh_embedding_dims"
value: "32;32"
}
Parameter {
name: "brain_tokenizer_zh_embedding_names"
value: "chars;words"
}
Parameter {
name: "brain_tokenizer_zh_features"
value: "input.char "
"input(1).char "
"input(2).char "
"input(3).char "
"input(-1).char "
"input(-2).char "
"input(-3).char "
"stack.char "
"stack.offset(1).char "
"stack.offset(-1).char "
"stack(1).char "
"stack(1).offset(1).char "
"stack(1).offset(-1).char "
"stack(2).char; "
"last-word(1,min-freq=2) "
"last-word(2,min-freq=2) "
"last-word(3,min-freq=2)"
}
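# The ';'-separated groups in "brain_tokenizer_zh_features" above align
# positionally with the ';'-separated entries of
# "brain_tokenizer_zh_embedding_names" and "brain_tokenizer_zh_embedding_dims":
# the char-based group maps to "chars" (32 dims) and the last-word group
# to "words" (32 dims).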
Parameter {
name: "brain_tokenizer_zh_transition_system"
value: "binary-segment-transitions"
}
input {
name: "word-map"
Part {
file_pattern: "last-word-map"
}
}
input {
name: "char-map"
Part {
file_pattern: "char-map"
}
}
input {
name: "label-map"
Part {
file_pattern: "label-map"
}
}
input {
name: 'stdin-untoken'
record_format: 'untokenized-text'
Part {
file_pattern: '-'
}
}
input {
name: 'stdout-conll'
record_format: 'conll-sentence'
Part {
file_pattern: '-'
}
}
Parameter {
name: "brain_tokenizer_embedding_dims"
value: "16;16;16"
}
Parameter {
name: "brain_tokenizer_embedding_names"
value: "chars;digits;puncts"
}
Parameter {
name: "brain_tokenizer_features"
value: "input.char "
"input(-1).char "
"input(1).char; "
"input.digit "
"input(-1).digit "
"input(1).digit; "
"input.punctuation-amount "
"input(-1).punctuation-amount "
"input(1).punctuation-amount "
}
Parameter {
name: "brain_tokenizer_transition_system"
value: "binary-segment-transitions"
}
Parameter {
name: "brain_morpher_embedding_dims"
value: "2;16;8;16;16;16;16;16;64"
}
Parameter {
name: "brain_morpher_embedding_names"
value: "capitalization;char_ngram;other;prefix2;prefix3;suffix2;suffix3;tags;words"
}
Parameter {
name: "brain_morpher_features"
value: "input.capitalization "
"input(1).capitalization "
"input(2).capitalization "
"input(3).capitalization "
"input(-1).capitalization "
"input(-2).capitalization "
"input(-3).capitalization "
"input(-4).capitalization; "
"input.token.char-ngram "
"input(1).token.char-ngram "
"input(2).token.char-ngram "
"input(3).token.char-ngram "
"input(-1).token.char-ngram "
"input(-2).token.char-ngram "
"input(-3).token.char-ngram "
"input(-4).token.char-ngram; "
"input.digit "
"input.hyphen "
"input.token.punctuation-amount "
"input.token.quote; "
"input.token.prefix(length=2) "
"input(1).token.prefix(length=2) "
"input(2).token.prefix(length=2) "
"input(3).token.prefix(length=2) "
"input(-1).token.prefix(length=2) "
"input(-2).token.prefix(length=2) "
"input(-3).token.prefix(length=2) "
"input(-4).token.prefix(length=2); "
"input.token.prefix(length=3) "
"input(1).token.prefix(length=3) "
"input(2).token.prefix(length=3) "
"input(3).token.prefix(length=3) "
"input(-1).token.prefix(length=3) "
"input(-2).token.prefix(length=3) "
"input(-3).token.prefix(length=3) "
"input(-4).token.prefix(length=3); "
"input.token.suffix(length=2) "
"input(1).token.suffix(length=2) "
"input(2).token.suffix(length=2) "
"input(3).token.suffix(length=2) "
"input(-1).token.suffix(length=2) "
"input(-2).token.suffix(length=2) "
"input(-3).token.suffix(length=2) "
"input(-4).token.suffix(length=2); "
"input.token.suffix(length=3) "
"input(1).token.suffix(length=3) "
"input(2).token.suffix(length=3) "
"input(3).token.suffix(length=3) "
"input(-1).token.suffix(length=3) "
"input(-2).token.suffix(length=3) "
"input(-3).token.suffix(length=3) "
"input(-4).token.suffix(length=3); "
"input(-1).pred-morph-tag "
"input(-2).pred-morph-tag "
"input(-3).pred-morph-tag "
"input(-4).pred-morph-tag; "
"input.token.word "
"input(1).token.word "
"input(2).token.word "
"input(3).token.word "
"input(-1).token.word "
"input(-2).token.word "
"input(-3).token.word "
"input(-4).token.word"
}
Parameter {
name: "brain_morpher_transition_system"
value: "morpher"
}
Parameter {
name: "brain_tagger_embedding_dims"
value: "2;16;8;16;16;16;16;16;64"
}
Parameter {
name: "brain_tagger_embedding_names"
value: "capitalization;char_ngram;other;prefix2;prefix3;suffix2;suffix3;tags;words"
}
Parameter {
name: "brain_tagger_features"
value: "input.capitalization "
"input(1).capitalization "
"input(2).capitalization "
"input(3).capitalization "
"input(-1).capitalization "
"input(-2).capitalization "
"input(-3).capitalization "
"input(-4).capitalization; "
"input.token.char-ngram "
"input(1).token.char-ngram "
"input(2).token.char-ngram "
"input(3).token.char-ngram "
"input(-1).token.char-ngram "
"input(-2).token.char-ngram "
"input(-3).token.char-ngram "
"input(-4).token.char-ngram; "
"input.digit "
"input.hyphen "
"input.token.punctuation-amount "
"input.token.quote; "
"input.token.prefix(length=2) "
"input(1).token.prefix(length=2) "
"input(2).token.prefix(length=2) "
"input(3).token.prefix(length=2) "
"input(-1).token.prefix(length=2) "
"input(-2).token.prefix(length=2) "
"input(-3).token.prefix(length=2) "
"input(-4).token.prefix(length=2); "
"input.token.prefix(length=3) "
"input(1).token.prefix(length=3) "
"input(2).token.prefix(length=3) "
"input(3).token.prefix(length=3) "
"input(-1).token.prefix(length=3) "
"input(-2).token.prefix(length=3) "
"input(-3).token.prefix(length=3) "
"input(-4).token.prefix(length=3); "
"input.token.suffix(length=2) "
"input(1).token.suffix(length=2) "
"input(2).token.suffix(length=2) "
"input(3).token.suffix(length=2) "
"input(-1).token.suffix(length=2) "
"input(-2).token.suffix(length=2) "
"input(-3).token.suffix(length=2) "
"input(-4).token.suffix(length=2); "
"input.token.suffix(length=3) "
"input(1).token.suffix(length=3) "
"input(2).token.suffix(length=3) "
"input(3).token.suffix(length=3) "
"input(-1).token.suffix(length=3) "
"input(-2).token.suffix(length=3) "
"input(-3).token.suffix(length=3) "
"input(-4).token.suffix(length=3); "
"input(-1).pred-tag "
"input(-2).pred-tag "
"input(-3).pred-tag "
"input(-4).pred-tag; "
"input.token.word "
"input(1).token.word "
"input(2).token.word "
"input(3).token.word "
"input(-1).token.word "
"input(-2).token.word "
"input(-3).token.word "
"input(-4).token.word"
}
Parameter {
name: "brain_tagger_transition_system"
value: "tagger"
}
Parameter {
name: "brain_parser_embedding_dims"
value: "32;32;32;64"
}
Parameter {
name: "brain_parser_embedding_names"
value: "labels;morphology;tags;words"
}
Parameter {
name: "brain_parser_features"
value: "stack.child(1).label "
"stack.child(1).sibling(-1).label "
"stack.child(-1).label "
"stack.child(-1).sibling(1).label "
"stack.child(2).label "
"stack.child(-2).label "
"stack(1).child(1).label "
"stack(1).child(1).sibling(-1).label "
"stack(1).child(-1).label "
"stack(1).child(-1).sibling(1).label "
"stack(1).child(2).label "
"stack(1).child(-2).label; "
"input.token.morphology-set "
"input(1).token.morphology-set "
"input(2).token.morphology-set "
"input(3).token.morphology-set "
"stack.token.morphology-set "
"stack.child(1).token.morphology-set "
"stack.child(1).sibling(-1).token.morphology-set "
"stack.child(-1).token.morphology-set "
"stack.child(-1).sibling(1).token.morphology-set "
"stack.child(2).token.morphology-set "
"stack.child(-2).token.morphology-set "
"stack(1).token.morphology-set "
"stack(1).child(1).token.morphology-set "
"stack(1).child(1).sibling(-1).token.morphology-set "
"stack(1).child(-1).token.morphology-set "
"stack(1).child(-1).sibling(1).token.morphology-set "
"stack(1).child(2).token.morphology-set "
"stack(1).child(-2).token.morphology-set "
"stack(2).token.morphology-set "
"stack(3).token.morphology-set; "
"input.token.tag "
"input(1).token.tag "
"input(2).token.tag "
"input(3).token.tag "
"stack.token.tag "
"stack.child(1).token.tag "
"stack.child(1).sibling(-1).token.tag "
"stack.child(-1).token.tag "
"stack.child(-1).sibling(1).token.tag "
"stack.child(2).token.tag "
"stack.child(-2).token.tag "
"stack(1).token.tag "
"stack(1).child(1).token.tag "
"stack(1).child(1).sibling(-1).token.tag "
"stack(1).child(-1).token.tag "
"stack(1).child(-1).sibling(1).token.tag "
"stack(1).child(2).token.tag "
"stack(1).child(-2).token.tag "
"stack(2).token.tag "
"stack(3).token.tag; "
"input.token.word "
"input(1).token.word "
"input(2).token.word "
"input(3).token.word "
"stack.token.word "
"stack.child(1).token.word "
"stack.child(1).sibling(-1).token.word "
"stack.child(-1).token.word "
"stack.child(-1).sibling(1).token.word "
"stack.child(2).token.word "
"stack.child(-2).token.word "
"stack(1).token.word "
"stack(1).child(1).token.word "
"stack(1).child(1).sibling(-1).token.word "
"stack(1).child(-1).token.word "
"stack(1).child(-1).sibling(1).token.word "
"stack(1).child(2).token.word "
"stack(1).child(-2).token.word "
"stack(2).token.word "
"stack(3).token.word "
}
Parameter {
name: "brain_parser_transition_system"
value: "arc-standard"
}
Parameter {
name: "join_category_to_pos"
value: "true"
}
input {
name: "word-map"
Part {
file_pattern: "word-map"
}
}
input {
name: "char-map"
Part {
file_pattern: "char-map"
}
}
input {
name: "tag-map"
Part {
file_pattern: "tag-map"
}
}
input {
name: "tag-to-category"
Part {
file_pattern: "tag-to-category"
}
}
input {
name: "label-map"
Part {
file_pattern: "label-map"
}
}
input {
name: "char-ngram-map"
Part {
file_pattern: "char-ngram-map"
}
}
input {
name: "prefix-table"
Part {
file_pattern: "prefix-table"
}
}
input {
name: "suffix-table"
Part {
file_pattern: "suffix-table"
}
}
input {
name: "morph-label-set"
Part {
file_pattern: "morph-label-set"
}
}
input {
name: "morphology-map"
Part {
file_pattern: "morphology-map"
}
}
input {
name: 'stdin'
record_format: 'tokenized-text'
Part {
file_pattern: '-'
}
}
input {
name: 'stdin-conll'
record_format: 'conll-sentence'
Part {
file_pattern: '-'
}
}
input {
name: 'stdin-untoken'
record_format: 'untokenized-text'
Part {
file_pattern: '-'
}
}
input {
name: 'stdout-conll'
record_format: 'conll-sentence'
Part {
file_pattern: '-'
}
}
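# Inputs whose Part has file_pattern '-' denote the process's standard
# input or output; the scripts below select them via the --input and
# --output flags.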
# A script that runs a morphological analyzer, a part-of-speech tagger and a
# dependency parser on a text file, with one sentence per line.
#
# Example usage:
# bazel build syntaxnet:parser_eval
# cat sentences.txt |
# syntaxnet/models/parsey_universal/parse.sh \
# $MODEL_DIRECTORY > output.conll
#
# To run on a conll formatted file, add the --conll command line argument:
# cat sentences.conll |
# syntaxnet/models/parsey_universal/parse.sh \
# --conll $MODEL_DIRECTORY > output.conll
#
# Models can be downloaded from
# http://download.tensorflow.org/models/parsey_universal/<language>.zip
# for the languages listed at
# https://github.com/tensorflow/models/blob/master/research/syntaxnet/universal.md
#
PARSER_EVAL=bazel-bin/syntaxnet/parser_eval
CONTEXT=syntaxnet/models/parsey_universal/context.pbtxt
if [[ "$1" == "--conll" ]]; then
INPUT_FORMAT=stdin-conll
shift
else
INPUT_FORMAT=stdin
fi
MODEL_DIR=$1
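# The three parser_eval stages below form a shell pipeline: the
# morphological analyzer reads $INPUT_FORMAT and writes CoNLL to stdout,
# which the tagger and then the parser consume via stdin-conll /
# stdout-conll.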
$PARSER_EVAL \
--input=$INPUT_FORMAT \
--output=stdout-conll \
--hidden_layer_sizes=64 \
--arg_prefix=brain_morpher \
--graph_builder=structured \
--task_context=$CONTEXT \
--resource_dir=$MODEL_DIR \
--model_path=$MODEL_DIR/morpher-params \
--slim_model \
--batch_size=1024 \
--alsologtostderr \
| \
$PARSER_EVAL \
--input=stdin-conll \
--output=stdout-conll \
--hidden_layer_sizes=64 \
--arg_prefix=brain_tagger \
--graph_builder=structured \
--task_context=$CONTEXT \
--resource_dir=$MODEL_DIR \
--model_path=$MODEL_DIR/tagger-params \
--slim_model \
--batch_size=1024 \
--alsologtostderr \
| \
$PARSER_EVAL \
--input=stdin-conll \
--output=stdout-conll \
--hidden_layer_sizes=512,512 \
--arg_prefix=brain_parser \
--graph_builder=structured \
--task_context=$CONTEXT \
--resource_dir=$MODEL_DIR \
--model_path=$MODEL_DIR/parser-params \
--slim_model \
--batch_size=1024 \
--alsologtostderr
# A script that runs a tokenizer on a text file with one sentence per line.
#
# Example usage:
# bazel build syntaxnet:parser_eval
# cat untokenized-sentences.txt |
# syntaxnet/models/parsey_universal/tokenize.sh \
# $MODEL_DIRECTORY > output.conll
#
# Models can be downloaded from
# http://download.tensorflow.org/models/parsey_universal/<language>.zip
# for the languages listed at
# https://github.com/tensorflow/models/blob/master/research/syntaxnet/universal.md
#
PARSER_EVAL=bazel-bin/syntaxnet/parser_eval
CONTEXT=syntaxnet/models/parsey_universal/context.pbtxt
INPUT_FORMAT=stdin-untoken
MODEL_DIR=$1
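# Single greedy tokenizer stage: reads untokenized text from stdin and
# writes the tokenized result to stdout in CoNLL format.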
$PARSER_EVAL \
--input=$INPUT_FORMAT \
--output=stdout-conll \
--hidden_layer_sizes=128,128 \
--arg_prefix=brain_tokenizer \
--graph_builder=greedy \
--task_context=$CONTEXT \
--resource_dir=$MODEL_DIR \
--model_path=$MODEL_DIR/tokenizer-params \
--batch_size=32 \
--alsologtostderr \
--slim_model
# A script that runs a traditional Chinese tokenizer on a text file with one
# sentence per line.
#
# Example usage:
# bazel build syntaxnet:parser_eval
# cat untokenized-sentences.txt |
# syntaxnet/models/parsey_universal/tokenize_zh.sh \
# $MODEL_DIRECTORY > output.conll
#
# The traditional Chinese model can be downloaded from
# http://download.tensorflow.org/models/parsey_universal/Chinese.zip
#
PARSER_EVAL=bazel-bin/syntaxnet/parser_eval
CONTEXT=syntaxnet/models/parsey_universal/context-tokenize-zh.pbtxt
INPUT_FORMAT=stdin-untoken
MODEL_DIR=$1
$PARSER_EVAL \
--input=$INPUT_FORMAT \
--output=stdout-conll \
--hidden_layer_sizes=256,256 \
--arg_prefix=brain_tokenizer_zh \
--graph_builder=structured \
--task_context=$CONTEXT \
--resource_dir=$MODEL_DIR \
--model_path=$MODEL_DIR/tokenizer-params \
--batch_size=1024 \
--alsologtostderr \
--slim_model
......@@ -43,8 +43,7 @@ class MorphologyLabelSet {
int Add(const TokenMorphology &morph);
// Look up an existing TokenMorphology. If it is not present, return -1.
// Note: This is slow, and should not be called outside of training workflow
// or init.
// Note: This is slow, and should not be called outside of training or init.
int LookupExisting(const TokenMorphology &morph) const;
// Return the TokenMorphology at position i. The input i should be in the
......