Commit 64675fc7 authored by calberti's avatar calberti Committed by GitHub
Browse files

New transition systems and features for syntaxnet (#301)

* Morpher and segmenter transition systems and new features (quotes, punctuation, capitalization, character ngrams, morphology attributes).
parent a591478c
......@@ -107,8 +107,8 @@ Bazel should complete reporting all tests passed.
You can also compile SyntaxNet in a [Docker](https://www.docker.com/what-docker)
container using this [Dockerfile](Dockerfile).
**Note:** If you are running Docker on OSX, make sure that you have enough memory allocated
for your Docker VM.
**Note:** If you are running Docker on OSX, make sure that you have enough
memory allocated for your Docker VM.
## Getting Started
......@@ -612,6 +612,7 @@ Original authors of the code in this package include (in alphabetical order):
* David Weiss
* Emily Pitler
* Greg Coppola
* Ji Ma
* Keith Hall
* Kuzman Ganchev
* Michael Collins
......
......@@ -158,6 +158,31 @@ cc_library(
],
)
cc_library(
name = "char_properties",
srcs = ["char_properties.cc"],
hdrs = ["char_properties.h"],
deps = [
":registry",
":utils",
"//util/utf8:unicodetext",
],
alwayslink = 1,
)
cc_library(
name = "segmenter_utils",
srcs = ["segmenter_utils.cc"],
hdrs = ["segmenter_utils.h"],
deps = [
":base",
":char_properties",
":sentence_proto",
"//util/utf8:unicodetext",
],
alwayslink = 1,
)
cc_library(
name = "feature_extractor",
srcs = ["feature_extractor.cc"],
......@@ -199,6 +224,7 @@ cc_library(
":affix",
":feature_extractor",
":registry",
":segmenter_utils",
],
)
......@@ -250,25 +276,51 @@ cc_library(
alwayslink = 1,
)
cc_library(
name = "morphology_label_set",
srcs = ["morphology_label_set.cc"],
hdrs = ["morphology_label_set.h"],
deps = [
":document_format",
":feature_extractor",
":proto_io",
":registry",
":sentence_proto",
":utils",
],
)
cc_library(
name = "parser_transitions",
srcs = [
"arc_standard_transitions.cc",
"binary_segment_state.cc",
"binary_segment_transitions.cc",
"morpher_transitions.cc",
"parser_features.cc",
"parser_state.cc",
"parser_transitions.cc",
"tagger_transitions.cc",
],
hdrs = [
"binary_segment_state.h",
"parser_features.h",
"parser_state.h",
"parser_transitions.h",
],
deps = [
":affix",
":feature_extractor",
":kbest_syntax_proto",
":morphology_label_set",
":registry",
":segmenter_utils",
":sentence_features",
":sentence_proto",
":shared_store",
":task_context",
":term_frequency_map",
":workspace",
],
alwayslink = 1,
)
......@@ -288,30 +340,12 @@ cc_library(
],
)
cc_library(
name = "parser_features",
srcs = ["parser_features.cc"],
hdrs = ["parser_features.h"],
deps = [
":affix",
":feature_extractor",
":parser_transitions",
":registry",
":sentence_features",
":task_context",
":term_frequency_map",
":workspace",
],
alwayslink = 1,
)
cc_library(
name = "embedding_feature_extractor",
srcs = ["embedding_feature_extractor.cc"],
hdrs = ["embedding_feature_extractor.h"],
deps = [
":feature_extractor",
":parser_features",
":parser_transitions",
":sparse_proto",
":task_context",
......@@ -326,7 +360,6 @@ cc_library(
deps = [
":embedding_feature_extractor",
":feature_extractor",
":parser_features",
":parser_transitions",
":sentence_proto",
":sparse_proto",
......@@ -344,7 +377,6 @@ cc_library(
"reader_ops.cc",
],
deps = [
":parser_features",
":parser_transitions",
":sentence_batch",
":sentence_proto",
......@@ -360,7 +392,6 @@ cc_library(
srcs = ["document_filters.cc"],
deps = [
":document_format",
":parser_features",
":parser_transitions",
":sentence_batch",
":sentence_proto",
......@@ -376,8 +407,8 @@ cc_library(
deps = [
":dictionary_proto",
":document_format",
":parser_features",
":parser_transitions",
":segmenter_utils",
":sentence_batch",
":sentence_proto",
":task_context",
......@@ -438,6 +469,18 @@ filegroup(
srcs = glob(["models/parsey_mcparseface/*"]),
)
cc_test(
name = "binary_segment_state_test",
size = "small",
srcs = ["binary_segment_state_test.cc"],
deps = [
":base",
":parser_transitions",
":term_frequency_map",
":test_main",
],
)
cc_test(
name = "shared_store_test",
size = "small",
......@@ -448,6 +491,26 @@ cc_test(
],
)
cc_test(
name = "char_properties_test",
srcs = ["char_properties_test.cc"],
deps = [
":char_properties",
":test_main",
],
)
cc_test(
name = "segmenter_utils_test",
srcs = ["segmenter_utils_test.cc"],
deps = [
":base",
":segmenter_utils",
":sentence_proto",
":test_main",
],
)
cc_test(
name = "sentence_features_test",
size = "medium",
......@@ -465,6 +528,15 @@ cc_test(
],
)
cc_test(
name = "morphology_label_set_test",
srcs = ["morphology_label_set_test.cc"],
deps = [
":morphology_label_set",
":test_main",
],
)
cc_test(
name = "arc_standard_transitions_test",
size = "small",
......@@ -479,6 +551,17 @@ cc_test(
],
)
cc_test(
name = "binary_segment_transitions_test",
size = "small",
srcs = ["binary_segment_transitions_test.cc"],
deps = [
":parser_transitions",
":sentence_proto",
":test_main",
],
)
cc_test(
name = "tagger_transitions_test",
size = "small",
......@@ -499,7 +582,6 @@ cc_test(
srcs = ["parser_features_test.cc"],
deps = [
":feature_extractor",
":parser_features",
":parser_transitions",
":populate_test_inputs",
":sentence_proto",
......
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/binary_segment_state.h"
#include <string>
#include "syntaxnet/segmenter_utils.h"
#include "syntaxnet/sentence.pb.h"
namespace syntaxnet {
// Returns a fresh instance. This class holds no data members of its own --
// the segmentation history is kept on the ParserState stack -- so there is
// nothing to copy.
ParserTransitionState *BinarySegmentState::Clone() const {
  return new BinarySegmentState();
}
// Renders the state as "[word word ...] remaining", where the bracketed part
// lists the words segmented so far (oldest first) and the tail is the
// unprocessed input characters.
string BinarySegmentState::ToString(const ParserState &state) const {
  string result("[");
  const int num_starts = NumStarts(state);
  for (int i = num_starts - 1; i >= 0; --i) {
    const int first = LastStart(i, state);
    int last;
    if (i > 0) {
      // The word ends right before the next more recent start.
      last = LastStart(i - 1, state) - 1;
    } else {
      // The most recent word extends to the last consumed character.
      last = state.EndOfInput() ? state.sentence().token_size() - 1
                                : state.Next() - 1;
    }
    for (int token = first; token <= last; ++token) {
      result.append(state.GetToken(token).word());
    }
    if (i != 0) result.append(" ");
  }
  result.append("] ");
  for (int token = state.Next(); token < state.NumTokens(); ++token) {
    result.append(state.GetToken(token).word());
  }
  return result;
}
// Rebuilds |sentence|'s tokens from the segmentation recorded in |state|:
// characters tagged as starts open a new word, other characters are appended
// to the current word, and break characters (spaces/tabs/newlines) are never
// emitted as part of any word. |rewrite_root_labels| is unused here.
void BinarySegmentState::AddParseToDocument(const ParserState &state,
                                            bool rewrite_root_labels,
                                            Sentence *sentence) const {
  if (sentence->token_size() == 0) return;

  // Mark which input characters were tagged as word starts.
  vector<bool> is_starts(sentence->token_size(), false);
  for (int i = 0; i < NumStarts(state); ++i) {
    is_starts[LastStart(i, state)] = true;
  }

  // Break level of the current token is determined based on its previous token.
  Token::BreakLevel break_level = Token::NO_BREAK;
  bool is_first_token = true;
  Sentence new_sentence;
  for (int i = 0; i < sentence->token_size(); ++i) {
    const Token &token = sentence->token(i);
    const string &word = token.word();
    bool is_break = SegmenterUtils::IsBreakChar(word);
    if (is_starts[i] || is_first_token) {
      if (!is_break) {
        // The current character is the first char of a new token/word.
        Token *new_token = new_sentence.add_token();
        new_token->set_start(token.start());
        new_token->set_end(token.end());
        new_token->set_word(word);
        // For the first token, keep the old break level to make sure that the
        // number of sentences stays unchanged.
        new_token->set_break_level(break_level);
        is_first_token = false;
      }
    } else {
      // Append the character to the previous token. Break characters are
      // skipped, so a word interrupted by spaces resumes in the same token.
      if (!is_break) {
        int index = new_sentence.token_size() - 1;
        auto *last_token = new_sentence.mutable_token(index);
        last_token->mutable_word()->append(word);
        last_token->set_end(token.end());
      }
    }
    // Update break level. Note we do not introduce new sentences in the
    // transition system, thus anything beyond a line break is reduced to a
    // line break.
    break_level = is_break ? SegmenterUtils::BreakLevel(word) : Token::NO_BREAK;
    if (break_level >= Token::LINE_BREAK) break_level = Token::LINE_BREAK;
  }
  // Replace the per-character tokens with the assembled words in place.
  sentence->mutable_token()->Swap(new_sentence.mutable_token());
}
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef SYNTAXNET_BINARY_SEGMENT_STATE_H_
#define SYNTAXNET_BINARY_SEGMENT_STATE_H_
#include "syntaxnet/parser_state.h"
#include "syntaxnet/parser_transitions.h"
namespace syntaxnet {
class Sentence;
// Parser state for binary segmentation transition system. The input of the
// system is a sequence of utf8 characters that are to be segmented into tokens.
// The system contains two types of transitions/actions:
// -START: the token at input is the first character of a new word.
// -MERGE: the token at input is to be merged with its previous token.
//
// A BinarySegmentState is used to store segmentation histories that can be used
// as features. In addition, it also provides the functionality to add
// segmentation results to the document. The function assumes that sentences in
// a document are processed in left-to-right order. See also the comments of
// the FinishDocument function for an explanation.
//
// Note on spaces:
// Spaces, or more generally break-characters, should never be any part of a
// word, and the START/MERGE of spaces would be ignored. In addition, if a space
// starts a new word, then the actual first char of that word is the first
// non-space token following the space.
// Some examples:
// -chars: ' ' A B
// -tags: S M M
// -result: 'AB'
//
// -chars: A ' ' B
// -tags: S M M
// -result: 'AB'
//
// -chars: A ' ' B
// -tags: S S M
// -result: 'AB'
//
// -chars: A B ' '
// -tags: S S M
// -result: 'A', 'B'
class BinarySegmentState : public ParserTransitionState {
 public:
  // Returns a fresh instance; this class has no data members to copy (all
  // history is on the ParserState stack).
  ParserTransitionState *Clone() const override;

  // No per-state initialization is required.
  void Init(ParserState *state) override {}

  // Returns the number of start tokens that have already been identified. In
  // other words, number of start tokens between the first token of the sentence
  // and state.Input(), with state.Input() excluded.
  static int NumStarts(const ParserState &state) {
    return state.StackSize();
  }

  // Returns the index of the k-th most recent start token.
  static int LastStart(int k, const ParserState &state) {
    DCHECK_GE(k, 0);
    DCHECK_LT(k, NumStarts(state));
    return state.Stack(k);
  }

  // Adds the token at given index as a new start token.
  static void AddStart(int index, ParserState *state) {
    state->Push(index);
  }

  // Adds segmentation results to the given sentence. Note that
  // rewrite_root_labels is unused by this transition state.
  void AddParseToDocument(const ParserState &state,
                          bool rewrite_root_labels,
                          Sentence *sentence) const override;

  // Whether a parsed token should be considered correct for evaluation.
  // Always true: per-token correctness is not meaningful for segmentation.
  bool IsTokenCorrect(const ParserState &state, int index) const override {
    return true;
  }

  // Returns a human readable string representation of this state.
  string ToString(const ParserState &state) const override;
};
} // namespace syntaxnet
#endif // SYNTAXNET_BINARY_SEGMENT_STATE_H_
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/binary_segment_state.h"
#include <memory>
#include "syntaxnet/base.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/term_frequency_map.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
// Fixture for BinarySegmentState tests: builds a sentence in which every
// token is a single utf8 character (or a space) with byte offsets into the
// text, mirroring the segmenter's character-per-token input format.
class BinarySegmentStateTest : public ::testing::Test {
 protected:
  void SetUp() override {
    // Prepare a sentence.
    const char *str_sentence = "text: '测试 的 句子' "
        "token { word: '测' start: 0 end: 2 } "
        "token { word: '试' start: 3 end: 5 } "
        "token { word: ' ' start: 6 end: 6 } "
        "token { word: '的' start: 7 end: 9 } "
        "token { word: ' ' start: 10 end: 10 } "
        "token { word: '句' start: 11 end: 13 } "
        "token { word: '子' start: 14 end: 16 } ";
    sentence_ = std::unique_ptr<Sentence>(new Sentence());
    TextFormat::ParseFromString(str_sentence, sentence_.get());
  }

  // The test document, parse tree, and sentence.
  std::unique_ptr<Sentence> sentence_;
  // Default-constructed (empty) label map passed to ParserState.
  TermFrequencyMap label_map_;
};
// Verifies the stack-backed bookkeeping: AddStart pushes an index, NumStarts
// counts the pushed starts, and LastStart(k) returns the k-th most recent one.
TEST_F(BinarySegmentStateTest, AddStartLastStartNumStartsTest) {
  // NOTE(review): ownership of segment_state appears to pass to ParserState
  // (pattern repeated in all these tests) -- confirm against ParserState.
  BinarySegmentState *segment_state = new BinarySegmentState();
  ParserState state(sentence_.get(), segment_state, &label_map_);

  // Test segment_state initialized with zero starts.
  EXPECT_EQ(0, segment_state->NumStarts(state));

  // Adding the first token as a start token.
  segment_state->AddStart(0, &state);
  ASSERT_EQ(1, segment_state->NumStarts(state));
  EXPECT_EQ(0, segment_state->LastStart(0, state));

  // Adding more starts; LastStart(0) is always the most recently added.
  segment_state->AddStart(2, &state);
  segment_state->AddStart(3, &state);
  segment_state->AddStart(4, &state);
  segment_state->AddStart(5, &state);
  ASSERT_EQ(5, segment_state->NumStarts(state));
  EXPECT_EQ(5, segment_state->LastStart(0, state));
  EXPECT_EQ(4, segment_state->LastStart(1, state));
  EXPECT_EQ(3, segment_state->LastStart(2, state));
  EXPECT_EQ(2, segment_state->LastStart(3, state));
  EXPECT_EQ(0, segment_state->LastStart(4, state));
}
// Exercises AddParseToDocument under three different tag sequences over the
// same sentence, checking resulting token counts and byte offsets each time.
TEST_F(BinarySegmentStateTest, AddParseToDocumentTest) {
  BinarySegmentState *segment_state = new BinarySegmentState();
  ParserState state(sentence_.get(), segment_state, &label_map_);

  // Test gold segmentation.
  // 0 1 2 3 4 5 6
  // 测 试 ' ' 的 ' ' 句 子
  // S M S S S S M
  segment_state->AddStart(0, &state);
  segment_state->AddStart(2, &state);
  segment_state->AddStart(3, &state);
  segment_state->AddStart(4, &state);
  segment_state->AddStart(5, &state);
  Sentence sentence_with_annotation = *sentence_;
  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);

  // Test the number of tokens as well as the start/end byte-offsets of each
  // token.
  ASSERT_EQ(3, sentence_with_annotation.token_size());
  // The first token is 测试.
  EXPECT_EQ(0, sentence_with_annotation.token(0).start());
  EXPECT_EQ(5, sentence_with_annotation.token(0).end());
  // The second token is 的.
  EXPECT_EQ(7, sentence_with_annotation.token(1).start());
  EXPECT_EQ(9, sentence_with_annotation.token(1).end());
  // The third token is 句子.
  EXPECT_EQ(11, sentence_with_annotation.token(2).start());
  EXPECT_EQ(16, sentence_with_annotation.token(2).end());

  // Test merge space to other tokens. Since spaces, or more generally break
  // characters, should never be a part of any word, they are skipped no matter
  // how they are tagged.
  // 0 1 2 3 4 5 6
  // 测 试 ' ' 的 ' ' 句 子
  // S M M S M M M
  while (!state.StackEmpty()) state.Pop();  // reset recorded starts
  segment_state->AddStart(0, &state);
  segment_state->AddStart(3, &state);
  sentence_with_annotation = *sentence_;
  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
  ASSERT_EQ(2, sentence_with_annotation.token_size());
  // The first token is 测试. Note even a space is tagged as "merge", it is not
  // attached to its previous word.
  EXPECT_EQ(0, sentence_with_annotation.token(0).start());
  EXPECT_EQ(5, sentence_with_annotation.token(0).end());
  // The second token is 的句子.
  EXPECT_EQ(7, sentence_with_annotation.token(1).start());
  EXPECT_EQ(16, sentence_with_annotation.token(1).end());

  // Test merge a token to space tokens. In such case, the current token would
  // be merged to the first non-space token on its left side.
  // 0 1 2 3 4 5 6
  // 测 试 ' ' 的 ' ' 句 子
  // S M S M S M M
  while (!state.StackEmpty()) state.Pop();  // reset recorded starts
  segment_state->AddStart(0, &state);
  segment_state->AddStart(2, &state);
  segment_state->AddStart(4, &state);
  sentence_with_annotation = *sentence_;
  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
  ASSERT_EQ(1, sentence_with_annotation.token_size());
  EXPECT_EQ(0, sentence_with_annotation.token(0).start());
  EXPECT_EQ(16, sentence_with_annotation.token(0).end());
}
// A sentence consisting solely of break characters must produce zero tokens,
// regardless of how the characters are tagged.
TEST_F(BinarySegmentStateTest, SpaceDocumentTest) {
  const char *str_sentence = "text: ' \t\t' "
      "token { word: ' ' start: 0 end: 0 } "
      "token { word: '\t' start: 1 end: 1 } "
      "token { word: '\t' start: 2 end: 2 } ";
  TextFormat::ParseFromString(str_sentence, sentence_.get());
  BinarySegmentState *segment_state = new BinarySegmentState();
  ParserState state(sentence_.get(), segment_state, &label_map_);

  // Break-chars should always be skipped, no matter how they are tagged.
  // 0 1 2
  //' ' '\t' '\t'
  // M M M
  Sentence sentence_with_annotation = *sentence_;
  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
  ASSERT_EQ(0, sentence_with_annotation.token_size());

  // Same input, but every character tagged as a start:
  // 0 1 2
  //' ' '\t' '\t'
  // S S S
  segment_state->AddStart(0, &state);
  segment_state->AddStart(1, &state);
  segment_state->AddStart(2, &state);
  sentence_with_annotation = *sentence_;
  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
  ASSERT_EQ(0, sentence_with_annotation.token_size());
}
// When the sentence begins with a space, the first real word starts at the
// first non-space character, whether or not the space was tagged as a start.
TEST_F(BinarySegmentStateTest, DocumentBeginWithSpaceTest) {
  const char *str_sentence = "text: ' 空格' "
      "token { word: ' ' start: 0 end: 0 } "
      "token { word: '空' start: 1 end: 3 } "
      "token { word: '格' start: 4 end: 6 } ";
  TextFormat::ParseFromString(str_sentence, sentence_.get());
  BinarySegmentState *segment_state = new BinarySegmentState();
  ParserState state(sentence_.get(), segment_state, &label_map_);

  // 0 1 2
  //' ' 空 格
  // M M M
  Sentence sentence_with_annotation = *sentence_;
  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
  ASSERT_EQ(1, sentence_with_annotation.token_size());
  // The first token is 空格.
  EXPECT_EQ(1, sentence_with_annotation.token(0).start());
  EXPECT_EQ(6, sentence_with_annotation.token(0).end());

  // Same result when the leading space is tagged as a start:
  // 0 1 2
  //' ' 空 格
  // S M M
  while (!state.StackEmpty()) state.Pop();  // reset recorded starts
  segment_state->AddStart(0, &state);
  sentence_with_annotation = *sentence_;
  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
  ASSERT_EQ(1, sentence_with_annotation.token_size());
  // The first token is 空格.
  EXPECT_EQ(1, sentence_with_annotation.token(0).start());
  EXPECT_EQ(6, sentence_with_annotation.token(0).end());
}
// An empty sentence (no tokens at all) must pass through unchanged with zero
// output tokens and no crash.
TEST_F(BinarySegmentStateTest, EmptyDocumentTest) {
  const char *str_sentence = "text: '' ";
  TextFormat::ParseFromString(str_sentence, sentence_.get());
  BinarySegmentState *segment_state = new BinarySegmentState();
  ParserState state(sentence_.get(), segment_state, &label_map_);
  Sentence sentence_with_annotation = *sentence_;
  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
  ASSERT_EQ(0, sentence_with_annotation.token_size());
}
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/binary_segment_state.h"
#include "syntaxnet/parser_state.h"
#include "syntaxnet/parser_transitions.h"
namespace syntaxnet {
// Given an input of utf8 characters, the BinarySegmentTransitionSystem
// conducts word segmentation by performing one of the following two actions:
// -START: starts a new word with the token at state.input, and also advances
// the state.input.
// -MERGE: adds the token at state.input to its previous word, and also advances
// state.input.
//
// Also see nlp/saft/components/segmentation/transition/binary-segment-state.h
// for examples on handling spaces.
class BinarySegmentTransitionSystem : public ParserTransitionSystem {
 public:
  BinarySegmentTransitionSystem() {}

  // Creates the transition state that records word-start positions.
  ParserTransitionState *NewTransitionState(bool train_mode) const override {
    return new BinarySegmentState();
  }

  // Action types for the segmentation-transition system.
  enum ParserActionType {
    START = 0,
    MERGE = 1,
    CARDINAL = 2
  };

  // Action codes, expressed via the enum constants (rather than magic
  // literals) so they cannot drift out of sync with ParserActionType.
  static int StartAction() { return START; }
  static int MergeAction() { return MERGE; }

  // The system always starts a new word by default.
  ParserAction GetDefaultAction(const ParserState &state) const override {
    return START;
  }

  // Returns the number of action types.
  int NumActionTypes() const override {
    return CARDINAL;
  }

  // Returns the number of possible actions.
  int NumActions(int num_labels) const override {
    return CARDINAL;
  }

  // Returns the next gold action for a given state according to the underlying
  // annotated sentence. The training data for the transition system is created
  // by the binary-segmenter-data task. If a token's break_level is NO_BREAK,
  // then it is a MERGE, START otherwise. The only exception is that the first
  // token in a sentence for the transition system is always a START.
  ParserAction GetNextGoldAction(const ParserState &state) const override {
    if (state.Next() == 0) return StartAction();
    const Token &token = state.GetToken(state.Next());
    return (token.break_level() != Token::NO_BREAK ?
            StartAction() : MergeAction());
  }

  // Both START and MERGE can be applied to any tokens in the sentence.
  bool IsAllowedAction(
      ParserAction action, const ParserState &state) const override {
    return true;
  }

  // Performs the specified action on a given parser state, without adding the
  // action to the state's history. START records a new word start; both
  // actions consume one input token.
  void PerformActionWithoutHistory(
      ParserAction action, ParserState *state) const override {
    // Note when the action is less than 0, it is treated as a START.
    if (action < 0 || action == StartAction()) {
      MutableTransitionState(state)->AddStart(state->Next(), state);
    }
    state->Advance();
  }

  // Allows backoff to best allowable transition.
  bool BackOffToBestAllowableTransition() const override { return true; }

  // A state is a deterministic state iff no tokens have been consumed.
  bool IsDeterministicState(const ParserState &state) const override {
    return state.Next() == 0;
  }

  // For binary segmentation, a state is a final state iff all tokens have been
  // consumed.
  bool IsFinalState(const ParserState &state) const override {
    return state.EndOfInput();
  }

  // Returns a string representation of a parser action.
  string ActionAsString(
      ParserAction action, const ParserState &state) const override {
    return action == StartAction() ? "START" : "MERGE";
  }

  // Downcasts the TransitionState in ParserState to a BinarySegmentState.
  static BinarySegmentState *MutableTransitionState(ParserState *state) {
    return static_cast<BinarySegmentState *>(state->mutable_transition_state());
  }
};
REGISTER_TRANSITION_SYSTEM("binary-segment-transitions",
BinarySegmentTransitionSystem);
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/binary_segment_state.h"
#include "syntaxnet/parser_state.h"
#include "syntaxnet/parser_transitions.h"
#include "syntaxnet/term_frequency_map.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
// Fixture for the binary segmentation transition system: builds a sentence of
// single-character tokens whose break levels encode the gold segmentation
// (SPACE_BREAK opens a word, NO_BREAK continues one).
class SegmentationTransitionTest : public ::testing::Test {
 protected:
  void SetUp() override {
    transition_system_ = std::unique_ptr<ParserTransitionSystem>(
        ParserTransitionSystem::Create("binary-segment-transitions"));
    // Prepare a sentence.
    const char *str_sentence = "text: '因为 有 这样' "
        "token { word: '因' start: 0 end: 2 break_level: SPACE_BREAK } "
        "token { word: '为' start: 3 end: 5 break_level: NO_BREAK } "
        "token { word: ' ' start: 6 end: 6 break_level: SPACE_BREAK } "
        "token { word: '有' start: 7 end: 9 break_level: SPACE_BREAK } "
        "token { word: ' ' start: 10 end: 10 break_level: SPACE_BREAK } "
        "token { word: '这' start: 11 end: 13 break_level: SPACE_BREAK } "
        "token { word: '样' start: 14 end: 16 break_level: NO_BREAK } ";
    sentence_ = std::unique_ptr<Sentence>(new Sentence());
    TextFormat::ParseFromString(str_sentence, sentence_.get());
  }

  // Asserts that the parser stack holds exactly the start indices in |target|,
  // most recent first. (An unused local vector was removed; the cast avoids a
  // signed/unsigned comparison in ASSERT_EQ.)
  void CheckStarts(const ParserState &state, const vector<int> &target) {
    ASSERT_EQ(state.StackSize(), static_cast<int>(target.size()));
    for (int i = 0; i < state.StackSize(); ++i) {
      EXPECT_EQ(state.Stack(i), target[i]);
    }
  }

  // The test document, parse tree, and sentence with tags and partial parses.
  std::unique_ptr<Sentence> sentence_;
  std::unique_ptr<ParserTransitionSystem> transition_system_;
  TermFrequencyMap label_map_;
};
// Drives the transition system with its own gold actions (derived from the
// tokens' break levels) and checks the recorded starts, the resulting words,
// and their byte offsets.
TEST_F(SegmentationTransitionTest, GoldNextActionTest) {
  BinarySegmentState *segment_state = static_cast<BinarySegmentState *>(
      transition_system_->NewTransitionState(true));
  ParserState state(sentence_.get(), segment_state, &label_map_);

  // Do segmentation by following the gold actions.
  while (transition_system_->IsFinalState(state) == false) {
    ParserAction action = transition_system_->GetNextGoldAction(state);
    transition_system_->PerformActionWithoutHistory(action, &state);
  }

  // Test STARTs (most recent first).
  CheckStarts(state, {5, 4, 3, 2, 0});

  // Test the annotated tokens.
  segment_state->AddParseToDocument(state, false, sentence_.get());
  ASSERT_EQ(sentence_->token_size(), 3);
  EXPECT_EQ(sentence_->token(0).word(), "因为");
  EXPECT_EQ(sentence_->token(1).word(), "有");
  EXPECT_EQ(sentence_->token(2).word(), "这样");

  // Test start/end annotation of each token.
  EXPECT_EQ(sentence_->token(0).start(), 0);
  EXPECT_EQ(sentence_->token(0).end(), 5);
  EXPECT_EQ(sentence_->token(1).start(), 7);
  EXPECT_EQ(sentence_->token(1).end(), 9);
  EXPECT_EQ(sentence_->token(2).start(), 11);
  EXPECT_EQ(sentence_->token(2).end(), 16);
}
// Drives the system with the default action (always START): every character
// becomes its own start, and every non-space character becomes its own word.
TEST_F(SegmentationTransitionTest, DefaultActionTest) {
  BinarySegmentState *segment_state = static_cast<BinarySegmentState *>(
      transition_system_->NewTransitionState(true));
  ParserState state(sentence_.get(), segment_state, &label_map_);

  // Apply the default action until the final state is reached.
  while (transition_system_->IsFinalState(state) == false) {
    ParserAction action = transition_system_->GetDefaultAction(state);
    transition_system_->PerformActionWithoutHistory(action, &state);
  }

  // Every character should be START.
  CheckStarts(state, {6, 5, 4, 3, 2, 1, 0});

  // Every non-space character should be a word.
  segment_state->AddParseToDocument(state, false, sentence_.get());
  ASSERT_EQ(sentence_->token_size(), 5);
  EXPECT_EQ(sentence_->token(0).word(), "因");
  EXPECT_EQ(sentence_->token(1).word(), "为");
  EXPECT_EQ(sentence_->token(2).word(), "有");
  EXPECT_EQ(sentence_->token(3).word(), "这");
  EXPECT_EQ(sentence_->token(4).word(), "样");
}
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// char_properties.cc - define is_X() tests for various character properties
//
// See char_properties.h for how to write a character property.
//
// References for the char sets below:
//
// . http://www.unicode.org/Public/UNIDATA/PropList.txt
//
// Large (but not exhaustive) list of Unicode chars and their "properties"
// (e.g., the property "Pi" = an initial quote punctuation char).
//
// . http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
//
// Defines the list of properties, such as "Pi", used in the above list.
//
// . http://www.unipad.org/unimap/index.php?param_char=XXXX&page=detail
//
// Gives detail about a particular character code.
// XXXX is a 4-hex-digit Unicode character code.
//
// . http://www.unicode.org/Public/UNIDATA/UCD.html
//
// General reference for Unicode characters.
//
#include "syntaxnet/char_properties.h"
#include <ctype.h> // for ispunct, isspace
#include <memory>
#include <utility>
#include <vector> // for vector
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "third_party/utf/utf.h" // for runetochar, ::UTFmax, Rune
#include "util/utf8/unilib.h" // for IsValidCodepoint, etc
#include "util/utf8/unilib_utf8_utils.h"
//============================================================
// CharPropertyImplementation
//
// A CharPropertyImplementation stores a set of Unicode characters,
// encoded in UTF-8, as a trie. The trie is represented as a vector
// of nodes. Each node is a 256-element array that specifies what to
// do with one byte of the UTF-8 sequence. Each element n of a node
// is one of:
// n = 0, indicating that the Property is not true of any
// character whose UTF-8 encoding includes this byte at
// this position
// n = -1, indicating that the Property is true for the UTF-8 sequence
// that ends with this byte.
// n > 0, indicating the index of the row that describes the
// remaining bytes in the UTF-8 sequence.
//
// The only operation that needs to be fast is HoldsFor, which tests
// whether a character has a given property. We use each byte of the
// character's UTF-8 encoding to index into a row. If the value is 0,
// then the property is not true for the character. (We might discover
// this even before getting to the end of the sequence.) If the value
// is -1, then the property is true for this character. Otherwise,
// the value is the index of another row, which we index using the next
// byte in the sequence, and so on. The design of UTF-8 prevents
// ambiguities here; no prefix of a UTF-8 sequence is a valid UTF-8
// sequence.
//
// While it is possible to implement an iterator for this representation,
// it is much easier to use set<char32> for this purpose. In fact, we
// would use that as the entire representation, were it not for concerns
// that HoldsFor might be slower.
namespace syntaxnet {
// Stores the property's character set twice: as a plain set of code points
// (presumably to support iteration, per the implementation note above --
// confirm against the rest of the file) and as a byte-trie over UTF-8
// sequences for fast HoldsFor lookups. See the file-level comment for the
// trie encoding (0 = absent, -1 = terminal, >0 = next row index).
struct CharPropertyImplementation {
  unordered_set<char32> chars;
  vector<vector<int> > rows;

  CharPropertyImplementation() {
    // Start with a single root row; each row is a 256-way byte-indexed node.
    rows.reserve(10);
    rows.resize(1);
    rows[0].resize(256, 0);
  }

  // Inserts the |len|-byte UTF-8 sequence at |buf| into the trie. CHECK-fails
  // if the sequence is a strict prefix or extension of a previously added
  // sequence (valid UTF-8 never is; see the note on UTF-8 above).
  void AddChar(char *buf, int len) {
    int n = 0;  // current row index; 0 is the root row
    for (int i = 0; i < len; ++i) {
      int ch = reinterpret_cast<unsigned char *>(buf)[i];
      int m = rows[n][ch];
      if (m > 0) {
        // Follow the existing branch; it must not be our last byte.
        CHECK_LT(i, len - 1)
            << " : " << (i + 1) << "-byte UTF-8 sequence "
            << "(" << tensorflow::str_util::CEscape(string(buf, i + 1)) << ")"
            << " is prefix of previously-seen UTF-8 sequence(s)";
        n = m;
      } else if (i == len - 1) {
        // Final byte: mark this cell as a terminal.
        rows[n][ch] = -1;
      } else {
        // Allocate a fresh row for the remainder of the sequence.
        CHECK_EQ(m, 0) << " : UTF-8 sequence is extension of previously-seen "
                       << (i + 1) << "-byte UTF-8 sequence "
                       << "("
                       << tensorflow::str_util::CEscape(string(buf, i + 1))
                       << ")";
        int a = rows.size();
        rows.resize(a + 1);
        rows[a].resize(256, 0);
        rows[n][ch] = a;
        n = a;
      }
    }
  }

  // Returns true iff the UTF-8 sequence starting at |buf| is in the trie.
  // Walks at most 4 bytes (the UTF-8 maximum), deliberately unrolled.
  bool HoldsFor(const char *buf) const {
    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(buf);
    // Lookup each byte of the UTF-8 sequence, starting in row 0.
    int n = rows[0][*bytes];
    if (n == 0) return false;
    if (n == -1) return true;
    // If the value is not 0 or -1, then it is the index of the row for the
    // second byte in the sequence.
    n = rows[n][*++bytes];
    if (n == 0) return false;
    if (n == -1) return true;
    n = rows[n][*++bytes];  // Likewise for the third byte.
    if (n == 0) return false;
    if (n == -1) return true;
    n = rows[n][*++bytes];  // Likewise for the fourth byte.
    if (n == 0) return false;
    // Since there can be at most 4 bytes in the sequence, n must be -1.
    return true;
    // Implementation note: it is possible (and perhaps clearer) to write this
    // code as a loop, "for (int i = 0; i < 4; ++i) ...", but the TestHoldsFor
    // benchmark results indicate that doing so produces slower code for
    // anything other than short 7-bit ASCII strings (< 512 bytes). This is
    // mysterious, since the compiler unrolls the loop, producing code that
    // is almost the same as what we have here, except for the shortcut on
    // the 4th byte.
  }
};
//============================================================
// CharProperty - a property that holds for selected Unicode chars
//
// Set-based constructor: populates the property from an explicit list of
// Unicode code points, possibly containing RANGE() markers.
CharProperty::CharProperty(const char *name, const int *unicodes,
                           int num_unicodes)
    : name_(name), impl_(new CharPropertyImplementation) {
  AddCharSpec(unicodes, num_unicodes);
}
// Function-based constructor: delegates population of the property to a
// caller-supplied initializer function.
CharProperty::CharProperty(const char *name, CharPropertyInitializer *init_fn)
    : name_(name), impl_(new CharPropertyImplementation) {
  (*init_fn)(this);
}
// Releases the trie/set representation owned by this property.
CharProperty::~CharProperty() {
  delete impl_;
}
void CharProperty::AddChar(int c) {
CheckUnicodeVal(c);
impl_->chars.insert(c);
char buf[UTFmax];
Rune r = c;
int len = runetochar(buf, &r);
impl_->AddChar(buf, len);
}
// Adds every code point in the inclusive range [c1, c2].
void CharProperty::AddCharRange(int c1, int c2) {
  int c = c1;
  while (c <= c2) {
    AddChar(c);
    ++c;
  }
}
// Adds every value in [0, 255] for which the predicate returns nonzero.
void CharProperty::AddAsciiPredicate(AsciiPredicate *pred) {
  for (int c = 0; c <= 255; ++c) {
    if ((*pred)(c)) AddChar(c);
  }
}
void CharProperty::AddCharProperty(const char *propname) {
const CharProperty *prop = CharProperty::Lookup(propname);
CHECK(prop != NULL) << ": unknown char property \"" << propname
<< "\" in " << name_;
int c = -1;
while ((c = prop->NextElementAfter(c)) >= 0) {
AddChar(c);
}
}
// Adds the code points described by a spec array, which is a flat list of
// code points interleaved with RANGE() markers (a range is encoded as the
// four values kPreUnicodeRange, lower, upper, kPostUnicodeRange).
void CharProperty::AddCharSpec(const int *unicodes, int num_unicodes) {
  int i = 0;
  while (i < num_unicodes) {
    const bool is_range = i + 3 < num_unicodes &&
                          unicodes[i] == kPreUnicodeRange &&
                          unicodes[i + 3] == kPostUnicodeRange;
    if (is_range) {
      const int lower = unicodes[i + 1];
      const int upper = unicodes[i + 2];
      CHECK(lower <= upper) << ": invalid char range in " << name_
                            << ": [" << UnicodeToString(lower) << ", "
                            << UnicodeToString(upper) << "]";
      AddCharRange(lower, upper);
      i += 4;  // skip the two markers and the two bounds
    } else {
      AddChar(unicodes[i]);
      ++i;
    }
  }
}
bool CharProperty::HoldsFor(int c) const {
if (!UniLib::IsValidCodepoint(c)) return false;
char buf[UTFmax];
Rune r = c;
runetochar(buf, &r);
return impl_->HoldsFor(buf);
}
// Returns true iff the property holds for the single UTF-8 character in
// (str, len). Malformed or empty input yields false.
bool CharProperty::HoldsFor(const char *str, int len) const {
  if (len <= 0) return false;
  // UniLib::IsUTF8ValidCodepoint also checks for structural validity.
  if (!UniLib::IsUTF8ValidCodepoint(StringPiece(str, len))) return false;
  return impl_->HoldsFor(str);
}
// Return -1 or the smallest Unicode char greater than c for which
// the CharProperty holds. Expects c == -1 or HoldsFor(c).
int CharProperty::NextElementAfter(int c) const {
DCHECK(c == -1 || HoldsFor(c));
unordered_set<char32>::const_iterator end = impl_->chars.end();
if (c < 0) {
unordered_set<char32>::const_iterator it = impl_->chars.begin();
if (it == end) return -1;
return *it;
}
char32 r = c;
unordered_set<char32>::const_iterator it = impl_->chars.find(r);
if (it == end) return -1;
it++;
if (it == end) return -1;
return *it;
}
REGISTER_CLASS_REGISTRY("char property wrapper", CharPropertyWrapper);
// Returns NULL or the registered CharProperty with the given name.
const CharProperty *CharProperty::Lookup(const char *subclass) {
  // The wrapper exists only long enough to hand us the CharProperty; the
  // property itself is owned by a lazy static and outlives the wrapper.
  std::unique_ptr<CharPropertyWrapper> wrapper(
      CharPropertyWrapper::Create(subclass));
  if (!wrapper) {
    LOG(ERROR) << "CharPropertyWrapper not found for subclass: "
               << "\"" << subclass << "\"";
    return NULL;
  }
  return wrapper->GetCharProperty();
}
// Check that a given Unicode value is in range.
// CHECK-fails (with the property name and offending value) otherwise.
void CharProperty::CheckUnicodeVal(int c) const {
  CHECK(UniLib::IsValidCodepoint(c))
      << "Unicode in " << name_ << " out of range: " << UnicodeToString(c);
}
// Converts a Unicode value to a string (for error messages).
// ASCII values render as quoted chars, BMP values as 4-digit hex,
// larger values as plain hex, and negative values as decimal.
string CharProperty::UnicodeToString(int c) {
  if (c < 0) {
    return tensorflow::strings::Printf("%d", c);  // out-of-range
  }
  if (c <= 0x7f) {
    return tensorflow::strings::Printf("'%c'", c);  // ascii
  }
  if (c <= 0xffff) {
    return tensorflow::strings::Printf("0x%04X", c);  // 4 hex digits
  }
  return tensorflow::strings::Printf("0x%X", c);  // wider hex
}
//======================================================================
// Expression-level punctuation
//
// Punctuation that starts a sentence.
DEFINE_CHAR_PROPERTY_AS_SET(start_sentence_punc,
  0x00A1,  // Spanish inverted exclamation mark
  0x00BF,  // Spanish inverted question mark
)
// Punctuation that ends a sentence.
// Based on: http://www.unicode.org/unicode/reports/tr29/#Sentence_Boundaries
DEFINE_CHAR_PROPERTY_AS_SET(end_sentence_punc,
  '.',
  '!',
  '?',
  0x055C,  // Armenian exclamation mark
  0x055E,  // Armenian question mark
  0x0589,  // Armenian full stop
  0x061F,  // Arabic question mark
  0x06D4,  // Arabic full stop
  0x0700,  // Syriac end of paragraph
  0x0701,  // Syriac supralinear full stop
  0x0702,  // Syriac sublinear full stop
  RANGE(0x0964, 0x0965),  // Devanagari danda..Devanagari double danda
  0x1362,  // Ethiopic full stop
  0x1367,  // Ethiopic question mark
  0x1368,  // Ethiopic paragraph separator
  0x104A,  // Myanmar sign little section
  0x104B,  // Myanmar sign section
  0x166E,  // Canadian syllabics full stop
  0x17d4,  // Khmer sign khan
  0x1803,  // Mongolian full stop
  0x1809,  // Mongolian Manchu full stop
  0x1944,  // Limbu exclamation mark
  0x1945,  // Limbu question mark
  0x203C,  // double exclamation mark
  0x203D,  // interrobang
  0x2047,  // double question mark
  0x2048,  // question exclamation mark
  0x2049,  // exclamation question mark
  0x3002,  // ideographic full stop
  0x037E,  // Greek question mark
  0xFE52,  // small full stop
  0xFE56,  // small question mark
  0xFE57,  // small exclamation mark
  0xFF01,  // fullwidth exclamation mark
  0xFF0E,  // fullwidth full stop
  0xFF1F,  // fullwidth question mark
  0xFF61,  // halfwidth ideographic full stop
  0x2026,  // ellipsis
)
// Punctuation, such as parens, that opens a "nested expression" of text.
DEFINE_CHAR_PROPERTY_AS_SET(open_expr_punc,
  '(',
  '[',
  '<',
  '{',
  0x207D,  // superscript left parenthesis
  0x208D,  // subscript left parenthesis
  0x27E6,  // mathematical left white square bracket
  0x27E8,  // mathematical left angle bracket
  0x27EA,  // mathematical left double angle bracket
  0x2983,  // left white curly bracket
  0x2985,  // left white parenthesis
  0x2987,  // Z notation left image bracket
  0x2989,  // Z notation left binding bracket
  0x298B,  // left square bracket with underbar
  0x298D,  // left square bracket with tick in top corner
  0x298F,  // left square bracket with tick in bottom corner
  0x2991,  // left angle bracket with dot
  0x2993,  // left arc less-than bracket
  0x2995,  // double left arc greater-than bracket
  0x2997,  // left black tortoise shell bracket
  0x29D8,  // left wiggly fence
  0x29DA,  // left double wiggly fence
  0x29FC,  // left-pointing curved angle bracket
  0x3008,  // CJK left angle bracket
  0x300A,  // CJK left double angle bracket
  0x3010,  // CJK left black lenticular bracket
  0x3014,  // CJK left tortoise shell bracket
  0x3016,  // CJK left white lenticular bracket
  0x3018,  // CJK left white tortoise shell bracket
  0x301A,  // CJK left white square bracket
  0xFD3E,  // Ornate left parenthesis
  0xFE59,  // small left parenthesis
  0xFE5B,  // small left curly bracket
  0xFF08,  // fullwidth left parenthesis
  0xFF3B,  // fullwidth left square bracket
  0xFF5B,  // fullwidth left curly bracket
)
// Punctuation, such as parens, that closes a "nested expression" of text.
// Kept in one-to-one correspondence with open_expr_punc above.
DEFINE_CHAR_PROPERTY_AS_SET(close_expr_punc,
  ')',
  ']',
  '>',
  '}',
  0x207E,  // superscript right parenthesis
  0x208E,  // subscript right parenthesis
  0x27E7,  // mathematical right white square bracket
  0x27E9,  // mathematical right angle bracket
  0x27EB,  // mathematical right double angle bracket
  0x2984,  // right white curly bracket
  0x2986,  // right white parenthesis
  0x2988,  // Z notation right image bracket
  0x298A,  // Z notation right binding bracket
  0x298C,  // right square bracket with underbar
  0x298E,  // right square bracket with tick in top corner
  0x2990,  // right square bracket with tick in bottom corner
  0x2992,  // right angle bracket with dot
  0x2994,  // right arc greater-than bracket
  0x2996,  // double right arc less-than bracket
  0x2998,  // right black tortoise shell bracket
  0x29D9,  // right wiggly fence
  0x29DB,  // right double wiggly fence
  0x29FD,  // right-pointing curved angle bracket
  0x3009,  // CJK right angle bracket
  0x300B,  // CJK right double angle bracket
  0x3011,  // CJK right black lenticular bracket
  0x3015,  // CJK right tortoise shell bracket
  0x3017,  // CJK right white lenticular bracket
  0x3019,  // CJK right white tortoise shell bracket
  0x301B,  // CJK right white square bracket
  0xFD3F,  // Ornate right parenthesis
  0xFE5A,  // small right parenthesis
  0xFE5C,  // small right curly bracket
  0xFF09,  // fullwidth right parenthesis
  0xFF3D,  // fullwidth right square bracket
  0xFF5D,  // fullwidth right curly bracket
)
// Chars that open a quotation.
// Based on: http://www.unicode.org/uni2book/ch06.pdf
// NB: several marks appear in both open_quote and close_quote because
// their role differs by language (e.g. U+2019 closes in English but
// opens in Danish/Finnish/Swedish/Norwegian usage).
DEFINE_CHAR_PROPERTY_AS_SET(open_quote,
  '"',
  '\'',
  '`',
  0xFF07,  // fullwidth apostrophe
  0xFF02,  // fullwidth quotation mark
  0x2018,  // left single quotation mark (English, others)
  0x201C,  // left double quotation mark (English, others)
  0x201B,  // single high-reversed-9 quotation mark (PropList.txt)
  0x201A,  // single low-9 quotation mark (Czech, German, Slovak)
  0x201E,  // double low-9 quotation mark (Czech, German, Slovak)
  0x201F,  // double high-reversed-9 quotation mark (PropList.txt)
  0x2019,  // right single quotation mark (Danish, Finnish, Swedish, Norw.)
  0x201D,  // right double quotation mark (Danish, Finnish, Swedish, Norw.)
  0x2039,  // single left-pointing angle quotation mark (French, others)
  0x00AB,  // left-pointing double angle quotation mark (French, others)
  0x203A,  // single right-pointing angle quotation mark (Slovenian, others)
  0x00BB,  // right-pointing double angle quotation mark (Slovenian, others)
  0x300C,  // left corner bracket (East Asian languages)
  0xFE41,  // presentation form for vertical left corner bracket
  0xFF62,  // halfwidth left corner bracket (East Asian languages)
  0x300E,  // left white corner bracket (East Asian languages)
  0xFE43,  // presentation form for vertical left white corner bracket
  0x301D,  // reversed double prime quotation mark (East Asian langs, horiz.)
)
// Chars that close a quotation.
// Based on: http://www.unicode.org/uni2book/ch06.pdf
DEFINE_CHAR_PROPERTY_AS_SET(close_quote,
  '\'',
  '"',
  '`',
  0xFF07,  // fullwidth apostrophe
  0xFF02,  // fullwidth quotation mark
  0x2019,  // right single quotation mark (English, others)
  0x201D,  // right double quotation mark (English, others)
  0x2018,  // left single quotation mark (Czech, German, Slovak)
  0x201C,  // left double quotation mark (Czech, German, Slovak)
  0x203A,  // single right-pointing angle quotation mark (French, others)
  0x00BB,  // right-pointing double angle quotation mark (French, others)
  0x2039,  // single left-pointing angle quotation mark (Slovenian, others)
  0x00AB,  // left-pointing double angle quotation mark (Slovenian, others)
  0x300D,  // right corner bracket (East Asian languages)
  0xfe42,  // presentation form for vertical right corner bracket
  0xFF63,  // halfwidth right corner bracket (East Asian languages)
  0x300F,  // right white corner bracket (East Asian languages)
  0xfe44,  // presentation form for vertical right white corner bracket
  0x301F,  // low double prime quotation mark (East Asian languages)
  0x301E,  // close double prime (East Asian languages written horizontally)
)
// Punctuation chars that open an expression or a quotation.
DEFINE_CHAR_PROPERTY(open_punc, prop) {
  prop->AddCharProperty("open_expr_punc");
  prop->AddCharProperty("open_quote");
}
// Punctuation chars that close an expression or a quotation.
DEFINE_CHAR_PROPERTY(close_punc, prop) {
  prop->AddCharProperty("close_expr_punc");
  prop->AddCharProperty("close_quote");
}
// Punctuation chars that can come at the beginning of a sentence.
DEFINE_CHAR_PROPERTY(leading_sentence_punc, prop) {
  prop->AddCharProperty("open_punc");
  prop->AddCharProperty("start_sentence_punc");
}
// Punctuation chars that can come at the end of a sentence.
DEFINE_CHAR_PROPERTY(trailing_sentence_punc, prop) {
  prop->AddCharProperty("close_punc");
  prop->AddCharProperty("end_sentence_punc");
}
//======================================================================
// Special symbols
//
// Currency symbols.
// From: http://www.unicode.org/charts/PDF/U20A0.pdf
DEFINE_CHAR_PROPERTY_AS_SET(currency_symbol,
  '$',
  // 0x00A2,  // cents (NB: typically FOLLOWS the amount)
  0x00A3,  // pounds and liras
  0x00A4,  // general currency sign
  0x00A5,  // yen or yuan
  0x0192,  // Dutch florin (latin small letter "f" with hook)
  0x09F2,  // Bengali rupee mark
  0x09F3,  // Bengali rupee sign
  0x0AF1,  // Gujarati rupee sign
  0x0BF9,  // Tamil rupee sign
  0x0E3F,  // Thai baht
  0x17DB,  // Khmer riel
  0x20A0,  // alternative euro sign
  0x20A1,  // Costa Rica, El Salvador (colon sign)
  0x20A2,  // Brazilian cruzeiro
  0x20A3,  // French Franc
  0x20A4,  // alternative lira sign
  0x20A5,  // mill sign (USA 1/10 cent)
  0x20A6,  // Nigerian Naira
  0x20A7,  // Spanish peseta
  0x20A8,  // Indian rupee
  0x20A9,  // Korean won
  0x20AA,  // Israeli new sheqel
  0x20AB,  // Vietnam dong
  0x20AC,  // euro sign
  0x20AD,  // Laotian kip
  0x20AE,  // Mongolian tugrik
  0x20AF,  // Greek drachma
  0x20B0,  // German penny
  0x20B1,  // Philippine peso (Mexican peso uses "$")
  0x2133,  // Old German mark (script capital M)
  0xFDFC,  // rial sign
  0xFFE0,  // fullwidth cents
  0xFFE1,  // fullwidth pounds
  0xFFE5,  // fullwidth Japanese yen
  0xFFE6,  // fullwidth Korean won
)
// Chinese bookquotes.
// They look like "<<" and ">>" except that they are single UTF8 chars
// (U+300A, U+300B). These are used in Chinese as special
// punctuation, referring to the title of a book, an article, a movie,
// etc. For example: "cellphone" means cellphone, but <<cellphone>>
// means (exclusively) the movie.
DEFINE_CHAR_PROPERTY_AS_SET(open_bookquote,
  0x300A
)
DEFINE_CHAR_PROPERTY_AS_SET(close_bookquote,
  0x300B
)
//======================================================================
// Token-level punctuation
//
// Token-prefix symbols, excluding currency symbols -- glom on
// to following token (esp. if no space after)
DEFINE_CHAR_PROPERTY_AS_SET(noncurrency_token_prefix_symbol,
  '#',
  0x2116,  // numero sign ("No")
)
// Token-prefix symbols -- glom on to following token (esp. if no space after)
DEFINE_CHAR_PROPERTY(token_prefix_symbol, prop) {
  prop->AddCharProperty("currency_symbol");
  prop->AddCharProperty("noncurrency_token_prefix_symbol");
}
// Token-suffix symbols -- glom on to preceding token (esp. if no space before)
DEFINE_CHAR_PROPERTY_AS_SET(token_suffix_symbol,
  '%',
  0x066A,  // Arabic percent sign
  0x2030,  // per mille
  0x2031,  // per ten thousand
  0x00A2,  // cents sign
  0x2125,  // ounces sign
  0x00AA,  // feminine ordinal indicator (Spanish)
  0x00BA,  // masculine ordinal indicator (Spanish)
  0x00B0,  // degrees
  0x2109,  // degrees Fahrenheit
  0x2103,  // degrees Celsius
  0x2126,  // ohms
  0x212A,  // Kelvin
  0x212B,  // Angstroms ("A" with circle on top)
  0x00A9,  // copyright
  0x2117,  // sound recording copyright (circled "P")
  0x2122,  // trade mark
  0x00AE,  // registered trade mark
  0x2120,  // service mark
  0x2106,  // cada una ("c/a" == "each" in Spanish)
  0x2020,  // dagger (can be used for footnotes)
  0x2021,  // double dagger (can be used for footnotes)
)
// Subscripts (U+2080..U+208E).
DEFINE_CHAR_PROPERTY_AS_SET(subscript_symbol,
  0x2080,  // subscript 0
  0x2081,  // subscript 1
  0x2082,  // subscript 2
  0x2083,  // subscript 3
  0x2084,  // subscript 4
  0x2085,  // subscript 5
  0x2086,  // subscript 6
  0x2087,  // subscript 7
  0x2088,  // subscript 8
  0x2089,  // subscript 9
  0x208A,  // subscript "+"
  0x208B,  // subscript "-"
  0x208C,  // subscript "="
  0x208D,  // subscript "("
  0x208E,  // subscript ")"
)
// Superscripts (the digits are scattered across Latin-1 and U+2070..U+207F).
DEFINE_CHAR_PROPERTY_AS_SET(superscript_symbol,
  0x2070,  // superscript 0
  0x00B9,  // superscript 1
  0x00B2,  // superscript 2
  0x00B3,  // superscript 3
  0x2074,  // superscript 4
  0x2075,  // superscript 5
  0x2076,  // superscript 6
  0x2077,  // superscript 7
  0x2078,  // superscript 8
  0x2079,  // superscript 9
  0x2071,  // superscript Latin small "i"
  0x207A,  // superscript "+"
  0x207B,  // superscript "-"
  0x207C,  // superscript "="
  0x207D,  // superscript "("
  0x207E,  // superscript ")"
  0x207F,  // superscript Latin small "n"
)
//======================================================================
// General punctuation
//
// Connector punctuation
// Code Pc from http://www.unicode.org/Public/UNIDATA/PropList.txt
// NB: This list is not necessarily exhaustive.
DEFINE_CHAR_PROPERTY_AS_SET(connector_punc,
  0x30fb,  // Katakana middle dot
  0xff65,  // halfwidth Katakana middle dot
  0x2040,  // character tie
)
// Dashes
// Code Pd from http://www.unicode.org/Public/UNIDATA/PropList.txt
// NB: This list is not necessarily exhaustive.
DEFINE_CHAR_PROPERTY_AS_SET(dash_punc,
  '-',
  '~',
  0x058a,  // Armenian hyphen
  0x1806,  // Mongolian todo soft hyphen
  RANGE(0x2010, 0x2015),  // hyphen..horizontal bar
  0x2053,  // swung dash -- from Table 6-3 of Unicode book
  0x207b,  // superscript minus
  0x208b,  // subscript minus
  0x2212,  // minus sign
  0x301c,  // wave dash
  0x3030,  // wavy dash
  RANGE(0xfe31, 0xfe32),  // presentation form for vertical em dash..en dash
  0xfe58,  // small em dash
  0xfe63,  // small hyphen-minus
  0xff0d,  // fullwidth hyphen-minus
)
// Other punctuation
// Code Po from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
// NB: This list is not exhaustive.
DEFINE_CHAR_PROPERTY_AS_SET(other_punc,
  ',',
  ':',
  ';',
  0x00b7,  // middle dot
  0x0387,  // Greek ano teleia
  0x05c3,  // Hebrew punctuation sof pasuq
  0x060c,  // Arabic comma
  0x061b,  // Arabic semicolon
  0x066b,  // Arabic decimal separator
  0x066c,  // Arabic thousands separator
  RANGE(0x0703, 0x70a),  // Syriac contraction and others
  0x070c,  // Syriac harklean metobelus
  0x0e5a,  // Thai character angkhankhu
  0x0e5b,  // Thai character khomut
  0x0f08,  // Tibetan mark sbrul shad
  RANGE(0x0f0d, 0x0f12),  // Tibetan mark shad..Tibetan mark rgya gram shad
  0x1361,  // Ethiopic wordspace
  RANGE(0x1363, 0x1366),  // other Ethiopic chars
  0x166d,  // Canadian syllabics chi sign
  RANGE(0x16eb, 0x16ed),  // Runic single punctuation..Runic cross punctuation
  RANGE(0x17d5, 0x17d6),  // Khmer sign camnuc pii huuh and other
  0x17da,  // Khmer sign koomut
  0x1802,  // Mongolian comma
  RANGE(0x1804, 0x1805),  // Mongolian four dots and other
  0x1808,  // Mongolian manchu comma
  0x3001,  // ideographic comma
  RANGE(0xfe50, 0xfe51),  // small comma and others
  RANGE(0xfe54, 0xfe55),  // small semicolon and other
  0xff0c,  // fullwidth comma
  RANGE(0xff0e, 0xff0f),  // fullwidth stop..fullwidth solidus
  RANGE(0xff1a, 0xff1b),  // fullwidth colon..fullwidth semicolon
  0xff64,  // halfwidth ideographic comma
  0x2016,  // double vertical line
  RANGE(0x2032, 0x2034),  // prime..triple prime
  0xfe61,  // small asterisk
  0xfe68,  // small reverse solidus
  0xff3c,  // fullwidth reverse solidus
)
// All punctuation.
// Code P from http://www.unicode.org/Public/UNIDATA/PropList.txt
// NB: This list is not necessarily exhaustive.
DEFINE_CHAR_PROPERTY(punctuation, prop) {
  // NOTE: open_punc/close_punc are already contained in
  // leading_sentence_punc/trailing_sentence_punc; adding them again is
  // redundant but harmless (AddChar is idempotent).
  prop->AddCharProperty("open_punc");
  prop->AddCharProperty("close_punc");
  prop->AddCharProperty("leading_sentence_punc");
  prop->AddCharProperty("trailing_sentence_punc");
  prop->AddCharProperty("connector_punc");
  prop->AddCharProperty("dash_punc");
  prop->AddCharProperty("other_punc");
  prop->AddAsciiPredicate(&ispunct);
}
//======================================================================
// Separators
//
// Line separators
// Code Zl from http://www.unicode.org/Public/UNIDATA/PropList.txt
// NB: This list is not necessarily exhaustive.
DEFINE_CHAR_PROPERTY_AS_SET(line_separator,
  0x2028,  // line separator
)
// Paragraph separators
// Code Zp from http://www.unicode.org/Public/UNIDATA/PropList.txt
// NB: This list is not necessarily exhaustive.
DEFINE_CHAR_PROPERTY_AS_SET(paragraph_separator,
  0x2029,  // paragraph separator
)
// Space separators
// Code Zs from http://www.unicode.org/Public/UNIDATA/PropList.txt
// NB: This list is not necessarily exhaustive.
DEFINE_CHAR_PROPERTY_AS_SET(space_separator,
  0x0020,  // space
  0x00a0,  // no-break space
  0x1680,  // Ogham space mark
  0x180e,  // Mongolian vowel separator
  RANGE(0x2000, 0x200a),  // en quad..hair space
  0x202f,  // narrow no-break space
  0x205f,  // medium mathematical space
  0x3000,  // ideographic space
  // Google additions
  0xe5e5,  // "private" char used as space in Chinese
)
// Separators -- all line, paragraph, and space separators.
// Code Z from http://www.unicode.org/Public/UNIDATA/PropList.txt
// NB: This list is not necessarily exhaustive.
DEFINE_CHAR_PROPERTY(separator, prop) {
  prop->AddCharProperty("line_separator");
  prop->AddCharProperty("paragraph_separator");
  prop->AddCharProperty("space_separator");
  prop->AddAsciiPredicate(&isspace);
}
//======================================================================
// Alphanumeric Characters
//
// Digits (ASCII plus the two Arabic-script digit blocks).
DEFINE_CHAR_PROPERTY_AS_SET(digit,
  RANGE('0', '9'),
  RANGE(0x0660, 0x0669),  // Arabic-Indic digits
  RANGE(0x06F0, 0x06F9),  // Eastern Arabic-Indic digits
)
//======================================================================
// Japanese Katakana
//
DEFINE_CHAR_PROPERTY_AS_SET(katakana,
  0x3099,  // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
  0x309A,  // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
  0x309B,  // KATAKANA-HIRAGANA VOICED SOUND MARK
  0x309C,  // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
  RANGE(0x30A0, 0x30FF),  // Fullwidth Katakana
  RANGE(0xFF65, 0xFF9F),  // Halfwidth Katakana
)
//======================================================================
// BiDi Directional Formatting Codes
//
// See http://www.unicode.org/reports/tr9/ for a description of Bidi
// and http://www.unicode.org/charts/PDF/U2000.pdf for the character codes.
DEFINE_CHAR_PROPERTY_AS_SET(directional_formatting_code,
  0x200E,  // LRM (Left-to-Right Mark)
  0x200F,  // RLM (Right-to-Left Mark)
  0x202A,  // LRE (Left-to-Right Embedding)
  0x202B,  // RLE (Right-to-Left Embedding)
  0x202C,  // PDF (Pop Directional Format)
  0x202D,  // LRO (Left-to-Right Override)
  0x202E,  // RLO (Right-to-Left Override)
)
//======================================================================
// Special collections
//
// NB: This does not check for all punctuation and symbols in the
// standard; just those listed in our code. See the definitions in
// char_properties.cc
DEFINE_CHAR_PROPERTY(punctuation_or_symbol, prop) {
  prop->AddCharProperty("punctuation");
  prop->AddCharProperty("subscript_symbol");
  prop->AddCharProperty("superscript_symbol");
  prop->AddCharProperty("token_prefix_symbol");
  prop->AddCharProperty("token_suffix_symbol");
}
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// char_properties.h - define is_X() tests for various character properties
//
// Character properties can be defined in two ways:
//
// (1) Set-based:
//
// Enumerate the chars that have the property. Example:
//
// DEFINE_CHAR_PROPERTY_AS_SET(my_fave,
// RANGE('0', '9'),
// '\'',
// 0x00BF, // Spanish inverted question mark
// )
//
// Characters are expressed as Unicode code points; note that ascii codes
// are a subset. RANGE() specifies an inclusive range of code points.
//
// This defines two functions:
//
// bool is_my_fave(const char *str, int len)
// bool is_my_fave(int c)
//
// Each returns true for precisely the 12 characters specified above.
// Each takes a *single* UTF8 char as its argument -- the first expresses
// it as a char * and a length, the second as a Unicode code point.
// Please do not pass a string of multiple UTF8 chars to the first one.
//
// To make is_my_fave() externally accessible, put in your .h file:
//
// DECLARE_CHAR_PROPERTY(my_fave)
//
// (2) Function-based:
//
// Specify a function that assigns the desired chars to a CharProperty
// object. Example:
//
// DEFINE_CHAR_PROPERTY(my_other_fave, prop) {
// for (int i = '0'; i <= '9'; i += 2) {
// prop->AddChar(i);
// }
// prop->AddAsciiPredicate(&ispunct);
// prop->AddCharProperty("currency_symbol");
// }
//
// This defines a function of one arg: CharProperty *prop. The function
// calls various CharProperty methods to populate the prop. The last call
// above, AddCharProperty(), adds the chars from another char property
// ("currency_symbol").
//
// As in the set-based case, put a DECLARE_CHAR_PROPERTY(my_other_fave)
// in your .h if you want is_my_other_fave() to be externally accessible.
//
#ifndef SYNTAXNET_CHAR_PROPERTIES_H_
#define SYNTAXNET_CHAR_PROPERTIES_H_
#include <string> // for string
#include "syntaxnet/registry.h"
#include "syntaxnet/utils.h"
// =====================================================================
// Registry for accessing CharProperties by name
//
// This is for internal use by the CharProperty class and macros; callers
// should not use it explicitly.
//
namespace syntaxnet {
class CharProperty; // forward declaration
// Wrapper around a CharProperty, allowing it to be stored in a registry.
struct CharPropertyWrapper : RegisterableClass<CharPropertyWrapper> {
  virtual ~CharPropertyWrapper() { }
  // Returns the wrapped CharProperty; ownership stays with the underlying
  // lazy static, not with the wrapper or the caller.
  virtual CharProperty *GetCharProperty() = 0;
};
// Registers a CharPropertyWrapper subclass in the registry under 'type'.
#define REGISTER_CHAR_PROPERTY_WRAPPER(type, component) \
  REGISTER_CLASS_COMPONENT(CharPropertyWrapper, type, component)

// Registers the CharProperty held by lazy static pointer 'lsp' under
// 'name', by defining and registering a wrapper type that exposes it.
#define REGISTER_CHAR_PROPERTY(lsp, name)                \
  struct name##CharPropertyWrapper : public CharPropertyWrapper { \
    CharProperty *GetCharProperty() { return lsp.get(); } \
  };                                                      \
  REGISTER_CHAR_PROPERTY_WRAPPER(#name, name##CharPropertyWrapper)
// =====================================================================
// Macros for defining character properties
//
// Define is_X() functions to test whether a single UTF8 character has
// the 'X' char prop. 'lsp' is a lazy static pointer to the CharProperty.
#define DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(lsp, name) \
  bool is_##name(const char *str, int len) {           \
    return lsp->HoldsFor(str, len);                    \
  }                                                    \
  bool is_##name(int c) {                              \
    return lsp->HoldsFor(c);                           \
  }
// Define a char property by enumerating the unicode char points,
// or RANGE()s thereof, for which it holds. Example:
//
//   DEFINE_CHAR_PROPERTY_AS_SET(my_fave,
//     'q',
//     RANGE('0', '9'),
//     0x20AB,
//   )
//
// "..." is a GNU extension.
// The expansion stores the code points in a static array, lazily builds a
// CharProperty from them, registers it, and defines the is_X() functions.
#define DEFINE_CHAR_PROPERTY_AS_SET(name, unicodes...)                       \
  static const int k_##name##_unicodes[] = {unicodes};                       \
  static utils::LazyStaticPtr<CharProperty, const char *, const int *, size_t> \
      name##_char_property = {#name, k_##name##_unicodes,                    \
                              arraysize(k_##name##_unicodes)};               \
  REGISTER_CHAR_PROPERTY(name##_char_property, name);                        \
  DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(name##_char_property, name)
// Specify a range (inclusive) of Unicode character values.
// Example: RANGE('0', '9') specifies the 10 digits.
// For use as an element in a DEFINE_CHAR_PROPERTY_AS_SET() list.
// The two sentinels bracket [lower, upper] in the flat spec array and are
// decoded by CharProperty::AddCharSpec().
static const int kPreUnicodeRange = -1;
static const int kPostUnicodeRange = -2;
#define RANGE(lower, upper) \
  kPreUnicodeRange, lower, upper, kPostUnicodeRange
// A function to initialize a CharProperty.
typedef void CharPropertyInitializer(CharProperty *prop);

// Define a char property by specifying a block of code that initializes it.
// Example:
//
//   DEFINE_CHAR_PROPERTY(my_other_fave, prop) {
//     for (int i = '0'; i <= '9'; i += 2) {
//       prop->AddChar(i);
//     }
//     prop->AddAsciiPredicate(&ispunct);
//     prop->AddCharProperty("currency_symbol");
//   }
//
// The expansion forward-declares the initializer, lazily builds and
// registers the CharProperty, defines the is_X() functions, and leaves a
// dangling function header so the user-supplied { ... } block becomes the
// initializer's body.
#define DEFINE_CHAR_PROPERTY(name, charpropvar)                       \
  static void init_##name##_char_property(CharProperty *charpropvar); \
  static utils::LazyStaticPtr<CharProperty, const char *,             \
                              CharPropertyInitializer *>              \
      name##_char_property = {#name, &init_##name##_char_property};   \
  REGISTER_CHAR_PROPERTY(name##_char_property, name);                 \
  DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(name##_char_property, name)     \
  static void init_##name##_char_property(CharProperty *charpropvar)
// =====================================================================
// Macro for declaring character properties
//
// Declares the is_X() test functions produced by a DEFINE_CHAR_PROPERTY*
// macro in a .cc file.
//
// NOTE: the last line of the macro must not end in a backslash. The
// previous definition did, which spliced the source line following the
// macro definition into the macro itself (harmless only because that
// line happened to be a comment, which is stripped after line splicing).
#define DECLARE_CHAR_PROPERTY(name)                  \
  extern bool is_##name(const char *str, int len);   \
  extern bool is_##name(int c);
// ===========================================================
// CharProperty - a property that holds for selected Unicode chars
//
// A CharProperty is semantically equivalent to set<char32>.
//
// The characters for which a CharProperty holds are represented as a trie,
// i.e., a tree that is indexed by successive bytes of the UTF-8 encoding
// of the characters. This permits fast lookup (HoldsFor).
//
// A function that defines a subset of [0..255], e.g., isspace.
typedef int AsciiPredicate(int c);

// A property that holds for selected Unicode chars; semantically a
// set<char32> with fast single-character membership tests.
class CharProperty {
 public:
  // Constructor for set-based char properties. 'unicodes' is a flat spec
  // array, possibly containing RANGE() markers (see AddCharSpec).
  CharProperty(const char *name, const int *unicodes, int num_unicodes);
  // Constructor for function-based char properties.
  CharProperty(const char *name, CharPropertyInitializer *init_fn);
  virtual ~CharProperty();
  // Various ways of adding chars to a CharProperty; for use only in
  // CharPropertyInitializer functions.
  void AddChar(int c);
  void AddCharRange(int c1, int c2);
  void AddAsciiPredicate(AsciiPredicate *pred);
  void AddCharProperty(const char *name);
  void AddCharSpec(const int *unicodes, int num_unicodes);
  // Return true iff the CharProperty holds for a single given UTF8 char.
  bool HoldsFor(const char *str, int len) const;
  // Return true iff the CharProperty holds for a single given Unicode char.
  bool HoldsFor(int c) const;
  // You can use this to enumerate the set elements (it was easier
  // than defining a real iterator). Returns -1 if there are no more.
  // Call with -1 to get the first element. Expects c == -1 or HoldsFor(c).
  int NextElementAfter(int c) const;
  // Return NULL or the CharProperty with the given name. Looks up the name
  // in a CharProperty registry.
  static const CharProperty *Lookup(const char *name);

 private:
  // CHECK-fails unless c is a valid Unicode code point.
  void CheckUnicodeVal(int c) const;
  // Formats c for error messages.
  static string UnicodeToString(int c);
  const char *name_;
  // Owned; deleted in the destructor.
  struct CharPropertyImplementation *impl_;
  TF_DISALLOW_COPY_AND_ASSIGN(CharProperty);
};
//======================================================================
// Expression-level punctuation
//
// Punctuation that starts a sentence.
DECLARE_CHAR_PROPERTY(start_sentence_punc);
// Punctuation that ends a sentence.
DECLARE_CHAR_PROPERTY(end_sentence_punc);
// Punctuation, such as parens, that opens a "nested expression" of text.
DECLARE_CHAR_PROPERTY(open_expr_punc);
// Punctuation, such as parens, that closes a "nested expression" of text.
DECLARE_CHAR_PROPERTY(close_expr_punc);
// Chars that open a quotation.
DECLARE_CHAR_PROPERTY(open_quote);
// Chars that close a quotation.
DECLARE_CHAR_PROPERTY(close_quote);
// Punctuation chars that open an expression or a quotation.
DECLARE_CHAR_PROPERTY(open_punc);
// Punctuation chars that close an expression or a quotation.
DECLARE_CHAR_PROPERTY(close_punc);
// Punctuation chars that can come at the beginning of a sentence.
DECLARE_CHAR_PROPERTY(leading_sentence_punc);
// Punctuation chars that can come at the end of a sentence.
DECLARE_CHAR_PROPERTY(trailing_sentence_punc);
//======================================================================
// Token-level punctuation
//
// Token-prefix symbols -- glom on to following token
// (esp. if no space after) -- except for currency symbols.
DECLARE_CHAR_PROPERTY(noncurrency_token_prefix_symbol);
// Token-prefix symbols -- glom on to following token (esp. if no space after).
DECLARE_CHAR_PROPERTY(token_prefix_symbol);
// Token-suffix symbols -- glom on to preceding token (esp. if no space
// before).
DECLARE_CHAR_PROPERTY(token_suffix_symbol);
// Subscripts.
DECLARE_CHAR_PROPERTY(subscript_symbol);
// Superscripts.
DECLARE_CHAR_PROPERTY(superscript_symbol);
//======================================================================
// General punctuation
//
// Connector punctuation.
DECLARE_CHAR_PROPERTY(connector_punc);
// Dashes.
DECLARE_CHAR_PROPERTY(dash_punc);
// Other punctuation.
DECLARE_CHAR_PROPERTY(other_punc);
// All punctuation.
DECLARE_CHAR_PROPERTY(punctuation);
//======================================================================
// Special symbols
//
// Currency symbols.
DECLARE_CHAR_PROPERTY(currency_symbol);
// Chinese bookquotes (single-char "<<" and ">>" marks).
DECLARE_CHAR_PROPERTY(open_bookquote);
DECLARE_CHAR_PROPERTY(close_bookquote);
//======================================================================
// Separators
//
// Line separators.
DECLARE_CHAR_PROPERTY(line_separator);
// Paragraph separators.
DECLARE_CHAR_PROPERTY(paragraph_separator);
// Space separators.
DECLARE_CHAR_PROPERTY(space_separator);
// Separators -- all line, paragraph, and space separators.
DECLARE_CHAR_PROPERTY(separator);
//======================================================================
// Alphanumeric Characters
//
// Digits.
DECLARE_CHAR_PROPERTY(digit);
// Japanese Katakana.
DECLARE_CHAR_PROPERTY(katakana);
//======================================================================
// BiDi Directional Formatting Codes
//
// Explicit directional formatting codes (LRM, RLM, LRE, RLE, PDF, LRO, RLO)
// used by the bidirectional algorithm.
//
// Note: Use this only to classify characters. To actually determine
// directionality of BiDi text, look under i18n/bidi.
//
// See http://www.unicode.org/reports/tr9/ for a description of the algorithm
// and http://www.unicode.org/charts/PDF/U2000.pdf for the character codes.
DECLARE_CHAR_PROPERTY(directional_formatting_code);
//======================================================================
// Special collections
//
// NB: This does not check for all punctuation and symbols in the standard;
// just those listed in our code. See the definitions in char_properties.cc.
DECLARE_CHAR_PROPERTY(punctuation_or_symbol);
} // namespace syntaxnet
#endif // SYNTAXNET_CHAR_PROPERTIES_H_
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Tests for char_properties.cc:
//
// (1) Test the DEFINE_CHAR_PROPERTY_AS_SET and DEFINE_CHAR_PROPERTY macros
// by defining a few fake char properties and verifying their contents.
//
// (2) Test the char properties defined in char_properties.cc by spot-checking
// a few chars.
//
#include "syntaxnet/char_properties.h"
#include <ctype.h> // for ispunct, isspace
#include <map>
#include <set>
#include <utility>
#include <vector>
#include <gmock/gmock.h> // for ContainerEq, EXPECT_THAT
#include "tensorflow/core/platform/test.h"
#include "third_party/utf/utf.h"
#include "util/utf8/unilib.h" // for IsValidCodepoint, etc
#include "util/utf8/unilib_utf8_utils.h"
using ::testing::ContainerEq;
namespace syntaxnet {
// Invalid UTF-8 bytes are decoded as the Replacement Character, U+FFFD
// (which is also Runeerror). Invalid code points are encoded in UTF-8
// with the UTF-8 representation of the Replacement Character, whose
// three-byte encoding (EF BF BD) is spelled out here.
static const char ReplacementCharacterUTF8[3] = {'\xEF', '\xBF', '\xBD'};
// ====================================================================
// CharPropertiesTest
//
class CharPropertiesTest : public testing::Test {
protected:
// Collect a set of chars.
void CollectChars(const std::set<char32> &chars) {
collected_set_.insert(chars.begin(), chars.end());
}
// Collect an array of chars.
void CollectArray(const char32 arr[], int len) {
collected_set_.insert(arr, arr + len);
}
// Collect the chars for which the named CharProperty holds.
void CollectCharProperty(const char *name) {
const CharProperty *prop = CharProperty::Lookup(name);
ASSERT_TRUE(prop != nullptr) << "for " << name;
for (char32 c = 0; c <= 0x10FFFF; ++c) {
if (UniLib::IsValidCodepoint(c) && prop->HoldsFor(c)) {
collected_set_.insert(c);
}
}
}
// Collect the chars for which an ascii predicate holds.
void CollectAsciiPredicate(AsciiPredicate *pred) {
for (char32 c = 0; c < 256; ++c) {
if ((*pred)(c)) {
collected_set_.insert(c);
}
}
}
// Expect the named char property to be true for precisely the chars in
// the collected set.
void ExpectCharPropertyEqualsCollectedSet(const char *name) {
const CharProperty *prop = CharProperty::Lookup(name);
ASSERT_TRUE(prop != nullptr) << "for " << name;
// Test that char property holds for all collected chars. Exercises both
// signatures of CharProperty::HoldsFor().
for (std::set<char32>::const_iterator it = collected_set_.begin();
it != collected_set_.end(); ++it) {
// Test utf8 version of is_X().
const char32 c = *it;
string utf8_char = EncodeAsUTF8(&c, 1);
EXPECT_TRUE(prop->HoldsFor(utf8_char.c_str(), utf8_char.size()));
// Test ucs-2 version of is_X().
EXPECT_TRUE(prop->HoldsFor(static_cast<int>(c)));
}
// Test that the char property holds for precisely the collected chars.
// Somewhat redundant with previous test, but exercises
// CharProperty::NextElementAfter().
std::set<char32> actual_chars;
int c = -1;
while ((c = prop->NextElementAfter(c)) >= 0) {
actual_chars.insert(static_cast<char32>(c));
}
EXPECT_THAT(actual_chars, ContainerEq(collected_set_))
<< " for " << name;
}
// Expect the named char property to be true for at least the chars in
// the collected set.
void ExpectCharPropertyContainsCollectedSet(const char *name) {
const CharProperty *prop = CharProperty::Lookup(name);
ASSERT_TRUE(prop != nullptr) << "for " << name;
for (std::set<char32>::const_iterator it = collected_set_.begin();
it != collected_set_.end(); ++it) {
EXPECT_TRUE(prop->HoldsFor(static_cast<int>(*it)));
}
}
string EncodeAsUTF8(const char32 *in, int size) {
string out;
out.reserve(size);
for (int i = 0; i < size; ++i) {
char buf[UTFmax];
int len = EncodeAsUTF8Char(*in++, buf);
out.append(buf, len);
}
return out;
}
int EncodeAsUTF8Char(char32 in, char *out) {
if (UniLib::IsValidCodepoint(in)) {
return runetochar(out, &in);
} else {
memcpy(out, ReplacementCharacterUTF8, 3);
return 3;
}
}
private:
std::set<char32> collected_set_;
};
//======================================================================
// Declarations of the sample character sets below
// (to test the DECLARE_CHAR_PROPERTY() macro)
//
DECLARE_CHAR_PROPERTY(test_digit);
DECLARE_CHAR_PROPERTY(test_wavy_dash);
DECLARE_CHAR_PROPERTY(test_digit_or_wavy_dash);
DECLARE_CHAR_PROPERTY(test_punctuation_plus);

//======================================================================
// Definitions of sample character sets
//

// Digits; exercises the RANGE() spec inside DEFINE_CHAR_PROPERTY_AS_SET.
DEFINE_CHAR_PROPERTY_AS_SET(test_digit,
  RANGE('0', '9'),
)

// Wavy dashes; exercises single chars and non-ASCII code points.
DEFINE_CHAR_PROPERTY_AS_SET(test_wavy_dash,
  '~',
  0x301C,  // wave dash
  0x3030,  // wavy dash
)

// Digits or wavy dashes; exercises composition via AddCharProperty().
DEFINE_CHAR_PROPERTY(test_digit_or_wavy_dash, prop) {
  prop->AddCharProperty("test_digit");
  prop->AddCharProperty("test_wavy_dash");
}

// Punctuation plus a few extraneous chars; exercises every Add*() method.
DEFINE_CHAR_PROPERTY(test_punctuation_plus, prop) {
  prop->AddChar('a');
  prop->AddCharRange('b', 'b');
  prop->AddCharRange('c', 'e');
  static const int kUnicodes[] = {'f', RANGE('g', 'i'), 'j'};
  prop->AddCharSpec(kUnicodes, arraysize(kUnicodes));
  prop->AddCharProperty("punctuation");
}

//====================================================================
// Another form of the character sets above -- for verification
//

// Expected contents of "test_digit".
const char32 kTestDigit[] = {
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
};

// Expected contents of "test_wavy_dash".
const char32 kTestWavyDash[] = {
  '~',
  0x301C,  // wave dash
  0x3030,  // wavy dash
};

// The chars that "test_punctuation_plus" adds on top of "punctuation".
const char32 kTestPunctuationPlusExtras[] = {
  'a',
  'b',
  'c',
  'd',
  'e',
  'f',
  'g',
  'h',
  'i',
  'j',
};
// ====================================================================
// Tests
//
// "test_digit" must hold for exactly the ten ASCII digits.
TEST_F(CharPropertiesTest, TestDigit) {
  CollectArray(kTestDigit, arraysize(kTestDigit));
  ExpectCharPropertyEqualsCollectedSet("test_digit");
}

// "test_wavy_dash" must hold for exactly '~', U+301C, and U+3030.
TEST_F(CharPropertiesTest, TestWavyDash) {
  CollectArray(kTestWavyDash, arraysize(kTestWavyDash));
  ExpectCharPropertyEqualsCollectedSet("test_wavy_dash");
}

// Composition via AddCharProperty(): the union of the two sets above.
TEST_F(CharPropertiesTest, TestDigitOrWavyDash) {
  CollectArray(kTestDigit, arraysize(kTestDigit));
  CollectArray(kTestWavyDash, arraysize(kTestWavyDash));
  ExpectCharPropertyEqualsCollectedSet("test_digit_or_wavy_dash");
}

// "test_punctuation_plus" is the registered "punctuation" property plus the
// extra chars 'a'..'j'.
TEST_F(CharPropertiesTest, TestPunctuationPlus) {
  CollectCharProperty("punctuation");
  CollectArray(kTestPunctuationPlusExtras,
               arraysize(kTestPunctuationPlusExtras));
  ExpectCharPropertyEqualsCollectedSet("test_punctuation_plus");
}
// ====================================================================
// Spot-check predicates in char_properties.cc
//
// U+00A1 INVERTED EXCLAMATION MARK, U+00BF INVERTED QUESTION MARK.
TEST_F(CharPropertiesTest, StartSentencePunc) {
  CollectChars({0x00A1, 0x00BF});
  ExpectCharPropertyContainsCollectedSet("start_sentence_punc");
}

TEST_F(CharPropertiesTest, EndSentencePunc) {
  CollectChars({'.', '!', '?'});
  ExpectCharPropertyContainsCollectedSet("end_sentence_punc");
}

TEST_F(CharPropertiesTest, OpenExprPunc) {
  CollectChars({'(', '['});
  ExpectCharPropertyContainsCollectedSet("open_expr_punc");
}

TEST_F(CharPropertiesTest, CloseExprPunc) {
  CollectChars({')', ']'});
  ExpectCharPropertyContainsCollectedSet("close_expr_punc");
}

// The ASCII quote chars are direction-ambiguous, so the same two chars are
// expected in both the open_quote and close_quote properties.
TEST_F(CharPropertiesTest, OpenQuote) {
  CollectChars({'\'', '"'});
  ExpectCharPropertyContainsCollectedSet("open_quote");
}

TEST_F(CharPropertiesTest, CloseQuote) {
  CollectChars({'\'', '"'});
  ExpectCharPropertyContainsCollectedSet("close_quote");
}

// U+300A / U+300B: Chinese angle bookquotes.
TEST_F(CharPropertiesTest, OpenBookquote) {
  CollectChars({0x300A});
  ExpectCharPropertyContainsCollectedSet("open_bookquote");
}

TEST_F(CharPropertiesTest, CloseBookquote) {
  CollectChars({0x300B});
  ExpectCharPropertyContainsCollectedSet("close_bookquote");
}
// open_punc covers expression-opening punctuation and quote chars.
TEST_F(CharPropertiesTest, OpenPunc) {
  CollectChars({'(', '[', '\'', '"'});
  ExpectCharPropertyContainsCollectedSet("open_punc");
}

// close_punc covers expression-closing punctuation and quote chars.
TEST_F(CharPropertiesTest, ClosePunc) {
  CollectChars({')', ']', '\'', '"'});
  ExpectCharPropertyContainsCollectedSet("close_punc");
}

// leading_sentence_punc covers opening punctuation, quotes, and the
// inverted exclamation/question marks (U+00A1, U+00BF).
TEST_F(CharPropertiesTest, LeadingSentencePunc) {
  CollectChars({'(', '[', '\'', '"', 0x00A1, 0x00BF});
  ExpectCharPropertyContainsCollectedSet("leading_sentence_punc");
}

// trailing_sentence_punc covers closing punctuation, quotes, and
// sentence-final punctuation.
TEST_F(CharPropertiesTest, TrailingSentencePunc) {
  CollectChars({')', ']', '\'', '"', '.', '!', '?'});
  ExpectCharPropertyContainsCollectedSet("trailing_sentence_punc");
}
// '#' attaches to the following token but is not a currency symbol.
TEST_F(CharPropertiesTest, NoncurrencyTokenPrefixSymbol) {
  CollectChars({'#'});
  ExpectCharPropertyContainsCollectedSet("noncurrency_token_prefix_symbol");
}

// '%', U+2122 TRADE MARK SIGN, U+00A9 COPYRIGHT SIGN, U+00B0 DEGREE SIGN.
TEST_F(CharPropertiesTest, TokenSuffixSymbol) {
  CollectChars({'%', 0x2122, 0x00A9, 0x00B0});
  ExpectCharPropertyContainsCollectedSet("token_suffix_symbol");
}

// Prefix symbols include '#' plus the currency symbols below.
TEST_F(CharPropertiesTest, TokenPrefixSymbol) {
  CollectChars({'#'});
  CollectChars({'$', 0x00A5, 0x20AC});
  ExpectCharPropertyContainsCollectedSet("token_prefix_symbol");
}

// U+2082 / U+2083: SUBSCRIPT TWO / SUBSCRIPT THREE.
TEST_F(CharPropertiesTest, SubscriptSymbol) {
  CollectChars({0x2082, 0x2083});
  ExpectCharPropertyContainsCollectedSet("subscript_symbol");
}

// U+00B2 / U+00B3: SUPERSCRIPT TWO / SUPERSCRIPT THREE.
TEST_F(CharPropertiesTest, SuperscriptSymbol) {
  CollectChars({0x00B2, 0x00B3});
  ExpectCharPropertyContainsCollectedSet("superscript_symbol");
}

// '$', U+00A5 YEN SIGN, U+20AC EURO SIGN.
TEST_F(CharPropertiesTest, CurrencySymbol) {
  CollectChars({'$', 0x00A5, 0x20AC});
  ExpectCharPropertyContainsCollectedSet("currency_symbol");
}

// LRM, RLM, LRE, RLE, PDF, LRO, RLO.
TEST_F(CharPropertiesTest, DirectionalFormattingCode) {
  CollectChars({0x200E, 0x200F, 0x202A, 0x202B, 0x202C, 0x202D, 0x202E});
  ExpectCharPropertyContainsCollectedSet("directional_formatting_code");
}

// "punctuation" must cover at least ASCII ispunct().
TEST_F(CharPropertiesTest, Punctuation) {
  CollectAsciiPredicate(ispunct);
  ExpectCharPropertyContainsCollectedSet("punctuation");
}

// "separator" must cover at least ASCII isspace().
TEST_F(CharPropertiesTest, Separator) {
  CollectAsciiPredicate(isspace);
  ExpectCharPropertyContainsCollectedSet("separator");
}
} // namespace syntaxnet
......@@ -77,7 +77,8 @@ class DocumentSource : public OpKernel {
OP_REQUIRES_OK(context, context->GetAttr("batch_size", &batch_size_));
OP_REQUIRES(context, batch_size_ > 0,
InvalidArgument("invalid batch_size provided"));
corpus_.reset(new TextReader(*task_context_.GetInput(corpus_name)));
corpus_.reset(
new TextReader(*task_context_.GetInput(corpus_name), &task_context_));
}
void Compute(OpKernelContext *context) override {
......@@ -124,7 +125,8 @@ class DocumentSink : public OpKernel {
GetTaskContext(context, &task_context_);
string corpus_name;
OP_REQUIRES_OK(context, context->GetAttr("corpus_name", &corpus_name));
writer_.reset(new TextWriter(*task_context_.GetInput(corpus_name)));
writer_.reset(
new TextWriter(*task_context_.GetInput(corpus_name), &task_context_));
}
void Compute(OpKernelContext *context) override {
......
......@@ -38,6 +38,8 @@ class DocumentFormat : public RegisterableClass<DocumentFormat> {
DocumentFormat() {}
virtual ~DocumentFormat() {}
virtual void Setup(TaskContext *context) {}
// Reads a record from the given input buffer with format specific logic.
// Returns false if no record could be read because we reached end of file.
virtual bool ReadRecord(tensorflow::io::InputBuffer *buffer,
......
......@@ -19,6 +19,7 @@ limitations under the License.
#include "syntaxnet/affix.h"
#include "syntaxnet/dictionary.pb.h"
#include "syntaxnet/feature_extractor.h"
#include "syntaxnet/segmenter_utils.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/sentence_batch.h"
#include "syntaxnet/term_frequency_map.h"
......@@ -75,6 +76,7 @@ class LexiconBuilder : public OpKernel {
TermFrequencyMap tags;
TermFrequencyMap categories;
TermFrequencyMap labels;
TermFrequencyMap chars;
// Affix tables to be populated by the corpus.
AffixTable prefixes(AffixTable::PREFIX, max_prefix_length_);
......@@ -87,7 +89,7 @@ class LexiconBuilder : public OpKernel {
int64 num_tokens = 0;
int64 num_documents = 0;
Sentence *document;
TextReader corpus(*task_context_.GetInput(corpus_name_));
TextReader corpus(*task_context_.GetInput(corpus_name_), &task_context_);
while ((document = corpus.Read()) != nullptr) {
// Gather token information.
for (int t = 0; t < document->token_size(); ++t) {
......@@ -114,6 +116,14 @@ class LexiconBuilder : public OpKernel {
// Add mapping from tag to category.
tag_to_category.SetCategory(token.tag(), token.category());
// Add characters.
vector<tensorflow::StringPiece> char_sp;
SegmenterUtils::GetUTF8Chars(word, &char_sp);
for (const auto &c : char_sp) {
const string c_str = c.ToString();
if (!c_str.empty() && !HasSpaces(c_str)) chars.Increment(c_str);
}
// Update the number of processed tokens.
++num_tokens;
}
......@@ -131,6 +141,7 @@ class LexiconBuilder : public OpKernel {
categories.Save(
TaskContext::InputFile(*task_context_.GetInput("category-map")));
labels.Save(TaskContext::InputFile(*task_context_.GetInput("label-map")));
chars.Save(TaskContext::InputFile(*task_context_.GetInput("char-map")));
// Write affixes to disk.
WriteAffixTable(prefixes, TaskContext::InputFile(
......
......@@ -69,6 +69,8 @@ TOKENIZED_DOCS = u'''बात गलत हो तो गुस्सा से
लेकिन अभिनेत्री के इस कदम से वहां रंग में भंग पड़ गया ।
'''
CHARS = u'''अ इ आ क ग ज ट त द न प भ ब य म र ल व ह स ि ा ु ी े ै ो ् ड़ । ं'''
COMMENTS = u'# Line with fake comments.'
......@@ -93,7 +95,7 @@ class LexiconBuilderTest(test_util.TensorFlowTestCase):
self.AddInput('documents', self.corpus_file, corpus_format, context)
for name in ('word-map', 'lcword-map', 'tag-map',
'category-map', 'label-map', 'prefix-table',
'suffix-table', 'tag-to-category'):
'suffix-table', 'tag-to-category', 'char-map'):
self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
logging.info('Writing context to: %s', self.context_file)
with open(self.context_file, 'w') as f:
......@@ -133,6 +135,26 @@ class LexiconBuilderTest(test_util.TensorFlowTestCase):
self.assertIn(tag, TAGS)
self.assertIn(category, CATEGORIES)
def LoadMap(self, map_name):
loaded_map = {}
with file(os.path.join(FLAGS.test_tmpdir, map_name), 'r') as f:
for line in f:
entries = line.strip().split(' ')
if len(entries) == 2:
loaded_map[entries[0]] = entries[1]
return loaded_map
def ValidateCharMap(self):
char_map = self.LoadMap('char-map')
self.assertEqual(len(char_map), len(CHARS.split(' ')))
for char in CHARS.split(' '):
self.assertIn(char.encode('utf-8'), char_map)
def ValidateWordMap(self):
word_map = self.LoadMap('word-map')
for word in filter(None, TOKENIZED_DOCS.replace('\n', ' ').split(' ')):
self.assertIn(word.encode('utf-8'), word_map)
def BuildLexicon(self):
with self.test_session():
gen_parser_ops.lexicon_builder(task_context=self.context_file).run()
......@@ -146,6 +168,8 @@ class LexiconBuilderTest(test_util.TensorFlowTestCase):
self.ValidateDocuments()
self.BuildLexicon()
self.ValidateTagToCategoryMap()
self.ValidateCharMap()
self.ValidateWordMap()
def testCoNLLFormatExtraNewlinesAndComments(self):
self.WriteContext('conll-sentence')
......
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Morpher transition system.
//
// This transition system has one type of action:
//   - The SHIFT action pushes the next input token onto the stack and
//     advances to the next input token, assigning a morphological analysis
//     to the token that was shifted.
//
// The transition system operates with parser actions encoded as integers:
//   - A SHIFT action is encoded as a number starting from 0.
#include <string>
#include "syntaxnet/morphology_label_set.h"
#include "syntaxnet/parser_features.h"
#include "syntaxnet/parser_state.h"
#include "syntaxnet/parser_transitions.h"
#include "syntaxnet/sentence_features.h"
#include "syntaxnet/shared_store.h"
#include "syntaxnet/task_context.h"
#include "syntaxnet/term_frequency_map.h"
#include "syntaxnet/utils.h"
#include "tensorflow/core/lib/strings/strcat.h"
namespace syntaxnet {
// Transition state for the morpher.  Stores, for every token in the
// sentence, the currently assigned and the gold morphological analysis,
// both represented as indices into a shared MorphologyLabelSet.
class MorphologyTransitionState : public ParserTransitionState {
 public:
  explicit MorphologyTransitionState(const MorphologyLabelSet *label_set)
      : label_set_(label_set) {}

  // Copies the per-token assignments of |state|; shares its label set.
  explicit MorphologyTransitionState(const MorphologyTransitionState *state)
      : MorphologyTransitionState(state->label_set_) {
    tag_ = state->tag_;
    gold_tag_ = state->gold_tag_;
  }

  // Clones the transition state by returning a new object.
  ParserTransitionState *Clone() const override {
    return new MorphologyTransitionState(this);
  }

  // Reads gold tags for each token; -1 if a token's morphology is absent
  // from the label set.
  void Init(ParserState *state) override {
    tag_.resize(state->sentence().token_size(), -1);
    gold_tag_.resize(state->sentence().token_size(), -1);
    for (int pos = 0; pos < state->sentence().token_size(); ++pos) {
      const Token &token = state->GetToken(pos);
      // NOTE: we allow token to not have a TokenMorphology extension or for
      // the TokenMorphology to be absent from the label_set_ because this can
      // happen at test time.
      gold_tag_[pos] = label_set_->LookupExisting(
          token.GetExtension(TokenMorphology::morphology));
    }
  }

  // Returns the tag assigned to a given token, or -1 if index is -1.
  int Tag(int index) const {
    // Fixed: was DCHECK_GE(index, 0), which contradicted the explicit -1
    // handling below and GoldTag()'s contract.  The upper bound is compared
    // in signed arithmetic so index == -1 does not wrap to a huge unsigned
    // value and trip the check.
    DCHECK_GE(index, -1);
    DCHECK_LT(index, static_cast<int>(tag_.size()));
    return index == -1 ? -1 : tag_[index];
  }

  // Sets this tag on the token at index.
  void SetTag(int index, int tag) {
    DCHECK_GE(index, 0);
    DCHECK_LT(index, tag_.size());
    tag_[index] = tag;
  }

  // Returns the gold tag for a given token, or -1 if index is -1.
  int GoldTag(int index) const {
    DCHECK_GE(index, -1);
    // Signed comparison for the same reason as in Tag() above.
    DCHECK_LT(index, static_cast<int>(gold_tag_.size()));
    return index == -1 ? -1 : gold_tag_[index];
  }

  // Returns the proto corresponding to the tag, or an empty proto if the tag
  // is not found.
  const TokenMorphology &TagAsProto(int tag) const {
    if (tag >= 0 && tag < label_set_->Size()) {
      return label_set_->Lookup(tag);
    }
    return TokenMorphology::default_instance();
  }

  // Adds transition state specific annotations (the assigned morphology of
  // every token) to the document.
  void AddParseToDocument(const ParserState &state, bool rewrite_root_labels,
                          Sentence *sentence) const override {
    for (int i = 0; i < tag_.size(); ++i) {
      Token *token = sentence->mutable_token(i);
      *token->MutableExtension(TokenMorphology::morphology) =
          TagAsProto(Tag(i));
    }
  }

  // Whether a parsed token should be considered correct for evaluation.
  bool IsTokenCorrect(const ParserState &state, int index) const override {
    return GoldTag(index) == Tag(index);
  }

  // Returns a human readable string representation of this state.
  string ToString(const ParserState &state) const override {
    string str;
    for (int i = state.StackSize(); i > 0; --i) {
      const string &word = state.GetToken(state.Stack(i - 1)).word();
      if (i != state.StackSize() - 1) str.append(" ");
      // NOTE(review): the word comes from token Stack(i - 1) but the tag from
      // token index (StackSize() - i); these generally differ -- confirm
      // whether Tag(state.Stack(i - 1)) was intended.  Behavior preserved.
      tensorflow::strings::StrAppend(
          &str, word, "[",
          TagAsProto(Tag(state.StackSize() - i)).ShortDebugString(), "]");
    }
    for (int i = state.Next(); i < state.NumTokens(); ++i) {
      tensorflow::strings::StrAppend(&str, " ", state.GetToken(i).word());
    }
    return str;
  }

 private:
  // Currently assigned morphological analysis for each token in this
  // sentence; -1 means unassigned.
  vector<int> tag_;

  // Gold morphological analysis from the input document; -1 means unknown.
  vector<int> gold_tag_;

  // Label set used for conversions between integer and proto representations
  // of morphological analyses.  Not owned.
  const MorphologyLabelSet *label_set_ = nullptr;

  TF_DISALLOW_COPY_AND_ASSIGN(MorphologyTransitionState);
};
// Transition system for morphological analysis.  There is a single SHIFT
// action type: action k tags the next input token with label k of the
// MorphologyLabelSet and advances to the following token.
class MorphologyTransitionSystem : public ParserTransitionSystem {
 public:
  ~MorphologyTransitionSystem() override { SharedStore::Release(label_set_); }

  // Determines the morph-label-set input location.
  void Setup(TaskContext *context) override {
    context->GetInput("morph-label-set");
  }

  // Reads the morphology label set.
  void Init(TaskContext *context) override {
    const string fname =
        TaskContext::InputFile(*context->GetInput("morph-label-set"));
    label_set_ =
        SharedStoreUtils::GetWithDefaultName<MorphologyLabelSet>(fname);
  }

  // The SHIFT action uses the same value as the corresponding action type.
  static ParserAction ShiftAction(int tag) { return tag; }

  // The morpher transition system doesn't look at the dependency tree, so it
  // allows non-projective trees.
  bool AllowsNonProjective() const override { return true; }

  // Returns the number of action types.
  int NumActionTypes() const override { return 1; }

  // Returns the number of possible actions (one per morphology label).
  int NumActions(int num_labels) const override { return label_set_->Size(); }

  // The default action for a given state is assigning the first label.
  ParserAction GetDefaultAction(const ParserState &state) const override {
    return ShiftAction(0);
  }

  // Returns the next gold action for a given state according to the
  // underlying annotated sentence.
  ParserAction GetNextGoldAction(const ParserState &state) const override {
    if (!state.EndOfInput()) {
      return ShiftAction(TransitionState(state).GoldTag(state.Next()));
    }
    return ShiftAction(0);
  }

  // Checks if the action is allowed in a given parser state.
  bool IsAllowedAction(ParserAction action,
                       const ParserState &state) const override {
    return !state.EndOfInput();
  }

  // Makes a shift by pushing the next input token on the stack and moving to
  // the next position.
  void PerformActionWithoutHistory(ParserAction action,
                                   ParserState *state) const override {
    DCHECK(!state->EndOfInput());
    if (!state->EndOfInput()) {
      MutableTransitionState(state)->SetTag(state->Next(), action);
      state->Push(state->Next());
      state->Advance();
    }
  }

  // We are in a final state when we reached the end of the input.
  bool IsFinalState(const ParserState &state) const override {
    return state.EndOfInput();
  }

  // Returns a string representation of a parser action.
  string ActionAsString(ParserAction action,
                        const ParserState &state) const override {
    return tensorflow::strings::StrCat(
        "SHIFT(", label_set_->Lookup(action).ShortDebugString(), ")");
  }

  // No state is deterministic in this transition system.
  bool IsDeterministicState(const ParserState &state) const override {
    return false;
  }

  // Returns a new transition state to be used to enhance the parser state.
  ParserTransitionState *NewTransitionState(bool training_mode) const override {
    return new MorphologyTransitionState(label_set_);
  }

  // Downcasts the const ParserTransitionState in ParserState to a const
  // MorphologyTransitionState.
  static const MorphologyTransitionState &TransitionState(
      const ParserState &state) {
    return *static_cast<const MorphologyTransitionState *>(
        state.transition_state());
  }

  // Downcasts the ParserTransitionState in ParserState to a
  // MorphologyTransitionState.
  static MorphologyTransitionState *MutableTransitionState(ParserState *state) {
    return static_cast<MorphologyTransitionState *>(
        state->mutable_transition_state());
  }

  // Label set used for conversions between integer and proto representations
  // of morphology labels.  Owned through SharedStore.  Initialized to
  // nullptr so that the destructor does not release an uninitialized pointer
  // if this object is destroyed before Init() runs (assumes
  // SharedStore::Release tolerates nullptr -- confirm in shared_store).
  // The unused member `TaskInput *input_label_set_` was removed.
  const MorphologyLabelSet *label_set_ = nullptr;
};

REGISTER_TRANSITION_SYSTEM("morpher", MorphologyTransitionSystem);
// Feature function for retrieving the tag assigned to a token by the tagger
// transition system.
class PredictedMorphTagFeatureFunction : public ParserIndexFeatureFunction {
public:
PredictedMorphTagFeatureFunction() {}
// Determines tag map location.
void Setup(TaskContext *context) override {
context->GetInput("morph-label-set", "recordio", "token-morphology");
}
// Reads tag map.
void Init(TaskContext *context) override {
const string fname =
TaskContext::InputFile(*context->GetInput("morph-label-set"));
label_set_ = SharedStore::Get<MorphologyLabelSet>(fname, fname);
set_feature_type(new FullLabelFeatureType(name(), label_set_));
}
// Gets the MorphologyTransitionState from the parser state and reads the
// assigned
// tag at the focus index. Returns -1 if the focus is not within the sentence.
FeatureValue Compute(const WorkspaceSet &workspaces, const ParserState &state,
int focus, const FeatureVector *result) const override {
if (focus < 0 || focus >= state.sentence().token_size()) return -1;
return static_cast<const MorphologyTransitionState *>(
state.transition_state())
->Tag(focus);
}
private:
// Tag map used for conversions between integer and string representations
// part of speech tags. Owned through SharedStore.
const MorphologyLabelSet *label_set_;
TF_DISALLOW_COPY_AND_ASSIGN(PredictedMorphTagFeatureFunction);
};
REGISTER_PARSER_IDX_FEATURE_FUNCTION("pred-morph-tag",
PredictedMorphTagFeatureFunction);
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/morphology_label_set.h"
namespace syntaxnet {
const char MorphologyLabelSet::kSeparator[] = "\t";
int MorphologyLabelSet::Add(const TokenMorphology &morph) {
string repr = StringForMatch(morph);
auto it = fast_lookup_.find(repr);
if (it != fast_lookup_.end()) return it->second;
fast_lookup_[repr] = label_set_.size();
label_set_.push_back(morph);
return label_set_.size() - 1;
}
// Look up an existing TokenMorphology. If it is not present, return -1.
int MorphologyLabelSet::LookupExisting(const TokenMorphology &morph) const {
string repr = StringForMatch(morph);
auto it = fast_lookup_.find(repr);
if (it != fast_lookup_.end()) return it->second;
return -1;
}
// Return the TokenMorphology at position i. The input i should be in the
// range 0..Size() - 1; out-of-range values are a fatal error (CHECK).
const TokenMorphology &MorphologyLabelSet::Lookup(int i) const {
  CHECK_GE(i, 0);
  CHECK_LT(i, label_set_.size());
  return label_set_[i];
}
// Deserializes the label set from a record file written by Write().
void MorphologyLabelSet::Read(const string &filename) {
  ProtoRecordReader reader(filename);
  Read(&reader);
}

// Reads TokenMorphology records until EOF; a duplicate record is a fatal
// error.
void MorphologyLabelSet::Read(ProtoRecordReader *reader) {
  TokenMorphology morph;
  while (reader->Read(&morph).ok()) {
    CHECK_EQ(-1, LookupExisting(morph));
    Add(morph);
  }
}

// Serializes the label set to a record file readable by Read().
void MorphologyLabelSet::Write(const string &filename) const {
  ProtoRecordWriter writer(filename);
  Write(&writer);
}

// Writes every TokenMorphology record in index order.
void MorphologyLabelSet::Write(ProtoRecordWriter *writer) const {
  for (const TokenMorphology &morph : label_set_) {
    writer->Write(morph);
  }
}
// Builds the canonical string key for |morph|: each attribute is rendered as
// "name<TAB>value", the renderings are sorted, and then joined with a tab.
// This key is used by Add()/LookupExisting() as the equality test between
// TokenMorphology protos.
string MorphologyLabelSet::StringForMatch(const TokenMorphology &morph) const {
  vector<string> parts;
  parts.reserve(morph.attribute_size());
  for (const auto &attribute : morph.attribute()) {
    parts.push_back(tensorflow::strings::StrCat(attribute.name(), kSeparator,
                                                attribute.value()));
  }
  std::sort(parts.begin(), parts.end());
  return utils::Join(parts, kSeparator);
}
// Renders a feature value as "name1:value1,name2:value2,..." with the
// attribute renderings in sorted order.
string FullLabelFeatureType::GetFeatureValueName(FeatureValue value) const {
  const TokenMorphology &morph = label_set_->Lookup(value);
  vector<string> parts;
  for (const auto &attribute : morph.attribute()) {
    parts.push_back(
        tensorflow::strings::StrCat(attribute.name(), ":", attribute.value()));
  }
  std::sort(parts.begin(), parts.end());
  return utils::Join(parts, ",");
}
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// A class to store the set of possible TokenMorphology objects. This includes
// lookup, iteration and serialization.
#ifndef SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
#define SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
#include <unordered_map>
#include <string>
#include <vector>
#include "syntaxnet/proto_io.h"
#include "syntaxnet/sentence.pb.h"
namespace syntaxnet {
class MorphologyLabelSet {
 public:
  // Initializes as an empty morphology set.
  MorphologyLabelSet() {}

  // Initializes by reading the given file, which has been saved by Write().
  // This makes using the shared store easier.
  explicit MorphologyLabelSet(const string &fname) { Read(fname); }

  // Adds a TokenMorphology to the set if it is not present. In any case,
  // returns its position in the list. Note: This is slow, and should not be
  // called outside of training or init.
  int Add(const TokenMorphology &morph);

  // Looks up an existing TokenMorphology. If it is not present, returns -1.
  // Note: This is slow, and should not be called outside of the training
  // workflow or init.
  int LookupExisting(const TokenMorphology &morph) const;

  // Returns the TokenMorphology at position i. The input i should be in the
  // range 0..Size(). Note: this will be called at inference time and needs to
  // be kept fast.
  const TokenMorphology &Lookup(int i) const;

  // Returns the number of elements.
  int Size() const { return label_set_.size(); }

  // Deserialization and serialization.
  void Read(const string &filename);
  void Write(const string &filename) const;

 private:
  // Returns a canonical string key for |morph|, used to implement
  // order-independent equality of attribute sets.
  string StringForMatch(const TokenMorphology &morph) const;

  // Deserialization and serialization implementation.
  void Read(ProtoRecordReader *reader);
  void Write(ProtoRecordWriter *writer) const;

  // List of all possible annotations. This is a unique list, where equality is
  // defined as follows:
  //
  //   a == b iff the set of attribute pairs (attribute, value) is identical.
  vector<TokenMorphology> label_set_;

  // Because protocol buffer equality is complicated, we implement our own
  // equality operator based on strings. This unordered_map allows us to do the
  // lookup more quickly.
  unordered_map<string, int> fast_lookup_;

  // A separator string that should not occur in any of the attribute names.
  // This should never be serialized, so that it can be changed in the code if
  // we change attribute names and it occurs in the new names.
  static const char kSeparator[];
};
// A feature type with one value for each complete morphological analysis
// (analogous to the fulltag analyzer).
class FullLabelFeatureType : public FeatureType {
 public:
  // Does not take ownership of |label_set|, which must outlive this object.
  FullLabelFeatureType(const string &name, const MorphologyLabelSet *label_set)
      : FeatureType(name), label_set_(label_set) {}
  ~FullLabelFeatureType() override {}
  // Converts a feature value to a name. We don't use StringForMatch, since the
  // goal of these are to be readable, even if they might occasionally be
  // non-unique.
  string GetFeatureValueName(FeatureValue value) const override;
  // Returns the size of the feature values domain, i.e. one value per
  // distinct morphological analysis in the label set.
  FeatureValue GetDomainSize() const override { return label_set_->Size(); }
 private:
  // Source of the value <-> TokenMorphology mapping. Not owned.
  const MorphologyLabelSet *label_set_ = nullptr;
};
} // namespace syntaxnet
#endif // SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/morphology_label_set.h"
#include "syntaxnet/sentence.pb.h"
#include <gmock/gmock.h>
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
// Common fixture: every test case starts from a fresh, empty label set.
class MorphologyLabelSetTest : public ::testing::Test {
 protected:
  // The set under test; default-constructed, so initially empty.
  MorphologyLabelSet label_set_;
};
// Test that Add and LookupExisting work as expected: duplicates (including
// attribute-order permutations) map to the same id, new analyses get new ids.
TEST_F(MorphologyLabelSetTest, AddLookupExisting) {
  TokenMorphology si1, si2;  // singular, imperative (attribute order differs)
  TokenMorphology pi;        // plural, imperative
  TokenMorphology six;       // singular, imperative with extra value
  TextFormat::ParseFromString(R"(
      attribute {name: "Number" value: "Singular"}
      attribute {name: "POS" value: "IMP"})",
                              &si1);
  TextFormat::ParseFromString(R"(
      attribute {name: "POS" value: "IMP"}
      attribute {name: "Number" value: "Singular"})",
                              &si2);
  TextFormat::ParseFromString(R"(
      attribute {name: "Number" value: "Plural"}
      attribute {name: "POS" value: "IMP"})",
                              &pi);
  // Fixed to match the variable's intent: singular + imperative + extra
  // attribute (the original text said "Plural", contradicting the comment).
  TextFormat::ParseFromString(R"(
      attribute {name: "Number" value: "Singular"}
      attribute {name: "POS" value: "IMP"}
      attribute {name: "x" value: "x"})",
                              &six);
  // Check LookupExisting returns -1 for non-existing entries.
  EXPECT_EQ(-1, label_set_.LookupExisting(si1));
  EXPECT_EQ(-1, label_set_.LookupExisting(si2));
  EXPECT_EQ(0, label_set_.Size());
  // Check that adding returns 0 (this is the only possibility given Size()).
  EXPECT_EQ(0, label_set_.Add(si1));
  EXPECT_EQ(0, label_set_.Add(si1));  // calling Add twice adds only once
  EXPECT_EQ(1, label_set_.Size());
  // Check that order of attributes does not matter.
  EXPECT_EQ(0, label_set_.LookupExisting(si2));
  // Check that un-added entries still are not present.
  EXPECT_EQ(-1, label_set_.LookupExisting(pi));
  EXPECT_EQ(-1, label_set_.LookupExisting(six));
  // Check that we can add them.
  EXPECT_EQ(1, label_set_.Add(pi));
  EXPECT_EQ(2, label_set_.Add(six));
  EXPECT_EQ(3, label_set_.Size());
}
// Test write and deserializing constructor: a set written to disk and loaded
// back through the file constructor preserves ids, membership, and size.
TEST_F(MorphologyLabelSetTest, Serialization) {
  TokenMorphology singular_imp;  // singular, imperative
  TokenMorphology plural_imp;    // plural, imperative
  TextFormat::ParseFromString(R"(
      attribute {name: "Number" value: "Singular"}
      attribute {name: "POS" value: "IMP"})",
                              &singular_imp);
  TextFormat::ParseFromString(R"(
      attribute {name: "Number" value: "Plural"}
      attribute {name: "POS" value: "IMP"})",
                              &plural_imp);
  EXPECT_EQ(0, label_set_.Add(singular_imp));
  EXPECT_EQ(1, label_set_.Add(plural_imp));
  // Round-trip: serialize to a temp file, then rebuild via the constructor.
  const string path =
      utils::JoinPath({tensorflow::testing::TmpDir(), "label-set"});
  label_set_.Write(path);
  MorphologyLabelSet restored(path);
  EXPECT_EQ(0, restored.LookupExisting(singular_imp));
  EXPECT_EQ(1, restored.LookupExisting(plural_imp));
  EXPECT_EQ(2, restored.Size());
}
} // namespace syntaxnet
......@@ -22,7 +22,6 @@ import time
import tensorflow as tf
from tensorflow.python.platform import gfile
from tensorflow.python.platform import tf_logging as logging
from syntaxnet import sentence_pb2
from syntaxnet import graph_builder
......
......@@ -166,6 +166,9 @@ REGISTER_PARSER_IDX_FEATURE_FUNCTION("label", LabelFeatureFunction);
typedef BasicParserSentenceFeatureFunction<Word> WordFeatureFunction;
REGISTER_PARSER_IDX_FEATURE_FUNCTION("word", WordFeatureFunction);
typedef BasicParserSentenceFeatureFunction<Char> CharFeatureFunction;
REGISTER_PARSER_IDX_FEATURE_FUNCTION("char", CharFeatureFunction);
typedef BasicParserSentenceFeatureFunction<Tag> TagFeatureFunction;
REGISTER_PARSER_IDX_FEATURE_FUNCTION("tag", TagFeatureFunction);
......@@ -175,6 +178,21 @@ REGISTER_PARSER_IDX_FEATURE_FUNCTION("digit", DigitFeatureFunction);
typedef BasicParserSentenceFeatureFunction<Hyphen> HyphenFeatureFunction;
REGISTER_PARSER_IDX_FEATURE_FUNCTION("hyphen", HyphenFeatureFunction);
typedef BasicParserSentenceFeatureFunction<Capitalization>
CapitalizationFeatureFunction;
REGISTER_PARSER_IDX_FEATURE_FUNCTION("capitalization",
CapitalizationFeatureFunction);
typedef BasicParserSentenceFeatureFunction<PunctuationAmount>
PunctuationAmountFeatureFunction;
REGISTER_PARSER_IDX_FEATURE_FUNCTION("punctuation-amount",
PunctuationAmountFeatureFunction);
typedef BasicParserSentenceFeatureFunction<Quote>
QuoteFeatureFunction;
REGISTER_PARSER_IDX_FEATURE_FUNCTION("quote",
QuoteFeatureFunction);
typedef BasicParserSentenceFeatureFunction<PrefixFeature> PrefixFeatureFunction;
REGISTER_PARSER_IDX_FEATURE_FUNCTION("prefix", PrefixFeatureFunction);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment