Commit 32ab5a58 authored by calberti, committed by Martin Wicke

Adding SyntaxNet to tensorflow/models (#63)

parent 148a15fb
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef SYNTAXNET_TASK_CONTEXT_H_
#define SYNTAXNET_TASK_CONTEXT_H_
#include <string>
#include <vector>
#include "syntaxnet/task_spec.pb.h"
#include "syntaxnet/utils.h"
namespace syntaxnet {
// A task context holds configuration information for a task. It is basically a
// wrapper around a TaskSpec protocol buffer.
class TaskContext {
public:
// Returns the underlying task specification protocol buffer for the context.
const TaskSpec &spec() const { return spec_; }
TaskSpec *mutable_spec() { return &spec_; }
// Returns a named input descriptor for the task. A new input is created if
// the task context does not already have an input with that name.
TaskInput *GetInput(const string &name);
TaskInput *GetInput(const string &name, const string &file_format,
const string &record_format);
// Sets a task parameter.
void SetParameter(const string &name, const string &value);
// Returns a task parameter. If the parameter is not in the task configuration,
// the (default) value of the corresponding command-line flag is returned.
string GetParameter(const string &name) const;
int GetIntParameter(const string &name) const;
int64 GetInt64Parameter(const string &name) const;
bool GetBoolParameter(const string &name) const;
double GetFloatParameter(const string &name) const;
// Returns a task parameter. If the parameter is not in the task configuration,
// the given default value is returned. Parameters retrieved using these
// methods don't need to be defined with a DEFINE_*() macro.
string Get(const string &name, const string &defval) const;
string Get(const string &name, const char *defval) const;
int Get(const string &name, int defval) const;
int64 Get(const string &name, int64 defval) const;
double Get(const string &name, double defval) const;
bool Get(const string &name, bool defval) const;
// Returns input file name for a single-file task input.
static string InputFile(const TaskInput &input);
// Returns true if task input supports the file and record format.
static bool Supports(const TaskInput &input, const string &file_format,
const string &record_format);
private:
// Underlying task specification protocol buffer.
TaskSpec spec_;
// Vector of parameters required by this task. These must be specified in the
// task rather than relying on default values.
vector<string> required_parameters_;
};
} // namespace syntaxnet
#endif  // SYNTAXNET_TASK_CONTEXT_H_
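For orientation, here is a minimal usage sketch of the API above. It is illustrative only: the parameter name 'beam_size' is hypothetical, and 'training-corpus' simply mirrors an input name used in the sample context later in this change.

#include "syntaxnet/task_context.h"

void TaskContextExample() {
  syntaxnet::TaskContext context;
  // Parameters are stored as name/value strings on the underlying TaskSpec.
  context.SetParameter("beam_size", "8");        // hypothetical parameter
  const int beam = context.Get("beam_size", 1);  // 1 is the fallback default
  // GetInput creates the named input if the context does not have one yet.
  syntaxnet::TaskInput *corpus =
      context.GetInput("training-corpus", "text", "conll-sentence");
  (void)beam;
  (void)corpus;
}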
// LINT: ALLOW_GROUPS
// Protocol buffer specifications for task configuration.
syntax = "proto2";
package syntaxnet;
// Task input descriptor.
message TaskInput {
// Name of input resource.
required string name = 1;
// Name of the stage responsible for creating this resource.
optional string creator = 2;
// File format for resource.
repeated string file_format = 3;
// Record format for resource.
repeated string record_format = 4;
// Is this resource multi-file?
optional bool multi_file = 5 [default = false];
// An input can consist of multiple file sets.
repeated group Part = 6 {
// File pattern for file set.
optional string file_pattern = 7;
// File format for file set.
optional string file_format = 8;
// Record format for file set.
optional string record_format = 9;
}
}
// Task output descriptor.
message TaskOutput {
// Name of output resource.
required string name = 1;
// File format for output resource.
optional string file_format = 2;
// Record format for output resource.
optional string record_format = 3;
// Number of shards in output. If this is non-zero, the output is sharded. If
// the number of shards is set to -1, the output is sharded but the number of
// shards is unknown. The files are then named 'base-*-of-*'.
optional int32 shards = 4 [default = 0];
// Base file name for output resource. If this is not set by the task
// component it is set to a default value by the workflow engine.
optional string file_base = 5;
// Optional extension added to the file name.
optional string file_extension = 6;
}
// A task specification is used for describing execution parameters.
message TaskSpec {
// Name of task.
optional string task_name = 1;
// Workflow task type.
optional string task_type = 2;
// Task parameters.
repeated group Parameter = 3 {
required string name = 4;
optional string value = 5;
}
// Task inputs.
repeated TaskInput input = 6;
// Task outputs.
repeated TaskOutput output = 7;
}
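The sample context later in this change is just this message written in protobuf text format. Below is an illustrative sketch of parsing such a spec, assuming the generated task_spec.pb.h header; note that a proto2 group is spelled with its capitalized name in text format but accessed through lower-case accessors:

#include "google/protobuf/text_format.h"
#include "syntaxnet/task_spec.pb.h"

bool ParseSpecExample() {
  const char *kSpecText =
      "task_name: 'example' "
      "Parameter { name: 'brain_parser_embedding_dims' value: '8;8;8' } "
      "input { name: 'training-corpus' record_format: 'conll-sentence' }";
  syntaxnet::TaskSpec spec;
  if (!google::protobuf::TextFormat::ParseFromString(kSpecText, &spec)) {
    return false;
  }
  // The Parameter group generates a nested message type, TaskSpec::Parameter.
  return spec.parameter(0).name() == "brain_parser_embedding_dims" &&
         spec.input(0).record_format(0) == "conll-sentence";
}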
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/term_frequency_map.h"
#include <stddef.h>
#include <algorithm>
#include <limits>
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/io/inputbuffer.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
namespace syntaxnet {
int TermFrequencyMap::Increment(const string &term) {
CHECK_EQ(term_index_.size(), term_data_.size());
const TermIndex::const_iterator it = term_index_.find(term);
if (it != term_index_.end()) {
// Increment the existing term.
pair<string, int64> &data = term_data_[it->second];
CHECK_EQ(term, data.first);
++(data.second);
return it->second;
} else {
// Add a new term.
const int index = term_index_.size();
CHECK_LT(index, std::numeric_limits<int32>::max()); // overflow
term_index_[term] = index;
term_data_.push_back(pair<string, int64>(term, 1));
return index;
}
}
void TermFrequencyMap::Clear() {
term_index_.clear();
term_data_.clear();
}
void TermFrequencyMap::Load(const string &filename, int min_frequency,
int max_num_terms) {
Clear();
// If max_num_terms is non-positive, replace it with INT_MAX.
if (max_num_terms <= 0) max_num_terms = std::numeric_limits<int>::max();
// Read the first line (total # of terms in the mapping).
tensorflow::RandomAccessFile *file;
TF_CHECK_OK(tensorflow::Env::Default()->NewRandomAccessFile(filename, &file));
static const int kInputBufferSize = 1 * 1024 * 1024; /* bytes */
tensorflow::io::InputBuffer input(file, kInputBufferSize);
string line;
TF_CHECK_OK(input.ReadLine(&line));
int32 total = -1;
CHECK(utils::ParseInt32(line.c_str(), &total));
CHECK_GE(total, 0);
// Read the mapping.
int64 last_frequency = -1;
for (int i = 0; i < total && i < max_num_terms; ++i) {
TF_CHECK_OK(input.ReadLine(&line));
vector<string> elements = utils::Split(line, ' ');
CHECK_EQ(2, elements.size());
CHECK(!elements[0].empty());
CHECK(!elements[1].empty());
int64 frequency = 0;
CHECK(utils::ParseInt64(elements[1].c_str(), &frequency));
CHECK_GT(frequency, 0);
const string &term = elements[0];
// Check frequency sorting (descending order).
if (i > 0) CHECK_GE(last_frequency, frequency);
last_frequency = frequency;
// Ignore low-frequency items.
if (frequency < min_frequency) continue;
// Check uniqueness of the mapped terms.
CHECK(term_index_.find(term) == term_index_.end())
<< "File " << filename << " has duplicate term: " << term;
// Assign the next available index.
const int index = term_index_.size();
term_index_[term] = index;
term_data_.push_back(pair<string, int64>(term, frequency));
}
CHECK_EQ(term_index_.size(), term_data_.size());
LOG(INFO) << "Loaded " << term_index_.size() << " terms from " << filename
          << ".";
delete file;
}
struct TermFrequencyMap::SortByFrequencyThenTerm {
// Sorts in descending order of frequency, breaking ties by ascending
// lexicographic order of the term.
bool operator()(const pair<string, int64> &a,
const pair<string, int64> &b) const {
return (a.second > b.second || (a.second == b.second && a.first < b.first));
}
};
void TermFrequencyMap::Save(const string &filename) const {
CHECK_EQ(term_index_.size(), term_data_.size());
// Copy and sort the term data.
vector<pair<string, int64>> sorted_data(term_data_);
std::sort(sorted_data.begin(), sorted_data.end(), SortByFrequencyThenTerm());
// Write the number of terms.
tensorflow::WritableFile *file;
TF_CHECK_OK(tensorflow::Env::Default()->NewWritableFile(filename, &file));
CHECK_LE(term_index_.size(), std::numeric_limits<int32>::max()); // overflow
const int32 num_terms = term_index_.size();
const string header = tensorflow::strings::StrCat(num_terms, "\n");
TF_CHECK_OK(file->Append(header));
// Write each term and frequency.
for (size_t i = 0; i < sorted_data.size(); ++i) {
if (i > 0) CHECK_GE(sorted_data[i - 1].second, sorted_data[i].second);
const string line = tensorflow::strings::StrCat(
sorted_data[i].first, " ", sorted_data[i].second, "\n");
TF_CHECK_OK(file->Append(line));
}
TF_CHECK_OK(file->Close()) << "for file " << filename;
LOG(INFO) << "Saved " << term_index_.size() << " terms to " << filename
<< ".";
delete file;
}
TagToCategoryMap::TagToCategoryMap(const string &filename) {
// Load the mapping.
tensorflow::RandomAccessFile *file;
TF_CHECK_OK(tensorflow::Env::Default()->NewRandomAccessFile(filename, &file));
static const int kInputBufferSize = 1 * 1024 * 1024; /* bytes */
tensorflow::io::InputBuffer input(file, kInputBufferSize);
string line;
while (input.ReadLine(&line).ok()) {
  // Skip blank lines rather than indexing into an empty split result.
  if (line.empty()) continue;
  vector<string> pair = utils::Split(line, '\t');
  CHECK_EQ(2, pair.size()) << line;
  tag_to_category_[pair[0]] = pair[1];
}
delete file;
}
// Returns the category associated with the given tag.
const string &TagToCategoryMap::GetCategory(const string &tag) const {
const auto it = tag_to_category_.find(tag);
CHECK(it != tag_to_category_.end()) << "No category found for tag " << tag;
return it->second;
}
void TagToCategoryMap::SetCategory(const string &tag, const string &category) {
const auto it = tag_to_category_.find(tag);
if (it != tag_to_category_.end()) {
CHECK_EQ(category, it->second)
<< "POS tag cannot be mapped to multiple coarse POS tags. "
<< "'" << tag << "' is mapped to: '" << category << "' and '"
<< it->second << "'";
} else {
tag_to_category_[tag] = category;
}
}
void TagToCategoryMap::Save(const string &filename) const {
// Write tag and category on each line.
tensorflow::WritableFile *file;
TF_CHECK_OK(tensorflow::Env::Default()->NewWritableFile(filename, &file));
for (const auto &pair : tag_to_category_) {
const string line =
tensorflow::strings::StrCat(pair.first, "\t", pair.second, "\n");
TF_CHECK_OK(file->Append(line));
}
TF_CHECK_OK(file->Close()) << "for file " << filename;
delete file;
}
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef SYNTAXNET_TERM_FREQUENCY_MAP_H_
#define SYNTAXNET_TERM_FREQUENCY_MAP_H_
#include <stddef.h>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "syntaxnet/utils.h"
namespace syntaxnet {
// A mapping from strings to frequencies with save and load functionality.
class TermFrequencyMap {
public:
// Creates an empty frequency map.
TermFrequencyMap() {}
// Creates a term frequency map by calling Load.
TermFrequencyMap(const string &file, int min_frequency, int max_num_terms) {
Load(file, min_frequency, max_num_terms);
}
// Returns the number of terms with positive frequency.
int Size() const { return term_index_.size(); }
// Returns the index associated with the given term. If the term does not
// exist, the unknown index is returned instead.
int LookupIndex(const string &term, int unknown) const {
const TermIndex::const_iterator it = term_index_.find(term);
return (it != term_index_.end() ? it->second : unknown);
}
// Returns the term associated with the given index.
const string &GetTerm(int index) const { return term_data_[index].first; }
// Increases the frequency of the given term by 1, creating a new entry if
// necessary, and returns the index of the term.
int Increment(const string &term);
// Clears all frequencies.
void Clear();
// Loads a frequency mapping from the given file, which must have been created
// by an earlier call to Save(). After loading, the term indices are
// guaranteed to be ordered in descending order of frequency (breaking ties
// arbitrarily). However, any new terms inserted after loading do not
// maintain this sorting invariant.
//
// Only loads terms with frequency >= min_frequency. If max_num_terms <= 0,
// then all qualifying terms are loaded; otherwise, max_num_terms terms with
// maximal frequency are loaded (breaking ties arbitrarily).
void Load(const string &filename, int min_frequency, int max_num_terms);
// Saves a frequency mapping to the given file.
void Save(const string &filename) const;
private:
// Hashtable for term-to-index mapping.
typedef std::unordered_map<string, int> TermIndex;
// Sorting functor for term data.
struct SortByFrequencyThenTerm;
// Mapping from terms to indices.
TermIndex term_index_;
// Mapping from indices to term and frequency.
vector<pair<string, int64>> term_data_;
TF_DISALLOW_COPY_AND_ASSIGN(TermFrequencyMap);
};
// A mapping from tags to categories.
class TagToCategoryMap {
public:
TagToCategoryMap() {}
~TagToCategoryMap() {}
// Loads a tag to category map from a text file.
explicit TagToCategoryMap(const string &filename);
// Sets the category for the given tag.
void SetCategory(const string &tag, const string &category);
// Returns the category associated with the given tag.
const string &GetCategory(const string &tag) const;
// Saves a tag to category map to the given file.
void Save(const string &filename) const;
private:
map<string, string> tag_to_category_;
TF_DISALLOW_COPY_AND_ASSIGN(TagToCategoryMap);
};
} // namespace syntaxnet
#endif  // SYNTAXNET_TERM_FREQUENCY_MAP_H_
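A minimal sketch of how the two classes above fit together; the path /tmp/word-map is hypothetical, and the CHECK macros are assumed to be available via syntaxnet/utils.h:

#include "syntaxnet/term_frequency_map.h"

void TermMapExample() {
  syntaxnet::TermFrequencyMap words;
  words.Increment("the");
  words.Increment("the");
  words.Increment("parser");
  words.Save("/tmp/word-map");  // hypothetical path
  // Reload, dropping terms seen fewer than two times. After Load, indices are
  // ordered by descending frequency, so "the" receives index 0.
  const syntaxnet::TermFrequencyMap frequent(
      "/tmp/word-map", /*min_frequency=*/2, /*max_num_terms=*/0);
  const int kUnknown = -1;
  CHECK_EQ(0, frequent.LookupIndex("the", kUnknown));
  CHECK_EQ(kUnknown, frequent.LookupIndex("parser", kUnknown));
}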
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// A program with a main that is suitable for unittests, including those
// that also define microbenchmarks. Based on whether the user specified
// the --benchmarks flag, which specifies which benchmarks to run,
// we will either run benchmarks or run the gtest tests in the program.
#include "tensorflow/core/platform/platform.h"
#include "tensorflow/core/platform/types.h"
#if defined(PLATFORM_GOOGLE) || defined(__ANDROID__)
// main() is supplied by gunit_main
#else
#include "gtest/gtest.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/platform/test_benchmark.h"
GTEST_API_ int main(int argc, char **argv) {
std::cout << "Running main() from test_main.cc\n";
testing::InitGoogleTest(&argc, argv);
for (int i = 1; i < argc; i++) {
if (tensorflow::StringPiece(argv[i]).starts_with("--benchmarks=")) {
const char *pattern = argv[i] + strlen("--benchmarks=");
tensorflow::testing::Benchmark::Run(pattern);
return 0;
}
}
return RUN_ALL_TESTS();
}
#endif
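For context, the --benchmarks= path above runs microbenchmarks registered with the BENCHMARK macro from test_benchmark.h. A minimal sketch with a hypothetical benchmark body:

#include <string>

#include "tensorflow/core/platform/test_benchmark.h"

// Runs when the binary is invoked with --benchmarks=BM_StringAppend (or
// --benchmarks=all); otherwise main() falls through to RUN_ALL_TESTS().
static void BM_StringAppend(int iters) {
  std::string s;
  for (int i = 0; i < iters; ++i) {
    s.append("x");
  }
}
BENCHMARK(BM_StringAppend);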
Parameter {
name: 'brain_parser_embedding_dims'
value: '8;8;8'
}
Parameter {
name: 'brain_parser_features'
value: 'input.token.word input(1).token.word input(2).token.word stack.token.word stack(1).token.word stack(2).token.word;input.tag input(1).tag input(2).tag stack.tag stack(1).tag stack(2).tag;stack.child(1).label stack.child(1).sibling(-1).label stack.child(-1).label stack.child(-1).sibling(1).label'
}
Parameter {
name: 'brain_parser_embedding_names'
value: 'words;tags;labels'
}
input {
name: 'training-corpus'
record_format: 'conll-sentence'
Part {
file_pattern: 'syntaxnet/testdata/mini-training-set'
}
}
input {
name: 'tuning-corpus'
record_format: 'conll-sentence'
Part {
file_pattern: 'syntaxnet/testdata/mini-training-set'
}
}
input {
name: 'parsed-tuning-corpus'
creator: 'brain_parser/greedy'
record_format: 'conll-sentence'
}
input {
name: 'label-map'
file_format: 'text'
Part {
file_pattern: 'OUTPATH/label-map'
}
}
input {
name: 'word-map'
Part {
file_pattern: 'OUTPATH/word-map'
}
}
input {
name: 'lcword-map'
Part {
file_pattern: 'OUTPATH/lcword-map'
}
}
input {
name: 'tag-map'
Part {
file_pattern: 'OUTPATH/tag-map'
}
}
input {
name: 'category-map'
Part {
file_pattern: 'OUTPATH/category-map'
}
}
input {
name: 'prefix-table'
Part {
file_pattern: 'OUTPATH/prefix-table'
}
}
input {
name: 'suffix-table'
Part {
file_pattern: 'OUTPATH/suffix-table'
}
}
input {
name: 'tag-to-category'
Part {
file_pattern: 'OUTPATH/tag-to-category'
}
}
input {
name: 'stdout'
record_format: 'conll-sentence'
Part {
file_pattern: '-'
}
}
text : "I can not recall any disorder in currency markets since the 1974 guidelines were adopted ."
token: {
word : "I"
start : 0
end : 0
head : 3
tag : "PRP"
category: "PRON"
label : "nsubj"
break_level : SENTENCE_BREAK
}
token: {
word : "can"
start : 2
end : 4
head : 3
tag : "MD"
category: "VERB"
label : "aux"
}
token: {
word : "not"
start : 6
end : 8
head : 3
tag : "RB"
category: "ADV"
label : "neg"
}
token: {
word : "recall"
start : 10
end : 15
tag : "VB"
category: "VERB"
label : "ROOT"
}
token: {
word : "any"
start : 17
end : 19
head : 5
tag : "DT"
category: "DET"
label : "det"
}
token: {
word : "disorder"
start : 21
end : 28
head : 3
tag : "NN"
category: "NOUN"
label : "dobj"
}
token: {
word : "in"
start : 30
end : 31
head : 5
tag : "IN"
category: "ADP"
label : "prep"
}
token: {
word : "currency"
start : 33
end : 40
head : 8
tag : "NN"
category: "NOUN"
label : "nn"
}
token: {
word : "markets"
start : 42
end : 48
head : 6
tag : "NNS"
category: "NOUN"
label : "pobj"
}
token: {
word : "since"
start : 50
end : 54
head : 14
tag : "IN"
category: "ADP"
label : "mark"
}
token: {
word : "the"
start : 56
end : 58
head : 12
tag : "DT"
category: "DET"
label : "det"
}
token: {
word : "1974"
start : 60
end : 63
head : 12
tag : "CD"
category: "NUM"
label : "num"
}
token: {
word : "guidelines"
start : 65
end : 74
head : 14
tag : "NNS"
category: "NOUN"
label : "nsubjpass"
}
token: {
word : "were"
start : 76
end : 79
head : 14
tag : "VBD"
category: "VERB"
label : "auxpass"
}
token: {
word : "adopted"
start : 81
end : 87
head : 3
tag : "VBN"
category: "VERB"
label : "advcl"
}
token: {
word : "."
start : 89
end : 89
head : 3
tag : "."
category: "."
label : "p"
}
1 I _ PRP PRP _ 2 nsubj _ _
2 knew _ VBD VBD _ 0 ROOT _ _
3 I _ PRP PRP _ 5 nsubj _ _
4 could _ MD MD _ 5 aux _ _
5 do _ VB VB _ 2 ccomp _ _
6 it _ PRP PRP _ 5 dobj _ _
7 properly _ RB RB _ 5 advmod _ _
8 if _ IN IN _ 9 mark _ _
9 given _ VBN VBN _ 5 advcl _ _
10 the _ DT DT _ 12 det _ _
11 right _ JJ JJ _ 12 amod _ _
12 kind _ NN NN _ 9 dobj _ _
13 of _ IN IN _ 12 prep _ _
14 support _ NN NN _ 13 pobj _ _
15 . _ . . _ 2 punct _ _
1 The _ DT DT _ 2 det _ _
2 journey _ NN NN _ 8 nsubj _ _
3 through _ IN IN _ 2 prep _ _
4 deserts _ NNS NNS _ 3 pobj _ _
5 and _ CC CC _ 4 cc _ _
6 mountains _ NNS NNS _ 4 conj _ _
7 can _ MD MD _ 8 aux _ _
8 take _ VB VB _ 0 ROOT _ _
9 a _ DT DT _ 10 det _ _
10 month _ NN NN _ 8 tmod _ _
11 . _ . . _ 8 punct _ _
1 You _ PRP PRP _ 2 nsubj _ _
2 say _ VBP VBP _ 0 ROOT _ _
3 they _ PRP PRP _ 4 nsubj _ _
4 're _ VBP VBP _ 2 ccomp _ _
5 in _ IN IN _ 4 prep _ _
6 the _ DT DT _ 7 det _ _
7 pipeline _ NN NN _ 5 pobj _ _
8 ? _ . . _ 2 punct _ _
1 Border _ NNP NNP _ 5 nn _ _
2 police _ NN NN _ 5 nn _ _
3 commander _ NN NN _ 5 nn _ _
4 Abdul _ NNP NNP _ 5 nn _ _
5 Raziq _ NNP NNP _ 6 nsubj _ _
6 says _ VBZ VBZ _ 0 ROOT _ _
7 the _ DT DT _ 8 det _ _
8 drugs _ NNS NNS _ 10 nsubjpass _ _
9 were _ VBD VBD _ 10 auxpass _ _
10 found _ VBN VBN _ 6 ccomp _ _
11 in _ IN IN _ 10 prep _ _
12 the _ DT DT _ 13 det _ _
13 basement _ NN NN _ 11 pobj _ _
14 of _ IN IN _ 13 prep _ _
15 a _ DT DT _ 16 det _ _
16 compound _ NN NN _ 14 pobj _ _
17 in _ IN IN _ 16 prep _ _
18 Nawa _ NNP NNP _ 20 nn _ _
19 Kili _ NNP NNP _ 20 nn _ _
20 village _ NN NN _ 17 pobj _ _
21 . _ . . _ 6 punct _ _
1 Fourth _ JJ JJ _ 3 amod _ _
2 quarter _ NN NN _ 3 nn _ _
3 production _ NN NN _ 5 nsubjpass _ _
4 is _ VBZ VBZ _ 5 auxpass _ _
5 expected _ VBN VBN _ 0 ROOT _ _
6 to _ TO TO _ 7 aux _ _
7 increase _ VB VB _ 5 xcomp _ _
8 to _ TO TO _ 7 prep _ _
9 130,000 _ CD CD _ 10 num _ _
10 ounces _ NNS NNS _ 8 pobj _ _
11 . _ . . _ 5 punct _ _
1 Minor _ NNP NNP _ 2 nn _ _
2 scuffling _ NN NN _ 3 nsubj _ _
3 broke _ VBD VBD _ 0 ROOT _ _
4 out _ RP RP _ 3 prt _ _
5 as _ IN IN _ 7 mark _ _
6 officials _ NNS NNS _ 7 nsubj _ _
7 sought _ VBD VBD _ 3 advcl _ _
8 to _ TO TO _ 9 aux _ _
9 separate _ VB VB _ 7 xcomp _ _
10 the _ DT DT _ 11 det _ _
11 groups _ NNS NNS _ 9 dobj _ _
12 . _ . . _ 3 punct _ _
1 According _ VBG VBG _ 18 prep _ _
2 to _ TO TO _ 1 pcomp _ _
3 Facebook _ NNP NNP _ 2 pobj _ _
4 , _ , , _ 3 punct _ _
5 which _ WDT WDT _ 7 nsubjpass _ _
6 is _ VBZ VBZ _ 7 auxpass _ _
7 based _ VBN VBN _ 3 rcmod _ _
8 in _ IN IN _ 7 prep _ _
9 Palo _ NNP NNP _ 10 nn _ _
10 Alto _ NNP NNP _ 8 pobj _ _
11 , _ , , _ 10 punct _ _
12 Calif _ NNP NNP _ 10 appos _ _
13 . _ . . _ 12 punct _ _
14 , _ , , _ 18 punct _ _
15 the _ DT DT _ 17 det _ _
16 Web _ NNP NNP _ 17 nn _ _
17 site _ NN NN _ 18 nsubj _ _
18 has _ VBZ VBZ _ 0 ROOT _ _
19 about _ IN IN _ 21 quantmod _ _
20 47 _ CD CD _ 21 number _ _
21 million _ CD CD _ 23 num _ _
22 active _ JJ JJ _ 23 amod _ _
23 users _ NNS NNS _ 18 dobj _ _
24 . _ . . _ 18 punct _ _
1 Among _ IN IN _ 10 prep _ _
2 those _ DT DT _ 1 pobj _ _
3 leaning _ VBG VBG _ 2 partmod _ _
4 toward _ IN IN _ 3 prep _ _
5 McDonnell _ NNP NNP _ 4 pobj _ _
6 , _ , , _ 10 punct _ _
7 however _ RB RB _ 10 advmod _ _
8 , _ , , _ 10 punct _ _
9 some _ DT DT _ 10 nsubj _ _
10 took _ VBD VBD _ 0 ROOT _ _
11 a _ DT DT _ 14 det _ _
12 more _ RBR RBR _ 13 advmod _ _
13 nuanced _ JJ JJ _ 14 amod _ _
14 view _ NN NN _ 10 dobj _ _
15 , _ , , _ 10 punct _ _
16 allowing _ VBG VBG _ 10 partmod _ _
17 for _ IN IN _ 16 prep _ _
18 the _ DT DT _ 19 det _ _
19 possibility _ NN NN _ 17 pobj _ _
20 that _ IN IN _ 24 mark _ _
21 McDonnell _ NNP NNP _ 24 nsubj _ _
22 could _ MD MD _ 24 aux _ _
23 have _ VB VB _ 24 aux _ _
24 changed _ VBN VBN _ 19 ccomp _ _
25 his _ PRP$ PRP$ _ 26 poss _ _
26 mind _ NN NN _ 24 dobj _ _
27 in _ IN IN _ 24 prep _ _
28 the _ DT DT _ 31 det _ _
29 intervening _ VBG VBG _ 31 amod _ _
30 20 _ CD CD _ 31 num _ _
31 years _ NNS NNS _ 27 pobj _ _
32 or _ CC CC _ 24 cc _ _
33 that _ IN IN _ 39 mark _ _
34 his _ PRP$ PRP$ _ 36 poss _ _
35 personal _ JJ JJ _ 36 amod _ _
36 convictions _ NNS NNS _ 39 nsubj _ _
37 would _ MD MD _ 39 aux _ _
38 not _ RB RB _ 39 neg _ _
39 interfere _ VB VB _ 24 conj _ _
40 with _ IN IN _ 39 prep _ _
41 his _ PRP$ PRP$ _ 42 poss _ _
42 governing _ NN NN _ 40 pobj _ _
43 . _ . . _ 10 punct _ _
1 Both _ DT DT _ 2 det _ _
2 teams _ NNS NNS _ 3 nsubj _ _
3 have _ VBP VBP _ 0 ROOT _ _
4 97 _ CD CD _ 5 num _ _
5 points _ NNS NNS _ 3 dobj _ _
6 . _ . . _ 3 punct _ _
1 Star-Banner _ NNP NNP _ 2 nsubj _ _
2 reported _ VBD VBD _ 0 ROOT _ _
3 Tuesday _ NNP NNP _ 2 tmod _ _
4 . _ . . _ 2 punct _ _
1 Harry _ NNP NNP _ 2 nn _ _
2 Redknapp _ NNP NNP _ 9 nsubj _ _
3 , _ , , _ 2 punct _ _
4 the _ DT DT _ 6 det _ _
5 Tottenham _ NNP NNP _ 6 nn _ _
6 manager _ NN NN _ 2 appos _ _
7 , _ , , _ 2 punct _ _
8 was _ VBD VBD _ 9 aux _ _
9 disbelieving _ VBG VBG _ 0 ROOT _ _
10 that _ IN IN _ 18 mark _ _
11 Lennon _ NNP NNP _ 13 poss _ _
12 's _ POS POS _ 11 possessive _ _
13 delivery _ NN NN _ 18 nsubj _ _
14 could _ MD MD _ 18 aux _ _
15 be _ VB VB _ 18 cop _ _
16 so _ RB RB _ 18 advmod _ _
17 radically _ RB RB _ 18 advmod _ _
18 different _ JJ JJ _ 9 ccomp _ _
19 . _ . . _ 9 punct _ _
1 The _ DT DT _ 3 det _ _
2 US _ NNP NNP _ 3 nn _ _
3 uptick _ NN NN _ 4 nsubj _ _
4 mirrors _ VBZ VBZ _ 0 ROOT _ _
5 an _ DT DT _ 6 det _ _
6 improvement _ NN NN _ 4 dobj _ _
7 in _ IN IN _ 6 prep _ _
8 many _ JJ JJ _ 10 amod _ _
9 other _ JJ JJ _ 10 amod _ _
10 parts _ NNS NNS _ 7 pobj _ _
11 of _ IN IN _ 10 prep _ _
12 the _ DT DT _ 13 det _ _
13 world _ NN NN _ 11 pobj _ _
14 . _ . . _ 4 punct _ _
1 Although _ IN IN _ 4 mark _ _
2 satellite _ NN NN _ 3 nn _ _
3 television _ NN NN _ 4 nsubj _ _
4 has _ VBZ VBZ _ 17 advcl _ _
5 the _ DT DT _ 6 det _ _
6 capacity _ NN NN _ 4 dobj _ _
7 for _ IN IN _ 6 prep _ _
8 hundreds _ NNS NNS _ 7 pobj _ _
9 of _ IN IN _ 8 prep _ _
10 conventional _ JJ JJ _ 12 amod _ _
11 television _ NN NN _ 12 nn _ _
12 channels _ NNS NNS _ 9 pobj _ _
13 , _ , , _ 17 punct _ _
14 it _ PRP PRP _ 17 nsubj _ _
15 is _ VBZ VBZ _ 17 cop _ _
16 less _ RBR RBR _ 17 advmod _ _
17 able _ JJ JJ _ 0 ROOT _ _
18 to _ TO TO _ 19 aux _ _
19 provide _ VB VB _ 17 xcomp _ _
20 video-on-demand _ NN NN _ 19 dobj _ _
21 . _ . . _ 17 punct _ _
1 Our _ PRP$ PRP$ _ 3 poss _ _
2 comfortable _ JJ JJ _ 3 amod _ _
3 room _ NN NN _ 4 nsubj _ _
4 feels _ VBZ VBZ _ 0 ROOT _ _
5 on _ IN IN _ 4 prep _ _
6 the _ DT DT _ 8 det _ _
7 small _ JJ JJ _ 8 amod _ _
8 side _ NN NN _ 5 pobj _ _
9 , _ , , _ 4 punct _ _
10 mainly _ RB RB _ 17 advmod _ _
11 because _ IN IN _ 17 mark _ _
12 too _ RB RB _ 13 advmod _ _
13 much _ JJ JJ _ 14 amod _ _
14 furniture _ NN NN _ 17 nsubjpass _ _
15 has _ VBZ VBZ _ 17 aux _ _
16 been _ VBN VBN _ 17 auxpass _ _
17 shoehorned _ VBN VBN _ 4 advcl _ _
18 into _ IN IN _ 17 prep _ _
19 it _ PRP PRP _ 18 pobj _ _
20 . _ . . _ 4 punct _ _
1 They _ PRP PRP _ 3 nsubj _ _
2 also _ RB RB _ 3 advmod _ _
3 require _ VBP VBP _ 0 ROOT _ _
4 a _ DT DT _ 6 det _ _
5 slower _ JJR JJR _ 6 amod _ _
6 inhale _ NN NN _ 3 dobj _ _
7 . _ . . _ 3 punct _ _
1 Her _ PRP$ PRP$ _ 2 poss _ _
2 ring _ NN NN _ 4 nsubjpass _ _
3 was _ VBD VBD _ 4 auxpass _ _
4 found _ VBN VBN _ 0 ROOT _ _
5 in _ IN IN _ 4 prep _ _
6 the _ DT DT _ 7 det _ _
7 car _ NN NN _ 5 pobj _ _
8 . _ . . _ 4 punct _ _
1 In _ IN IN _ 12 prep _ _
2 the _ DT DT _ 4 det _ _
3 past _ JJ JJ _ 4 amod _ _
4 year _ NN NN _ 1 pobj _ _
5 , _ , , _ 7 punct _ _
6 Forsythe _ NNP NNP _ 7 nsubj _ _
7 said _ VBD VBD _ 12 parataxis _ _
8 , _ , , _ 7 punct _ _
9 the _ DT DT _ 11 det _ _
10 Salvation _ NNP NNP _ 11 nn _ _
11 Army _ NNP NNP _ 12 nsubj _ _
12 provided _ VBD VBD _ 0 ROOT _ _
13 rental _ JJ JJ _ 14 amod _ _
14 subsidies _ NNS NNS _ 12 dobj _ _
15 that _ WDT WDT _ 16 nsubj _ _
16 prevented _ VBD VBD _ 14 rcmod _ _
17 1,172 _ CD CD _ 18 num _ _
18 evictions _ NNS NNS _ 16 dobj _ _
19 . _ . . _ 12 punct _ _
1 A _ DT DT _ 3 det _ _
2 23-year-old _ JJ JJ _ 3 amod _ _
3 man _ NN NN _ 6 nsubjpass _ _
4 has _ VBZ VBZ _ 6 aux _ _
5 been _ VBN VBN _ 6 auxpass _ _
6 jailed _ VBN VBN _ 0 ROOT _ _
7 for _ IN IN _ 6 prep _ _
8 two _ CD CD _ 9 num _ _
9 years _ NNS NNS _ 7 pobj _ _
10 after _ IN IN _ 6 prep _ _
11 pleading _ VBG VBG _ 10 pcomp _ _
12 guilty _ JJ JJ _ 11 acomp _ _
13 to _ TO TO _ 12 prep _ _
14 the _ DT DT _ 15 det _ _
15 manslaughter _ NN NN _ 13 pobj _ _
16 of _ IN IN _ 15 prep _ _
17 a _ DT DT _ 18 det _ _
18 man _ NN NN _ 16 pobj _ _
19 in _ IN IN _ 18 prep _ _
20 Hertfordshire _ NNP NNP _ 19 pobj _ _
21 . _ . . _ 6 punct _ _
1 But _ CC CC _ 10 cc _ _
2 the _ DT DT _ 3 det _ _
3 sustainability _ NN NN _ 10 nsubj _ _
4 of _ IN IN _ 3 prep _ _
5 any _ DT DT _ 7 det _ _
6 post-bubble _ JJ JJ _ 7 amod _ _
7 recovery _ NN NN _ 4 pobj _ _
8 is _ VBZ VBZ _ 10 cop _ _
9 always _ RB RB _ 10 advmod _ _
10 dubious _ JJ JJ _ 0 ROOT _ _
11 . _ . . _ 10 punct _ _
1 They _ PRP PRP _ 2 nsubj _ _
2 spoke _ VBD VBD _ 0 ROOT _ _
3 to _ TO TO _ 2 prep _ _
4 the _ DT DT _ 5 det _ _
5 BBC _ NNP NNP _ 8 poss _ _
6 's _ POS POS _ 5 possessive _ _
7 Artyom _ NNP NNP _ 8 nn _ _
8 Liss _ NNP NNP _ 3 pobj _ _
9 . _ . . _ 2 punct _ _
1 That _ DT DT _ 2 nsubj _ _
2 includes _ VBZ VBZ _ 0 ROOT _ _
3 me _ PRP PRP _ 2 dobj _ _
4 , _ , , _ 2 punct _ _
5 too _ RB RB _ 2 advmod _ _
6 . _ . . _ 2 punct _ _
1 The _ DT DT _ 2 det _ _
2 name _ NN NN _ 9 nsubj _ _
3 of _ IN IN _ 2 prep _ _
4 Rachel _ NNP NNP _ 5 nn _ _
5 Harris _ NNP NNP _ 8 poss _ _
6 ' _ POS POS _ 5 possessive _ _
7 Web _ NNP NNP _ 8 nn _ _
8 site _ NN NN _ 3 pobj _ _
9 says _ VBZ VBZ _ 0 ROOT _ _
10 it _ PRP PRP _ 9 dobj _ _
11 all _ DT DT _ 10 det _ _
12 . _ . . _ 9 punct _ _
1 If _ IN IN _ 3 mark _ _
2 you _ PRP PRP _ 3 nsubj _ _
3 prefer _ VBP VBP _ 19 advcl _ _
4 to _ TO TO _ 5 aux _ _
5 maximize _ VB VB _ 3 xcomp _ _
6 your _ PRP$ PRP$ _ 7 poss _ _
7 travel _ NN NN _ 5 dobj _ _
8 with _ IN IN _ 5 prep _ _
9 shorter _ JJR JJR _ 10 amod _ _
10 stays _ NNS NNS _ 8 pobj _ _
11 in _ IN IN _ 10 prep _ _
12 more _ JJR JJR _ 13 mwe _ _
13 than _ IN IN _ 14 quantmod _ _
14 one _ CD CD _ 15 num _ _
15 destination _ NN NN _ 11 pobj _ _
16 , _ , , _ 19 punct _ _
17 you _ PRP PRP _ 19 nsubj _ _
18 may _ MD MD _ 19 aux _ _
19 like _ VB VB _ 0 ROOT _ _
20 this _ DT DT _ 22 det _ _
21 multi-country _ JJ JJ _ 22 amod _ _
22 jaunt _ NN NN _ 19 dobj _ _
23 from _ IN IN _ 22 prep _ _
24 Virgin _ NNP NNP _ 25 nn _ _
25 Vacations _ NNPS NNPS _ 23 pobj _ _
26 . _ . . _ 19 punct _ _
1 The _ DT DT _ 3 det _ _
2 Afghan _ JJ JJ _ 3 amod _ _
3 government _ NN NN _ 6 nsubj _ _
4 also _ RB RB _ 6 advmod _ _
5 is _ VBZ VBZ _ 6 aux _ _
6 trying _ VBG VBG _ 0 ROOT _ _
7 to _ TO TO _ 8 aux _ _
8 persuade _ VB VB _ 6 xcomp _ _
9 farmers _ NNS NNS _ 8 dobj _ _
10 to _ TO TO _ 11 aux _ _
11 stop _ VB VB _ 8 xcomp _ _
12 growing _ VBG VBG _ 13 amod _ _
13 poppy _ NN NN _ 11 dobj _ _
14 and _ CC CC _ 11 cc _ _
15 shift _ VB VB _ 11 conj _ _
16 to _ TO TO _ 15 prep _ _
17 other _ JJ JJ _ 18 amod _ _
18 crops _ NNS NNS _ 16 pobj _ _
19 , _ , , _ 18 punct _ _
20 particularly _ RB RB _ 18 advmod _ _
21 wheat _ NN NN _ 18 dep _ _
22 . _ . . _ 6 punct _ _
1 The _ DT DT _ 3 det _ _
2 most _ RBS RBS _ 3 advmod _ _
3 striking _ JJ JJ _ 6 nsubj _ _
4 is _ VBZ VBZ _ 6 cop _ _
5 the _ DT DT _ 6 det _ _
6 differences _ NNS NNS _ 0 ROOT _ _
7 over _ IN IN _ 6 prep _ _
8 what _ WP WP _ 10 nsubj _ _
9 to _ TO TO _ 10 aux _ _
10 do _ VB VB _ 7 pcomp _ _
11 with _ IN IN _ 10 prep _ _
12 the _ DT DT _ 13 det _ _
13 banks _ NNS NNS _ 11 pobj _ _
14 . _ . . _ 6 punct _ _
1 Philo _ NNP NNP _ 4 nsubj _ _
2 did _ VBD VBD _ 4 aux _ _
3 not _ RB RB _ 4 neg _ _
4 mention _ VB VB _ 0 ROOT _ _
5 any _ DT DT _ 6 det _ _
6 name _ NN NN _ 4 dobj _ _
7 , _ , , _ 6 punct _ _
8 place _ NN NN _ 6 conj _ _
9 , _ , , _ 6 punct _ _
10 date _ NN NN _ 6 conj _ _
11 , _ , , _ 6 punct _ _
12 or _ CC CC _ 6 cc _ _
13 historical _ JJ JJ _ 14 amod _ _
14 circumstances _ NNS NNS _ 6 conj _ _
15 , _ , , _ 6 punct _ _
16 or _ CC CC _ 6 cc _ _
17 any _ DT DT _ 18 det _ _
18 background _ NN NN _ 6 conj _ _
19 to _ TO TO _ 18 prep _ _
20 the _ DT DT _ 21 det _ _
21 consolidation _ NN NN _ 19 pobj _ _
22 of _ IN IN _ 21 prep _ _
23 this _ DT DT _ 24 det _ _
24 group _ NN NN _ 22 pobj _ _
25 . _ . . _ 4 punct _ _
1 Created _ VBN VBN _ 8 partmod _ _
2 in _ IN IN _ 1 prep _ _
3 1996 _ CD CD _ 2 pobj _ _
4 , _ , , _ 8 punct _ _
5 the _ DT DT _ 6 det _ _
6 payments _ NNS NNS _ 8 nsubjpass _ _
7 are _ VBP VBP _ 8 auxpass _ _
8 based _ VBN VBN _ 0 ROOT _ _
9 on _ IN IN _ 8 prep _ _
10 a _ DT DT _ 11 det _ _
11 farm _ NN NN _ 14 poss _ _
12 's _ POS POS _ 11 possessive _ _
13 past _ JJ JJ _ 14 amod _ _
14 production _ NN NN _ 9 pobj _ _
15 and _ CC CC _ 8 cc _ _
16 are _ VBP VBP _ 17 auxpass _ _
17 issued _ VBN VBN _ 8 conj _ _
18 regardless _ RB RB _ 17 advmod _ _
19 of _ IN IN _ 18 prep _ _
20 current _ JJ JJ _ 21 amod _ _
21 production _ NN NN _ 19 pobj _ _
22 or _ CC CC _ 21 cc _ _
23 market _ NN NN _ 24 nn _ _
24 prices _ NNS NNS _ 21 conj _ _
25 . _ . . _ 8 punct _ _
1 Prosecutors _ NNS NNS _ 2 nsubj _ _
2 said _ VBD VBD _ 0 ROOT _ _
3 some _ DT DT _ 16 nsubjpass _ _
4 of _ IN IN _ 3 prep _ _
5 the _ DT DT _ 6 det _ _
6 billions _ NNS NNS _ 4 pobj _ _
7 of _ IN IN _ 6 prep _ _
8 dollars _ NNS NNS _ 7 pobj _ _
9 transferred _ VBN VBN _ 8 partmod _ _
10 from _ IN IN _ 9 prep _ _
11 Mexican _ JJ JJ _ 14 amod _ _
12 money _ NN NN _ 14 nn _ _
13 exchange _ NN NN _ 14 nn _ _
14 houses _ NNS NNS _ 10 pobj _ _
15 was _ VBD VBD _ 16 auxpass _ _
16 used _ VBN VBN _ 2 ccomp _ _
17 to _ TO TO _ 18 aux _ _
18 buy _ VB VB _ 16 xcomp _ _
19 planes _ NNS NNS _ 18 dobj _ _
20 for _ IN IN _ 18 prep _ _
21 drug _ NN NN _ 22 nn _ _
22 traffickers _ NNS NNS _ 20 pobj _ _
23 . _ . . _ 2 punct _ _
1 Margaret _ NNP NNP _ 2 nn _ _
2 Rutherford _ NNP NNP _ 11 nsubj _ _
3 , _ , , _ 2 punct _ _
4 chairwoman _ NN NN _ 2 appos _ _
5 of _ IN IN _ 4 prep _ _
6 Loxton _ NNP NNP _ 9 poss _ _
7 's _ POS POS _ 6 possessive _ _
8 parish _ JJ JJ _ 9 amod _ _
9 council _ NN NN _ 5 pobj _ _
10 , _ , , _ 2 punct _ _
11 told _ VBD VBD _ 0 ROOT _ _
12 BBC _ NNP NNP _ 13 nn _ _
13 Somerset _ NNP NNP _ 11 dobj _ _
14 that _ IN IN _ 16 mark _ _
15 she _ PRP PRP _ 16 nsubj _ _
16 hoped _ VBD VBD _ 11 ccomp _ _
17 the _ DT DT _ 18 det _ _
18 lines _ NNS NNS _ 21 nsubjpass _ _
19 could _ MD MD _ 21 aux _ _
20 be _ VB VB _ 21 auxpass _ _
21 sited _ VBN VBN _ 16 ccomp _ _
22 underground _ RB RB _ 21 advmod _ _
23 . _ . . _ 11 punct _ _
1 Amid _ IN IN _ 3 mark _ _
2 US _ PRP PRP _ 3 nsubj _ _
3 fears _ VBZ VBZ _ 16 advcl _ _
4 that _ IN IN _ 7 mark _ _
5 they _ PRP PRP _ 7 nsubj _ _
6 could _ MD MD _ 7 aux _ _
7 face _ VB VB _ 3 ccomp _ _
8 torture _ VB VB _ 7 dobj _ _
9 if _ IN IN _ 10 mark _ _
10 returned _ VBN VBN _ 7 advcl _ _
11 to _ TO TO _ 10 prep _ _
12 China _ NNP NNP _ 11 pobj _ _
13 , _ , , _ 16 punct _ _
14 five _ CD CD _ 16 nsubjpass _ _
15 were _ VBD VBD _ 16 auxpass _ _
16 released _ VBN VBN _ 0 ROOT _ _
17 to _ TO TO _ 16 prep _ _
18 Albania _ NNP NNP _ 17 pobj _ _
19 in _ IN IN _ 16 prep _ _
20 2006 _ CD CD _ 19 pobj _ _
21 , _ , , _ 16 punct _ _
22 and _ CC CC _ 16 cc _ _
23 four _ CD CD _ 25 nsubjpass _ _
24 were _ VBD VBD _ 25 auxpass _ _
25 resettled _ VBN VBN _ 16 conj _ _
26 in _ IN IN _ 25 prep _ _
27 Bermuda _ NNP NNP _ 26 pobj _ _
28 this _ DT DT _ 29 det _ _
29 year _ NN NN _ 25 tmod _ _
30 . _ . . _ 16 punct _ _
1 He _ PRP PRP _ 3 nsubj _ _
2 then _ RB RB _ 3 advmod _ _
3 provided _ VBD VBD _ 0 ROOT _ _
4 Marshal _ NNP NNP _ 5 nn _ _
5 McAvoy _ NNP NNP _ 8 poss _ _
6 's _ POS POS _ 5 possessive _ _
7 phone _ NN NN _ 8 nn _ _
8 number _ NN NN _ 3 dobj _ _
9 . _ . . _ 3 punct _ _
1 Tech _ NNP NNP _ 2 nn _ _
2 credits _ NNS NNS _ 5 nsubj _ _
3 are _ VBP VBP _ 5 cop _ _
4 just _ RB RB _ 5 advmod _ _
5 fine _ JJ JJ _ 0 ROOT _ _
6 for _ IN IN _ 5 prep _ _
7 what _ WP WP _ 12 nsubj _ _
8 essentially _ RB RB _ 12 advmod _ _
9 is _ VBZ VBZ _ 12 cop _ _
10 an _ DT DT _ 12 det _ _
11 un-reality _ JJ JJ _ 12 amod _ _
12 show _ NN NN _ 6 pcomp _ _
13 . _ . . _ 5 punct _ _
1 But _ CC CC _ 8 cc _ _
2 my _ PRP$ PRP$ _ 4 poss _ _
3 eldest _ JJS JJS _ 4 amod _ _
4 daughter _ NN NN _ 8 nsubj _ _
5 , _ , , _ 4 punct _ _
6 Donna _ NNP NNP _ 4 appos _ _
7 , _ , , _ 4 punct _ _
8 did _ VBD VBD _ 0 ROOT _ _
9 . _ . . _ 8 punct _ _
1 The _ DT DT _ 2 det _ _
2 department _ NN NN _ 4 nsubj _ _
3 has _ VBZ VBZ _ 4 aux _ _
4 spent _ VBN VBN _ 0 ROOT _ _
5 $ _ $ $ _ 4 dobj _ _
6 2.9 _ CD CD _ 7 number _ _
7 million _ CD CD _ 5 num _ _
8 on _ IN IN _ 4 prep _ _
9 the _ DT DT _ 11 det _ _
10 hot _ JJ JJ _ 11 amod _ _
11 line _ NN NN _ 8 pobj _ _
12 thus _ RB RB _ 13 advmod _ _
13 far _ RB RB _ 4 advmod _ _
14 . _ . . _ 4 punct _ _
1 Picoplatin _ NNP NNP _ 3 nsubjpass _ _
2 is _ VBZ VBZ _ 3 auxpass _ _
3 designed _ VBN VBN _ 0 ROOT _ _
4 to _ TO TO _ 5 aux _ _
5 overcome _ VB VB _ 3 xcomp _ _
6 platinum _ NN NN _ 7 nn _ _
7 resistance _ NN NN _ 5 dobj _ _
8 associated _ VBN VBN _ 7 partmod _ _
9 with _ IN IN _ 8 prep _ _
10 chemotherapy _ NN NN _ 9 pobj _ _
11 in _ IN IN _ 10 prep _ _
12 solid _ JJ JJ _ 13 amod _ _
13 tumors _ NNS NNS _ 11 pobj _ _
14 , _ , , _ 3 punct _ _
15 and _ CC CC _ 3 cc _ _
16 is _ VBZ VBZ _ 18 aux _ _
17 being _ VBG VBG _ 18 auxpass _ _
18 studied _ VBN VBN _ 3 conj _ _
19 in _ IN IN _ 18 prep _ _
20 multiple _ JJ JJ _ 22 amod _ _
21 cancer _ NN NN _ 22 nn _ _
22 indications _ NNS NNS _ 19 pobj _ _
23 , _ , , _ 22 punct _ _
24 combinations _ NNS NNS _ 22 conj _ _
25 and _ CC CC _ 22 cc _ _
26 formulations _ NNS NNS _ 22 conj _ _
27 . _ . . _ 3 punct _ _
1 Only _ RB RB _ 4 advmod _ _
2 you _ PRP PRP _ 4 nsubj _ _
3 can _ MD MD _ 4 aux _ _
4 decide _ VB VB _ 0 ROOT _ _
5 what _ WP WP _ 7 nsubj _ _
6 's _ VBZ VBZ _ 7 cop _ _
7 important _ JJ JJ _ 4 ccomp _ _
8 . _ . . _ 4 punct _ _
1 Lt. _ NNP NNP _ 4 nn _ _
2 Col. _ NNP NNP _ 4 nn _ _
3 David _ NNP NNP _ 4 nn _ _
4 Accetta _ NNP NNP _ 14 nsubj _ _
5 , _ , , _ 4 punct _ _
6 the _ DT DT _ 10 det _ _
7 top _ JJ JJ _ 10 amod _ _
8 U.S. _ NNP NNP _ 10 nn _ _
9 military _ JJ JJ _ 10 amod _ _
10 spokesman _ NN NN _ 4 appos _ _
11 in _ IN IN _ 10 prep _ _
12 Afghanistan _ NNP NNP _ 11 pobj _ _
13 , _ , , _ 4 punct _ _
14 said _ VBD VBD _ 0 ROOT _ _
15 he _ PRP PRP _ 18 nsubj _ _
16 could _ MD MD _ 18 aux _ _
17 not _ RB RB _ 18 neg _ _
18 confirm _ VB VB _ 14 ccomp _ _
19 the _ DT DT _ 20 det _ _
20 report _ NN NN _ 18 dobj _ _
21 . _ . . _ 14 punct _ _
1 The _ DT DT _ 3 det _ _
2 four _ CD CD _ 3 num _ _
3 teams _ NNS NNS _ 14 nsubj _ _
4 that _ WDT WDT _ 6 nsubj _ _
5 will _ MD MD _ 6 aux _ _
6 play _ VB VB _ 3 rcmod _ _
7 in _ IN IN _ 6 prep _ _
8 the _ DT DT _ 9 det _ _
9 women _ NNS NNS _ 11 poss _ _
10 's _ POS POS _ 9 possessive _ _
11 tournament _ NN NN _ 7 pobj _ _
12 are _ VBP VBP _ 14 cop _ _
13 Alaska _ NNP NNP _ 14 nn _ _
14 Anchorage _ NNP NNP _ 0 ROOT _ _
15 , _ , , _ 14 punct _ _
16 Cincinnati _ NNP NNP _ 14 conj _ _
17 , _ , , _ 14 punct _ _
18 Coastal _ NNP NNP _ 19 nn _ _
19 Carolina _ NNP NNP _ 14 conj _ _
20 and _ CC CC _ 14 cc _ _
21 Western _ NNP NNP _ 22 nn _ _
22 Carolina _ NNP NNP _ 14 conj _ _
23 . _ . . _ 14 punct _ _
1 Speaking _ VBG VBG _ 8 partmod _ _
2 to _ TO TO _ 1 prep _ _
3 reporters _ NNS NNS _ 2 pobj _ _
4 , _ , , _ 8 punct _ _
5 she _ PRP PRP _ 8 nsubj _ _
6 did _ VBD VBD _ 8 aux _ _
7 not _ RB RB _ 8 neg _ _
8 repeat _ VB VB _ 0 ROOT _ _
9 her _ PRP$ PRP$ _ 10 poss _ _
10 demand _ NN NN _ 8 dobj _ _
11 that _ IN IN _ 17 mark _ _
12 a _ DT DT _ 15 det _ _
13 new _ JJ JJ _ 15 amod _ _
14 government-run _ JJ JJ _ 15 amod _ _
15 plan _ NN NN _ 17 nsubj _ _
16 be _ VB VB _ 17 cop _ _
17 part _ NN NN _ 10 ccomp _ _
18 of _ IN IN _ 17 prep _ _
19 the _ DT DT _ 21 det _ _
20 final _ JJ JJ _ 21 amod _ _
21 legislation _ NN NN _ 18 pobj _ _
22 . _ . . _ 8 punct _ _
1 ' _ '' '' _ 10 punct _ _
2 But _ CC CC _ 10 cc _ _
3 with _ IN IN _ 10 prep _ _
4 the _ DT DT _ 5 det _ _
5 help _ NN NN _ 3 pobj _ _
6 of _ IN IN _ 5 prep _ _
7 English _ NNP NNP _ 8 nn _ _
8 Heritage _ NNP NNP _ 6 pobj _ _
9 we _ PRP PRP _ 10 nsubj _ _
10 restored _ VBD VBD _ 0 ROOT _ _
11 them _ PRP PRP _ 10 dobj _ _
12 . _ . . _ 10 punct _ _
13 ' _ '' '' _ 10 punct _ _
1 Mr _ NNP NNP _ 2 nn _ _
2 Oubridge _ NNP NNP _ 3 nsubj _ _
3 said _ VBD VBD _ 0 ROOT _ _
4 when _ WRB WRB _ 8 advmod _ _
5 the _ DT DT _ 7 det _ _
6 festival _ NN NN _ 7 nn _ _
7 team _ NN NN _ 8 nsubj _ _
8 met _ VBD VBD _ 20 advcl _ _
9 council _ NN NN _ 10 nn _ _
10 officials _ NNS NNS _ 8 dobj _ _
11 and _ CC CC _ 10 cc _ _
12 the _ DT DT _ 13 det _ _
13 police _ NN NN _ 10 conj _ _
14 on _ IN IN _ 8 prep _ _
15 Thursday _ NNP NNP _ 14 pobj _ _
16 there _ EX EX _ 20 expl _ _
17 had _ VBD VBD _ 20 aux _ _
18 been _ VBN VBN _ 20 cop _ _
19 no _ DT DT _ 20 det _ _
20 mention _ NN NN _ 3 ccomp _ _
21 of _ IN IN _ 20 prep _ _
22 a _ DT DT _ 24 det _ _
23 potential _ JJ JJ _ 24 amod _ _
24 injunction _ NN NN _ 21 pobj _ _
25 . _ . . _ 3 punct _ _
1 A _ DT DT _ 2 det _ _
2 number _ NN NN _ 6 nsubj _ _
3 of _ IN IN _ 2 prep _ _
4 ministers _ NNS NNS _ 3 pobj _ _
5 have _ VBP VBP _ 6 aux _ _
6 left _ VBN VBN _ 0 ROOT _ _
7 the _ DT DT _ 8 det _ _
8 government _ NN NN _ 9 nsubj _ _
9 facing _ VBG VBG _ 6 dep _ _
10 questions _ NNS NNS _ 9 dobj _ _
11 over _ IN IN _ 10 prep _ _
12 their _ PRP$ PRP$ _ 13 poss _ _
13 expenses _ NNS NNS _ 11 pobj _ _
14 , _ , , _ 13 punct _ _
15 including _ VBG VBG _ 13 prep _ _
16 Hazel _ NNP NNP _ 17 nn _ _
17 Blears _ NNP NNP _ 15 pobj _ _
18 , _ , , _ 17 punct _ _
19 the _ DT DT _ 22 det _ _
20 former _ JJ JJ _ 22 amod _ _
21 communities _ NNS NNS _ 22 nn _ _
22 secretary _ NN NN _ 17 appos _ _
23 ; _ : : _ 17 punct _ _
24 Jacqui _ NNP NNP _ 25 nn _ _
25 Smith _ NNP NNP _ 17 conj _ _
26 , _ , , _ 25 punct _ _
27 the _ DT DT _ 30 det _ _
28 former _ JJ JJ _ 30 amod _ _
29 home _ NN NN _ 30 nn _ _
30 secretary _ NN NN _ 25 appos _ _
31 ; _ : : _ 17 punct _ _
32 and _ CC CC _ 17 cc _ _
33 Tony _ NNP NNP _ 34 nn _ _
34 McNulty _ NNP NNP _ 17 conj _ _
35 , _ , , _ 34 punct _ _
36 the _ DT DT _ 39 det _ _
37 former _ JJ JJ _ 39 amod _ _
38 employment _ NN NN _ 39 nn _ _
39 minister _ NN NN _ 34 appos _ _
40 . _ . . _ 6 punct _ _
1 An _ DT DT _ 4 det _ _
2 enticingly _ RB RB _ 3 advmod _ _
3 big _ JJ JJ _ 4 amod _ _
4 button _ NN NN _ 10 nsubj _ _
5 that _ WDT WDT _ 6 nsubj _ _
6 looked _ VBD VBD _ 4 rcmod _ _
7 like _ IN IN _ 6 prep _ _
8 a _ DT DT _ 9 det _ _
9 latch _ NN NN _ 7 pobj _ _
10 turned _ VBD VBD _ 0 ROOT _ _
11 out _ RP RP _ 10 prt _ _
12 to _ TO TO _ 15 aux _ _
13 be _ VB VB _ 15 cop _ _
14 a _ DT DT _ 15 det _ _
15 hinge _ NN NN _ 10 xcomp _ _
16 . _ . . _ 10 punct _ _
1 After _ IN IN _ 8 prep _ _
2 an _ DT DT _ 5 det _ _
3 oustanding _ JJ JJ _ 5 amod _ _
4 opening _ NN NN _ 5 nn _ _
5 round _ NN NN _ 1 pobj _ _
6 , _ , , _ 8 punct _ _
7 Garcia _ NNP NNP _ 8 nsubj _ _
8 found _ VBD VBD _ 0 ROOT _ _
9 himself _ PRP PRP _ 10 nsubj _ _
10 tied _ VBD VBD _ 8 ccomp _ _
11 with _ IN IN _ 10 prep _ _
12 the _ DT DT _ 14 det _ _
13 50-year-old _ JJ JJ _ 14 amod _ _
14 Langer _ NNP NNP _ 11 pobj _ _
15 , _ , , _ 14 punct _ _
16 who _ WP WP _ 17 nsubj _ _
17 fired _ VBD VBD _ 14 rcmod _ _
18 a _ DT DT _ 20 det _ _
19 five-under _ JJ JJ _ 20 amod _ _
20 67 _ NN NN _ 17 dobj _ _
21 following _ VBG VBG _ 17 prep _ _
22 his _ PRP$ PRP$ _ 24 poss _ _
23 first-round _ JJ JJ _ 24 amod _ _
24 72 _ CD CD _ 21 pobj _ _
25 . _ . . _ 8 punct _ _
1 We _ PRP PRP _ 2 nsubj _ _
2 made _ VBD VBD _ 0 ROOT _ _
3 mistakes _ NNS NNS _ 2 dobj _ _
4 in _ IN IN _ 2 prep _ _
5 those _ DT DT _ 6 det _ _
6 games _ NNS NNS _ 4 pobj _ _
7 in _ IN IN _ 2 prep _ _
8 the _ DT DT _ 10 det _ _
9 last _ JJ JJ _ 10 amod _ _
10 minute _ NN NN _ 7 pobj _ _
11 , _ , , _ 2 punct _ _
12 so _ IN IN _ 16 mark _ _
13 it _ PRP PRP _ 16 nsubj _ _
14 's _ VBZ VBZ _ 16 cop _ _
15 our _ PRP$ PRP$ _ 16 poss _ _
16 fault _ NN NN _ 2 advcl _ _
17 in _ IN IN _ 16 prep _ _
18 the _ DT DT _ 19 det _ _
19 end _ NN NN _ 17 pobj _ _
20 . _ . . _ 2 punct _ _
1 This _ DT DT _ 3 det _ _
2 latest _ JJS JJS _ 3 amod _ _
3 incident _ NN NN _ 7 nsubj _ _
4 is _ VBZ VBZ _ 7 cop _ _
5 the _ DT DT _ 7 det _ _
6 second _ JJ JJ _ 7 amod _ _
7 time _ NN NN _ 0 ROOT _ _
8 in _ IN IN _ 7 prep _ _
9 four _ CD CD _ 10 num _ _
10 weeks _ NNS NNS _ 8 pobj _ _
11 the _ DT DT _ 12 det _ _
12 Revenue _ NN NN _ 14 nsubj _ _
13 has _ VBZ VBZ _ 14 aux _ _
14 admitted _ VBN VBN _ 7 rcmod _ _
15 losing _ VBG VBG _ 14 xcomp _ _
16 taxpayers _ NNS NNS _ 18 poss _ _
17 ' _ POS POS _ 16 possessive _ _
18 details _ NNS NNS _ 15 dobj _ _
19 . _ . . _ 7 punct _ _
1 NebuAd _ NNP NNP _ 2 nsubj _ _
2 confirmed _ VBD VBD _ 0 ROOT _ _
3 Friday _ NNP NNP _ 2 tmod _ _
4 that _ IN IN _ 7 mark _ _
5 it _ PRP PRP _ 7 nsubj _ _
6 is _ VBZ VBZ _ 7 aux _ _
7 partnering _ VBG VBG _ 2 ccomp _ _
8 with _ IN IN _ 7 prep _ _
9 Charter _ NNP NNP _ 8 pobj _ _
10 but _ CC CC _ 2 cc _ _
11 declined _ VBD VBD _ 2 conj _ _
12 further _ JJ JJ _ 13 amod _ _
13 comment _ NN NN _ 11 dobj _ _
14 . _ . . _ 2 punct _ _
1 Needless _ JJ JJ _ 6 ccomp _ _
2 to _ TO TO _ 3 aux _ _
3 say _ VB VB _ 1 xcomp _ _
4 , _ , , _ 6 punct _ _
5 it _ PRP PRP _ 6 nsubj _ _
6 wasn _ VBP VBP _ 0 ROOT _ _
7 't _ NN NN _ 6 dobj _ _
8 long _ RB RB _ 11 advmod _ _
9 before _ IN IN _ 11 mark _ _
10 he _ PRP PRP _ 11 nsubj _ _
11 sat _ VBD VBD _ 6 advcl _ _
12 down _ RP RP _ 11 prt _ _
13 . _ . . _ 6 punct _ _
1 For _ IN IN _ 18 prep _ _
2 Judy _ NNP NNP _ 3 nn _ _
3 John-Baptiste _ NNP NNP _ 1 pobj _ _
4 , _ , , _ 3 punct _ _
5 who _ WP WP _ 6 nsubj _ _
6 runs _ VBZ VBZ _ 3 rcmod _ _
7 the _ DT DT _ 10 det _ _
8 Basement _ NNP NNP _ 10 nn _ _
9 Dance _ NNP NNP _ 10 nn _ _
10 Studio _ NNP NNP _ 6 dobj _ _
11 in _ IN IN _ 10 prep _ _
12 London _ NNP NNP _ 11 pobj _ _
13 , _ , , _ 18 punct _ _
14 ballet _ NN NN _ 18 nsubj _ _
15 is _ VBZ VBZ _ 18 cop _ _
16 the _ DT DT _ 18 det _ _
17 most _ RBS RBS _ 18 advmod _ _
18 popular _ JJ JJ _ 0 ROOT _ _
19 of _ IN IN _ 18 prep _ _
20 all _ PDT PDT _ 22 predet _ _
21 the _ DT DT _ 22 det _ _
22 classes _ NNS NNS _ 19 pobj _ _
23 she _ PRP PRP _ 24 nsubj _ _
24 offers _ VBZ VBZ _ 22 rcmod _ _
25 . _ . . _ 18 punct _ _
1 Russ _ NNP NNP _ 2 nn _ _
2 Dixon _ NNP NNP _ 7 nsubj _ _
3 , _ , , _ 2 punct _ _
4 an _ DT DT _ 5 det _ _
5 infielder _ NN NN _ 2 appos _ _
6 , _ , , _ 2 punct _ _
7 homered _ VBD VBD _ 0 ROOT _ _
8 to _ TO TO _ 7 prep _ _
9 right _ NN NN _ 8 pobj _ _
10 , _ , , _ 7 punct _ _
11 then _ RB RB _ 7 advmod _ _
12 sheepishly _ RB RB _ 13 advmod _ _
13 put _ VBD VBD _ 7 dep _ _
14 his _ PRP$ PRP$ _ 15 poss _ _
15 head _ NN NN _ 13 dobj _ _
16 down _ RP RP _ 13 prt _ _
17 to _ TO TO _ 18 aux _ _
18 avoid _ VB VB _ 13 xcomp _ _
19 eye _ NN NN _ 20 nn _ _
20 contact _ NN NN _ 18 dobj _ _
21 with _ IN IN _ 20 prep _ _
22 the _ DT DT _ 23 det _ _
23 pitcher _ NN NN _ 21 pobj _ _
24 . _ . . _ 7 punct _ _
1 Mr. _ NNP NNP _ 2 nn _ _
2 Gore _ NNP NNP _ 3 nsubj _ _
3 was _ VBD VBD _ 0 ROOT _ _
4 not _ RB RB _ 3 neg _ _
5 here _ RB RB _ 3 advmod _ _
6 , _ , , _ 3 punct _ _
7 but _ CC CC _ 3 cc _ _
8 his _ PRP$ PRP$ _ 9 poss _ _
9 name _ NN NN _ 10 nsubj _ _
10 came _ VBD VBD _ 3 conj _ _
11 up _ RP RP _ 10 prt _ _
12 frequently _ RB RB _ 10 advmod _ _
13 . _ . . _ 3 punct _ _
1 The _ DT DT _ 2 det _ _
2 lawsuit _ NN NN _ 4 nsubj _ _
3 also _ RB RB _ 4 advmod _ _
4 names _ VBD VBD _ 0 ROOT _ _
5 the _ DT DT _ 7 det _ _
6 shopping _ NN NN _ 7 nn _ _
7 mall _ NN NN _ 4 dobj _ _
8 where _ WRB WRB _ 11 advmod _ _
9 the _ DT DT _ 10 det _ _
10 incident _ NN NN _ 11 nsubj _ _
11 occurred _ VBD VBD _ 7 rcmod _ _
12 and _ CC CC _ 7 cc _ _
13 the _ DT DT _ 15 det _ _
14 security _ NN NN _ 15 nn _ _
15 company _ NN NN _ 7 conj _ _
16 employed _ VBN VBN _ 15 partmod _ _
17 by _ IN IN _ 16 prep _ _
18 Wal-Mart _ NNP NNP _ 17 pobj _ _
19 . _ . . _ 4 punct _ _
1 Rudy _ NNP NNP _ 2 nn _ _
2 Crutchfield _ NNP NNP _ 9 nsubj _ _
3 and _ CC CC _ 2 cc _ _
4 Steve _ NNP NNP _ 5 nn _ _
5 Hadeed _ NNP NNP _ 2 conj _ _
6 have _ VBP VBP _ 9 aux _ _
7 been _ VBN VBN _ 9 cop _ _
8 close _ JJ JJ _ 9 amod _ _
9 friends _ NNS NNS _ 0 ROOT _ _
10 since _ IN IN _ 9 prep _ _
11 their _ PRP$ PRP$ _ 12 poss _ _
12 days _ NNS NNS _ 10 pobj _ _
13 at _ IN IN _ 12 prep _ _
14 Wheaton _ NNP NNP _ 16 nn _ _
15 High _ NNP NNP _ 16 nn _ _
16 School _ NNP NNP _ 13 pobj _ _
17 . _ . . _ 9 punct _ _
1 Earlier _ RBR RBR _ 3 advmod _ _
2 this _ DT DT _ 3 det _ _
3 month _ NN NN _ 6 tmod _ _
4 , _ , , _ 6 punct _ _
5 GM _ NNP NNP _ 6 nsubj _ _
6 announced _ VBD VBD _ 0 ROOT _ _
7 plans _ NNS NNS _ 6 dobj _ _
8 to _ TO TO _ 9 aux _ _
9 sell _ VB VB _ 7 infmod _ _
10 Hummer _ NNP NNP _ 9 dobj _ _
11 to _ TO TO _ 9 prep _ _
12 a _ DT DT _ 14 det _ _
13 Chinese _ JJ JJ _ 14 amod _ _
14 manufacturer _ NN NN _ 11 pobj _ _
15 and _ CC CC _ 14 cc _ _
16 Saturn _ NNP NNP _ 14 conj _ _
17 to _ TO TO _ 9 prep _ _
18 Michigan-based _ JJ JJ _ 24 amod _ _
19 dealership _ NN NN _ 24 nn _ _
20 chain _ NN NN _ 24 nn _ _
21 Penske _ NNP NNP _ 24 nn _ _
22 Automotive _ NNP NNP _ 24 nn _ _
23 Group _ NNP NNP _ 24 nn _ _
24 Inc _ NNP NNP _ 17 pobj _ _
25 . _ . . _ 6 punct _ _
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <memory>
#include <string>
#include <vector>
#include "syntaxnet/document_format.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/utils.h"
#include "tensorflow/core/lib/io/inputbuffer.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/regexp.h"
namespace syntaxnet {
// CoNLL document format reader for dependency annotated corpora.
// The expected format is described e.g. at http://ilk.uvt.nl/conll/#dataformat
//
// Data should adhere to the following rules:
// - Data files contain sentences separated by a blank line.
// - A sentence consists of one or more tokens, each one starting on a new line.
// - A token consists of ten fields described in the table below.
// - Fields are separated by a single tab character.
// - All data files contain these ten fields, although only the ID
//   column is required to contain non-dummy (i.e., non-underscore) values.
// Data files should be UTF-8 encoded (Unicode).
//
// Fields:
// 1 ID: Token counter, starting at 1 for each new sentence and increasing
// by 1 for every new token.
// 2 FORM: Word form or punctuation symbol.
// 3 LEMMA: Lemma or stem.
// 4 CPOSTAG: Coarse-grained part-of-speech tag or category.
// 5 POSTAG: Fine-grained part-of-speech tag. Note that the same POS tag
// cannot appear with multiple coarse-grained POS tags.
// 6 FEATS: Unordered set of syntactic and/or morphological features.
// 7 HEAD: Head of the current token, which is either a value of ID or '0'.
// 8 DEPREL: Dependency relation to the HEAD.
// 9 PHEAD: Projective head of current token.
// 10 PDEPREL: Dependency relation to the PHEAD.
//
// This CoNLL reader is compatible with the CoNLL-U format described at
// http://universaldependencies.org/format.html
// Note that this reader skips CoNLL-U multiword tokens and ignores the last two
// fields of every line, which are PHEAD and PDEPREL in CoNLL format, but are
// replaced by DEPS and MISC in CoNLL-U.
//
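// For example, the CoNLL-U line
//   5  do  _  VB  VB  _  2  ccomp  _  _
// (drawn from the test data later in this change) becomes a Token with word
// "do", category "VB" (CPOSTAG), tag "VB" (POSTAG), head 1 (HEAD is 1-based
// while Token.head is 0-based), and label "ccomp".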
class CoNLLSyntaxFormat : public DocumentFormat {
public:
CoNLLSyntaxFormat() {}
// Reads up to the first empty line and returns false if the end of file is
// reached before any content was read.
bool ReadRecord(tensorflow::io::InputBuffer *buffer,
string *record) override {
string line;
record->clear();
tensorflow::Status status = buffer->ReadLine(&line);
while (!line.empty() && status.ok()) {
tensorflow::strings::StrAppend(record, line, "\n");
status = buffer->ReadLine(&line);
}
return status.ok() || !record->empty();
}
void ConvertFromString(const string &key, const string &value,
vector<Sentence *> *sentences) override {
// Create new sentence.
Sentence *sentence = new Sentence();
// Each line corresponds to one token.
string text;
vector<string> lines = utils::Split(value, '\n');
// Add each token to the sentence.
vector<string> fields;
int expected_id = 1;
for (size_t i = 0; i < lines.size(); ++i) {
// Split line into tab-separated fields.
fields.clear();
fields = utils::Split(lines[i], '\t');
if (fields.size() == 0) continue;
// Skip comment lines.
if (fields[0][0] == '#') continue;
// Skip CoNLL-U lines for multiword tokens, which are indicated by
// hyphenated ID ranges, e.g., "2-4".
// http://universaldependencies.github.io/docs/format.html
if (RE2::FullMatch(fields[0], "[0-9]+-[0-9]+")) continue;
// Clear all optional fields equal to '_'.
for (size_t j = 2; j < fields.size(); ++j) {
if (fields[j].length() == 1 && fields[j][0] == '_') fields[j].clear();
}
// Check that the line is valid.
CHECK_GE(fields.size(), 8)
    << "Every line must have at least 8 tab-separated fields.";
// Check that the ids follow the expected format.
const int id = utils::ParseUsing<int>(fields[0], 0, utils::ParseInt32);
CHECK_EQ(expected_id++, id)
<< "Token ids start at 1 for each new sentence and increase by 1 "
<< "on each new token. Sentences are separated by an empty line.";
// Get relevant fields.
const string &word = fields[1];
const string &cpostag = fields[3];
const string &tag = fields[4];
const int head = utils::ParseUsing<int>(fields[6], 0, utils::ParseInt32);
const string &label = fields[7];
// Add token to sentence text.
if (!text.empty()) text.append(" ");
const int start = text.size();
const int end = start + word.size() - 1;
text.append(word);
// Add token to sentence.
Token *token = sentence->add_token();
token->set_word(word);
token->set_start(start);
token->set_end(end);
if (head > 0) token->set_head(head - 1);
if (!tag.empty()) token->set_tag(tag);
if (!cpostag.empty()) token->set_category(cpostag);
if (!label.empty()) token->set_label(label);
}
if (sentence->token_size() > 0) {
sentence->set_docid(key);
sentence->set_text(text);
sentences->push_back(sentence);
} else {
// If the sentence was empty (e.g., blank lines at the beginning of a
// file), then don't save it.
delete sentence;
}
}
// Converts a sentence to a key/value pair.
void ConvertToString(const Sentence &sentence, string *key,
string *value) override {
*key = sentence.docid();
vector<string> lines;
for (int i = 0; i < sentence.token_size(); ++i) {
vector<string> fields(10);
fields[0] = tensorflow::strings::Printf("%d", i + 1);
fields[1] = sentence.token(i).word();
fields[2] = "_";
fields[3] = sentence.token(i).category();
fields[4] = sentence.token(i).tag();
fields[5] = "_";
fields[6] =
tensorflow::strings::Printf("%d", sentence.token(i).head() + 1);
fields[7] = sentence.token(i).label();
fields[8] = "_";
fields[9] = "_";
lines.push_back(utils::Join(fields, "\t"));
}
*value = tensorflow::strings::StrCat(utils::Join(lines, "\n"), "\n\n");
}
private:
TF_DISALLOW_COPY_AND_ASSIGN(CoNLLSyntaxFormat);
};
REGISTER_DOCUMENT_FORMAT("conll-sentence", CoNLLSyntaxFormat);
// Reader for tokenized text. This reader expects every sentence to be on a
// single line and tokens on that line to be separated by single spaces.
//
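// For example, the line "We made mistakes ." yields four tokens with byte
// offsets [0,1], [3,6], [8,15], and [17,17] into the sentence text.
// ConvertToString below writes tokens back out space-separated, appending
// "_<tag>" and "_<head>" suffixes when those fields are set.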
class TokenizedTextFormat : public DocumentFormat {
public:
TokenizedTextFormat() {}
// Reads a line and returns false if end of file is reached.
bool ReadRecord(tensorflow::io::InputBuffer *buffer,
string *record) override {
return buffer->ReadLine(record).ok();
}
void ConvertFromString(const string &key, const string &value,
vector<Sentence *> *sentences) override {
Sentence *sentence = new Sentence();
string text;
for (const string &word : utils::Split(value, ' ')) {
if (word.empty()) continue;
// Append the separator first so start/end index into the final text.
if (!text.empty()) text.append(" ");
const int start = text.size();
const int end = start + word.size() - 1;
text.append(word);
Token *token = sentence->add_token();
token->set_word(word);
token->set_start(start);
token->set_end(end);
}
if (sentence->token_size() > 0) {
sentence->set_docid(key);
sentence->set_text(text);
sentences->push_back(sentence);
} else {
// If the sentence was empty (e.g., blank lines at the beginning of a
// file), then don't save it.
delete sentence;
}
}
void ConvertToString(const Sentence &sentence, string *key,
string *value) override {
*key = sentence.docid();
value->clear();
for (const Token &token : sentence.token()) {
if (!value->empty()) value->append(" ");
value->append(token.word());
if (token.has_tag()) {
value->append("_");
value->append(token.tag());
}
if (token.has_head()) {
value->append("_");
value->append(tensorflow::strings::StrCat(token.head()));
}
}
value->append("\n");
}
private:
TF_DISALLOW_COPY_AND_ASSIGN(TokenizedTextFormat);
};
REGISTER_DOCUMENT_FORMAT("tokenized-text", TokenizedTextFormat);
// Text reader that attempts to perform Penn Treebank tokenization on arbitrary
// raw text. Adapted from https://www.cis.upenn.edu/~treebank/tokenizer.sed
// by Robert MacIntyre, University of Pennsylvania, late 1995.
// Expected input: raw text with one sentence per line.
//
class EnglishTextFormat : public TokenizedTextFormat {
public:
EnglishTextFormat() {}
void ConvertFromString(const string &key, const string &value,
vector<Sentence *> *sentences) override {
vector<pair<string, string>> preproc_rules = {
// Punctuation.
{"’", "'"},
{"…", "..."},
{"---", "--"},
{"—", "--"},
{"–", "--"},
{",", ","},
{"。", "."},
{"!", "!"},
{"?", "?"},
{":", ":"},
{";", ";"},
{"&", "&"},
// Brackets.
{"\\[", "("},
{"]", ")"},
{"{", "("},
{"}", ")"},
{"【", "("},
{"】", ")"},
{"(", "("},
{")", ")"},
// Quotation marks.
{"\"", "\""},
{"″", "\""},
{"“", "\""},
{"„", "\""},
{"‵‵", "\""},
{"”", "\""},
{"’", "\""},
{"‘", "\""},
{"′′", "\""},
{"‹", "\""},
{"›", "\""},
{"«", "\""},
{"»", "\""},
// Discarded punctuation that breaks sentences.
{"|", ""},
{"·", ""},
{"•", ""},
{"●", ""},
{"▪", ""},
{"■", ""},
{"□", ""},
{"❑", ""},
{"◆", ""},
{"★", ""},
{"*", ""},
{"♦", ""},
};
vector<pair<string, string>> rules = {
// attempt to get correct directional quotes
{R"re(^")re", "`` "},
{R"re(([ \([{<])")re", "\\1 `` "},
// close quotes handled at end
{R"re(\.\.\.)re", " ... "},
{"[,;:@#$%&]", " \\0 "},
// Assume sentence tokenization has been done first, so split FINAL
// periods only.
{R"re(([^.])(\.)([\]\)}>"']*)[ ]*$)re", "\\1 \\2\\3 "},
// however, we may as well split ALL question marks and exclamation
// points, since they shouldn't have the abbrev.-marker ambiguity
// problem
{"[?!]", " \\0 "},
// parentheses, brackets, etc.
{R"re([\]\[\(\){}<>])re", " \\0 "},
// Like Adwait Ratnaparkhi's MXPOST, we use the parsed-file version of
// these symbols.
{"\\(", "-LRB-"},
{"\\)", "-RRB-"},
{"\\]", "-LSB-"},
{"\\]", "-RSB-"},
{"{", "-LCB-"},
{"}", "-RCB-"},
{"--", " -- "},
// First off, add a space to the beginning and end of each line, to
// reduce necessary number of regexps.
{"$", " "},
{"^", " "},
{"\"", " '' "},
// possessive or close-single-quote
{"([^'])' ", "\\1 ' "},
// as in it's, I'm, we'd
{"'([sSmMdD]) ", " '\\1 "},
{"'ll ", " 'll "},
{"'re ", " 're "},
{"'ve ", " 've "},
{"n't ", " n't "},
{"'LL ", " 'LL "},
{"'RE ", " 'RE "},
{"'VE ", " 'VE "},
{"N'T ", " N'T "},
{" ([Cc])annot ", " \\1an not "},
{" ([Dd])'ye ", " \\1' ye "},
{" ([Gg])imme ", " \\1im me "},
{" ([Gg])onna ", " \\1on na "},
{" ([Gg])otta ", " \\1ot ta "},
{" ([Ll])emme ", " \\1em me "},
{" ([Mm])ore'n ", " \\1ore 'n "},
{" '([Tt])is ", " '\\1 is "},
{" '([Tt])was ", " '\\1 was "},
{" ([Ww])anna ", " \\1an na "},
{" ([Ww])haddya ", " \\1ha dd ya "},
{" ([Ww])hatcha ", " \\1ha t cha "},
// clean out extra spaces
{" *", " "},
{"^ *", ""},
};
string rewritten = value;
for (const pair<string, string> &rule : preproc_rules) {
RE2::GlobalReplace(&rewritten, rule.first, rule.second);
}
for (const pair<string, string> &rule : rules) {
RE2::GlobalReplace(&rewritten, rule.first, rule.second);
}
TokenizedTextFormat::ConvertFromString(key, rewritten, sentences);
}
private:
TF_DISALLOW_COPY_AND_ASSIGN(EnglishTextFormat);
};
REGISTER_DOCUMENT_FORMAT("english-text", EnglishTextFormat);
} // namespace syntaxnet
# coding=utf-8
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for english_tokenizer."""
# pylint: disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
import os.path
import tensorflow as tf
import syntaxnet.load_parser_ops
from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest
from tensorflow.python.platform import logging
from syntaxnet import sentence_pb2
from syntaxnet import task_spec_pb2
from syntaxnet.ops import gen_parser_ops
FLAGS = tf.app.flags.FLAGS
class TextFormatsTest(test_util.TensorFlowTestCase):
def setUp(self):
if not hasattr(FLAGS, 'test_srcdir'):
FLAGS.test_srcdir = ''
if not hasattr(FLAGS, 'test_tmpdir'):
FLAGS.test_tmpdir = tf.test.get_temp_dir()
self.corpus_file = os.path.join(FLAGS.test_tmpdir, 'documents.conll')
self.context_file = os.path.join(FLAGS.test_tmpdir, 'context.pbtxt')
def AddInput(self, name, file_pattern, record_format, context):
inp = context.input.add()
inp.name = name
inp.record_format.append(record_format)
inp.part.add().file_pattern = file_pattern
def WriteContext(self, corpus_format):
context = task_spec_pb2.TaskSpec()
self.AddInput('documents', self.corpus_file, corpus_format, context)
for name in ('word-map', 'lcword-map', 'tag-map',
'category-map', 'label-map', 'prefix-table',
'suffix-table', 'tag-to-category'):
self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
logging.info('Writing context to: %s', self.context_file)
with open(self.context_file, 'w') as f:
f.write(str(context))
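# The written file is a TaskSpec in protobuf text format; illustratively (the
# exact paths depend on test_tmpdir), it contains entries like:
#   input {
#     name: "documents"
#     record_format: "english-text"
#     part { file_pattern: "/tmp/.../documents.conll" }
#   }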
def ReadNextDocument(self, sess, sentence):
sentence_str, = sess.run([sentence])
if sentence_str:
sentence_doc = sentence_pb2.Sentence()
sentence_doc.ParseFromString(sentence_str[0])
else:
sentence_doc = None
return sentence_doc
def CheckTokenization(self, sentence, tokenization):
self.WriteContext('english-text')
logging.info('Writing text file to: %s', self.corpus_file)
with open(self.corpus_file, 'w') as f:
f.write(sentence)
sentence, _ = gen_parser_ops.document_source(
self.context_file, batch_size=1)
with self.test_session() as sess:
sentence_doc = self.ReadNextDocument(sess, sentence)
self.assertEqual(' '.join([t.word for t in sentence_doc.token]),
tokenization)
def testSimple(self):
self.CheckTokenization('Hello, world!', 'Hello , world !')
self.CheckTokenization('"Hello"', "`` Hello ''")
self.CheckTokenization('{"Hello@#$', '-LRB- `` Hello @ # $')
self.CheckTokenization('"Hello..."', "`` Hello ... ''")
self.CheckTokenization('()[]{}<>',
'-LRB- -RRB- -LRB- -RRB- -LRB- -RRB- < >')
self.CheckTokenization('Hello--world', 'Hello -- world')
self.CheckTokenization("Isn't", "Is n't")
self.CheckTokenization("n't", "n't")
self.CheckTokenization('Hello Mr. Smith.', 'Hello Mr. Smith .')
self.CheckTokenization("It's Mr. Smith's.", "It 's Mr. Smith 's .")
self.CheckTokenization("It's the Smiths'.", "It 's the Smiths ' .")
self.CheckTokenization('Gotta go', 'Got ta go')
self.CheckTokenization('50-year-old', '50-year-old')
def testUrl(self):
self.CheckTokenization('http://www.google.com/news is down',
'http : //www.google.com/news is down')
if __name__ == '__main__':
googletest.main()
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#define EIGEN_USE_THREADS
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "syntaxnet/utils.h"
#include "syntaxnet/sparse.pb.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
using tensorflow::DEVICE_CPU;
using tensorflow::DT_FLOAT;
using tensorflow::DT_INT32;
using tensorflow::DT_INT64;
using tensorflow::DT_STRING;
using tensorflow::OpKernel;
using tensorflow::OpKernelConstruction;
using tensorflow::OpKernelContext;
using tensorflow::Tensor;
using tensorflow::TensorShape;
using tensorflow::errors::InvalidArgument;
namespace syntaxnet {
// Operator to unpack ids and weights stored in SparseFeatures proto.
class UnpackSparseFeatures : public OpKernel {
public:
explicit UnpackSparseFeatures(OpKernelConstruction *context)
: OpKernel(context) {
OP_REQUIRES_OK(context, context->MatchSignature(
{DT_STRING}, {DT_INT32, DT_INT64, DT_FLOAT}));
}
void Compute(OpKernelContext *context) override {
const Tensor &input = context->input(0);
OP_REQUIRES(context, IsLegacyVector(input.shape()),
InvalidArgument("input should be a vector."));
const int64 n = input.NumElements();
const auto input_vec = input.flat<string>();
SparseFeatures sf;
int output_size = 0;
std::vector<std::pair<int64, float> > id_and_weight;
// Guess that we'll be averaging a handful of ids per SparseFeatures record.
id_and_weight.reserve(n * 4);
std::vector<int> num_ids(n);
for (int64 i = 0; i < n; ++i) {
OP_REQUIRES(context, sf.ParseFromString(input_vec(i)),
InvalidArgument("Couldn't parse as SparseFeature"));
OP_REQUIRES(context,
sf.weight_size() == 0 || sf.weight_size() == sf.id_size(),
                InvalidArgument(tensorflow::strings::StrCat(
                    "Incorrect number of weights: ", sf.DebugString())));
int n_ids = sf.id_size();
num_ids[i] = n_ids;
output_size += n_ids;
for (int j = 0; j < n_ids; j++) {
float w = (sf.weight_size() > 0) ? sf.weight(j) : 1.0f;
id_and_weight.push_back(std::make_pair(sf.id(j), w));
}
}
Tensor *indices_t;
OP_REQUIRES_OK(context, context->allocate_output(
0, TensorShape({output_size}), &indices_t));
Tensor *ids_t;
OP_REQUIRES_OK(context, context->allocate_output(
1, TensorShape({output_size}), &ids_t));
Tensor *weights_t;
OP_REQUIRES_OK(context, context->allocate_output(
2, TensorShape({output_size}), &weights_t));
auto indices = indices_t->vec<int32>();
auto ids = ids_t->vec<int64>();
auto weights = weights_t->vec<float>();
int c = 0;
for (int64 i = 0; i < n; ++i) {
for (int j = 0; j < num_ids[i]; ++j) {
indices(c) = i;
ids(c) = id_and_weight[c].first;
weights(c) = id_and_weight[c].second;
++c;
}
}
}
};
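// Illustrative example (not from the original source): for an input batch of
// two serialized SparseFeatures protos, {id: 3 id: 7 weight: 0.5 weight: 1.5}
// and {id: 2} (no weights, so each id defaults to weight 1.0), the op outputs
//   indices = [0, 0, 1], ids = [3, 7, 2], weights = [0.5, 1.5, 1.0].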
REGISTER_KERNEL_BUILDER(Name("UnpackSparseFeatures").Device(DEVICE_CPU),
UnpackSparseFeatures);
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/utils.h"
#include "tensorflow/core/platform/macros.h"
namespace syntaxnet {
namespace utils {
bool ParseInt32(const char *c_str, int *value) {
char *temp;
*value = strtol(c_str, &temp, 0); // NOLINT
return (*temp == '\0');
}
bool ParseInt64(const char *c_str, int64 *value) {
  char *temp;
  // Use strtoll so the full 64-bit range parses even where long is 32 bits.
  *value = strtoll(c_str, &temp, 0);  // NOLINT
  return (*temp == '\0');
}
bool ParseDouble(const char *c_str, double *value) {
char *temp;
*value = strtod(c_str, &temp);
return (*temp == '\0');
}
static char hex_char[] = "0123456789abcdef";
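// Illustrative examples (not from the original source):
//   CEscape("tab\there") returns "tab\\there"
//   CEscape("\x01") returns "\\001" (non-printables get three octal digits)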
string CEscape(const string &src) {
string dest;
for (unsigned char c : src) {
switch (c) {
case '\n':
dest.append("\\n");
break;
case '\r':
dest.append("\\r");
break;
case '\t':
dest.append("\\t");
break;
case '\"':
dest.append("\\\"");
break;
case '\'':
dest.append("\\'");
break;
case '\\':
dest.append("\\\\");
break;
      default:
        // Emit non-printable and non-ASCII bytes as three octal digits.
        // Octal is used rather than \xNN because if the next source character
        // happened to be a hex digit, C would otherwise interpret it as part
        // of the escape.
if ((c >= 0x80) || !isprint(c)) {
dest.append("\\");
dest.push_back(hex_char[c / 64]);
dest.push_back(hex_char[(c % 64) / 8]);
dest.push_back(hex_char[c % 8]);
} else {
dest.push_back(c);
break;
}
}
}
return dest;
}
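// Illustrative examples (not from the original source):
//   Split("a,b,,c", ',') returns {"a", "b", "", "c"}
//   Split("", ',') returns {} (an empty input yields no tokens)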
std::vector<string> Split(const string &text, char delim) {
std::vector<string> result;
int token_start = 0;
if (!text.empty()) {
for (size_t i = 0; i < text.size() + 1; i++) {
if ((i == text.size()) || (text[i] == delim)) {
result.push_back(string(text.data() + token_start, i - token_start));
token_start = i + 1;
}
}
}
return result;
}
bool IsAbsolutePath(tensorflow::StringPiece path) {
return !path.empty() && path[0] == '/';
}
// Appends the given paths together, inserting path separators between them
// as necessary.
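// Illustrative example (not from the original source):
//   JoinPath({"/usr", "local/", "/bin"}) returns "/usr/local/bin"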
string JoinPath(std::initializer_list<tensorflow::StringPiece> paths) {
string result;
for (tensorflow::StringPiece path : paths) {
if (path.empty()) {
continue;
}
if (result.empty()) {
result = path.ToString();
continue;
}
if (result[result.size() - 1] == '/') {
if (IsAbsolutePath(path)) {
tensorflow::strings::StrAppend(&result, path.substr(1));
} else {
tensorflow::strings::StrAppend(&result, path);
}
} else {
if (IsAbsolutePath(path)) {
tensorflow::strings::StrAppend(&result, path);
} else {
tensorflow::strings::StrAppend(&result, "/", path);
}
}
}
return result;
}
size_t RemoveLeadingWhitespace(tensorflow::StringPiece *text) {
size_t count = 0;
const char *ptr = text->data();
while (count < text->size() && isspace(*ptr)) {
count++;
ptr++;
}
text->remove_prefix(count);
return count;
}
size_t RemoveTrailingWhitespace(tensorflow::StringPiece *text) {
size_t count = 0;
const char *ptr = text->data() + text->size() - 1;
while (count < text->size() && isspace(*ptr)) {
++count;
--ptr;
}
text->remove_suffix(count);
return count;
}
size_t RemoveWhitespaceContext(tensorflow::StringPiece *text) {
  // Uses RemoveLeadingWhitespace() and RemoveTrailingWhitespace() to do the
  // job.
return RemoveLeadingWhitespace(text) + RemoveTrailingWhitespace(text);
}
namespace {
// Lower-level versions of Get... that read directly from a character buffer
// without any bounds checking.
inline uint32 DecodeFixed32(const char *ptr) {
return ((static_cast<uint32>(static_cast<unsigned char>(ptr[0]))) |
(static_cast<uint32>(static_cast<unsigned char>(ptr[1])) << 8) |
(static_cast<uint32>(static_cast<unsigned char>(ptr[2])) << 16) |
(static_cast<uint32>(static_cast<unsigned char>(ptr[3])) << 24));
}
// 0xff is in case char is signed.
static inline uint32 ByteAs32(char c) { return static_cast<uint32>(c) & 0xff; }
} // namespace
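// Illustrative usage (not from the original source; "word" and "num_buckets"
// are placeholders): bucket a string with a fixed seed:
//   uint32 bucket = Hash32(word.data(), word.size(), 0xDEADBEEF) % num_buckets;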
uint32 Hash32(const char *data, size_t n, uint32 seed) {
// 'm' and 'r' are mixing constants generated offline.
// They're not really 'magic', they just happen to work well.
const uint32 m = 0x5bd1e995;
const int r = 24;
// Initialize the hash to a 'random' value
uint32 h = seed ^ n;
// Mix 4 bytes at a time into the hash
while (n >= 4) {
uint32 k = DecodeFixed32(data);
k *= m;
k ^= k >> r;
k *= m;
h *= m;
h ^= k;
data += 4;
n -= 4;
}
// Handle the last few bytes of the input array
switch (n) {
case 3:
h ^= ByteAs32(data[2]) << 16;
TF_FALLTHROUGH_INTENDED;
case 2:
h ^= ByteAs32(data[1]) << 8;
TF_FALLTHROUGH_INTENDED;
case 1:
h ^= ByteAs32(data[0]);
h *= m;
}
// Do a few final mixes of the hash to ensure the last few
// bytes are well-incorporated.
h ^= h >> 13;
h *= m;
h ^= h >> 15;
return h;
}
string Lowercase(tensorflow::StringPiece s) {
string result(s.data(), s.size());
for (char &c : result) {
c = tolower(c);
}
return result;
}
PunctuationUtil::CharacterRange PunctuationUtil::kPunctuation[] = {
{33, 35}, {37, 42}, {44, 47}, {58, 59},
{63, 64}, {91, 93}, {95, 95}, {123, 123},
{125, 125}, {161, 161}, {171, 171}, {183, 183},
{187, 187}, {191, 191}, {894, 894}, {903, 903},
{1370, 1375}, {1417, 1418}, {1470, 1470}, {1472, 1472},
{1475, 1475}, {1478, 1478}, {1523, 1524}, {1548, 1549},
{1563, 1563}, {1566, 1567}, {1642, 1645}, {1748, 1748},
{1792, 1805}, {2404, 2405}, {2416, 2416}, {3572, 3572},
{3663, 3663}, {3674, 3675}, {3844, 3858}, {3898, 3901},
{3973, 3973}, {4048, 4049}, {4170, 4175}, {4347, 4347},
{4961, 4968}, {5741, 5742}, {5787, 5788}, {5867, 5869},
{5941, 5942}, {6100, 6102}, {6104, 6106}, {6144, 6154},
{6468, 6469}, {6622, 6623}, {6686, 6687}, {8208, 8231},
{8240, 8259}, {8261, 8273}, {8275, 8286}, {8317, 8318},
{8333, 8334}, {9001, 9002}, {9140, 9142}, {10088, 10101},
{10181, 10182}, {10214, 10219}, {10627, 10648}, {10712, 10715},
{10748, 10749}, {11513, 11516}, {11518, 11519}, {11776, 11799},
{11804, 11805}, {12289, 12291}, {12296, 12305}, {12308, 12319},
{12336, 12336}, {12349, 12349}, {12448, 12448}, {12539, 12539},
{64830, 64831}, {65040, 65049}, {65072, 65106}, {65108, 65121},
{65123, 65123}, {65128, 65128}, {65130, 65131}, {65281, 65283},
{65285, 65290}, {65292, 65295}, {65306, 65307}, {65311, 65312},
{65339, 65341}, {65343, 65343}, {65371, 65371}, {65373, 65373},
{65375, 65381}, {65792, 65793}, {66463, 66463}, {68176, 68184},
{-1, -1}};
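// Illustrative examples (not from the original source): NormalizeDigits maps
// "2016" to "9999" and "b2b" to "b9b"; non-digit characters are unchanged.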
void NormalizeDigits(string *form) {
for (size_t i = 0; i < form->size(); ++i) {
if ((*form)[i] >= '0' && (*form)[i] <= '9') (*form)[i] = '9';
}
}
} // namespace utils
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef $TARGETDIR_UTILS_H_
#define $TARGETDIR_UTILS_H_
#include <functional>
#include <string>
#include <vector>
#include <unordered_set>
#include "syntaxnet/base.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/default/integral_types.h"
#include "tensorflow/core/platform/mutex.h"
#include "util/utf8/unicodetext.h"
namespace syntaxnet {
namespace utils {
bool ParseInt32(const char *c_str, int *value);
bool ParseInt64(const char *c_str, int64 *value);
bool ParseDouble(const char *c_str, double *value);
template <typename T>
T ParseUsing(const string &str, std::function<bool(const char *, T *)> func) {
T value;
CHECK(func(str.c_str(), &value)) << "Failed to convert: " << str;
return value;
}
template <typename T>
T ParseUsing(const string &str, T defval,
std::function<bool(const char *, T *)> func) {
return str.empty() ? defval : ParseUsing<T>(str, func);
}
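// Illustrative example (not from the original source), mirroring the CoNLL
// reader: parse a head field, falling back to 0 when the field is empty:
//   int head = ParseUsing<int>(fields[6], 0, ParseInt32);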
string CEscape(const string &src);
std::vector<string> Split(const string &text, char delim);
template <typename T>
string Join(const std::vector<T> &s, const char *sep) {
string result;
bool first = true;
for (const auto &x : s) {
tensorflow::strings::StrAppend(&result, (first ? "" : sep), x);
first = false;
}
return result;
}
string JoinPath(std::initializer_list<StringPiece> paths);
size_t RemoveLeadingWhitespace(tensorflow::StringPiece *text);
size_t RemoveTrailingWhitespace(tensorflow::StringPiece *text);
size_t RemoveWhitespaceContext(tensorflow::StringPiece *text);
uint32 Hash32(const char *data, size_t n, uint32 seed);
// Deletes all the elements in an STL container and clears the container. This
// function is suitable for use with a vector, set, hash_set, or any other STL
// container which defines sensible begin(), end(), and clear() methods.
// If container is NULL, this function is a no-op.
template <typename T>
void STLDeleteElements(T *container) {
if (!container) return;
auto it = container->begin();
while (it != container->end()) {
auto temp = it;
++it;
delete *temp;
}
container->clear();
}
// Returns lower-cased version of s.
string Lowercase(tensorflow::StringPiece s);
class PunctuationUtil {
public:
// Unicode character ranges for punctuation characters according to CoNLL.
struct CharacterRange {
int first;
int last;
};
static CharacterRange kPunctuation[];
// Returns true if Unicode character is a punctuation character.
static bool IsPunctuation(int u) {
int i = 0;
while (kPunctuation[i].first > 0) {
if (u < kPunctuation[i].first) return false;
if (u <= kPunctuation[i].last) return true;
++i;
}
return false;
}
// Determine if tag is a punctuation tag.
static bool IsPunctuationTag(const string &tag) {
for (size_t i = 0; i < tag.length(); ++i) {
int c = tag[i];
if (c != ',' && c != ':' && c != '.' && c != '\'' && c != '`') {
return false;
}
}
return true;
}
// Returns true if word consists of punctuation characters.
static bool IsPunctuationToken(const string &word) {
UnicodeText text;
text.PointToUTF8(word.c_str(), word.length());
UnicodeText::const_iterator it;
for (it = text.begin(); it != text.end(); ++it) {
if (!IsPunctuation(*it)) return false;
}
return true;
}
// Returns true if tag is non-empty and has only punctuation or parens
// symbols.
static bool IsPunctuationTagOrParens(const string &tag) {
if (tag.empty()) return false;
for (size_t i = 0; i < tag.length(); ++i) {
int c = tag[i];
if (c != '(' && c != ')' && c != ',' && c != ':' && c != '.' &&
c != '\'' && c != '`') {
return false;
}
}
return true;
}
  // Decides whether to score a token, given the word, the POS tag and the
  // scoring type.
static bool ScoreToken(const string &word, const string &tag,
const string &scoring_type) {
if (scoring_type == "default") {
return tag.empty() || !IsPunctuationTag(tag);
} else if (scoring_type == "conllx") {
return !IsPunctuationToken(word);
} else if (scoring_type == "ignore_parens") {
return !IsPunctuationTagOrParens(tag);
}
CHECK(scoring_type.empty()) << "Unknown scoring strategy " << scoring_type;
return true;
}
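  // Illustrative examples (not from the original source):
  //   ScoreToken(".", ".", "default") returns false.
  //   ScoreToken("hello", "NN", "conllx") returns true.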
};
void NormalizeDigits(string *form);
} // namespace utils
} // namespace syntaxnet
#endif // $TARGETDIR_UTILS_H_
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/workspace.h"
#include "tensorflow/core/lib/strings/strcat.h"
namespace syntaxnet {
string WorkspaceRegistry::DebugString() const {
string str;
for (auto &it : workspace_names_) {
const string &type_name = workspace_types_.at(it.first);
for (size_t index = 0; index < it.second.size(); ++index) {
const string &workspace_name = it.second[index];
tensorflow::strings::StrAppend(&str, "\n ", type_name, " :: ",
workspace_name);
}
}
return str;
}
VectorIntWorkspace::VectorIntWorkspace(int size) : elements_(size) {}
VectorIntWorkspace::VectorIntWorkspace(int size, int value)
: elements_(size, value) {}
VectorIntWorkspace::VectorIntWorkspace(const vector<int> &elements)
: elements_(elements) {}
string VectorIntWorkspace::TypeName() { return "Vector"; }
VectorVectorIntWorkspace::VectorVectorIntWorkspace(int size)
: elements_(size) {}
string VectorVectorIntWorkspace::TypeName() { return "VectorVector"; }
} // namespace syntaxnet
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Notes on thread-safety: All of the classes here are thread-compatible. More
// specifically, the registry machinery is thread-safe, as long as each thread
// performs feature extraction on a different Sentence object.
#ifndef $TARGETDIR_WORKSPACE_H_
#define $TARGETDIR_WORKSPACE_H_
#include <string>
#include <typeindex>
#include <unordered_map>
#include <utility>
#include <vector>
#include "syntaxnet/utils.h"
namespace syntaxnet {
// A base class for shared workspaces. Derived classes implement a static
// member function TypeName() which returns a human-readable string name for
// the class.
class Workspace {
public:
// Polymorphic destructor.
virtual ~Workspace() {}
protected:
// Create an empty workspace.
Workspace() {}
private:
TF_DISALLOW_COPY_AND_ASSIGN(Workspace);
};
// A registry that keeps track of workspaces.
class WorkspaceRegistry {
public:
// Create an empty registry.
WorkspaceRegistry() {}
// Returns the index of a named workspace, adding it to the registry first
// if necessary.
template <class W>
int Request(const string &name) {
const std::type_index id = std::type_index(typeid(W));
workspace_types_[id] = W::TypeName();
vector<string> &names = workspace_names_[id];
for (int i = 0; i < names.size(); ++i) {
if (names[i] == name) return i;
}
names.push_back(name);
return names.size() - 1;
}
const std::unordered_map<std::type_index, vector<string> > &WorkspaceNames()
const {
return workspace_names_;
}
// Returns a string describing the registered workspaces.
string DebugString() const;
private:
// Workspace type names, indexed as workspace_types_[typeid].
std::unordered_map<std::type_index, string> workspace_types_;
// Workspace names, indexed as workspace_names_[typeid][workspace].
std::unordered_map<std::type_index, vector<string> > workspace_names_;
TF_DISALLOW_COPY_AND_ASSIGN(WorkspaceRegistry);
};
// A typed collection of workspaces. The workspaces are indexed according to an
// external WorkspaceRegistry. If the WorkspaceSet is const, the contents are
// also immutable.
class WorkspaceSet {
public:
~WorkspaceSet() { Reset(WorkspaceRegistry()); }
// Returns true if a workspace has been set.
template <class W>
bool Has(int index) const {
const std::type_index id = std::type_index(typeid(W));
DCHECK(workspaces_.find(id) != workspaces_.end());
DCHECK_LT(index, workspaces_.find(id)->second.size());
return workspaces_.find(id)->second[index] != nullptr;
}
// Returns an indexed workspace; the workspace must have been set.
template <class W>
const W &Get(int index) const {
DCHECK(Has<W>(index));
const Workspace *w =
workspaces_.find(std::type_index(typeid(W)))->second[index];
return reinterpret_cast<const W &>(*w);
}
// Sets an indexed workspace; this takes ownership of the workspace, which
// must have been new-allocated. It is an error to set a workspace twice.
template <class W>
void Set(int index, W *workspace) {
const std::type_index id = std::type_index(typeid(W));
DCHECK(workspaces_.find(id) != workspaces_.end());
DCHECK_LT(index, workspaces_[id].size());
DCHECK(workspaces_[id][index] == nullptr);
DCHECK(workspace != nullptr);
workspaces_[id][index] = workspace;
}
void Reset(const WorkspaceRegistry &registry) {
// Deallocate current workspaces.
for (auto &it : workspaces_) {
for (size_t index = 0; index < it.second.size(); ++index) {
delete it.second[index];
}
}
workspaces_.clear();
// Allocate space for new workspaces.
for (auto &it : registry.WorkspaceNames()) {
workspaces_[it.first].resize(it.second.size());
}
}
private:
// The set of workspaces, indexed as workspaces_[typeid][index].
std::unordered_map<std::type_index, vector<Workspace *> > workspaces_;
};
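// Illustrative usage sketch (not from the original source; the workspace name
// "tags" is a placeholder): request a slot in a registry, then populate it in
// a per-sentence WorkspaceSet:
//
//   WorkspaceRegistry registry;
//   const int slot = registry.Request<VectorIntWorkspace>("tags");
//   WorkspaceSet workspaces;
//   workspaces.Reset(registry);
//   workspaces.Set<VectorIntWorkspace>(slot, new VectorIntWorkspace(10));
//   const VectorIntWorkspace &tags = workspaces.Get<VectorIntWorkspace>(slot);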
// A workspace that wraps around a single int.
class SingletonIntWorkspace : public Workspace {
public:
// Default-initializes the int value.
SingletonIntWorkspace() {}
// Initializes the int with the given value.
explicit SingletonIntWorkspace(int value) : value_(value) {}
// Returns the name of this type of workspace.
static string TypeName() { return "SingletonInt"; }
// Returns the int value.
int get() const { return value_; }
// Sets the int value.
void set(int value) { value_ = value; }
private:
// The enclosed int.
int value_ = 0;
};
// A workspace that wraps around a vector of int.
class VectorIntWorkspace : public Workspace {
public:
// Creates a vector of the given size.
explicit VectorIntWorkspace(int size);
// Creates a vector initialized with the given array.
explicit VectorIntWorkspace(const vector<int> &elements);
// Creates a vector of the given size, with each element initialized to the
// given value.
VectorIntWorkspace(int size, int value);
// Returns the name of this type of workspace.
static string TypeName();
// Returns the i'th element.
int element(int i) const { return elements_[i]; }
// Sets the i'th element.
void set_element(int i, int value) { elements_[i] = value; }
private:
// The enclosed vector.
vector<int> elements_;
};
// A workspace that wraps around a vector of vector of int.
class VectorVectorIntWorkspace : public Workspace {
public:
// Creates a vector of empty vectors of the given size.
explicit VectorVectorIntWorkspace(int size);
// Returns the name of this type of workspace.
static string TypeName();
// Returns the i'th vector of elements.
const vector<int> &elements(int i) const { return elements_[i]; }
// Mutable access to the i'th vector of elements.
vector<int> *mutable_elements(int i) { return &(elements_[i]); }
private:
// The enclosed vector of vector of elements.
vector<vector<int> > elements_;
};
} // namespace syntaxnet
#endif // $TARGETDIR_WORKSPACE_H_
Subproject commit 3402f51ecd11a26d0c071b1d06b4edab1b0ef351
licenses(["notice"])
cc_library(
name = "utf",
srcs = [
"rune.c",
"runestrcat.c",
"runestrchr.c",
"runestrcmp.c",
"runestrcpy.c",
"runestrdup.c",
"runestrecpy.c",
"runestrlen.c",
"runestrncat.c",
"runestrncmp.c",
"runestrncpy.c",
"runestrrchr.c",
"runestrstr.c",
"runetype.c",
"utfecpy.c",
"utflen.c",
"utfnlen.c",
"utfrrune.c",
"utfrune.c",
"utfutf.c",
],
hdrs = [
"runetypebody.c",
"utf.h",
"utfdef.h",
],
includes = ["."],
visibility = ["//visibility:public"],
)
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 1998-2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "third_party/utf/utf.h"
#include "third_party/utf/utfdef.h"
enum
{
Bit1 = 7,
Bitx = 6,
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
	Rune4	= (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
Bad = Runeerror,
};
/*
* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
* This is a slower but "safe" version of the old chartorune
* that works on strings that are not necessarily null-terminated.
*
* If you know for sure that your string is null-terminated,
* chartorune will be a bit faster.
*
* It is guaranteed not to attempt to access "length"
* past the incoming pointer. This is to avoid
* possible access violations. If the string appears to be
* well-formed but incomplete (i.e., to get the whole Rune
* we'd need to read past str+length) then we'll set the Rune
* to Bad and return 0.
*
* Note that if we have decoding problems for other
* reasons, we return 1 instead of 0.
*/
int
charntorune(Rune *rune, const char *str, int length)
{
int c, c1, c2, c3;
long l;
/* When we're not allowed to read anything */
if(length <= 0) {
goto badlen;
}
/*
* one character sequence (7-bit value)
* 00000-0007F => T1
*/
c = *(uchar*)str;
if(c < Tx) {
*rune = c;
return 1;
}
	/* If we can't read more than one character we must stop. */
if(length <= 1) {
goto badlen;
}
/*
* two character sequence (11-bit value)
* 0080-07FF => T2 Tx
*/
c1 = *(uchar*)(str+1) ^ Tx;
if(c1 & Testx)
goto bad;
if(c < T3) {
if(c < T2)
goto bad;
l = ((c << Bitx) | c1) & Rune2;
if(l <= Rune1)
goto bad;
*rune = l;
return 2;
}
	/* If we can't read more than two characters we must stop. */
if(length <= 2) {
goto badlen;
}
/*
* three character sequence (16-bit value)
* 0800-FFFF => T3 Tx Tx
*/
c2 = *(uchar*)(str+2) ^ Tx;
if(c2 & Testx)
goto bad;
if(c < T4) {
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
*rune = l;
return 3;
}
if (length <= 3)
goto badlen;
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
c3 = *(uchar*)(str+3) ^ Tx;
if (c3 & Testx)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if (l <= Rune3)
goto bad;
if (l > Runemax)
goto bad;
*rune = l;
return 4;
}
	/*
	 * Support for 5-byte or longer UTF-8 would go here, but
	 * since we don't have that, we'll just fall through to bad.
	 */
/*
* bad decoding
*/
bad:
*rune = Bad;
return 1;
badlen:
*rune = Bad;
return 0;
}
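/*
 * Illustrative example (not from the original source): for the two-byte
 * UTF-8 sequence 0xC3 0xA9 ("é"), charntorune(&r, "\xC3\xA9", 2) returns 2
 * and sets r to 0xE9, while the truncated call charntorune(&r, "\xC3", 1)
 * sets r to Runeerror and returns 0.
 */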
/*
* This is the older "unsafe" version, which works fine on
* null-terminated strings.
*/
int
chartorune(Rune *rune, const char *str)
{
int c, c1, c2, c3;
long l;
/*
* one character sequence
* 00000-0007F => T1
*/
c = *(uchar*)str;
if(c < Tx) {
*rune = c;
return 1;
}
/*
* two character sequence
* 0080-07FF => T2 Tx
*/
c1 = *(uchar*)(str+1) ^ Tx;
if(c1 & Testx)
goto bad;
if(c < T3) {
if(c < T2)
goto bad;
l = ((c << Bitx) | c1) & Rune2;
if(l <= Rune1)
goto bad;
*rune = l;
return 2;
}
/*
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
c2 = *(uchar*)(str+2) ^ Tx;
if(c2 & Testx)
goto bad;
if(c < T4) {
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
*rune = l;
return 3;
}
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
c3 = *(uchar*)(str+3) ^ Tx;
if (c3 & Testx)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if (l <= Rune3)
goto bad;
if (l > Runemax)
goto bad;
*rune = l;
return 4;
}
/*
* Support for 5-byte or longer UTF-8 would go here, but
* since we don't have that, we'll just fall through to bad.
*/
/*
* bad decoding
*/
bad:
*rune = Bad;
return 1;
}
int
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
*consumed = charntorune(rune, str, length);
return *rune != Runeerror || *consumed == 3;
}
int
runetochar(char *str, const Rune *rune)
{
/* Runes are signed, so convert to unsigned for range check. */
unsigned long c;
/*
* one character sequence
* 00000-0007F => 00-7F
*/
c = *rune;
if(c <= Rune1) {
str[0] = c;
return 1;
}
/*
* two character sequence
* 0080-07FF => T2 Tx
*/
if(c <= Rune2) {
str[0] = T2 | (c >> 1*Bitx);
str[1] = Tx | (c & Maskx);
return 2;
}
/*
* If the Rune is out of range, convert it to the error rune.
* Do this test here because the error rune encodes to three bytes.
* Doing it earlier would duplicate work, since an out of range
* Rune wouldn't have fit in one or two bytes.
*/
if (c > Runemax)
c = Runeerror;
/*
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
if (c <= Rune3) {
str[0] = T3 | (c >> 2*Bitx);
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
str[2] = Tx | (c & Maskx);
return 3;
}
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
return 4;
}
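/*
 * Illustrative example (not from the original source): encoding rune 0xE9
 * ("é") writes the two bytes 0xC3 0xA9 and returns 2; runes above Runemax
 * are first converted to Runeerror and therefore encode to three bytes.
 */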
int
runelen(Rune rune)
{
char str[10];
return runetochar(str, &rune);
}
int
runenlen(const Rune *r, int nrune)
{
int nb;
ulong c; /* Rune is signed, so use unsigned for range check. */
nb = 0;
while(nrune--) {
c = *r++;
if (c <= Rune1)
nb++;
else if (c <= Rune2)
nb += 2;
else if (c <= Rune3)
nb += 3;
else if (c <= Runemax)
nb += 4;
else
nb += 3; /* Runeerror = 0xFFFD, see runetochar */
}
return nb;
}
int
fullrune(const char *str, int n)
{
if (n > 0) {
int c = *(uchar*)str;
if (c < Tx)
return 1;
if (n > 1) {
if (c < T3)
return 1;
if (n > 2) {
if (c < T4 || n > 3)
return 1;
}
}
}
return 0;
}
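/*
 * Illustrative example (not from the original source): fullrune("\xC3", 1)
 * returns 0 because the lead byte promises a second byte that is missing,
 * while fullrune("\xC3\xA9", 2) returns 1.
 */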
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "third_party/utf/utf.h"
#include "third_party/utf/utfdef.h"
Rune*
runestrcat(Rune *s1, const Rune *s2)
{
runestrcpy((Rune*)runestrchr(s1, 0), s2);
return s1;
}