Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
32ab5a58
Commit
32ab5a58
authored
May 12, 2016
by
calberti
Committed by
Martin Wicke
May 12, 2016
Browse files
Adding SyntaxNet to tensorflow/models (#63)
parent
148a15fb
Changes
131
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3505 additions
and
0 deletions
+3505
-0
syntaxnet/syntaxnet/task_context.h
syntaxnet/syntaxnet/task_context.h
+80
-0
syntaxnet/syntaxnet/task_spec.proto
syntaxnet/syntaxnet/task_spec.proto
+82
-0
syntaxnet/syntaxnet/term_frequency_map.cc
syntaxnet/syntaxnet/term_frequency_map.cc
+188
-0
syntaxnet/syntaxnet/term_frequency_map.h
syntaxnet/syntaxnet/term_frequency_map.h
+117
-0
syntaxnet/syntaxnet/test_main.cc
syntaxnet/syntaxnet/test_main.cc
+45
-0
syntaxnet/syntaxnet/testdata/context.pbtxt
syntaxnet/syntaxnet/testdata/context.pbtxt
+87
-0
syntaxnet/syntaxnet/testdata/document
syntaxnet/syntaxnet/testdata/document
+145
-0
syntaxnet/syntaxnet/testdata/mini-training-set
syntaxnet/syntaxnet/testdata/mini-training-set
+1017
-0
syntaxnet/syntaxnet/text_formats.cc
syntaxnet/syntaxnet/text_formats.cc
+399
-0
syntaxnet/syntaxnet/text_formats_test.py
syntaxnet/syntaxnet/text_formats_test.py
+108
-0
syntaxnet/syntaxnet/unpack_sparse_features.cc
syntaxnet/syntaxnet/unpack_sparse_features.cc
+111
-0
syntaxnet/syntaxnet/utils.cc
syntaxnet/syntaxnet/utils.cc
+260
-0
syntaxnet/syntaxnet/utils.h
syntaxnet/syntaxnet/utils.h
+171
-0
syntaxnet/syntaxnet/workspace.cc
syntaxnet/syntaxnet/workspace.cc
+50
-0
syntaxnet/syntaxnet/workspace.h
syntaxnet/syntaxnet/workspace.h
+215
-0
syntaxnet/tensorflow
syntaxnet/tensorflow
+1
-0
syntaxnet/third_party/utf/BUILD
syntaxnet/third_party/utf/BUILD
+34
-0
syntaxnet/third_party/utf/README
syntaxnet/third_party/utf/README
+13
-0
syntaxnet/third_party/utf/rune.c
syntaxnet/third_party/utf/rune.c
+357
-0
syntaxnet/third_party/utf/runestrcat.c
syntaxnet/third_party/utf/runestrcat.c
+25
-0
No files found.
syntaxnet/syntaxnet/task_context.h
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef $TARGETDIR_TASK_CONTEXT_H_
#define $TARGETDIR_TASK_CONTEXT_H_
#include <string>
#include <vector>
#include "syntaxnet/task_spec.pb.h"
#include "syntaxnet/utils.h"
namespace
syntaxnet
{
// A task context holds configuration information for a task. It is basically a
// wrapper around a TaskSpec protocol buffer.
class TaskContext {
 public:
  // Returns the underlying task specification protocol buffer for the context.
  const TaskSpec &spec() const { return spec_; }
  // Returns a mutable pointer to the underlying task specification.
  TaskSpec *mutable_spec() { return &spec_; }

  // Returns a named input descriptor for the task. A new input is created if
  // the task context does not already have an input with that name.
  TaskInput *GetInput(const string &name);
  // Same as above, but also registers the given file and record formats on the
  // input descriptor.
  TaskInput *GetInput(const string &name, const string &file_format,
                      const string &record_format);

  // Sets task parameter.
  void SetParameter(const string &name, const string &value);

  // Returns task parameter. If the parameter is not in the task configuration
  // the (default) value of the corresponding command line flag is returned.
  string GetParameter(const string &name) const;
  int GetIntParameter(const string &name) const;
  int64 GetInt64Parameter(const string &name) const;
  bool GetBoolParameter(const string &name) const;
  double GetFloatParameter(const string &name) const;

  // Returns task parameter. If the parameter is not in the task configuration
  // the default value is returned. Parameters retrieved using these methods
  // don't need to be defined with a DEFINE_*() macro.
  string Get(const string &name, const string &defval) const;
  string Get(const string &name, const char *defval) const;
  int Get(const string &name, int defval) const;
  int64 Get(const string &name, int64 defval) const;
  double Get(const string &name, double defval) const;
  bool Get(const string &name, bool defval) const;

  // Returns input file name for a single-file task input.
  static string InputFile(const TaskInput &input);

  // Returns true if task input supports the file and record format.
  static bool Supports(const TaskInput &input, const string &file_format,
                       const string &record_format);

 private:
  // Underlying task specification protocol buffer.
  TaskSpec spec_;

  // Vector of parameters required by this task. These must be specified in the
  // task rather than relying on default values.
  vector<string> required_parameters_;
};
}
// namespace syntaxnet
#endif // $TARGETDIR_TASK_CONTEXT_H_
syntaxnet/syntaxnet/task_spec.proto
0 → 100644
View file @
32ab5a58
// LINT: ALLOW_GROUPS
// Protocol buffer specifications for task configuration.
syntax
=
"proto2"
;
package
syntaxnet
;
// Task input descriptor.
message TaskInput {
  // Name of input resource.
  required string name = 1;

  // Name of stage responsible of creating this resource.
  optional string creator = 2;

  // File format for resource.
  repeated string file_format = 3;

  // Record format for resource.
  repeated string record_format = 4;

  // Is this resource multi-file?
  optional bool multi_file = 5 [default = false];

  // An input can consist of multiple file sets.
  repeated group Part = 6 {
    // File pattern for file set.
    optional string file_pattern = 7;

    // File format for file set.
    optional string file_format = 8;

    // Record format for file set.
    optional string record_format = 9;
  }
}
// Task output descriptor.
message TaskOutput {
  // Name of output resource.
  required string name = 1;

  // File format for output resource.
  optional string file_format = 2;

  // Record format for output resource.
  optional string record_format = 3;

  // Number of shards in output. If it is different from zero this output is
  // sharded. If the number of shards is set to -1 this means that the output is
  // sharded, but the number of shard is unknown. The files are then named
  // 'base-*-of-*'.
  optional int32 shards = 4 [default = 0];

  // Base file name for output resource. If this is not set by the task
  // component it is set to a default value by the workflow engine.
  optional string file_base = 5;

  // Optional extension added to the file name.
  optional string file_extension = 6;
}
// A task specification is used for describing executing parameters.
message TaskSpec {
  // Name of task.
  optional string task_name = 1;

  // Workflow task type.
  optional string task_type = 2;

  // Task parameters (name/value pairs).
  repeated group Parameter = 3 {
    required string name = 4;
    optional string value = 5;
  }

  // Task inputs.
  repeated TaskInput input = 6;

  // Task outputs.
  repeated TaskOutput output = 7;
}
syntaxnet/syntaxnet/term_frequency_map.cc
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/term_frequency_map.h"
#include <stddef.h>
#include <algorithm>
#include <limits>
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/io/inputbuffer.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
namespace
syntaxnet
{
// Increases the frequency of |term| by one, inserting it with frequency 1 if
// it is not already present, and returns the term's index.
int TermFrequencyMap::Increment(const string &term) {
  CHECK_EQ(term_index_.size(), term_data_.size());
  const TermIndex::const_iterator it = term_index_.find(term);
  // Fix: reuse the iterator from the lookup above instead of performing a
  // second, redundant term_index_.find(term) in the condition.
  if (it != term_index_.end()) {
    // Increment the existing term.
    pair<string, int64> &data = term_data_[it->second];
    CHECK_EQ(term, data.first);
    ++(data.second);
    return it->second;
  } else {
    // Add a new term.
    const int index = term_index_.size();
    CHECK_LT(index, std::numeric_limits<int32>::max());  // overflow
    term_index_[term] = index;
    term_data_.push_back(pair<string, int64>(term, 1));
    return index;
  }
}
// Removes all terms and frequencies, leaving the map empty.
void TermFrequencyMap::Clear() {
  term_index_.clear();
  term_data_.clear();
}
void
TermFrequencyMap
::
Load
(
const
string
&
filename
,
int
min_frequency
,
int
max_num_terms
)
{
Clear
();
// If max_num_terms is non-positive, replace it with INT_MAX.
if
(
max_num_terms
<=
0
)
max_num_terms
=
std
::
numeric_limits
<
int
>::
max
();
// Read the first line (total # of terms in the mapping).
tensorflow
::
RandomAccessFile
*
file
;
TF_CHECK_OK
(
tensorflow
::
Env
::
Default
()
->
NewRandomAccessFile
(
filename
,
&
file
));
static
const
int
kInputBufferSize
=
1
*
1024
*
1024
;
/* bytes */
tensorflow
::
io
::
InputBuffer
input
(
file
,
kInputBufferSize
);
string
line
;
TF_CHECK_OK
(
input
.
ReadLine
(
&
line
));
int32
total
=
-
1
;
CHECK
(
utils
::
ParseInt32
(
line
.
c_str
(),
&
total
));
CHECK_GE
(
total
,
0
);
// Read the mapping.
int64
last_frequency
=
-
1
;
for
(
int
i
=
0
;
i
<
total
&&
i
<
max_num_terms
;
++
i
)
{
TF_CHECK_OK
(
input
.
ReadLine
(
&
line
));
vector
<
string
>
elements
=
utils
::
Split
(
line
,
' '
);
CHECK_EQ
(
2
,
elements
.
size
());
CHECK
(
!
elements
[
0
].
empty
());
CHECK
(
!
elements
[
1
].
empty
());
int64
frequency
=
0
;
CHECK
(
utils
::
ParseInt64
(
elements
[
1
].
c_str
(),
&
frequency
));
CHECK_GT
(
frequency
,
0
);
const
string
&
term
=
elements
[
0
];
// Check frequency sorting (descending order).
if
(
i
>
0
)
CHECK_GE
(
last_frequency
,
frequency
);
last_frequency
=
frequency
;
// Ignore low-frequency items.
if
(
frequency
<
min_frequency
)
continue
;
// Check uniqueness of the mapped terms.
CHECK
(
term_index_
.
find
(
term
)
==
term_index_
.
end
())
<<
"File "
<<
filename
<<
" has duplicate term: "
<<
term
;
// Assign the next available index.
const
int
index
=
term_index_
.
size
();
term_index_
[
term
]
=
index
;
term_data_
.
push_back
(
pair
<
string
,
int64
>
(
term
,
frequency
));
}
CHECK_EQ
(
term_index_
.
size
(),
term_data_
.
size
());
LOG
(
INFO
)
<<
"Loaded "
<<
term_index_
.
size
()
<<
" terms from "
<<
filename
<<
"."
;
}
// Comparison functor: orders term/frequency pairs by descending frequency,
// breaking ties by ascending (lexicographic) term.
struct TermFrequencyMap::SortByFrequencyThenTerm {
  bool operator()(const pair<string, int64> &lhs,
                  const pair<string, int64> &rhs) const {
    // Higher frequency sorts first; equal frequencies fall back to the term.
    if (lhs.second != rhs.second) return lhs.second > rhs.second;
    return lhs.first < rhs.first;
  }
};
// Writes the mapping to |filename| in the format read back by Load(): a line
// with the number of terms, then one "term frequency" line per term, sorted
// by descending frequency (ties broken by term).
void TermFrequencyMap::Save(const string &filename) const {
  CHECK_EQ(term_index_.size(), term_data_.size());

  // Copy and sort the term data.
  vector<pair<string, int64>> sorted_data(term_data_);
  std::sort(sorted_data.begin(), sorted_data.end(), SortByFrequencyThenTerm());

  // Write the number of terms.
  tensorflow::WritableFile *file;
  TF_CHECK_OK(tensorflow::Env::Default()->NewWritableFile(filename, &file));
  CHECK_LE(term_index_.size(), std::numeric_limits<int32>::max());  // overflow
  const int32 num_terms = term_index_.size();
  const string header = tensorflow::strings::StrCat(num_terms, "\n");
  TF_CHECK_OK(file->Append(header));

  // Write each term and frequency, verifying the descending-frequency order
  // as we go.
  for (size_t i = 0; i < sorted_data.size(); ++i) {
    if (i > 0) CHECK_GE(sorted_data[i - 1].second, sorted_data[i].second);
    const string line = tensorflow::strings::StrCat(sorted_data[i].first, " ",
                                                    sorted_data[i].second,
                                                    "\n");
    TF_CHECK_OK(file->Append(line));
  }
  TF_CHECK_OK(file->Close()) << "for file " << filename;
  LOG(INFO) << "Saved " << term_index_.size() << " terms to " << filename
            << ".";
  delete file;
}
// Loads a tag-to-category mapping from |filename|; each non-empty line is
// expected to be "tag<TAB>category".
TagToCategoryMap::TagToCategoryMap(const string &filename) {
  // Load the mapping.
  tensorflow::RandomAccessFile *file;
  TF_CHECK_OK(tensorflow::Env::Default()->NewRandomAccessFile(filename, &file));
  static const int kInputBufferSize = 1 * 1024 * 1024; /* bytes */
  tensorflow::io::InputBuffer input(file, kInputBufferSize);
  string line;
  while (input.ReadLine(&line) == tensorflow::Status::OK()) {
    vector<string> pair = utils::Split(line, '\t');
    CHECK(line.empty() || pair.size() == 2) << line;

    // Fix: an empty line passes the CHECK above, but indexing pair[0] and
    // pair[1] on it would read out of bounds; only store well-formed pairs.
    if (pair.size() == 2) tag_to_category_[pair[0]] = pair[1];
  }

  // Fix: release the file (the original leaked it). NOTE(review): confirm
  // io::InputBuffer does not take ownership in this tensorflow version.
  delete file;
}
// Returns the category associated with the given tag; check-fails when the
// tag has no mapping.
const string &TagToCategoryMap::GetCategory(const string &tag) const {
  const auto position = tag_to_category_.find(tag);
  CHECK(position != tag_to_category_.end()) << "No category found for tag "
                                            << tag;
  return position->second;
}
// Records |category| for |tag|. A tag may only ever map to one category;
// re-setting an existing tag to a different category is a fatal error.
void TagToCategoryMap::SetCategory(const string &tag, const string &category) {
  const auto existing = tag_to_category_.find(tag);
  if (existing == tag_to_category_.end()) {
    // First time we see this tag: just record the mapping.
    tag_to_category_[tag] = category;
    return;
  }
  // Tag already mapped: the category must agree with the stored one.
  CHECK_EQ(category, existing->second)
      << "POS tag cannot be mapped to multiple coarse POS tags. "
      << "'" << tag << "' is mapped to: '" << category << "' and '"
      << existing->second << "'";
}
// Writes the mapping to |filename|, one "tag<TAB>category" line per entry, in
// the format read back by the loading constructor.
void TagToCategoryMap::Save(const string &filename) const {
  // Write tag and category on each line.
  tensorflow::WritableFile *file;
  TF_CHECK_OK(tensorflow::Env::Default()->NewWritableFile(filename, &file));
  for (const auto &pair : tag_to_category_) {
    const string line =
        tensorflow::strings::StrCat(pair.first, "\t", pair.second, "\n");
    TF_CHECK_OK(file->Append(line));
  }
  TF_CHECK_OK(file->Close()) << "for file " << filename;
  delete file;
}
}
// namespace syntaxnet
syntaxnet/syntaxnet/term_frequency_map.h
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef $TARGETDIR_TERM_FREQUENCY_MAP_H_
#define $TARGETDIR_TERM_FREQUENCY_MAP_H_
#include <stddef.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "syntaxnet/utils.h"
namespace
syntaxnet
{
// A mapping from strings to frequencies with save and load functionality.
class TermFrequencyMap {
 public:
  // Creates an empty frequency map.
  TermFrequencyMap() {}

  // Creates a term frequency map by calling Load.
  TermFrequencyMap(const string &file, int min_frequency, int max_num_terms) {
    Load(file, min_frequency, max_num_terms);
  }

  // Returns the number of terms with positive frequency.
  int Size() const { return term_index_.size(); }

  // Returns the index associated with the given term. If the term does not
  // exist, the unknown index is returned instead.
  int LookupIndex(const string &term, int unknown) const {
    const TermIndex::const_iterator it = term_index_.find(term);
    return (it != term_index_.end() ? it->second : unknown);
  }

  // Returns the term associated with the given index. No bounds check is
  // performed; |index| must be in [0, Size()).
  const string &GetTerm(int index) const { return term_data_[index].first; }

  // Increases the frequency of the given term by 1, creating a new entry if
  // necessary, and returns the index of the term.
  int Increment(const string &term);

  // Clears all frequencies.
  void Clear();

  // Loads a frequency mapping from the given file, which must have been created
  // by an earlier call to Save(). After loading, the term indices are
  // guaranteed to be ordered in descending order of frequency (breaking ties
  // arbitrarily). However, any new terms inserted after loading do not
  // maintain this sorting invariant.
  //
  // Only loads terms with frequency >= min_frequency. If max_num_terms <= 0,
  // then all qualifying terms are loaded; otherwise, max_num_terms terms with
  // maximal frequency are loaded (breaking ties arbitrarily).
  void Load(const string &filename, int min_frequency, int max_num_terms);

  // Saves a frequency mapping to the given file.
  void Save(const string &filename) const;

 private:
  // Hashtable for term-to-index mapping.
  typedef std::unordered_map<string, int> TermIndex;

  // Sorting functor for term data.
  struct SortByFrequencyThenTerm;

  // Mapping from terms to indices.
  TermIndex term_index_;

  // Mapping from indices to term and frequency.
  vector<pair<string, int64>> term_data_;

  TF_DISALLOW_COPY_AND_ASSIGN(TermFrequencyMap);
};
// A mapping from tags to categories.
class TagToCategoryMap {
 public:
  TagToCategoryMap() {}
  ~TagToCategoryMap() {}

  // Loads a tag to category map from a text file.
  explicit TagToCategoryMap(const string &filename);

  // Sets the category for the given tag.
  void SetCategory(const string &tag, const string &category);

  // Returns the category associated with the given tag.
  const string &GetCategory(const string &tag) const;

  // Saves a tag to category map to the given file.
  void Save(const string &filename) const;

 private:
  // Ordered tag -> category mapping; iteration order determines the order of
  // lines written by Save().
  map<string, string> tag_to_category_;

  TF_DISALLOW_COPY_AND_ASSIGN(TagToCategoryMap);
};
}
// namespace syntaxnet
#endif // $TARGETDIR_TERM_FREQUENCY_MAP_H_
syntaxnet/syntaxnet/test_main.cc
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// A program with a main that is suitable for unittests, including those
// that also define microbenchmarks. Based on whether the user specified
// the --benchmark_filter flag which specifies which benchmarks to run,
// we will either run benchmarks or run the gtest tests in the program.
#include "tensorflow/core/platform/platform.h"
#include "tensorflow/core/platform/types.h"
#if defined(PLATFORM_GOOGLE) || defined(__ANDROID__)
// main() is supplied by gunit_main
#else
#include "gtest/gtest.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/platform/test_benchmark.h"
// Test entry point: runs microbenchmarks when a --benchmarks= flag is given,
// otherwise runs all gtest tests in the binary.
GTEST_API_ int main(int argc, char **argv) {
  std::cout << "Running main() from test_main.cc\n";
  testing::InitGoogleTest(&argc, argv);
  // Scan the remaining (non-gtest) flags for a benchmark filter; the first
  // match switches the binary into benchmark mode and skips the tests.
  for (int i = 1; i < argc; i++) {
    if (tensorflow::StringPiece(argv[i]).starts_with("--benchmarks=")) {
      const char *pattern = argv[i] + strlen("--benchmarks=");
      tensorflow::testing::Benchmark::Run(pattern);
      return 0;
    }
  }
  return RUN_ALL_TESTS();
}
#endif
syntaxnet/syntaxnet/testdata/context.pbtxt
0 → 100644
View file @
32ab5a58
Parameter {
name: 'brain_parser_embedding_dims'
value: '8;8;8'
}
Parameter {
name: 'brain_parser_features'
value: 'input.token.word input(1).token.word input(2).token.word stack.token.word stack(1).token.word stack(2).token.word;input.tag input(1).tag input(2).tag stack.tag stack(1).tag stack(2).tag;stack.child(1).label stack.child(1).sibling(-1).label stack.child(-1).label stack.child(-1).sibling(1).label'
}
Parameter {
name: 'brain_parser_embedding_names'
value: 'words;tags;labels'
}
input {
name: 'training-corpus'
record_format: 'conll-sentence'
Part {
file_pattern: 'syntaxnet/testdata/mini-training-set'
}
}
input {
name: 'tuning-corpus'
record_format: 'conll-sentence'
Part {
file_pattern: 'syntaxnet/testdata/mini-training-set'
}
}
input {
name: 'parsed-tuning-corpus'
creator: 'brain_parser/greedy'
record_format: 'conll-sentence'
}
input {
name: 'label-map'
file_format: 'text'
Part {
file_pattern: 'OUTPATH/label-map'
}
}
input {
name: 'word-map'
Part {
file_pattern: 'OUTPATH/word-map'
}
}
input {
name: 'lcword-map'
Part {
file_pattern: 'OUTPATH/lcword-map'
}
}
input {
name: 'tag-map'
Part {
file_pattern: 'OUTPATH/tag-map'
}
}
input {
name: 'category-map'
Part {
file_pattern: 'OUTPATH/category-map'
}
}
input {
name: 'prefix-table'
Part {
file_pattern: 'OUTPATH/prefix-table'
}
}
input {
name: 'suffix-table'
Part {
file_pattern: 'OUTPATH/suffix-table'
}
}
input {
name: 'tag-to-category'
Part {
file_pattern: 'OUTPATH/tag-to-category'
}
}
input {
name: 'stdout'
record_format: 'conll-sentence'
Part {
file_pattern: '-'
}
}
syntaxnet/syntaxnet/testdata/document
0 → 100644
View file @
32ab5a58
text : "I can not recall any disorder in currency markets since the 1974 guidelines were adopted ."
token: {
word : "I"
start : 0
end : 0
head : 3
tag : "PRP"
category: "PRON"
label : "nsubj"
break_level : SENTENCE_BREAK
}
token: {
word : "can"
start : 2
end : 4
head : 3
tag : "MD"
category: "VERB"
label : "aux"
}
token: {
word : "not"
start : 6
end : 8
head : 3
tag : "RB"
category: "ADV"
label : "neg"
}
token: {
word : "recall"
start : 10
end : 15
tag : "VB"
category: "VERB"
label : "ROOT"
}
token: {
word : "any"
start : 17
end : 19
head : 5
tag : "DT"
category: "DET"
label : "det"
}
token: {
word : "disorder"
start : 21
end : 28
head : 3
tag : "NN"
category: "NOUN"
label : "dobj"
}
token: {
word : "in"
start : 30
end : 31
head : 5
tag : "IN"
category: "ADP"
label : "prep"
}
token: {
word : "currency"
start : 33
end : 40
head : 8
tag : "NN"
category: "NOUN"
label : "nn"
}
token: {
word : "markets"
start : 42
end : 48
head : 6
tag : "NNS"
category: "NOUN"
label : "pobj"
}
token: {
word : "since"
start : 50
end : 54
head : 14
tag : "IN"
category: "ADP"
label : "mark"
}
token: {
word : "the"
start : 56
end : 58
head : 12
tag : "DT"
category: "DET"
label : "det"
}
token: {
word : "1974"
start : 60
end : 63
head : 12
tag : "CD"
category: "NUM"
label : "num"
}
token: {
word : "guidelines"
start : 65
end : 74
head : 14
tag : "NNS"
category: "NOUN"
label : "nsubjpass"
}
token: {
word : "were"
start : 76
end : 79
head : 14
tag : "VBD"
category: "VERB"
label : "auxpass"
}
token: {
word : "adopted"
start : 81
end : 87
head : 3
tag : "VBN"
category: "VERB"
label : "advcl"
}
token: {
word : "."
start : 89
end : 89
head : 3
tag : "."
category: "."
label : "p"
}
syntaxnet/syntaxnet/testdata/mini-training-set
0 → 100644
View file @
32ab5a58
1 I _ PRP PRP _ 2 nsubj _ _
2 knew _ VBD VBD _ 0 ROOT _ _
3 I _ PRP PRP _ 5 nsubj _ _
4 could _ MD MD _ 5 aux _ _
5 do _ VB VB _ 2 ccomp _ _
6 it _ PRP PRP _ 5 dobj _ _
7 properly _ RB RB _ 5 advmod _ _
8 if _ IN IN _ 9 mark _ _
9 given _ VBN VBN _ 5 advcl _ _
10 the _ DT DT _ 12 det _ _
11 right _ JJ JJ _ 12 amod _ _
12 kind _ NN NN _ 9 dobj _ _
13 of _ IN IN _ 12 prep _ _
14 support _ NN NN _ 13 pobj _ _
15 . _ . . _ 2 punct _ _
1 The _ DT DT _ 2 det _ _
2 journey _ NN NN _ 8 nsubj _ _
3 through _ IN IN _ 2 prep _ _
4 deserts _ NNS NNS _ 3 pobj _ _
5 and _ CC CC _ 4 cc _ _
6 mountains _ NNS NNS _ 4 conj _ _
7 can _ MD MD _ 8 aux _ _
8 take _ VB VB _ 0 ROOT _ _
9 a _ DT DT _ 10 det _ _
10 month _ NN NN _ 8 tmod _ _
11 . _ . . _ 8 punct _ _
1 You _ PRP PRP _ 2 nsubj _ _
2 say _ VBP VBP _ 0 ROOT _ _
3 they _ PRP PRP _ 4 nsubj _ _
4 're _ VBP VBP _ 2 ccomp _ _
5 in _ IN IN _ 4 prep _ _
6 the _ DT DT _ 7 det _ _
7 pipeline _ NN NN _ 5 pobj _ _
8 ? _ . . _ 2 punct _ _
1 Border _ NNP NNP _ 5 nn _ _
2 police _ NN NN _ 5 nn _ _
3 commander _ NN NN _ 5 nn _ _
4 Abdul _ NNP NNP _ 5 nn _ _
5 Raziq _ NNP NNP _ 6 nsubj _ _
6 says _ VBZ VBZ _ 0 ROOT _ _
7 the _ DT DT _ 8 det _ _
8 drugs _ NNS NNS _ 10 nsubjpass _ _
9 were _ VBD VBD _ 10 auxpass _ _
10 found _ VBN VBN _ 6 ccomp _ _
11 in _ IN IN _ 10 prep _ _
12 the _ DT DT _ 13 det _ _
13 basement _ NN NN _ 11 pobj _ _
14 of _ IN IN _ 13 prep _ _
15 a _ DT DT _ 16 det _ _
16 compound _ NN NN _ 14 pobj _ _
17 in _ IN IN _ 16 prep _ _
18 Nawa _ NNP NNP _ 20 nn _ _
19 Kili _ NNP NNP _ 20 nn _ _
20 village _ NN NN _ 17 pobj _ _
21 . _ . . _ 6 punct _ _
1 Fourth _ JJ JJ _ 3 amod _ _
2 quarter _ NN NN _ 3 nn _ _
3 production _ NN NN _ 5 nsubjpass _ _
4 is _ VBZ VBZ _ 5 auxpass _ _
5 expected _ VBN VBN _ 0 ROOT _ _
6 to _ TO TO _ 7 aux _ _
7 increase _ VB VB _ 5 xcomp _ _
8 to _ TO TO _ 7 prep _ _
9 130,000 _ CD CD _ 10 num _ _
10 ounces _ NNS NNS _ 8 pobj _ _
11 . _ . . _ 5 punct _ _
1 Minor _ NNP NNP _ 2 nn _ _
2 scuffling _ NN NN _ 3 nsubj _ _
3 broke _ VBD VBD _ 0 ROOT _ _
4 out _ RP RP _ 3 prt _ _
5 as _ IN IN _ 7 mark _ _
6 officials _ NNS NNS _ 7 nsubj _ _
7 sought _ VBD VBD _ 3 advcl _ _
8 to _ TO TO _ 9 aux _ _
9 separate _ VB VB _ 7 xcomp _ _
10 the _ DT DT _ 11 det _ _
11 groups _ NNS NNS _ 9 dobj _ _
12 . _ . . _ 3 punct _ _
1 According _ VBG VBG _ 18 prep _ _
2 to _ TO TO _ 1 pcomp _ _
3 Facebook _ NNP NNP _ 2 pobj _ _
4 , _ , , _ 3 punct _ _
5 which _ WDT WDT _ 7 nsubjpass _ _
6 is _ VBZ VBZ _ 7 auxpass _ _
7 based _ VBN VBN _ 3 rcmod _ _
8 in _ IN IN _ 7 prep _ _
9 Palo _ NNP NNP _ 10 nn _ _
10 Alto _ NNP NNP _ 8 pobj _ _
11 , _ , , _ 10 punct _ _
12 Calif _ NNP NNP _ 10 appos _ _
13 . _ . . _ 12 punct _ _
14 , _ , , _ 18 punct _ _
15 the _ DT DT _ 17 det _ _
16 Web _ NNP NNP _ 17 nn _ _
17 site _ NN NN _ 18 nsubj _ _
18 has _ VBZ VBZ _ 0 ROOT _ _
19 about _ IN IN _ 21 quantmod _ _
20 47 _ CD CD _ 21 number _ _
21 million _ CD CD _ 23 num _ _
22 active _ JJ JJ _ 23 amod _ _
23 users _ NNS NNS _ 18 dobj _ _
24 . _ . . _ 18 punct _ _
1 Among _ IN IN _ 10 prep _ _
2 those _ DT DT _ 1 pobj _ _
3 leaning _ VBG VBG _ 2 partmod _ _
4 toward _ IN IN _ 3 prep _ _
5 McDonnell _ NNP NNP _ 4 pobj _ _
6 , _ , , _ 10 punct _ _
7 however _ RB RB _ 10 advmod _ _
8 , _ , , _ 10 punct _ _
9 some _ DT DT _ 10 nsubj _ _
10 took _ VBD VBD _ 0 ROOT _ _
11 a _ DT DT _ 14 det _ _
12 more _ RBR RBR _ 13 advmod _ _
13 nuanced _ JJ JJ _ 14 amod _ _
14 view _ NN NN _ 10 dobj _ _
15 , _ , , _ 10 punct _ _
16 allowing _ VBG VBG _ 10 partmod _ _
17 for _ IN IN _ 16 prep _ _
18 the _ DT DT _ 19 det _ _
19 possibility _ NN NN _ 17 pobj _ _
20 that _ IN IN _ 24 mark _ _
21 McDonnell _ NNP NNP _ 24 nsubj _ _
22 could _ MD MD _ 24 aux _ _
23 have _ VB VB _ 24 aux _ _
24 changed _ VBN VBN _ 19 ccomp _ _
25 his _ PRP$ PRP$ _ 26 poss _ _
26 mind _ NN NN _ 24 dobj _ _
27 in _ IN IN _ 24 prep _ _
28 the _ DT DT _ 31 det _ _
29 intervening _ VBG VBG _ 31 amod _ _
30 20 _ CD CD _ 31 num _ _
31 years _ NNS NNS _ 27 pobj _ _
32 or _ CC CC _ 24 cc _ _
33 that _ IN IN _ 39 mark _ _
34 his _ PRP$ PRP$ _ 36 poss _ _
35 personal _ JJ JJ _ 36 amod _ _
36 convictions _ NNS NNS _ 39 nsubj _ _
37 would _ MD MD _ 39 aux _ _
38 not _ RB RB _ 39 neg _ _
39 interfere _ VB VB _ 24 conj _ _
40 with _ IN IN _ 39 prep _ _
41 his _ PRP$ PRP$ _ 42 poss _ _
42 governing _ NN NN _ 40 pobj _ _
43 . _ . . _ 10 punct _ _
1 Both _ DT DT _ 2 det _ _
2 teams _ NNS NNS _ 3 nsubj _ _
3 have _ VBP VBP _ 0 ROOT _ _
4 97 _ CD CD _ 5 num _ _
5 points _ NNS NNS _ 3 dobj _ _
6 . _ . . _ 3 punct _ _
1 Star-Banner _ NNP NNP _ 2 nsubj _ _
2 reported _ VBD VBD _ 0 ROOT _ _
3 Tuesday _ NNP NNP _ 2 tmod _ _
4 . _ . . _ 2 punct _ _
1 Harry _ NNP NNP _ 2 nn _ _
2 Redknapp _ NNP NNP _ 9 nsubj _ _
3 , _ , , _ 2 punct _ _
4 the _ DT DT _ 6 det _ _
5 Tottenham _ NNP NNP _ 6 nn _ _
6 manager _ NN NN _ 2 appos _ _
7 , _ , , _ 2 punct _ _
8 was _ VBD VBD _ 9 aux _ _
9 disbelieving _ VBG VBG _ 0 ROOT _ _
10 that _ IN IN _ 18 mark _ _
11 Lennon _ NNP NNP _ 13 poss _ _
12 's _ POS POS _ 11 possessive _ _
13 delivery _ NN NN _ 18 nsubj _ _
14 could _ MD MD _ 18 aux _ _
15 be _ VB VB _ 18 cop _ _
16 so _ RB RB _ 18 advmod _ _
17 radically _ RB RB _ 18 advmod _ _
18 different _ JJ JJ _ 9 ccomp _ _
19 . _ . . _ 9 punct _ _
1 The _ DT DT _ 3 det _ _
2 US _ NNP NNP _ 3 nn _ _
3 uptick _ NN NN _ 4 nsubj _ _
4 mirrors _ VBZ VBZ _ 0 ROOT _ _
5 an _ DT DT _ 6 det _ _
6 improvement _ NN NN _ 4 dobj _ _
7 in _ IN IN _ 6 prep _ _
8 many _ JJ JJ _ 10 amod _ _
9 other _ JJ JJ _ 10 amod _ _
10 parts _ NNS NNS _ 7 pobj _ _
11 of _ IN IN _ 10 prep _ _
12 the _ DT DT _ 13 det _ _
13 world _ NN NN _ 11 pobj _ _
14 . _ . . _ 4 punct _ _
1 Although _ IN IN _ 4 mark _ _
2 satellite _ NN NN _ 3 nn _ _
3 television _ NN NN _ 4 nsubj _ _
4 has _ VBZ VBZ _ 17 advcl _ _
5 the _ DT DT _ 6 det _ _
6 capacity _ NN NN _ 4 dobj _ _
7 for _ IN IN _ 6 prep _ _
8 hundreds _ NNS NNS _ 7 pobj _ _
9 of _ IN IN _ 8 prep _ _
10 conventional _ JJ JJ _ 12 amod _ _
11 television _ NN NN _ 12 nn _ _
12 channels _ NNS NNS _ 9 pobj _ _
13 , _ , , _ 17 punct _ _
14 it _ PRP PRP _ 17 nsubj _ _
15 is _ VBZ VBZ _ 17 cop _ _
16 less _ RBR RBR _ 17 advmod _ _
17 able _ JJ JJ _ 0 ROOT _ _
18 to _ TO TO _ 19 aux _ _
19 provide _ VB VB _ 17 xcomp _ _
20 video-on-demand _ NN NN _ 19 dobj _ _
21 . _ . . _ 17 punct _ _
1 Our _ PRP$ PRP$ _ 3 poss _ _
2 comfortable _ JJ JJ _ 3 amod _ _
3 room _ NN NN _ 4 nsubj _ _
4 feels _ VBZ VBZ _ 0 ROOT _ _
5 on _ IN IN _ 4 prep _ _
6 the _ DT DT _ 8 det _ _
7 small _ JJ JJ _ 8 amod _ _
8 side _ NN NN _ 5 pobj _ _
9 , _ , , _ 4 punct _ _
10 mainly _ RB RB _ 17 advmod _ _
11 because _ IN IN _ 17 mark _ _
12 too _ RB RB _ 13 advmod _ _
13 much _ JJ JJ _ 14 amod _ _
14 furniture _ NN NN _ 17 nsubjpass _ _
15 has _ VBZ VBZ _ 17 aux _ _
16 been _ VBN VBN _ 17 auxpass _ _
17 shoehorned _ VBN VBN _ 4 advcl _ _
18 into _ IN IN _ 17 prep _ _
19 it _ PRP PRP _ 18 pobj _ _
20 . _ . . _ 4 punct _ _
1 They _ PRP PRP _ 3 nsubj _ _
2 also _ RB RB _ 3 advmod _ _
3 require _ VBP VBP _ 0 ROOT _ _
4 a _ DT DT _ 6 det _ _
5 slower _ JJR JJR _ 6 amod _ _
6 inhale _ NN NN _ 3 dobj _ _
7 . _ . . _ 3 punct _ _
1 Her _ PRP$ PRP$ _ 2 poss _ _
2 ring _ NN NN _ 4 nsubjpass _ _
3 was _ VBD VBD _ 4 auxpass _ _
4 found _ VBN VBN _ 0 ROOT _ _
5 in _ IN IN _ 4 prep _ _
6 the _ DT DT _ 7 det _ _
7 car _ NN NN _ 5 pobj _ _
8 . _ . . _ 4 punct _ _
1 In _ IN IN _ 12 prep _ _
2 the _ DT DT _ 4 det _ _
3 past _ JJ JJ _ 4 amod _ _
4 year _ NN NN _ 1 pobj _ _
5 , _ , , _ 7 punct _ _
6 Forsythe _ NNP NNP _ 7 nsubj _ _
7 said _ VBD VBD _ 12 parataxis _ _
8 , _ , , _ 7 punct _ _
9 the _ DT DT _ 11 det _ _
10 Salvation _ NNP NNP _ 11 nn _ _
11 Army _ NNP NNP _ 12 nsubj _ _
12 provided _ VBD VBD _ 0 ROOT _ _
13 rental _ JJ JJ _ 14 amod _ _
14 subsidies _ NNS NNS _ 12 dobj _ _
15 that _ WDT WDT _ 16 nsubj _ _
16 prevented _ VBD VBD _ 14 rcmod _ _
17 1,172 _ CD CD _ 18 num _ _
18 evictions _ NNS NNS _ 16 dobj _ _
19 . _ . . _ 12 punct _ _
1 A _ DT DT _ 3 det _ _
2 23-year-old _ JJ JJ _ 3 amod _ _
3 man _ NN NN _ 6 nsubjpass _ _
4 has _ VBZ VBZ _ 6 aux _ _
5 been _ VBN VBN _ 6 auxpass _ _
6 jailed _ VBN VBN _ 0 ROOT _ _
7 for _ IN IN _ 6 prep _ _
8 two _ CD CD _ 9 num _ _
9 years _ NNS NNS _ 7 pobj _ _
10 after _ IN IN _ 6 prep _ _
11 pleading _ VBG VBG _ 10 pcomp _ _
12 guilty _ JJ JJ _ 11 acomp _ _
13 to _ TO TO _ 12 prep _ _
14 the _ DT DT _ 15 det _ _
15 manslaughter _ NN NN _ 13 pobj _ _
16 of _ IN IN _ 15 prep _ _
17 a _ DT DT _ 18 det _ _
18 man _ NN NN _ 16 pobj _ _
19 in _ IN IN _ 18 prep _ _
20 Hertfordshire _ NNP NNP _ 19 pobj _ _
21 . _ . . _ 6 punct _ _
1 But _ CC CC _ 10 cc _ _
2 the _ DT DT _ 3 det _ _
3 sustainability _ NN NN _ 10 nsubj _ _
4 of _ IN IN _ 3 prep _ _
5 any _ DT DT _ 7 det _ _
6 post-bubble _ JJ JJ _ 7 amod _ _
7 recovery _ NN NN _ 4 pobj _ _
8 is _ VBZ VBZ _ 10 cop _ _
9 always _ RB RB _ 10 advmod _ _
10 dubious _ JJ JJ _ 0 ROOT _ _
11 . _ . . _ 10 punct _ _
1 They _ PRP PRP _ 2 nsubj _ _
2 spoke _ VBD VBD _ 0 ROOT _ _
3 to _ TO TO _ 2 prep _ _
4 the _ DT DT _ 5 det _ _
5 BBC _ NNP NNP _ 8 poss _ _
6 's _ POS POS _ 5 possessive _ _
7 Artyom _ NNP NNP _ 8 nn _ _
8 Liss _ NNP NNP _ 3 pobj _ _
9 . _ . . _ 2 punct _ _
1 That _ DT DT _ 2 nsubj _ _
2 includes _ VBZ VBZ _ 0 ROOT _ _
3 me _ PRP PRP _ 2 dobj _ _
4 , _ , , _ 2 punct _ _
5 too _ RB RB _ 2 advmod _ _
6 . _ . . _ 2 punct _ _
1 The _ DT DT _ 2 det _ _
2 name _ NN NN _ 9 nsubj _ _
3 of _ IN IN _ 2 prep _ _
4 Rachel _ NNP NNP _ 5 nn _ _
5 Harris _ NNP NNP _ 8 poss _ _
6 ' _ POS POS _ 5 possessive _ _
7 Web _ NNP NNP _ 8 nn _ _
8 site _ NN NN _ 3 pobj _ _
9 says _ VBZ VBZ _ 0 ROOT _ _
10 it _ PRP PRP _ 9 dobj _ _
11 all _ DT DT _ 10 det _ _
12 . _ . . _ 9 punct _ _
1 If _ IN IN _ 3 mark _ _
2 you _ PRP PRP _ 3 nsubj _ _
3 prefer _ VBP VBP _ 19 advcl _ _
4 to _ TO TO _ 5 aux _ _
5 maximize _ VB VB _ 3 xcomp _ _
6 your _ PRP$ PRP$ _ 7 poss _ _
7 travel _ NN NN _ 5 dobj _ _
8 with _ IN IN _ 5 prep _ _
9 shorter _ JJR JJR _ 10 amod _ _
10 stays _ NNS NNS _ 8 pobj _ _
11 in _ IN IN _ 10 prep _ _
12 more _ JJR JJR _ 13 mwe _ _
13 than _ IN IN _ 14 quantmod _ _
14 one _ CD CD _ 15 num _ _
15 destination _ NN NN _ 11 pobj _ _
16 , _ , , _ 19 punct _ _
17 you _ PRP PRP _ 19 nsubj _ _
18 may _ MD MD _ 19 aux _ _
19 like _ VB VB _ 0 ROOT _ _
20 this _ DT DT _ 22 det _ _
21 multi-country _ JJ JJ _ 22 amod _ _
22 jaunt _ NN NN _ 19 dobj _ _
23 from _ IN IN _ 22 prep _ _
24 Virgin _ NNP NNP _ 25 nn _ _
25 Vacations _ NNPS NNPS _ 23 pobj _ _
26 . _ . . _ 19 punct _ _
1 The _ DT DT _ 3 det _ _
2 Afghan _ JJ JJ _ 3 amod _ _
3 government _ NN NN _ 6 nsubj _ _
4 also _ RB RB _ 6 advmod _ _
5 is _ VBZ VBZ _ 6 aux _ _
6 trying _ VBG VBG _ 0 ROOT _ _
7 to _ TO TO _ 8 aux _ _
8 persuade _ VB VB _ 6 xcomp _ _
9 farmers _ NNS NNS _ 8 dobj _ _
10 to _ TO TO _ 11 aux _ _
11 stop _ VB VB _ 8 xcomp _ _
12 growing _ VBG VBG _ 13 amod _ _
13 poppy _ NN NN _ 11 dobj _ _
14 and _ CC CC _ 11 cc _ _
15 shift _ VB VB _ 11 conj _ _
16 to _ TO TO _ 15 prep _ _
17 other _ JJ JJ _ 18 amod _ _
18 crops _ NNS NNS _ 16 pobj _ _
19 , _ , , _ 18 punct _ _
20 particularly _ RB RB _ 18 advmod _ _
21 wheat _ NN NN _ 18 dep _ _
22 . _ . . _ 6 punct _ _
1 The _ DT DT _ 3 det _ _
2 most _ RBS RBS _ 3 advmod _ _
3 striking _ JJ JJ _ 6 nsubj _ _
4 is _ VBZ VBZ _ 6 cop _ _
5 the _ DT DT _ 6 det _ _
6 differences _ NNS NNS _ 0 ROOT _ _
7 over _ IN IN _ 6 prep _ _
8 what _ WP WP _ 10 nsubj _ _
9 to _ TO TO _ 10 aux _ _
10 do _ VB VB _ 7 pcomp _ _
11 with _ IN IN _ 10 prep _ _
12 the _ DT DT _ 13 det _ _
13 banks _ NNS NNS _ 11 pobj _ _
14 . _ . . _ 6 punct _ _
1 Philo _ NNP NNP _ 4 nsubj _ _
2 did _ VBD VBD _ 4 aux _ _
3 not _ RB RB _ 4 neg _ _
4 mention _ VB VB _ 0 ROOT _ _
5 any _ DT DT _ 6 det _ _
6 name _ NN NN _ 4 dobj _ _
7 , _ , , _ 6 punct _ _
8 place _ NN NN _ 6 conj _ _
9 , _ , , _ 6 punct _ _
10 date _ NN NN _ 6 conj _ _
11 , _ , , _ 6 punct _ _
12 or _ CC CC _ 6 cc _ _
13 historical _ JJ JJ _ 14 amod _ _
14 circumstances _ NNS NNS _ 6 conj _ _
15 , _ , , _ 6 punct _ _
16 or _ CC CC _ 6 cc _ _
17 any _ DT DT _ 18 det _ _
18 background _ NN NN _ 6 conj _ _
19 to _ TO TO _ 18 prep _ _
20 the _ DT DT _ 21 det _ _
21 consolidation _ NN NN _ 19 pobj _ _
22 of _ IN IN _ 21 prep _ _
23 this _ DT DT _ 24 det _ _
24 group _ NN NN _ 22 pobj _ _
25 . _ . . _ 4 punct _ _
1 Created _ VBN VBN _ 8 partmod _ _
2 in _ IN IN _ 1 prep _ _
3 1996 _ CD CD _ 2 pobj _ _
4 , _ , , _ 8 punct _ _
5 the _ DT DT _ 6 det _ _
6 payments _ NNS NNS _ 8 nsubjpass _ _
7 are _ VBP VBP _ 8 auxpass _ _
8 based _ VBN VBN _ 0 ROOT _ _
9 on _ IN IN _ 8 prep _ _
10 a _ DT DT _ 11 det _ _
11 farm _ NN NN _ 14 poss _ _
12 's _ POS POS _ 11 possessive _ _
13 past _ JJ JJ _ 14 amod _ _
14 production _ NN NN _ 9 pobj _ _
15 and _ CC CC _ 8 cc _ _
16 are _ VBP VBP _ 17 auxpass _ _
17 issued _ VBN VBN _ 8 conj _ _
18 regardless _ RB RB _ 17 advmod _ _
19 of _ IN IN _ 18 prep _ _
20 current _ JJ JJ _ 21 amod _ _
21 production _ NN NN _ 19 pobj _ _
22 or _ CC CC _ 21 cc _ _
23 market _ NN NN _ 24 nn _ _
24 prices _ NNS NNS _ 21 conj _ _
25 . _ . . _ 8 punct _ _
1 Prosecutors _ NNS NNS _ 2 nsubj _ _
2 said _ VBD VBD _ 0 ROOT _ _
3 some _ DT DT _ 16 nsubjpass _ _
4 of _ IN IN _ 3 prep _ _
5 the _ DT DT _ 6 det _ _
6 billions _ NNS NNS _ 4 pobj _ _
7 of _ IN IN _ 6 prep _ _
8 dollars _ NNS NNS _ 7 pobj _ _
9 transferred _ VBN VBN _ 8 partmod _ _
10 from _ IN IN _ 9 prep _ _
11 Mexican _ JJ JJ _ 14 amod _ _
12 money _ NN NN _ 14 nn _ _
13 exchange _ NN NN _ 14 nn _ _
14 houses _ NNS NNS _ 10 pobj _ _
15 was _ VBD VBD _ 16 auxpass _ _
16 used _ VBN VBN _ 2 ccomp _ _
17 to _ TO TO _ 18 aux _ _
18 buy _ VB VB _ 16 xcomp _ _
19 planes _ NNS NNS _ 18 dobj _ _
20 for _ IN IN _ 18 prep _ _
21 drug _ NN NN _ 22 nn _ _
22 traffickers _ NNS NNS _ 20 pobj _ _
23 . _ . . _ 2 punct _ _
1 Margaret _ NNP NNP _ 2 nn _ _
2 Rutherford _ NNP NNP _ 11 nsubj _ _
3 , _ , , _ 2 punct _ _
4 chairwoman _ NN NN _ 2 appos _ _
5 of _ IN IN _ 4 prep _ _
6 Loxton _ NNP NNP _ 9 poss _ _
7 's _ POS POS _ 6 possessive _ _
8 parish _ JJ JJ _ 9 amod _ _
9 council _ NN NN _ 5 pobj _ _
10 , _ , , _ 2 punct _ _
11 told _ VBD VBD _ 0 ROOT _ _
12 BBC _ NNP NNP _ 13 nn _ _
13 Somerset _ NNP NNP _ 11 dobj _ _
14 that _ IN IN _ 16 mark _ _
15 she _ PRP PRP _ 16 nsubj _ _
16 hoped _ VBD VBD _ 11 ccomp _ _
17 the _ DT DT _ 18 det _ _
18 lines _ NNS NNS _ 21 nsubjpass _ _
19 could _ MD MD _ 21 aux _ _
20 be _ VB VB _ 21 auxpass _ _
21 sited _ VBN VBN _ 16 ccomp _ _
22 underground _ RB RB _ 21 advmod _ _
23 . _ . . _ 11 punct _ _
1 Amid _ IN IN _ 3 mark _ _
2 US _ PRP PRP _ 3 nsubj _ _
3 fears _ VBZ VBZ _ 16 advcl _ _
4 that _ IN IN _ 7 mark _ _
5 they _ PRP PRP _ 7 nsubj _ _
6 could _ MD MD _ 7 aux _ _
7 face _ VB VB _ 3 ccomp _ _
8 torture _ VB VB _ 7 dobj _ _
9 if _ IN IN _ 10 mark _ _
10 returned _ VBN VBN _ 7 advcl _ _
11 to _ TO TO _ 10 prep _ _
12 China _ NNP NNP _ 11 pobj _ _
13 , _ , , _ 16 punct _ _
14 five _ CD CD _ 16 nsubjpass _ _
15 were _ VBD VBD _ 16 auxpass _ _
16 released _ VBN VBN _ 0 ROOT _ _
17 to _ TO TO _ 16 prep _ _
18 Albania _ NNP NNP _ 17 pobj _ _
19 in _ IN IN _ 16 prep _ _
20 2006 _ CD CD _ 19 pobj _ _
21 , _ , , _ 16 punct _ _
22 and _ CC CC _ 16 cc _ _
23 four _ CD CD _ 25 nsubjpass _ _
24 were _ VBD VBD _ 25 auxpass _ _
25 resettled _ VBN VBN _ 16 conj _ _
26 in _ IN IN _ 25 prep _ _
27 Bermuda _ NNP NNP _ 26 pobj _ _
28 this _ DT DT _ 29 det _ _
29 year _ NN NN _ 25 tmod _ _
30 . _ . . _ 16 punct _ _
1 He _ PRP PRP _ 3 nsubj _ _
2 then _ RB RB _ 3 advmod _ _
3 provided _ VBD VBD _ 0 ROOT _ _
4 Marshal _ NNP NNP _ 5 nn _ _
5 McAvoy _ NNP NNP _ 8 poss _ _
6 's _ POS POS _ 5 possessive _ _
7 phone _ NN NN _ 8 nn _ _
8 number _ NN NN _ 3 dobj _ _
9 . _ . . _ 3 punct _ _
1 Tech _ NNP NNP _ 2 nn _ _
2 credits _ NNS NNS _ 5 nsubj _ _
3 are _ VBP VBP _ 5 cop _ _
4 just _ RB RB _ 5 advmod _ _
5 fine _ JJ JJ _ 0 ROOT _ _
6 for _ IN IN _ 5 prep _ _
7 what _ WP WP _ 12 nsubj _ _
8 essentially _ RB RB _ 12 advmod _ _
9 is _ VBZ VBZ _ 12 cop _ _
10 an _ DT DT _ 12 det _ _
11 un-reality _ JJ JJ _ 12 amod _ _
12 show _ NN NN _ 6 pcomp _ _
13 . _ . . _ 5 punct _ _
1 But _ CC CC _ 8 cc _ _
2 my _ PRP$ PRP$ _ 4 poss _ _
3 eldest _ JJS JJS _ 4 amod _ _
4 daughter _ NN NN _ 8 nsubj _ _
5 , _ , , _ 4 punct _ _
6 Donna _ NNP NNP _ 4 appos _ _
7 , _ , , _ 4 punct _ _
8 did _ VBD VBD _ 0 ROOT _ _
9 . _ . . _ 8 punct _ _
1 The _ DT DT _ 2 det _ _
2 department _ NN NN _ 4 nsubj _ _
3 has _ VBZ VBZ _ 4 aux _ _
4 spent _ VBN VBN _ 0 ROOT _ _
5 $ _ $ $ _ 4 dobj _ _
6 2.9 _ CD CD _ 7 number _ _
7 million _ CD CD _ 5 num _ _
8 on _ IN IN _ 4 prep _ _
9 the _ DT DT _ 11 det _ _
10 hot _ JJ JJ _ 11 amod _ _
11 line _ NN NN _ 8 pobj _ _
12 thus _ RB RB _ 13 advmod _ _
13 far _ RB RB _ 4 advmod _ _
14 . _ . . _ 4 punct _ _
1 Picoplatin _ NNP NNP _ 3 nsubjpass _ _
2 is _ VBZ VBZ _ 3 auxpass _ _
3 designed _ VBN VBN _ 0 ROOT _ _
4 to _ TO TO _ 5 aux _ _
5 overcome _ VB VB _ 3 xcomp _ _
6 platinum _ NN NN _ 7 nn _ _
7 resistance _ NN NN _ 5 dobj _ _
8 associated _ VBN VBN _ 7 partmod _ _
9 with _ IN IN _ 8 prep _ _
10 chemotherapy _ NN NN _ 9 pobj _ _
11 in _ IN IN _ 10 prep _ _
12 solid _ JJ JJ _ 13 amod _ _
13 tumors _ NNS NNS _ 11 pobj _ _
14 , _ , , _ 3 punct _ _
15 and _ CC CC _ 3 cc _ _
16 is _ VBZ VBZ _ 18 aux _ _
17 being _ VBG VBG _ 18 auxpass _ _
18 studied _ VBN VBN _ 3 conj _ _
19 in _ IN IN _ 18 prep _ _
20 multiple _ JJ JJ _ 22 amod _ _
21 cancer _ NN NN _ 22 nn _ _
22 indications _ NNS NNS _ 19 pobj _ _
23 , _ , , _ 22 punct _ _
24 combinations _ NNS NNS _ 22 conj _ _
25 and _ CC CC _ 22 cc _ _
26 formulations _ NNS NNS _ 22 conj _ _
27 . _ . . _ 3 punct _ _
1 Only _ RB RB _ 4 advmod _ _
2 you _ PRP PRP _ 4 nsubj _ _
3 can _ MD MD _ 4 aux _ _
4 decide _ VB VB _ 0 ROOT _ _
5 what _ WP WP _ 7 nsubj _ _
6 's _ VBZ VBZ _ 7 cop _ _
7 important _ JJ JJ _ 4 ccomp _ _
8 . _ . . _ 4 punct _ _
1 Lt. _ NNP NNP _ 4 nn _ _
2 Col. _ NNP NNP _ 4 nn _ _
3 David _ NNP NNP _ 4 nn _ _
4 Accetta _ NNP NNP _ 14 nsubj _ _
5 , _ , , _ 4 punct _ _
6 the _ DT DT _ 10 det _ _
7 top _ JJ JJ _ 10 amod _ _
8 U.S. _ NNP NNP _ 10 nn _ _
9 military _ JJ JJ _ 10 amod _ _
10 spokesman _ NN NN _ 4 appos _ _
11 in _ IN IN _ 10 prep _ _
12 Afghanistan _ NNP NNP _ 11 pobj _ _
13 , _ , , _ 4 punct _ _
14 said _ VBD VBD _ 0 ROOT _ _
15 he _ PRP PRP _ 18 nsubj _ _
16 could _ MD MD _ 18 aux _ _
17 not _ RB RB _ 18 neg _ _
18 confirm _ VB VB _ 14 ccomp _ _
19 the _ DT DT _ 20 det _ _
20 report _ NN NN _ 18 dobj _ _
21 . _ . . _ 14 punct _ _
1 The _ DT DT _ 3 det _ _
2 four _ CD CD _ 3 num _ _
3 teams _ NNS NNS _ 14 nsubj _ _
4 that _ WDT WDT _ 6 nsubj _ _
5 will _ MD MD _ 6 aux _ _
6 play _ VB VB _ 3 rcmod _ _
7 in _ IN IN _ 6 prep _ _
8 the _ DT DT _ 9 det _ _
9 women _ NNS NNS _ 11 poss _ _
10 's _ POS POS _ 9 possessive _ _
11 tournament _ NN NN _ 7 pobj _ _
12 are _ VBP VBP _ 14 cop _ _
13 Alaska _ NNP NNP _ 14 nn _ _
14 Anchorage _ NNP NNP _ 0 ROOT _ _
15 , _ , , _ 14 punct _ _
16 Cincinnati _ NNP NNP _ 14 conj _ _
17 , _ , , _ 14 punct _ _
18 Coastal _ NNP NNP _ 19 nn _ _
19 Carolina _ NNP NNP _ 14 conj _ _
20 and _ CC CC _ 14 cc _ _
21 Western _ NNP NNP _ 22 nn _ _
22 Carolina _ NNP NNP _ 14 conj _ _
23 . _ . . _ 14 punct _ _
1 Speaking _ VBG VBG _ 8 partmod _ _
2 to _ TO TO _ 1 prep _ _
3 reporters _ NNS NNS _ 2 pobj _ _
4 , _ , , _ 8 punct _ _
5 she _ PRP PRP _ 8 nsubj _ _
6 did _ VBD VBD _ 8 aux _ _
7 not _ RB RB _ 8 neg _ _
8 repeat _ VB VB _ 0 ROOT _ _
9 her _ PRP$ PRP$ _ 10 poss _ _
10 demand _ NN NN _ 8 dobj _ _
11 that _ IN IN _ 17 mark _ _
12 a _ DT DT _ 15 det _ _
13 new _ JJ JJ _ 15 amod _ _
14 government-run _ JJ JJ _ 15 amod _ _
15 plan _ NN NN _ 17 nsubj _ _
16 be _ VB VB _ 17 cop _ _
17 part _ NN NN _ 10 ccomp _ _
18 of _ IN IN _ 17 prep _ _
19 the _ DT DT _ 21 det _ _
20 final _ JJ JJ _ 21 amod _ _
21 legislation _ NN NN _ 18 pobj _ _
22 . _ . . _ 8 punct _ _
1 ' _ '' '' _ 10 punct _ _
2 But _ CC CC _ 10 cc _ _
3 with _ IN IN _ 10 prep _ _
4 the _ DT DT _ 5 det _ _
5 help _ NN NN _ 3 pobj _ _
6 of _ IN IN _ 5 prep _ _
7 English _ NNP NNP _ 8 nn _ _
8 Heritage _ NNP NNP _ 6 pobj _ _
9 we _ PRP PRP _ 10 nsubj _ _
10 restored _ VBD VBD _ 0 ROOT _ _
11 them _ PRP PRP _ 10 dobj _ _
12 . _ . . _ 10 punct _ _
13 ' _ '' '' _ 10 punct _ _
1 Mr _ NNP NNP _ 2 nn _ _
2 Oubridge _ NNP NNP _ 3 nsubj _ _
3 said _ VBD VBD _ 0 ROOT _ _
4 when _ WRB WRB _ 8 advmod _ _
5 the _ DT DT _ 7 det _ _
6 festival _ NN NN _ 7 nn _ _
7 team _ NN NN _ 8 nsubj _ _
8 met _ VBD VBD _ 20 advcl _ _
9 council _ NN NN _ 10 nn _ _
10 officials _ NNS NNS _ 8 dobj _ _
11 and _ CC CC _ 10 cc _ _
12 the _ DT DT _ 13 det _ _
13 police _ NN NN _ 10 conj _ _
14 on _ IN IN _ 8 prep _ _
15 Thursday _ NNP NNP _ 14 pobj _ _
16 there _ EX EX _ 20 expl _ _
17 had _ VBD VBD _ 20 aux _ _
18 been _ VBN VBN _ 20 cop _ _
19 no _ DT DT _ 20 det _ _
20 mention _ NN NN _ 3 ccomp _ _
21 of _ IN IN _ 20 prep _ _
22 a _ DT DT _ 24 det _ _
23 potential _ JJ JJ _ 24 amod _ _
24 injunction _ NN NN _ 21 pobj _ _
25 . _ . . _ 3 punct _ _
1 A _ DT DT _ 2 det _ _
2 number _ NN NN _ 6 nsubj _ _
3 of _ IN IN _ 2 prep _ _
4 ministers _ NNS NNS _ 3 pobj _ _
5 have _ VBP VBP _ 6 aux _ _
6 left _ VBN VBN _ 0 ROOT _ _
7 the _ DT DT _ 8 det _ _
8 government _ NN NN _ 9 nsubj _ _
9 facing _ VBG VBG _ 6 dep _ _
10 questions _ NNS NNS _ 9 dobj _ _
11 over _ IN IN _ 10 prep _ _
12 their _ PRP$ PRP$ _ 13 poss _ _
13 expenses _ NNS NNS _ 11 pobj _ _
14 , _ , , _ 13 punct _ _
15 including _ VBG VBG _ 13 prep _ _
16 Hazel _ NNP NNP _ 17 nn _ _
17 Blears _ NNP NNP _ 15 pobj _ _
18 , _ , , _ 17 punct _ _
19 the _ DT DT _ 22 det _ _
20 former _ JJ JJ _ 22 amod _ _
21 communities _ NNS NNS _ 22 nn _ _
22 secretary _ NN NN _ 17 appos _ _
23 ; _ : : _ 17 punct _ _
24 Jacqui _ NNP NNP _ 25 nn _ _
25 Smith _ NNP NNP _ 17 conj _ _
26 , _ , , _ 25 punct _ _
27 the _ DT DT _ 30 det _ _
28 former _ JJ JJ _ 30 amod _ _
29 home _ NN NN _ 30 nn _ _
30 secretary _ NN NN _ 25 appos _ _
31 ; _ : : _ 17 punct _ _
32 and _ CC CC _ 17 cc _ _
33 Tony _ NNP NNP _ 34 nn _ _
34 McNulty _ NNP NNP _ 17 conj _ _
35 , _ , , _ 34 punct _ _
36 the _ DT DT _ 39 det _ _
37 former _ JJ JJ _ 39 amod _ _
38 employment _ NN NN _ 39 nn _ _
39 minister _ NN NN _ 34 appos _ _
40 . _ . . _ 6 punct _ _
1 An _ DT DT _ 4 det _ _
2 enticingly _ RB RB _ 3 advmod _ _
3 big _ JJ JJ _ 4 amod _ _
4 button _ NN NN _ 10 nsubj _ _
5 that _ WDT WDT _ 6 nsubj _ _
6 looked _ VBD VBD _ 4 rcmod _ _
7 like _ IN IN _ 6 prep _ _
8 a _ DT DT _ 9 det _ _
9 latch _ NN NN _ 7 pobj _ _
10 turned _ VBD VBD _ 0 ROOT _ _
11 out _ RP RP _ 10 prt _ _
12 to _ TO TO _ 15 aux _ _
13 be _ VB VB _ 15 cop _ _
14 a _ DT DT _ 15 det _ _
15 hinge _ NN NN _ 10 xcomp _ _
16 . _ . . _ 10 punct _ _
1 After _ IN IN _ 8 prep _ _
2 an _ DT DT _ 5 det _ _
3 oustanding _ JJ JJ _ 5 amod _ _
4 opening _ NN NN _ 5 nn _ _
5 round _ NN NN _ 1 pobj _ _
6 , _ , , _ 8 punct _ _
7 Garcia _ NNP NNP _ 8 nsubj _ _
8 found _ VBD VBD _ 0 ROOT _ _
9 himself _ PRP PRP _ 10 nsubj _ _
10 tied _ VBD VBD _ 8 ccomp _ _
11 with _ IN IN _ 10 prep _ _
12 the _ DT DT _ 14 det _ _
13 50-year-old _ JJ JJ _ 14 amod _ _
14 Langer _ NNP NNP _ 11 pobj _ _
15 , _ , , _ 14 punct _ _
16 who _ WP WP _ 17 nsubj _ _
17 fired _ VBD VBD _ 14 rcmod _ _
18 a _ DT DT _ 20 det _ _
19 five-under _ JJ JJ _ 20 amod _ _
20 67 _ NN NN _ 17 dobj _ _
21 following _ VBG VBG _ 17 prep _ _
22 his _ PRP$ PRP$ _ 24 poss _ _
23 first-round _ JJ JJ _ 24 amod _ _
24 72 _ CD CD _ 21 pobj _ _
25 . _ . . _ 8 punct _ _
1 We _ PRP PRP _ 2 nsubj _ _
2 made _ VBD VBD _ 0 ROOT _ _
3 mistakes _ NNS NNS _ 2 dobj _ _
4 in _ IN IN _ 2 prep _ _
5 those _ DT DT _ 6 det _ _
6 games _ NNS NNS _ 4 pobj _ _
7 in _ IN IN _ 2 prep _ _
8 the _ DT DT _ 10 det _ _
9 last _ JJ JJ _ 10 amod _ _
10 minute _ NN NN _ 7 pobj _ _
11 , _ , , _ 2 punct _ _
12 so _ IN IN _ 16 mark _ _
13 it _ PRP PRP _ 16 nsubj _ _
14 's _ VBZ VBZ _ 16 cop _ _
15 our _ PRP$ PRP$ _ 16 poss _ _
16 fault _ NN NN _ 2 advcl _ _
17 in _ IN IN _ 16 prep _ _
18 the _ DT DT _ 19 det _ _
19 end _ NN NN _ 17 pobj _ _
20 . _ . . _ 2 punct _ _
1 This _ DT DT _ 3 det _ _
2 latest _ JJS JJS _ 3 amod _ _
3 incident _ NN NN _ 7 nsubj _ _
4 is _ VBZ VBZ _ 7 cop _ _
5 the _ DT DT _ 7 det _ _
6 second _ JJ JJ _ 7 amod _ _
7 time _ NN NN _ 0 ROOT _ _
8 in _ IN IN _ 7 prep _ _
9 four _ CD CD _ 10 num _ _
10 weeks _ NNS NNS _ 8 pobj _ _
11 the _ DT DT _ 12 det _ _
12 Revenue _ NN NN _ 14 nsubj _ _
13 has _ VBZ VBZ _ 14 aux _ _
14 admitted _ VBN VBN _ 7 rcmod _ _
15 losing _ VBG VBG _ 14 xcomp _ _
16 taxpayers _ NNS NNS _ 18 poss _ _
17 ' _ POS POS _ 16 possessive _ _
18 details _ NNS NNS _ 15 dobj _ _
19 . _ . . _ 7 punct _ _
1 NebuAd _ NNP NNP _ 2 nsubj _ _
2 confirmed _ VBD VBD _ 0 ROOT _ _
3 Friday _ NNP NNP _ 2 tmod _ _
4 that _ IN IN _ 7 mark _ _
5 it _ PRP PRP _ 7 nsubj _ _
6 is _ VBZ VBZ _ 7 aux _ _
7 partnering _ VBG VBG _ 2 ccomp _ _
8 with _ IN IN _ 7 prep _ _
9 Charter _ NNP NNP _ 8 pobj _ _
10 but _ CC CC _ 2 cc _ _
11 declined _ VBD VBD _ 2 conj _ _
12 further _ JJ JJ _ 13 amod _ _
13 comment _ NN NN _ 11 dobj _ _
14 . _ . . _ 2 punct _ _
1 Needless _ JJ JJ _ 6 ccomp _ _
2 to _ TO TO _ 3 aux _ _
3 say _ VB VB _ 1 xcomp _ _
4 , _ , , _ 6 punct _ _
5 it _ PRP PRP _ 6 nsubj _ _
6 wasn _ VBP VBP _ 0 ROOT _ _
7 't _ NN NN _ 6 dobj _ _
8 long _ RB RB _ 11 advmod _ _
9 before _ IN IN _ 11 mark _ _
10 he _ PRP PRP _ 11 nsubj _ _
11 sat _ VBD VBD _ 6 advcl _ _
12 down _ RP RP _ 11 prt _ _
13 . _ . . _ 6 punct _ _
1 For _ IN IN _ 18 prep _ _
2 Judy _ NNP NNP _ 3 nn _ _
3 John-Baptiste _ NNP NNP _ 1 pobj _ _
4 , _ , , _ 3 punct _ _
5 who _ WP WP _ 6 nsubj _ _
6 runs _ VBZ VBZ _ 3 rcmod _ _
7 the _ DT DT _ 10 det _ _
8 Basement _ NNP NNP _ 10 nn _ _
9 Dance _ NNP NNP _ 10 nn _ _
10 Studio _ NNP NNP _ 6 dobj _ _
11 in _ IN IN _ 10 prep _ _
12 London _ NNP NNP _ 11 pobj _ _
13 , _ , , _ 18 punct _ _
14 ballet _ NN NN _ 18 nsubj _ _
15 is _ VBZ VBZ _ 18 cop _ _
16 the _ DT DT _ 18 det _ _
17 most _ RBS RBS _ 18 advmod _ _
18 popular _ JJ JJ _ 0 ROOT _ _
19 of _ IN IN _ 18 prep _ _
20 all _ PDT PDT _ 22 predet _ _
21 the _ DT DT _ 22 det _ _
22 classes _ NNS NNS _ 19 pobj _ _
23 she _ PRP PRP _ 24 nsubj _ _
24 offers _ VBZ VBZ _ 22 rcmod _ _
25 . _ . . _ 18 punct _ _
1 Russ _ NNP NNP _ 2 nn _ _
2 Dixon _ NNP NNP _ 7 nsubj _ _
3 , _ , , _ 2 punct _ _
4 an _ DT DT _ 5 det _ _
5 infielder _ NN NN _ 2 appos _ _
6 , _ , , _ 2 punct _ _
7 homered _ VBD VBD _ 0 ROOT _ _
8 to _ TO TO _ 7 prep _ _
9 right _ NN NN _ 8 pobj _ _
10 , _ , , _ 7 punct _ _
11 then _ RB RB _ 7 advmod _ _
12 sheepishly _ RB RB _ 13 advmod _ _
13 put _ VBD VBD _ 7 dep _ _
14 his _ PRP$ PRP$ _ 15 poss _ _
15 head _ NN NN _ 13 dobj _ _
16 down _ RP RP _ 13 prt _ _
17 to _ TO TO _ 18 aux _ _
18 avoid _ VB VB _ 13 xcomp _ _
19 eye _ NN NN _ 20 nn _ _
20 contact _ NN NN _ 18 dobj _ _
21 with _ IN IN _ 20 prep _ _
22 the _ DT DT _ 23 det _ _
23 pitcher _ NN NN _ 21 pobj _ _
24 . _ . . _ 7 punct _ _
1 Mr. _ NNP NNP _ 2 nn _ _
2 Gore _ NNP NNP _ 3 nsubj _ _
3 was _ VBD VBD _ 0 ROOT _ _
4 not _ RB RB _ 3 neg _ _
5 here _ RB RB _ 3 advmod _ _
6 , _ , , _ 3 punct _ _
7 but _ CC CC _ 3 cc _ _
8 his _ PRP$ PRP$ _ 9 poss _ _
9 name _ NN NN _ 10 nsubj _ _
10 came _ VBD VBD _ 3 conj _ _
11 up _ RP RP _ 10 prt _ _
12 frequently _ RB RB _ 10 advmod _ _
13 . _ . . _ 3 punct _ _
1 The _ DT DT _ 2 det _ _
2 lawsuit _ NN NN _ 4 nsubj _ _
3 also _ RB RB _ 4 advmod _ _
4 names _ VBD VBD _ 0 ROOT _ _
5 the _ DT DT _ 7 det _ _
6 shopping _ NN NN _ 7 nn _ _
7 mall _ NN NN _ 4 dobj _ _
8 where _ WRB WRB _ 11 advmod _ _
9 the _ DT DT _ 10 det _ _
10 incident _ NN NN _ 11 nsubj _ _
11 occurred _ VBD VBD _ 7 rcmod _ _
12 and _ CC CC _ 7 cc _ _
13 the _ DT DT _ 15 det _ _
14 security _ NN NN _ 15 nn _ _
15 company _ NN NN _ 7 conj _ _
16 employed _ VBN VBN _ 15 partmod _ _
17 by _ IN IN _ 16 prep _ _
18 Wal-Mart _ NNP NNP _ 17 pobj _ _
19 . _ . . _ 4 punct _ _
1 Rudy _ NNP NNP _ 2 nn _ _
2 Crutchfield _ NNP NNP _ 9 nsubj _ _
3 and _ CC CC _ 2 cc _ _
4 Steve _ NNP NNP _ 5 nn _ _
5 Hadeed _ NNP NNP _ 2 conj _ _
6 have _ VBP VBP _ 9 aux _ _
7 been _ VBN VBN _ 9 cop _ _
8 close _ JJ JJ _ 9 amod _ _
9 friends _ NNS NNS _ 0 ROOT _ _
10 since _ IN IN _ 9 prep _ _
11 their _ PRP$ PRP$ _ 12 poss _ _
12 days _ NNS NNS _ 10 pobj _ _
13 at _ IN IN _ 12 prep _ _
14 Wheaton _ NNP NNP _ 16 nn _ _
15 High _ NNP NNP _ 16 nn _ _
16 School _ NNP NNP _ 13 pobj _ _
17 . _ . . _ 9 punct _ _
1 Earlier _ RBR RBR _ 3 advmod _ _
2 this _ DT DT _ 3 det _ _
3 month _ NN NN _ 6 tmod _ _
4 , _ , , _ 6 punct _ _
5 GM _ NNP NNP _ 6 nsubj _ _
6 announced _ VBD VBD _ 0 ROOT _ _
7 plans _ NNS NNS _ 6 dobj _ _
8 to _ TO TO _ 9 aux _ _
9 sell _ VB VB _ 7 infmod _ _
10 Hummer _ NNP NNP _ 9 dobj _ _
11 to _ TO TO _ 9 prep _ _
12 a _ DT DT _ 14 det _ _
13 Chinese _ JJ JJ _ 14 amod _ _
14 manufacturer _ NN NN _ 11 pobj _ _
15 and _ CC CC _ 14 cc _ _
16 Saturn _ NNP NNP _ 14 conj _ _
17 to _ TO TO _ 9 prep _ _
18 Michigan-based _ JJ JJ _ 24 amod _ _
19 dealership _ NN NN _ 24 nn _ _
20 chain _ NN NN _ 24 nn _ _
21 Penske _ NNP NNP _ 24 nn _ _
22 Automotive _ NNP NNP _ 24 nn _ _
23 Group _ NNP NNP _ 24 nn _ _
24 Inc _ NNP NNP _ 17 pobj _ _
25 . _ . . _ 6 punct _ _
syntaxnet/syntaxnet/text_formats.cc
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <memory>
#include <string>
#include <vector>
#include "syntaxnet/document_format.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/utils.h"
#include "tensorflow/core/lib/io/inputbuffer.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/regexp.h"
namespace
syntaxnet
{
// CoNLL document format reader for dependency annotated corpora.
// The expected format is described e.g. at http://ilk.uvt.nl/conll/#dataformat
//
// Data should adhere to the following rules:
// - Data files contain sentences separated by a blank line.
// - A sentence consists of one or more tokens, each one starting on a new line.
// - A token consists of ten fields described in the table below.
// - Fields are separated by a single tab character.
// - All data files will contain these ten fields, although only the ID
// column is required to contain non-dummy (i.e. non-underscore) values.
// Data files should be UTF-8 encoded (Unicode).
//
// Fields:
// 1 ID: Token counter, starting at 1 for each new sentence and increasing
// by 1 for every new token.
// 2 FORM: Word form or punctuation symbol.
// 3 LEMMA: Lemma or stem.
// 4 CPOSTAG: Coarse-grained part-of-speech tag or category.
// 5 POSTAG: Fine-grained part-of-speech tag. Note that the same POS tag
// cannot appear with multiple coarse-grained POS tags.
// 6 FEATS: Unordered set of syntactic and/or morphological features.
// 7 HEAD: Head of the current token, which is either a value of ID or '0'.
// 8 DEPREL: Dependency relation to the HEAD.
// 9 PHEAD: Projective head of current token.
// 10 PDEPREL: Dependency relation to the PHEAD.
//
// This CoNLL reader is compatible with the CoNLL-U format described at
// http://universaldependencies.org/format.html
// Note that this reader skips CoNLL-U multiword tokens and ignores the last two
// fields of every line, which are PHEAD and PDEPREL in CoNLL format, but are
// replaced by DEPS and MISC in CoNLL-U.
//
class
CoNLLSyntaxFormat
:
public
DocumentFormat
{
public:
CoNLLSyntaxFormat
()
{}
// Reads up to the first empty line and returns false end of file is reached.
bool
ReadRecord
(
tensorflow
::
io
::
InputBuffer
*
buffer
,
string
*
record
)
override
{
string
line
;
record
->
clear
();
tensorflow
::
Status
status
=
buffer
->
ReadLine
(
&
line
);
while
(
!
line
.
empty
()
&&
status
.
ok
())
{
tensorflow
::
strings
::
StrAppend
(
record
,
line
,
"
\n
"
);
status
=
buffer
->
ReadLine
(
&
line
);
}
return
status
.
ok
()
||
!
record
->
empty
();
}
void
ConvertFromString
(
const
string
&
key
,
const
string
&
value
,
vector
<
Sentence
*>
*
sentences
)
override
{
// Create new sentence.
Sentence
*
sentence
=
new
Sentence
();
// Each line corresponds to one token.
string
text
;
vector
<
string
>
lines
=
utils
::
Split
(
value
,
'\n'
);
// Add each token to the sentence.
vector
<
string
>
fields
;
int
expected_id
=
1
;
for
(
size_t
i
=
0
;
i
<
lines
.
size
();
++
i
)
{
// Split line into tab-separated fields.
fields
.
clear
();
fields
=
utils
::
Split
(
lines
[
i
],
'\t'
);
if
(
fields
.
size
()
==
0
)
continue
;
// Skip comment lines.
if
(
fields
[
0
][
0
]
==
'#'
)
continue
;
// Skip CoNLLU lines for multiword tokens which are indicated by
// hyphenated line numbers, e.g., "2-4".
// http://universaldependencies.github.io/docs/format.html
if
(
RE2
::
FullMatch
(
fields
[
0
],
"[0-9]+-[0-9]+"
))
continue
;
// Clear all optional fields equal to '_'.
for
(
size_t
j
=
2
;
j
<
fields
.
size
();
++
j
)
{
if
(
fields
[
j
].
length
()
==
1
&&
fields
[
j
][
0
]
==
'_'
)
fields
[
j
].
clear
();
}
// Check that the line is valid.
CHECK_GE
(
fields
.
size
(),
8
)
<<
"Every line has to have at least 8 tab separated fields."
;
// Check that the ids follow the expected format.
const
int
id
=
utils
::
ParseUsing
<
int
>
(
fields
[
0
],
0
,
utils
::
ParseInt32
);
CHECK_EQ
(
expected_id
++
,
id
)
<<
"Token ids start at 1 for each new sentence and increase by 1 "
<<
"on each new token. Sentences are separated by an empty line."
;
// Get relevant fields.
const
string
&
word
=
fields
[
1
];
const
string
&
cpostag
=
fields
[
3
];
const
string
&
tag
=
fields
[
4
];
const
int
head
=
utils
::
ParseUsing
<
int
>
(
fields
[
6
],
0
,
utils
::
ParseInt32
);
const
string
&
label
=
fields
[
7
];
// Add token to sentence text.
if
(
!
text
.
empty
())
text
.
append
(
" "
);
const
int
start
=
text
.
size
();
const
int
end
=
start
+
word
.
size
()
-
1
;
text
.
append
(
word
);
// Add token to sentence.
Token
*
token
=
sentence
->
add_token
();
token
->
set_word
(
word
);
token
->
set_start
(
start
);
token
->
set_end
(
end
);
if
(
head
>
0
)
token
->
set_head
(
head
-
1
);
if
(
!
tag
.
empty
())
token
->
set_tag
(
tag
);
if
(
!
cpostag
.
empty
())
token
->
set_category
(
cpostag
);
if
(
!
label
.
empty
())
token
->
set_label
(
label
);
}
if
(
sentence
->
token_size
()
>
0
)
{
sentence
->
set_docid
(
key
);
sentence
->
set_text
(
text
);
sentences
->
push_back
(
sentence
);
}
else
{
// If the sentence was empty (e.g., blank lines at the beginning of a
// file), then don't save it.
delete
sentence
;
}
}
// Converts a sentence to a key/value pair.
void
ConvertToString
(
const
Sentence
&
sentence
,
string
*
key
,
string
*
value
)
override
{
*
key
=
sentence
.
docid
();
vector
<
string
>
lines
;
for
(
int
i
=
0
;
i
<
sentence
.
token_size
();
++
i
)
{
vector
<
string
>
fields
(
10
);
fields
[
0
]
=
tensorflow
::
strings
::
Printf
(
"%d"
,
i
+
1
);
fields
[
1
]
=
sentence
.
token
(
i
).
word
();
fields
[
2
]
=
"_"
;
fields
[
3
]
=
sentence
.
token
(
i
).
category
();
fields
[
4
]
=
sentence
.
token
(
i
).
tag
();
fields
[
5
]
=
"_"
;
fields
[
6
]
=
tensorflow
::
strings
::
Printf
(
"%d"
,
sentence
.
token
(
i
).
head
()
+
1
);
fields
[
7
]
=
sentence
.
token
(
i
).
label
();
fields
[
8
]
=
"_"
;
fields
[
9
]
=
"_"
;
lines
.
push_back
(
utils
::
Join
(
fields
,
"
\t
"
));
}
*
value
=
tensorflow
::
strings
::
StrCat
(
utils
::
Join
(
lines
,
"
\n
"
),
"
\n\n
"
);
}
private:
TF_DISALLOW_COPY_AND_ASSIGN
(
CoNLLSyntaxFormat
);
};
REGISTER_DOCUMENT_FORMAT
(
"conll-sentence"
,
CoNLLSyntaxFormat
);
// Reader for tokenized text. This reader expects every sentence to be on a
// single line and tokens on that line to be separated by single spaces.
//
class TokenizedTextFormat : public DocumentFormat {
 public:
  TokenizedTextFormat() {}

  // Reads one line; returns false if end of file is reached.
  bool ReadRecord(tensorflow::io::InputBuffer *buffer,
                  string *record) override {
    return buffer->ReadLine(record).ok();
  }

  // Converts one space-separated line (`value`) into a Sentence with one
  // token per non-empty word. Lines with no words produce no sentence.
  void ConvertFromString(const string &key, const string &value,
                         vector<Sentence *> *sentences) override {
    Sentence *sentence = new Sentence();
    string text;
    for (const string &word : utils::Split(value, ' ')) {
      if (word.empty()) continue;
      // Append the separator BEFORE computing byte offsets so start/end point
      // at the word itself. (Previously the offsets were computed first,
      // which shifted every token after the first one byte to the left;
      // CoNLLSyntaxFormat already used this corrected order.)
      if (!text.empty()) text.append(" ");
      const int start = text.size();
      const int end = start + word.size() - 1;
      text.append(word);
      Token *token = sentence->add_token();
      token->set_word(word);
      token->set_start(start);
      token->set_end(end);
    }

    if (sentence->token_size() > 0) {
      sentence->set_docid(key);
      sentence->set_text(text);
      sentences->push_back(sentence);
    } else {
      // If the sentence was empty (e.g., blank lines at the beginning of a
      // file), then don't save it.
      delete sentence;
    }
  }

  // Serializes `sentence` as space-separated entries of the form "word",
  // "word_tag", or "word_tag_head", terminated by a newline.
  void ConvertToString(const Sentence &sentence, string *key,
                       string *value) override {
    *key = sentence.docid();
    value->clear();
    for (const Token &token : sentence.token()) {
      if (!value->empty()) value->append(" ");
      value->append(token.word());
      if (token.has_tag()) {
        value->append("_");
        value->append(token.tag());
      }
      if (token.has_head()) {
        value->append("_");
        value->append(tensorflow::strings::StrCat(token.head()));
      }
    }
    value->append("\n");
  }

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(TokenizedTextFormat);
};

REGISTER_DOCUMENT_FORMAT("tokenized-text", TokenizedTextFormat);
// Text reader that attempts to perform Penn Treebank tokenization on arbitrary
// raw text. Adapted from https://www.cis.upenn.edu/~treebank/tokenizer.sed
// by Robert MacIntyre, University of Pennsylvania, late 1995.
// Expected input: raw text with one sentence per line.
//
class
EnglishTextFormat
:
public
TokenizedTextFormat
{
public:
EnglishTextFormat
()
{}
void
ConvertFromString
(
const
string
&
key
,
const
string
&
value
,
vector
<
Sentence
*>
*
sentences
)
override
{
vector
<
pair
<
string
,
string
>>
preproc_rules
=
{
// Punctuation.
{
"’"
,
"'"
},
{
"…"
,
"..."
},
{
"---"
,
"--"
},
{
"—"
,
"--"
},
{
"–"
,
"--"
},
{
","
,
","
},
{
"。"
,
"."
},
{
"!"
,
"!"
},
{
"?"
,
"?"
},
{
":"
,
":"
},
{
";"
,
";"
},
{
"&"
,
"&"
},
// Brackets.
{
"
\\
["
,
"("
},
{
"]"
,
")"
},
{
"{"
,
"("
},
{
"}"
,
")"
},
{
"【"
,
"("
},
{
"】"
,
")"
},
{
"("
,
"("
},
{
")"
,
")"
},
// Quotation marks.
{
"
\"
"
,
"
\"
"
},
{
"″"
,
"
\"
"
},
{
"“"
,
"
\"
"
},
{
"„"
,
"
\"
"
},
{
"‵‵"
,
"
\"
"
},
{
"”"
,
"
\"
"
},
{
"’"
,
"
\"
"
},
{
"‘"
,
"
\"
"
},
{
"′′"
,
"
\"
"
},
{
"‹"
,
"
\"
"
},
{
"›"
,
"
\"
"
},
{
"«"
,
"
\"
"
},
{
"»"
,
"
\"
"
},
// Discarded punctuation that breaks sentences.
{
"|"
,
""
},
{
"·"
,
""
},
{
"•"
,
""
},
{
"●"
,
""
},
{
"▪"
,
""
},
{
"■"
,
""
},
{
"□"
,
""
},
{
"❑"
,
""
},
{
"◆"
,
""
},
{
"★"
,
""
},
{
"*"
,
""
},
{
"♦"
,
""
},
};
vector
<
pair
<
string
,
string
>>
rules
=
{
// attempt to get correct directional quotes
{
R"re(^")re"
,
"`` "
},
{
R"re(([ \([{<])")re"
,
"
\\
1 `` "
},
// close quotes handled at end
{
R"re(\.\.\.)re"
,
" ... "
},
{
"[,;:@#$%&]"
,
"
\\
0 "
},
// Assume sentence tokenization has been done first, so split FINAL
// periods only.
{
R"re(([^.])(\.)([\]\)}>"']*)[ ]*$)re"
,
"
\\
1
\\
2
\\
3 "
},
// however, we may as well split ALL question marks and exclamation
// points, since they shouldn't have the abbrev.-marker ambiguity
// problem
{
"[?!]"
,
"
\\
0 "
},
// parentheses, brackets, etc.
{
R"re([\]\[\(\){}<>])re"
,
"
\\
0 "
},
// Like Adwait Ratnaparkhi's MXPOST, we use the parsed-file version of
// these symbols.
{
"
\\
("
,
"-LRB-"
},
{
"
\\
)"
,
"-RRB-"
},
{
"
\\
]"
,
"-LSB-"
},
{
"
\\
]"
,
"-RSB-"
},
{
"{"
,
"-LCB-"
},
{
"}"
,
"-RCB-"
},
{
"--"
,
" -- "
},
// First off, add a space to the beginning and end of each line, to
// reduce necessary number of regexps.
{
"$"
,
" "
},
{
"^"
,
" "
},
{
"
\"
"
,
" '' "
},
// possessive or close-single-quote
{
"([^'])' "
,
"
\\
1 ' "
},
// as in it's, I'm, we'd
{
"'([sSmMdD]) "
,
" '
\\
1 "
},
{
"'ll "
,
" 'll "
},
{
"'re "
,
" 're "
},
{
"'ve "
,
" 've "
},
{
"n't "
,
" n't "
},
{
"'LL "
,
" 'LL "
},
{
"'RE "
,
" 'RE "
},
{
"'VE "
,
" 'VE "
},
{
"N'T "
,
" N'T "
},
{
" ([Cc])annot "
,
"
\\
1an not "
},
{
" ([Dd])'ye "
,
"
\\
1' ye "
},
{
" ([Gg])imme "
,
"
\\
1im me "
},
{
" ([Gg])onna "
,
"
\\
1on na "
},
{
" ([Gg])otta "
,
"
\\
1ot ta "
},
{
" ([Ll])emme "
,
"
\\
1em me "
},
{
" ([Mm])ore'n "
,
"
\\
1ore 'n "
},
{
" '([Tt])is "
,
" '
\\
1 is "
},
{
" '([Tt])was "
,
" '
\\
1 was "
},
{
" ([Ww])anna "
,
"
\\
1an na "
},
{
" ([Ww])haddya "
,
"
\\
1ha dd ya "
},
{
" ([Ww])hatcha "
,
"
\\
1ha t cha "
},
// clean out extra spaces
{
" *"
,
" "
},
{
"^ *"
,
""
},
};
string
rewritten
=
value
;
for
(
const
pair
<
string
,
string
>
&
rule
:
preproc_rules
)
{
RE2
::
GlobalReplace
(
&
rewritten
,
rule
.
first
,
rule
.
second
);
}
for
(
const
pair
<
string
,
string
>
&
rule
:
rules
)
{
RE2
::
GlobalReplace
(
&
rewritten
,
rule
.
first
,
rule
.
second
);
}
TokenizedTextFormat
::
ConvertFromString
(
key
,
rewritten
,
sentences
);
}
private:
TF_DISALLOW_COPY_AND_ASSIGN
(
EnglishTextFormat
);
};
REGISTER_DOCUMENT_FORMAT
(
"english-text"
,
EnglishTextFormat
);
}
// namespace syntaxnet
syntaxnet/syntaxnet/text_formats_test.py
0 → 100644
View file @
32ab5a58
# coding=utf-8
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for english_tokenizer."""
# disable=no-name-in-module,unused-import,g-bad-import-order,maybe-no-member
import
os.path
import
tensorflow
as
tf
import
syntaxnet.load_parser_ops
from
tensorflow.python.framework
import
test_util
from
tensorflow.python.platform
import
googletest
from
tensorflow.python.platform
import
logging
from
syntaxnet
import
sentence_pb2
from
syntaxnet
import
task_spec_pb2
from
syntaxnet.ops
import
gen_parser_ops
FLAGS
=
tf
.
app
.
flags
.
FLAGS
class TextFormatsTest(test_util.TensorFlowTestCase):
  """Exercises the 'english-text' document format via the document_source op."""

  def setUp(self):
    # Older test runners may not define these flags; provide fallbacks.
    if not hasattr(FLAGS, 'test_srcdir'):
      FLAGS.test_srcdir = ''
    if not hasattr(FLAGS, 'test_tmpdir'):
      FLAGS.test_tmpdir = tf.test.get_temp_dir()
    self.corpus_file = os.path.join(FLAGS.test_tmpdir, 'documents.conll')
    self.context_file = os.path.join(FLAGS.test_tmpdir, 'context.pbtxt')

  def AddInput(self, name, file_pattern, record_format, context):
    # Appends one named input (with a single file part) to `context`.
    new_input = context.input.add()
    new_input.name = name
    new_input.record_format.append(record_format)
    new_input.part.add().file_pattern = file_pattern

  def WriteContext(self, corpus_format):
    # Builds a TaskSpec whose 'documents' input points at self.corpus_file in
    # the given format, plus empty placeholder resources, and writes it as
    # text to self.context_file.
    context = task_spec_pb2.TaskSpec()
    self.AddInput('documents', self.corpus_file, corpus_format, context)
    resources = ('word-map', 'lcword-map', 'tag-map', 'category-map',
                 'label-map', 'prefix-table', 'suffix-table',
                 'tag-to-category')
    for name in resources:
      self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
    logging.info('Writing context to: %s', self.context_file)
    with open(self.context_file, 'w') as f:
      f.write(str(context))

  def ReadNextDocument(self, sess, sentence):
    # Runs the `sentence` tensor and parses the result into a Sentence proto;
    # returns None when the source is exhausted.
    sentence_str, = sess.run([sentence])
    if not sentence_str:
      return None
    doc = sentence_pb2.Sentence()
    doc.ParseFromString(sentence_str[0])
    return doc

  def CheckTokenization(self, sentence, tokenization):
    # Asserts that tokenizing `sentence` yields the space-joined token words
    # given in `tokenization`.
    self.WriteContext('english-text')
    logging.info('Writing text file to: %s', self.corpus_file)
    with open(self.corpus_file, 'w') as f:
      f.write(sentence)
    sentence, _ = gen_parser_ops.document_source(self.context_file,
                                                 batch_size=1)
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(' '.join(t.word for t in sentence_doc.token),
                       tokenization)

  def testSimple(self):
    cases = [
        ('Hello, world!', 'Hello , world !'),
        ('"Hello"', "`` Hello ''"),
        ('{"Hello@#$', '-LRB- `` Hello @ # $'),
        ('"Hello..."', "`` Hello ... ''"),
        ('()[]{}<>', '-LRB- -RRB- -LRB- -RRB- -LRB- -RRB- < >'),
        ('Hello--world', 'Hello -- world'),
        ("Isn't", "Is n't"),
        ("n't", "n't"),
        ('Hello Mr. Smith.', 'Hello Mr. Smith .'),
        ("It's Mr. Smith's.", "It 's Mr. Smith 's ."),
        ("It's the Smiths'.", "It 's the Smiths ' ."),
        ('Gotta go', 'Got ta go'),
        ('50-year-old', '50-year-old'),
    ]
    for text, expected in cases:
      self.CheckTokenization(text, expected)

  def testUrl(self):
    self.CheckTokenization('http://www.google.com/news is down',
                           'http : //www.google.com/news is down')
# Run the test cases under the TensorFlow test runner when executed directly.
if __name__ == '__main__':
  googletest.main()
syntaxnet/syntaxnet/unpack_sparse_features.cc
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#define EIGEN_USE_THREADS
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "syntaxnet/utils.h"
#include "syntaxnet/sparse.pb.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
using
tensorflow
::
DEVICE_CPU
;
using
tensorflow
::
DT_FLOAT
;
using
tensorflow
::
DT_INT32
;
using
tensorflow
::
DT_INT64
;
using
tensorflow
::
DT_STRING
;
using
tensorflow
::
OpKernel
;
using
tensorflow
::
OpKernelConstruction
;
using
tensorflow
::
OpKernelContext
;
using
tensorflow
::
Tensor
;
using
tensorflow
::
TensorShape
;
using
tensorflow
::
errors
::
InvalidArgument
;
namespace
syntaxnet
{
// Operator to unpack ids and weights stored in SparseFeatures proto.
// Operator to unpack ids and weights stored in SparseFeatures protos.
// Input: a string vector of serialized SparseFeatures. Outputs: parallel
// vectors of (input index, id, weight) triples, one per id across all
// records; records without weights get weight 1.0.
class UnpackSparseFeatures : public OpKernel {
 public:
  explicit UnpackSparseFeatures(OpKernelConstruction *context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->MatchSignature(
                                {DT_STRING}, {DT_INT32, DT_INT64, DT_FLOAT}));
  }

  void Compute(OpKernelContext *context) override {
    const Tensor &input = context->input(0);
    OP_REQUIRES(context, IsLegacyVector(input.shape()),
                InvalidArgument("input should be a vector."));
    const int64 n = input.NumElements();
    const auto input_vec = input.flat<string>();

    // First pass: parse every record, remember how many ids each one
    // contributed, and collect all (id, weight) pairs in order.
    SparseFeatures sf;
    int output_size = 0;
    std::vector<std::pair<int64, float> > id_and_weight;
    // Guess that we'll be averaging a handful of ids per SparseFeatures
    // record.
    id_and_weight.reserve(n * 4);
    std::vector<int> num_ids(n);
    for (int64 i = 0; i < n; ++i) {
      OP_REQUIRES(context, sf.ParseFromString(input_vec(i)),
                  InvalidArgument("Couldn't parse as SparseFeature"));
      OP_REQUIRES(context,
                  sf.weight_size() == 0 || sf.weight_size() == sf.id_size(),
                  InvalidArgument(tensorflow::strings::StrCat(
                      "Incorrect number of weights", sf.DebugString())));
      const int n_ids = sf.id_size();
      num_ids[i] = n_ids;
      output_size += n_ids;
      for (int j = 0; j < n_ids; ++j) {
        const float w = (sf.weight_size() > 0) ? sf.weight(j) : 1.0f;
        id_and_weight.push_back(std::make_pair(sf.id(j), w));
      }
    }

    // Allocate the three flat outputs, all of length output_size.
    Tensor *indices_t;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, TensorShape({output_size}), &indices_t));
    Tensor *ids_t;
    OP_REQUIRES_OK(context, context->allocate_output(
                                1, TensorShape({output_size}), &ids_t));
    Tensor *weights_t;
    OP_REQUIRES_OK(context, context->allocate_output(
                                2, TensorShape({output_size}), &weights_t));

    // Second pass: flatten the collected pairs, tagging each with the index
    // of the record it came from.
    auto indices = indices_t->vec<int32>();
    auto ids = ids_t->vec<int64>();
    auto weights = weights_t->vec<float>();
    int out_pos = 0;
    for (int64 i = 0; i < n; ++i) {
      for (int j = 0; j < num_ids[i]; ++j) {
        indices(out_pos) = i;
        ids(out_pos) = id_and_weight[out_pos].first;
        weights(out_pos) = id_and_weight[out_pos].second;
        ++out_pos;
      }
    }
  }
};

REGISTER_KERNEL_BUILDER(Name("UnpackSparseFeatures").Device(DEVICE_CPU),
                        UnpackSparseFeatures);
}
// namespace syntaxnet
syntaxnet/syntaxnet/utils.cc
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/utils.h"

#include <cerrno>
#include <cstdlib>
#include <limits>

#include "tensorflow/core/platform/macros.h"
namespace
syntaxnet
{
namespace
utils
{
// Parses an integer from the whole of `c_str` (base prefixes "0x"/"0" are
// honored by strtol's base-0 mode). Returns true iff the entire string was
// consumed and the value fits in an int; on success *value holds the result.
bool ParseInt32(const char *c_str, int *value) {
  char *temp;
  errno = 0;
  const long parsed = strtol(c_str, &temp, 0);  // NOLINT
  *value = static_cast<int>(parsed);
  // Reject trailing garbage, out-of-range input (ERANGE), and values that
  // fit in `long` but not in `int` — the original silently truncated those.
  return (*temp == '\0') && errno == 0 &&
         parsed >= std::numeric_limits<int>::min() &&
         parsed <= std::numeric_limits<int>::max();
}
// Parses a 64-bit integer from the whole of `c_str` (base prefixes honored).
// Returns true iff the entire string was consumed and the value is in range.
bool ParseInt64(const char *c_str, int64 *value) {
  char *temp;
  errno = 0;
  // strtoll, not strtol: on platforms where `long` is 32 bits (LP32/LLP64)
  // the original strtol call silently truncated 64-bit values.
  *value = strtoll(c_str, &temp, 0);  // NOLINT
  return (*temp == '\0') && errno == 0;
}
// Parses a double from the whole of `c_str`; returns true iff the entire
// string was consumed. On success *value holds the parsed number.
bool ParseDouble(const char *c_str, double *value) {
  char *remainder = nullptr;
  *value = strtod(c_str, &remainder);
  return *remainder == '\0';
}
// Digit table used for the octal escapes below (only indices 0-7 are ever
// produced by the / and % arithmetic in CEscape).
static char hex_char[] = "0123456789abcdef";

// Returns a C-escaped copy of `src`: newline/tab/CR/quote/backslash become
// their symbolic escapes; any other non-printable or non-ASCII byte becomes
// a three-digit octal escape (e.g. "\200").
string CEscape(const string &src) {
  string escaped;
  for (unsigned char c : src) {
    if (c == '\n') {
      escaped.append("\\n");
    } else if (c == '\r') {
      escaped.append("\\r");
    } else if (c == '\t') {
      escaped.append("\\t");
    } else if (c == '\"') {
      escaped.append("\\\"");
    } else if (c == '\'') {
      escaped.append("\\'");
    } else if (c == '\\') {
      escaped.append("\\\\");
    } else if ((c >= 0x80) || !isprint(c)) {
      // Emit an octal escape. Three digits are always produced, so a
      // following literal digit cannot be misread as part of the escape.
      escaped.push_back('\\');
      escaped.push_back(hex_char[c / 64]);
      escaped.push_back(hex_char[(c % 64) / 8]);
      escaped.push_back(hex_char[c % 8]);
    } else {
      escaped.push_back(c);
    }
  }
  return escaped;
}
// Splits `text` on `delim`. An empty input yields an empty vector; otherwise
// adjacent or trailing delimiters produce empty pieces (so "a," -> {"a",""}).
std::vector<string> Split(const string &text, char delim) {
  std::vector<string> pieces;
  if (text.empty()) return pieces;
  size_t piece_start = 0;
  for (size_t pos = 0; pos <= text.size(); ++pos) {
    // Treat the end of the string as an implicit final delimiter.
    if (pos == text.size() || text[pos] == delim) {
      pieces.emplace_back(text, piece_start, pos - piece_start);
      piece_start = pos + 1;
    }
  }
  return pieces;
}
// Returns true if `path` is non-empty and begins with '/'.
bool IsAbsolutePath(tensorflow::StringPiece path) {
  if (path.empty()) return false;
  return path[0] == '/';
}
// Joins the given path components into one path, inserting exactly one '/'
// between adjacent components and never doubling one up. Empty components
// are skipped.
string JoinPath(std::initializer_list<tensorflow::StringPiece> paths) {
  string joined;
  for (tensorflow::StringPiece path : paths) {
    if (path.empty()) continue;
    if (joined.empty()) {
      joined = path.ToString();
      continue;
    }
    // Exactly one of the two sides may contribute the separator.
    const bool left_has_sep = joined[joined.size() - 1] == '/';
    const bool right_has_sep = IsAbsolutePath(path);
    if (left_has_sep && right_has_sep) {
      tensorflow::strings::StrAppend(&joined, path.substr(1));
    } else if (left_has_sep || right_has_sep) {
      tensorflow::strings::StrAppend(&joined, path);
    } else {
      tensorflow::strings::StrAppend(&joined, "/", path);
    }
  }
  return joined;
}
// Strips leading whitespace from *text in place; returns the number of bytes
// removed.
size_t RemoveLeadingWhitespace(tensorflow::StringPiece *text) {
  const size_t length = text->size();
  size_t skipped = 0;
  while (skipped < length && isspace(text->data()[skipped])) {
    ++skipped;
  }
  text->remove_prefix(skipped);
  return skipped;
}
// Strips trailing whitespace from *text in place; returns the number of
// bytes removed.
size_t RemoveTrailingWhitespace(tensorflow::StringPiece *text) {
  size_t count = 0;
  // Guard the pointer setup: for an empty piece the original computed
  // data() + size() - 1 == data() - 1, an out-of-range pointer (UB) even
  // though it was never dereferenced.
  if (!text->empty()) {
    const char *ptr = text->data() + text->size() - 1;
    while (count < text->size() && isspace(*ptr)) {
      ++count;
      --ptr;
    }
  }
  text->remove_suffix(count);
  return count;
}
// Strips whitespace from both ends of *text in place; returns the total
// number of bytes removed. (Sequenced explicitly: leading first, then
// trailing — the two calls commute, so the result is the same either way.)
size_t RemoveWhitespaceContext(tensorflow::StringPiece *text) {
  const size_t leading = RemoveLeadingWhitespace(text);
  const size_t trailing = RemoveTrailingWhitespace(text);
  return leading + trailing;
}
namespace {

// Lower-level versions of Get... that read directly from a character buffer
// without any bounds checking. Decodes 4 bytes as a 32-bit little-endian
// integer; the explicit shifts make the result independent of host byte
// order.
inline uint32 DecodeFixed32(const char *ptr) {
  return ((static_cast<uint32>(static_cast<unsigned char>(ptr[0]))) |
          (static_cast<uint32>(static_cast<unsigned char>(ptr[1])) << 8) |
          (static_cast<uint32>(static_cast<unsigned char>(ptr[2])) << 16) |
          (static_cast<uint32>(static_cast<unsigned char>(ptr[3])) << 24));
}

// Returns the byte value of c as a uint32. The & 0xff is in case char is
// signed (sign extension would otherwise set high bits).
static inline uint32 ByteAs32(char c) { return static_cast<uint32>(c) & 0xff; }

}  // namespace
// Hashes the `n` bytes at `data` into a 32-bit value, mixed with `seed`.
// The multiply/shift structure and constants (0x5bd1e995, r = 24) match
// MurmurHash2; DecodeFixed32's little-endian read keeps the result identical
// across host byte orders.
uint32 Hash32(const char *data, size_t n, uint32 seed) {
  // 'm' and 'r' are mixing constants generated offline.
  // They're not really 'magic', they just happen to work well.
  const uint32 m = 0x5bd1e995;
  const int r = 24;

  // Initialize the hash to a 'random' value
  uint32 h = seed ^ n;

  // Mix 4 bytes at a time into the hash
  while (n >= 4) {
    uint32 k = DecodeFixed32(data);
    k *= m;
    k ^= k >> r;
    k *= m;
    h *= m;
    h ^= k;
    data += 4;
    n -= 4;
  }

  // Handle the last few bytes of the input array
  switch (n) {
    case 3:
      h ^= ByteAs32(data[2]) << 16;
      TF_FALLTHROUGH_INTENDED;
    case 2:
      h ^= ByteAs32(data[1]) << 8;
      TF_FALLTHROUGH_INTENDED;
    case 1:
      h ^= ByteAs32(data[0]);
      h *= m;
  }

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.
  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;
  return h;
}
// Returns a lower-cased copy of `s` (byte-wise, ASCII/locale semantics of
// tolower; multi-byte UTF-8 sequences are left untouched since their bytes
// are >= 0x80).
string Lowercase(tensorflow::StringPiece s) {
  string result(s.data(), s.size());
  for (char &c : result) {
    // Cast to unsigned char first: passing a negative char (any byte >= 0x80
    // on platforms with signed char) to tolower is undefined behavior.
    c = tolower(static_cast<unsigned char>(c));
  }
  return result;
}
// Inclusive Unicode code-point ranges treated as punctuation for CoNLL
// scoring. The table is sorted ascending and terminated by {-1, -1};
// PunctuationUtil::IsPunctuation relies on both the ordering (for its early
// return) and the negative sentinel (as the loop bound).
PunctuationUtil::CharacterRange PunctuationUtil::kPunctuation[] = {
    {33, 35},       {37, 42},       {44, 47},       {58, 59},
    {63, 64},       {91, 93},       {95, 95},       {123, 123},
    {125, 125},     {161, 161},     {171, 171},     {183, 183},
    {187, 187},     {191, 191},     {894, 894},     {903, 903},
    {1370, 1375},   {1417, 1418},   {1470, 1470},   {1472, 1472},
    {1475, 1475},   {1478, 1478},   {1523, 1524},   {1548, 1549},
    {1563, 1563},   {1566, 1567},   {1642, 1645},   {1748, 1748},
    {1792, 1805},   {2404, 2405},   {2416, 2416},   {3572, 3572},
    {3663, 3663},   {3674, 3675},   {3844, 3858},   {3898, 3901},
    {3973, 3973},   {4048, 4049},   {4170, 4175},   {4347, 4347},
    {4961, 4968},   {5741, 5742},   {5787, 5788},   {5867, 5869},
    {5941, 5942},   {6100, 6102},   {6104, 6106},   {6144, 6154},
    {6468, 6469},   {6622, 6623},   {6686, 6687},   {8208, 8231},
    {8240, 8259},   {8261, 8273},   {8275, 8286},   {8317, 8318},
    {8333, 8334},   {9001, 9002},   {9140, 9142},   {10088, 10101},
    {10181, 10182}, {10214, 10219}, {10627, 10648}, {10712, 10715},
    {10748, 10749}, {11513, 11516}, {11518, 11519}, {11776, 11799},
    {11804, 11805}, {12289, 12291}, {12296, 12305}, {12308, 12319},
    {12336, 12336}, {12349, 12349}, {12448, 12448}, {12539, 12539},
    {64830, 64831}, {65040, 65049}, {65072, 65106}, {65108, 65121},
    {65123, 65123}, {65128, 65128}, {65130, 65131}, {65281, 65283},
    {65285, 65290}, {65292, 65295}, {65306, 65307}, {65311, 65312},
    {65339, 65341}, {65343, 65343}, {65371, 65371}, {65373, 65373},
    {65375, 65381}, {65792, 65793}, {66463, 66463}, {68176, 68184},
    {-1, -1}};
// Replaces every ASCII digit in *form with '9', in place. Used to collapse
// all numbers onto one surface form.
void NormalizeDigits(string *form) {
  for (char &c : *form) {
    if (c >= '0' && c <= '9') c = '9';
  }
}
}
// namespace utils
}
// namespace syntaxnet
syntaxnet/syntaxnet/utils.h
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef $TARGETDIR_UTILS_H_
#define $TARGETDIR_UTILS_H_
#include <functional>
#include <string>
#include <vector>
#include <unordered_set>
#include "syntaxnet/base.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/default/integral_types.h"
#include "tensorflow/core/platform/mutex.h"
#include "util/utf8/unicodetext.h"
namespace
syntaxnet
{
namespace
utils
{
// ASCII number parsing. Each function parses the whole of `c_str` (strtol
// base-0 semantics: "0x" hex and leading-"0" octal prefixes are honored by
// the integer variants) and returns true iff the entire string was consumed.
bool ParseInt32(const char *c_str, int *value);
bool ParseInt64(const char *c_str, int64 *value);
bool ParseDouble(const char *c_str, double *value);
// Parses `str` with the conversion function `func`, CHECK-failing if the
// conversion does not succeed.
template <typename T>
T ParseUsing(const string &str, std::function<bool(const char *, T *)> func) {
  T parsed;
  CHECK(func(str.c_str(), &parsed)) << "Failed to convert: " << str;
  return parsed;
}

// Same as above, but an empty `str` yields `defval` instead of invoking the
// conversion function.
template <typename T>
T ParseUsing(const string &str, T defval,
             std::function<bool(const char *, T *)> func) {
  if (str.empty()) return defval;
  return ParseUsing<T>(str, func);
}
// Returns a C-escaped copy of `src`: \n, \r, \t, quotes and backslash are
// escaped symbolically; other non-printable bytes become octal escapes.
string CEscape(const string &src);

// Splits `text` on `delim`. An empty input yields an empty vector; adjacent
// or trailing delimiters produce empty pieces.
std::vector<string> Split(const string &text, char delim);
// Joins the elements of `s` into a single string, inserting `sep` between
// consecutive elements. An empty vector yields the empty string.
template <typename T>
string Join(const std::vector<T> &s, const char *sep) {
  string joined;
  for (size_t i = 0; i < s.size(); ++i) {
    if (i > 0) tensorflow::strings::StrAppend(&joined, sep);
    tensorflow::strings::StrAppend(&joined, s[i]);
  }
  return joined;
}
// Joins path components, inserting '/' separators as needed (see utils.cc
// for the exact separator-collapsing rules).
string JoinPath(std::initializer_list<StringPiece> paths);

// In-place whitespace stripping; each returns the number of bytes removed.
size_t RemoveLeadingWhitespace(tensorflow::StringPiece *text);
size_t RemoveTrailingWhitespace(tensorflow::StringPiece *text);
size_t RemoveWhitespaceContext(tensorflow::StringPiece *text);

// Hashes the `n` bytes at `data` to a 32-bit value mixed with `seed`.
uint32 Hash32(const char *data, size_t n, uint32 seed);
// Deletes all the elements in an STL container and clears the container.
// Suitable for a vector, set, hash_set, or any other STL container of
// pointers that defines sensible begin(), end(), and clear() methods.
// A null container pointer makes this a no-op.
template <typename T>
void STLDeleteElements(T *container) {
  if (container == nullptr) return;
  // Deleting the pointed-to objects does not touch the container itself, so
  // a plain traversal is safe; the dangling pointers are dropped by clear().
  for (auto *element : *container) {
    delete element;
  }
  container->clear();
}
// Returns lower-cased version of s.
string
Lowercase
(
tensorflow
::
StringPiece
s
);
// Helpers for deciding whether tokens and POS tags count as punctuation when
// scoring parses.
class PunctuationUtil {
 public:
  // Unicode character ranges for punctuation characters according to CoNLL.
  struct CharacterRange {
    int first;
    int last;
  };

  // Ascending table of inclusive ranges terminated by {-1, -1}; defined in
  // utils.cc.
  static CharacterRange kPunctuation[];

  // Returns true if the Unicode code point `u` lies in one of the
  // punctuation ranges. Relies on kPunctuation being sorted ascending.
  static bool IsPunctuation(int u) {
    for (int i = 0; kPunctuation[i].first > 0; ++i) {
      if (u < kPunctuation[i].first) return false;
      if (u <= kPunctuation[i].last) return true;
    }
    return false;
  }

  // Determines if `tag` is a punctuation POS tag, i.e. consists solely of
  // the characters , : . ' and `. NOTE: an empty tag vacuously returns true
  // (callers such as ScoreToken check for emptiness separately).
  static bool IsPunctuationTag(const string &tag) {
    for (size_t i = 0; i < tag.length(); ++i) {
      switch (tag[i]) {
        case ',':
        case ':':
        case '.':
        case '\'':
        case '`':
          break;
        default:
          return false;
      }
    }
    return true;
  }

  // Returns true if `word` (UTF-8) consists entirely of punctuation code
  // points. An empty word vacuously returns true.
  static bool IsPunctuationToken(const string &word) {
    UnicodeText text;
    text.PointToUTF8(word.c_str(), word.length());
    for (UnicodeText::const_iterator it = text.begin(); it != text.end();
         ++it) {
      if (!IsPunctuation(*it)) return false;
    }
    return true;
  }

  // Returns true if `tag` is non-empty and contains only punctuation or
  // paren symbols.
  static bool IsPunctuationTagOrParens(const string &tag) {
    if (tag.empty()) return false;
    for (size_t i = 0; i < tag.length(); ++i) {
      switch (tag[i]) {
        case '(':
        case ')':
        case ',':
        case ':':
        case '.':
        case '\'':
        case '`':
          break;
        default:
          return false;
      }
    }
    return true;
  }

  // Decides whether to score a token given its word, POS tag, and the
  // scoring strategy: "default" skips punctuation-only (non-empty) tags,
  // "conllx" skips punctuation-only words, "ignore_parens" also skips paren
  // tags, "" scores everything; any other value CHECK-fails.
  static bool ScoreToken(const string &word, const string &tag,
                         const string &scoring_type) {
    if (scoring_type == "default") {
      return tag.empty() || !IsPunctuationTag(tag);
    } else if (scoring_type == "conllx") {
      return !IsPunctuationToken(word);
    } else if (scoring_type == "ignore_parens") {
      return !IsPunctuationTagOrParens(tag);
    }
    CHECK(scoring_type.empty()) << "Unknown scoring strategy " << scoring_type;
    return true;
  }
};
// Normalizes digits in *form; takes a mutable string, so presumably rewrites
// it in place — see utils.cc for the definition.
void NormalizeDigits(string *form);

}  // namespace utils
}  // namespace syntaxnet

#endif  // $TARGETDIR_UTILS_H_
syntaxnet/syntaxnet/workspace.cc
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "syntaxnet/workspace.h"
#include "tensorflow/core/lib/strings/strcat.h"
namespace
syntaxnet
{
// Renders the registry contents as one "<type> :: <name>" entry per line,
// for debugging/logging.
string WorkspaceRegistry::DebugString() const {
  string debug;
  for (const auto &entry : workspace_names_) {
    const string &type_name = workspace_types_.at(entry.first);
    for (const string &workspace_name : entry.second) {
      tensorflow::strings::StrAppend(&debug, "\n  ", type_name, " :: ",
                                     workspace_name);
    }
  }
  return debug;
}
// Creates a vector workspace with `size` value-initialized (zero) elements.
VectorIntWorkspace::VectorIntWorkspace(int size) : elements_(size) {}

// Creates a vector workspace with `size` elements, each set to `value`.
VectorIntWorkspace::VectorIntWorkspace(int size, int value)
    : elements_(size, value) {}

// Creates a vector workspace holding a copy of `elements`.
VectorIntWorkspace::VectorIntWorkspace(const vector<int> &elements)
    : elements_(elements) {}

// Human-readable type name recorded by WorkspaceRegistry::Request().
string VectorIntWorkspace::TypeName() { return "Vector"; }
// Creates a workspace of `size` empty int vectors.
VectorVectorIntWorkspace::VectorVectorIntWorkspace(int size)
    : elements_(size) {}

// Human-readable type name recorded by WorkspaceRegistry::Request().
string VectorVectorIntWorkspace::TypeName() { return "VectorVector"; }
}
// namespace syntaxnet
syntaxnet/syntaxnet/workspace.h
0 → 100644
View file @
32ab5a58
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Notes on thread-safety: All of the classes here are thread-compatible. More
// specifically, the registry machinery is thread-safe, as long as each thread
// performs feature extraction on a different Sentence object.
#ifndef $TARGETDIR_WORKSPACE_H_
#define $TARGETDIR_WORKSPACE_H_
#include <string>
#include <typeindex>
#include <unordered_map>
#include <utility>
#include <vector>
#include "syntaxnet/utils.h"
namespace
syntaxnet
{
// A base class for shared workspaces. Derived classes implement a static member
// function TypeName() which returns a human readable string name for the class.
// Instances are stored and deleted through Workspace pointers (see
// WorkspaceSet), hence the virtual destructor.
class Workspace {
 public:
  // Polymorphic destructor.
  virtual ~Workspace() {}

 protected:
  // Create an empty workspace. Protected: only derived classes are
  // constructible.
  Workspace() {}

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(Workspace);
};
// A registry that keeps track of workspaces. It maps each workspace type W
// (via typeid) to the ordered list of workspace names requested for that
// type; the position in that list is the workspace's index.
class WorkspaceRegistry {
 public:
  // Create an empty registry.
  WorkspaceRegistry() {}

  // Returns the index of a named workspace, adding it to the registry first
  // if necessary. The index is stable: repeated requests for the same
  // (type, name) pair return the same value.
  template <class W>
  int Request(const string &name) {
    const std::type_index id = std::type_index(typeid(W));
    workspace_types_[id] = W::TypeName();
    vector<string> &names = workspace_names_[id];
    // Fixed: use size_t for the index to avoid a signed/unsigned comparison
    // with names.size(), and cast explicitly on return instead of narrowing
    // implicitly.
    for (size_t i = 0; i < names.size(); ++i) {
      if (names[i] == name) return static_cast<int>(i);
    }
    names.push_back(name);
    return static_cast<int>(names.size()) - 1;
  }

  // Read-only access to the per-type workspace name lists, e.g. for
  // WorkspaceSet::Reset().
  const std::unordered_map<std::type_index, vector<string> > &WorkspaceNames()
      const {
    return workspace_names_;
  }

  // Returns a string describing the registered workspaces.
  string DebugString() const;

 private:
  // Workspace type names, indexed as workspace_types_[typeid].
  std::unordered_map<std::type_index, string> workspace_types_;

  // Workspace names, indexed as workspace_names_[typeid][workspace].
  std::unordered_map<std::type_index, vector<string> > workspace_names_;

  TF_DISALLOW_COPY_AND_ASSIGN(WorkspaceRegistry);
};
// A typed collection of workspaces. The workspaces are indexed according to an
// external WorkspaceRegistry. If the WorkspaceSet is const, the contents are
// also immutable. The set OWNS the workspaces given to Set() and deletes them
// on Reset()/destruction.
class WorkspaceSet {
 public:
  // Resetting against a default-constructed (empty) registry deletes all
  // owned workspaces and leaves the set empty.
  ~WorkspaceSet() { Reset(WorkspaceRegistry()); }

  // Returns true if a workspace has been set.
  // Precondition (DCHECKed): Reset() was called with a registry in which
  // index was requested for type W.
  template <class W>
  bool Has(int index) const {
    const std::type_index id = std::type_index(typeid(W));
    DCHECK(workspaces_.find(id) != workspaces_.end());
    DCHECK_LT(index, workspaces_.find(id)->second.size());
    return workspaces_.find(id)->second[index] != nullptr;
  }

  // Returns an indexed workspace; the workspace must have been set.
  template <class W>
  const W &Get(int index) const {
    DCHECK(Has<W>(index));
    // The cast is only safe because the slot for typeid(W) was filled by
    // Set<W>() with a pointer to an actual W.
    const Workspace *w =
        workspaces_.find(std::type_index(typeid(W)))->second[index];
    return reinterpret_cast<const W &>(*w);
  }

  // Sets an indexed workspace; this takes ownership of the workspace, which
  // must have been new-allocated. It is an error to set a workspace twice.
  template <class W>
  void Set(int index, W *workspace) {
    const std::type_index id = std::type_index(typeid(W));
    DCHECK(workspaces_.find(id) != workspaces_.end());
    DCHECK_LT(index, workspaces_[id].size());
    DCHECK(workspaces_[id][index] == nullptr);  // error to set twice
    DCHECK(workspace != nullptr);
    workspaces_[id][index] = workspace;
  }

  // Deletes all currently owned workspaces, then re-sizes the slot tables to
  // match the given registry; every slot starts out unset (null).
  void Reset(const WorkspaceRegistry &registry) {
    // Deallocate current workspaces.
    for (auto &it : workspaces_) {
      for (size_t index = 0; index < it.second.size(); ++index) {
        delete it.second[index];
      }
    }
    workspaces_.clear();
    // Allocate space for new workspaces. vector::resize value-initializes
    // the new Workspace* slots to nullptr.
    for (auto &it : registry.WorkspaceNames()) {
      workspaces_[it.first].resize(it.second.size());
    }
  }

 private:
  // The set of workspaces, indexed as workspaces_[typeid][index].
  std::unordered_map<std::type_index, vector<Workspace *> > workspaces_;
};
// A workspace that wraps around a single int.
class SingletonIntWorkspace : public Workspace {
 public:
  // Default-initializes the int value (to 0, via the member initializer).
  SingletonIntWorkspace() {}

  // Initializes the int with the given value.
  explicit SingletonIntWorkspace(int value) : value_(value) {}

  // Returns the name of this type of workspace, as recorded by
  // WorkspaceRegistry::Request().
  static string TypeName() { return "SingletonInt"; }

  // Returns the int value.
  int get() const { return value_; }

  // Sets the int value.
  void set(int value) { value_ = value; }

 private:
  // The enclosed int.
  int value_ = 0;
};
// A workspace that wraps around a vector of int.
class VectorIntWorkspace : public Workspace {
 public:
  // Creates a vector of the given size (elements zero-initialized).
  explicit VectorIntWorkspace(int size);

  // Creates a vector initialized with the given array.
  explicit VectorIntWorkspace(const vector<int> &elements);

  // Creates a vector of the given size, with each element initialized to the
  // given value.
  VectorIntWorkspace(int size, int value);

  // Returns the name of this type of workspace.
  static string TypeName();

  // Returns the i'th element. No bounds checking (vector::operator[]).
  int element(int i) const { return elements_[i]; }

  // Sets the i'th element. No bounds checking.
  void set_element(int i, int value) { elements_[i] = value; }

 private:
  // The enclosed vector.
  vector<int> elements_;
};
// A workspace that wraps around a vector of vector of int.
class VectorVectorIntWorkspace : public Workspace {
 public:
  // Creates a vector of empty vectors of the given size.
  explicit VectorVectorIntWorkspace(int size);

  // Returns the name of this type of workspace.
  static string TypeName();

  // Returns the i'th vector of elements. No bounds checking
  // (vector::operator[]).
  const vector<int> &elements(int i) const { return elements_[i]; }

  // Mutable access to the i'th vector of elements. No bounds checking.
  vector<int> *mutable_elements(int i) { return &(elements_[i]); }

 private:
  // The enclosed vector of vector of elements.
  vector<vector<int> > elements_;
};
}
// namespace syntaxnet
#endif // $TARGETDIR_WORKSPACE_H_
tensorflow
@
3402f51e
Subproject commit 3402f51ecd11a26d0c071b1d06b4edab1b0ef351
syntaxnet/third_party/utf/BUILD
0 → 100644
View file @
32ab5a58
# Plan 9 / Lucent UTF-8 rune library (see README in this directory for the
# license notice).
licenses(["notice"])

cc_library(
    name = "utf",
    srcs = [
        "rune.c",
        "runestrcat.c",
        "runestrchr.c",
        "runestrcmp.c",
        "runestrcpy.c",
        "runestrdup.c",
        "runestrecpy.c",
        "runestrlen.c",
        "runestrncat.c",
        "runestrncmp.c",
        "runestrncpy.c",
        "runestrrchr.c",
        "runestrstr.c",
        "runetype.c",
        "utfecpy.c",
        "utflen.c",
        "utfnlen.c",
        "utfrrune.c",
        "utfrune.c",
        "utfutf.c",
    ],
    # runetypebody.c is listed in hdrs because it is textually included by
    # runetype.c rather than compiled on its own.
    hdrs = [
        "runetypebody.c",
        "utf.h",
        "utfdef.h",
    ],
    includes = ["."],
    visibility = ["//visibility:public"],
)
syntaxnet/third_party/utf/README
0 → 100644
View file @
32ab5a58
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 1998-2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
syntaxnet/third_party/utf/rune.c
0 → 100644
View file @
32ab5a58
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "third_party/utf/utf.h"
#include "third_party/utf/utfdef.h"
/*
 * UTF-8 encoding constants.  BitN is the number of payload bits carried by
 * the lead byte of an N-byte sequence (Bitx for continuation bytes); TN is
 * the tag prefix of that lead byte; RuneN is the largest code point
 * representable in N bytes.
 */
enum
{
	Bit1	= 7,	/* payload bits in a 1-byte sequence (0xxxxxxx) */
	Bitx	= 6,	/* payload bits in a continuation byte (10xxxxxx) */
	Bit2	= 5,
	Bit3	= 4,
	Bit4	= 3,
	Bit5	= 2,

	T1	= ((1 << (Bit1 + 1)) - 1) ^ 0xFF,	/* 0000 0000 */
	Tx	= ((1 << (Bitx + 1)) - 1) ^ 0xFF,	/* 1000 0000 */
	T2	= ((1 << (Bit2 + 1)) - 1) ^ 0xFF,	/* 1100 0000 */
	T3	= ((1 << (Bit3 + 1)) - 1) ^ 0xFF,	/* 1110 0000 */
	T4	= ((1 << (Bit4 + 1)) - 1) ^ 0xFF,	/* 1111 0000 */
	T5	= ((1 << (Bit5 + 1)) - 1) ^ 0xFF,	/* 1111 1000 */

	Rune1	= (1 << (Bit1 + 0 * Bitx)) - 1,	/* 0000 0000 0111 1111 */
	Rune2	= (1 << (Bit2 + 1 * Bitx)) - 1,	/* 0000 0111 1111 1111 */
	Rune3	= (1 << (Bit3 + 2 * Bitx)) - 1,	/* 1111 1111 1111 1111 */
	Rune4	= (1 << (Bit4 + 3 * Bitx)) - 1,	/* 0001 1111 1111 1111 1111 1111 */

	Maskx	= (1 << Bitx) - 1,	/* 0011 1111 */
	Testx	= Maskx ^ 0xFF,		/* 1100 0000 */

	Bad	= Runeerror,	/* decode failures yield the replacement rune */
};
/*
 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
 * This is a slower but "safe" version of the old chartorune
 * that works on strings that are not necessarily null-terminated.
 *
 * If you know for sure that your string is null-terminated,
 * chartorune will be a bit faster.
 *
 * It is guaranteed not to attempt to access "length"
 * past the incoming pointer.  This is to avoid
 * possible access violations.  If the string appears to be
 * well-formed but incomplete (i.e., to get the whole Rune
 * we'd need to read past str+length) then we'll set the Rune
 * to Bad and return 0.
 *
 * Note that if we have decoding problems for other
 * reasons, we return 1 instead of 0.
 */
int
charntorune(Rune *rune, const char *str, int length)
{
	int c, c1, c2, c3;
	long l;

	/* When we're not allowed to read anything */
	if(length <= 0) {
		goto badlen;
	}

	/*
	 * one character sequence (7-bit value)
	 *	00000-0007F => T1
	 */
	c = *(uchar*)str;
	if(c < Tx) {
		*rune = c;
		return 1;
	}

	// If we can't read more than one character we must stop
	if(length <= 1) {
		goto badlen;
	}

	/*
	 * two character sequence (11-bit value)
	 *	0080-07FF => T2 Tx
	 */
	c1 = *(uchar*)(str+1) ^ Tx;
	if(c1 & Testx)
		goto bad;	/* not a continuation byte */
	if(c < T3) {
		if(c < T2)
			goto bad;	/* lone continuation byte as lead */
		l = ((c << Bitx) | c1) & Rune2;
		if(l <= Rune1)
			goto bad;	/* overlong encoding */
		*rune = l;
		return 2;
	}

	// If we can't read more than two characters we must stop
	if(length <= 2) {
		goto badlen;
	}

	/*
	 * three character sequence (16-bit value)
	 *	0800-FFFF => T3 Tx Tx
	 */
	c2 = *(uchar*)(str+2) ^ Tx;
	if(c2 & Testx)
		goto bad;
	if(c < T4) {
		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
		if(l <= Rune2)
			goto bad;	/* overlong encoding */
		*rune = l;
		return 3;
	}

	if(length <= 3)
		goto badlen;

	/*
	 * four character sequence (21-bit value)
	 *	10000-1FFFFF => T4 Tx Tx Tx
	 */
	c3 = *(uchar*)(str+3) ^ Tx;
	if(c3 & Testx)
		goto bad;
	if(c < T5) {
		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
		if(l <= Rune3)
			goto bad;	/* overlong encoding */
		if(l > Runemax)
			goto bad;	/* beyond the valid rune range */
		*rune = l;
		return 4;
	}

	// Support for 5-byte or longer UTF-8 would go here, but
	// since we don't have that, we'll just fall through to bad.

	/*
	 * bad decoding: malformed sequence, consume one byte
	 */
bad:
	*rune = Bad;
	return 1;

	/*
	 * incomplete but possibly well-formed sequence: consume nothing
	 */
badlen:
	*rune = Bad;
	return 0;
}
/*
 * This is the older "unsafe" version, which works fine on
 * null-terminated strings.
 *
 * Decodes one UTF-8 sequence at str into *rune and returns the number of
 * bytes consumed (1-4).  On a malformed sequence, sets *rune to Bad
 * (Runeerror) and returns 1.  It reads bytes str[1..3] before fully
 * validating the sequence, so the input must be NUL-terminated; use
 * charntorune for length-bounded buffers.
 */
int
chartorune(Rune *rune, const char *str)
{
	int c, c1, c2, c3;
	long l;

	/*
	 * one character sequence
	 *	00000-0007F => T1
	 */
	c = *(uchar*)str;
	if(c < Tx) {
		*rune = c;
		return 1;
	}

	/*
	 * two character sequence
	 *	0080-07FF => T2 Tx
	 */
	c1 = *(uchar*)(str+1) ^ Tx;
	if(c1 & Testx)
		goto bad;	/* not a continuation byte */
	if(c < T3) {
		if(c < T2)
			goto bad;	/* lone continuation byte as lead */
		l = ((c << Bitx) | c1) & Rune2;
		if(l <= Rune1)
			goto bad;	/* overlong encoding */
		*rune = l;
		return 2;
	}

	/*
	 * three character sequence
	 *	0800-FFFF => T3 Tx Tx
	 */
	c2 = *(uchar*)(str+2) ^ Tx;
	if(c2 & Testx)
		goto bad;
	if(c < T4) {
		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
		if(l <= Rune2)
			goto bad;	/* overlong encoding */
		*rune = l;
		return 3;
	}

	/*
	 * four character sequence (21-bit value)
	 *	10000-1FFFFF => T4 Tx Tx Tx
	 */
	c3 = *(uchar*)(str+3) ^ Tx;
	if(c3 & Testx)
		goto bad;
	if(c < T5) {
		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
		if(l <= Rune3)
			goto bad;	/* overlong encoding */
		if(l > Runemax)
			goto bad;	/* beyond the valid rune range */
		*rune = l;
		return 4;
	}

	/*
	 * Support for 5-byte or longer UTF-8 would go here, but
	 * since we don't have that, we'll just fall through to bad.
	 */

	/*
	 * bad decoding
	 */
bad:
	*rune = Bad;
	return 1;
}
/*
 * Decodes one rune from str (reading at most `length` bytes) into *rune and
 * stores the number of bytes consumed in *consumed.  Returns non-zero if the
 * decode was valid.  A result of Runeerror is still considered valid when 3
 * bytes were consumed, because that is the legitimate 3-byte encoding of
 * U+FFFD itself; charntorune signals actual failures by consuming 0
 * (truncated input) or 1 (malformed byte) instead.
 */
int
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
	*consumed = charntorune(rune, str, length);
	return *rune != Runeerror || *consumed == 3;
}
/*
 * Encodes *rune as UTF-8 into str (which must have room for at least 4
 * bytes) and returns the number of bytes written (1-4).  Runes above
 * Runemax are replaced by Runeerror before encoding.
 */
int
runetochar(char *str, const Rune *rune)
{
	/* Runes are signed, so convert to unsigned for range check. */
	unsigned long c;

	/*
	 * one character sequence
	 *	00000-0007F => 00-7F
	 */
	c = *rune;
	if(c <= Rune1) {
		str[0] = c;
		return 1;
	}

	/*
	 * two character sequence
	 *	0080-07FF => T2 Tx
	 */
	if(c <= Rune2) {
		str[0] = T2 | (c >> 1*Bitx);
		str[1] = Tx | (c & Maskx);
		return 2;
	}

	/*
	 * If the Rune is out of range, convert it to the error rune.
	 * Do this test here because the error rune encodes to three bytes.
	 * Doing it earlier would duplicate work, since an out of range
	 * Rune wouldn't have fit in one or two bytes.
	 */
	if(c > Runemax)
		c = Runeerror;

	/*
	 * three character sequence
	 *	0800-FFFF => T3 Tx Tx
	 */
	if(c <= Rune3) {
		str[0] = T3 | (c >> 2*Bitx);
		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
		str[2] = Tx | (c & Maskx);
		return 3;
	}

	/*
	 * four character sequence (21-bit value)
	 *	10000-1FFFFF => T4 Tx Tx Tx
	 */
	str[0] = T4 | (c >> 3*Bitx);
	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
	str[3] = Tx | (c & Maskx);
	return 4;
}
/*
 * Returns the number of bytes needed to encode `rune` in UTF-8, by encoding
 * it into a scratch buffer and reporting how many bytes were used.
 */
int
runelen(Rune rune)
{
	char str[10];	/* UTF-8 needs at most 4 bytes here; 10 is ample */

	return runetochar(str, &rune);
}
/*
 * Returns the total number of bytes needed to encode the first nrune
 * runes of r as UTF-8.
 */
int
runenlen(const Rune *r, int nrune)
{
	int total;
	ulong c;	/* Rune is signed, so use unsigned for range check. */

	total = 0;
	while(nrune--) {
		c = *r++;
		if(c <= Rune1) {
			total += 1;
		} else if(c <= Rune2) {
			total += 2;
		} else if(c <= Rune3) {
			total += 3;
		} else if(c <= Runemax) {
			total += 4;
		} else {
			/* out-of-range runes encode as Runeerror (3 bytes),
			 * see runetochar */
			total += 3;
		}
	}
	return total;
}
/*
 * Reports whether the first n bytes of str hold at least one complete
 * UTF-8 sequence.  Returns 1 if complete, 0 otherwise.
 */
int
fullrune(const char *str, int n)
{
	int c;

	if(n <= 0)
		return 0;	/* nothing to inspect */
	c = *(uchar*)str;
	if(c < Tx)
		return 1;	/* one-byte sequence */
	if(n <= 1)
		return 0;
	if(c < T3)
		return 1;	/* two-byte lead with one byte available */
	if(n <= 2)
		return 0;
	if(c < T4 || n > 3)
		return 1;	/* three bytes suffice, or four are available */
	return 0;
}
syntaxnet/third_party/utf/runestrcat.c
0 → 100644
View file @
32ab5a58
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "third_party/utf/utf.h"
#include "third_party/utf/utfdef.h"
/*
 * Appends rune string s2 to the end of rune string s1 (both 0-terminated)
 * and returns s1.  s1 must have room for the combined result.
 */
Rune*
runestrcat(Rune *s1, const Rune *s2)
{
	/* Locate s1's terminating 0 and copy s2 (with its terminator) there.
	 * The cast drops the const that runestrchr's return type carries. */
	runestrcpy((Rune*)runestrchr(s1, 0), s2);
	return s1;
}
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment