ModelZoo / ResNet50_tensorflow · Commits

Commit 27b4acd4, authored Sep 25, 2018 by Aman Gupta

Merge remote-tracking branch 'upstream/master'

Parents: 5133522f, d4e1f97f
Changes: 240 files in this commit. This page shows 20 changed files with 1386 additions and 0 deletions (+1386 −0).
research/cvt_text/base/configure.py                      +139  −0
research/cvt_text/base/embeddings.py                     +167  −0
research/cvt_text/base/utils.py                          +68   −0
research/cvt_text/corpus_processing/__init__.py          +0    −0
research/cvt_text/corpus_processing/example.py           +52   −0
research/cvt_text/corpus_processing/minibatching.py      +143  −0
research/cvt_text/corpus_processing/scorer.py            +52   −0
research/cvt_text/corpus_processing/unlabeled_data.py    +81   −0
research/cvt_text/cvt.py                                 +67   −0
research/cvt_text/fetch_data.sh                          +51   −0
research/cvt_text/model/__init__.py                      +0    −0
research/cvt_text/model/encoder.py                       +110  −0
research/cvt_text/model/model_helpers.py                 +54   −0
research/cvt_text/model/multitask_model.py               +132  −0
research/cvt_text/model/shared_inputs.py                 +48   −0
research/cvt_text/model/task_module.py                   +44   −0
research/cvt_text/preprocessing.py                       +87   −0
research/cvt_text/task_specific/__init__.py              +0    −0
research/cvt_text/task_specific/task_definitions.py      +91   −0
research/cvt_text/task_specific/word_level/__init__.py   +0    −0
research/cvt_text/base/configure.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Classes for storing hyperparameters, data locations, etc."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
from os.path import join

import tensorflow as tf


class Config(object):
  """Stores everything needed to train a model."""

  def __init__(self, **kwargs):
    # general
    self.data_dir = './data'  # top directory for data (corpora, models, etc.)
    self.model_name = 'default_model'  # name identifying the current model

    # mode
    self.mode = 'train'  # either "train" or "eval"
    self.task_names = ['chunk']  # list of tasks this model will learn;
                                 # more than one trains a multi-task model
    self.is_semisup = True  # whether to use CVT or train purely supervised
    self.for_preprocessing = False  # is this for the preprocessing script

    # embeddings
    self.pretrained_embeddings = 'glove.6B.300d.txt'  # which pretrained
                                                      # embeddings to use
    self.word_embedding_size = 300  # size of each word embedding

    # encoder
    self.use_chars = True  # whether to include a character-level cnn
    self.char_embedding_size = 50  # size of character embeddings
    self.char_cnn_filter_widths = [2, 3, 4]  # filter widths for the char cnn
    self.char_cnn_n_filters = 100  # number of filters for each filter width
    self.unidirectional_sizes = [1024]  # size of first Bi-LSTM
    self.bidirectional_sizes = [512]  # size of second Bi-LSTM
    self.projection_size = 512  # projection size for LSTMs and hidden layers

    # dependency parsing
    self.depparse_projection_size = 128  # size of the representations used in
                                         # the bilinear classifier for parsing

    # tagging
    self.label_encoding = 'BIOES'  # label encoding scheme for entity-level
                                   # tagging tasks
    self.label_smoothing = 0.1  # label smoothing rate for tagging tasks

    # optimization
    self.lr = 0.5  # base learning rate
    self.momentum = 0.9  # momentum
    self.grad_clip = 1.0  # maximum gradient norm during optimization
    self.warm_up_steps = 5000.0  # linearly ramp up the lr for this many steps
    self.lr_decay = 0.005  # factor for gradually decaying the lr

    # EMA
    self.ema_decay = 0.998  # EMA coefficient for averaged model weights
    self.ema_test = True  # whether to use EMA weights at test time
    self.ema_teacher = False  # whether to use EMA weights for the teacher model

    # regularization
    self.labeled_keep_prob = 0.5  # 1 - dropout on labeled examples
    self.unlabeled_keep_prob = 0.8  # 1 - dropout on unlabeled examples

    # sizing
    self.max_sentence_length = 100  # maximum length of unlabeled sentences
    self.max_word_length = 20  # maximum length of words for char cnn
    self.train_batch_size = 64  # train batch size
    self.test_batch_size = 64  # test batch size
    self.buckets = [(0, 15), (15, 40), (40, 1000)]  # buckets for binning
                                                    # sentences by length

    # training
    self.print_every = 25  # how often to print out training progress
    self.eval_dev_every = 500  # how often to evaluate on the dev set
    self.eval_train_every = 2000  # how often to evaluate on the train set
    self.save_model_every = 1000  # how often to checkpoint the model

    # data set
    self.train_set_percent = 100  # how much of the train set to use

    for k, v in kwargs.iteritems():
      if k not in self.__dict__:
        raise ValueError("Unknown argument", k)
      self.__dict__[k] = v

    self.dev_set = self.mode == "train"  # whether to evaluate on the dev or
                                         # test set

    # locations of various data files
    self.raw_data_topdir = join(self.data_dir, 'raw_data')
    self.unsupervised_data = join(
        self.raw_data_topdir, 'unlabeled_data',
        '1-billion-word-language-modeling-benchmark-r13output',
        'training-monolingual.tokenized.shuffled')
    self.pretrained_embeddings_file = join(
        self.raw_data_topdir, 'pretrained_embeddings',
        self.pretrained_embeddings)

    self.preprocessed_data_topdir = join(self.data_dir, 'preprocessed_data')
    self.embeddings_dir = join(self.preprocessed_data_topdir,
                               self.pretrained_embeddings.rsplit('.', 1)[0])
    self.word_vocabulary = join(self.embeddings_dir, 'word_vocabulary.pkl')
    self.word_embeddings = join(self.embeddings_dir, 'word_embeddings.pkl')

    self.model_dir = join(self.data_dir, "models", self.model_name)
    self.checkpoints_dir = join(self.model_dir, 'checkpoints')
    self.checkpoint = join(self.checkpoints_dir, 'checkpoint.ckpt')
    self.best_model_checkpoints_dir = join(self.model_dir,
                                           'best_model_checkpoints')
    self.best_model_checkpoint = join(self.best_model_checkpoints_dir,
                                      'checkpoint.ckpt')
    self.progress = join(self.checkpoints_dir, 'progress.pkl')
    self.summaries_dir = join(self.model_dir, 'summaries')
    self.history_file = join(self.model_dir, 'history.pkl')

  def write(self):
    tf.gfile.MakeDirs(self.model_dir)
    with open(join(self.model_dir, 'config.json'), 'w') as f:
      f.write(json.dumps(self.__dict__, sort_keys=True, indent=4,
                         separators=(',', ': ')))
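A quick usage sketch (illustrative, not part of this diff): every hyperparameter is an attribute with a default, and the kwargs loop rejects unknown names, so typos fail fast. The model name 'chunking_model' below is hypothetical.

config = Config(mode='train', model_name='chunking_model',
                train_batch_size=32)  # override any default by keyword
config.write()                        # dumps the full dict to config.json
Config(batchsize=32)                  # raises ValueError("Unknown argument", ...)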
research/cvt_text/base/embeddings.py (new file, mode 100644)
# coding=utf-8
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for handling word embeddings."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import re

import numpy as np
import tensorflow as tf

from base import utils


_CHARS = [
    # punctuation
    '!', '\'', '#', '$', '%', '&', '"', '(', ')', '*', '+', ',', '-', '.',
    '/', '\\', '_', '`', '{', '}', '[', ']', '<', '>', ':', ';', '?', '@',
    # digits
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    # letters
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
    'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    # special characters
    '£', '€', '®', '™', '�', '½', '»', '•', '—', '“', '”', '°', '‘', '’'
]

# words not in GloVe that still should have embeddings
_EXTRA_WORDS = [
    # common digit patterns
    '0/0', '0/00', '00/00', '0/000', '00/00/00', '0/00/00',
    '00/00/0000', '0/00/0000', '00-00', '00-00-00', '0-00-00',
    '00-00-0000', '0-00-0000', '0000-00-00', '00-0-00-0', '00000000',
    '0:00.000', '00:00.000', '0%', '00%', '00.', '0000.',
    '0.0bn', '0.0m', '0-', '00-',
    # ontonotes uses **f to represent formulas and -amp- instead of ampersands
    '**f', '-amp-'
]
SPECIAL_TOKENS = ['<pad>', '<unk>', '<start>', '<end>', '<missing>']
NUM_CHARS = len(_CHARS) + len(SPECIAL_TOKENS)
PAD, UNK, START, END, MISSING = 0, 1, 2, 3, 4


class Vocabulary(collections.OrderedDict):
  def __getitem__(self, w):
    return self.get(w, UNK)


@utils.Memoize
def get_char_vocab():
  characters = _CHARS
  for i, special in enumerate(SPECIAL_TOKENS):
    characters.insert(i, special)
  return Vocabulary({c: i for i, c in enumerate(characters)})


@utils.Memoize
def get_inv_char_vocab():
  return {i: c for c, i in get_char_vocab().items()}


def get_word_vocab(config):
  return Vocabulary(utils.load_cpickle(config.word_vocabulary))


def get_word_embeddings(config):
  return utils.load_cpickle(config.word_embeddings)


@utils.Memoize
def _punctuation_ids(vocab_path):
  vocab = Vocabulary(utils.load_cpickle(vocab_path))
  return set(i for w, i in vocab.iteritems() if w in [
      '!', '...', '``', '{', '}', '(', ')', '[', ']', '--', '-', ',', '.',
      "''", '`', ';', ':', '?'])


def get_punctuation_ids(config):
  return _punctuation_ids(config.word_vocabulary)


class PretrainedEmbeddingLoader(object):
  def __init__(self, config):
    self.config = config
    self.vocabulary = {}
    self.vectors = []
    self.vector_size = config.word_embedding_size

  def _add_vector(self, w):
    if w not in self.vocabulary:
      self.vocabulary[w] = len(self.vectors)
      self.vectors.append(np.zeros(self.vector_size, dtype='float32'))

  def build(self):
    utils.log('loading pretrained embeddings from',
              self.config.pretrained_embeddings_file)
    for special in SPECIAL_TOKENS:
      self._add_vector(special)
    for extra in _EXTRA_WORDS:
      self._add_vector(extra)
    with tf.gfile.GFile(self.config.pretrained_embeddings_file, 'r') as f:
      for i, line in enumerate(f):
        if i % 10000 == 0:
          utils.log('on line', i)

        split = line.decode('utf8').split()
        w = normalize_word(split[0])
        try:
          vec = np.array(map(float, split[1:]), dtype='float32')
          if vec.size != self.vector_size:
            utils.log('vector for line', i, 'has size', vec.size,
                      'so skipping')
            utils.log(line[:100] + '...')
            continue
        except:
          utils.log('can\'t parse line', i, 'so skipping')
          utils.log(line[:100] + '...')
          continue
        if w not in self.vocabulary:
          self.vocabulary[w] = len(self.vectors)
          self.vectors.append(vec)
    utils.log('writing vectors!')
    self._write()

  def _write(self):
    utils.write_cpickle(np.vstack(self.vectors),
                        self.config.word_embeddings)
    utils.write_cpickle(self.vocabulary, self.config.word_vocabulary)


def normalize_chars(w):
  if w == '-LRB-':
    return '('
  elif w == '-RRB-':
    return ')'
  elif w == '-LCB-':
    return '{'
  elif w == '-RCB-':
    return '}'
  elif w == '-LSB-':
    return '['
  elif w == '-RSB-':
    return ']'
  return w.replace(r'\/', '/').replace(r'\*', '*')


def normalize_word(w):
  return re.sub(r'\d', '0', normalize_chars(w).lower())
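The two normalizers undo PTB-style bracket escapes and collapse digits to '0', which is exactly the form the _EXTRA_WORDS patterns above are written in. Expected behavior, for illustration:

normalize_chars('-LRB-')     # -> '('
normalize_word('Hello')      # -> 'hello'
normalize_word('3/15/2018')  # -> '0/00/0000', covered by _EXTRA_WORDS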
research/cvt_text/base/utils.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Various utilities."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import cPickle
import sys

import tensorflow as tf


class Memoize(object):
  def __init__(self, f):
    self.f = f
    self.cache = {}

  def __call__(self, *args):
    if args not in self.cache:
      self.cache[args] = self.f(*args)
    return self.cache[args]


def load_cpickle(path, memoized=True):
  return _load_cpickle_memoize(path) if memoized else _load_cpickle(path)


def _load_cpickle(path):
  with tf.gfile.GFile(path, 'r') as f:
    return cPickle.load(f)


@Memoize
def _load_cpickle_memoize(path):
  return _load_cpickle(path)


def write_cpickle(o, path):
  tf.gfile.MakeDirs(path.rsplit('/', 1)[0])
  with tf.gfile.GFile(path, 'w') as f:
    cPickle.dump(o, f, -1)


def log(*args):
  msg = ' '.join(map(str, args))
  sys.stdout.write(msg + '\n')
  sys.stdout.flush()


def heading(*args):
  log()
  log(80 * '=')
  log(*args)
  log(80 * '=')
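Memoize keys the cache on the positional-argument tuple, so arguments must be hashable; load_cpickle relies on this to avoid re-reading a pickle from disk. A sketch with a hypothetical function:

@Memoize
def slow_square(x):
  print('computing', x)
  return x * x

slow_square(4)  # prints "computing 4" and returns 16
slow_square(4)  # returns 16 straight from the cache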
research/cvt_text/corpus_processing/__init__.py (new empty file, mode 100644)
research/cvt_text/corpus_processing/example.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base class for training examples."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from base import embeddings


CONTRACTION_WORDS = set(w + 'n' for w in
                        ['do', 'does', 'did', 'is', 'are', 'was', 'were',
                         'has', 'have', 'had', 'could', 'would', 'should',
                         'ca', 'wo', 'ai', 'might'])


class Example(object):
  def __init__(self, words, word_vocab, char_vocab):
    words = words[:]

    # Fix inconsistent tokenization between datasets
    for i in range(len(words)):
      if (words[i].lower() == '\'t' and i > 0 and
          words[i - 1].lower() in CONTRACTION_WORDS):
        words[i] = words[i - 1][-1] + words[i]
        words[i - 1] = words[i - 1][:-1]

    self.words = ([embeddings.START] +
                  [word_vocab[embeddings.normalize_word(w)] for w in words] +
                  [embeddings.END])
    self.chars = ([[embeddings.MISSING]] +
                  [[char_vocab[c] for c in embeddings.normalize_chars(w)]
                   for w in words] +
                  [[embeddings.MISSING]])

  def __repr__(self):
    inv_char_vocab = embeddings.get_inv_char_vocab()
    return ' '.join([''.join([inv_char_vocab[c] for c in w])
                     for w in self.chars])
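The loop above re-splits negative contractions so they match GloVe's tokenization; running the same fix in isolation on a hypothetical sentence:

words = ['I', 'don', "'t", 'know']
for i in range(len(words)):
  if (words[i].lower() == '\'t' and i > 0 and
      words[i - 1].lower() in CONTRACTION_WORDS):
    words[i] = words[i - 1][-1] + words[i]
    words[i - 1] = words[i - 1][:-1]
print(words)  # ['I', 'do', "n't", 'know']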
research/cvt_text/corpus_processing/minibatching.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for constructing minibatches."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import random

import numpy as np

from base import embeddings


def get_bucket(config, l):
  for i, (s, e) in enumerate(config.buckets):
    if s <= l < e:
      return config.buckets[i]


def build_array(nested_lists, dtype='int32'):
  depth_to_sizes = collections.defaultdict(set)
  _get_sizes(nested_lists, depth_to_sizes)
  shape = [max(depth_to_sizes[depth]) for depth in range(len(depth_to_sizes))]

  copy_depth = len(depth_to_sizes) - 1
  while copy_depth > 0 and len(depth_to_sizes[copy_depth]) == 1:
    copy_depth -= 1

  arr = np.zeros(shape, dtype=dtype)
  _fill_array(nested_lists, arr, copy_depth)
  return arr


def _get_sizes(nested_lists, depth_to_sizes, depth=0):
  depth_to_sizes[depth].add(len(nested_lists))
  first_elem = nested_lists[0]
  if (isinstance(first_elem, collections.Sequence) or
      isinstance(first_elem, np.ndarray)):
    for sublist in nested_lists:
      _get_sizes(sublist, depth_to_sizes, depth + 1)


def _fill_array(nested_lists, arr, copy_depth, depth=0):
  if depth == copy_depth:
    for i in range(len(nested_lists)):
      if isinstance(nested_lists[i], np.ndarray):
        arr[i] = nested_lists[i]
      else:
        arr[i] = np.array(nested_lists[i])
  else:
    for i in range(len(nested_lists)):
      _fill_array(nested_lists[i], arr[i], copy_depth, depth + 1)


class Dataset(object):
  def __init__(self, config, examples, task_name='unlabeled',
               is_training=False):
    self._config = config
    self.examples = examples
    self.size = len(examples)
    self.task_name = task_name
    self.is_training = is_training

  def get_minibatches(self, minibatch_size):
    by_bucket = collections.defaultdict(list)
    for i, e in enumerate(self.examples):
      by_bucket[get_bucket(self._config, len(e.words))].append(i)

    # save memory by weighting examples so longer sentences have
    # smaller minibatches.
    weight = lambda ind: np.sqrt(len(self.examples[ind].words))
    total_weight = float(sum(weight(i) for i in range(len(self.examples))))
    weight_per_batch = minibatch_size * total_weight / len(self.examples)
    cumulative_weight = 0.0
    id_batches = []
    for _, ids in by_bucket.iteritems():
      ids = np.array(ids)
      np.random.shuffle(ids)
      curr_batch, curr_weight = [], 0.0
      for i, curr_id in enumerate(ids):
        curr_batch.append(curr_id)
        curr_weight += weight(curr_id)
        if (i == len(ids) - 1 or cumulative_weight + curr_weight >=
            (len(id_batches) + 1) * weight_per_batch):
          cumulative_weight += curr_weight
          id_batches.append(np.array(curr_batch))
          curr_batch, curr_weight = [], 0.0
    random.shuffle(id_batches)

    for id_batch in id_batches:
      yield self._make_minibatch(id_batch)

  def endless_minibatches(self, minibatch_size):
    while True:
      for mb in self.get_minibatches(minibatch_size):
        yield mb

  def _make_minibatch(self, ids):
    examples = [self.examples[i] for i in ids]
    sentence_lengths = np.array([len(e.words) for e in examples])
    max_word_length = min(max(max(len(word) for word in e.chars)
                              for e in examples),
                          self._config.max_word_length)
    characters = [[[embeddings.PAD] + [embeddings.START] +
                   w[:max_word_length] + [embeddings.END] + [embeddings.PAD]
                   for w in e.chars] for e in examples]
    # the first and last words are masked because they are start/end tokens
    mask = build_array([[0] + [1] * (length - 2) + [0]
                        for length in sentence_lengths])
    words = build_array([e.words for e in examples])
    chars = build_array(characters, dtype='int16')
    return Minibatch(
        task_name=self.task_name,
        size=ids.size,
        examples=examples,
        ids=ids,
        teacher_predictions={},
        words=words,
        chars=chars,
        lengths=sentence_lengths,
        mask=mask,
    )


Minibatch = collections.namedtuple('Minibatch', [
    'task_name', 'size', 'examples', 'ids', 'teacher_predictions',
    'words', 'chars', 'lengths', 'mask'])
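build_array zero-pads ragged nested lists to the maximum size at each depth, which is why the PAD token must map to id 0. A small sketch of the padding:

build_array([[1, 2, 3], [4, 5]])
# -> array([[1, 2, 3],
#           [4, 5, 0]], dtype=int32)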
research/cvt_text/corpus_processing/scorer.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Abstract base class for evaluation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc


class Scorer(object):
  __metaclass__ = abc.ABCMeta

  def __init__(self):
    self._updated = False
    self._cached_results = {}

  @abc.abstractmethod
  def update(self, examples, predictions, loss):
    self._updated = True

  @abc.abstractmethod
  def get_loss(self):
    pass

  @abc.abstractmethod
  def _get_results(self):
    return []

  def get_results(self, prefix=""):
    results = self._get_results() if self._updated else self._cached_results
    self._cached_results = results
    self._updated = False
    return [(prefix + k, v) for k, v in results]

  def results_str(self):
    return " - ".join(["{:}: {:.2f}".format(k, v)
                       for k, v in self.get_results()])
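A concrete subclass only needs to accumulate statistics in update and emit (name, value) pairs from _get_results; get_results handles caching. A hypothetical accuracy scorer, for illustration only:

class ToyAccuracyScorer(Scorer):
  """Hypothetical example subclass; not part of this commit."""

  def __init__(self):
    super(ToyAccuracyScorer, self).__init__()
    self._correct, self._total, self._loss = 0, 0, 0.0

  def update(self, examples, predictions, loss):
    super(ToyAccuracyScorer, self).update(examples, predictions, loss)
    self._correct += sum(int(y == p) for y, p in zip(examples, predictions))
    self._total += len(examples)
    self._loss += loss

  def get_loss(self):
    return self._loss

  def _get_results(self):
    return [('accuracy', 100.0 * self._correct / max(1, self._total))]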
research/cvt_text/corpus_processing/unlabeled_data.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Reads data from a large unlabeled corpus."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

import tensorflow as tf

from base import embeddings
from corpus_processing import example
from corpus_processing import minibatching


class UnlabeledDataReader(object):
  def __init__(self, config, starting_file=0, starting_line=0,
               one_pass=False):
    self.config = config
    self.current_file = starting_file
    self.current_line = starting_line
    self._one_pass = one_pass

  def endless_minibatches(self):
    for examples in self.get_unlabeled_examples():
      d = minibatching.Dataset(self.config, examples, 'unlabeled')
      for mb in d.get_minibatches(self.config.train_batch_size):
        yield mb

  def _make_examples(self, sentences):
    word_vocab = embeddings.get_word_vocab(self.config)
    char_vocab = embeddings.get_char_vocab()
    return [example.Example(sentence, word_vocab, char_vocab)
            for sentence in sentences]

  def get_unlabeled_examples(self):
    lines = []
    for words in self.get_unlabeled_sentences():
      lines.append(words)
      if len(lines) >= 10000:
        yield self._make_examples(lines)
        lines = []

  def get_unlabeled_sentences(self):
    while True:
      file_ids_and_names = sorted([
          (int(fname.split('-')[1].replace('.txt', '')), fname)
          for fname in tf.gfile.ListDirectory(self.config.unsupervised_data)])
      for fid, fname in file_ids_and_names:
        if fid < self.current_file:
          continue
        self.current_file = fid
        self.current_line = 0
        with tf.gfile.FastGFile(
            os.path.join(self.config.unsupervised_data, fname), 'r') as f:
          for i, line in enumerate(f):
            if i < self.current_line:
              continue
            self.current_line = i
            words = line.strip().split()
            if len(words) < self.config.max_sentence_length:
              yield words
      self.current_file = 0
      self.current_line = 0
      if self._one_pass:
        break
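The reader tracks (current_file, current_line) so training can resume mid-corpus after a restart. The shard-id parse assumes lm1b-style names such as 'news.en-00050-of-00100' (an assumption based on the corpus fetched by fetch_data.sh):

fname = 'news.en-00050-of-00100'                    # assumed shard name
fid = int(fname.split('-')[1].replace('.txt', ''))  # -> 50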
research/cvt_text/cvt.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Run training and evaluation for CVT text models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from base import configure
from base import utils
from training import trainer
from training import training_progress


FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('mode', 'train', '"train" or "eval"')
tf.app.flags.DEFINE_string('model_name', 'default_model',
                           'A name identifying the model being '
                           'trained/evaluated')


def main():
  utils.heading('SETUP')
  config = configure.Config(mode=FLAGS.mode, model_name=FLAGS.model_name)
  config.write()
  with tf.Graph().as_default() as graph:
    model_trainer = trainer.Trainer(config)
    summary_writer = tf.summary.FileWriter(config.summaries_dir)
    checkpoints_saver = tf.train.Saver(max_to_keep=1)
    best_model_saver = tf.train.Saver(max_to_keep=1)
    init_op = tf.global_variables_initializer()
    graph.finalize()
    with tf.Session() as sess:
      sess.run(init_op)
      progress = training_progress.TrainingProgress(
          config, sess, checkpoints_saver, best_model_saver,
          config.mode == 'train')
      utils.log()
      if config.mode == 'train':
        utils.heading('START TRAINING ({:})'.format(config.model_name))
        model_trainer.train(sess, progress, summary_writer)
      elif config.mode == 'eval':
        utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
        progress.best_model_saver.restore(sess, tf.train.latest_checkpoint(
            config.checkpoints_dir))
        model_trainer.evaluate_all_tasks(sess, summary_writer, None)
      else:
        raise ValueError('Mode must be "train" or "eval"')


if __name__ == '__main__':
  main()
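Given the two flags defined above, a plausible launch sequence would be the following (a sketch; 'chunking_model' is a hypothetical name):

python preprocessing.py
python cvt.py --mode=train --model_name=chunking_model
python cvt.py --mode=eval --model_name=chunking_model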
research/cvt_text/fetch_data.sh (new file, mode 100755)
#!/bin/bash
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
TOPDIR='./data'
RUNDIR=${PWD}

mkdir -p ${TOPDIR}
cd ${TOPDIR}
mkdir -p raw_data
mkdir -p raw_data/pretrained_embeddings
mkdir -p raw_data/unlabeled_data
mkdir -p raw_data/chunk
cd ${RUNDIR}

echo "Preparing GloVe embeddings"
cd "${TOPDIR}/raw_data/pretrained_embeddings"
curl -OL http://nlp.stanford.edu/data/glove.6B.zip
unzip glove.6B.zip
cd ${RUNDIR}

echo
echo "Preparing lm1b corpus"
cd "${TOPDIR}/raw_data/unlabeled_data"
curl -OL http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar xzf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
cd ${RUNDIR}

echo
echo "Preparing chunking corpus"
cd "${TOPDIR}/raw_data/chunk"
curl -OL https://www.clips.uantwerpen.be/conll2000/chunking/train.txt.gz
curl -OL http://www.clips.uantwerpen.be/conll2000/chunking/test.txt.gz
gunzip *
cd ${RUNDIR}

echo
echo "Done with data fetching!"
research/cvt_text/model/__init__.py (new empty file, mode 100644)
research/cvt_text/model/encoder.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""CNN-BiLSTM sentence encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from base import embeddings
from model import model_helpers


class Encoder(object):
  def __init__(self, config, inputs, pretrained_embeddings):
    self._config = config
    self._inputs = inputs

    self.word_reprs = self._get_word_reprs(pretrained_embeddings)
    self.uni_fw, self.uni_bw = self._get_unidirectional_reprs(self.word_reprs)
    self.uni_reprs = tf.concat([self.uni_fw, self.uni_bw], axis=-1)
    self.bi_fw, self.bi_bw, self.bi_reprs = self._get_bidirectional_reprs(
        self.uni_reprs)

  def _get_word_reprs(self, pretrained_embeddings):
    with tf.variable_scope('word_embeddings'):
      word_embedding_matrix = tf.get_variable(
          'word_embedding_matrix', initializer=pretrained_embeddings)
      word_embeddings = tf.nn.embedding_lookup(word_embedding_matrix,
                                               self._inputs.words)
      word_embeddings = tf.nn.dropout(word_embeddings, self._inputs.keep_prob)
      word_embeddings *= tf.get_variable('emb_scale', initializer=1.0)

    if not self._config.use_chars:
      return word_embeddings

    with tf.variable_scope('char_embeddings'):
      char_embedding_matrix = tf.get_variable(
          'char_embeddings',
          shape=[embeddings.NUM_CHARS, self._config.char_embedding_size])
      char_embeddings = tf.nn.embedding_lookup(char_embedding_matrix,
                                               self._inputs.chars)
      shape = tf.shape(char_embeddings)
      char_embeddings = tf.reshape(
          char_embeddings,
          shape=[-1, shape[-2], self._config.char_embedding_size])
      char_reprs = []
      for filter_width in self._config.char_cnn_filter_widths:
        conv = tf.layers.conv1d(
            char_embeddings, self._config.char_cnn_n_filters, filter_width)
        conv = tf.nn.relu(conv)
        conv = tf.nn.dropout(tf.reduce_max(conv, axis=1),
                             self._inputs.keep_prob)
        conv = tf.reshape(conv, shape=[-1, shape[1],
                                       self._config.char_cnn_n_filters])
        char_reprs.append(conv)
      return tf.concat([word_embeddings] + char_reprs, axis=-1)

  def _get_unidirectional_reprs(self, word_reprs):
    with tf.variable_scope('unidirectional_reprs'):
      word_lstm_input_size = (
          self._config.word_embedding_size if not self._config.use_chars
          else (self._config.word_embedding_size +
                len(self._config.char_cnn_filter_widths) *
                self._config.char_cnn_n_filters))
      word_reprs.set_shape([None, None, word_lstm_input_size])
      (outputs_fw, outputs_bw), _ = tf.nn.bidirectional_dynamic_rnn(
          model_helpers.multi_lstm_cell(self._config.unidirectional_sizes,
                                        self._inputs.keep_prob,
                                        self._config.projection_size),
          model_helpers.multi_lstm_cell(self._config.unidirectional_sizes,
                                        self._inputs.keep_prob,
                                        self._config.projection_size),
          word_reprs,
          dtype=tf.float32,
          sequence_length=self._inputs.lengths,
          scope='unilstm')
      return outputs_fw, outputs_bw

  def _get_bidirectional_reprs(self, uni_reprs):
    with tf.variable_scope('bidirectional_reprs'):
      current_outputs = uni_reprs
      outputs_fw, outputs_bw = None, None
      for size in self._config.bidirectional_sizes:
        (outputs_fw, outputs_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            model_helpers.lstm_cell(size, self._inputs.keep_prob,
                                    self._config.projection_size),
            model_helpers.lstm_cell(size, self._inputs.keep_prob,
                                    self._config.projection_size),
            current_outputs,
            dtype=tf.float32,
            sequence_length=self._inputs.lengths,
            scope='bilstm')
        current_outputs = tf.concat([outputs_fw, outputs_bw], axis=-1)
      return outputs_fw, outputs_bw, current_outputs
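Under the default Config, the set_shape call pins the per-token input width of the first Bi-LSTM to the word embedding plus one max-pooled feature map per CNN filter width:

# 300-d GloVe vector + 3 filter widths * 100 filters each = 600 features
word_lstm_input_size = 300 + len([2, 3, 4]) * 100  # -> 600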
research/cvt_text/model/model_helpers.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for building the model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


def project(input_layers, size, name='projection'):
  return tf.add_n([tf.layers.dense(layer, size, name=name + '_' + str(i))
                   for i, layer in enumerate(input_layers)])


def lstm_cell(cell_size, keep_prob, num_proj):
  return tf.contrib.rnn.DropoutWrapper(
      tf.contrib.rnn.LSTMCell(cell_size, num_proj=min(cell_size, num_proj)),
      output_keep_prob=keep_prob)


def multi_lstm_cell(cell_sizes, keep_prob, num_proj):
  return tf.contrib.rnn.MultiRNNCell([lstm_cell(cell_size, keep_prob,
                                                num_proj)
                                      for cell_size in cell_sizes])


def masked_ce_loss(logits, labels, mask, sparse=False, roll_direction=0):
  if roll_direction != 0:
    labels = _roll(labels, roll_direction, sparse)
    mask *= _roll(mask, roll_direction, True)
  ce = ((tf.nn.sparse_softmax_cross_entropy_with_logits if sparse
         else tf.nn.softmax_cross_entropy_with_logits_v2)
        (logits=logits, labels=labels))
  return tf.reduce_sum(mask * ce) / tf.to_float(tf.reduce_sum(mask))


def _roll(arr, direction, sparse=False):
  if sparse:
    return tf.concat([arr[:, direction:], arr[:, :direction]], axis=1)
  return tf.concat([arr[:, direction:, :], arr[:, :direction, :]], axis=1)
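_roll rotates each sequence left by `direction` positions, so with roll_direction=1 a prediction at position t is scored against the label at t+1, and multiplying the mask by its rolled copy zeroes out the wrap-around position. A NumPy rendering of the same shift:

import numpy as np
labels = np.array([[1, 2, 3, 4]])
direction = 1
rolled = np.concatenate(
    [labels[:, direction:], labels[:, :direction]], axis=1)
print(rolled)  # [[2 3 4 1]]: each position now holds the next label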
research/cvt_text/model/multitask_model.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A multi-task and semi-supervised NLP model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from model import encoder
from model import shared_inputs


class Inference(object):
  def __init__(self, config, inputs, pretrained_embeddings, tasks):
    with tf.variable_scope('encoder'):
      self.encoder = encoder.Encoder(config, inputs, pretrained_embeddings)
    self.modules = {}
    for task in tasks:
      with tf.variable_scope(task.name):
        self.modules[task.name] = task.get_module(inputs, self.encoder)


class Model(object):
  def __init__(self, config, pretrained_embeddings, tasks):
    self._config = config
    self._tasks = tasks

    self._global_step, self._optimizer = self._get_optimizer()
    self._inputs = shared_inputs.Inputs(config)
    with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as scope:
      inference = Inference(config, self._inputs, pretrained_embeddings,
                            tasks)
      self._trainer = inference
      self._tester = inference
      self._teacher = inference
      if config.ema_test or config.ema_teacher:
        ema = tf.train.ExponentialMovingAverage(config.ema_decay)
        model_vars = tf.get_collection("trainable_variables", "model")
        ema_op = ema.apply(model_vars)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)

        def ema_getter(getter, name, *args, **kwargs):
          var = getter(name, *args, **kwargs)
          return ema.average(var)

        scope.set_custom_getter(ema_getter)
        inference_ema = Inference(config, self._inputs,
                                  pretrained_embeddings, tasks)
        if config.ema_teacher:
          self._teacher = inference_ema
        if config.ema_test:
          self._tester = inference_ema

      self._unlabeled_loss = self._get_consistency_loss(tasks)
      self._unlabeled_train_op = self._get_train_op(self._unlabeled_loss)
      self._labeled_train_ops = {}
      for task in self._tasks:
        task_loss = self._trainer.modules[task.name].supervised_loss
        self._labeled_train_ops[task.name] = self._get_train_op(task_loss)

  def _get_consistency_loss(self, tasks):
    return sum([self._trainer.modules[task.name].unsupervised_loss
                for task in tasks])

  def _get_optimizer(self):
    global_step = tf.get_variable('global_step', initializer=0,
                                  trainable=False)
    warm_up_multiplier = (tf.minimum(tf.to_float(global_step),
                                     self._config.warm_up_steps) /
                          self._config.warm_up_steps)
    decay_multiplier = 1.0 / (1 + self._config.lr_decay *
                              tf.sqrt(tf.to_float(global_step)))
    lr = self._config.lr * warm_up_multiplier * decay_multiplier
    optimizer = tf.train.MomentumOptimizer(lr, self._config.momentum)
    return global_step, optimizer

  def _get_train_op(self, loss):
    grads, vs = zip(*self._optimizer.compute_gradients(loss))
    grads, _ = tf.clip_by_global_norm(grads, self._config.grad_clip)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      return self._optimizer.apply_gradients(
          zip(grads, vs), global_step=self._global_step)

  def _create_feed_dict(self, mb, model, is_training=True):
    feed = self._inputs.create_feed_dict(mb, is_training)
    if mb.task_name in model.modules:
      model.modules[mb.task_name].update_feed_dict(feed, mb)
    else:
      for module in model.modules.values():
        module.update_feed_dict(feed, mb)
    return feed

  def train_unlabeled(self, sess, mb):
    return sess.run([self._unlabeled_train_op, self._unlabeled_loss],
                    feed_dict=self._create_feed_dict(mb, self._trainer))[1]

  def train_labeled(self, sess, mb):
    return sess.run([self._labeled_train_ops[mb.task_name],
                     self._trainer.modules[mb.task_name].supervised_loss],
                    feed_dict=self._create_feed_dict(mb, self._trainer))[1]

  def run_teacher(self, sess, mb):
    result = sess.run({task.name: self._teacher.modules[task.name].probs
                       for task in self._tasks},
                      feed_dict=self._create_feed_dict(mb, self._teacher,
                                                       False))
    for task_name, probs in result.iteritems():
      mb.teacher_predictions[task_name] = probs.astype('float16')

  def test(self, sess, mb):
    return sess.run(
        [self._tester.modules[mb.task_name].supervised_loss,
         self._tester.modules[mb.task_name].preds],
        feed_dict=self._create_feed_dict(mb, self._tester, False))

  def get_global_step(self, sess):
    return sess.run(self._global_step)
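The EMA trick above builds a second Inference under a custom getter that resolves every variable to its exponential-moving-average shadow, so the teacher and tester read averaged weights without extra copy ops. The pattern in isolation (a TF 1.x sketch, not part of this diff):

with tf.variable_scope('m'):
  v = tf.get_variable('v', initializer=1.0)
ema = tf.train.ExponentialMovingAverage(0.998)
ema_op = ema.apply([v])  # creates and updates the shadow variable

def ema_getter(getter, name, *args, **kwargs):
  return ema.average(getter(name, *args, **kwargs))

with tf.variable_scope('m', reuse=True, custom_getter=ema_getter):
  v_avg = tf.get_variable('v')  # resolves to the EMA shadow of v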
research/cvt_text/model/shared_inputs.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Placeholders for non-task-specific model inputs."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


class Inputs(object):
  def __init__(self, config):
    self._config = config
    self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    self.label_smoothing = tf.placeholder(tf.float32, name='label_smoothing')
    self.lengths = tf.placeholder(tf.int32, shape=[None], name='lengths')
    self.mask = tf.placeholder(tf.float32, [None, None], name='mask')
    self.words = tf.placeholder(tf.int32, shape=[None, None], name='words')
    self.chars = tf.placeholder(tf.int32, shape=[None, None, None],
                                name='chars')

  def create_feed_dict(self, mb, is_training):
    cvt = mb.task_name == 'unlabeled'
    return {
        self.keep_prob: 1.0 if not is_training else
                        (self._config.unlabeled_keep_prob if cvt else
                         self._config.labeled_keep_prob),
        self.label_smoothing: self._config.label_smoothing
                              if (is_training and not cvt) else 0.0,
        self.lengths: mb.lengths,
        self.words: mb.words,
        self.chars: mb.chars,
        self.mask: mb.mask.astype('float32')
    }
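create_feed_dict centralizes the dropout and label-smoothing schedule; with the defaults from configure.py the values work out as follows (sketch; `mb` is a Minibatch from minibatching.py):

inputs = Inputs(config)
feed = inputs.create_feed_dict(mb, is_training=True)
# keep_prob: 1.0 at test time, 0.8 (unlabeled_keep_prob) for CVT batches,
# 0.5 (labeled_keep_prob) for supervised batches; label smoothing is only
# applied on supervised training batches.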
research/cvt_text/model/task_module.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base classes for task-specific modules."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc


class SupervisedModule(object):
  __metaclass__ = abc.ABCMeta

  def __init__(self):
    self.supervised_loss = NotImplemented
    self.probs = NotImplemented
    self.preds = NotImplemented

  @abc.abstractmethod
  def update_feed_dict(self, feed, mb):
    pass


class SemiSupervisedModule(SupervisedModule):
  __metaclass__ = abc.ABCMeta

  def __init__(self):
    super(SemiSupervisedModule, self).__init__()
    self.unsupervised_loss = NotImplemented
research/cvt_text/preprocessing.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Preprocesses pretrained word embeddings, creates dev sets for tasks without a
provided one, and figures out the set of output classes for each task.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random

from base import configure
from base import embeddings
from base import utils
from task_specific.word_level import word_level_data


def main(data_dir='./data'):
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for pretrained in ['glove.6B.300d.txt']:
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True,
                              pretrained_embeddings=pretrained,
                              word_embedding_size=300)
    embeddings.PretrainedEmbeddingLoader(config).build()

  utils.log("CONSTRUCTING DEV SETS")
  for task_name in ["chunk"]:
    # chunking does not come with a provided dev split, so create one by
    # selecting a random subset of the data
    config = configure.Config(data_dir=data_dir, for_preprocessing=True)
    task_data_dir = os.path.join(config.raw_data_topdir, task_name) + '/'
    train_sentences = word_level_data.TaggedDataLoader(
        config, task_name, False).get_labeled_sentences("train")
    random.shuffle(train_sentences)
    write_sentences(task_data_dir + 'train_subset.txt',
                    train_sentences[1500:])
    write_sentences(task_data_dir + 'dev.txt', train_sentences[:1500])

  utils.log("WRITING LABEL MAPPINGS")
  for task_name in ["chunk"]:
    for i, label_encoding in enumerate(["BIOES"]):
      config = configure.Config(data_dir=data_dir,
                                for_preprocessing=True,
                                label_encoding=label_encoding)
      token_level = task_name in ["ccg", "pos", "depparse"]
      loader = word_level_data.TaggedDataLoader(config, task_name,
                                                token_level)
      if token_level:
        if i != 0:
          continue
        utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
      else:
        utils.log("  Writing label mapping for", task_name.upper(),
                  label_encoding)
      utils.log("  ", len(loader.label_mapping), "classes")
      utils.write_cpickle(loader.label_mapping, loader.label_mapping_path)


def write_sentences(fname, sentences):
  with open(fname, 'w') as f:
    for words, tags in sentences:
      for word, tag in zip(words, tags):
        f.write(word + " " + tag + "\n")
      f.write("\n")


if __name__ == '__main__':
  main()
research/cvt_text/task_specific/__init__.py (new empty file, mode 100644)
research/cvt_text/task_specific/task_definitions.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines all the tasks the model can learn."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc

from base import embeddings
from task_specific.word_level import depparse_module
from task_specific.word_level import depparse_scorer
from task_specific.word_level import tagging_module
from task_specific.word_level import tagging_scorers
from task_specific.word_level import word_level_data


class Task(object):
  __metaclass__ = abc.ABCMeta

  def __init__(self, config, name, loader):
    self.config = config
    self.name = name
    self.loader = loader
    self.train_set = self.loader.get_dataset("train")
    self.val_set = self.loader.get_dataset(
        "dev" if config.dev_set else "test")

  @abc.abstractmethod
  def get_module(self, inputs, encoder):
    pass

  @abc.abstractmethod
  def get_scorer(self):
    pass


class Tagging(Task):
  def __init__(self, config, name, is_token_level=True):
    super(Tagging, self).__init__(
        config, name,
        word_level_data.TaggedDataLoader(config, name, is_token_level))
    self.n_classes = len(set(self.loader.label_mapping.values()))
    self.is_token_level = is_token_level

  def get_module(self, inputs, encoder):
    return tagging_module.TaggingModule(
        self.config, self.name, self.n_classes, inputs, encoder)

  def get_scorer(self):
    if self.is_token_level:
      return tagging_scorers.AccuracyScorer()
    else:
      return tagging_scorers.EntityLevelF1Scorer(self.loader.label_mapping)


class DependencyParsing(Tagging):
  def __init__(self, config, name):
    super(DependencyParsing, self).__init__(config, name, True)

  def get_module(self, inputs, encoder):
    return depparse_module.DepparseModule(
        self.config, self.name, self.n_classes, inputs, encoder)

  def get_scorer(self):
    return depparse_scorer.DepparseScorer(
        self.n_classes, (embeddings.get_punctuation_ids(self.config)))


def get_task(config, name):
  if name in ["ccg", "pos"]:
    return Tagging(config, name, True)
  elif name in ["chunk", "ner", "er"]:
    return Tagging(config, name, False)
  elif name == "depparse":
    return DependencyParsing(config, name)
  else:
    raise ValueError("Unknown task", name)
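get_task is the factory that presumably turns config.task_names into live task objects elsewhere in the codebase (the trainer is not on this page of the diff); a sketch:

tasks = [get_task(config, name) for name in config.task_names]
for task in tasks:
  scorer = task.get_scorer()  # token-level accuracy or entity-level F1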
research/cvt_text/task_specific/word_level/__init__.py (new empty file, mode 100644)
(Diff page 1 of 12; the remaining changed files are on later pages.)