Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
f5fc733a
Commit
f5fc733a
authored
Feb 03, 2022
by
Byzantine
Browse files
Removing research/community models
parent
09bc9f54
Changes
326
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
0 additions
and
486 deletions
+0
-486
research/cvt_text/task_specific/word_level/tagging_utils.py
research/cvt_text/task_specific/word_level/tagging_utils.py
+0
-59
research/cvt_text/task_specific/word_level/word_level_data.py
...arch/cvt_text/task_specific/word_level/word_level_data.py
+0
-161
research/cvt_text/task_specific/word_level/word_level_scorer.py
...ch/cvt_text/task_specific/word_level/word_level_scorer.py
+0
-48
research/cvt_text/training/__init__.py
research/cvt_text/training/__init__.py
+0
-0
research/cvt_text/training/trainer.py
research/cvt_text/training/trainer.py
+0
-139
research/cvt_text/training/training_progress.py
research/cvt_text/training/training_progress.py
+0
-79
No files found.
Too many changes to show.
To preserve performance only
326 of 326+
files are displayed.
Plain diff
Email patch
research/cvt_text/task_specific/word_level/tagging_utils.py
deleted
100644 → 0
View file @
09bc9f54
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for sequence tagging tasks for entity-level tasks (e.g., NER)."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
def get_span_labels(sentence_tags, inv_label_mapping=None):
  """Go from token-level labels to a list of entities (start, end, class).

  Args:
    sentence_tags: list of per-token tags such as 'B-PER', 'I-PER', 'O'
      (or label ids if inv_label_mapping is provided).
    inv_label_mapping: optional dict mapping each element of sentence_tags
      to its string tag; applied before decoding.

  Returns:
    List of (start_index, end_index, class) tuples; indices are inclusive.
  """
  if inv_label_mapping:
    sentence_tags = [inv_label_mapping[i] for i in sentence_tags]
  if not sentence_tags:
    # Fix: the original indexed sentence_tags[-1] unconditionally below,
    # which raised IndexError for an empty sentence.
    return []
  span_labels = []
  last = 'O'
  start = -1
  for i, tag in enumerate(sentence_tags):
    pos, _ = (None, 'O') if tag == 'O' else tag.split('-')
    # A span ends just before an 'O' token or an explicit span start (B/S).
    if (pos == 'S' or pos == 'B' or tag == 'O') and last != 'O':
      span_labels.append((start, i - 1, last.split('-')[-1]))
    # A span starts at a B/S tag, or at any tag that follows an 'O'.
    if pos == 'B' or pos == 'S' or last == 'O':
      start = i
    last = tag
  # Close the span that is still open at the end of the sentence.
  if sentence_tags[-1] != 'O':
    span_labels.append((start, len(sentence_tags) - 1,
                        sentence_tags[-1].split('-')[-1]))
  return span_labels
def get_tags(span_labels, length, encoding):
  """Inverse of get_span_labels: expand (start, end, class) entity spans
  into a per-token tag sequence under the given encoding (e.g., BIOES).
  """
  tags = ['O'] * length
  for start, end, label in span_labels:
    # Fill the whole span with inside tags, then overwrite the boundary
    # positions according to which letters the encoding supports.
    tags[start:end + 1] = ['I-' + label] * (end + 1 - start)
    if 'E' in encoding:
      tags[end] = 'E-' + label
    if 'B' in encoding:
      tags[start] = 'B-' + label
    if 'S' in encoding and start == end:
      tags[start] = 'S-' + label
  return tags
research/cvt_text/task_specific/word_level/word_level_data.py
deleted
100644 → 0
View file @
09bc9f54
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for processing word-level datasets."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
import
os
import
random
import
tensorflow
as
tf
from
base
import
embeddings
from
base
import
utils
from
corpus_processing
import
example
from
corpus_processing
import
minibatching
from
task_specific.word_level
import
tagging_utils
class TaggedDataLoader(object):
  """Loads a word-level tagging dataset (CoNLL-style files) for one task.

  Handles reading raw sentence/tag pairs, building or loading the
  tag -> integer-id label mapping, and constructing TaggingExamples.
  """

  def __init__(self, config, name, is_token_level):
    # is_token_level: True when every token is labeled independently;
    # False for span-based tasks, whose tags are re-encoded with
    # config.label_encoding (e.g. BIOES) and therefore get a separate,
    # encoding-specific label mapping file.
    self._config = config
    self._task_name = name
    self._raw_data_path = os.path.join(config.raw_data_topdir, name)
    self._is_token_level = is_token_level
    self.label_mapping_path = os.path.join(
        config.preprocessed_data_topdir,
        (name if is_token_level else
         name + '_' + config.label_encoding) + '_label_mapping.pkl')

    if self.label_mapping:
      self._n_classes = len(set(self.label_mapping.values()))
    else:
      # During preprocessing the mapping may not exist yet.
      self._n_classes = None

  def get_dataset(self, split):
    """Returns a minibatching.Dataset for the given split name."""
    # Prefer a reduced 'train_subset' file for training when one exists
    # (only outside of preprocessing).
    if (split == 'train' and not self._config.for_preprocessing and
        tf.gfile.Exists(os.path.join(self._raw_data_path,
                                     'train_subset.txt'))):
      split = 'train_subset'
    return minibatching.Dataset(
        self._config, self._get_examples(split), self._task_name)

  def get_labeled_sentences(self, split):
    """Parses <split>.txt into a list of (words, tags) tuples.

    Expected file format (CoNLL-like): one token per line with the word in
    the first column and the tag in the last column; blank lines separate
    sentences; '-DOCSTART-' lines are skipped.
    """
    sentences = []
    path = os.path.join(self._raw_data_path, split + '.txt')
    if not tf.gfile.Exists(path):
      if self._config.for_preprocessing:
        # Missing splits are tolerated while preprocessing.
        return []
      else:
        raise ValueError('Unable to load data from', path)

    with tf.gfile.GFile(path, 'r') as f:
      sentence = []
      for line in f:
        line = line.strip().split()
        if not line:
          # Blank line: flush the sentence accumulated so far.
          if sentence:
            words, tags = zip(*sentence)
            sentences.append((words, tags))
            sentence = []
          continue
        if line[0] == '-DOCSTART-':
          continue
        word, tag = line[0], line[-1]
        sentence.append((word, tag))
    return sentences

  @property
  def label_mapping(self):
    """Dict mapping tag strings to integer class ids.

    Outside preprocessing the mapping is loaded from disk; during
    preprocessing it is built by counting tags over all splits.
    """
    if not self._config.for_preprocessing:
      return utils.load_cpickle(self.label_mapping_path)

    tag_counts = collections.Counter()
    train_tags = set()
    for split in ['train', 'dev', 'test']:
      for words, tags in self.get_labeled_sentences(split):
        if not self._is_token_level:
          # Re-encode span-based tags with the configured encoding so the
          # mapping matches what the model will actually see.
          span_labels = tagging_utils.get_span_labels(tags)
          tags = tagging_utils.get_tags(
              span_labels, len(words), self._config.label_encoding)
        for tag in tags:
          if self._task_name == 'depparse':
            # depparse tags look like '<head>-<relation>'; only the
            # relation part gets a class id here.
            tag = tag.split('-')[1]
          tag_counts[tag] += 1
          if split == 'train':
            train_tags.add(tag)
    if self._task_name == 'ccg':
      # for CCG, there are tags in the test sets that aren't in the train
      # set. All tags not in the train set get mapped to a special label;
      # the model will never predict this label because it never sees it in
      # the training set.
      not_in_train_tags = []
      for tag, count in tag_counts.items():
        if tag not in train_tags:
          not_in_train_tags.append(tag)
      label_mapping = {
          label: i for i, label in enumerate(sorted(filter(
              lambda t: t not in not_in_train_tags, tag_counts.keys())))
      }
      n = len(label_mapping)
      for tag in not_in_train_tags:
        label_mapping[tag] = n
    else:
      labels = sorted(tag_counts.keys())
      if self._task_name == 'depparse':
        # Ensure 'root' always gets id 0.
        labels.remove('root')
        labels.insert(0, 'root')
      label_mapping = {label: i for i, label in enumerate(labels)}
    return label_mapping

  def _get_examples(self, split):
    """Builds TaggingExamples for a split, optionally subsampling train."""
    word_vocab = embeddings.get_word_vocab(self._config)
    char_vocab = embeddings.get_char_vocab()
    examples = [TaggingExample(
        self._config, self._is_token_level, words, tags,
        word_vocab, char_vocab, self.label_mapping, self._task_name)
        for words, tags in self.get_labeled_sentences(split)]
    if self._config.train_set_percent < 100:
      utils.log('using reduced train set ({:}%)'.format(
          self._config.train_set_percent))
      # Shuffle before truncating so the subset is a random sample.
      random.shuffle(examples)
      examples = examples[:int(len(examples) *
                               self._config.train_set_percent / 100.0)]
    return examples
class TaggingExample(example.Example):
  """A single labeled sentence for a word-level tagging task."""

  def __init__(self, config, is_token_level, words, original_tags,
               word_vocab, char_vocab, label_mapping, task_name):
    super(TaggingExample, self).__init__(words, word_vocab, char_vocab)
    if is_token_level:
      labels = original_tags
    else:
      # Span-based task: re-encode the raw tags with the configured scheme
      # (e.g. BIOES) before mapping them to integer ids.
      spans = tagging_utils.get_span_labels(original_tags)
      labels = tagging_utils.get_tags(spans, len(words),
                                      config.label_encoding)

    if task_name != 'depparse':
      self.labels = [label_mapping[tag] for tag in labels]
    else:
      # depparse tags have the form '<head>-<relation>'; combine the head
      # position and relation id into one integer label. Head '0'
      # (presumably the root) maps to offset 0, head k to offset k + 1.
      self.labels = []
      for tag in labels:
        parts = tag.split('-')
        head_offset = 0 if parts[0] == '0' else 1 + int(parts[0])
        self.labels.append(
            len(label_mapping) * head_offset + label_mapping[parts[1]])
research/cvt_text/task_specific/word_level/word_level_scorer.py
deleted
100644 → 0
View file @
09bc9f54
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base class for word-level scorers."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
abc
from
corpus_processing
import
scorer
class WordLevelScorer(scorer.Scorer):
  """Base class for scorers operating on per-word predictions."""
  __metaclass__ = abc.ABCMeta

  def __init__(self):
    super(WordLevelScorer, self).__init__()
    self._total_loss = 0
    self._total_words = 0
    self._examples = []
    self._preds = []

  def update(self, examples, predictions, loss):
    """Accumulates a minibatch of examples, predictions, and its loss."""
    super(WordLevelScorer, self).update(examples, predictions, loss)
    n_words = 0
    for ex, ex_preds in zip(examples, predictions):
      self._examples.append(ex)
      # Drop the first and last predictions (presumably sentence-boundary
      # padding tokens -- confirm against the minibatch construction).
      clipped = list(ex_preds)[1:len(ex.words) - 1]
      self._preds.append(clipped)
      n_words += len(ex.words) - 2
    # Loss is weighted by the number of real words in the batch.
    self._total_loss += loss * n_words
    self._total_words += n_words

  def get_loss(self):
    """Returns the average per-word loss over everything seen so far."""
    return self._total_loss / max(1, self._total_words)
research/cvt_text/training/__init__.py
deleted
100644 → 0
View file @
09bc9f54
research/cvt_text/training/trainer.py
deleted
100644 → 0
View file @
09bc9f54
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs training for CVT text models."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
bisect
import
time
import
numpy
as
np
import
tensorflow
as
tf
from
base
import
utils
from
model
import
multitask_model
from
task_specific
import
task_definitions
class Trainer(object):
  """Runs multi-task (optionally semi-supervised) training for CVT models."""

  def __init__(self, config):
    self._config = config
    self.tasks = [task_definitions.get_task(self._config, task_name)
                  for task_name in self._config.task_names]

    utils.log('Loading Pretrained Embeddings')
    pretrained_embeddings = utils.load_cpickle(self._config.word_embeddings)

    utils.log('Building Model')
    self._model = multitask_model.Model(
        self._config, pretrained_embeddings, self.tasks)
    utils.log()

  def train(self, sess, progress, summary_writer):
    """Main training loop.

    Alternates labeled minibatches (and, when config.is_semisup, unlabeled
    ones) from _get_training_mbs, periodically logging losses, evaluating
    on dev/train, and checkpointing. Loops until the minibatch generator is
    exhausted.
    """
    heading = lambda s: utils.heading(s, '(' + self._config.model_name + ')')
    trained_on_sentences = 0
    start_time = time.time()
    unsupervised_loss_total, unsupervised_loss_count = 0, 0
    supervised_loss_total, supervised_loss_count = 0, 0
    for mb in self._get_training_mbs(progress.unlabeled_data_reader):
      if mb.task_name != 'unlabeled':
        loss = self._model.train_labeled(sess, mb)
        supervised_loss_total += loss
        supervised_loss_count += 1
      if mb.task_name == 'unlabeled':
        # Produce teacher predictions for the unlabeled batch, train on
        # them, then clear them to free memory.
        self._model.run_teacher(sess, mb)
        loss = self._model.train_unlabeled(sess, mb)
        unsupervised_loss_total += loss
        unsupervised_loss_count += 1
        mb.teacher_predictions.clear()

      trained_on_sentences += mb.size
      global_step = self._model.get_global_step(sess)

      if global_step % self._config.print_every == 0:
        utils.log('step {:} - '
                  'supervised loss: {:.2f} - '
                  'unsupervised loss: {:.2f} - '
                  '{:.1f} sentences per second'.format(
                      global_step,
                      supervised_loss_total / max(1, supervised_loss_count),
                      unsupervised_loss_total / max(
                          1, unsupervised_loss_count),
                      trained_on_sentences / (time.time() - start_time)))
        # Losses are reported per print interval, so reset the running sums.
        unsupervised_loss_total, unsupervised_loss_count = 0, 0
        supervised_loss_total, supervised_loss_count = 0, 0

      if global_step % self._config.eval_dev_every == 0:
        heading('EVAL ON DEV')
        self.evaluate_all_tasks(sess, summary_writer, progress.history)
        progress.save_if_best_dev_model(sess, global_step)
        utils.log()

      if global_step % self._config.eval_train_every == 0:
        heading('EVAL ON TRAIN')
        self.evaluate_all_tasks(sess, summary_writer, progress.history, True)
        utils.log()

      if global_step % self._config.save_model_every == 0:
        heading('CHECKPOINTING MODEL')
        progress.write(sess, global_step)
        utils.log()

  def evaluate_all_tasks(self, sess, summary_writer, history,
                         train_set=False):
    """Evaluates every task; appends the results (tagged with the current
    global step) to history and pickles history when it is provided.
    """
    for task in self.tasks:
      results = self._evaluate_task(sess, task, summary_writer, train_set)
      if history is not None:
        results.append(('step', self._model.get_global_step(sess)))
        history.append(results)
    if history is not None:
      utils.write_cpickle(history, self._config.history_file)

  def _evaluate_task(self, sess, task, summary_writer, train_set):
    """Scores one task over its train or validation set; returns the
    scorer's results list and writes TF summaries for it.
    """
    scorer = task.get_scorer()
    data = task.train_set if train_set else task.val_set
    for i, mb in enumerate(data.get_minibatches(
        self._config.test_batch_size)):
      loss, batch_preds = self._model.test(sess, mb)
      scorer.update(mb.examples, batch_preds, loss)
    results = scorer.get_results(
        task.name + ('_train_' if train_set else '_dev_'))
    utils.log(task.name.upper() + ': ' + scorer.results_str())
    write_summary(summary_writer, results,
                  global_step=self._model.get_global_step(sess))
    return results

  def _get_training_mbs(self, unlabeled_data_reader):
    """Endlessly yields training minibatches.

    Labeled datasets are sampled with probability proportional to the
    square root of their size; when semi-supervised, each labeled batch is
    followed by one unlabeled batch.
    """
    datasets = [task.train_set for task in self.tasks]
    weights = [np.sqrt(dataset.size) for dataset in datasets]
    thresholds = np.cumsum([w / np.sum(weights) for w in weights])

    labeled_mbs = [dataset.endless_minibatches(
        self._config.train_batch_size) for dataset in datasets]
    unlabeled_mbs = unlabeled_data_reader.endless_minibatches()
    while True:
      # Sample a dataset index from the sqrt-size distribution.
      dataset_ind = bisect.bisect(thresholds, np.random.random())
      yield next(labeled_mbs[dataset_ind])
      if self._config.is_semisup:
        yield next(unlabeled_mbs)
def write_summary(writer, results, global_step):
  """Writes f1/accuracy/loss entries from `results` as TF summary values.

  Args:
    writer: a tf.summary.FileWriter (or compatible object).
    results: iterable of (metric_name, value) pairs.
    global_step: step to associate with the summary values.
  """
  for key, value in results:
    # Only scalar quality/loss metrics are reported.
    reportable = 'f1' in key or 'acc' in key or 'loss' in key
    if not reportable:
      continue
    summary = tf.Summary(
        value=[tf.Summary.Value(tag=key, simple_value=value)])
    writer.add_summary(summary, global_step)
  writer.flush()
research/cvt_text/training/training_progress.py
deleted
100644 → 0
View file @
09bc9f54
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Tracks and saves training progress (models and other data such as the current
location in the lm1b corpus) for later reloading.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
tensorflow
as
tf
from
base
import
utils
from
corpus_processing
import
unlabeled_data
class TrainingProgress(object):
  """Tracks and saves training progress (model checkpoints, evaluation
  history, and the current position in the lm1b corpus) for reloading.
  """

  def __init__(self, config, sess, checkpoint_saver, best_model_saver,
               restore_if_possible=True):
    self.config = config
    self.checkpoint_saver = checkpoint_saver
    self.best_model_saver = best_model_saver

    tf.gfile.MakeDirs(config.checkpoints_dir)
    if restore_if_possible and tf.gfile.Exists(config.progress):
      # Resume: the progress pickle stores (history, file, line), letting
      # the unlabeled-data reader continue exactly where it left off.
      history, current_file, current_line = utils.load_cpickle(
          config.progress, memoized=False)
      self.history = history
      self.unlabeled_data_reader = unlabeled_data.UnlabeledDataReader(
          config, current_file, current_line)
      utils.log("Continuing from global step",
                dict(self.history[-1])["step"],
                "(lm1b file {:}, line {:})".format(
                    current_file, current_line))
      self.checkpoint_saver.restore(sess, tf.train.latest_checkpoint(
          self.config.checkpoints_dir))
    else:
      utils.log("No previous checkpoint found - starting from scratch")
      self.history = []
      self.unlabeled_data_reader = (
          unlabeled_data.UnlabeledDataReader(config))

  def write(self, sess, global_step):
    """Checkpoints the model and pickles (history, reader position)."""
    self.checkpoint_saver.save(sess, self.config.checkpoint,
                               global_step=global_step)
    utils.write_cpickle(
        (self.history, self.unlabeled_data_reader.current_file,
         self.unlabeled_data_reader.current_line),
        self.config.progress)

  def save_if_best_dev_model(self, sess, global_step):
    """Saves a best-model checkpoint if the most recent dev evaluation has
    the highest average f1/las/accuracy score seen so far.
    """
    best_avg_score = 0
    for i, results in enumerate(self.history):
      # Skip train-set evaluations; only dev results are compared.
      if any("train" in metric for metric, value in results):
        continue
      total, count = 0, 0
      for metric, value in results:
        if "f1" in metric or "las" in metric or "accuracy" in metric:
          total += value
          count += 1
      # NOTE(review): raises ZeroDivisionError if a dev result contains no
      # f1/las/accuracy metric -- presumably every scorer reports at least
      # one; confirm before relying on this.
      avg_score = total / count
      if avg_score >= best_avg_score:
        best_avg_score = avg_score
        # Only save when the *latest* evaluation is the new best.
        if i == len(self.history) - 1:
          utils.log("New best model! Saving...")
          self.best_model_saver.save(
              sess, self.config.best_model_checkpoint,
              global_step=global_step)
Prev
1
…
13
14
15
16
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment