"vscode:/vscode.git/clone" did not exist on "38800db36f99b816d24f080e0ba3acb2afe30532"
Commit f5fc733a authored by Byzantine's avatar Byzantine
Browse files

Removing research/community models

parent 09bc9f54
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for sequence tagging tasks for entity-level tasks (e.g., NER)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
def get_span_labels(sentence_tags, inv_label_mapping=None):
  """Go from token-level labels to list of entities (start, end, class).

  Args:
    sentence_tags: per-token tags for one sentence. Either tag strings of the
      form 'O' or '<position>-<class>' (positions handled here are 'B', 'S'
      and anything else, which continues the current span), or keys of
      `inv_label_mapping` when that is given.
    inv_label_mapping: optional mapping from the values in `sentence_tags` to
      tag strings.

  Returns:
    A list of (start, end, class) tuples with inclusive token indices, in
    left-to-right order. An empty sentence yields an empty list.
  """
  if inv_label_mapping:
    tags = [inv_label_mapping[i] for i in sentence_tags]
  else:
    tags = list(sentence_tags)
  # The final tags[-1] lookup below would raise IndexError on an empty
  # sentence; there are no spans to report in that case.
  if not tags:
    return []

  span_labels = []
  last = 'O'
  start = -1
  for i, tag in enumerate(tags):
    pos, _ = (None, 'O') if tag == 'O' else tag.split('-')
    # A new span ('B'/'S') or an 'O' closes whatever span was open.
    if (pos == 'S' or pos == 'B' or tag == 'O') and last != 'O':
      span_labels.append((start, i - 1, last.split('-')[-1]))
    # A span starts at an explicit 'B'/'S', or at any tag following an 'O'.
    if pos == 'B' or pos == 'S' or last == 'O':
      start = i
    last = tag
  # A span that runs to the end of the sentence is never closed in the loop.
  if tags[-1] != 'O':
    span_labels.append((start, len(tags) - 1, tags[-1].split('-')[-1]))
  return span_labels
def get_tags(span_labels, length, encoding):
  """Turns (start, end, class) entity spans back into per-token tags.

  Every token starts out as 'O'. Tokens covered by a span become
  'I-<class>'; the span boundaries are then overwritten depending on which of
  the letters 'E', 'B' and 'S' occur in `encoding` (e.g., 'BIOES').

  Args:
    span_labels: iterable of (start, end, class) with inclusive indices.
    length: number of tokens in the sentence.
    encoding: container of the tag letters in use, e.g. the string 'BIOES'.

  Returns:
    A list of `length` tag strings.
  """
  tags = ['O'] * length
  for first, final, kind in span_labels:
    for idx in range(first, final + 1):
      tags[idx] = 'I-' + kind
    # Later writes win: 'E' is applied before 'B', and 'S' (single-token
    # spans only) overrides both.
    for marker, position in (('E', final), ('B', first)):
      if marker in encoding:
        tags[position] = marker + '-' + kind
    if 'S' in encoding and first == final:
      tags[first] = 'S-' + kind
  return tags
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for processing word-level datasets."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import random
import tensorflow as tf
from base import embeddings
from base import utils
from corpus_processing import example
from corpus_processing import minibatching
from task_specific.word_level import tagging_utils
class TaggedDataLoader(object):
  """Loads word-level tagged sentences for one task and maps labels to ids."""

  def __init__(self, config, name, is_token_level):
    """Sets up paths and the class count for a task.

    Args:
      config: configuration object; this class reads `raw_data_topdir`,
        `preprocessed_data_topdir`, `label_encoding`, `for_preprocessing` and
        `train_set_percent` from it.
      name: task name; also the sub-directory of the raw data.
      is_token_level: if False, tags are converted to spans and re-encoded
        with `config.label_encoding` before use.
    """
    self._config = config
    self._task_name = name
    self._raw_data_path = os.path.join(config.raw_data_topdir, name)
    self._is_token_level = is_token_level
    self.label_mapping_path = os.path.join(
        config.preprocessed_data_topdir,
        (name if is_token_level else
         name + '_' + config.label_encoding) + '_label_mapping.pkl')

    # `label_mapping` is a property that may re-read every split, so
    # evaluate it only once here.
    label_mapping = self.label_mapping
    if label_mapping:
      self._n_classes = len(set(label_mapping.values()))
    else:
      self._n_classes = None

  def get_dataset(self, split):
    """Returns a `minibatching.Dataset` for `split`.

    When not preprocessing, 'train' is replaced by 'train_subset' if a
    'train_subset.txt' file exists in the raw data directory.
    """
    if (split == 'train' and not self._config.for_preprocessing and
        tf.gfile.Exists(os.path.join(self._raw_data_path, 'train_subset.txt'))):
      split = 'train_subset'
    return minibatching.Dataset(
        self._config, self._get_examples(split), self._task_name)

  def get_labeled_sentences(self, split):
    """Reads `<split>.txt` and returns a list of (words, tags) tuple pairs.

    Returns an empty list if the file is missing while preprocessing.

    Raises:
      ValueError: if the file is missing and we are not preprocessing.
    """
    path = os.path.join(self._raw_data_path, split + '.txt')
    if not tf.gfile.Exists(path):
      if self._config.for_preprocessing:
        return []
      raise ValueError('Unable to load data from ' + path)

    with tf.gfile.GFile(path, 'r') as f:
      return self._parse_sentences(f)

  @staticmethod
  def _parse_sentences(lines):
    """Groups whitespace-separated token lines into (words, tags) sentences.

    Blank lines separate sentences and lines whose first column is
    '-DOCSTART-' are skipped. The word is the first column and the tag the
    last column of each remaining line.
    """
    sentences = []
    sentence = []
    for line in lines:
      columns = line.strip().split()
      if not columns:
        if sentence:
          words, tags = zip(*sentence)
          sentences.append((words, tags))
          sentence = []
        continue
      if columns[0] == '-DOCSTART-':
        continue
      sentence.append((columns[0], columns[-1]))
    # Keep the last sentence even when the input lacks a trailing blank line.
    if sentence:
      words, tags = zip(*sentence)
      sentences.append((words, tags))
    return sentences

  @property
  def label_mapping(self):
    """Returns a dict from label string to integer id.

    Outside preprocessing the mapping is loaded from `label_mapping_path`.
    While preprocessing it is rebuilt from the train/dev/test files on every
    access, so callers should read it once and reuse the result.
    """
    if not self._config.for_preprocessing:
      return utils.load_cpickle(self.label_mapping_path)

    tag_counts = collections.Counter()
    train_tags = set()
    for split in ['train', 'dev', 'test']:
      for words, tags in self.get_labeled_sentences(split):
        if not self._is_token_level:
          span_labels = tagging_utils.get_span_labels(tags)
          tags = tagging_utils.get_tags(
              span_labels, len(words), self._config.label_encoding)
        for tag in tags:
          if self._task_name == 'depparse':
            # Only the part after the first '-' is used as the label here.
            tag = tag.split('-')[1]
          tag_counts[tag] += 1
          if split == 'train':
            train_tags.add(tag)

    if self._task_name == 'ccg':
      # for CCG, there are tags in the test sets that aren't in the train set
      # all tags not in the train set get mapped to a special label
      # the model will never predict this label because it never sees it in the
      # training set
      not_in_train_tags = [tag for tag in tag_counts if tag not in train_tags]
      label_mapping = {
          label: i for i, label in enumerate(
              sorted(tag for tag in tag_counts if tag in train_tags))
      }
      n = len(label_mapping)
      for tag in not_in_train_tags:
        label_mapping[tag] = n
    else:
      labels = sorted(tag_counts.keys())
      if self._task_name == 'depparse':
        # Force 'root' to id 0.
        labels.remove('root')
        labels.insert(0, 'root')
      label_mapping = {label: i for i, label in enumerate(labels)}
    return label_mapping

  def _get_examples(self, split):
    """Builds a `TaggingExample` for every sentence in `split`.

    If `config.train_set_percent` is below 100, the examples are shuffled and
    truncated to that percentage.
    """
    word_vocab = embeddings.get_word_vocab(self._config)
    char_vocab = embeddings.get_char_vocab()
    # Read the property once instead of once per example.
    label_mapping = self.label_mapping
    examples = [
        TaggingExample(
            self._config, self._is_token_level, words, tags,
            word_vocab, char_vocab, label_mapping, self._task_name)
        for words, tags in self.get_labeled_sentences(split)]
    # NOTE(review): the reduction below is applied to whatever `split` was
    # requested, not only to 'train' -- confirm this is intended.
    if self._config.train_set_percent < 100:
      utils.log('using reduced train set ({:}%)'.format(
          self._config.train_set_percent))
      random.shuffle(examples)
      examples = examples[:int(len(examples) *
                               self._config.train_set_percent / 100.0)]
    return examples
class TaggingExample(example.Example):
  """A sentence together with its per-token integer labels."""

  def __init__(self, config, is_token_level, words, original_tags,
               word_vocab, char_vocab, label_mapping, task_name):
    """Converts `original_tags` to label ids stored in `self.labels`.

    Span-level tasks are first re-encoded with `config.label_encoding`. For
    'depparse', each tag is split on '-' and the first two pieces are folded
    into a single id: the first piece selects a block of `len(label_mapping)`
    ids and the second piece is looked up in `label_mapping`.
    """
    super(TaggingExample, self).__init__(words, word_vocab, char_vocab)

    labels = original_tags
    if not is_token_level:
      spans = tagging_utils.get_span_labels(original_tags)
      labels = tagging_utils.get_tags(
          spans, len(words), config.label_encoding)

    if task_name != 'depparse':
      self.labels = [label_mapping[label] for label in labels]
      return

    self.labels = []
    for label in labels:
      pieces = label.split('-')
      # A first piece of '0' maps to block 0; any other value v maps to
      # block v + 1.
      block = 0 if pieces[0] == '0' else 1 + int(pieces[0])
      self.labels.append(
          len(label_mapping) * block + label_mapping[pieces[1]])
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base class for word-level scorers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import abc
from corpus_processing import scorer
class WordLevelScorer(scorer.Scorer):
  """Accumulates examples, trimmed predictions and a word-weighted loss."""

  __metaclass__ = abc.ABCMeta

  def __init__(self):
    super(WordLevelScorer, self).__init__()
    self._examples = []
    self._preds = []
    self._total_words = 0
    self._total_loss = 0

  def update(self, examples, predictions, loss):
    """Records one batch.

    For each example the first prediction and everything from index
    `len(example.words) - 1` onwards are dropped, and the example counts as
    `len(example.words) - 2` words (presumably to exclude sentence boundary
    markers -- confirm against `Example`). `loss` is weighted by the batch's
    word count.
    """
    super(WordLevelScorer, self).update(examples, predictions, loss)
    batch_words = 0
    for ex, ex_preds in zip(examples, predictions):
      sentence_len = len(ex.words)
      self._examples.append(ex)
      self._preds.append(list(ex_preds)[1:sentence_len - 1])
      batch_words += sentence_len - 2
    self._total_loss += loss * batch_words
    self._total_words += batch_words

  def get_loss(self):
    """Returns the accumulated loss divided by the word count (at least 1)."""
    denominator = max(1, self._total_words)
    return self._total_loss / denominator
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs training for CVT text models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import bisect
import time
import numpy as np
import tensorflow as tf
from base import utils
from model import multitask_model
from task_specific import task_definitions
class Trainer(object):
  """Builds the multi-task model and drives training and evaluation."""

  def __init__(self, config):
    """Creates the tasks, loads pretrained embeddings and builds the model.

    Args:
      config: configuration object; this class reads `task_names`,
        `word_embeddings`, `model_name`, `print_every`, `eval_dev_every`,
        `eval_train_every`, `save_model_every`, `train_batch_size`,
        `test_batch_size`, `is_semisup` and `history_file` from it.
    """
    self._config = config
    self.tasks = [task_definitions.get_task(self._config, task_name)
                  for task_name in self._config.task_names]

    utils.log('Loading Pretrained Embeddings')
    pretrained_embeddings = utils.load_cpickle(self._config.word_embeddings)

    utils.log('Building Model')
    self._model = multitask_model.Model(
        self._config, pretrained_embeddings, self.tasks)
    utils.log()

  def train(self, sess, progress, summary_writer):
    """Runs the training loop with periodic logging, evaluation and saving.

    The minibatch generator is endless, so this method does not return on its
    own.

    Args:
      sess: session passed through to the model and savers.
      progress: object providing `unlabeled_data_reader`, `history`,
        `save_if_best_dev_model` and `write`.
      summary_writer: writer passed to `write_summary` during evaluation.
    """
    def heading(s):
      return utils.heading(s, '(' + self._config.model_name + ')')

    trained_on_sentences = 0
    start_time = time.time()
    unsupervised_loss_total, unsupervised_loss_count = 0, 0
    supervised_loss_total, supervised_loss_count = 0, 0
    for mb in self._get_training_mbs(progress.unlabeled_data_reader):
      if mb.task_name != 'unlabeled':
        loss = self._model.train_labeled(sess, mb)
        supervised_loss_total += loss
        supervised_loss_count += 1
      else:
        # The teacher is run before the unlabeled update on the same batch.
        self._model.run_teacher(sess, mb)
        loss = self._model.train_unlabeled(sess, mb)
        unsupervised_loss_total += loss
        unsupervised_loss_count += 1
        mb.teacher_predictions.clear()

      trained_on_sentences += mb.size
      global_step = self._model.get_global_step(sess)

      if global_step % self._config.print_every == 0:
        utils.log('step {:} - '
                  'supervised loss: {:.2f} - '
                  'unsupervised loss: {:.2f} - '
                  '{:.1f} sentences per second'.format(
                      global_step,
                      supervised_loss_total / max(1, supervised_loss_count),
                      unsupervised_loss_total / max(1, unsupervised_loss_count),
                      trained_on_sentences / (time.time() - start_time)))
        # Reported losses are averages since the previous report.
        unsupervised_loss_total, unsupervised_loss_count = 0, 0
        supervised_loss_total, supervised_loss_count = 0, 0

      if global_step % self._config.eval_dev_every == 0:
        heading('EVAL ON DEV')
        self.evaluate_all_tasks(sess, summary_writer, progress.history)
        progress.save_if_best_dev_model(sess, global_step)
        utils.log()

      if global_step % self._config.eval_train_every == 0:
        heading('EVAL ON TRAIN')
        self.evaluate_all_tasks(sess, summary_writer, progress.history, True)
        utils.log()

      if global_step % self._config.save_model_every == 0:
        heading('CHECKPOINTING MODEL')
        progress.write(sess, global_step)
        utils.log()

  def evaluate_all_tasks(self, sess, summary_writer, history, train_set=False):
    """Evaluates every task and optionally records the results.

    When `history` is not None, each task's results (with a ('step', ...)
    entry appended) are added to it and the history is written to
    `config.history_file`.
    """
    for task in self.tasks:
      results = self._evaluate_task(sess, task, summary_writer, train_set)
      if history is not None:
        results.append(('step', self._model.get_global_step(sess)))
        history.append(results)
    if history is not None:
      utils.write_cpickle(history, self._config.history_file)

  def _evaluate_task(self, sess, task, summary_writer, train_set):
    """Scores one task on its train or validation set and logs a summary."""
    scorer = task.get_scorer()
    data = task.train_set if train_set else task.val_set
    for mb in data.get_minibatches(self._config.test_batch_size):
      loss, batch_preds = self._model.test(sess, mb)
      scorer.update(mb.examples, batch_preds, loss)

    results = scorer.get_results(task.name +
                                 ('_train_' if train_set else '_dev_'))
    utils.log(task.name.upper() + ': ' + scorer.results_str())
    write_summary(summary_writer, results,
                  global_step=self._model.get_global_step(sess))
    return results

  @staticmethod
  def _pick_dataset_index(thresholds, sample):
    """Maps `sample` to a dataset index using cumulative `thresholds`.

    The index is clamped to the last dataset: rounding in the cumulative sum
    can leave the final threshold slightly below 1.0, in which case a sample
    just under 1.0 would otherwise index past the end.
    """
    return min(bisect.bisect(thresholds, sample), len(thresholds) - 1)

  def _get_training_mbs(self, unlabeled_data_reader):
    """Yields training minibatches forever.

    Each iteration yields one labeled minibatch from a task sampled with
    probability proportional to the square root of its dataset size, followed
    by one unlabeled minibatch when `config.is_semisup` is set.
    """
    datasets = [task.train_set for task in self.tasks]
    weights = [np.sqrt(dataset.size) for dataset in datasets]
    total_weight = np.sum(weights)
    thresholds = np.cumsum([w / total_weight for w in weights])

    labeled_mbs = [dataset.endless_minibatches(self._config.train_batch_size)
                   for dataset in datasets]
    unlabeled_mbs = unlabeled_data_reader.endless_minibatches()
    while True:
      dataset_ind = self._pick_dataset_index(thresholds, np.random.random())
      yield next(labeled_mbs[dataset_ind])
      if self._config.is_semisup:
        yield next(unlabeled_mbs)
def write_summary(writer, results, global_step):
  """Writes selected metrics from `results` as scalar summaries.

  Only (name, value) pairs whose name contains 'f1', 'acc' or 'loss' are
  written. The writer is flushed once after all pairs were processed.
  """
  for name, value in results:
    if not ('f1' in name or 'acc' in name or 'loss' in name):
      continue
    scalar = tf.Summary.Value(tag=name, simple_value=value)
    writer.add_summary(tf.Summary(value=[scalar]), global_step)
  writer.flush()
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Tracks and saves training progress (models and other data such as the current
location in the lm1b corpus) for later reloading.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from base import utils
from corpus_processing import unlabeled_data
class TrainingProgress(object):
  """Holds evaluation history and the unlabeled-data position across runs."""

  def __init__(self, config, sess, checkpoint_saver, best_model_saver,
               restore_if_possible=True):
    """Restores previous progress if available, otherwise starts fresh.

    Args:
      config: configuration object; this class reads `checkpoints_dir`,
        `progress`, `checkpoint` and `best_model_checkpoint` from it.
      sess: session into which the latest checkpoint is restored.
      checkpoint_saver: saver used for regular checkpoints.
      best_model_saver: saver used for the best dev model.
      restore_if_possible: if False, any saved progress is ignored.
    """
    self.config = config
    self.checkpoint_saver = checkpoint_saver
    self.best_model_saver = best_model_saver

    tf.gfile.MakeDirs(config.checkpoints_dir)
    if restore_if_possible and tf.gfile.Exists(config.progress):
      history, current_file, current_line = utils.load_cpickle(
          config.progress, memoized=False)
      self.history = history
      self.unlabeled_data_reader = unlabeled_data.UnlabeledDataReader(
          config, current_file, current_line)
      # Progress can be saved before any evaluation was recorded, in which
      # case the history is empty and there is no step to report.
      last_step = dict(self.history[-1])["step"] if self.history else "unknown"
      utils.log("Continuing from global step", last_step,
                "(lm1b file {:}, line {:})".format(current_file, current_line))
      self.checkpoint_saver.restore(sess, tf.train.latest_checkpoint(
          self.config.checkpoints_dir))
    else:
      utils.log("No previous checkpoint found - starting from scratch")
      self.history = []
      self.unlabeled_data_reader = (
          unlabeled_data.UnlabeledDataReader(config))

  def write(self, sess, global_step):
    """Saves a model checkpoint and the (history, file, line) progress."""
    self.checkpoint_saver.save(sess, self.config.checkpoint,
                               global_step=global_step)
    utils.write_cpickle(
        (self.history, self.unlabeled_data_reader.current_file,
         self.unlabeled_data_reader.current_line),
        self.config.progress)

  def save_if_best_dev_model(self, sess, global_step):
    """Saves the model if the newest history entry has the best dev score.

    An entry's score is the mean of its metrics whose name contains 'f1',
    'las' or 'accuracy'. Entries containing a 'train' metric, and entries
    with none of those metrics, are ignored.
    """
    best_avg_score = 0
    for i, results in enumerate(self.history):
      if any("train" in metric for metric, value in results):
        continue
      total, count = 0, 0
      for metric, value in results:
        if "f1" in metric or "las" in metric or "accuracy" in metric:
          total += value
          count += 1
      # Without a scored metric there is nothing to average; skipping avoids
      # a ZeroDivisionError for such entries.
      if count == 0:
        continue
      avg_score = total / count
      if avg_score >= best_avg_score:
        best_avg_score = avg_score
        if i == len(self.history) - 1:
          utils.log("New best model! Saving...")
          self.best_model_saver.save(sess, self.config.best_model_checkpoint,
                                     global_step=global_step)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment