Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
f5fc733a
Commit
f5fc733a
authored
Feb 03, 2022
by
Byzantine
Browse files
Removing research/community models
parent
09bc9f54
Changes
326
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
0 additions
and
486 deletions
+0
-486
research/cvt_text/task_specific/word_level/tagging_utils.py
research/cvt_text/task_specific/word_level/tagging_utils.py
+0
-59
research/cvt_text/task_specific/word_level/word_level_data.py
...arch/cvt_text/task_specific/word_level/word_level_data.py
+0
-161
research/cvt_text/task_specific/word_level/word_level_scorer.py
...ch/cvt_text/task_specific/word_level/word_level_scorer.py
+0
-48
research/cvt_text/training/__init__.py
research/cvt_text/training/__init__.py
+0
-0
research/cvt_text/training/trainer.py
research/cvt_text/training/trainer.py
+0
-139
research/cvt_text/training/training_progress.py
research/cvt_text/training/training_progress.py
+0
-79
No files found.
Too many changes to show.
To preserve performance only
326 of 326+
files are displayed.
Plain diff
Email patch
research/cvt_text/task_specific/word_level/tagging_utils.py
deleted
100644 → 0
View file @
09bc9f54
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for sequence tagging tasks for entity-level tasks (e.g., NER)."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
def get_span_labels(sentence_tags, inv_label_mapping=None):
  """Go from token-level labels to a list of entities (start, end, class).

  Args:
    sentence_tags: list of per-token tags such as 'B-PER', 'I-PER', 'O'
      (or label ids if inv_label_mapping is provided).
    inv_label_mapping: optional dict mapping each element of sentence_tags
      to its string tag; applied before decoding.

  Returns:
    List of (start_index, end_index, class) tuples; indices are inclusive.
  """
  if inv_label_mapping:
    sentence_tags = [inv_label_mapping[i] for i in sentence_tags]
  if not sentence_tags:
    # Fix: the original indexed sentence_tags[-1] unconditionally below,
    # which raised IndexError for an empty sentence.
    return []
  span_labels = []
  last = 'O'
  start = -1
  for i, tag in enumerate(sentence_tags):
    pos, _ = (None, 'O') if tag == 'O' else tag.split('-')
    # A span ends just before an 'O' token or an explicit span start (B/S).
    if (pos == 'S' or pos == 'B' or tag == 'O') and last != 'O':
      span_labels.append((start, i - 1, last.split('-')[-1]))
    # A span starts at a B/S tag, or at any tag that follows an 'O'.
    if pos == 'B' or pos == 'S' or last == 'O':
      start = i
    last = tag
  # Close the span that is still open at the end of the sentence.
  if sentence_tags[-1] != 'O':
    span_labels.append((start, len(sentence_tags) - 1,
                        sentence_tags[-1].split('-')[-1]))
  return span_labels
def get_tags(span_labels, length, encoding):
  """Inverse of get_span_labels: expand (start, end, class) entity spans
  into a per-token tag sequence under the given encoding (e.g., BIOES).
  """
  tags = ['O'] * length
  for start, end, label in span_labels:
    # Fill the whole span with inside tags, then overwrite the boundary
    # positions according to which letters the encoding supports.
    tags[start:end + 1] = ['I-' + label] * (end + 1 - start)
    if 'E' in encoding:
      tags[end] = 'E-' + label
    if 'B' in encoding:
      tags[start] = 'B-' + label
    if 'S' in encoding and start == end:
      tags[start] = 'S-' + label
  return tags
research/cvt_text/task_specific/word_level/word_level_data.py
deleted
100644 → 0
View file @
09bc9f54
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for processing word-level datasets."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
import
os
import
random
import
tensorflow
as
tf
from
base
import
embeddings
from
base
import
utils
from
corpus_processing
import
example
from
corpus_processing
import
minibatching
from
task_specific.word_level
import
tagging_utils
class TaggedDataLoader(object):
  """Loads a word-level tagging dataset (CoNLL-style files) for one task.

  Handles reading raw sentence/tag pairs, building or loading the
  tag -> integer-id label mapping, and constructing TaggingExamples.
  """

  def __init__(self, config, name, is_token_level):
    # is_token_level: True when every token is labeled independently;
    # False for span-based tasks, whose tags are re-encoded with
    # config.label_encoding (e.g. BIOES) and therefore get a separate,
    # encoding-specific label mapping file.
    self._config = config
    self._task_name = name
    self._raw_data_path = os.path.join(config.raw_data_topdir, name)
    self._is_token_level = is_token_level
    self.label_mapping_path = os.path.join(
        config.preprocessed_data_topdir,
        (name if is_token_level else
         name + '_' + config.label_encoding) + '_label_mapping.pkl')

    if self.label_mapping:
      self._n_classes = len(set(self.label_mapping.values()))
    else:
      # During preprocessing the mapping may not exist yet.
      self._n_classes = None

  def get_dataset(self, split):
    """Returns a minibatching.Dataset for the given split name."""
    # Prefer a reduced 'train_subset' file for training when one exists
    # (only outside of preprocessing).
    if (split == 'train' and not self._config.for_preprocessing and
        tf.gfile.Exists(os.path.join(self._raw_data_path,
                                     'train_subset.txt'))):
      split = 'train_subset'
    return minibatching.Dataset(
        self._config, self._get_examples(split), self._task_name)

  def get_labeled_sentences(self, split):
    """Parses <split>.txt into a list of (words, tags) tuples.

    Expected file format (CoNLL-like): one token per line with the word in
    the first column and the tag in the last column; blank lines separate
    sentences; '-DOCSTART-' lines are skipped.
    """
    sentences = []
    path = os.path.join(self._raw_data_path, split + '.txt')
    if not tf.gfile.Exists(path):
      if self._config.for_preprocessing:
        # Missing splits are tolerated while preprocessing.
        return []
      else:
        raise ValueError('Unable to load data from', path)

    with tf.gfile.GFile(path, 'r') as f:
      sentence = []
      for line in f:
        line = line.strip().split()
        if not line:
          # Blank line: flush the sentence accumulated so far.
          if sentence:
            words, tags = zip(*sentence)
            sentences.append((words, tags))
            sentence = []
          continue
        if line[0] == '-DOCSTART-':
          continue
        word, tag = line[0], line[-1]
        sentence.append((word, tag))
    return sentences

  @property
  def label_mapping(self):
    """Dict mapping tag strings to integer class ids.

    Outside preprocessing the mapping is loaded from disk; during
    preprocessing it is built by counting tags over all splits.
    """
    if not self._config.for_preprocessing:
      return utils.load_cpickle(self.label_mapping_path)

    tag_counts = collections.Counter()
    train_tags = set()
    for split in ['train', 'dev', 'test']:
      for words, tags in self.get_labeled_sentences(split):
        if not self._is_token_level:
          # Re-encode span-based tags with the configured encoding so the
          # mapping matches what the model will actually see.
          span_labels = tagging_utils.get_span_labels(tags)
          tags = tagging_utils.get_tags(
              span_labels, len(words), self._config.label_encoding)
        for tag in tags:
          if self._task_name == 'depparse':
            # depparse tags look like '<head>-<relation>'; only the
            # relation part gets a class id here.
            tag = tag.split('-')[1]
          tag_counts[tag] += 1
          if split == 'train':
            train_tags.add(tag)
    if self._task_name == 'ccg':
      # for CCG, there are tags in the test sets that aren't in the train
      # set. All tags not in the train set get mapped to a special label;
      # the model will never predict this label because it never sees it in
      # the training set.
      not_in_train_tags = []
      for tag, count in tag_counts.items():
        if tag not in train_tags:
          not_in_train_tags.append(tag)
      label_mapping = {
          label: i for i, label in enumerate(sorted(filter(
              lambda t: t not in not_in_train_tags, tag_counts.keys())))
      }
      n = len(label_mapping)
      for tag in not_in_train_tags:
        label_mapping[tag] = n
    else:
      labels = sorted(tag_counts.keys())
      if self._task_name == 'depparse':
        # Ensure 'root' always gets id 0.
        labels.remove('root')
        labels.insert(0, 'root')
      label_mapping = {label: i for i, label in enumerate(labels)}
    return label_mapping

  def _get_examples(self, split):
    """Builds TaggingExamples for a split, optionally subsampling train."""
    word_vocab = embeddings.get_word_vocab(self._config)
    char_vocab = embeddings.get_char_vocab()
    examples = [TaggingExample(
        self._config, self._is_token_level, words, tags,
        word_vocab, char_vocab, self.label_mapping, self._task_name)
        for words, tags in self.get_labeled_sentences(split)]
    if self._config.train_set_percent < 100:
      utils.log('using reduced train set ({:}%)'.format(
          self._config.train_set_percent))
      # Shuffle before truncating so the subset is a random sample.
      random.shuffle(examples)
      examples = examples[:int(len(examples) *
                               self._config.train_set_percent / 100.0)]
    return examples
class TaggingExample(example.Example):
  """A single labeled sentence for a word-level tagging task."""

  def __init__(self, config, is_token_level, words, original_tags,
               word_vocab, char_vocab, label_mapping, task_name):
    super(TaggingExample, self).__init__(words, word_vocab, char_vocab)
    if is_token_level:
      labels = original_tags
    else:
      # Span-based task: re-encode the raw tags with the configured scheme
      # (e.g. BIOES) before mapping them to integer ids.
      spans = tagging_utils.get_span_labels(original_tags)
      labels = tagging_utils.get_tags(spans, len(words),
                                      config.label_encoding)

    if task_name != 'depparse':
      self.labels = [label_mapping[tag] for tag in labels]
    else:
      # depparse tags have the form '<head>-<relation>'; combine the head
      # position and relation id into one integer label. Head '0'
      # (presumably the root) maps to offset 0, head k to offset k + 1.
      self.labels = []
      for tag in labels:
        parts = tag.split('-')
        head_offset = 0 if parts[0] == '0' else 1 + int(parts[0])
        self.labels.append(
            len(label_mapping) * head_offset + label_mapping[parts[1]])
research/cvt_text/task_specific/word_level/word_level_scorer.py
deleted
100644 → 0
View file @
09bc9f54
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base class for word-level scorers."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
abc
from
corpus_processing
import
scorer
class WordLevelScorer(scorer.Scorer):
  """Base class for scorers operating on per-word predictions."""
  __metaclass__ = abc.ABCMeta

  def __init__(self):
    super(WordLevelScorer, self).__init__()
    self._total_loss = 0
    self._total_words = 0
    self._examples = []
    self._preds = []

  def update(self, examples, predictions, loss):
    """Accumulates a minibatch of examples, predictions, and its loss."""
    super(WordLevelScorer, self).update(examples, predictions, loss)
    n_words = 0
    for ex, ex_preds in zip(examples, predictions):
      self._examples.append(ex)
      # Drop the first and last predictions (presumably sentence-boundary
      # padding tokens -- confirm against the minibatch construction).
      clipped = list(ex_preds)[1:len(ex.words) - 1]
      self._preds.append(clipped)
      n_words += len(ex.words) - 2
    # Loss is weighted by the number of real words in the batch.
    self._total_loss += loss * n_words
    self._total_words += n_words

  def get_loss(self):
    """Returns the average per-word loss over everything seen so far."""
    return self._total_loss / max(1, self._total_words)
research/cvt_text/training/__init__.py
deleted
100644 → 0
View file @
09bc9f54
research/cvt_text/training/trainer.py
deleted
100644 → 0
View file @
09bc9f54
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs training for CVT text models."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
bisect
import
time
import
numpy
as
np
import
tensorflow
as
tf
from
base
import
utils
from
model
import
multitask_model
from
task_specific
import
task_definitions
class Trainer(object):
  """Runs multi-task (optionally semi-supervised) training for CVT models."""

  def __init__(self, config):
    self._config = config
    self.tasks = [task_definitions.get_task(self._config, task_name)
                  for task_name in self._config.task_names]

    utils.log('Loading Pretrained Embeddings')
    pretrained_embeddings = utils.load_cpickle(self._config.word_embeddings)

    utils.log('Building Model')
    self._model = multitask_model.Model(
        self._config, pretrained_embeddings, self.tasks)
    utils.log()

  def train(self, sess, progress, summary_writer):
    """Main training loop.

    Alternates labeled minibatches (and, when config.is_semisup, unlabeled
    ones) from _get_training_mbs, periodically logging losses, evaluating
    on dev/train, and checkpointing. Loops until the minibatch generator is
    exhausted.
    """
    heading = lambda s: utils.heading(s, '(' + self._config.model_name + ')')
    trained_on_sentences = 0
    start_time = time.time()
    unsupervised_loss_total, unsupervised_loss_count = 0, 0
    supervised_loss_total, supervised_loss_count = 0, 0
    for mb in self._get_training_mbs(progress.unlabeled_data_reader):
      if mb.task_name != 'unlabeled':
        loss = self._model.train_labeled(sess, mb)
        supervised_loss_total += loss
        supervised_loss_count += 1
      if mb.task_name == 'unlabeled':
        # Produce teacher predictions for the unlabeled batch, train on
        # them, then clear them to free memory.
        self._model.run_teacher(sess, mb)
        loss = self._model.train_unlabeled(sess, mb)
        unsupervised_loss_total += loss
        unsupervised_loss_count += 1
        mb.teacher_predictions.clear()

      trained_on_sentences += mb.size
      global_step = self._model.get_global_step(sess)

      if global_step % self._config.print_every == 0:
        utils.log('step {:} - '
                  'supervised loss: {:.2f} - '
                  'unsupervised loss: {:.2f} - '
                  '{:.1f} sentences per second'.format(
                      global_step,
                      supervised_loss_total / max(1, supervised_loss_count),
                      unsupervised_loss_total / max(
                          1, unsupervised_loss_count),
                      trained_on_sentences / (time.time() - start_time)))
        # Losses are reported per print interval, so reset the running sums.
        unsupervised_loss_total, unsupervised_loss_count = 0, 0
        supervised_loss_total, supervised_loss_count = 0, 0

      if global_step % self._config.eval_dev_every == 0:
        heading('EVAL ON DEV')
        self.evaluate_all_tasks(sess, summary_writer, progress.history)
        progress.save_if_best_dev_model(sess, global_step)
        utils.log()

      if global_step % self._config.eval_train_every == 0:
        heading('EVAL ON TRAIN')
        self.evaluate_all_tasks(sess, summary_writer, progress.history, True)
        utils.log()

      if global_step % self._config.save_model_every == 0:
        heading('CHECKPOINTING MODEL')
        progress.write(sess, global_step)
        utils.log()

  def evaluate_all_tasks(self, sess, summary_writer, history,
                         train_set=False):
    """Evaluates every task; appends the results (tagged with the current
    global step) to history and pickles history when it is provided.
    """
    for task in self.tasks:
      results = self._evaluate_task(sess, task, summary_writer, train_set)
      if history is not None:
        results.append(('step', self._model.get_global_step(sess)))
        history.append(results)
    if history is not None:
      utils.write_cpickle(history, self._config.history_file)

  def _evaluate_task(self, sess, task, summary_writer, train_set):
    """Scores one task over its train or validation set; returns the
    scorer's results list and writes TF summaries for it.
    """
    scorer = task.get_scorer()
    data = task.train_set if train_set else task.val_set
    for i, mb in enumerate(data.get_minibatches(
        self._config.test_batch_size)):
      loss, batch_preds = self._model.test(sess, mb)
      scorer.update(mb.examples, batch_preds, loss)
    results = scorer.get_results(
        task.name + ('_train_' if train_set else '_dev_'))
    utils.log(task.name.upper() + ': ' + scorer.results_str())
    write_summary(summary_writer, results,
                  global_step=self._model.get_global_step(sess))
    return results

  def _get_training_mbs(self, unlabeled_data_reader):
    """Endlessly yields training minibatches.

    Labeled datasets are sampled with probability proportional to the
    square root of their size; when semi-supervised, each labeled batch is
    followed by one unlabeled batch.
    """
    datasets = [task.train_set for task in self.tasks]
    weights = [np.sqrt(dataset.size) for dataset in datasets]
    thresholds = np.cumsum([w / np.sum(weights) for w in weights])

    labeled_mbs = [dataset.endless_minibatches(
        self._config.train_batch_size) for dataset in datasets]
    unlabeled_mbs = unlabeled_data_reader.endless_minibatches()
    while True:
      # Sample a dataset index from the sqrt-size distribution.
      dataset_ind = bisect.bisect(thresholds, np.random.random())
      yield next(labeled_mbs[dataset_ind])
      if self._config.is_semisup:
        yield next(unlabeled_mbs)
def write_summary(writer, results, global_step):
  """Writes f1/accuracy/loss entries from `results` as TF summary values.

  Args:
    writer: a tf.summary.FileWriter (or compatible object).
    results: iterable of (metric_name, value) pairs.
    global_step: step to associate with the summary values.
  """
  for key, value in results:
    # Only scalar quality/loss metrics are reported.
    reportable = 'f1' in key or 'acc' in key or 'loss' in key
    if not reportable:
      continue
    summary = tf.Summary(
        value=[tf.Summary.Value(tag=key, simple_value=value)])
    writer.add_summary(summary, global_step)
  writer.flush()
research/cvt_text/training/training_progress.py
deleted
100644 → 0
View file @
09bc9f54
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Tracks and saves training progress (models and other data such as the current
location in the lm1b corpus) for later reloading.
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
tensorflow
as
tf
from
base
import
utils
from
corpus_processing
import
unlabeled_data
class TrainingProgress(object):
  """Tracks and saves training progress (model checkpoints, evaluation
  history, and the current position in the lm1b corpus) for reloading.
  """

  def __init__(self, config, sess, checkpoint_saver, best_model_saver,
               restore_if_possible=True):
    self.config = config
    self.checkpoint_saver = checkpoint_saver
    self.best_model_saver = best_model_saver

    tf.gfile.MakeDirs(config.checkpoints_dir)
    if restore_if_possible and tf.gfile.Exists(config.progress):
      # Resume: the progress pickle stores (history, file, line), letting
      # the unlabeled-data reader continue exactly where it left off.
      history, current_file, current_line = utils.load_cpickle(
          config.progress, memoized=False)
      self.history = history
      self.unlabeled_data_reader = unlabeled_data.UnlabeledDataReader(
          config, current_file, current_line)
      utils.log("Continuing from global step",
                dict(self.history[-1])["step"],
                "(lm1b file {:}, line {:})".format(
                    current_file, current_line))
      self.checkpoint_saver.restore(sess, tf.train.latest_checkpoint(
          self.config.checkpoints_dir))
    else:
      utils.log("No previous checkpoint found - starting from scratch")
      self.history = []
      self.unlabeled_data_reader = (
          unlabeled_data.UnlabeledDataReader(config))

  def write(self, sess, global_step):
    """Checkpoints the model and pickles (history, reader position)."""
    self.checkpoint_saver.save(sess, self.config.checkpoint,
                               global_step=global_step)
    utils.write_cpickle(
        (self.history, self.unlabeled_data_reader.current_file,
         self.unlabeled_data_reader.current_line),
        self.config.progress)

  def save_if_best_dev_model(self, sess, global_step):
    """Saves a best-model checkpoint if the most recent dev evaluation has
    the highest average f1/las/accuracy score seen so far.
    """
    best_avg_score = 0
    for i, results in enumerate(self.history):
      # Skip train-set evaluations; only dev results are compared.
      if any("train" in metric for metric, value in results):
        continue
      total, count = 0, 0
      for metric, value in results:
        if "f1" in metric or "las" in metric or "accuracy" in metric:
          total += value
          count += 1
      # NOTE(review): raises ZeroDivisionError if a dev result contains no
      # f1/las/accuracy metric -- presumably every scorer reports at least
      # one; confirm before relying on this.
      avg_score = total / count
      if avg_score >= best_avg_score:
        best_avg_score = avg_score
        # Only save when the *latest* evaluation is the new best.
        if i == len(self.history) - 1:
          utils.log("New best model! Saving...")
          self.best_model_saver.save(
              sess, self.config.best_model_checkpoint,
              global_step=global_step)
Prev
1
…
13
14
15
16
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment