ModelZoo / ResNet50_tensorflow · Commits

Commit 27b4acd4, authored Sep 25, 2018 by Aman Gupta

Merge remote-tracking branch 'upstream/master'

Parents: 5133522f, d4e1f97f
Changes: 240 files in this commit. This page shows 20 changed files with 1386 additions and 0 deletions (+1386 −0).
research/cvt_text/base/configure.py                      +139  −0
research/cvt_text/base/embeddings.py                     +167  −0
research/cvt_text/base/utils.py                          +68   −0
research/cvt_text/corpus_processing/__init__.py          +0    −0
research/cvt_text/corpus_processing/example.py           +52   −0
research/cvt_text/corpus_processing/minibatching.py      +143  −0
research/cvt_text/corpus_processing/scorer.py            +52   −0
research/cvt_text/corpus_processing/unlabeled_data.py    +81   −0
research/cvt_text/cvt.py                                 +67   −0
research/cvt_text/fetch_data.sh                          +51   −0
research/cvt_text/model/__init__.py                      +0    −0
research/cvt_text/model/encoder.py                       +110  −0
research/cvt_text/model/model_helpers.py                 +54   −0
research/cvt_text/model/multitask_model.py               +132  −0
research/cvt_text/model/shared_inputs.py                 +48   −0
research/cvt_text/model/task_module.py                   +44   −0
research/cvt_text/preprocessing.py                       +87   −0
research/cvt_text/task_specific/__init__.py              +0    −0
research/cvt_text/task_specific/task_definitions.py      +91   −0
research/cvt_text/task_specific/word_level/__init__.py   +0    −0
research/cvt_text/base/configure.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Classes for storing hyperparameters, data locations, etc."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
from os.path import join

import tensorflow as tf


class Config(object):
  """Stores everything needed to train a model."""

  def __init__(self, **kwargs):
    # general
    self.data_dir = './data'  # top directory for data (corpora, models, etc.)
    self.model_name = 'default_model'  # name identifying the current model

    # mode
    self.mode = 'train'  # either "train" or "eval"
    self.task_names = ['chunk']  # list of tasks this model will learn;
                                 # more than one trains a multi-task model
    self.is_semisup = True  # whether to use CVT or train purely supervised
    self.for_preprocessing = False  # is this for the preprocessing script

    # embeddings
    self.pretrained_embeddings = 'glove.6B.300d.txt'  # which pretrained
                                                      # embeddings to use
    self.word_embedding_size = 300  # size of each word embedding

    # encoder
    self.use_chars = True  # whether to include a character-level cnn
    self.char_embedding_size = 50  # size of character embeddings
    self.char_cnn_filter_widths = [2, 3, 4]  # filter widths for the char cnn
    self.char_cnn_n_filters = 100  # number of filters for each filter width
    self.unidirectional_sizes = [1024]  # size of first Bi-LSTM
    self.bidirectional_sizes = [512]  # size of second Bi-LSTM
    self.projection_size = 512  # projection size for LSTMs and hidden layers

    # dependency parsing
    self.depparse_projection_size = 128  # size of the representations used in
                                         # the bilinear classifier for parsing

    # tagging
    self.label_encoding = 'BIOES'  # label encoding scheme for entity-level
                                   # tagging tasks
    self.label_smoothing = 0.1  # label smoothing rate for tagging tasks

    # optimization
    self.lr = 0.5  # base learning rate
    self.momentum = 0.9  # momentum
    self.grad_clip = 1.0  # maximum gradient norm during optimization
    self.warm_up_steps = 5000.0  # linearly ramp up the lr for this many steps
    self.lr_decay = 0.005  # factor for gradually decaying the lr

    # EMA
    self.ema_decay = 0.998  # EMA coefficient for averaged model weights
    self.ema_test = True  # whether to use EMA weights at test time
    self.ema_teacher = False  # whether to use EMA weights for the teacher model

    # regularization
    self.labeled_keep_prob = 0.5  # 1 - dropout on labeled examples
    self.unlabeled_keep_prob = 0.8  # 1 - dropout on unlabeled examples

    # sizing
    self.max_sentence_length = 100  # maximum length of unlabeled sentences
    self.max_word_length = 20  # maximum length of words for char cnn
    self.train_batch_size = 64  # train batch size
    self.test_batch_size = 64  # test batch size
    self.buckets = [(0, 15), (15, 40), (40, 1000)]  # buckets for binning
                                                    # sentences by length

    # training
    self.print_every = 25  # how often to print out training progress
    self.eval_dev_every = 500  # how often to evaluate on the dev set
    self.eval_train_every = 2000  # how often to evaluate on the train set
    self.save_model_every = 1000  # how often to checkpoint the model

    # data set
    self.train_set_percent = 100  # how much of the train set to use

    for k, v in kwargs.iteritems():
      if k not in self.__dict__:
        raise ValueError("Unknown argument", k)
      self.__dict__[k] = v

    self.dev_set = self.mode == "train"  # whether to evaluate on the dev or
                                         # test set

    # locations of various data files
    self.raw_data_topdir = join(self.data_dir, 'raw_data')
    self.unsupervised_data = join(
        self.raw_data_topdir, 'unlabeled_data',
        '1-billion-word-language-modeling-benchmark-r13output',
        'training-monolingual.tokenized.shuffled')
    self.pretrained_embeddings_file = join(
        self.raw_data_topdir, 'pretrained_embeddings',
        self.pretrained_embeddings)

    self.preprocessed_data_topdir = join(self.data_dir, 'preprocessed_data')
    self.embeddings_dir = join(self.preprocessed_data_topdir,
                               self.pretrained_embeddings.rsplit('.', 1)[0])
    self.word_vocabulary = join(self.embeddings_dir, 'word_vocabulary.pkl')
    self.word_embeddings = join(self.embeddings_dir, 'word_embeddings.pkl')

    self.model_dir = join(self.data_dir, "models", self.model_name)
    self.checkpoints_dir = join(self.model_dir, 'checkpoints')
    self.checkpoint = join(self.checkpoints_dir, 'checkpoint.ckpt')
    self.best_model_checkpoints_dir = join(self.model_dir,
                                           'best_model_checkpoints')
    self.best_model_checkpoint = join(self.best_model_checkpoints_dir,
                                      'checkpoint.ckpt')
    self.progress = join(self.checkpoints_dir, 'progress.pkl')
    self.summaries_dir = join(self.model_dir, 'summaries')
    self.history_file = join(self.model_dir, 'history.pkl')

  def write(self):
    tf.gfile.MakeDirs(self.model_dir)
    with open(join(self.model_dir, 'config.json'), 'w') as f:
      f.write(json.dumps(self.__dict__, sort_keys=True, indent=4,
                         separators=(',', ': ')))
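A quick usage sketch (illustrative, not part of this diff): every hyperparameter is an attribute with a default, and the kwargs loop rejects unknown names, so typos fail fast. The model name 'chunking_model' below is hypothetical.

config = Config(mode='train', model_name='chunking_model',
                train_batch_size=32)  # override any default by keyword
config.write()                        # dumps the full dict to config.json
Config(batchsize=32)                  # raises ValueError("Unknown argument", ...)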
research/cvt_text/base/embeddings.py (new file, mode 100644)
# coding=utf-8
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for handling word embeddings."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import re

import numpy as np
import tensorflow as tf

from base import utils


_CHARS = [
    # punctuation
    '!', '\'', '#', '$', '%', '&', '"', '(', ')', '*', '+', ',', '-', '.',
    '/', '\\', '_', '`', '{', '}', '[', ']', '<', '>', ':', ';', '?', '@',
    # digits
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    # letters
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
    'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    # special characters
    '£', '€', '®', '™', '�', '½', '»', '•', '—', '“', '”', '°', '‘', '’'
]

# words not in GloVe that still should have embeddings
_EXTRA_WORDS = [
    # common digit patterns
    '0/0', '0/00', '00/00', '0/000', '00/00/00', '0/00/00',
    '00/00/0000', '0/00/0000', '00-00', '00-00-00', '0-00-00',
    '00-00-0000', '0-00-0000', '0000-00-00', '00-0-00-0', '00000000',
    '0:00.000', '00:00.000', '0%', '00%', '00.', '0000.',
    '0.0bn', '0.0m', '0-', '00-',
    # ontonotes uses **f to represent formulas and -amp- instead of ampersands
    '**f', '-amp-'
]
SPECIAL_TOKENS = ['<pad>', '<unk>', '<start>', '<end>', '<missing>']
NUM_CHARS = len(_CHARS) + len(SPECIAL_TOKENS)
PAD, UNK, START, END, MISSING = 0, 1, 2, 3, 4


class Vocabulary(collections.OrderedDict):
  def __getitem__(self, w):
    return self.get(w, UNK)


@utils.Memoize
def get_char_vocab():
  characters = _CHARS
  for i, special in enumerate(SPECIAL_TOKENS):
    characters.insert(i, special)
  return Vocabulary({c: i for i, c in enumerate(characters)})


@utils.Memoize
def get_inv_char_vocab():
  return {i: c for c, i in get_char_vocab().items()}


def get_word_vocab(config):
  return Vocabulary(utils.load_cpickle(config.word_vocabulary))


def get_word_embeddings(config):
  return utils.load_cpickle(config.word_embeddings)


@utils.Memoize
def _punctuation_ids(vocab_path):
  vocab = Vocabulary(utils.load_cpickle(vocab_path))
  return set(i for w, i in vocab.iteritems() if w in [
      '!', '...', '``', '{', '}', '(', ')', '[', ']', '--', '-', ',', '.',
      "''", '`', ';', ':', '?'])


def get_punctuation_ids(config):
  return _punctuation_ids(config.word_vocabulary)


class PretrainedEmbeddingLoader(object):
  def __init__(self, config):
    self.config = config
    self.vocabulary = {}
    self.vectors = []
    self.vector_size = config.word_embedding_size

  def _add_vector(self, w):
    if w not in self.vocabulary:
      self.vocabulary[w] = len(self.vectors)
      self.vectors.append(np.zeros(self.vector_size, dtype='float32'))

  def build(self):
    utils.log('loading pretrained embeddings from',
              self.config.pretrained_embeddings_file)
    for special in SPECIAL_TOKENS:
      self._add_vector(special)
    for extra in _EXTRA_WORDS:
      self._add_vector(extra)
    with tf.gfile.GFile(self.config.pretrained_embeddings_file, 'r') as f:
      for i, line in enumerate(f):
        if i % 10000 == 0:
          utils.log('on line', i)

        split = line.decode('utf8').split()
        w = normalize_word(split[0])
        try:
          vec = np.array(map(float, split[1:]), dtype='float32')
          if vec.size != self.vector_size:
            utils.log('vector for line', i, 'has size', vec.size,
                      'so skipping')
            utils.log(line[:100] + '...')
            continue
        except:
          utils.log('can\'t parse line', i, 'so skipping')
          utils.log(line[:100] + '...')
          continue
        if w not in self.vocabulary:
          self.vocabulary[w] = len(self.vectors)
          self.vectors.append(vec)
    utils.log('writing vectors!')
    self._write()

  def _write(self):
    utils.write_cpickle(np.vstack(self.vectors),
                        self.config.word_embeddings)
    utils.write_cpickle(self.vocabulary, self.config.word_vocabulary)


def normalize_chars(w):
  if w == '-LRB-':
    return '('
  elif w == '-RRB-':
    return ')'
  elif w == '-LCB-':
    return '{'
  elif w == '-RCB-':
    return '}'
  elif w == '-LSB-':
    return '['
  elif w == '-RSB-':
    return ']'
  return w.replace(r'\/', '/').replace(r'\*', '*')


def normalize_word(w):
  return re.sub(r'\d', '0', normalize_chars(w).lower())
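The two normalizers undo PTB-style bracket escapes and collapse digits to '0', which is exactly the form the _EXTRA_WORDS patterns above are written in. Expected behavior, for illustration:

normalize_chars('-LRB-')     # -> '('
normalize_word('Hello')      # -> 'hello'
normalize_word('3/15/2018')  # -> '0/00/0000', covered by _EXTRA_WORDS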
research/cvt_text/base/utils.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Various utilities."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import cPickle
import sys

import tensorflow as tf


class Memoize(object):
  def __init__(self, f):
    self.f = f
    self.cache = {}

  def __call__(self, *args):
    if args not in self.cache:
      self.cache[args] = self.f(*args)
    return self.cache[args]


def load_cpickle(path, memoized=True):
  return _load_cpickle_memoize(path) if memoized else _load_cpickle(path)


def _load_cpickle(path):
  with tf.gfile.GFile(path, 'r') as f:
    return cPickle.load(f)


@Memoize
def _load_cpickle_memoize(path):
  return _load_cpickle(path)


def write_cpickle(o, path):
  tf.gfile.MakeDirs(path.rsplit('/', 1)[0])
  with tf.gfile.GFile(path, 'w') as f:
    cPickle.dump(o, f, -1)


def log(*args):
  msg = ' '.join(map(str, args))
  sys.stdout.write(msg + '\n')
  sys.stdout.flush()


def heading(*args):
  log()
  log(80 * '=')
  log(*args)
  log(80 * '=')
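Memoize keys the cache on the positional-argument tuple, so arguments must be hashable; load_cpickle relies on this to avoid re-reading a pickle from disk. A sketch with a hypothetical function:

@Memoize
def slow_square(x):
  print('computing', x)
  return x * x

slow_square(4)  # prints "computing 4" and returns 16
slow_square(4)  # returns 16 straight from the cache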
research/cvt_text/corpus_processing/__init__.py (new empty file, mode 100644)
research/cvt_text/corpus_processing/example.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base class for training examples."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from base import embeddings


CONTRACTION_WORDS = set(w + 'n' for w in
                        ['do', 'does', 'did', 'is', 'are', 'was', 'were',
                         'has', 'have', 'had', 'could', 'would', 'should',
                         'ca', 'wo', 'ai', 'might'])


class Example(object):
  def __init__(self, words, word_vocab, char_vocab):
    words = words[:]

    # Fix inconsistent tokenization between datasets
    for i in range(len(words)):
      if (words[i].lower() == '\'t' and i > 0 and
          words[i - 1].lower() in CONTRACTION_WORDS):
        words[i] = words[i - 1][-1] + words[i]
        words[i - 1] = words[i - 1][:-1]

    self.words = ([embeddings.START] +
                  [word_vocab[embeddings.normalize_word(w)] for w in words] +
                  [embeddings.END])
    self.chars = ([[embeddings.MISSING]] +
                  [[char_vocab[c] for c in embeddings.normalize_chars(w)]
                   for w in words] +
                  [[embeddings.MISSING]])

  def __repr__(self):
    inv_char_vocab = embeddings.get_inv_char_vocab()
    return ' '.join([''.join([inv_char_vocab[c] for c in w])
                     for w in self.chars])
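The loop above re-splits negative contractions so they match GloVe's tokenization; running the same fix in isolation on a hypothetical sentence:

words = ['I', 'don', "'t", 'know']
for i in range(len(words)):
  if (words[i].lower() == '\'t' and i > 0 and
      words[i - 1].lower() in CONTRACTION_WORDS):
    words[i] = words[i - 1][-1] + words[i]
    words[i - 1] = words[i - 1][:-1]
print(words)  # ['I', 'do', "n't", 'know']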
research/cvt_text/corpus_processing/minibatching.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for constructing minibatches."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import random

import numpy as np

from base import embeddings


def get_bucket(config, l):
  for i, (s, e) in enumerate(config.buckets):
    if s <= l < e:
      return config.buckets[i]


def build_array(nested_lists, dtype='int32'):
  depth_to_sizes = collections.defaultdict(set)
  _get_sizes(nested_lists, depth_to_sizes)
  shape = [max(depth_to_sizes[depth]) for depth in range(len(depth_to_sizes))]

  copy_depth = len(depth_to_sizes) - 1
  while copy_depth > 0 and len(depth_to_sizes[copy_depth]) == 1:
    copy_depth -= 1

  arr = np.zeros(shape, dtype=dtype)
  _fill_array(nested_lists, arr, copy_depth)
  return arr


def _get_sizes(nested_lists, depth_to_sizes, depth=0):
  depth_to_sizes[depth].add(len(nested_lists))
  first_elem = nested_lists[0]
  if (isinstance(first_elem, collections.Sequence) or
      isinstance(first_elem, np.ndarray)):
    for sublist in nested_lists:
      _get_sizes(sublist, depth_to_sizes, depth + 1)


def _fill_array(nested_lists, arr, copy_depth, depth=0):
  if depth == copy_depth:
    for i in range(len(nested_lists)):
      if isinstance(nested_lists[i], np.ndarray):
        arr[i] = nested_lists[i]
      else:
        arr[i] = np.array(nested_lists[i])
  else:
    for i in range(len(nested_lists)):
      _fill_array(nested_lists[i], arr[i], copy_depth, depth + 1)


class Dataset(object):
  def __init__(self, config, examples, task_name='unlabeled',
               is_training=False):
    self._config = config
    self.examples = examples
    self.size = len(examples)
    self.task_name = task_name
    self.is_training = is_training

  def get_minibatches(self, minibatch_size):
    by_bucket = collections.defaultdict(list)
    for i, e in enumerate(self.examples):
      by_bucket[get_bucket(self._config, len(e.words))].append(i)

    # save memory by weighting examples so longer sentences have
    # smaller minibatches.
    weight = lambda ind: np.sqrt(len(self.examples[ind].words))
    total_weight = float(sum(weight(i) for i in range(len(self.examples))))
    weight_per_batch = minibatch_size * total_weight / len(self.examples)
    cumulative_weight = 0.0
    id_batches = []
    for _, ids in by_bucket.iteritems():
      ids = np.array(ids)
      np.random.shuffle(ids)
      curr_batch, curr_weight = [], 0.0
      for i, curr_id in enumerate(ids):
        curr_batch.append(curr_id)
        curr_weight += weight(curr_id)
        if (i == len(ids) - 1 or cumulative_weight + curr_weight >=
            (len(id_batches) + 1) * weight_per_batch):
          cumulative_weight += curr_weight
          id_batches.append(np.array(curr_batch))
          curr_batch, curr_weight = [], 0.0
    random.shuffle(id_batches)

    for id_batch in id_batches:
      yield self._make_minibatch(id_batch)

  def endless_minibatches(self, minibatch_size):
    while True:
      for mb in self.get_minibatches(minibatch_size):
        yield mb

  def _make_minibatch(self, ids):
    examples = [self.examples[i] for i in ids]
    sentence_lengths = np.array([len(e.words) for e in examples])
    max_word_length = min(max(max(len(word) for word in e.chars)
                              for e in examples),
                          self._config.max_word_length)
    characters = [[[embeddings.PAD] + [embeddings.START] +
                   w[:max_word_length] + [embeddings.END] + [embeddings.PAD]
                   for w in e.chars] for e in examples]
    # the first and last words are masked because they are start/end tokens
    mask = build_array([[0] + [1] * (length - 2) + [0]
                        for length in sentence_lengths])
    words = build_array([e.words for e in examples])
    chars = build_array(characters, dtype='int16')
    return Minibatch(
        task_name=self.task_name,
        size=ids.size,
        examples=examples,
        ids=ids,
        teacher_predictions={},
        words=words,
        chars=chars,
        lengths=sentence_lengths,
        mask=mask,
    )


Minibatch = collections.namedtuple('Minibatch', [
    'task_name', 'size', 'examples', 'ids', 'teacher_predictions',
    'words', 'chars', 'lengths', 'mask'])
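build_array zero-pads ragged nested lists to the maximum size at each depth, which is why the PAD token must map to id 0. A small sketch of the padding:

build_array([[1, 2, 3], [4, 5]])
# -> array([[1, 2, 3],
#           [4, 5, 0]], dtype=int32)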
research/cvt_text/corpus_processing/scorer.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Abstract base class for evaluation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc


class Scorer(object):
  __metaclass__ = abc.ABCMeta

  def __init__(self):
    self._updated = False
    self._cached_results = {}

  @abc.abstractmethod
  def update(self, examples, predictions, loss):
    self._updated = True

  @abc.abstractmethod
  def get_loss(self):
    pass

  @abc.abstractmethod
  def _get_results(self):
    return []

  def get_results(self, prefix=""):
    results = self._get_results() if self._updated else self._cached_results
    self._cached_results = results
    self._updated = False
    return [(prefix + k, v) for k, v in results]

  def results_str(self):
    return " - ".join(["{:}: {:.2f}".format(k, v)
                       for k, v in self.get_results()])
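A concrete subclass only needs to accumulate statistics in update and emit (name, value) pairs from _get_results; get_results handles caching. A hypothetical accuracy scorer, for illustration only:

class ToyAccuracyScorer(Scorer):
  """Hypothetical example subclass; not part of this commit."""

  def __init__(self):
    super(ToyAccuracyScorer, self).__init__()
    self._correct, self._total, self._loss = 0, 0, 0.0

  def update(self, examples, predictions, loss):
    super(ToyAccuracyScorer, self).update(examples, predictions, loss)
    self._correct += sum(int(y == p) for y, p in zip(examples, predictions))
    self._total += len(examples)
    self._loss += loss

  def get_loss(self):
    return self._loss

  def _get_results(self):
    return [('accuracy', 100.0 * self._correct / max(1, self._total))]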
research/cvt_text/corpus_processing/unlabeled_data.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Reads data from a large unlabeled corpus."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

import tensorflow as tf

from base import embeddings
from corpus_processing import example
from corpus_processing import minibatching


class UnlabeledDataReader(object):
  def __init__(self, config, starting_file=0, starting_line=0,
               one_pass=False):
    self.config = config
    self.current_file = starting_file
    self.current_line = starting_line
    self._one_pass = one_pass

  def endless_minibatches(self):
    for examples in self.get_unlabeled_examples():
      d = minibatching.Dataset(self.config, examples, 'unlabeled')
      for mb in d.get_minibatches(self.config.train_batch_size):
        yield mb

  def _make_examples(self, sentences):
    word_vocab = embeddings.get_word_vocab(self.config)
    char_vocab = embeddings.get_char_vocab()
    return [example.Example(sentence, word_vocab, char_vocab)
            for sentence in sentences]

  def get_unlabeled_examples(self):
    lines = []
    for words in self.get_unlabeled_sentences():
      lines.append(words)
      if len(lines) >= 10000:
        yield self._make_examples(lines)
        lines = []

  def get_unlabeled_sentences(self):
    while True:
      file_ids_and_names = sorted([
          (int(fname.split('-')[1].replace('.txt', '')), fname)
          for fname in tf.gfile.ListDirectory(self.config.unsupervised_data)])
      for fid, fname in file_ids_and_names:
        if fid < self.current_file:
          continue
        self.current_file = fid
        self.current_line = 0
        with tf.gfile.FastGFile(
            os.path.join(self.config.unsupervised_data, fname), 'r') as f:
          for i, line in enumerate(f):
            if i < self.current_line:
              continue
            self.current_line = i
            words = line.strip().split()
            if len(words) < self.config.max_sentence_length:
              yield words
      self.current_file = 0
      self.current_line = 0
      if self._one_pass:
        break
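The reader tracks (current_file, current_line) so training can resume mid-corpus after a restart. The shard-id parse assumes lm1b-style names such as 'news.en-00050-of-00100' (an assumption based on the corpus fetched by fetch_data.sh):

fname = 'news.en-00050-of-00100'                    # assumed shard name
fid = int(fname.split('-')[1].replace('.txt', ''))  # -> 50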
research/cvt_text/cvt.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Run training and evaluation for CVT text models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from base import configure
from base import utils
from training import trainer
from training import training_progress


FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('mode', 'train', '"train" or "eval"')
tf.app.flags.DEFINE_string('model_name', 'default_model',
                           'A name identifying the model being '
                           'trained/evaluated')


def main():
  utils.heading('SETUP')
  config = configure.Config(mode=FLAGS.mode, model_name=FLAGS.model_name)
  config.write()
  with tf.Graph().as_default() as graph:
    model_trainer = trainer.Trainer(config)
    summary_writer = tf.summary.FileWriter(config.summaries_dir)
    checkpoints_saver = tf.train.Saver(max_to_keep=1)
    best_model_saver = tf.train.Saver(max_to_keep=1)
    init_op = tf.global_variables_initializer()
    graph.finalize()
    with tf.Session() as sess:
      sess.run(init_op)
      progress = training_progress.TrainingProgress(
          config, sess, checkpoints_saver, best_model_saver,
          config.mode == 'train')
      utils.log()
      if config.mode == 'train':
        utils.heading('START TRAINING ({:})'.format(config.model_name))
        model_trainer.train(sess, progress, summary_writer)
      elif config.mode == 'eval':
        utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
        progress.best_model_saver.restore(sess, tf.train.latest_checkpoint(
            config.checkpoints_dir))
        model_trainer.evaluate_all_tasks(sess, summary_writer, None)
      else:
        raise ValueError('Mode must be "train" or "eval"')


if __name__ == '__main__':
  main()
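Given the two flags defined above, a plausible launch sequence would be the following (a sketch; 'chunking_model' is a hypothetical name):

python preprocessing.py
python cvt.py --mode=train --model_name=chunking_model
python cvt.py --mode=eval --model_name=chunking_model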
research/cvt_text/fetch_data.sh (new file, mode 100755)
#!/bin/bash
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
TOPDIR='./data'
RUNDIR=${PWD}

mkdir -p ${TOPDIR}
cd ${TOPDIR}
mkdir -p raw_data
mkdir -p raw_data/pretrained_embeddings
mkdir -p raw_data/unlabeled_data
mkdir -p raw_data/chunk
cd ${RUNDIR}

echo "Preparing GloVe embeddings"
cd "${TOPDIR}/raw_data/pretrained_embeddings"
curl -OL http://nlp.stanford.edu/data/glove.6B.zip
unzip glove.6B.zip
cd ${RUNDIR}

echo
echo "Preparing lm1b corpus"
cd "${TOPDIR}/raw_data/unlabeled_data"
curl -OL http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar xzf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
cd ${RUNDIR}

echo
echo "Preparing chunking corpus"
cd "${TOPDIR}/raw_data/chunk"
curl -OL https://www.clips.uantwerpen.be/conll2000/chunking/train.txt.gz
curl -OL http://www.clips.uantwerpen.be/conll2000/chunking/test.txt.gz
gunzip *
cd ${RUNDIR}

echo
echo "Done with data fetching!"
research/cvt_text/model/__init__.py (new empty file, mode 100644)
research/cvt_text/model/encoder.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""CNN-BiLSTM sentence encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from base import embeddings
from model import model_helpers


class Encoder(object):
  def __init__(self, config, inputs, pretrained_embeddings):
    self._config = config
    self._inputs = inputs

    self.word_reprs = self._get_word_reprs(pretrained_embeddings)
    self.uni_fw, self.uni_bw = self._get_unidirectional_reprs(self.word_reprs)
    self.uni_reprs = tf.concat([self.uni_fw, self.uni_bw], axis=-1)
    self.bi_fw, self.bi_bw, self.bi_reprs = self._get_bidirectional_reprs(
        self.uni_reprs)

  def _get_word_reprs(self, pretrained_embeddings):
    with tf.variable_scope('word_embeddings'):
      word_embedding_matrix = tf.get_variable(
          'word_embedding_matrix', initializer=pretrained_embeddings)
      word_embeddings = tf.nn.embedding_lookup(word_embedding_matrix,
                                               self._inputs.words)
      word_embeddings = tf.nn.dropout(word_embeddings, self._inputs.keep_prob)
      word_embeddings *= tf.get_variable('emb_scale', initializer=1.0)

    if not self._config.use_chars:
      return word_embeddings

    with tf.variable_scope('char_embeddings'):
      char_embedding_matrix = tf.get_variable(
          'char_embeddings',
          shape=[embeddings.NUM_CHARS, self._config.char_embedding_size])
      char_embeddings = tf.nn.embedding_lookup(char_embedding_matrix,
                                               self._inputs.chars)
      shape = tf.shape(char_embeddings)
      char_embeddings = tf.reshape(
          char_embeddings,
          shape=[-1, shape[-2], self._config.char_embedding_size])
      char_reprs = []
      for filter_width in self._config.char_cnn_filter_widths:
        conv = tf.layers.conv1d(
            char_embeddings, self._config.char_cnn_n_filters, filter_width)
        conv = tf.nn.relu(conv)
        conv = tf.nn.dropout(tf.reduce_max(conv, axis=1),
                             self._inputs.keep_prob)
        conv = tf.reshape(conv, shape=[-1, shape[1],
                                       self._config.char_cnn_n_filters])
        char_reprs.append(conv)
      return tf.concat([word_embeddings] + char_reprs, axis=-1)

  def _get_unidirectional_reprs(self, word_reprs):
    with tf.variable_scope('unidirectional_reprs'):
      word_lstm_input_size = (
          self._config.word_embedding_size if not self._config.use_chars
          else (self._config.word_embedding_size +
                len(self._config.char_cnn_filter_widths) *
                self._config.char_cnn_n_filters))
      word_reprs.set_shape([None, None, word_lstm_input_size])
      (outputs_fw, outputs_bw), _ = tf.nn.bidirectional_dynamic_rnn(
          model_helpers.multi_lstm_cell(self._config.unidirectional_sizes,
                                        self._inputs.keep_prob,
                                        self._config.projection_size),
          model_helpers.multi_lstm_cell(self._config.unidirectional_sizes,
                                        self._inputs.keep_prob,
                                        self._config.projection_size),
          word_reprs,
          dtype=tf.float32,
          sequence_length=self._inputs.lengths,
          scope='unilstm')
      return outputs_fw, outputs_bw

  def _get_bidirectional_reprs(self, uni_reprs):
    with tf.variable_scope('bidirectional_reprs'):
      current_outputs = uni_reprs
      outputs_fw, outputs_bw = None, None
      for size in self._config.bidirectional_sizes:
        (outputs_fw, outputs_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            model_helpers.lstm_cell(size, self._inputs.keep_prob,
                                    self._config.projection_size),
            model_helpers.lstm_cell(size, self._inputs.keep_prob,
                                    self._config.projection_size),
            current_outputs,
            dtype=tf.float32,
            sequence_length=self._inputs.lengths,
            scope='bilstm')
        current_outputs = tf.concat([outputs_fw, outputs_bw], axis=-1)
      return outputs_fw, outputs_bw, current_outputs
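Under the default Config, the set_shape call pins the per-token input width of the first Bi-LSTM to the word embedding plus one max-pooled feature map per CNN filter width:

# 300-d GloVe vector + 3 filter widths * 100 filters each = 600 features
word_lstm_input_size = 300 + len([2, 3, 4]) * 100  # -> 600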
research/cvt_text/model/model_helpers.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for building the model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


def project(input_layers, size, name='projection'):
  return tf.add_n([tf.layers.dense(layer, size, name=name + '_' + str(i))
                   for i, layer in enumerate(input_layers)])


def lstm_cell(cell_size, keep_prob, num_proj):
  return tf.contrib.rnn.DropoutWrapper(
      tf.contrib.rnn.LSTMCell(cell_size, num_proj=min(cell_size, num_proj)),
      output_keep_prob=keep_prob)


def multi_lstm_cell(cell_sizes, keep_prob, num_proj):
  return tf.contrib.rnn.MultiRNNCell([lstm_cell(cell_size, keep_prob,
                                                num_proj)
                                      for cell_size in cell_sizes])


def masked_ce_loss(logits, labels, mask, sparse=False, roll_direction=0):
  if roll_direction != 0:
    labels = _roll(labels, roll_direction, sparse)
    mask *= _roll(mask, roll_direction, True)
  ce = ((tf.nn.sparse_softmax_cross_entropy_with_logits if sparse
         else tf.nn.softmax_cross_entropy_with_logits_v2)
        (logits=logits, labels=labels))
  return tf.reduce_sum(mask * ce) / tf.to_float(tf.reduce_sum(mask))


def _roll(arr, direction, sparse=False):
  if sparse:
    return tf.concat([arr[:, direction:], arr[:, :direction]], axis=1)
  return tf.concat([arr[:, direction:, :], arr[:, :direction, :]], axis=1)
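_roll rotates each sequence left by `direction` positions, so with roll_direction=1 a prediction at position t is scored against the label at t+1, and multiplying the mask by its rolled copy zeroes out the wrap-around position. A NumPy rendering of the same shift:

import numpy as np
labels = np.array([[1, 2, 3, 4]])
direction = 1
rolled = np.concatenate(
    [labels[:, direction:], labels[:, :direction]], axis=1)
print(rolled)  # [[2 3 4 1]]: each position now holds the next label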
research/cvt_text/model/multitask_model.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A multi-task and semi-supervised NLP model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from model import encoder
from model import shared_inputs


class Inference(object):
  def __init__(self, config, inputs, pretrained_embeddings, tasks):
    with tf.variable_scope('encoder'):
      self.encoder = encoder.Encoder(config, inputs, pretrained_embeddings)
    self.modules = {}
    for task in tasks:
      with tf.variable_scope(task.name):
        self.modules[task.name] = task.get_module(inputs, self.encoder)


class Model(object):
  def __init__(self, config, pretrained_embeddings, tasks):
    self._config = config
    self._tasks = tasks

    self._global_step, self._optimizer = self._get_optimizer()
    self._inputs = shared_inputs.Inputs(config)
    with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as scope:
      inference = Inference(config, self._inputs, pretrained_embeddings,
                            tasks)
      self._trainer = inference
      self._tester = inference
      self._teacher = inference
      if config.ema_test or config.ema_teacher:
        ema = tf.train.ExponentialMovingAverage(config.ema_decay)
        model_vars = tf.get_collection("trainable_variables", "model")
        ema_op = ema.apply(model_vars)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)

        def ema_getter(getter, name, *args, **kwargs):
          var = getter(name, *args, **kwargs)
          return ema.average(var)

        scope.set_custom_getter(ema_getter)
        inference_ema = Inference(config, self._inputs,
                                  pretrained_embeddings, tasks)
        if config.ema_teacher:
          self._teacher = inference_ema
        if config.ema_test:
          self._tester = inference_ema

      self._unlabeled_loss = self._get_consistency_loss(tasks)
      self._unlabeled_train_op = self._get_train_op(self._unlabeled_loss)
      self._labeled_train_ops = {}
      for task in self._tasks:
        task_loss = self._trainer.modules[task.name].supervised_loss
        self._labeled_train_ops[task.name] = self._get_train_op(task_loss)

  def _get_consistency_loss(self, tasks):
    return sum([self._trainer.modules[task.name].unsupervised_loss
                for task in tasks])

  def _get_optimizer(self):
    global_step = tf.get_variable('global_step', initializer=0,
                                  trainable=False)
    warm_up_multiplier = (tf.minimum(tf.to_float(global_step),
                                     self._config.warm_up_steps) /
                          self._config.warm_up_steps)
    decay_multiplier = 1.0 / (1 + self._config.lr_decay *
                              tf.sqrt(tf.to_float(global_step)))
    lr = self._config.lr * warm_up_multiplier * decay_multiplier
    optimizer = tf.train.MomentumOptimizer(lr, self._config.momentum)
    return global_step, optimizer

  def _get_train_op(self, loss):
    grads, vs = zip(*self._optimizer.compute_gradients(loss))
    grads, _ = tf.clip_by_global_norm(grads, self._config.grad_clip)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      return self._optimizer.apply_gradients(
          zip(grads, vs), global_step=self._global_step)

  def _create_feed_dict(self, mb, model, is_training=True):
    feed = self._inputs.create_feed_dict(mb, is_training)
    if mb.task_name in model.modules:
      model.modules[mb.task_name].update_feed_dict(feed, mb)
    else:
      for module in model.modules.values():
        module.update_feed_dict(feed, mb)
    return feed

  def train_unlabeled(self, sess, mb):
    return sess.run([self._unlabeled_train_op, self._unlabeled_loss],
                    feed_dict=self._create_feed_dict(mb, self._trainer))[1]

  def train_labeled(self, sess, mb):
    return sess.run([self._labeled_train_ops[mb.task_name],
                     self._trainer.modules[mb.task_name].supervised_loss],
                    feed_dict=self._create_feed_dict(mb, self._trainer))[1]

  def run_teacher(self, sess, mb):
    result = sess.run({task.name: self._teacher.modules[task.name].probs
                       for task in self._tasks},
                      feed_dict=self._create_feed_dict(mb, self._teacher,
                                                       False))
    for task_name, probs in result.iteritems():
      mb.teacher_predictions[task_name] = probs.astype('float16')

  def test(self, sess, mb):
    return sess.run(
        [self._tester.modules[mb.task_name].supervised_loss,
         self._tester.modules[mb.task_name].preds],
        feed_dict=self._create_feed_dict(mb, self._tester, False))

  def get_global_step(self, sess):
    return sess.run(self._global_step)
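The EMA trick above builds a second Inference under a custom getter that resolves every variable to its exponential-moving-average shadow, so the teacher and tester read averaged weights without extra copy ops. The pattern in isolation (a TF 1.x sketch, not part of this diff):

with tf.variable_scope('m'):
  v = tf.get_variable('v', initializer=1.0)
ema = tf.train.ExponentialMovingAverage(0.998)
ema_op = ema.apply([v])  # creates and updates the shadow variable

def ema_getter(getter, name, *args, **kwargs):
  return ema.average(getter(name, *args, **kwargs))

with tf.variable_scope('m', reuse=True, custom_getter=ema_getter):
  v_avg = tf.get_variable('v')  # resolves to the EMA shadow of v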
research/cvt_text/model/shared_inputs.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Placeholders for non-task-specific model inputs."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


class Inputs(object):
  def __init__(self, config):
    self._config = config
    self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    self.label_smoothing = tf.placeholder(tf.float32, name='label_smoothing')
    self.lengths = tf.placeholder(tf.int32, shape=[None], name='lengths')
    self.mask = tf.placeholder(tf.float32, [None, None], name='mask')
    self.words = tf.placeholder(tf.int32, shape=[None, None], name='words')
    self.chars = tf.placeholder(tf.int32, shape=[None, None, None],
                                name='chars')

  def create_feed_dict(self, mb, is_training):
    cvt = mb.task_name == 'unlabeled'
    return {
        self.keep_prob: 1.0 if not is_training else
                        (self._config.unlabeled_keep_prob if cvt else
                         self._config.labeled_keep_prob),
        self.label_smoothing: self._config.label_smoothing
                              if (is_training and not cvt) else 0.0,
        self.lengths: mb.lengths,
        self.words: mb.words,
        self.chars: mb.chars,
        self.mask: mb.mask.astype('float32')
    }
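create_feed_dict centralizes the dropout and label-smoothing schedule; with the defaults from configure.py the values work out as follows (sketch; `mb` is a Minibatch from minibatching.py):

inputs = Inputs(config)
feed = inputs.create_feed_dict(mb, is_training=True)
# keep_prob: 1.0 at test time, 0.8 (unlabeled_keep_prob) for CVT batches,
# 0.5 (labeled_keep_prob) for supervised batches; label smoothing is only
# applied on supervised training batches.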
research/cvt_text/model/task_module.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base classes for task-specific modules."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc


class SupervisedModule(object):
  __metaclass__ = abc.ABCMeta

  def __init__(self):
    self.supervised_loss = NotImplemented
    self.probs = NotImplemented
    self.preds = NotImplemented

  @abc.abstractmethod
  def update_feed_dict(self, feed, mb):
    pass


class SemiSupervisedModule(SupervisedModule):
  __metaclass__ = abc.ABCMeta

  def __init__(self):
    super(SemiSupervisedModule, self).__init__()
    self.unsupervised_loss = NotImplemented
research/cvt_text/preprocessing.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Preprocesses pretrained word embeddings, creates dev sets for tasks without a
provided one, and figures out the set of output classes for each task.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random

from base import configure
from base import embeddings
from base import utils
from task_specific.word_level import word_level_data


def main(data_dir='./data'):
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  for pretrained in ['glove.6B.300d.txt']:
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True,
                              pretrained_embeddings=pretrained,
                              word_embedding_size=300)
    embeddings.PretrainedEmbeddingLoader(config).build()

  utils.log("CONSTRUCTING DEV SETS")
  for task_name in ["chunk"]:
    # chunking does not come with a provided dev split, so create one by
    # selecting a random subset of the data
    config = configure.Config(data_dir=data_dir, for_preprocessing=True)
    task_data_dir = os.path.join(config.raw_data_topdir, task_name) + '/'
    train_sentences = word_level_data.TaggedDataLoader(
        config, task_name, False).get_labeled_sentences("train")
    random.shuffle(train_sentences)
    write_sentences(task_data_dir + 'train_subset.txt',
                    train_sentences[1500:])
    write_sentences(task_data_dir + 'dev.txt', train_sentences[:1500])

  utils.log("WRITING LABEL MAPPINGS")
  for task_name in ["chunk"]:
    for i, label_encoding in enumerate(["BIOES"]):
      config = configure.Config(data_dir=data_dir,
                                for_preprocessing=True,
                                label_encoding=label_encoding)
      token_level = task_name in ["ccg", "pos", "depparse"]
      loader = word_level_data.TaggedDataLoader(config, task_name,
                                                token_level)
      if token_level:
        if i != 0:
          continue
        utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
      else:
        utils.log("  Writing label mapping for", task_name.upper(),
                  label_encoding)
      utils.log("  ", len(loader.label_mapping), "classes")
      utils.write_cpickle(loader.label_mapping, loader.label_mapping_path)


def write_sentences(fname, sentences):
  with open(fname, 'w') as f:
    for words, tags in sentences:
      for word, tag in zip(words, tags):
        f.write(word + " " + tag + "\n")
      f.write("\n")


if __name__ == '__main__':
  main()
research/cvt_text/task_specific/__init__.py (new empty file, mode 100644)
research/cvt_text/task_specific/task_definitions.py (new file, mode 100644)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines all the tasks the model can learn."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc

from base import embeddings
from task_specific.word_level import depparse_module
from task_specific.word_level import depparse_scorer
from task_specific.word_level import tagging_module
from task_specific.word_level import tagging_scorers
from task_specific.word_level import word_level_data


class Task(object):
  __metaclass__ = abc.ABCMeta

  def __init__(self, config, name, loader):
    self.config = config
    self.name = name
    self.loader = loader
    self.train_set = self.loader.get_dataset("train")
    self.val_set = self.loader.get_dataset(
        "dev" if config.dev_set else "test")

  @abc.abstractmethod
  def get_module(self, inputs, encoder):
    pass

  @abc.abstractmethod
  def get_scorer(self):
    pass


class Tagging(Task):
  def __init__(self, config, name, is_token_level=True):
    super(Tagging, self).__init__(
        config, name,
        word_level_data.TaggedDataLoader(config, name, is_token_level))
    self.n_classes = len(set(self.loader.label_mapping.values()))
    self.is_token_level = is_token_level

  def get_module(self, inputs, encoder):
    return tagging_module.TaggingModule(
        self.config, self.name, self.n_classes, inputs, encoder)

  def get_scorer(self):
    if self.is_token_level:
      return tagging_scorers.AccuracyScorer()
    else:
      return tagging_scorers.EntityLevelF1Scorer(self.loader.label_mapping)


class DependencyParsing(Tagging):
  def __init__(self, config, name):
    super(DependencyParsing, self).__init__(config, name, True)

  def get_module(self, inputs, encoder):
    return depparse_module.DepparseModule(
        self.config, self.name, self.n_classes, inputs, encoder)

  def get_scorer(self):
    return depparse_scorer.DepparseScorer(
        self.n_classes, (embeddings.get_punctuation_ids(self.config)))


def get_task(config, name):
  if name in ["ccg", "pos"]:
    return Tagging(config, name, True)
  elif name in ["chunk", "ner", "er"]:
    return Tagging(config, name, False)
  elif name == "depparse":
    return DependencyParsing(config, name)
  else:
    raise ValueError("Unknown task", name)
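get_task is the factory that presumably turns config.task_names into live task objects elsewhere in the codebase (the trainer is not on this page of the diff); a sketch:

tasks = [get_task(config, name) for name in config.task_names]
for task in tasks:
  scorer = task.get_scorer()  # token-level accuracy or entity-level F1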
research/cvt_text/task_specific/word_level/__init__.py (new empty file, mode 100644)
(Diff page 1 of 12; the remaining changed files are on later pages.)