Commit f5fc733a authored by Byzantine

Removing research/community models

parent 09bc9f54
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base class for training examples."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from base import embeddings
CONTRACTION_WORDS = set(w + 'n' for w in
['do', 'does', 'did', 'is', 'are', 'was', 'were', 'has',
'have', 'had', 'could', 'would', 'should', 'ca', 'wo',
'ai', 'might'])
class Example(object):
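  """A single sentence: word ids (with start/end markers) and per-word char ids."""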
def __init__(self, words, word_vocab, char_vocab):
words = words[:]
# Fix inconsistent tokenization between datasets
for i in range(len(words)):
if (words[i].lower() == '\'t' and i > 0 and
words[i - 1].lower() in CONTRACTION_WORDS):
words[i] = words[i - 1][-1] + words[i]
words[i - 1] = words[i - 1][:-1]
self.words = ([embeddings.START] +
[word_vocab[embeddings.normalize_word(w)] for w in words] +
[embeddings.END])
self.chars = ([[embeddings.MISSING]] +
[[char_vocab[c] for c in embeddings.normalize_chars(w)]
for w in words] +
[[embeddings.MISSING]])
  def __repr__(self):
inv_char_vocab = embeddings.get_inv_char_vocab()
return ' '.join([''.join([inv_char_vocab[c] for c in w])
for w in self.chars])
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for constructing minibatches."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import random
import numpy as np
from base import embeddings
def get_bucket(config, l):
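  """Returns the (start, end) bucket from config.buckets containing length l."""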
for i, (s, e) in enumerate(config.buckets):
if s <= l < e:
return config.buckets[i]
def build_array(nested_lists, dtype='int32'):
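  """Packs a ragged nested list into a dense, zero-padded numpy array."""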
depth_to_sizes = collections.defaultdict(set)
_get_sizes(nested_lists, depth_to_sizes)
shape = [max(depth_to_sizes[depth]) for depth in range(len(depth_to_sizes))]
copy_depth = len(depth_to_sizes) - 1
while copy_depth > 0 and len(depth_to_sizes[copy_depth]) == 1:
copy_depth -= 1
arr = np.zeros(shape, dtype=dtype)
_fill_array(nested_lists, arr, copy_depth)
return arr
def _get_sizes(nested_lists, depth_to_sizes, depth=0):
depth_to_sizes[depth].add(len(nested_lists))
first_elem = nested_lists[0]
  if isinstance(first_elem, (collections.Sequence, np.ndarray)):
for sublist in nested_lists:
_get_sizes(sublist, depth_to_sizes, depth + 1)
def _fill_array(nested_lists, arr, copy_depth, depth=0):
if depth == copy_depth:
for i in range(len(nested_lists)):
if isinstance(nested_lists[i], np.ndarray):
arr[i] = nested_lists[i]
else:
arr[i] = np.array(nested_lists[i])
else:
for i in range(len(nested_lists)):
_fill_array(nested_lists[i], arr[i], copy_depth, depth + 1)
class Dataset(object):
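  """A set of Examples for one task, iterable as length-bucketed minibatches."""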
def __init__(self, config, examples, task_name='unlabeled', is_training=False):
self._config = config
self.examples = examples
self.size = len(examples)
self.task_name = task_name
self.is_training = is_training
def get_minibatches(self, minibatch_size):
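    """Yields shuffled minibatches, with examples grouped by sentence-length bucket."""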
by_bucket = collections.defaultdict(list)
for i, e in enumerate(self.examples):
by_bucket[get_bucket(self._config, len(e.words))].append(i)
# save memory by weighting examples so longer sentences have
# smaller minibatches.
weight = lambda ind: np.sqrt(len(self.examples[ind].words))
total_weight = float(sum(weight(i) for i in range(len(self.examples))))
weight_per_batch = minibatch_size * total_weight / len(self.examples)
cumulative_weight = 0.0
id_batches = []
    for _, ids in by_bucket.items():
ids = np.array(ids)
np.random.shuffle(ids)
curr_batch, curr_weight = [], 0.0
for i, curr_id in enumerate(ids):
curr_batch.append(curr_id)
curr_weight += weight(curr_id)
if (i == len(ids) - 1 or cumulative_weight + curr_weight >=
(len(id_batches) + 1) * weight_per_batch):
cumulative_weight += curr_weight
id_batches.append(np.array(curr_batch))
curr_batch, curr_weight = [], 0.0
random.shuffle(id_batches)
for id_batch in id_batches:
yield self._make_minibatch(id_batch)
def endless_minibatches(self, minibatch_size):
while True:
for mb in self.get_minibatches(minibatch_size):
yield mb
def _make_minibatch(self, ids):
examples = [self.examples[i] for i in ids]
sentence_lengths = np.array([len(e.words) for e in examples])
max_word_length = min(max(max(len(word) for word in e.chars)
for e in examples),
self._config.max_word_length)
characters = [[[embeddings.PAD] + [embeddings.START] + w[:max_word_length] +
[embeddings.END] + [embeddings.PAD] for w in e.chars]
for e in examples]
# the first and last words are masked because they are start/end tokens
mask = build_array([[0] + [1] * (length - 2) + [0]
for length in sentence_lengths])
words = build_array([e.words for e in examples])
chars = build_array(characters, dtype='int16')
return Minibatch(
task_name=self.task_name,
size=ids.size,
examples=examples,
ids=ids,
teacher_predictions={},
words=words,
chars=chars,
lengths=sentence_lengths,
mask=mask,
)
Minibatch = collections.namedtuple('Minibatch', [
'task_name', 'size', 'examples', 'ids', 'teacher_predictions',
'words', 'chars', 'lengths', 'mask'
])
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Abstract base class for evaluation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import abc
class Scorer(object):
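  """Accumulates model predictions over minibatches and computes metrics."""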
__metaclass__ = abc.ABCMeta
def __init__(self):
self._updated = False
self._cached_results = {}
@abc.abstractmethod
def update(self, examples, predictions, loss):
self._updated = True
@abc.abstractmethod
def get_loss(self):
pass
@abc.abstractmethod
def _get_results(self):
return []
def get_results(self, prefix=""):
results = self._get_results() if self._updated else self._cached_results
self._cached_results = results
self._updated = False
return [(prefix + k, v) for k, v in results]
def results_str(self):
return " - ".join(["{:}: {:.2f}".format(k, v)
for k, v in self.get_results()])
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Reads data from a large unlabeled corpus."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tensorflow as tf
from base import embeddings
from corpus_processing import example
from corpus_processing import minibatching
class UnlabeledDataReader(object):
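  """Streams sentences from the unlabeled corpus, remembering its file/line position."""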
def __init__(self, config, starting_file=0, starting_line=0, one_pass=False):
self.config = config
self.current_file = starting_file
self.current_line = starting_line
self._one_pass = one_pass
def endless_minibatches(self):
for examples in self.get_unlabeled_examples():
d = minibatching.Dataset(self.config, examples, 'unlabeled')
for mb in d.get_minibatches(self.config.train_batch_size):
yield mb
def _make_examples(self, sentences):
word_vocab = embeddings.get_word_vocab(self.config)
char_vocab = embeddings.get_char_vocab()
return [
example.Example(sentence, word_vocab, char_vocab)
for sentence in sentences
]
def get_unlabeled_examples(self):
lines = []
for words in self.get_unlabeled_sentences():
lines.append(words)
if len(lines) >= 10000:
yield self._make_examples(lines)
lines = []
def get_unlabeled_sentences(self):
while True:
file_ids_and_names = sorted([
(int(fname.split('-')[1].replace('.txt', '')), fname) for fname in
tf.gfile.ListDirectory(self.config.unsupervised_data)])
for fid, fname in file_ids_and_names:
if fid < self.current_file:
continue
self.current_file = fid
self.current_line = 0
with tf.gfile.FastGFile(os.path.join(self.config.unsupervised_data,
fname), 'r') as f:
for i, line in enumerate(f):
if i < self.current_line:
continue
self.current_line = i
words = line.strip().split()
if len(words) < self.config.max_sentence_length:
yield words
self.current_file = 0
self.current_line = 0
if self._one_pass:
break
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Run training and evaluation for CVT text models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from base import configure
from base import utils
from training import trainer
from training import training_progress
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('mode', 'train', '"train" or "eval')
tf.app.flags.DEFINE_string('model_name', 'default_model',
'A name identifying the model being '
'trained/evaluated')
def main():
utils.heading('SETUP')
config = configure.Config(mode=FLAGS.mode, model_name=FLAGS.model_name)
config.write()
with tf.Graph().as_default() as graph:
model_trainer = trainer.Trainer(config)
summary_writer = tf.summary.FileWriter(config.summaries_dir)
checkpoints_saver = tf.train.Saver(max_to_keep=1)
best_model_saver = tf.train.Saver(max_to_keep=1)
init_op = tf.global_variables_initializer()
graph.finalize()
with tf.Session() as sess:
sess.run(init_op)
progress = training_progress.TrainingProgress(
config, sess, checkpoints_saver, best_model_saver,
config.mode == 'train')
utils.log()
if config.mode == 'train':
utils.heading('START TRAINING ({:})'.format(config.model_name))
model_trainer.train(sess, progress, summary_writer)
elif config.mode == 'eval':
utils.heading('RUN EVALUATION ({:})'.format(config.model_name))
progress.best_model_saver.restore(sess, tf.train.latest_checkpoint(
config.checkpoints_dir))
model_trainer.evaluate_all_tasks(sess, summary_writer, None)
else:
raise ValueError('Mode must be "train" or "eval"')
if __name__ == '__main__':
main()
#!/bin/bash
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
TOPDIR='./data'
RUNDIR=${PWD}
mkdir -p ${TOPDIR}
cd ${TOPDIR}
mkdir -p raw_data
mkdir -p raw_data/pretrained_embeddings
mkdir -p raw_data/unlabeled_data
mkdir -p raw_data/chunk
cd ${RUNDIR}
echo "Preparing GloVe embeddings"
cd "${TOPDIR}/raw_data/pretrained_embeddings"
curl -OL http://nlp.stanford.edu/data/glove.6B.zip
unzip glove.6B.zip
cd ${RUNDIR}
echo
echo "Preparing lm1b corpus"
cd "${TOPDIR}/raw_data/unlabeled_data"
curl -OL http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar xzf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
cd ${RUNDIR}
echo
echo "Preparing chunking corpus"
cd "${TOPDIR}/raw_data/chunk"
curl -OL https://www.clips.uantwerpen.be/conll2000/chunking/train.txt.gz
curl -OL http://www.clips.uantwerpen.be/conll2000/chunking/test.txt.gz
gunzip *
cd ${RUNDIR}
echo
echo "Done with data fetching!"
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""CNN-BiLSTM sentence encoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from base import embeddings
from model import model_helpers
class Encoder(object):
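  """Embeds words (optionally with a char CNN) and runs stacked uni- and
  bidirectional LSTMs to produce the shared sentence representations."""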
def __init__(self, config, inputs, pretrained_embeddings):
self._config = config
self._inputs = inputs
self.word_reprs = self._get_word_reprs(pretrained_embeddings)
self.uni_fw, self.uni_bw = self._get_unidirectional_reprs(self.word_reprs)
self.uni_reprs = tf.concat([self.uni_fw, self.uni_bw], axis=-1)
self.bi_fw, self.bi_bw, self.bi_reprs = self._get_bidirectional_reprs(
self.uni_reprs)
def _get_word_reprs(self, pretrained_embeddings):
with tf.variable_scope('word_embeddings'):
word_embedding_matrix = tf.get_variable(
'word_embedding_matrix', initializer=pretrained_embeddings)
word_embeddings = tf.nn.embedding_lookup(
word_embedding_matrix, self._inputs.words)
word_embeddings = tf.nn.dropout(word_embeddings, self._inputs.keep_prob)
word_embeddings *= tf.get_variable('emb_scale', initializer=1.0)
if not self._config.use_chars:
return word_embeddings
with tf.variable_scope('char_embeddings'):
char_embedding_matrix = tf.get_variable(
'char_embeddings',
shape=[embeddings.NUM_CHARS, self._config.char_embedding_size])
char_embeddings = tf.nn.embedding_lookup(char_embedding_matrix,
self._inputs.chars)
shape = tf.shape(char_embeddings)
char_embeddings = tf.reshape(
char_embeddings,
shape=[-1, shape[-2], self._config.char_embedding_size])
char_reprs = []
for filter_width in self._config.char_cnn_filter_widths:
conv = tf.layers.conv1d(
char_embeddings, self._config.char_cnn_n_filters, filter_width)
conv = tf.nn.relu(conv)
conv = tf.nn.dropout(tf.reduce_max(conv, axis=1),
self._inputs.keep_prob)
conv = tf.reshape(conv, shape=[-1, shape[1],
self._config.char_cnn_n_filters])
char_reprs.append(conv)
return tf.concat([word_embeddings] + char_reprs, axis=-1)
def _get_unidirectional_reprs(self, word_reprs):
with tf.variable_scope('unidirectional_reprs'):
word_lstm_input_size = (
self._config.word_embedding_size if not self._config.use_chars else
(self._config.word_embedding_size +
len(self._config.char_cnn_filter_widths)
* self._config.char_cnn_n_filters))
word_reprs.set_shape([None, None, word_lstm_input_size])
(outputs_fw, outputs_bw), _ = tf.nn.bidirectional_dynamic_rnn(
model_helpers.multi_lstm_cell(self._config.unidirectional_sizes,
self._inputs.keep_prob,
self._config.projection_size),
model_helpers.multi_lstm_cell(self._config.unidirectional_sizes,
self._inputs.keep_prob,
self._config.projection_size),
word_reprs,
dtype=tf.float32,
sequence_length=self._inputs.lengths,
scope='unilstm'
)
return outputs_fw, outputs_bw
def _get_bidirectional_reprs(self, uni_reprs):
with tf.variable_scope('bidirectional_reprs'):
current_outputs = uni_reprs
outputs_fw, outputs_bw = None, None
for size in self._config.bidirectional_sizes:
(outputs_fw, outputs_bw), _ = tf.nn.bidirectional_dynamic_rnn(
model_helpers.lstm_cell(size, self._inputs.keep_prob,
self._config.projection_size),
model_helpers.lstm_cell(size, self._inputs.keep_prob,
self._config.projection_size),
current_outputs,
dtype=tf.float32,
sequence_length=self._inputs.lengths,
scope='bilstm'
)
current_outputs = tf.concat([outputs_fw, outputs_bw], axis=-1)
return outputs_fw, outputs_bw, current_outputs
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for building the model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def project(input_layers, size, name='projection'):
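  """Projects each input layer to `size` dimensions and sums the projections."""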
return tf.add_n([tf.layers.dense(layer, size, name=name + '_' + str(i))
for i, layer in enumerate(input_layers)])
def lstm_cell(cell_size, keep_prob, num_proj):
return tf.contrib.rnn.DropoutWrapper(
tf.contrib.rnn.LSTMCell(cell_size, num_proj=min(cell_size, num_proj)),
output_keep_prob=keep_prob)
def multi_lstm_cell(cell_sizes, keep_prob, num_proj):
return tf.contrib.rnn.MultiRNNCell([lstm_cell(cell_size, keep_prob, num_proj)
for cell_size in cell_sizes])
def masked_ce_loss(logits, labels, mask, sparse=False, roll_direction=0):
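  """Cross-entropy loss averaged over unmasked positions; a nonzero
  roll_direction shifts the labels so the module predicts a neighboring word's
  label (e.g., the next word's label for roll_direction=1)."""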
if roll_direction != 0:
labels = _roll(labels, roll_direction, sparse)
mask *= _roll(mask, roll_direction, True)
ce = ((tf.nn.sparse_softmax_cross_entropy_with_logits if sparse
else tf.nn.softmax_cross_entropy_with_logits_v2)
(logits=logits, labels=labels))
return tf.reduce_sum(mask * ce) / tf.to_float(tf.reduce_sum(mask))
def _roll(arr, direction, sparse=False):
if sparse:
return tf.concat([arr[:, direction:], arr[:, :direction]], axis=1)
return tf.concat([arr[:, direction:, :], arr[:, :direction, :]], axis=1)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A multi-task and semi-supervised NLP model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from model import encoder
from model import shared_inputs
class Inference(object):
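  """Runs the shared encoder and builds each task's prediction module."""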
def __init__(self, config, inputs, pretrained_embeddings, tasks):
with tf.variable_scope('encoder'):
self.encoder = encoder.Encoder(config, inputs, pretrained_embeddings)
self.modules = {}
for task in tasks:
with tf.variable_scope(task.name):
self.modules[task.name] = task.get_module(inputs, self.encoder)
class Model(object):
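  """The full multi-task model: a shared encoder, per-task modules, optional
  exponential-moving-average weights for testing/teaching, and train ops."""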
def __init__(self, config, pretrained_embeddings, tasks):
self._config = config
self._tasks = tasks
self._global_step, self._optimizer = self._get_optimizer()
self._inputs = shared_inputs.Inputs(config)
with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as scope:
inference = Inference(config, self._inputs, pretrained_embeddings,
tasks)
self._trainer = inference
self._tester = inference
self._teacher = inference
if config.ema_test or config.ema_teacher:
ema = tf.train.ExponentialMovingAverage(config.ema_decay)
model_vars = tf.get_collection("trainable_variables", "model")
ema_op = ema.apply(model_vars)
tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)
def ema_getter(getter, name, *args, **kwargs):
var = getter(name, *args, **kwargs)
return ema.average(var)
scope.set_custom_getter(ema_getter)
inference_ema = Inference(
config, self._inputs, pretrained_embeddings, tasks)
if config.ema_teacher:
self._teacher = inference_ema
if config.ema_test:
self._tester = inference_ema
self._unlabeled_loss = self._get_consistency_loss(tasks)
self._unlabeled_train_op = self._get_train_op(self._unlabeled_loss)
self._labeled_train_ops = {}
for task in self._tasks:
task_loss = self._trainer.modules[task.name].supervised_loss
self._labeled_train_ops[task.name] = self._get_train_op(task_loss)
def _get_consistency_loss(self, tasks):
return sum([self._trainer.modules[task.name].unsupervised_loss
for task in tasks])
def _get_optimizer(self):
global_step = tf.get_variable('global_step', initializer=0, trainable=False)
warm_up_multiplier = (tf.minimum(tf.to_float(global_step),
self._config.warm_up_steps)
/ self._config.warm_up_steps)
decay_multiplier = 1.0 / (1 + self._config.lr_decay *
tf.sqrt(tf.to_float(global_step)))
lr = self._config.lr * warm_up_multiplier * decay_multiplier
optimizer = tf.train.MomentumOptimizer(lr, self._config.momentum)
return global_step, optimizer
def _get_train_op(self, loss):
grads, vs = zip(*self._optimizer.compute_gradients(loss))
grads, _ = tf.clip_by_global_norm(grads, self._config.grad_clip)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
return self._optimizer.apply_gradients(
zip(grads, vs), global_step=self._global_step)
def _create_feed_dict(self, mb, model, is_training=True):
feed = self._inputs.create_feed_dict(mb, is_training)
if mb.task_name in model.modules:
model.modules[mb.task_name].update_feed_dict(feed, mb)
else:
for module in model.modules.values():
module.update_feed_dict(feed, mb)
return feed
def train_unlabeled(self, sess, mb):
return sess.run([self._unlabeled_train_op, self._unlabeled_loss],
feed_dict=self._create_feed_dict(mb, self._trainer))[1]
def train_labeled(self, sess, mb):
return sess.run([self._labeled_train_ops[mb.task_name],
self._trainer.modules[mb.task_name].supervised_loss,],
feed_dict=self._create_feed_dict(mb, self._trainer))[1]
def run_teacher(self, sess, mb):
result = sess.run({task.name: self._teacher.modules[task.name].probs
for task in self._tasks},
feed_dict=self._create_feed_dict(mb, self._teacher,
False))
    for task_name, probs in result.items():
mb.teacher_predictions[task_name] = probs.astype('float16')
def test(self, sess, mb):
return sess.run(
[self._tester.modules[mb.task_name].supervised_loss,
self._tester.modules[mb.task_name].preds],
feed_dict=self._create_feed_dict(mb, self._tester, False))
def get_global_step(self, sess):
return sess.run(self._global_step)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Placeholders for non-task-specific model inputs."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
class Inputs(object):
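  """Placeholders used by every task: dropout keep prob, label smoothing,
  sentence lengths, padding mask, and word/char ids."""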
def __init__(self, config):
self._config = config
self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
self.label_smoothing = tf.placeholder(tf.float32, name='label_smoothing')
self.lengths = tf.placeholder(tf.int32, shape=[None], name='lengths')
self.mask = tf.placeholder(tf.float32, [None, None], name='mask')
self.words = tf.placeholder(tf.int32, shape=[None, None], name='words')
self.chars = tf.placeholder(tf.int32, shape=[None, None, None],
name='chars')
def create_feed_dict(self, mb, is_training):
cvt = mb.task_name == 'unlabeled'
return {
self.keep_prob: 1.0 if not is_training else
(self._config.unlabeled_keep_prob if cvt else
self._config.labeled_keep_prob),
self.label_smoothing: self._config.label_smoothing
if (is_training and not cvt) else 0.0,
self.lengths: mb.lengths,
self.words: mb.words,
self.chars: mb.chars,
self.mask: mb.mask.astype('float32')
}
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base classes for task-specific modules."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import abc
class SupervisedModule(object):
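  """A task-specific module trained on labeled examples."""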
__metaclass__ = abc.ABCMeta
def __init__(self):
self.supervised_loss = NotImplemented
self.probs = NotImplemented
self.preds = NotImplemented
@abc.abstractmethod
def update_feed_dict(self, feed, mb):
pass
class SemiSupervisedModule(SupervisedModule):
__metaclass__ = abc.ABCMeta
def __init__(self):
super(SemiSupervisedModule, self).__init__()
self.unsupervised_loss = NotImplemented
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Preprocesses pretrained word embeddings, creates dev sets for tasks without a
provided one, and figures out the set of output classes for each task.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import random
from base import configure
from base import embeddings
from base import utils
from task_specific.word_level import word_level_data
def main(data_dir='./data'):
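  """Prepares embeddings, dev splits, and label mappings for all tasks."""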
random.seed(0)
utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
for pretrained in ['glove.6B.300d.txt']:
config = configure.Config(data_dir=data_dir,
for_preprocessing=True,
pretrained_embeddings=pretrained,
word_embedding_size=300)
embeddings.PretrainedEmbeddingLoader(config).build()
utils.log("CONSTRUCTING DEV SETS")
for task_name in ["chunk"]:
# chunking does not come with a provided dev split, so create one by
# selecting a random subset of the data
config = configure.Config(data_dir=data_dir,
for_preprocessing=True)
task_data_dir = os.path.join(config.raw_data_topdir, task_name) + '/'
train_sentences = word_level_data.TaggedDataLoader(
config, task_name, False).get_labeled_sentences("train")
random.shuffle(train_sentences)
write_sentences(task_data_dir + 'train_subset.txt', train_sentences[1500:])
write_sentences(task_data_dir + 'dev.txt', train_sentences[:1500])
utils.log("WRITING LABEL MAPPINGS")
for task_name in ["chunk"]:
for i, label_encoding in enumerate(["BIOES"]):
config = configure.Config(data_dir=data_dir,
for_preprocessing=True,
label_encoding=label_encoding)
token_level = task_name in ["ccg", "pos", "depparse"]
loader = word_level_data.TaggedDataLoader(config, task_name, token_level)
if token_level:
if i != 0:
continue
utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
else:
utils.log(" Writing label mapping for", task_name.upper(),
label_encoding)
utils.log(" ", len(loader.label_mapping), "classes")
utils.write_cpickle(loader.label_mapping,
loader.label_mapping_path)
def write_sentences(fname, sentences):
with open(fname, 'w') as f:
for words, tags in sentences:
for word, tag in zip(words, tags):
f.write(word + " " + tag + "\n")
f.write("\n")
if __name__ == '__main__':
main()
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines all the tasks the model can learn."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import abc
from base import embeddings
from task_specific.word_level import depparse_module
from task_specific.word_level import depparse_scorer
from task_specific.word_level import tagging_module
from task_specific.word_level import tagging_scorers
from task_specific.word_level import word_level_data
class Task(object):
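  """A learnable task: its config, data loader, and train/eval datasets."""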
__metaclass__ = abc.ABCMeta
def __init__(self, config, name, loader):
self.config = config
self.name = name
self.loader = loader
self.train_set = self.loader.get_dataset("train")
self.val_set = self.loader.get_dataset("dev" if config.dev_set else "test")
@abc.abstractmethod
def get_module(self, inputs, encoder):
pass
@abc.abstractmethod
def get_scorer(self):
pass
class Tagging(Task):
def __init__(self, config, name, is_token_level=True):
super(Tagging, self).__init__(
config, name, word_level_data.TaggedDataLoader(
config, name, is_token_level))
self.n_classes = len(set(self.loader.label_mapping.values()))
self.is_token_level = is_token_level
def get_module(self, inputs, encoder):
return tagging_module.TaggingModule(
self.config, self.name, self.n_classes, inputs, encoder)
def get_scorer(self):
if self.is_token_level:
return tagging_scorers.AccuracyScorer()
else:
return tagging_scorers.EntityLevelF1Scorer(self.loader.label_mapping)
class DependencyParsing(Tagging):
def __init__(self, config, name):
super(DependencyParsing, self).__init__(config, name, True)
def get_module(self, inputs, encoder):
return depparse_module.DepparseModule(
self.config, self.name, self.n_classes, inputs, encoder)
def get_scorer(self):
return depparse_scorer.DepparseScorer(
self.n_classes, (embeddings.get_punctuation_ids(self.config)))
def get_task(config, name):
if name in ["ccg", "pos"]:
return Tagging(config, name, True)
elif name in ["chunk", "ner", "er"]:
return Tagging(config, name, False)
elif name == "depparse":
return DependencyParsing(config, name)
else:
raise ValueError("Unknown task", name)
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Dependency parsing module."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from corpus_processing import minibatching
from model import model_helpers
from model import task_module
class DepparseModule(task_module.SemiSupervisedModule):
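  """Scores (head, relation) arcs for each word with a bilinear classifier;
  auxiliary prediction modules see restricted views of the encoder outputs."""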
def __init__(self, config, task_name, n_classes, inputs, encoder):
super(DepparseModule, self).__init__()
self.task_name = task_name
self.n_classes = n_classes
self.labels = labels = tf.placeholder(tf.float32, [None, None, None],
name=task_name + '_labels')
class PredictionModule(object):
def __init__(self, name, dep_reprs, head_reprs, roll_direction=0):
self.name = name
with tf.variable_scope(name + '/predictions'):
# apply hidden layers to the input representations
arc_dep_hidden = model_helpers.project(
dep_reprs, config.projection_size, 'arc_dep_hidden')
arc_head_hidden = model_helpers.project(
head_reprs, config.projection_size, 'arc_head_hidden')
arc_dep_hidden = tf.nn.relu(arc_dep_hidden)
arc_head_hidden = tf.nn.relu(arc_head_hidden)
arc_head_hidden = tf.nn.dropout(arc_head_hidden, inputs.keep_prob)
arc_dep_hidden = tf.nn.dropout(arc_dep_hidden, inputs.keep_prob)
# bilinear classifier excluding the final dot product
arc_head = tf.layers.dense(
arc_head_hidden, config.depparse_projection_size, name='arc_head')
W = tf.get_variable('shared_W',
shape=[config.projection_size, n_classes,
config.depparse_projection_size])
Wr = tf.get_variable('relation_specific_W',
shape=[config.projection_size,
config.depparse_projection_size])
Wr_proj = tf.tile(tf.expand_dims(Wr, axis=-2), [1, n_classes, 1])
W += Wr_proj
arc_dep = tf.tensordot(arc_dep_hidden, W, axes=[[-1], [0]])
shape = tf.shape(arc_dep)
arc_dep = tf.reshape(arc_dep,
[shape[0], -1, config.depparse_projection_size])
# apply the transformer scaling trick to prevent dot products from
# getting too large (possibly not necessary)
scale = np.power(
config.depparse_projection_size, 0.25).astype('float32')
scale = tf.get_variable('scale', initializer=scale, dtype=tf.float32)
arc_dep /= scale
arc_head /= scale
# compute the scores for each candidate arc
word_scores = tf.matmul(arc_head, arc_dep, transpose_b=True)
root_scores = tf.layers.dense(arc_head, n_classes, name='root_score')
arc_scores = tf.concat([root_scores, word_scores], axis=-1)
# disallow the model from making impossible predictions
mask = inputs.mask
mask_shape = tf.shape(mask)
mask = tf.tile(tf.expand_dims(mask, -1), [1, 1, n_classes])
mask = tf.reshape(mask, [-1, mask_shape[1] * n_classes])
mask = tf.concat([tf.ones((mask_shape[0], 1)),
tf.zeros((mask_shape[0], n_classes - 1)), mask],
axis=1)
mask = tf.tile(tf.expand_dims(mask, 1), [1, mask_shape[1], 1])
arc_scores += (mask - 1) * 100.0
self.logits = arc_scores
self.loss = model_helpers.masked_ce_loss(
self.logits, labels, inputs.mask,
roll_direction=roll_direction)
primary = PredictionModule(
'primary',
[encoder.uni_reprs, encoder.bi_reprs],
[encoder.uni_reprs, encoder.bi_reprs])
ps = [
PredictionModule(
'full',
[encoder.uni_reprs, encoder.bi_reprs],
[encoder.uni_reprs, encoder.bi_reprs]),
PredictionModule('fw_fw', [encoder.uni_fw], [encoder.uni_fw]),
PredictionModule('fw_bw', [encoder.uni_fw], [encoder.uni_bw]),
PredictionModule('bw_fw', [encoder.uni_bw], [encoder.uni_fw]),
PredictionModule('bw_bw', [encoder.uni_bw], [encoder.uni_bw]),
]
self.unsupervised_loss = sum(p.loss for p in ps)
self.supervised_loss = primary.loss
self.probs = tf.nn.softmax(primary.logits)
self.preds = tf.argmax(primary.logits, axis=-1)
def update_feed_dict(self, feed, mb):
if self.task_name in mb.teacher_predictions:
feed[self.labels] = mb.teacher_predictions[self.task_name]
elif mb.task_name != 'unlabeled':
labels = minibatching.build_array(
[[0] + e.labels + [0] for e in mb.examples])
feed[self.labels] = np.eye(
(1 + mb.words.shape[1]) * self.n_classes)[labels]
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Dependency parsing evaluation (computes UAS/LAS)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from task_specific.word_level import word_level_scorer
class DepparseScorer(word_level_scorer.WordLevelScorer):
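  """Computes labeled and unlabeled attachment scores, skipping punctuation."""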
def __init__(self, n_relations, punctuation):
super(DepparseScorer, self).__init__()
self._n_relations = n_relations
    self._punctuation = punctuation if punctuation else set()
def _get_results(self):
correct_unlabeled, correct_labeled, count = 0, 0, 0
for example, preds in zip(self._examples, self._preds):
for w, y_true, y_pred in zip(example.words[1:-1], example.labels, preds):
if w in self._punctuation:
continue
count += 1
correct_labeled += (1 if y_pred == y_true else 0)
correct_unlabeled += (1 if int(y_pred // self._n_relations) ==
int(y_true // self._n_relations) else 0)
return [
("las", 100.0 * correct_labeled / count),
("uas", 100.0 * correct_unlabeled / count),
("loss", self.get_loss()),
]
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Sequence tagging module."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from corpus_processing import minibatching
from model import model_helpers
from model import task_module
class TaggingModule(task_module.SemiSupervisedModule):
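  """Per-word softmax tagger; auxiliary prediction modules tag the sentence
  from forward-only, backward-only, or time-shifted encoder outputs."""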
def __init__(self, config, task_name, n_classes, inputs,
encoder):
super(TaggingModule, self).__init__()
self.task_name = task_name
self.n_classes = n_classes
self.labels = labels = tf.placeholder(tf.float32, [None, None, None],
name=task_name + '_labels')
class PredictionModule(object):
def __init__(self, name, input_reprs, roll_direction=0, activate=True):
self.name = name
with tf.variable_scope(name + '/predictions'):
projected = model_helpers.project(input_reprs, config.projection_size)
if activate:
projected = tf.nn.relu(projected)
self.logits = tf.layers.dense(projected, n_classes, name='predict')
targets = labels
targets *= (1 - inputs.label_smoothing)
targets += inputs.label_smoothing / n_classes
self.loss = model_helpers.masked_ce_loss(
self.logits, targets, inputs.mask, roll_direction=roll_direction)
primary = PredictionModule('primary',
([encoder.uni_reprs, encoder.bi_reprs]))
ps = [
PredictionModule('full', ([encoder.uni_reprs, encoder.bi_reprs]),
activate=False),
PredictionModule('forwards', [encoder.uni_fw]),
PredictionModule('backwards', [encoder.uni_bw]),
PredictionModule('future', [encoder.uni_fw], roll_direction=1),
PredictionModule('past', [encoder.uni_bw], roll_direction=-1),
]
self.unsupervised_loss = sum(p.loss for p in ps)
self.supervised_loss = primary.loss
self.probs = tf.nn.softmax(primary.logits)
self.preds = tf.argmax(primary.logits, axis=-1)
def update_feed_dict(self, feed, mb):
if self.task_name in mb.teacher_predictions:
feed[self.labels] = mb.teacher_predictions[self.task_name]
elif mb.task_name != 'unlabeled':
labels = minibatching.build_array(
[[0] + e.labels + [0] for e in mb.examples])
feed[self.labels] = np.eye(self.n_classes)[labels]
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Sequence tagging evaluation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import abc
from task_specific.word_level import tagging_utils
from task_specific.word_level import word_level_scorer
class AccuracyScorer(word_level_scorer.WordLevelScorer):
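  """Token-level accuracy; gold labels equal to auto_fail_label always count
  as errors."""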
def __init__(self, auto_fail_label=None):
super(AccuracyScorer, self).__init__()
self._auto_fail_label = auto_fail_label
def _get_results(self):
correct, count = 0, 0
for example, preds in zip(self._examples, self._preds):
for y_true, y_pred in zip(example.labels, preds):
count += 1
correct += (1 if y_pred == y_true and y_true != self._auto_fail_label
else 0)
return [
("accuracy", 100.0 * correct / count),
("loss", self.get_loss())
]
class F1Scorer(word_level_scorer.WordLevelScorer):
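  """Computes precision, recall, and F1 from correct/predicted/gold counts."""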
__metaclass__ = abc.ABCMeta
def __init__(self):
super(F1Scorer, self).__init__()
self._n_correct, self._n_predicted, self._n_gold = 0, 0, 0
def _get_results(self):
if self._n_correct == 0:
p, r, f1 = 0, 0, 0
else:
p = 100.0 * self._n_correct / self._n_predicted
r = 100.0 * self._n_correct / self._n_gold
f1 = 2 * p * r / (p + r)
return [
("precision", p),
("recall", r),
("f1", f1),
("loss", self.get_loss()),
]
class EntityLevelF1Scorer(F1Scorer):
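  """F1 over predicted entity spans rather than individual tokens."""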
def __init__(self, label_mapping):
super(EntityLevelF1Scorer, self).__init__()
    self._inv_label_mapping = {v: k for k, v in label_mapping.items()}
def _get_results(self):
self._n_correct, self._n_predicted, self._n_gold = 0, 0, 0
for example, preds in zip(self._examples, self._preds):
sent_spans = set(tagging_utils.get_span_labels(
example.labels, self._inv_label_mapping))
span_preds = set(tagging_utils.get_span_labels(
preds, self._inv_label_mapping))
self._n_correct += len(sent_spans & span_preds)
self._n_gold += len(sent_spans)
self._n_predicted += len(span_preds)
return super(EntityLevelF1Scorer, self)._get_results()