Commit 688b6eac authored by SWHL

Update files
#include "bounded_sequence_encoding.hh"
#include "../../util/scoped.hh"
#define BOOST_TEST_MODULE BoundedSequenceEncodingTest
#include <boost/test/unit_test.hpp>
namespace lm {
namespace interpolate {
namespace {
BOOST_AUTO_TEST_CASE(Simple) {
unsigned char bounds[] = {2};
BoundedSequenceEncoding enc(bounds, bounds + 1);
util::scoped_malloc backing(util::MallocOrThrow(enc.EncodedLength()));
unsigned char input = 1;
enc.Encode(&input, backing.get());
unsigned char output;
enc.Decode(backing.get(), &output);
BOOST_CHECK_EQUAL(1, output);
}
void ExhaustiveTest(unsigned char *bound_begin, unsigned char *bound_end) {
BoundedSequenceEncoding enc(bound_begin, bound_end);
util::scoped_malloc backing(util::MallocOrThrow(enc.EncodedLength()));
std::vector<unsigned char> values(bound_end - bound_begin),
out(bound_end - bound_begin);
while (true) {
enc.Encode(&values[0], backing.get());
enc.Decode(backing.get(), &out[0]);
for (std::size_t i = 0; i != values.size(); ++i) {
BOOST_CHECK_EQUAL(values[i], out[i]);
}
for (std::size_t i = 0;; ++i) {
if (i == values.size()) return;
++values[i];
if (values[i] < bound_begin[i]) break;
values[i] = 0;
}
}
}
void CheckEncodeDecode(unsigned char *bounds, unsigned char *input,
unsigned char *output, std::size_t len) {
BoundedSequenceEncoding encoder(bounds, bounds + len);
util::scoped_malloc backing(util::MallocOrThrow(encoder.EncodedLength()));
encoder.Encode(input, backing.get());
encoder.Decode(backing.get(), output);
for (std::size_t i = 0; i < len; ++i) {
BOOST_CHECK_EQUAL(input[i], output[i]);
}
}
BOOST_AUTO_TEST_CASE(Exhaustive) {
unsigned char bounds[] = {5, 2, 3, 9, 7, 20, 8};
ExhaustiveTest(bounds, bounds + sizeof(bounds) / sizeof(unsigned char));
}
BOOST_AUTO_TEST_CASE(LessThan64) {
unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 3};
unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 2};
unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0};
std::size_t len = sizeof(bounds) / sizeof(unsigned char);
assert(sizeof(input) / sizeof(unsigned char) == len);
assert(sizeof(output) / sizeof(unsigned char) == len);
CheckEncodeDecode(bounds, input, output, len);
}
BOOST_AUTO_TEST_CASE(Exactly64) {
unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 255};
unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 16};
unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0};
std::size_t len = sizeof(bounds) / sizeof(unsigned char);
assert(sizeof(input) / sizeof(unsigned char) == len);
assert(sizeof(output) / sizeof(unsigned char) == len);
CheckEncodeDecode(bounds, input, output, len);
}
BOOST_AUTO_TEST_CASE(MoreThan64) {
unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 255, 255};
unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 16, 137};
unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
std::size_t len = sizeof(bounds) / sizeof(unsigned char);
assert(sizeof(input) / sizeof(unsigned char) == len);
assert(sizeof(output) / sizeof(unsigned char) == len);
CheckEncodeDecode(bounds, input, output, len);
}
}}} // namespaces
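// A minimal standalone sketch of the idea the tests above exercise: pack a
// sequence of small integers, each bounded by a per-position maximum, into
// the fewest bits that still round-trip exactly. This is not the KenLM
// implementation (which also handles totals above 64 bits, as the
// MoreThan64 test shows); it assumes the bit widths sum to at most 64.
#include <cstddef>
#include <stdint.h>

// Bits needed to store any value in [0, bound): ceil(log2(bound)).
inline unsigned BitsFor(unsigned char bound) {
  unsigned bits = 0;
  while ((1u << bits) < bound) ++bits;
  return bits;
}

inline uint64_t PackSketch(const unsigned char *values,
                           const unsigned char *bounds, std::size_t n) {
  uint64_t packed = 0;
  unsigned shift = 0;
  for (std::size_t i = 0; i < n; ++i) {
    packed |= static_cast<uint64_t>(values[i]) << shift;
    shift += BitsFor(bounds[i]);
  }
  return packed;
}

inline void UnpackSketch(uint64_t packed, const unsigned char *bounds,
                         std::size_t n, unsigned char *out) {
  unsigned shift = 0;
  for (std::size_t i = 0; i < n; ++i) {
    unsigned bits = BitsFor(bounds[i]);
    out[i] = static_cast<unsigned char>((packed >> shift) & ((1ull << bits) - 1));
    shift += bits;
  }
}
// Round trip, mirroring the tests above:
//   unsigned char bounds[] = {5, 2, 3}, in[] = {4, 1, 2}, out[3];
//   UnpackSketch(PackSketch(in, bounds, 3), bounds, 3, out);
//   // out is now {4, 1, 2} again.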
#ifndef KENLM_INTERPOLATE_INTERPOLATE_INFO_H
#define KENLM_INTERPOLATE_INTERPOLATE_INFO_H
#include <cstddef>
#include <vector>
#include <stdint.h>
namespace lm {
namespace interpolate {
/**
* Stores relevant info for interpolating several language models, for use
* during the three-pass offline log-linear interpolation algorithm.
*/
struct InterpolateInfo {
/**
* @return the number of models being interpolated
*/
std::size_t Models() const {
return orders.size();
}
/**
* The lambda (interpolation weight) for each model.
*/
std::vector<float> lambdas;
/**
* The maximum ngram order for each model.
*/
std::vector<uint8_t> orders;
};
}
}
#endif
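// Minimal usage sketch for InterpolateInfo (hypothetical values): two
// models, a 5-gram weighted 0.7 and a 3-gram weighted 0.3:
//
//   lm::interpolate::InterpolateInfo info;
//   info.lambdas.push_back(0.7f); info.lambdas.push_back(0.3f);
//   info.orders.push_back(5);     info.orders.push_back(3);
//   // info.Models() == 2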
#include "../common/model_buffer.hh"
#include "../common/size_option.hh"
#include "pipeline.hh"
#include "tune_instances.hh"
#include "tune_weights.hh"
#include "../../util/fixed_array.hh"
#include "../../util/usage.hh"
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas" // Older gcc doesn't have "-Wunused-local-typedefs" and complains.
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#include <Eigen/Core>
#pragma GCC diagnostic pop
#include <boost/program_options.hpp>
#include <iostream>
#include <vector>
namespace {
void MungeWeightArgs(int argc, char *argv[], std::vector<const char *> &munged_args) {
// Boost program options doesn't accept -w 0.2 -0.1 because it thinks -0.1 is an
// option. There appears to be no standard way to fix this without breaking
// single-dash arguments. So here's a hack: put a -w before every number
// if it's within the scope of a weight argument.
munged_args.push_back(argv[0]);
char **inside_weights = NULL;
for (char **i = argv + 1; i < argv + argc; ++i) {
StringPiece arg(*i);
if (starts_with(arg, "-w") || starts_with(arg, "--w")) {
inside_weights = i;
} else if (inside_weights && arg.size() >= 2 && arg[0] == '-' && ((arg[1] >= '0' && arg[1] <= '9') || arg[1] == '.')) {
// If a negative number appears right after -w, don't add another -w.
// And do stay inside weights.
if (inside_weights + 1 != i) {
munged_args.push_back("-w");
}
} else if (starts_with(arg, "-")) {
inside_weights = NULL;
}
munged_args.push_back(*i);
}
}
} // namespace
int main(int argc, char *argv[]) {
try {
Eigen::initParallel();
lm::interpolate::Config pipe_config;
lm::interpolate::InstancesConfig instances_config;
std::vector<std::string> input_models;
std::string tuning_file;
namespace po = boost::program_options;
po::options_description options("Log-linear interpolation options");
options.add_options()
("help,h", po::bool_switch(), "Show this help message")
("model,m", po::value<std::vector<std::string> >(&input_models)->multitoken()->required(), "Models to interpolate, which must be in KenLM intermediate format. The intermediate format can be generated using the --intermediate argument to lmplz.")
("weight,w", po::value<std::vector<float> >(&pipe_config.lambdas)->multitoken(), "Interpolation weights")
("tuning,t", po::value<std::string>(&tuning_file), "File to tune on: a text file with one sentence per line")
("just_tune", po::bool_switch(), "Tune and print weights then quit")
("temp_prefix,T", po::value<std::string>(&pipe_config.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
("memory,S", lm::SizeOption(pipe_config.sort.total_memory, util::GuessPhysicalMemory() ? "50%" : "1G"), "Sorting memory: this is a very rough guide")
("sort_block", lm::SizeOption(pipe_config.sort.buffer_size, "64M"), "Block size");
po::variables_map vm;
std::vector<const char *> munged_args;
MungeWeightArgs(argc, argv, munged_args);
po::store(po::parse_command_line((int)munged_args.size(), &*munged_args.begin(), options), vm);
if (argc == 1 || vm["help"].as<bool>()) {
std::cerr << "Interpolate multiple models\n" << options << std::endl;
return 1;
}
po::notify(vm);
instances_config.sort = pipe_config.sort;
instances_config.model_read_chain_mem = instances_config.sort.buffer_size;
instances_config.extension_write_chain_mem = instances_config.sort.total_memory;
instances_config.lazy_memory = instances_config.sort.total_memory;
if (pipe_config.lambdas.empty() && tuning_file.empty()) {
std::cerr << "Provide a tuning file with -t xor weights with -w." << std::endl;
return 1;
}
if (!pipe_config.lambdas.empty() && !tuning_file.empty()) {
std::cerr << "Provide weights xor a tuning file, not both." << std::endl;
return 1;
}
if (!tuning_file.empty()) {
// Tune weights
std::vector<StringPiece> model_names;
for (std::vector<std::string>::const_iterator i = input_models.begin(); i != input_models.end(); ++i) {
model_names.push_back(*i);
}
lm::interpolate::TuneWeights(util::OpenReadOrThrow(tuning_file.c_str()), model_names, instances_config, pipe_config.lambdas);
std::cerr << "Final weights:";
std::ostream &to = vm["just_tune"].as<bool>() ? std::cout : std::cerr;
for (std::vector<float>::const_iterator i = pipe_config.lambdas.begin(); i != pipe_config.lambdas.end(); ++i) {
to << ' ' << *i;
}
to << std::endl;
}
if (vm["just_tune"].as<bool>()) {
return 0;
}
if (pipe_config.lambdas.size() != input_models.size()) {
std::cerr << "Number of models (" << input_models.size() << ") should match the number of weights (" << pipe_config.lambdas.size() << ")." << std::endl;
return 1;
}
util::FixedArray<lm::ModelBuffer> models(input_models.size());
for (std::size_t i = 0; i < input_models.size(); ++i) {
models.push_back(input_models[i]);
}
lm::interpolate::Pipeline(models, pipe_config, 1);
} catch (const std::exception &e) {
std::cerr << e.what() << std::endl;
return 1;
}
return 0;
}
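// Example invocations (hypothetical file and binary names). Interpolate two
// intermediate-format models with fixed weights, writing ARPA to stdout
// (the write_file argument to Pipeline above is fd 1):
//
//   interpolate -m a.im b.im -w 0.6 0.4 >interpolated.arpa
//
// or tune the weights on held-out text first:
//
//   interpolate -m a.im b.im -t tune.txt >interpolated.arpa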
#include "merge_probabilities.hh"
#include "../common/ngram_stream.hh"
#include "bounded_sequence_encoding.hh"
#include "interpolate_info.hh"
#include <algorithm>
#include <limits>
#include <numeric>
namespace lm {
namespace interpolate {
/**
* Helper to generate the BoundedSequenceEncoding used for writing the
* "from" values.
*/
BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order) {
util::FixedArray<uint8_t> max_orders(info.orders.size());
for (std::size_t i = 0; i < info.orders.size(); ++i) {
max_orders.push_back(std::min(order, info.orders[i]));
}
return BoundedSequenceEncoding(max_orders.begin(), max_orders.end());
}
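// Worked example (illustrative numbers, not taken from the code above):
// with three models of orders {5, 2, 3} and order == 3, max_orders becomes
// {3, 2, 3}, so the encoder must hold values in [0,3) x [0,2) x [0,3):
// ceil(log2(3)) + ceil(log2(2)) + ceil(log2(3)) = 2 + 1 + 2 = 5 bits,
// which EncodedLength() rounds up to whole bytes.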
namespace {
/**
* A simple wrapper class that holds information needed to read and write
* the ngrams of a particular order. This class has the memory needed to
* buffer the data needed for the recursive process of computing the
* probabilities and "from" values for each component model.
*
* "From" values indicate, for each model, what order (as an index, so -1)
* was backed off to in order to arrive at a probability. For example, if a
* 5-gram model (order index 4) backed off twice, we would write a 2.
*/
class NGramHandler {
public:
NGramHandler(uint8_t order, const InterpolateInfo &ifo,
util::FixedArray<util::stream::ChainPositions> &models_by_order)
: info(ifo),
encoder(MakeEncoder(info, order)),
out_record(order, encoder.EncodedLength()) {
std::size_t count_has_order = 0;
for (std::size_t i = 0; i < models_by_order.size(); ++i) {
count_has_order += (models_by_order[i].size() >= order);
}
inputs_.Init(count_has_order);
for (std::size_t i = 0; i < models_by_order.size(); ++i) {
if (models_by_order[i].size() < order)
continue;
inputs_.push_back(models_by_order[i][order - 1]);
if (inputs_.back()) {
active_.resize(active_.size() + 1);
active_.back().model = i;
active_.back().stream = &inputs_.back();
}
}
// have to Init() these outside the initializer list since NGramStreams
// doesn't forward a ChainPositions to the GenericStreams constructor
probs.Init(info.Models());
from.Init(info.Models());
for (std::size_t i = 0; i < info.Models(); ++i) {
probs.push_back(0.0);
from.push_back(0);
}
}
struct StreamIndex {
NGramStream<ProbBackoff> *stream;
NGramStream<ProbBackoff> &Stream() { return *stream; }
std::size_t model;
};
std::size_t ActiveSize() const {
return active_.size();
}
/**
* @return the input stream for a particular model that corresponds to
* this ngram order
*/
StreamIndex &operator[](std::size_t idx) {
return active_[idx];
}
void erase(std::size_t idx) {
active_.erase(active_.begin() + idx);
}
const InterpolateInfo &info;
BoundedSequenceEncoding encoder;
PartialProbGamma out_record;
util::FixedArray<float> probs;
util::FixedArray<uint8_t> from;
private:
std::vector<StreamIndex> active_;
NGramStreams<ProbBackoff> inputs_;
};
/**
* A collection of NGramHandlers.
*/
class NGramHandlers : public util::FixedArray<NGramHandler> {
public:
explicit NGramHandlers(std::size_t num)
: util::FixedArray<NGramHandler>(num) {
}
void push_back(
std::size_t order, const InterpolateInfo &info,
util::FixedArray<util::stream::ChainPositions> &models_by_order) {
new (end()) NGramHandler(order, info, models_by_order);
Constructed();
}
};
/**
* The recursive helper function that computes probability and "from"
* values for all ngrams matching a particular suffix.
*
* The current order can be computed as the suffix length + 1. Note that
* the suffix could be empty (suffix_begin == suffix_end == NULL), in which
* case we are handling unigrams with the UNK token as the fallback
* probability.
*
* @param handlers The full collection of handlers
* @param suffix_begin A start iterator for the suffix
* @param suffix_end An end iterator for the suffix
* @param fallback_probs The probabilities of this ngram if we need to
* back off (that is, the probability of the suffix)
* @param fallback_from The order that the corresponding fallback
* probability in the fallback_probs is from
* @param combined_fallback interpolated fallback_probs
* @param outputs The output streams, one for each order
*/
void HandleSuffix(NGramHandlers &handlers, WordIndex *suffix_begin,
WordIndex *suffix_end,
const util::FixedArray<float> &fallback_probs,
const util::FixedArray<uint8_t> &fallback_from,
float combined_fallback,
util::stream::Streams &outputs) {
uint8_t order = std::distance(suffix_begin, suffix_end) + 1;
if (order > outputs.size()) return;
util::stream::Stream &output = outputs[order - 1];
NGramHandler &handler = handlers[order - 1];
while (true) {
// find the next smallest ngram which matches our suffix
// TODO: priority queue driven.
WordIndex *minimum = NULL;
for (std::size_t i = 0; i < handler.ActiveSize(); ++i) {
if (!std::equal(suffix_begin, suffix_end, handler[i].Stream()->begin() + 1))
continue;
// if we either haven't set a minimum yet or this one is smaller than
// the minimum we found before, replace it
WordIndex *last = handler[i].Stream()->begin();
if (!minimum || *last < *minimum) { minimum = handler[i].Stream()->begin(); }
}
// no more ngrams of this order match our suffix, so we're done
if (!minimum) return;
handler.out_record.ReBase(output.Get());
std::copy(minimum, minimum + order, handler.out_record.begin());
// Default case is having backed off.
std::copy(fallback_probs.begin(), fallback_probs.end(), handler.probs.begin());
std::copy(fallback_from.begin(), fallback_from.end(), handler.from.begin());
for (std::size_t i = 0; i < handler.ActiveSize();) {
if (std::equal(handler.out_record.begin(), handler.out_record.end(),
handler[i].Stream()->begin())) {
handler.probs[handler[i].model] = handler.info.lambdas[handler[i].model] * handler[i].Stream()->Value().prob;
handler.from[handler[i].model] = order - 1;
if (++handler[i].Stream()) {
++i;
} else {
handler.erase(i);
}
} else {
++i;
}
}
handler.out_record.Prob() = std::accumulate(handler.probs.begin(), handler.probs.end(), 0.0);
handler.out_record.LowerProb() = combined_fallback;
handler.encoder.Encode(handler.from.begin(),
handler.out_record.FromBegin());
// we've handled this particular ngram, so now recurse to the higher
// order using the current ngram as the suffix
HandleSuffix(handlers, handler.out_record.begin(), handler.out_record.end(),
handler.probs, handler.from, handler.out_record.Prob(), outputs);
// consume the output
++output;
}
}
/**
* Kicks off the recursion for computing the probabilities and "from"
* values for each ngram order. We begin by handling the UNK token that
* should be at the front of each of the unigram input streams. This is
* then output to the stream and used as the fallback for handling our
* unigram case; each unigram in turn serves as the fallback for the bigram case,
* etc.
*/
void HandleNGrams(NGramHandlers &handlers, util::stream::Streams &outputs) {
PartialProbGamma unk_record(1, 0);
// First: populate the unk probabilities by reading the first unigram
// from each stream
util::FixedArray<float> unk_probs(handlers[0].info.Models());
// start by populating the ngram id from the first stream
lm::NGram<ProbBackoff> ngram = *handlers[0][0].Stream();
unk_record.ReBase(outputs[0].Get());
std::copy(ngram.begin(), ngram.end(), unk_record.begin());
unk_record.Prob() = 0;
// then populate the probabilities into unk_probs while "multiplying" the
// model probabilities together into the unk record
//
// note that from doesn't need to be set for unigrams
assert(handlers[0].ActiveSize() == handlers[0].info.Models());
for (std::size_t i = 0; i < handlers[0].info.Models();) {
ngram = *handlers[0][i].Stream();
unk_probs.push_back(handlers[0].info.lambdas[i] * ngram.Value().prob);
unk_record.Prob() += unk_probs[i];
assert(*ngram.begin() == kUNK);
if (++handlers[0][i].Stream()) {
++i;
} else {
handlers[0].erase(i);
}
}
float unk_combined = unk_record.Prob();
unk_record.LowerProb() = unk_combined;
// flush the unk output record
++outputs[0];
// Then, begin outputting everything in lexicographic order: first we'll
// get the unigram then the first bigram with that context, then the
// first trigram with that bigram context, etc., until we exhaust all of
// the ngrams, then all of the (n-1)grams, etc.
//
// This function is the "root" of this recursive process.
util::FixedArray<uint8_t> unk_from(handlers[0].info.Models());
for (std::size_t i = 0; i < handlers[0].info.Models(); ++i) {
unk_from.push_back(0);
}
// the two nulls are to encode that our "fallback" word is the "0-gram"
// case, i.e. we "backed off" to UNK
// TODO: stop generating vocab ids and LowerProb for unigrams.
HandleSuffix(handlers, NULL, NULL, unk_probs, unk_from, unk_combined, outputs);
// Verify we reached the end. And poison!
for (std::size_t i = 0; i < handlers.size(); ++i) {
UTIL_THROW_IF2(handlers[i].ActiveSize(),
"MergeProbabilities did not exhaust all ngram streams");
outputs[i].Poison();
}
}
} // namespace
void MergeProbabilities::Run(const util::stream::ChainPositions &output_pos) {
NGramHandlers handlers(output_pos.size());
for (std::size_t i = 0; i < output_pos.size(); ++i) {
handlers.push_back(i + 1, info_, models_by_order_);
}
util::stream::Streams outputs(output_pos);
HandleNGrams(handlers, outputs);
}
}} // namespaces
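// A standalone sketch (plain C++, not the streaming machinery above) of the
// per-ngram arithmetic in HandleSuffix: every model's slot starts at its
// weighted fallback (suffix) value, models that actually contain the ngram
// overwrite their slot with lambda_m * log10 p_m, and the record's
// probability is the sum over slots. Backoff penalties are charged later,
// in the normalize pass.
#include <cstddef>
#include <vector>

inline double MergeOneNGram(
    const std::vector<float> &lambdas,     // interpolation weights
    const std::vector<float> &fallback,    // lambda_m * log10 p_m(suffix)
    const std::vector<bool> &has_ngram,    // does model m contain the ngram?
    const std::vector<float> &model_logp,  // log10 p_m(ngram) where present
    std::vector<float> &probs) {           // out: per-model contributions
  probs = fallback;                        // default: the model backed off
  double total = 0.0;
  for (std::size_t m = 0; m < lambdas.size(); ++m) {
    if (has_ngram[m]) probs[m] = lambdas[m] * model_logp[m];
    total += probs[m];
  }
  return total;                            // becomes the record's Prob()
}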
#ifndef LM_INTERPOLATE_MERGE_PROBABILITIES_H
#define LM_INTERPOLATE_MERGE_PROBABILITIES_H
#include "../common/ngram.hh"
#include "bounded_sequence_encoding.hh"
#include "../../util/fixed_array.hh"
#include "../../util/stream/multi_stream.hh"
#include <stdint.h>
namespace lm {
namespace interpolate {
struct InterpolateInfo;
/**
* Make the encoding of backoff values for a given order. This stores values
* in [PartialProbGamma::FromBegin(), PartialProbGamma::FromEnd())
*/
BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order);
/**
* The first pass for the offline log-linear interpolation algorithm. This
* reads K **suffix-ordered** streams for each model, for each order, of
* ngram records (ngram-id, prob, backoff). It further assumes that the
* ngram-ids have been unified over all of the stream inputs.
*
* Its output is records of (ngram-id, prob-prod, backoff-level,
* backoff-level, ...) where the backoff-levels (of which there are K) are
* the context length (0 for unigrams) that the corresponding model had to
* back off to in order to obtain a probability for that ngram-id. Each of
* these streams is terminated with a record whose ngram-id is all
* maximum-integers for simplicity in implementation here.
*
* @param models_by_order An array of length N (N = max_i N_i) containing
* the ChainPositions for the streams for order (i + 1).
* Run() is attached to the output chains, one for each order.
*/
class MergeProbabilities {
public:
MergeProbabilities(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order)
: info_(info), models_by_order_(models_by_order) {}
void Run(const util::stream::ChainPositions &outputs);
private:
const InterpolateInfo &info_;
util::FixedArray<util::stream::ChainPositions> &models_by_order_;
};
/**
* This class represents the output payload for this pass, which consists
* of an ngram-id, a probability, and then a vector of orders from which
* each of the component models backed off to for this ngram, encoded
* using the BoundedSequenceEncoding class.
*/
class PartialProbGamma : public lm::NGramHeader {
public:
PartialProbGamma(std::size_t order, std::size_t backoff_bytes)
: lm::NGramHeader(NULL, order), backoff_bytes_(backoff_bytes) {
// nothing
}
std::size_t TotalSize() const {
return sizeof(WordIndex) * Order() + sizeof(After) + backoff_bytes_;
}
// TODO: cache bounded sequence encoding in the pipeline?
static std::size_t TotalSize(const InterpolateInfo &info, uint8_t order) {
return sizeof(WordIndex) * order + sizeof(After) + MakeEncoder(info, order).EncodedLength();
}
float &Prob() { return Pay().prob; }
float Prob() const { return Pay().prob; }
float &LowerProb() { return Pay().lower_prob; }
float LowerProb() const { return Pay().lower_prob; }
const uint8_t *FromBegin() const { return Pay().from; }
uint8_t *FromBegin() { return Pay().from; }
private:
struct After {
// Note that backoff_and_normalize assumes this comes first.
float prob;
float lower_prob;
uint8_t from[];
};
const After &Pay() const { return *reinterpret_cast<const After *>(end()); }
After &Pay() { return *reinterpret_cast<After*>(end()); }
std::size_t backoff_bytes_;
};
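// Record layout implied by the accessors above, for an order-n ngram and K
// models (total size == TotalSize()):
//
//   WordIndex[n]  ngram ids    (NGramHeader's begin() .. end())
//   float         prob         (sum over models of lambda_m * log10 p_m)
//   float         lower_prob   (the same quantity for the (n-1)-gram suffix)
//   uint8_t[]     from         (BoundedSequenceEncoding of the K backoff
//                               levels; backoff_bytes_ bytes)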
}} // namespaces
#endif // LM_INTERPOLATE_MERGE_PROBABILITIES_H
#include "merge_vocab.hh"
#include "../enumerate_vocab.hh"
#include "universal_vocab.hh"
#include "../lm_exception.hh"
#include "../vocab.hh"
#include "../../util/file_piece.hh"
#include <queue>
#include <string>
#include <iostream>
#include <vector>
namespace lm {
namespace interpolate {
namespace {
class VocabFileReader {
public:
explicit VocabFileReader(const int fd, size_t model_num, uint64_t offset = 0);
VocabFileReader &operator++();
operator bool() const { return !eof_; }
uint64_t operator*() const { return Value(); }
uint64_t Value() const { return hash_value_; }
size_t ModelNum() const { return model_num_; }
WordIndex CurrentIndex() const { return current_index_; }
StringPiece Word() const { return word_; }
private:
uint64_t hash_value_;
WordIndex current_index_;
bool eof_;
size_t model_num_;
StringPiece word_;
util::FilePiece file_piece_;
};
VocabFileReader::VocabFileReader(const int fd, const size_t model_num, uint64_t offset) :
hash_value_(0),
current_index_(0),
eof_(false),
model_num_(model_num),
file_piece_(util::DupOrThrow(fd)) {
word_ = file_piece_.ReadLine('\0');
UTIL_THROW_IF(word_ != "<unk>",
FormatLoadException,
"Vocabulary words are in the wrong place.");
// setup to initial value
++*this;
}
VocabFileReader &VocabFileReader::operator++() {
try {
word_ = file_piece_.ReadLine('\0');
} catch(util::EndOfFileException &e) {
eof_ = true;
return *this;
}
uint64_t prev_hash_value = hash_value_;
hash_value_ = ngram::detail::HashForVocab(word_.data(), word_.size());
// hash values should be monotonically increasing
UTIL_THROW_IF(hash_value_ < prev_hash_value, FormatLoadException,
": word index not monotonically increasing."
<< " model_num: " << model_num_
<< " prev hash: " << prev_hash_value
<< " new hash: " << hash_value_);
++current_index_;
return *this;
}
class CompareFiles {
public:
bool operator()(const VocabFileReader* x,
const VocabFileReader* y)
{ return x->Value() > y->Value(); }
};
class Readers : public util::FixedArray<VocabFileReader> {
public:
Readers(std::size_t number) : util::FixedArray<VocabFileReader>(number) {}
void push_back(int fd, std::size_t i) {
new(end()) VocabFileReader(fd, i);
Constructed();
}
};
} // namespace
WordIndex MergeVocab(util::FixedArray<int> &files, UniversalVocab &vocab, EnumerateVocab &enumerate) {
typedef std::priority_queue<VocabFileReader*, std::vector<VocabFileReader*>, CompareFiles> HeapType;
HeapType heap;
Readers readers(files.size());
for (size_t i = 0; i < files.size(); ++i) {
readers.push_back(files[i], i);
heap.push(&readers.back());
// initialize first index to 0 for <unk>
vocab.InsertUniversalIdx(i, 0, 0);
}
uint64_t prev_hash_value = 0;
// global_index starts with <unk> which is 0
WordIndex global_index = 0;
enumerate.Add(0, "<unk>");
while (!heap.empty()) {
VocabFileReader* top_vocab_file = heap.top();
if (top_vocab_file->Value() != prev_hash_value) {
enumerate.Add(++global_index, top_vocab_file->Word());
}
vocab.InsertUniversalIdx(top_vocab_file->ModelNum(),
top_vocab_file->CurrentIndex(),
global_index);
prev_hash_value = top_vocab_file->Value();
heap.pop();
if (++(*top_vocab_file)) {
heap.push(top_vocab_file);
}
}
return global_index + 1;
}
} // namespace interpolate
} // namespace lm
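// A standalone sketch (plain C++) of the k-way merge MergeVocab performs
// above: every reader yields word hashes in increasing order, a min-heap
// exposes the globally smallest hash, and runs of equal hashes collapse
// onto a single universal index. Here each "model" is just a sorted vector
// of hashes.
#include <cstddef>
#include <functional>
#include <queue>
#include <stdint.h>
#include <utility>
#include <vector>

inline std::size_t CountMergedVocab(
    const std::vector<std::vector<uint64_t> > &models) {
  // (hash, (model, offset)) entries, smallest hash on top.
  typedef std::pair<uint64_t, std::pair<std::size_t, std::size_t> > Entry;
  std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry> > heap;
  for (std::size_t m = 0; m < models.size(); ++m)
    if (!models[m].empty())
      heap.push(Entry(models[m][0], std::make_pair(m, std::size_t(0))));
  std::size_t distinct = 0;
  bool first = true;
  uint64_t prev = 0;
  while (!heap.empty()) {
    Entry top = heap.top();
    heap.pop();
    if (first || top.first != prev) {
      ++distinct;  // a new word: MergeVocab would ++global_index here
      first = false;
      prev = top.first;
    }
    std::size_t m = top.second.first, next = top.second.second + 1;
    if (next < models[m].size())
      heap.push(Entry(models[m][next], std::make_pair(m, next)));
  }
  return distinct;  // analogous to returning global_index + 1 above
}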
#ifndef LM_INTERPOLATE_MERGE_VOCAB_H
#define LM_INTERPOLATE_MERGE_VOCAB_H
#include "../word_index.hh"
#include "../../util/file.hh"
#include "../../util/fixed_array.hh"
namespace lm {
class EnumerateVocab;
namespace interpolate {
class UniversalVocab;
// The combined vocabulary is enumerated with enumerate.
// Returns the size of the combined vocabulary.
// Does not take ownership of vocab_files.
WordIndex MergeVocab(util::FixedArray<int> &vocab_files, UniversalVocab &vocab, EnumerateVocab &enumerate);
}} // namespaces
#endif // LM_INTERPOLATE_MERGE_VOCAB_H
#define BOOST_TEST_MODULE InterpolateMergeVocabTest
#include <boost/test/unit_test.hpp>
#include "../enumerate_vocab.hh"
#include "merge_vocab.hh"
#include "universal_vocab.hh"
#include "../lm_exception.hh"
#include "../vocab.hh"
#include "../word_index.hh"
#include "../../util/file.hh"
#include "../../util/file_piece.hh"
#include "../../util/file_stream.hh"
#include "../../util/tokenize_piece.hh"
#include <algorithm>
#include <cstring>
#include <vector>
namespace lm {
namespace interpolate {
namespace {
struct VocabEntry {
explicit VocabEntry(StringPiece value) :
str(value), hash(util::MurmurHash64A(value.data(), value.size())) {}
StringPiece str;
uint64_t hash;
bool operator<(const VocabEntry &other) const {
return hash < other.hash;
}
};
int WriteVocabFile(const std::vector<VocabEntry> &vocab, util::scoped_fd &file) {
file.reset(util::MakeTemp(util::DefaultTempDirectory()));
{
util::FileStream out(file.get(), 128);
for (std::vector<VocabEntry>::const_iterator i = vocab.begin(); i != vocab.end(); ++i) {
out << i->str << '\0';
}
}
util::SeekOrThrow(file.get(), 0);
return file.get();
}
std::vector<VocabEntry> ParseVocab(StringPiece words) {
std::vector<VocabEntry> entries;
entries.push_back(VocabEntry("<unk>"));
for (util::TokenIter<util::SingleCharacter> i(words, '\t'); i; ++i) {
entries.push_back(VocabEntry(*i));
}
std::sort(entries.begin() + 1, entries.end());
return entries;
}
int WriteVocabFile(StringPiece words, util::scoped_fd &file) {
return WriteVocabFile(ParseVocab(words), file);
}
class TestFiles {
public:
TestFiles() {}
int Test0() {
return WriteVocabFile("this\tis\ta\tfirst\tcut", test[0]);
}
int Test1() {
return WriteVocabFile("is this\tthis a\tfirst cut\ta first", test[1]);
}
int Test2() {
return WriteVocabFile("is\tsecd\ti", test[2]);
}
int NoUNK() {
std::vector<VocabEntry> no_unk_vec;
no_unk_vec.push_back(VocabEntry("toto"));
return WriteVocabFile(no_unk_vec, no_unk);
}
int BadOrder() {
std::vector<VocabEntry> bad_order_vec;
bad_order_vec.push_back(VocabEntry("<unk>"));
bad_order_vec.push_back(VocabEntry("0"));
bad_order_vec.push_back(VocabEntry("1"));
bad_order_vec.push_back(VocabEntry("2"));
bad_order_vec.push_back(VocabEntry("a"));
return WriteVocabFile(bad_order_vec, bad_order);
}
private:
util::scoped_fd test[3], no_unk, bad_order;
};
class DoNothingEnumerate : public EnumerateVocab {
public:
void Add(WordIndex, const StringPiece &) {}
};
BOOST_AUTO_TEST_CASE(MergeVocabTest) {
TestFiles files;
util::FixedArray<int> used_files(3);
used_files.push_back(files.Test0());
used_files.push_back(files.Test1());
used_files.push_back(files.Test2());
std::vector<lm::WordIndex> model_max_idx;
model_max_idx.push_back(10);
model_max_idx.push_back(10);
model_max_idx.push_back(10);
util::scoped_fd combined(util::MakeTemp(util::DefaultTempDirectory()));
UniversalVocab universal_vocab(model_max_idx);
{
ngram::ImmediateWriteWordsWrapper writer(NULL, combined.get(), 0);
MergeVocab(used_files, universal_vocab, writer);
}
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 0), 0);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 0), 0);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 0), 0);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 1), 1);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 1), 2);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 1), 8);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 5), 11);
#if BYTE_ORDER == LITTLE_ENDIAN
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 4);
#elif BYTE_ORDER == BIG_ENDIAN
// MurmurHash has a different ordering of the vocabulary.
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 5);
#endif
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 3), 10);
util::SeekOrThrow(combined.get(), 0);
util::FilePiece f(combined.release());
std::vector<VocabEntry> expected = ParseVocab("a\tis this\tthis a\tfirst cut\tthis\ta first\tcut\tis\ti\tsecd\tfirst");
for (std::vector<VocabEntry>::const_iterator i = expected.begin(); i != expected.end(); ++i) {
BOOST_CHECK_EQUAL(i->str, f.ReadLine('\0'));
}
BOOST_CHECK_THROW(f.ReadLine('\0'), util::EndOfFileException);
}
BOOST_AUTO_TEST_CASE(MergeVocabNoUnkTest) {
TestFiles files;
util::FixedArray<int> used_files(1);
used_files.push_back(files.NoUNK());
std::vector<lm::WordIndex> model_max_idx;
model_max_idx.push_back(10);
UniversalVocab universal_vocab(model_max_idx);
DoNothingEnumerate nothing;
BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException);
}
BOOST_AUTO_TEST_CASE(MergeVocabWrongOrderTest) {
TestFiles files;
util::FixedArray<int> used_files(2);
used_files.push_back(files.Test0());
used_files.push_back(files.BadOrder());
std::vector<lm::WordIndex> model_max_idx;
model_max_idx.push_back(10);
model_max_idx.push_back(10);
lm::interpolate::UniversalVocab universal_vocab(model_max_idx);
DoNothingEnumerate nothing;
BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException);
}
}}} // namespaces
#include "normalize.hh"
#include "../common/compare.hh"
#include "../common/ngram_stream.hh"
#include "backoff_matrix.hh"
#include "bounded_sequence_encoding.hh"
#include "interpolate_info.hh"
#include "merge_probabilities.hh"
#include "../weights.hh"
#include "../word_index.hh"
#include "../../util/fixed_array.hh"
#include "../../util/scoped.hh"
#include "../../util/stream/stream.hh"
#include "../../util/stream/rewindable_stream.hh"
#include <functional>
#include <queue>
#include <vector>
namespace lm { namespace interpolate {
namespace {
class BackoffQueueEntry {
public:
BackoffQueueEntry(float &entry, const util::stream::ChainPosition &position)
: entry_(entry), stream_(position) {
entry_ = 0.0;
}
operator bool() const { return stream_; }
NGramHeader operator*() const { return *stream_; }
const NGramHeader *operator->() const { return &*stream_; }
void Enter() {
entry_ = stream_->Value().backoff;
}
BackoffQueueEntry &Next() {
entry_ = 0.0;
++stream_;
return *this;
}
private:
float &entry_;
NGramStream<ProbBackoff> stream_;
};
struct PtrGreater : public std::binary_function<const BackoffQueueEntry *, const BackoffQueueEntry *, bool> {
bool operator()(const BackoffQueueEntry *first, const BackoffQueueEntry *second) const {
return SuffixLexicographicLess<NGramHeader>()(**second, **first);
}
};
class EntryOwner : public util::FixedArray<BackoffQueueEntry> {
public:
void push_back(float &entry, const util::stream::ChainPosition &position) {
new (end()) BackoffQueueEntry(entry, position);
Constructed();
}
};
std::size_t MaxOrder(const util::FixedArray<util::stream::ChainPositions> &model) {
std::size_t ret = 0;
for (const util::stream::ChainPositions *m = model.begin(); m != model.end(); ++m) {
ret = std::max(ret, m->size());
}
return ret;
}
class BackoffManager {
public:
explicit BackoffManager(const util::FixedArray<util::stream::ChainPositions> &models)
: entered_(MaxOrder(models)), matrix_(models.size(), MaxOrder(models)), skip_write_(MaxOrder(models)) {
std::size_t total = 0;
for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) {
total += m->size();
}
for (std::size_t i = 0; i < MaxOrder(models); ++i) {
entered_.push_back(models.size());
}
owner_.Init(total);
for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) {
for (const util::stream::ChainPosition *j = m->begin(); j != m->end(); ++j) {
owner_.push_back(matrix_.Backoff(m - models.begin(), j - m->begin()), *j);
if (owner_.back()) {
queue_.push(&owner_.back());
}
}
}
}
void SetupSkip(std::size_t order, util::stream::Stream &stream) {
skip_write_[order - 2] = &stream;
}
// Move up the backoffs for the given n-gram. The n-grams must be provided
// in suffix lexicographic order.
void Enter(const NGramHeader &to) {
// Check that we exited properly.
for (std::size_t i = to.Order() - 1; i < entered_.size(); ++i) {
assert(entered_[i].empty());
}
SuffixLexicographicLess<NGramHeader> less;
while (!queue_.empty() && less(**queue_.top(), to))
SkipRecord();
while (TopMatches(to)) {
BackoffQueueEntry *matches = queue_.top();
entered_[to.Order() - 1].push_back(matches);
matches->Enter();
queue_.pop();
}
}
void Exit(std::size_t order_minus_1) {
for (BackoffQueueEntry **i = entered_[order_minus_1].begin(); i != entered_[order_minus_1].end(); ++i) {
if ((*i)->Next())
queue_.push(*i);
}
entered_[order_minus_1].clear();
}
float Get(std::size_t model, std::size_t order_minus_1) const {
return matrix_.Backoff(model, order_minus_1);
}
void Finish() {
while (!queue_.empty())
SkipRecord();
}
private:
void SkipRecord() {
BackoffQueueEntry *top = queue_.top();
queue_.pop();
// Is this the last instance of the n-gram?
if (!TopMatches(**top)) {
// An n-gram is being skipped. Called once per skipped n-gram,
// regardless of how many models it comes from.
*reinterpret_cast<float*>(skip_write_[(*top)->Order() - 1]->Get()) = 0.0;
++*skip_write_[(*top)->Order() - 1];
}
if (top->Next())
queue_.push(top);
}
bool TopMatches(const NGramHeader &header) const {
return !queue_.empty() && (*queue_.top())->Order() == header.Order() && std::equal(header.begin(), header.end(), (*queue_.top())->begin());
}
EntryOwner owner_;
std::priority_queue<BackoffQueueEntry*, std::vector<BackoffQueueEntry*>, PtrGreater> queue_;
// Indexed by order then just all the matching models.
util::FixedArray<util::FixedArray<BackoffQueueEntry*> > entered_;
BackoffMatrix matrix_;
std::vector<util::stream::Stream*> skip_write_;
};
typedef long double Accum;
// Handles n-grams of the same order, using recursion to call another instance
// for higher orders.
class Recurse {
public:
Recurse(
const InterpolateInfo &info, // Must stay alive the entire time.
std::size_t order,
const util::stream::ChainPosition &merged_probs,
const util::stream::ChainPosition &prob_out,
const util::stream::ChainPosition &backoff_out,
BackoffManager &backoffs,
Recurse *higher) // higher is null for the highest order.
: order_(order),
encoding_(MakeEncoder(info, order)),
input_(merged_probs, PartialProbGamma(order, encoding_.EncodedLength())),
prob_out_(prob_out),
backoff_out_(backoff_out),
backoffs_(backoffs),
lambdas_(&*info.lambdas.begin()),
higher_(higher),
decoded_backoffs_(info.Models()),
extended_context_(order - 1) {
// This is only for bigrams and above. Summing unigrams is a much easier case.
assert(order >= 2);
}
// context = w_1^{n-1}
// z_lower = Z(w_2^{n-1})
// Input:
// Merged probabilities without backoff applied in input_.
// Backoffs via backoffs_.
// Calculates:
// Z(w_1^{n-1}): intermediate only.
// p_I(x | w_1^{n-1}) for all x: w_1^{n-1}x exists: Written to prob_out_.
// b_I(w_1^{n-1}): Written to backoff_out_.
void SameContext(const NGramHeader &context, Accum z_lower) {
assert(context.size() == order_ - 1);
backoffs_.Enter(context);
prob_out_.Mark();
// This is the backoff term that applies when one assumes everything backs off:
// \prod_i b_i(w_1^{n-1})^{\lambda_i}.
Accum backoff_once = 0.0;
for (std::size_t m = 0; m < decoded_backoffs_.size(); ++m) {
backoff_once += lambdas_[m] * backoffs_.Get(m, order_ - 2);
}
Accum z_delta = 0.0;
std::size_t count = 0;
for (; input_ && std::equal(context.begin(), context.end(), input_->begin()); ++input_, ++prob_out_, ++count) {
// Apply backoffs to probabilities.
// TODO: change bounded sequence encoding to have an iterator for decoding instead of doing a copy here.
encoding_.Decode(input_->FromBegin(), &*decoded_backoffs_.begin());
for (std::size_t m = 0; m < NumModels(); ++m) {
// Apply the backoffs as instructed for model m.
float accumulated = 0.0;
// Change backoffs for [order it backed off to, order - 1) except
// with 0-indexing. There is still the potential to charge backoff
// for order - 1, which is done later. The backoffs charged here
// are b_m(w_{n-1}^{n-1}) ... b_m(w_2^{n-1})
for (unsigned char backed_to = decoded_backoffs_[m]; backed_to < order_ - 2; ++backed_to) {
accumulated += backoffs_.Get(m, backed_to);
}
float lambda = lambdas_[m];
// Lower p(x | w_2^{n-1}) gets all the backoffs except the highest.
input_->LowerProb() += accumulated * lambda;
// Charge the backoff b(w_1^{n-1}) if applicable, but only to attain p(x | w_1^{n-1})
if (decoded_backoffs_[m] < order_ - 1) {
accumulated += backoffs_.Get(m, order_ - 2);
}
input_->Prob() += accumulated * lambda;
}
// TODO: better precision/less operations here.
z_delta += pow(10.0, input_->Prob()) - pow(10.0, input_->LowerProb() + backoff_once);
// Write unnormalized probability record.
std::copy(input_->begin(), input_->end(), reinterpret_cast<WordIndex*>(prob_out_.Get()));
ProbWrite() = input_->Prob();
}
// TODO numerical precision.
Accum z = log10(pow(10.0, z_lower + backoff_once) + z_delta);
// Normalize.
prob_out_.Rewind();
for (std::size_t i = 0; i < count; ++i, ++prob_out_) {
ProbWrite() -= z;
}
// This allows the stream to release data.
prob_out_.Mark();
// Output backoff.
*reinterpret_cast<float*>(backoff_out_.Get()) = z_lower + backoff_once - z;
++backoff_out_;
if (higher_.get())
higher_->ExtendContext(context, z);
backoffs_.Exit(order_ - 2);
}
// Call is given a context and z(context).
// Evaluates y context x for all y,x.
void ExtendContext(const NGramHeader &middle, Accum z_lower) {
assert(middle.size() == order_ - 2);
// Copy because the input will advance. TODO avoid this copy by sharing amongst classes.
std::copy(middle.begin(), middle.end(), extended_context_.begin() + 1);
while (input_ && std::equal(middle.begin(), middle.end(), input_->begin() + 1)) {
*extended_context_.begin() = *input_->begin();
SameContext(NGramHeader(&*extended_context_.begin(), order_ - 1), z_lower);
}
}
void Finish() {
assert(!input_);
prob_out_.Poison();
backoff_out_.Poison();
if (higher_.get())
higher_->Finish();
}
// The BackoffManager class also injects backoffs when it skips ahead e.g. b(</s>) = 1
util::stream::Stream &BackoffStream() { return backoff_out_; }
private:
// Write the probability to the correct place in prob_out_. Should use a proxy but currently incompatible with RewindableStream.
float &ProbWrite() {
return *reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(prob_out_.Get()) + order_ * sizeof(WordIndex));
}
std::size_t NumModels() const { return decoded_backoffs_.size(); }
const std::size_t order_;
const BoundedSequenceEncoding encoding_;
ProxyStream<PartialProbGamma> input_;
util::stream::RewindableStream prob_out_;
util::stream::Stream backoff_out_;
BackoffManager &backoffs_;
const float *const lambdas_;
// Higher order instance of this same class.
util::scoped_ptr<Recurse> higher_;
// Temporary in SameContext.
std::vector<unsigned char> decoded_backoffs_;
// Temporary in ExtendContext.
std::vector<WordIndex> extended_context_;
};
class Thread {
public:
Thread(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order, util::stream::Chains &prob_out, util::stream::Chains &backoff_out)
: info_(info), models_by_order_(models_by_order), prob_out_(prob_out), backoff_out_(backoff_out) {}
void Run(const util::stream::ChainPositions &merged_probabilities) {
// Unigrams do not have encoded backoff info.
ProxyStream<PartialProbGamma> in(merged_probabilities[0], PartialProbGamma(1, 0));
util::stream::RewindableStream prob_write(prob_out_[0]);
Accum z = 0.0;
prob_write.Mark();
WordIndex count = 0;
for (; in; ++in, ++prob_write, ++count) {
// Note assumption that probability comes first
memcpy(prob_write.Get(), in.Get(), sizeof(WordIndex) + sizeof(float));
z += pow(10.0, in->Prob());
}
// TODO HACK TODO: lmplz outputs p(<s>) = 1 to get q to compute nicely. That always inflates z by exactly 1.0, so subtract it.
z -= 1.0;
float log_z = log10(z);
prob_write.Rewind();
// Normalize unigram probabilities.
for (WordIndex i = 0; i < count; ++i, ++prob_write) {
*reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(prob_write.Get()) + sizeof(WordIndex)) -= log_z;
}
prob_write.Poison();
// Now setup the higher orders.
util::scoped_ptr<Recurse> higher_order;
BackoffManager backoffs(models_by_order_);
std::size_t max_order = merged_probabilities.size();
for (std::size_t order = max_order; order >= 2; --order) {
higher_order.reset(new Recurse(info_, order, merged_probabilities[order - 1], prob_out_[order - 1], backoff_out_[order - 2], backoffs, higher_order.release()));
backoffs.SetupSkip(order, higher_order->BackoffStream());
}
if (max_order > 1) {
higher_order->ExtendContext(NGramHeader(NULL, 0), log_z);
backoffs.Finish();
higher_order->Finish();
}
}
private:
const InterpolateInfo info_;
util::FixedArray<util::stream::ChainPositions> &models_by_order_;
util::stream::ChainPositions prob_out_;
util::stream::ChainPositions backoff_out_;
};
} // namespace
void Normalize(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order, util::stream::Chains &merged_probabilities, util::stream::Chains &prob_out, util::stream::Chains &backoff_out) {
assert(prob_out.size() == backoff_out.size() + 1);
// Arbitrarily put the thread on the merged_probabilities Chains.
merged_probabilities >> Thread(info, models_by_order, prob_out, backoff_out);
}
}} // namespaces
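// A standalone sketch (plain C++) of the normalization arithmetic in
// Thread::Run above: values arrive as unnormalized log10 probabilities, Z
// accumulates in linear space, and a second (rewind) pass subtracts
// log10(Z). The real code additionally subtracts the lmplz p(<s>) = 1
// placeholder from Z before taking the log.
#include <cmath>
#include <cstddef>
#include <vector>

inline void NormalizeLog10(std::vector<float> &logp) {
  long double z = 0.0;  // same width as the Accum typedef above
  for (std::size_t i = 0; i < logp.size(); ++i)
    z += std::pow(10.0L, static_cast<long double>(logp[i]));
  const float log_z = static_cast<float>(std::log10(z));
  for (std::size_t i = 0; i < logp.size(); ++i)
    logp[i] -= log_z;  // afterwards the entries sum to 1 in linear space
}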
#ifndef LM_INTERPOLATE_NORMALIZE_H
#define LM_INTERPOLATE_NORMALIZE_H
#include "../../util/fixed_array.hh"
/* Pass 2:
* - Multiply backoff weights by the backed off probabilities from pass 1.
* - Compute the normalization factor Z.
* - Send Z to the next highest order.
* - Rewind and divide by Z.
*/
namespace util { namespace stream {
class ChainPositions;
class Chains;
}} // namespaces
namespace lm { namespace interpolate {
struct InterpolateInfo;
void Normalize(
const InterpolateInfo &info,
// Input full models for backoffs. Assumes that renumbering has been done. Suffix order.
util::FixedArray<util::stream::ChainPositions> &models_by_order,
// Input PartialProbGamma from MergeProbabilities. Context order.
util::stream::Chains &merged_probabilities,
// Output NGram<float> with normalized probabilities. Context order.
util::stream::Chains &probabilities_out,
// Output bare floats with backoffs. Note backoffs.size() == order - 1. Suffix order.
util::stream::Chains &backoffs_out);
}} // namespaces
#endif // LM_INTERPOLATE_NORMALIZE_H
#include "normalize.hh"
#include "interpolate_info.hh"
#include "merge_probabilities.hh"
#include "../common/ngram_stream.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/multi_stream.hh"
#define BOOST_TEST_MODULE NormalizeTest
#include <boost/test/unit_test.hpp>
namespace lm { namespace interpolate { namespace {
// log without backoff
const float kInputs[] = {-0.3, 1.2, -9.8, 4.0, -7.0, 0.0};
class WriteInput {
public:
WriteInput() {}
void Run(const util::stream::ChainPosition &to) {
util::stream::Stream out(to);
for (WordIndex i = 0; i < sizeof(kInputs) / sizeof(float); ++i, ++out) {
memcpy(out.Get(), &i, sizeof(WordIndex));
memcpy((uint8_t*)out.Get() + sizeof(WordIndex), &kInputs[i], sizeof(float));
}
out.Poison();
}
};
void CheckOutput(const util::stream::ChainPosition &from) {
NGramStream<float> in(from);
float sum = 0.0;
for (WordIndex i = 0; i < sizeof(kInputs) / sizeof(float) - 1 /* <s> at the end */; ++i) {
sum += pow(10.0, kInputs[i]);
}
sum = log10(sum);
BOOST_REQUIRE(in);
BOOST_CHECK_CLOSE(kInputs[0] - sum, in->Value(), 0.0001);
BOOST_REQUIRE(++in);
BOOST_CHECK_CLOSE(kInputs[1] - sum, in->Value(), 0.0001);
BOOST_REQUIRE(++in);
BOOST_CHECK_CLOSE(kInputs[2] - sum, in->Value(), 0.0001);
BOOST_REQUIRE(++in);
BOOST_CHECK_CLOSE(kInputs[3] - sum, in->Value(), 0.0001);
BOOST_REQUIRE(++in);
BOOST_CHECK_CLOSE(kInputs[4] - sum, in->Value(), 0.0001);
BOOST_REQUIRE(++in);
BOOST_CHECK_CLOSE(kInputs[5] - sum, in->Value(), 0.0001);
BOOST_CHECK(!++in);
}
BOOST_AUTO_TEST_CASE(Unigrams) {
InterpolateInfo info;
info.lambdas.push_back(2.0);
info.lambdas.push_back(-0.1);
info.orders.push_back(1);
info.orders.push_back(1);
BOOST_CHECK_EQUAL(0, MakeEncoder(info, 1).EncodedLength());
// No backoffs.
util::stream::Chains blank(0);
util::FixedArray<util::stream::ChainPositions> models_by_order(2);
models_by_order.push_back(blank);
models_by_order.push_back(blank);
util::stream::Chains merged_probabilities(1);
util::stream::Chains probabilities_out(1);
util::stream::Chains backoffs_out(0);
merged_probabilities.push_back(util::stream::ChainConfig(sizeof(WordIndex) + sizeof(float) + sizeof(float), 2, 24));
probabilities_out.push_back(util::stream::ChainConfig(sizeof(WordIndex) + sizeof(float), 2, 100));
merged_probabilities[0] >> WriteInput();
Normalize(info, models_by_order, merged_probabilities, probabilities_out, backoffs_out);
util::stream::ChainPosition checker(probabilities_out[0].Add());
merged_probabilities >> util::stream::kRecycle;
probabilities_out >> util::stream::kRecycle;
CheckOutput(checker);
probabilities_out.Wait();
}
}}} // namespaces
#include "pipeline.hh"
#include "../common/compare.hh"
#include "../common/print.hh"
#include "../common/renumber.hh"
#include "../vocab.hh"
#include "backoff_reunification.hh"
#include "interpolate_info.hh"
#include "merge_probabilities.hh"
#include "merge_vocab.hh"
#include "normalize.hh"
#include "universal_vocab.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/count_records.hh"
#include "../../util/stream/io.hh"
#include "../../util/stream/multi_stream.hh"
#include "../../util/stream/sort.hh"
#include "../../util/fixed_array.hh"
namespace lm { namespace interpolate { namespace {
/* Put the original input files on chains and renumber them */
void SetupInputs(std::size_t buffer_size, const UniversalVocab &vocab, util::FixedArray<ModelBuffer> &models, bool exclude_highest, util::FixedArray<util::stream::Chains> &chains, util::FixedArray<util::stream::ChainPositions> &positions) {
chains.clear();
positions.clear();
// TODO: much better memory sizing heuristics e.g. not making the chain larger than it will use.
util::stream::ChainConfig config(0, 2, buffer_size);
for (std::size_t i = 0; i < models.size(); ++i) {
chains.push_back(models[i].Order() - exclude_highest);
for (std::size_t j = 0; j < models[i].Order() - exclude_highest; ++j) {
config.entry_size = sizeof(WordIndex) * (j + 1) + sizeof(float) * 2; // TODO do not include wasteful backoff for highest.
chains.back().push_back(config);
}
if (i == models.size() - 1)
chains.back().back().ActivateProgress();
models[i].Source(chains.back());
for (std::size_t j = 0; j < models[i].Order() - exclude_highest; ++j) {
chains[i][j] >> Renumber(vocab.Mapping(i), j + 1);
}
}
for (std::size_t i = 0; i < chains.size(); ++i) {
positions.push_back(chains[i]);
}
}
template <class Compare> void SinkSort(const util::stream::SortConfig &config, util::stream::Chains &chains, util::stream::Sorts<Compare> &sorts) {
for (std::size_t i = 0; i < chains.size(); ++i) {
sorts.push_back(chains[i], config, Compare(i + 1));
}
}
template <class Compare> void SourceSort(util::stream::Chains &chains, util::stream::Sorts<Compare> &sorts) {
// TODO memory management
for (std::size_t i = 0; i < sorts.size(); ++i) {
sorts[i].Merge(sorts[i].DefaultLazy());
}
for (std::size_t i = 0; i < sorts.size(); ++i) {
sorts[i].Output(chains[i], sorts[i].DefaultLazy());
}
}
} // namespace
void Pipeline(util::FixedArray<ModelBuffer> &models, const Config &config, int write_file) {
// Setup InterpolateInfo and UniversalVocab.
InterpolateInfo info;
info.lambdas = config.lambdas;
std::vector<WordIndex> vocab_sizes;
util::scoped_fd vocab_null(util::MakeTemp(config.sort.temp_prefix));
std::size_t max_order = 0;
util::FixedArray<int> vocab_files(models.size());
for (ModelBuffer *i = models.begin(); i != models.end(); ++i) {
info.orders.push_back(i->Order());
vocab_sizes.push_back(i->Counts()[0]);
vocab_files.push_back(i->VocabFile());
max_order = std::max(max_order, i->Order());
}
util::scoped_ptr<UniversalVocab> vocab(new UniversalVocab(vocab_sizes));
{
ngram::ImmediateWriteWordsWrapper writer(NULL, vocab_null.get(), 0);
MergeVocab(vocab_files, *vocab, writer);
}
std::cerr << "Merging probabilities." << std::endl;
// Pass 1: merge probabilities
util::FixedArray<util::stream::Chains> input_chains(models.size());
util::FixedArray<util::stream::ChainPositions> models_by_order(models.size());
SetupInputs(config.BufferSize(), *vocab, models, false, input_chains, models_by_order);
util::stream::Chains merged_probs(max_order);
for (std::size_t i = 0; i < max_order; ++i) {
merged_probs.push_back(util::stream::ChainConfig(PartialProbGamma::TotalSize(info, i + 1), 2, config.BufferSize())); // TODO: not buffer_size
}
merged_probs >> MergeProbabilities(info, models_by_order);
std::vector<uint64_t> counts(max_order);
for (std::size_t i = 0; i < max_order; ++i) {
merged_probs[i] >> util::stream::CountRecords(&counts[i]);
}
for (util::stream::Chains *i = input_chains.begin(); i != input_chains.end(); ++i) {
*i >> util::stream::kRecycle;
}
// Pass 2: normalize.
{
util::stream::Sorts<ContextOrder> sorts(merged_probs.size());
SinkSort(config.sort, merged_probs, sorts);
merged_probs.Wait(true);
for (util::stream::Chains *i = input_chains.begin(); i != input_chains.end(); ++i) {
i->Wait(true);
}
SourceSort(merged_probs, sorts);
}
std::cerr << "Normalizing" << std::endl;
SetupInputs(config.BufferSize(), *vocab, models, true, input_chains, models_by_order);
util::stream::Chains probabilities(max_order), backoffs(max_order - 1);
std::size_t block_count = 2;
for (std::size_t i = 0; i < max_order; ++i) {
// Careful accounting to ensure RewindableStream can fit the entire vocabulary.
block_count = std::max<std::size_t>(block_count, 2);
// This much needs to fit in RewindableStream.
std::size_t fit = NGram<float>::TotalSize(i + 1) * counts[0];
// fit / (block_count - 1) rounded up
std::size_t min_block = (fit + block_count - 2) / (block_count - 1);
std::size_t specify = std::max(config.BufferSize(), min_block * block_count);
probabilities.push_back(util::stream::ChainConfig(NGram<float>::TotalSize(i + 1), block_count, specify));
}
for (std::size_t i = 0; i < max_order - 1; ++i) {
backoffs.push_back(util::stream::ChainConfig(sizeof(float), 2, config.BufferSize()));
}
Normalize(info, models_by_order, merged_probs, probabilities, backoffs);
util::FixedArray<util::stream::FileBuffer> backoff_buffers(backoffs.size());
for (std::size_t i = 0; i < max_order - 1; ++i) {
backoff_buffers.push_back(util::MakeTemp(config.sort.temp_prefix));
backoffs[i] >> backoff_buffers.back().Sink() >> util::stream::kRecycle;
}
for (util::stream::Chains *i = input_chains.begin(); i != input_chains.end(); ++i) {
*i >> util::stream::kRecycle;
}
merged_probs >> util::stream::kRecycle;
// Pass 3: backoffs in the right place.
{
util::stream::Sorts<SuffixOrder> sorts(probabilities.size());
SinkSort(config.sort, probabilities, sorts);
probabilities.Wait(true);
for (util::stream::Chains *i = input_chains.begin(); i != input_chains.end(); ++i) {
i->Wait(true);
}
backoffs.Wait(true);
merged_probs.Wait(true);
// destroy universal vocab to save RAM.
vocab.reset();
SourceSort(probabilities, sorts);
}
std::cerr << "Reunifying backoffs" << std::endl;
util::stream::ChainPositions prob_pos(max_order - 1);
util::stream::Chains combined(max_order - 1);
for (std::size_t i = 0; i < max_order - 1; ++i) {
if (i == max_order - 2)
backoffs[i].ActivateProgress();
backoffs[i].SetProgressTarget(backoff_buffers[i].Size());
backoffs[i] >> backoff_buffers[i].Source(true);
prob_pos.push_back(probabilities[i].Add());
combined.push_back(util::stream::ChainConfig(NGram<ProbBackoff>::TotalSize(i + 1), 2, config.BufferSize()));
}
util::stream::ChainPositions backoff_pos(backoffs);
ReunifyBackoff(prob_pos, backoff_pos, combined);
util::stream::ChainPositions output_pos(max_order);
for (std::size_t i = 0; i < max_order - 1; ++i) {
output_pos.push_back(combined[i].Add());
}
output_pos.push_back(probabilities.back().Add());
probabilities >> util::stream::kRecycle;
backoffs >> util::stream::kRecycle;
combined >> util::stream::kRecycle;
// TODO genericize to ModelBuffer etc.
PrintARPA(vocab_null.get(), write_file, counts).Run(output_pos);
}
}} // namespaces
#ifndef LM_INTERPOLATE_PIPELINE_H
#define LM_INTERPOLATE_PIPELINE_H
#include "../common/model_buffer.hh"
#include "../../util/fixed_array.hh"
#include "../../util/stream/config.hh"
#include <cstddef>
#include <string>
namespace lm { namespace interpolate {
struct Config {
std::vector<float> lambdas;
util::stream::SortConfig sort;
std::size_t BufferSize() const { return sort.buffer_size; }
};
void Pipeline(util::FixedArray<ModelBuffer> &models, const Config &config, int write_file);
}} // namespaces
#endif // LM_INTERPOLATE_PIPELINE_H
#include "split_worker.hh"
#include "../common/ngram.hh"
namespace lm {
namespace interpolate {
SplitWorker::SplitWorker(std::size_t order, util::stream::Chain &backoff_chain,
util::stream::Chain &sort_chain)
: order_(order) {
backoff_chain >> backoff_input_;
sort_chain >> sort_input_;
}
void SplitWorker::Run(const util::stream::ChainPosition &position) {
// input: ngram record (id, prob, and backoff)
// output: a float to the backoff_input stream
// an ngram id and a float to the sort_input stream
for (util::stream::Stream stream(position); stream; ++stream) {
NGram<ProbBackoff> ngram(stream.Get(), order_);
// write id and prob to the sort stream
float prob = ngram.Value().prob;
lm::WordIndex *out = reinterpret_cast<lm::WordIndex *>(sort_input_.Get());
for (const lm::WordIndex *it = ngram.begin(); it != ngram.end(); ++it) {
*out++ = *it;
}
*reinterpret_cast<float *>(out) = prob;
++sort_input_;
// write backoff to the backoff output stream
float boff = ngram.Value().backoff;
*reinterpret_cast<float *>(backoff_input_.Get()) = boff;
++backoff_input_;
}
sort_input_.Poison();
backoff_input_.Poison();
}
}
}
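// Record flow through SplitWorker::Run above, shown for an order-2 ngram:
//
//   input (suffix order) : [WordIndex w0][WordIndex w1][float prob][float backoff]
//   sort_input_ record   : [WordIndex w0][WordIndex w1][float prob]
//   backoff_input_ record: [float backoff]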
#ifndef KENLM_INTERPOLATE_SPLIT_WORKER_H_
#define KENLM_INTERPOLATE_SPLIT_WORKER_H_
#include "../../util/stream/chain.hh"
#include "../../util/stream/stream.hh"
namespace lm {
namespace interpolate {
class SplitWorker {
public:
/**
* Constructs a split worker for a particular order. It writes the
* split-off backoff values to the backoff chain and the ngram id and
* probability to the sort chain for each ngram in the input.
*/
SplitWorker(std::size_t order, util::stream::Chain &backoff_chain,
util::stream::Chain &sort_chain);
/**
* The callback invoked to handle the input from the ngram intermediate
* files.
*/
void Run(const util::stream::ChainPosition& position);
private:
/**
* The ngram order we are reading/writing for.
*/
std::size_t order_;
/**
* The stream to write to for the backoff values.
*/
util::stream::Stream backoff_input_;
/**
* The stream to write to for the ngram id + probability values.
*/
util::stream::Stream sort_input_;
};
}
}
#endif
#include "../common/compare.hh"
#include "../common/model_buffer.hh"
#include "../common/ngram.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/multi_stream.hh"
#include "../../util/stream/sort.hh"
#include "split_worker.hh"
#include <boost/program_options.hpp>
#include <boost/version.hpp>
#if defined(_WIN32) || defined(_WIN64)
// Windows doesn't define <unistd.h>
//
// So we define what we need here instead:
//
#define STDIN_FILENO 0
#define STDOUT_FILENO 1
#else // Huzzah for POSIX!
#include <unistd.h>
#endif
/*
* This is a simple example program that takes in intermediate
* suffix-sorted ngram files and outputs two sets of files: one for backoff
* probability values (raw numbers, in suffix order) and one for
* probability values (ngram id and probability, in *context* order)
*/
int main(int argc, char *argv[]) {
using namespace lm::interpolate;
const std::size_t ONE_GB = 1 << 30;
const std::size_t SIXTY_FOUR_MB = 1 << 26;
const std::size_t NUMBER_OF_BLOCKS = 2;
std::string FILE_NAME = "ngrams";
std::string CONTEXT_SORTED_FILENAME = "csorted-ngrams";
std::string BACKOFF_FILENAME = "backoffs";
std::string TMP_DIR = "/tmp/";
try {
namespace po = boost::program_options;
po::options_description options("canhazinterp Pass-3 options");
options.add_options()
("help,h", po::bool_switch(), "Show this help message")
("ngrams,n", po::value<std::string>(&FILE_NAME), "ngrams file")
("csortngrams,c", po::value<std::string>(&CONTEXT_SORTED_FILENAME), "context sorted ngrams file")
("backoffs,b", po::value<std::string>(&BACKOFF_FILENAME), "backoffs file")
("tmpdir,t", po::value<std::string>(&TMP_DIR), "tmp dir");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, options), vm);
// Without notify, the variables bound via po::value (FILE_NAME etc.)
// would never be updated from the command line.
po::notify(vm);
// Display help
if(vm["help"].as<bool>()) {
std::cerr << "Usage: " << options << std::endl;
return 1;
}
}
catch(const std::exception &e) {
std::cerr << e.what() << std::endl;
return 1;
}
// The basic strategy here is to have three chains:
// - The first reads the ngram order inputs using ModelBuffer. Those are
// then stripped of their backoff values and fed into the third chain;
// the backoff values *themselves* are written to the second chain.
//
// - The second chain takes the backoff values and writes them out to a
// file (one for each order).
//
// - The third chain takes just the probability values and ngrams and
// writes them out, sorted in context-order, to a file (one for each
// order).
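//
// Schematically, for each ngram order N (a sketch, not from the source):
//
//   ngrams.N -> ngram_inputs[N-1] -> SplitWorker -> backoff_chains[N-1] -> backoffs.N
//                                        |
//                                        +-> prob_chains[N-1] -> sort -> csorted-ngrams.N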
// This will be used to read in the binary intermediate files. There is
// one file per order (e.g. ngrams.1, ngrams.2, ...)
lm::ModelBuffer buffer(FILE_NAME);
// Create separate chains for each ngram order for:
// - Input from the intermediate files
// - Output to the backoff file
// - Output to the (context-sorted) probability file
util::stream::Chains ngram_inputs(buffer.Order());
util::stream::Chains backoff_chains(buffer.Order());
util::stream::Chains prob_chains(buffer.Order());
for (std::size_t i = 0; i < buffer.Order(); ++i) {
ngram_inputs.push_back(util::stream::ChainConfig(
lm::NGram<lm::ProbBackoff>::TotalSize(i + 1), NUMBER_OF_BLOCKS, ONE_GB));
backoff_chains.push_back(
util::stream::ChainConfig(sizeof(float), NUMBER_OF_BLOCKS, ONE_GB));
prob_chains.push_back(util::stream::ChainConfig(
sizeof(lm::WordIndex) * (i + 1) + sizeof(float), NUMBER_OF_BLOCKS,
ONE_GB));
}
// This sets the input for each of the ngram order chains to the
// appropriate file
buffer.Source(ngram_inputs);
util::FixedArray<util::scoped_ptr<SplitWorker> > workers(buffer.Order());
for (std::size_t i = 0; i < buffer.Order(); ++i) {
// Attach a SplitWorker to each of the ngram input chains, writing to the
// corresponding order's backoff and probability chains
workers.push_back(
new SplitWorker(i + 1, backoff_chains[i], prob_chains[i]));
ngram_inputs[i] >> boost::ref(*workers.back());
}
util::stream::SortConfig sort_cfg;
sort_cfg.temp_prefix = TMP_DIR;
sort_cfg.buffer_size = SIXTY_FOUR_MB;
sort_cfg.total_memory = ONE_GB;
// This will parallel merge sort the individual order files, putting
// them in context-order instead of suffix-order.
//
// Two new threads will be running, each owned by the prob_chains[i] object.
// - The first executes BlockSorter.Run() to sort the n-gram entries
// - The second executes WriteAndRecycle.Run() to write each sorted
// block to disk as a temporary file
util::stream::Sorts<lm::ContextOrder> sorts(buffer.Order());
for (std::size_t i = 0; i < prob_chains.size(); ++i) {
sorts.push_back(prob_chains[i], sort_cfg, lm::ContextOrder(i + 1));
}
// Set the sort output to be on the same chain
for (std::size_t i = 0; i < prob_chains.size(); ++i) {
// The following call to Chain::Wait()
// joins the threads owned by prob_chains[i].
//
// As such, the call won't return
// until all threads owned by prob_chains[i] have completed.
//
// The call also resets prob_chains[i]
// so that it can be reused
// (including freeing the memory previously used by the chain).
prob_chains[i].Wait();
// In an ideal world (without memory restrictions)
// we could merge all of the previously sorted blocks
// by reading them all completely into memory
// and then running merge sort over them.
//
// In the real world, we have memory restrictions;
// depending on how many blocks we have,
// and how much memory we can use to read from each block
// (sort_config.buffer_size)
// it may be the case that we have insufficient memory
// to read sort_config.buffer_size of data from each block from disk.
//
// If this occurs, then it will be necessary to perform one or more rounds
// of merge sort on disk;
// doing so will reduce the number of blocks that we will eventually
// need to read from
// when performing the final round of merge sort in memory.
//
// So, the following call determines whether it is necessary
// to perform one or more rounds of merge sort on disk;
// if such on-disk merge sorting is required, such sorting is performed.
//
// Finally, the following method launches a thread that calls
// OwningMergingReader.Run()
// to perform the final round of merge sort in memory.
//
// Merge sort could have been invoked directly
// so that merge sort memory doesn't coexist with Chain memory.
sorts[i].Output(prob_chains[i]);
}
// Create another model buffer for our output on e.g. csorted-ngrams.1,
// csorted-ngrams.2, ...
lm::ModelBuffer output_buf(CONTEXT_SORTED_FILENAME, true, false);
output_buf.Sink(prob_chains, buffer.Counts());
// Create a third model buffer for our backoff output on e.g. backoffs.1,
// backoffs.2, ...
lm::ModelBuffer boff_buf(BACKOFF_FILENAME, true, false);
boff_buf.Sink(backoff_chains, buffer.Counts());
// Wait(true) loops over every chain in the Chains object,
// calling Chain::Wait() on each to join all the threads it owns.
ngram_inputs.Wait(true);
backoff_chains.Wait(true);
prob_chains.Wait(true);
return 0;
}
#include "tune_derivatives.hh"
#include "tune_instances.hh"
#include "tune_matrix.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/typed_stream.hh"
#include <Eigen/Core>
namespace lm { namespace interpolate {
Accum Derivatives(Instances &in, const Vector &weights, Vector &gradient, Matrix &hessian) {
gradient = in.CorrectGradientTerm();
hessian = Matrix::Zero(weights.rows(), weights.rows());
// TODO: loop instead to force low-memory evaluation?
// Compute p_I(x)*Z_{\epsilon} i.e. the unnormalized probabilities
Vector weighted_uni((in.LNUnigrams() * weights).array().exp());
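// Concretely, weighted_uni(x) = exp(\sum_i w_i ln p_i(x)) = p_I(x) * Z_epsilon.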
// Even -inf doesn't work for <s> because weights can be negative. Manually set it to zero.
weighted_uni(in.BOS()) = 0.0;
Accum Z_epsilon = weighted_uni.sum();
// unigram_cross(i) = \sum_{all x} p_I(x) ln p_i(x)
Vector unigram_cross(in.LNUnigrams().transpose() * weighted_uni / Z_epsilon);
Accum sum_B_I = 0.0;
Accum sum_ln_Z_context = 0.0;
// Temporaries used each cycle of the loop.
Matrix convolve;
Vector full_cross;
Matrix hessian_missing_Z_context;
// Backed off ln p_i(x)B_i(context)
Vector ln_p_i_backed;
// Full ln p_i(x | context)
Vector ln_p_i_full;
// TODO make configurable memory size.
util::stream::Chain chain(util::stream::ChainConfig(in.ReadExtensionsEntrySize(), 2, 64 << 20));
chain.ActivateProgress();
in.ReadExtensions(chain);
util::stream::TypedStream<Extension> extensions(chain.Add());
chain >> util::stream::kRecycle;
// Loop over instances (words in the tuning data).
for (InstanceIndex n = 0; n < in.NumInstances(); ++n) {
assert(extensions);
Accum weighted_backoffs = exp(in.LNBackoffs(n).dot(weights));
// Compute \sum_{x: model does not back off to unigram} p_I(x)Z(epsilon)
Accum unnormalized_sum_x_p_I = 0.0;
// Compute \sum_{x: model does not back off to unigram} p_I(x | context)Z(context)
Accum unnormalized_sum_x_p_I_full = 0.0;
// This should be divided by Z_context then added to the Hessian.
hessian_missing_Z_context = Matrix::Zero(weights.rows(), weights.rows());
full_cross = Vector::Zero(weights.rows());
// Loop over words within an instance for which an extension exists. An extension happens when any model matches more than a unigram in the tuning instance.
while (extensions && extensions->instance == n) {
const WordIndex word = extensions->word;
unnormalized_sum_x_p_I += weighted_uni(word);
ln_p_i_backed = in.LNUnigrams().row(word) + in.LNBackoffs(n);
// Calculate ln_p_i_full(i) = ln p_i(word | context) by filling in unigrams then overwriting with extensions.
ln_p_i_full = ln_p_i_backed;
// Loop over all models that have an extension for the same word, i.e. where p_i(word | context) matches at least a bigram.
for (; extensions && extensions->word == word && extensions->instance == n; ++extensions) {
ln_p_i_full(extensions->model) = extensions->ln_prob;
}
// This is the weighted product of probabilities. In other words, p_I(word | context) * Z(context) = exp(\sum_i w_i * ln p_i(word | context)).
Accum weighted = exp(ln_p_i_full.dot(weights));
unnormalized_sum_x_p_I_full += weighted;
// These aren't normalized by Z_context (happens later)
full_cross.noalias() +=
weighted * ln_p_i_full
- weighted_uni(word) * weighted_backoffs /* we'll divide by Z_context later to form B_I */ * in.LNUnigrams().row(word).transpose();
// This will get multiplied by Z_context then added to the Hessian.
hessian_missing_Z_context.noalias() +=
// Replacement terms.
weighted * ln_p_i_full * ln_p_i_full.transpose()
// Presumed unigrams. Z_epsilon * weighted_backoffs will turn into B_I once all of this is divided by Z_context.
- weighted_uni(word) * weighted_backoffs * ln_p_i_backed * ln_p_i_backed.transpose();
}
Accum Z_context =
weighted_backoffs * (Z_epsilon - unnormalized_sum_x_p_I) // Back off and unnormalize the unigrams for which there is no extension.
+ unnormalized_sum_x_p_I_full; // Add the extensions.
sum_ln_Z_context += log(Z_context);
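// Since weighted_backoffs = \prod_i b_i(context)^{w_i}, the next line forms
// B_I(context) = Z_epsilon * \prod_i b_i(context)^{w_i} / Z_context.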
Accum B_I = Z_epsilon / Z_context * weighted_backoffs;
sum_B_I += B_I;
// This is the gradient term for this instance except for -log p_i(w_n | w_1^{n-1}) which was accounted for as part of neg_correct_sum_.
// full_cross(i) is \sum_{all x} p_I(x | context) log p_i(x | context)
// Prior terms excluded dividing by Z_context because it wasn't known at the time.
full_cross /= Z_context;
full_cross +=
// Uncorrected term
B_I * (in.LNBackoffs(n).transpose() + unigram_cross)
// Subtract values that should not have been charged.
- unnormalized_sum_x_p_I / Z_epsilon * B_I * in.LNBackoffs(n).transpose();
gradient += full_cross;
convolve = unigram_cross * in.LNBackoffs(n);
// There's one missing term here, which is independent of context and done at the end.
hessian.noalias() +=
// First term of Hessian, assuming all models back off to unigram.
B_I * (convolve + convolve.transpose() + in.LNBackoffs(n).transpose() * in.LNBackoffs(n))
// Error in the first term, correcting from unigram to full probabilities.
+ hessian_missing_Z_context / Z_context
// Second term of Hessian, with correct full probabilities.
- full_cross * full_cross.transpose();
}
for (Matrix::Index x = 0; x < weighted_uni.rows(); ++x) {
// \sum_{contexts} B_I(context) \sum_x p_I(x) log p_i(x) log p_j(x)
// TODO can this be optimized? It's summing over the entire vocab which should be a matrix operation.
hessian.noalias() += sum_B_I * weighted_uni(x) / Z_epsilon * in.LNUnigrams().row(x).transpose() * in.LNUnigrams().row(x);
}
return exp((in.CorrectGradientTerm().dot(weights) + sum_ln_Z_context) / static_cast<double>(in.NumInstances()));
}
}} // namespaces
#ifndef LM_INTERPOLATE_TUNE_DERIVATIVES_H
#define LM_INTERPOLATE_TUNE_DERIVATIVES_H
#include "tune_matrix.hh"
#include <Eigen/Core>
#include <cmath>
namespace lm { namespace interpolate {
class Instances;
// Given tuning instances and model weights, computes the objective function (log probability), gradient, and Hessian.
// Returns the perplexity of the tuning data under the interpolated model, i.e. exp(average negative log probability per instance).
Accum Derivatives(Instances &instances /* Doesn't modify but ReadExtensions is lazy */, const Vector &weights, Vector &gradient, Matrix &hessian);
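//
// A minimal calling sketch (hypothetical setup; "models" is the model count):
//   Vector weights = Vector::Constant(models, 1.0);
//   Vector gradient(models);
//   Matrix hessian(models, models);
//   Accum perplexity = Derivatives(instances, weights, gradient, hessian);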
}} // namespaces
#endif // LM_INTERPOLATE_TUNE_DERIVATIVES_H
#include "tune_derivatives.hh"
#include "tune_instances.hh"
#include "../../util/stream/config.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/io.hh"
#include "../../util/stream/typed_stream.hh"
#define BOOST_TEST_MODULE DerivativeTest
#include <boost/test/unit_test.hpp>
namespace lm { namespace interpolate {
class MockInstances : public Instances {
public:
MockInstances() : chain_(util::stream::ChainConfig(ReadExtensionsEntrySize(), 2, 100)), write_(chain_.Add()) {
extensions_subsequent_.reset(new util::stream::FileBuffer(util::MakeTemp("/tmp/")));
chain_ >> extensions_subsequent_->Sink() >> util::stream::kRecycle;
}
Matrix &LNUnigrams() { return ln_unigrams_; }
BackoffMatrix &LNBackoffs() { return ln_backoffs_; }
WordIndex &BOS() { return bos_; }
Vector &NegLNCorrectSum() { return neg_ln_correct_sum_; }
// Extensions must be provided sorted!
void AddExtension(const Extension &extension) {
*write_ = extension;
++write_;
}
void DoneExtending() {
write_.Poison();
chain_.Wait(true);
}
private:
util::stream::Chain chain_;
util::stream::TypedStream<Extension> write_;
};
namespace {
BOOST_AUTO_TEST_CASE(Small) {
MockInstances mock;
{
// Three vocabulary words plus <s>, two models.
Matrix unigrams(4, 2);
unigrams <<
0.1, 0.6,
0.4, 0.3,
0.5, 0.1,
// <s>
1.0, 1.0;
mock.LNUnigrams() = unigrams.array().log();
}
mock.BOS() = 3;
// One instance
mock.LNBackoffs().resize(1, 2);
mock.LNBackoffs() << 0.2, 0.4;
mock.LNBackoffs() = mock.LNBackoffs().array().log();
// Sparse extensions: model 0 word 2 and model 1 word 1.
// Assuming that model 1 only matches word 1, this is p_1(1 | context)
Accum model_1_word_1 = 1.0 - .6 * .4 - .1 * .4;
mock.NegLNCorrectSum().resize(2);
// We'll suppose correct has WordIndex 1, which backs off in model 0, and matches in model 1
mock.NegLNCorrectSum() << (0.4 * 0.2), model_1_word_1;
mock.NegLNCorrectSum() = -mock.NegLNCorrectSum().array().log();
Accum model_0_word_2 = 1.0 - .1 * .2 - .4 * .2;
Extension ext;
ext.instance = 0;
ext.word = 1;
ext.model = 1;
ext.ln_prob = log(model_1_word_1);
mock.AddExtension(ext);
ext.instance = 0;
ext.word = 2;
ext.model = 0;
ext.ln_prob = log(model_0_word_2);
mock.AddExtension(ext);
mock.DoneExtending();
Vector weights(2);
weights << 0.9, 1.2;
Vector gradient(2);
Matrix hessian(2,2);
Derivatives(mock, weights, gradient, hessian);
// TODO: check perplexity value coming out.
// p_I(x | context)
Vector p_I(3);
p_I <<
pow(0.1 * 0.2, 0.9) * pow(0.6 * 0.4, 1.2),
pow(0.4 * 0.2, 0.9) * pow(model_1_word_1, 1.2),
pow(model_0_word_2, 0.9) * pow(0.1 * 0.4, 1.2);
p_I /= p_I.sum();
Vector expected_gradient = mock.NegLNCorrectSum();
expected_gradient(0) += p_I(0) * log(0.1 * 0.2);
expected_gradient(0) += p_I(1) * log(0.4 * 0.2);
expected_gradient(0) += p_I(2) * log(model_0_word_2);
BOOST_CHECK_CLOSE(expected_gradient(0), gradient(0), 0.01);
expected_gradient(1) += p_I(0) * log(0.6 * 0.4);
expected_gradient(1) += p_I(1) * log(model_1_word_1);
expected_gradient(1) += p_I(2) * log(0.1 * 0.4);
BOOST_CHECK_CLOSE(expected_gradient(1), gradient(1), 0.01);
Matrix expected_hessian(2, 2);
expected_hessian(1, 0) =
// First term
p_I(0) * log(0.1 * 0.2) * log(0.6 * 0.4) +
p_I(1) * log(0.4 * 0.2) * log(model_1_word_1) +
p_I(2) * log(model_0_word_2) * log(0.1 * 0.4);
expected_hessian(1, 0) -=
(p_I(0) * log(0.1 * 0.2) + p_I(1) * log(0.4 * 0.2) + p_I(2) * log(model_0_word_2)) *
(p_I(0) * log(0.6 * 0.4) + p_I(1) * log(model_1_word_1) + p_I(2) * log(0.1 * 0.4));
expected_hessian(0, 1) = expected_hessian(1, 0);
BOOST_CHECK_CLOSE(expected_hessian(1, 0), hessian(1, 0), 0.01);
BOOST_CHECK_CLOSE(expected_hessian(0, 1), hessian(0, 1), 0.01);
}
}}} // namespaces
/* Load tuning instances and filter underlying models to them. A tuning
* instance is an n-gram in the tuning file. To tune towards these, we want
* the correct probability p_i(w_n | w_1^{n-1}) from each model as well as
* all the denominators p_i(v | w_1^{n-1}) that appear in normalization.
*
* In other words, we filter the models to only those n-grams whose context
* appears in the tuning data. This can be divided into two categories:
* - All unigrams. This goes into Instances::ln_unigrams_
* - Bigrams and above whose context appears in the tuning data. These are
* known as extensions. We only care about the longest extension for each
* w_1^{n-1}v since that is what will be used for the probability.
* Because there is a large number of extensions (we tried keeping them in RAM
* and ran out), the streaming framework is used to keep track of extensions
* and sort them so they can be streamed in. Downstream code
* (tune_derivatives.hh) takes a stream of extensions ordered by tuning
* instance, the word v, and the model the extension came from.
*/
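/* Illustrative example (not in the original comment): if the tuning data
 * contains "a b" and some model contains the bigram "a b", then every word's
 * unigram probability goes into Instances::ln_unigrams_, while p(b | a) is
 * recorded as an extension for the tuning instance whose context ends in "a".
 */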
#include "tune_instances.hh"
#include "../common/compare.hh"
#include "../common/joint_order.hh"
#include "../common/model_buffer.hh"
#include "../common/ngram_stream.hh"
#include "../common/renumber.hh"
#include "../enumerate_vocab.hh"
#include "merge_vocab.hh"
#include "universal_vocab.hh"
#include "../lm_exception.hh"
#include "../../util/file_piece.hh"
#include "../../util/murmur_hash.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/io.hh"
#include "../../util/stream/sort.hh"
#include "../../util/tokenize_piece.hh"
#include <boost/shared_ptr.hpp>
#include <boost/unordered_map.hpp>
#include <cmath>
#include <limits>
#include <vector>
namespace lm { namespace interpolate {
// gcc 4.6 complains about uninitialized values when sort code is generated for a 4-byte POD. But that sort code is never used.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
bool Extension::operator<(const Extension &other) const {
if (instance != other.instance)
return instance < other.instance;
if (word != other.word)
return word < other.word;
if (model != other.model)
return model < other.model;
return false;
}
#pragma GCC diagnostic pop
namespace {
// An extension without backoff weights applied yet.
#pragma pack(push)
#pragma pack(1)
struct InitialExtension {
Extension ext;
// Order from which it came.
uint8_t order;
};
#pragma pack(pop)
struct InitialExtensionCompare {
bool operator()(const void *first, const void *second) const {
return reinterpret_cast<const InitialExtension *>(first)->ext < reinterpret_cast<const InitialExtension *>(second)->ext;
}
};
// Intended use
// For each model:
// stream through orders jointly in suffix order:
// Call MatchedBackoff for full matches.
// Call Exit when the context matches.
// Call FinishModel with the unigram probability of the correct word, get full
// probability in return.
// Use backoffs_out to adjust records that were written to the stream.
// backoffs_out(model, order - 1) is the penalty for matching order.
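//
// A hedged sketch of that sequence (variable names hypothetical):
//   InstanceMatch match(backoffs, correct_word);
//   match.MatchedBackoff(model, order, ln_backoff);  // full n-gram matched
//   match.Exit(initial_extension, out);              // context matched
//   float ln_correct = match.FinishModel(model, correct_ln_unigram);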
class InstanceMatch {
public:
InstanceMatch(Matrix &backoffs_out, const WordIndex correct)
: seen_(std::numeric_limits<WordIndex>::max()),
backoffs_(backoffs_out),
correct_(correct), correct_from_(1), correct_ln_prob_(std::numeric_limits<float>::quiet_NaN()) {}
void MatchedBackoff(ModelIndex model, uint8_t order, float ln_backoff) {
backoffs_(model, order - 1) = ln_backoff;
}
// We only want the highest-order matches, which are the first to be exited for a given word.
void Exit(const InitialExtension &from, util::stream::Stream &out) {
if (from.ext.word == seen_) return;
seen_ = from.ext.word;
*static_cast<InitialExtension*>(out.Get()) = from;
++out;
if (UTIL_UNLIKELY(correct_ == from.ext.word)) {
correct_from_ = from.order;
correct_ln_prob_ = from.ext.ln_prob;
}
}
WordIndex Correct() const { return correct_; }
// Call this after each model has been passed through. Provide the unigram
// probability of the correct word (which follows the given context).
// This function will return the fully-backed-off probability of the correct
// word.
float FinishModel(ModelIndex model, float correct_ln_unigram) {
seen_ = std::numeric_limits<WordIndex>::max();
// Turn backoffs into multiplied values (added in log space).
// So backoffs_(model, order - 1) is the penalty for matching order.
float accum = 0.0;
for (int order = backoffs_.cols() - 1; order >= 0; --order) {
accum += backoffs_(model, order);
backoffs_(model, order) = accum;
}
if (correct_from_ == 1) {
correct_ln_prob_ = correct_ln_unigram;
}
if (correct_from_ - 1 < backoffs_.cols()) {
correct_ln_prob_ += backoffs_(model, correct_from_ - 1);
}
correct_from_ = 1;
return correct_ln_prob_;
}
private:
// What's the last word we've seen? Used to act only on exiting the longest match.
WordIndex seen_;
Matrix &backoffs_;
const WordIndex correct_;
// These only apply to the most recent model.
uint8_t correct_from_;
float correct_ln_prob_;
};
// Forward information to multiple instances of a context. So if the tuning
// set contains
// a b c d e
// a b c d e
// there's one DispatchContext for a b c d which calls two InstanceMatch, one
// for each tuning instance. This might be to inform them about a b c d g in
// one of the models.
class DispatchContext {
public:
void Register(InstanceMatch &context) {
registered_.push_back(&context);
}
void MatchedBackoff(ModelIndex model, uint8_t order, float ln_backoff) {
for (std::vector<InstanceMatch*>::iterator i = registered_.begin(); i != registered_.end(); ++i)
(*i)->MatchedBackoff(model, order, ln_backoff);
}
void Exit(InitialExtension &from, util::stream::Stream &out, const InstanceMatch *base_instance) {
for (std::vector<InstanceMatch*>::iterator i = registered_.begin(); i != registered_.end(); ++i) {
from.ext.instance = *i - base_instance;
(*i)->Exit(from, out);
}
}
private:
// TODO make these offsets in a big array rather than separately allocated.
std::vector<InstanceMatch*> registered_;
};
// Map from n-gram hash to contexts in the tuning data. TODO: probing hash table?
typedef boost::unordered_map<uint64_t, DispatchContext> ContextMap;
// Handle all the orders of a single model at once.
class JointOrderCallback {
public:
JointOrderCallback(
std::size_t model,
std::size_t full_order_minus_1,
ContextMap &contexts,
util::stream::Stream &out,
const InstanceMatch *base_instance)
: full_order_minus_1_(full_order_minus_1),
contexts_(contexts),
out_(out),
base_instance_(base_instance) {
ext_.ext.model = model;
}
void Enter(std::size_t order_minus_1, const void *data) {}
void Exit(std::size_t order_minus_1, void *data) {
// Match the full n-gram for backoffs.
if (order_minus_1 != full_order_minus_1_) {
NGram<ProbBackoff> gram(data, order_minus_1 + 1);
ContextMap::iterator i = contexts_.find(util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex)));
if (UTIL_UNLIKELY(i != contexts_.end())) {
i->second.MatchedBackoff(ext_.ext.model, gram.Order(), gram.Value().backoff * M_LN10);
}
}
// Match the context of the n-gram to indicate it's an extension.
ContextMap::iterator i = contexts_.find(util::MurmurHashNative(data, order_minus_1 * sizeof(WordIndex)));
if (UTIL_UNLIKELY(i != contexts_.end())) {
NGram<Prob> gram(data, order_minus_1 + 1);
// model was set in the constructor and instance is set by DispatchContext
// inside the Exit call. That leaves word, ln_prob, and order.
ext_.ext.word = *(gram.end() - 1);
ext_.ext.ln_prob = gram.Value().prob * M_LN10;
ext_.order = order_minus_1 + 1;
i->second.Exit(ext_, out_, base_instance_);
}
}
void Run(const util::stream::ChainPositions &positions) {
JointOrder<JointOrderCallback, SuffixOrder>(positions, *this);
}
private:
const std::size_t full_order_minus_1_;
// Mapping is constant but values are being manipulated to tell them about
// n-grams.
ContextMap &contexts_;
// Reused variable. model is set correctly.
InitialExtension ext_;
util::stream::Stream &out_;
const InstanceMatch *const base_instance_;
};
// This populates the ln_unigrams_ matrix. It can (and should for efficiency)
// be run in the same scan as JointOrderCallback.
class ReadUnigrams {
public:
explicit ReadUnigrams(Matrix::ColXpr out) : out_(out) {}
// Read renumbered unigrams, fill with <unk> otherwise.
void Run(const util::stream::ChainPosition &position) {
NGramStream<ProbBackoff> stream(position);
assert(stream);
Accum unk = stream->Value().prob * M_LN10;
WordIndex previous = 0;
for (; stream; ++stream) {
WordIndex word = *stream->begin();
out_.segment(previous, word - previous) = Vector::Constant(word - previous, unk);
out_(word) = stream->Value().prob * M_LN10;
// Backoffs are used by JointOrderCallback.
previous = word + 1;
}
out_.segment(previous, out_.rows() - previous) = Vector::Constant(out_.rows() - previous, unk);
}
private:
Matrix::ColXpr out_;
};
// Read tuning data into an array of vocab ids. The vocab ids are agreed with MergeVocab.
class IdentifyTuning : public EnumerateVocab {
public:
IdentifyTuning(int tuning_file, std::vector<WordIndex> &out) : indices_(out) {
indices_.clear();
StringPiece line;
std::size_t counter = 0;
std::vector<std::size_t> &eos = words_[util::MurmurHashNative("</s>", 4)];
for (util::FilePiece f(tuning_file); f.ReadLineOrEOF(line);) {
for (util::TokenIter<util::BoolCharacter, true> word(line, util::kSpaces); word; ++word) {
UTIL_THROW_IF(*word == "<s>" || *word == "</s>", FormatLoadException, "Illegal word in tuning data: " << *word);
words_[util::MurmurHashNative(word->data(), word->size())].push_back(counter++);
}
eos.push_back(counter++);
}
// Also get <s>
indices_.resize(counter + 1);
words_[util::MurmurHashNative("<s>", 3)].push_back(indices_.size() - 1);
}
// Apply ids as they come out of MergeVocab if they match.
void Add(WordIndex id, const StringPiece &str) {
boost::unordered_map<uint64_t, std::vector<std::size_t> >::iterator i = words_.find(util::MurmurHashNative(str.data(), str.size()));
if (i != words_.end()) {
for (std::vector<std::size_t>::iterator j = i->second.begin(); j != i->second.end(); ++j) {
indices_[*j] = id;
}
}
}
WordIndex FinishGetBOS() {
WordIndex ret = indices_.back();
indices_.pop_back();
return ret;
}
private:
// array of words in tuning data.
std::vector<WordIndex> &indices_;
// map from hash(string) to offsets in indices_.
boost::unordered_map<uint64_t, std::vector<std::size_t> > words_;
};
} // namespace
// Store information about the first iteration.
class ExtensionsFirstIteration {
public:
explicit ExtensionsFirstIteration(std::size_t instances, std::size_t models, std::size_t max_order, util::stream::Chain &extension_input, const util::stream::SortConfig &config)
: backoffs_by_instance_(new std::vector<Matrix>(instances)), sort_(extension_input, config) {
// Initialize all the backoff matrices to zeros.
for (std::vector<Matrix>::iterator i = backoffs_by_instance_->begin(); i != backoffs_by_instance_->end(); ++i) {
*i = Matrix::Zero(models, max_order);
}
}
Matrix &WriteBackoffs(std::size_t instance) {
return (*backoffs_by_instance_)[instance];
}
// Get the backoff all the way to unigram for a particular tuning instance and model.
Accum FullBackoff(std::size_t instance, std::size_t model) const {
return (*backoffs_by_instance_)[instance](model, 0);
}
void Merge(std::size_t lazy_memory) {
sort_.Merge(lazy_memory);
lazy_memory_ = lazy_memory;
}
void Output(util::stream::Chain &chain) {
sort_.Output(chain, lazy_memory_);
chain >> ApplyBackoffs(backoffs_by_instance_);
}
private:
class ApplyBackoffs {
public:
explicit ApplyBackoffs(boost::shared_ptr<std::vector<Matrix> > backoffs_by_instance)
: backoffs_by_instance_(backoffs_by_instance) {}
void Run(const util::stream::ChainPosition &position) {
// There should always be tuning instances.
const std::vector<Matrix> &backoffs = *backoffs_by_instance_;
assert(!backoffs.empty());
uint8_t max_order = backoffs.front().cols();
for (util::stream::Stream stream(position); stream; ++stream) {
InitialExtension &ini = *reinterpret_cast<InitialExtension*>(stream.Get());
assert(ini.order > 1); // If it's an extension, it should be higher than a unigram.
if (ini.order != max_order) {
ini.ext.ln_prob += backoffs[ini.ext.instance](ini.ext.model, ini.order - 1);
}
}
}
private:
boost::shared_ptr<std::vector<Matrix> > backoffs_by_instance_;
};
// Array of complete backoff matrices by instance.
// Each matrix is by model, then by order.
// Would have liked to use a tensor but it's not that well supported.
// This is a shared pointer so that ApplyBackoffs can run after this class is gone.
boost::shared_ptr<std::vector<Matrix> > backoffs_by_instance_;
// This sorts and stores all the InitialExtensions.
util::stream::Sort<InitialExtensionCompare> sort_;
std::size_t lazy_memory_;
};
Instances::Instances(int tune_file, const std::vector<StringPiece> &model_names, const InstancesConfig &config) : temp_prefix_(config.sort.temp_prefix) {
// All the memory from stack variables here should go away before merge sort of the instances.
{
util::FixedArray<ModelBuffer> models(model_names.size());
// Load tuning set and join vocabulary.
std::vector<WordIndex> vocab_sizes;
vocab_sizes.reserve(model_names.size());
util::FixedArray<int> vocab_files(model_names.size());
std::size_t max_order = 0;
for (std::vector<StringPiece>::const_iterator i = model_names.begin(); i != model_names.end(); ++i) {
models.push_back(*i);
vocab_sizes.push_back(models.back().Counts()[0]);
vocab_files.push_back(models.back().VocabFile());
max_order = std::max(max_order, models.back().Order());
}
UniversalVocab vocab(vocab_sizes);
std::vector<WordIndex> tuning_words;
WordIndex combined_vocab_size;
{
IdentifyTuning identify(tune_file, tuning_words);
combined_vocab_size = MergeVocab(vocab_files, vocab, identify);
bos_ = identify.FinishGetBOS();
}
// Setup the initial extensions storage: a chain going to a sort with a stream in the middle for writing.
util::stream::Chain extensions_chain(util::stream::ChainConfig(sizeof(InitialExtension), 2, config.extension_write_chain_mem));
util::stream::Stream extensions_write(extensions_chain.Add());
extensions_first_.reset(new ExtensionsFirstIteration(tuning_words.size(), model_names.size(), max_order, extensions_chain, config.sort));
// Populate the ContextMap from contexts to instances.
ContextMap cmap;
util::FixedArray<InstanceMatch> instances(tuning_words.size());
{
UTIL_THROW_IF2(tuning_words.empty(), "Empty tuning data");
const WordIndex eos = tuning_words.back();
std::vector<WordIndex> context;
context.push_back(bos_);
for (std::size_t i = 0; i < tuning_words.size(); ++i) {
instances.push_back(boost::ref(extensions_first_->WriteBackoffs(i)), tuning_words[i]);
for (std::size_t j = 0; j < context.size(); ++j) {
cmap[util::MurmurHashNative(&context[j], sizeof(WordIndex) * (context.size() - j))].Register(instances.back());
}
// Prepare for next word by starting a new sentence or shifting context.
if (tuning_words[i] == eos) {
context.clear();
context.push_back(bos_);
} else {
if (context.size() == max_order) {
context.erase(context.begin());
}
context.push_back(tuning_words[i]);
}
}
}
// Go through each model. Populate:
// ln_backoffs_
ln_backoffs_.resize(instances.size(), models.size());
// neg_ln_correct_sum_
neg_ln_correct_sum_.resize(models.size());
// ln_unigrams_
ln_unigrams_.resize(combined_vocab_size, models.size());
// The backoffs in extensions_first_
for (std::size_t m = 0; m < models.size(); ++m) {
std::cerr << "Processing model " << m << '/' << models.size() << ": " << model_names[m] << std::endl;
util::stream::Chains chains(models[m].Order());
for (std::size_t i = 0; i < models[m].Order(); ++i) {
// TODO: stop wasting space for backoffs of highest order.
chains.push_back(util::stream::ChainConfig(NGram<ProbBackoff>::TotalSize(i + 1), 2, config.model_read_chain_mem));
}
chains.back().ActivateProgress();
models[m].Source(chains);
for (std::size_t i = 0; i < models[m].Order(); ++i) {
chains[i] >> Renumber(vocab.Mapping(m), i + 1);
}
// Populate ln_unigrams_.
chains[0] >> ReadUnigrams(ln_unigrams_.col(m));
// Send extensions into extensions_first_ and give data to the instances about backoffs/extensions.
chains >> JointOrderCallback(m, models[m].Order() - 1, cmap, extensions_write, instances.begin());
chains >> util::stream::kRecycle;
chains.Wait(true);
neg_ln_correct_sum_(m) = 0.0;
for (InstanceMatch *i = instances.begin(); i != instances.end(); ++i) {
neg_ln_correct_sum_(m) -= i->FinishModel(m, ln_unigrams_(i->Correct(), m));
ln_backoffs_(i - instances.begin(), m) = extensions_first_->FullBackoff(i - instances.begin(), m);
}
ln_unigrams_(bos_, m) = 0; // Does not matter as long as it does not produce NaNs, since tune_derivatives will overwrite the output.
}
extensions_write.Poison();
}
extensions_first_->Merge(config.lazy_memory);
}
Instances::~Instances() {}
// TODO: size reduction by excluding order for subsequent passes.
std::size_t Instances::ReadExtensionsEntrySize() const {
return sizeof(InitialExtension);
}
void Instances::ReadExtensions(util::stream::Chain &on) {
if (extensions_first_.get()) {
// Lazy sort and save a sorted copy to disk. TODO: cut down on record size by stripping out order information.
extensions_first_->Output(on);
extensions_first_.reset(); // Relevant data will continue to live in workers.
extensions_subsequent_.reset(new util::stream::FileBuffer(util::MakeTemp(temp_prefix_)));
on >> extensions_subsequent_->Sink();
} else {
on.SetProgressTarget(extensions_subsequent_->Size());
on >> extensions_subsequent_->Source();
}
}
// Back door.
Instances::Instances() {}
}} // namespaces