Commit 688b6eac authored by SWHL

Update files
#include "bounded_sequence_encoding.hh"
#include "../../util/scoped.hh"
#define BOOST_TEST_MODULE BoundedSequenceEncodingTest
#include <boost/test/unit_test.hpp>
namespace lm {
namespace interpolate {
namespace {
BOOST_AUTO_TEST_CASE(Simple) {
unsigned char bounds[] = {2};
BoundedSequenceEncoding enc(bounds, bounds + 1);
util::scoped_malloc backing(util::MallocOrThrow(enc.EncodedLength()));
unsigned char input = 1;
enc.Encode(&input, backing.get());
unsigned char output;
enc.Decode(backing.get(), &output);
BOOST_CHECK_EQUAL(1, output);
}
void ExhaustiveTest(unsigned char *bound_begin, unsigned char *bound_end) {
BoundedSequenceEncoding enc(bound_begin, bound_end);
util::scoped_malloc backing(util::MallocOrThrow(enc.EncodedLength()));
std::vector<unsigned char> values(bound_end - bound_begin),
out(bound_end - bound_begin);
while (true) {
enc.Encode(&values[0], backing.get());
enc.Decode(backing.get(), &out[0]);
for (std::size_t i = 0; i != values.size(); ++i) {
BOOST_CHECK_EQUAL(values[i], out[i]);
}
for (std::size_t i = 0;; ++i) {
if (i == values.size()) return;
++values[i];
if (values[i] < bound_begin[i]) break;
values[i] = 0;
}
}
}
void CheckEncodeDecode(unsigned char *bounds, unsigned char *input,
unsigned char *output, std::size_t len) {
BoundedSequenceEncoding encoder(bounds, bounds + len);
util::scoped_malloc backing(util::MallocOrThrow(encoder.EncodedLength()));
encoder.Encode(input, backing.get());
encoder.Decode(backing.get(), output);
for (std::size_t i = 0; i < len; ++i) {
BOOST_CHECK_EQUAL(input[i], output[i]);
}
}
BOOST_AUTO_TEST_CASE(Exhaustive) {
unsigned char bounds[] = {5, 2, 3, 9, 7, 20, 8};
ExhaustiveTest(bounds, bounds + sizeof(bounds) / sizeof(unsigned char));
}
BOOST_AUTO_TEST_CASE(LessThan64) {
unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 3};
unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 2};
unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0};
std::size_t len = sizeof(bounds) / sizeof(unsigned char);
assert(sizeof(input) / sizeof(unsigned char) == len);
assert(sizeof(output) / sizeof(unsigned char) == len);
CheckEncodeDecode(bounds, input, output, len);
}
BOOST_AUTO_TEST_CASE(Exactly64) {
unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 255};
unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 16};
unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0};
std::size_t len = sizeof(bounds) / sizeof(unsigned char);
assert(sizeof(input) / sizeof(unsigned char) == len);
assert(sizeof(output) / sizeof(unsigned char) == len);
CheckEncodeDecode(bounds, input, output, len);
}
BOOST_AUTO_TEST_CASE(MoreThan64) {
unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 255, 255};
unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 16, 137};
unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
std::size_t len = sizeof(bounds) / sizeof(unsigned char);
assert(sizeof(input) / sizeof(unsigned char) == len);
assert(sizeof(output) / sizeof(unsigned char) == len);
CheckEncodeDecode(bounds, input, output, len);
}
}}} // namespaces
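// A minimal standalone sketch of the idea the tests above exercise: pack a
// sequence of small integers, each bounded by a per-position maximum, into
// the fewest bits that still round-trip exactly. This is not the KenLM
// implementation (which also handles totals above 64 bits, as the
// MoreThan64 test shows); it assumes the bit widths sum to at most 64.
#include <cstddef>
#include <stdint.h>

// Bits needed to store any value in [0, bound): ceil(log2(bound)).
inline unsigned BitsFor(unsigned char bound) {
  unsigned bits = 0;
  while ((1u << bits) < bound) ++bits;
  return bits;
}

inline uint64_t PackSketch(const unsigned char *values,
                           const unsigned char *bounds, std::size_t n) {
  uint64_t packed = 0;
  unsigned shift = 0;
  for (std::size_t i = 0; i < n; ++i) {
    packed |= static_cast<uint64_t>(values[i]) << shift;
    shift += BitsFor(bounds[i]);
  }
  return packed;
}

inline void UnpackSketch(uint64_t packed, const unsigned char *bounds,
                         std::size_t n, unsigned char *out) {
  unsigned shift = 0;
  for (std::size_t i = 0; i < n; ++i) {
    unsigned bits = BitsFor(bounds[i]);
    out[i] = static_cast<unsigned char>((packed >> shift) & ((1ull << bits) - 1));
    shift += bits;
  }
}
// Round trip, mirroring the tests above:
//   unsigned char bounds[] = {5, 2, 3}, in[] = {4, 1, 2}, out[3];
//   UnpackSketch(PackSketch(in, bounds, 3), bounds, 3, out);
//   // out is now {4, 1, 2} again.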
#ifndef KENLM_INTERPOLATE_INTERPOLATE_INFO_H
#define KENLM_INTERPOLATE_INTERPOLATE_INFO_H
#include <cstddef>
#include <vector>
#include <stdint.h>
namespace lm {
namespace interpolate {
/**
* Stores relevant info for interpolating several language models, for use
* during the three-pass offline log-linear interpolation algorithm.
*/
struct InterpolateInfo {
/**
* @return the number of models being interpolated
*/
std::size_t Models() const {
return orders.size();
}
/**
* The lambda (interpolation weight) for each model.
*/
std::vector<float> lambdas;
/**
* The maximum ngram order for each model.
*/
std::vector<uint8_t> orders;
};
}
}
#endif
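// Minimal usage sketch for InterpolateInfo (hypothetical values): two
// models, a 5-gram weighted 0.7 and a 3-gram weighted 0.3:
//
//   lm::interpolate::InterpolateInfo info;
//   info.lambdas.push_back(0.7f); info.lambdas.push_back(0.3f);
//   info.orders.push_back(5);     info.orders.push_back(3);
//   // info.Models() == 2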
#include "../common/model_buffer.hh"
#include "../common/size_option.hh"
#include "pipeline.hh"
#include "tune_instances.hh"
#include "tune_weights.hh"
#include "../../util/fixed_array.hh"
#include "../../util/usage.hh"
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas" // Older gcc doesn't have "-Wunused-local-typedefs" and complains.
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#include <Eigen/Core>
#pragma GCC diagnostic pop
#include <boost/program_options.hpp>
#include <iostream>
#include <vector>
namespace {
void MungeWeightArgs(int argc, char *argv[], std::vector<const char *> &munged_args) {
// Boost program options doesn't accept -w 0.2 -0.1 because it thinks -0.1 is an
// option. There appears to be no standard way to fix this without breaking
// single-dash arguments. So here's a hack: put a -w before every number
// if it's within the scope of a weight argument.
munged_args.push_back(argv[0]);
char **inside_weights = NULL;
for (char **i = argv + 1; i < argv + argc; ++i) {
StringPiece arg(*i);
if (starts_with(arg, "-w") || starts_with(arg, "--w")) {
inside_weights = i;
} else if (inside_weights && arg.size() >= 2 && arg[0] == '-' && ((arg[1] >= '0' && arg[1] <= '9') || arg[1] == '.')) {
// If a negative number appears right after -w, don't add another -w.
// And do stay inside weights.
if (inside_weights + 1 != i) {
munged_args.push_back("-w");
}
} else if (starts_with(arg, "-")) {
inside_weights = NULL;
}
munged_args.push_back(*i);
}
}
} // namespace
int main(int argc, char *argv[]) {
try {
Eigen::initParallel();
lm::interpolate::Config pipe_config;
lm::interpolate::InstancesConfig instances_config;
std::vector<std::string> input_models;
std::string tuning_file;
namespace po = boost::program_options;
po::options_description options("Log-linear interpolation options");
options.add_options()
("help,h", po::bool_switch(), "Show this help message")
("model,m", po::value<std::vector<std::string> >(&input_models)->multitoken()->required(), "Models to interpolate, which must be in KenLM intermediate format. The intermediate format can be generated using the --intermediate argument to lmplz.")
("weight,w", po::value<std::vector<float> >(&pipe_config.lambdas)->multitoken(), "Interpolation weights")
("tuning,t", po::value<std::string>(&tuning_file), "File to tune on: a text file with one sentence per line")
("just_tune", po::bool_switch(), "Tune and print weights then quit")
("temp_prefix,T", po::value<std::string>(&pipe_config.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
("memory,S", lm::SizeOption(pipe_config.sort.total_memory, util::GuessPhysicalMemory() ? "50%" : "1G"), "Sorting memory: this is a very rough guide")
("sort_block", lm::SizeOption(pipe_config.sort.buffer_size, "64M"), "Block size");
po::variables_map vm;
std::vector<const char *> munged_args;
MungeWeightArgs(argc, argv, munged_args);
po::store(po::parse_command_line((int)munged_args.size(), &*munged_args.begin(), options), vm);
if (argc == 1 || vm["help"].as<bool>()) {
std::cerr << "Interpolate multiple models\n" << options << std::endl;
return 1;
}
po::notify(vm);
instances_config.sort = pipe_config.sort;
instances_config.model_read_chain_mem = instances_config.sort.buffer_size;
instances_config.extension_write_chain_mem = instances_config.sort.total_memory;
instances_config.lazy_memory = instances_config.sort.total_memory;
if (pipe_config.lambdas.empty() && tuning_file.empty()) {
std::cerr << "Provide a tuning file with -t xor weights with -w." << std::endl;
return 1;
}
if (!pipe_config.lambdas.empty() && !tuning_file.empty()) {
std::cerr << "Provide weights xor a tuning file, not both." << std::endl;
return 1;
}
if (!tuning_file.empty()) {
// Tune weights
std::vector<StringPiece> model_names;
for (std::vector<std::string>::const_iterator i = input_models.begin(); i != input_models.end(); ++i) {
model_names.push_back(*i);
}
lm::interpolate::TuneWeights(util::OpenReadOrThrow(tuning_file.c_str()), model_names, instances_config, pipe_config.lambdas);
std::cerr << "Final weights:";
std::ostream &to = vm["just_tune"].as<bool>() ? std::cout : std::cerr;
for (std::vector<float>::const_iterator i = pipe_config.lambdas.begin(); i != pipe_config.lambdas.end(); ++i) {
to << ' ' << *i;
}
to << std::endl;
}
if (vm["just_tune"].as<bool>()) {
return 0;
}
if (pipe_config.lambdas.size() != input_models.size()) {
std::cerr << "Number of models (" << input_models.size() << ") should match the number of weights (" << pipe_config.lambdas.size() << ")." << std::endl;
return 1;
}
util::FixedArray<lm::ModelBuffer> models(input_models.size());
for (std::size_t i = 0; i < input_models.size(); ++i) {
models.push_back(input_models[i]);
}
lm::interpolate::Pipeline(models, pipe_config, 1);
} catch (const std::exception &e) {
std::cerr << e.what() << std::endl;
return 1;
}
return 0;
}
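// Example invocations (hypothetical file and binary names). Interpolate two
// intermediate-format models with fixed weights, writing ARPA to stdout
// (the write_file argument to Pipeline above is fd 1):
//
//   interpolate -m a.im b.im -w 0.6 0.4 >interpolated.arpa
//
// or tune the weights on held-out text first:
//
//   interpolate -m a.im b.im -t tune.txt >interpolated.arpa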
#include "merge_probabilities.hh"
#include "../common/ngram_stream.hh"
#include "bounded_sequence_encoding.hh"
#include "interpolate_info.hh"
#include <algorithm>
#include <limits>
#include <numeric>
namespace lm {
namespace interpolate {
/**
* Helper to generate the BoundedSequenceEncoding used for writing the
* "from" values.
*/
BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order) {
util::FixedArray<uint8_t> max_orders(info.orders.size());
for (std::size_t i = 0; i < info.orders.size(); ++i) {
max_orders.push_back(std::min(order, info.orders[i]));
}
return BoundedSequenceEncoding(max_orders.begin(), max_orders.end());
}
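// Worked example (illustrative numbers, not taken from the code above):
// with three models of orders {5, 2, 3} and order == 3, max_orders becomes
// {3, 2, 3}, so the encoder must hold values in [0,3) x [0,2) x [0,3):
// ceil(log2(3)) + ceil(log2(2)) + ceil(log2(3)) = 2 + 1 + 2 = 5 bits,
// which EncodedLength() rounds up to whole bytes.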
namespace {
/**
* A simple wrapper class that holds information needed to read and write
* the ngrams of a particular order. This class has the memory needed to
* buffer the data needed for the recursive process of computing the
* probabilities and "from" values for each component model.
*
* "From" values indicate, for each model, what order (as an index, so -1)
* was backed off to in order to arrive at a probability. For example, if a
* 5-gram model (order index 4) backed off twice, we would write a 2.
*/
class NGramHandler {
public:
NGramHandler(uint8_t order, const InterpolateInfo &ifo,
util::FixedArray<util::stream::ChainPositions> &models_by_order)
: info(ifo),
encoder(MakeEncoder(info, order)),
out_record(order, encoder.EncodedLength()) {
std::size_t count_has_order = 0;
for (std::size_t i = 0; i < models_by_order.size(); ++i) {
count_has_order += (models_by_order[i].size() >= order);
}
inputs_.Init(count_has_order);
for (std::size_t i = 0; i < models_by_order.size(); ++i) {
if (models_by_order[i].size() < order)
continue;
inputs_.push_back(models_by_order[i][order - 1]);
if (inputs_.back()) {
active_.resize(active_.size() + 1);
active_.back().model = i;
active_.back().stream = &inputs_.back();
}
}
// have to Init() these outside the initializer list since NGramStreams
// doesn't forward a ChainPositions to the GenericStreams constructor
probs.Init(info.Models());
from.Init(info.Models());
for (std::size_t i = 0; i < info.Models(); ++i) {
probs.push_back(0.0);
from.push_back(0);
}
}
struct StreamIndex {
NGramStream<ProbBackoff> *stream;
NGramStream<ProbBackoff> &Stream() { return *stream; }
std::size_t model;
};
std::size_t ActiveSize() const {
return active_.size();
}
/**
* @return the input stream for a particular model that corresponds to
* this ngram order
*/
StreamIndex &operator[](std::size_t idx) {
return active_[idx];
}
void erase(std::size_t idx) {
active_.erase(active_.begin() + idx);
}
const InterpolateInfo &info;
BoundedSequenceEncoding encoder;
PartialProbGamma out_record;
util::FixedArray<float> probs;
util::FixedArray<uint8_t> from;
private:
std::vector<StreamIndex> active_;
NGramStreams<ProbBackoff> inputs_;
};
/**
* A collection of NGramHandlers.
*/
class NGramHandlers : public util::FixedArray<NGramHandler> {
public:
explicit NGramHandlers(std::size_t num)
: util::FixedArray<NGramHandler>(num) {
}
void push_back(
std::size_t order, const InterpolateInfo &info,
util::FixedArray<util::stream::ChainPositions> &models_by_order) {
new (end()) NGramHandler(order, info, models_by_order);
Constructed();
}
};
/**
* The recursive helper function that computes probability and "from"
* values for all ngrams matching a particular suffix.
*
* The current order can be computed as the suffix length + 1. Note that
* the suffix could be empty (suffix_begin == suffix_end == NULL), in which
* case we are handling unigrams with the UNK token as the fallback
* probability.
*
* @param handlers The full collection of handlers
* @param suffix_begin A start iterator for the suffix
* @param suffix_end An end iterator for the suffix
* @param fallback_probs The probabilities of this ngram if we need to
* back off (that is, the probability of the suffix)
* @param fallback_from The order that the corresponding fallback
* probability in the fallback_probs is from
* @param combined_fallback interpolated fallback_probs
* @param outputs The output streams, one for each order
*/
void HandleSuffix(NGramHandlers &handlers, WordIndex *suffix_begin,
WordIndex *suffix_end,
const util::FixedArray<float> &fallback_probs,
const util::FixedArray<uint8_t> &fallback_from,
float combined_fallback,
util::stream::Streams &outputs) {
uint8_t order = std::distance(suffix_begin, suffix_end) + 1;
if (order > outputs.size()) return;
util::stream::Stream &output = outputs[order - 1];
NGramHandler &handler = handlers[order - 1];
while (true) {
// find the next smallest ngram which matches our suffix
// TODO: priority queue driven.
WordIndex *minimum = NULL;
for (std::size_t i = 0; i < handler.ActiveSize(); ++i) {
if (!std::equal(suffix_begin, suffix_end, handler[i].Stream()->begin() + 1))
continue;
// if we either haven't set a minimum yet or this one is smaller than
// the minimum we found before, replace it
WordIndex *last = handler[i].Stream()->begin();
if (!minimum || *last < *minimum) { minimum = handler[i].Stream()->begin(); }
}
// no more ngrams of this order match our suffix, so we're done
if (!minimum) return;
handler.out_record.ReBase(output.Get());
std::copy(minimum, minimum + order, handler.out_record.begin());
// Default case is having backed off.
std::copy(fallback_probs.begin(), fallback_probs.end(), handler.probs.begin());
std::copy(fallback_from.begin(), fallback_from.end(), handler.from.begin());
for (std::size_t i = 0; i < handler.ActiveSize();) {
if (std::equal(handler.out_record.begin(), handler.out_record.end(),
handler[i].Stream()->begin())) {
handler.probs[handler[i].model] = handler.info.lambdas[handler[i].model] * handler[i].Stream()->Value().prob;
handler.from[handler[i].model] = order - 1;
if (++handler[i].Stream()) {
++i;
} else {
handler.erase(i);
}
} else {
++i;
}
}
handler.out_record.Prob() = std::accumulate(handler.probs.begin(), handler.probs.end(), 0.0);
handler.out_record.LowerProb() = combined_fallback;
handler.encoder.Encode(handler.from.begin(),
handler.out_record.FromBegin());
// we've handled this particular ngram, so now recurse to the higher
// order using the current ngram as the suffix
HandleSuffix(handlers, handler.out_record.begin(), handler.out_record.end(),
handler.probs, handler.from, handler.out_record.Prob(), outputs);
// consume the output
++output;
}
}
/**
* Kicks off the recursion for computing the probabilities and "from"
* values for each ngram order. We begin by handling the UNK token that
* should be at the front of each of the unigram input streams. This is
* then output to the stream and used as the fallback for handling our
* unigram case; each unigram in turn serves as the fallback for the bigram case,
* etc.
*/
void HandleNGrams(NGramHandlers &handlers, util::stream::Streams &outputs) {
PartialProbGamma unk_record(1, 0);
// First: populate the unk probabilities by reading the first unigram
// from each stream
util::FixedArray<float> unk_probs(handlers[0].info.Models());
// start by populating the ngram id from the first stream
lm::NGram<ProbBackoff> ngram = *handlers[0][0].Stream();
unk_record.ReBase(outputs[0].Get());
std::copy(ngram.begin(), ngram.end(), unk_record.begin());
unk_record.Prob() = 0;
// then populate the probabilities into unk_probs while "multiplying" the
// model probabilities together into the unk record
//
// note that from doesn't need to be set for unigrams
assert(handlers[0].ActiveSize() == handlers[0].info.Models());
for (std::size_t i = 0; i < handlers[0].info.Models();) {
ngram = *handlers[0][i].Stream();
unk_probs.push_back(handlers[0].info.lambdas[i] * ngram.Value().prob);
unk_record.Prob() += unk_probs[i];
assert(*ngram.begin() == kUNK);
if (++handlers[0][i].Stream()) {
++i;
} else {
handlers[0].erase(i);
}
}
float unk_combined = unk_record.Prob();
unk_record.LowerProb() = unk_combined;
// flush the unk output record
++outputs[0];
// Then, begin outputting everything in lexicographic order: first we'll
// get the unigram then the first bigram with that context, then the
// first trigram with that bigram context, etc., until we exhaust all of
// the ngrams, then all of the (n-1)grams, etc.
//
// This function is the "root" of this recursive process.
util::FixedArray<uint8_t> unk_from(handlers[0].info.Models());
for (std::size_t i = 0; i < handlers[0].info.Models(); ++i) {
unk_from.push_back(0);
}
// the two nulls are to encode that our "fallback" word is the "0-gram"
// case, i.e. we "backed off" to UNK
// TODO: stop generating vocab ids and LowerProb for unigrams.
HandleSuffix(handlers, NULL, NULL, unk_probs, unk_from, unk_combined, outputs);
// Verify we reached the end. And poison!
for (std::size_t i = 0; i < handlers.size(); ++i) {
UTIL_THROW_IF2(handlers[i].ActiveSize(),
"MergeProbabilities did not exhaust all ngram streams");
outputs[i].Poison();
}
}
} // namespace
void MergeProbabilities::Run(const util::stream::ChainPositions &output_pos) {
NGramHandlers handlers(output_pos.size());
for (std::size_t i = 0; i < output_pos.size(); ++i) {
handlers.push_back(i + 1, info_, models_by_order_);
}
util::stream::Streams outputs(output_pos);
HandleNGrams(handlers, outputs);
}
}} // namespaces
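// A standalone sketch (plain C++, not the streaming machinery above) of the
// per-ngram arithmetic in HandleSuffix: every model's slot starts at its
// weighted fallback (suffix) value, models that actually contain the ngram
// overwrite their slot with lambda_m * log10 p_m, and the record's
// probability is the sum over slots. Backoff penalties are charged later,
// in the normalize pass.
#include <cstddef>
#include <vector>

inline double MergeOneNGram(
    const std::vector<float> &lambdas,     // interpolation weights
    const std::vector<float> &fallback,    // lambda_m * log10 p_m(suffix)
    const std::vector<bool> &has_ngram,    // does model m contain the ngram?
    const std::vector<float> &model_logp,  // log10 p_m(ngram) where present
    std::vector<float> &probs) {           // out: per-model contributions
  probs = fallback;                        // default: the model backed off
  double total = 0.0;
  for (std::size_t m = 0; m < lambdas.size(); ++m) {
    if (has_ngram[m]) probs[m] = lambdas[m] * model_logp[m];
    total += probs[m];
  }
  return total;                            // becomes the record's Prob()
}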
#ifndef LM_INTERPOLATE_MERGE_PROBABILITIES_H
#define LM_INTERPOLATE_MERGE_PROBABILITIES_H
#include "../common/ngram.hh"
#include "bounded_sequence_encoding.hh"
#include "../../util/fixed_array.hh"
#include "../../util/stream/multi_stream.hh"
#include <stdint.h>
namespace lm {
namespace interpolate {
struct InterpolateInfo;
/**
* Make the encoding of backoff values for a given order. This stores values
* in [PartialProbGamma::FromBegin(), PartialProbGamma::FromEnd())
*/
BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order);
/**
* The first pass for the offline log-linear interpolation algorithm. This
* reads K **suffix-ordered** streams for each model, for each order, of
* ngram records (ngram-id, prob, backoff). It further assumes that the
* ngram-ids have been unified over all of the stream inputs.
*
* Its output is records of (ngram-id, prob-prod, backoff-level,
* backoff-level, ...) where the backoff-levels (of which there are K) are
* the context length (0 for unigrams) that the corresponding model had to
* back off to in order to obtain a probability for that ngram-id. Each of
* these streams is terminated with a record whose ngram-id is all
* maximum-integers for simplicity in implementation here.
*
* @param models_by_order An array of length N (N = max_i N_i) containing
* the ChainPositions for the streams for order (i + 1).
* Run() is attached to the output chains, one for each order.
*/
class MergeProbabilities {
public:
MergeProbabilities(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order)
: info_(info), models_by_order_(models_by_order) {}
void Run(const util::stream::ChainPositions &outputs);
private:
const InterpolateInfo &info_;
util::FixedArray<util::stream::ChainPositions> &models_by_order_;
};
/**
* This class represents the output payload for this pass, which consists
* of an ngram-id, a probability, and then a vector of orders from which
* each of the component models backed off to for this ngram, encoded
* using the BoundedSequenceEncoding class.
*/
class PartialProbGamma : public lm::NGramHeader {
public:
PartialProbGamma(std::size_t order, std::size_t backoff_bytes)
: lm::NGramHeader(NULL, order), backoff_bytes_(backoff_bytes) {
// nothing
}
std::size_t TotalSize() const {
return sizeof(WordIndex) * Order() + sizeof(After) + backoff_bytes_;
}
// TODO: cache bounded sequence encoding in the pipeline?
static std::size_t TotalSize(const InterpolateInfo &info, uint8_t order) {
return sizeof(WordIndex) * order + sizeof(After) + MakeEncoder(info, order).EncodedLength();
}
float &Prob() { return Pay().prob; }
float Prob() const { return Pay().prob; }
float &LowerProb() { return Pay().lower_prob; }
float LowerProb() const { return Pay().lower_prob; }
const uint8_t *FromBegin() const { return Pay().from; }
uint8_t *FromBegin() { return Pay().from; }
private:
struct After {
// Note that backoff_and_normalize assumes this comes first.
float prob;
float lower_prob;
uint8_t from[];
};
const After &Pay() const { return *reinterpret_cast<const After *>(end()); }
After &Pay() { return *reinterpret_cast<After*>(end()); }
std::size_t backoff_bytes_;
};
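// Record layout implied by the accessors above, for an order-n ngram and K
// models (total size == TotalSize()):
//
//   WordIndex[n]  ngram ids    (NGramHeader's begin() .. end())
//   float         prob         (sum over models of lambda_m * log10 p_m)
//   float         lower_prob   (the same quantity for the (n-1)-gram suffix)
//   uint8_t[]     from         (BoundedSequenceEncoding of the K backoff
//                               levels; backoff_bytes_ bytes)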
}} // namespaces
#endif // LM_INTERPOLATE_MERGE_PROBABILITIES_H
#include "merge_vocab.hh"
#include "../enumerate_vocab.hh"
#include "universal_vocab.hh"
#include "../lm_exception.hh"
#include "../vocab.hh"
#include "../../util/file_piece.hh"
#include <queue>
#include <string>
#include <iostream>
#include <vector>
namespace lm {
namespace interpolate {
namespace {
class VocabFileReader {
public:
explicit VocabFileReader(const int fd, size_t model_num, uint64_t offset = 0);
VocabFileReader &operator++();
operator bool() const { return !eof_; }
uint64_t operator*() const { return Value(); }
uint64_t Value() const { return hash_value_; }
size_t ModelNum() const { return model_num_; }
WordIndex CurrentIndex() const { return current_index_; }
StringPiece Word() const { return word_; }
private:
uint64_t hash_value_;
WordIndex current_index_;
bool eof_;
size_t model_num_;
StringPiece word_;
util::FilePiece file_piece_;
};
VocabFileReader::VocabFileReader(const int fd, const size_t model_num, uint64_t offset) :
hash_value_(0),
current_index_(0),
eof_(false),
model_num_(model_num),
file_piece_(util::DupOrThrow(fd)) {
word_ = file_piece_.ReadLine('\0');
UTIL_THROW_IF(word_ != "<unk>",
FormatLoadException,
"Vocabulary words are in the wrong place.");
// setup to initial value
++*this;
}
VocabFileReader &VocabFileReader::operator++() {
try {
word_ = file_piece_.ReadLine('\0');
} catch(util::EndOfFileException &e) {
eof_ = true;
return *this;
}
uint64_t prev_hash_value = hash_value_;
hash_value_ = ngram::detail::HashForVocab(word_.data(), word_.size());
// hash values should be monotonically increasing
UTIL_THROW_IF(hash_value_ < prev_hash_value, FormatLoadException,
": word index not monotonically increasing."
<< " model_num: " << model_num_
<< " prev hash: " << prev_hash_value
<< " new hash: " << hash_value_);
++current_index_;
return *this;
}
class CompareFiles {
public:
bool operator()(const VocabFileReader* x,
const VocabFileReader* y)
{ return x->Value() > y->Value(); }
};
class Readers : public util::FixedArray<VocabFileReader> {
public:
Readers(std::size_t number) : util::FixedArray<VocabFileReader>(number) {}
void push_back(int fd, std::size_t i) {
new(end()) VocabFileReader(fd, i);
Constructed();
}
};
} // namespace
WordIndex MergeVocab(util::FixedArray<int> &files, UniversalVocab &vocab, EnumerateVocab &enumerate) {
typedef std::priority_queue<VocabFileReader*, std::vector<VocabFileReader*>, CompareFiles> HeapType;
HeapType heap;
Readers readers(files.size());
for (size_t i = 0; i < files.size(); ++i) {
readers.push_back(files[i], i);
heap.push(&readers.back());
// initialize first index to 0 for <unk>
vocab.InsertUniversalIdx(i, 0, 0);
}
uint64_t prev_hash_value = 0;
// global_index starts with <unk> which is 0
WordIndex global_index = 0;
enumerate.Add(0, "<unk>");
while (!heap.empty()) {
VocabFileReader* top_vocab_file = heap.top();
if (top_vocab_file->Value() != prev_hash_value) {
enumerate.Add(++global_index, top_vocab_file->Word());
}
vocab.InsertUniversalIdx(top_vocab_file->ModelNum(),
top_vocab_file->CurrentIndex(),
global_index);
prev_hash_value = top_vocab_file->Value();
heap.pop();
if (++(*top_vocab_file)) {
heap.push(top_vocab_file);
}
}
return global_index + 1;
}
} // namespace interpolate
} // namespace lm
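// A standalone sketch (plain C++) of the k-way merge MergeVocab performs
// above: every reader yields word hashes in increasing order, a min-heap
// exposes the globally smallest hash, and runs of equal hashes collapse
// onto a single universal index. Here each "model" is just a sorted vector
// of hashes.
#include <cstddef>
#include <functional>
#include <queue>
#include <stdint.h>
#include <utility>
#include <vector>

inline std::size_t CountMergedVocab(
    const std::vector<std::vector<uint64_t> > &models) {
  // (hash, (model, offset)) entries, smallest hash on top.
  typedef std::pair<uint64_t, std::pair<std::size_t, std::size_t> > Entry;
  std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry> > heap;
  for (std::size_t m = 0; m < models.size(); ++m)
    if (!models[m].empty())
      heap.push(Entry(models[m][0], std::make_pair(m, std::size_t(0))));
  std::size_t distinct = 0;
  bool first = true;
  uint64_t prev = 0;
  while (!heap.empty()) {
    Entry top = heap.top();
    heap.pop();
    if (first || top.first != prev) {
      ++distinct;  // a new word: MergeVocab would ++global_index here
      first = false;
      prev = top.first;
    }
    std::size_t m = top.second.first, next = top.second.second + 1;
    if (next < models[m].size())
      heap.push(Entry(models[m][next], std::make_pair(m, next)));
  }
  return distinct;  // analogous to returning global_index + 1 above
}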
#ifndef LM_INTERPOLATE_MERGE_VOCAB_H
#define LM_INTERPOLATE_MERGE_VOCAB_H
#include "../word_index.hh"
#include "../../util/file.hh"
#include "../../util/fixed_array.hh"
namespace lm {
class EnumerateVocab;
namespace interpolate {
class UniversalVocab;
// The combined vocabulary is enumerated with enumerate.
// Returns the size of the combined vocabulary.
// Does not take ownership of vocab_files.
WordIndex MergeVocab(util::FixedArray<int> &vocab_files, UniversalVocab &vocab, EnumerateVocab &enumerate);
}} // namespaces
#endif // LM_INTERPOLATE_MERGE_VOCAB_H
#define BOOST_TEST_MODULE InterpolateMergeVocabTest
#include <boost/test/unit_test.hpp>
#include "../enumerate_vocab.hh"
#include "merge_vocab.hh"
#include "universal_vocab.hh"
#include "../lm_exception.hh"
#include "../vocab.hh"
#include "../word_index.hh"
#include "../../util/file.hh"
#include "../../util/file_piece.hh"
#include "../../util/file_stream.hh"
#include "../../util/tokenize_piece.hh"
#include <algorithm>
#include <cstring>
#include <vector>
namespace lm {
namespace interpolate {
namespace {
struct VocabEntry {
explicit VocabEntry(StringPiece value) :
str(value), hash(util::MurmurHash64A(value.data(), value.size())) {}
StringPiece str;
uint64_t hash;
bool operator<(const VocabEntry &other) const {
return hash < other.hash;
}
};
int WriteVocabFile(const std::vector<VocabEntry> &vocab, util::scoped_fd &file) {
file.reset(util::MakeTemp(util::DefaultTempDirectory()));
{
util::FileStream out(file.get(), 128);
for (std::vector<VocabEntry>::const_iterator i = vocab.begin(); i != vocab.end(); ++i) {
out << i->str << '\0';
}
}
util::SeekOrThrow(file.get(), 0);
return file.get();
}
std::vector<VocabEntry> ParseVocab(StringPiece words) {
std::vector<VocabEntry> entries;
entries.push_back(VocabEntry("<unk>"));
for (util::TokenIter<util::SingleCharacter> i(words, '\t'); i; ++i) {
entries.push_back(VocabEntry(*i));
}
std::sort(entries.begin() + 1, entries.end());
return entries;
}
int WriteVocabFile(StringPiece words, util::scoped_fd &file) {
return WriteVocabFile(ParseVocab(words), file);
}
class TestFiles {
public:
TestFiles() {}
int Test0() {
return WriteVocabFile("this\tis\ta\tfirst\tcut", test[0]);
}
int Test1() {
return WriteVocabFile("is this\tthis a\tfirst cut\ta first", test[1]);
}
int Test2() {
return WriteVocabFile("is\tsecd\ti", test[2]);
}
int NoUNK() {
std::vector<VocabEntry> no_unk_vec;
no_unk_vec.push_back(VocabEntry("toto"));
return WriteVocabFile(no_unk_vec, no_unk);
}
int BadOrder() {
std::vector<VocabEntry> bad_order_vec;
bad_order_vec.push_back(VocabEntry("<unk>"));
bad_order_vec.push_back(VocabEntry("0"));
bad_order_vec.push_back(VocabEntry("1"));
bad_order_vec.push_back(VocabEntry("2"));
bad_order_vec.push_back(VocabEntry("a"));
return WriteVocabFile(bad_order_vec, bad_order);
}
private:
util::scoped_fd test[3], no_unk, bad_order;
};
class DoNothingEnumerate : public EnumerateVocab {
public:
void Add(WordIndex, const StringPiece &) {}
};
BOOST_AUTO_TEST_CASE(MergeVocabTest) {
TestFiles files;
util::FixedArray<int> used_files(3);
used_files.push_back(files.Test0());
used_files.push_back(files.Test1());
used_files.push_back(files.Test2());
std::vector<lm::WordIndex> model_max_idx;
model_max_idx.push_back(10);
model_max_idx.push_back(10);
model_max_idx.push_back(10);
util::scoped_fd combined(util::MakeTemp(util::DefaultTempDirectory()));
UniversalVocab universal_vocab(model_max_idx);
{
ngram::ImmediateWriteWordsWrapper writer(NULL, combined.get(), 0);
MergeVocab(used_files, universal_vocab, writer);
}
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 0), 0);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 0), 0);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 0), 0);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 1), 1);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 1), 2);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 1), 8);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 5), 11);
#if BYTE_ORDER == LITTLE_ENDIAN
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 4);
#elif BYTE_ORDER == BIG_ENDIAN
// MurmurHash has a different ordering of the vocabulary.
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 5);
#endif
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 3), 10);
util::SeekOrThrow(combined.get(), 0);
util::FilePiece f(combined.release());
std::vector<VocabEntry> expected = ParseVocab("a\tis this\tthis a\tfirst cut\tthis\ta first\tcut\tis\ti\tsecd\tfirst");
for (std::vector<VocabEntry>::const_iterator i = expected.begin(); i != expected.end(); ++i) {
BOOST_CHECK_EQUAL(i->str, f.ReadLine('\0'));
}
BOOST_CHECK_THROW(f.ReadLine('\0'), util::EndOfFileException);
}
BOOST_AUTO_TEST_CASE(MergeVocabNoUnkTest) {
TestFiles files;
util::FixedArray<int> used_files(1);
used_files.push_back(files.NoUNK());
std::vector<lm::WordIndex> model_max_idx;
model_max_idx.push_back(10);
UniversalVocab universal_vocab(model_max_idx);
DoNothingEnumerate nothing;
BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException);
}
BOOST_AUTO_TEST_CASE(MergeVocabWrongOrderTest) {
TestFiles files;
util::FixedArray<int> used_files(2);
used_files.push_back(files.Test0());
used_files.push_back(files.BadOrder());
std::vector<lm::WordIndex> model_max_idx;
model_max_idx.push_back(10);
model_max_idx.push_back(10);
lm::interpolate::UniversalVocab universal_vocab(model_max_idx);
DoNothingEnumerate nothing;
BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException);
}
}}} // namespaces
#include "normalize.hh"
#include "../common/compare.hh"
#include "../common/ngram_stream.hh"
#include "backoff_matrix.hh"
#include "bounded_sequence_encoding.hh"
#include "interpolate_info.hh"
#include "merge_probabilities.hh"
#include "../weights.hh"
#include "../word_index.hh"
#include "../../util/fixed_array.hh"
#include "../../util/scoped.hh"
#include "../../util/stream/stream.hh"
#include "../../util/stream/rewindable_stream.hh"
#include <functional>
#include <queue>
#include <vector>
namespace lm { namespace interpolate {
namespace {
class BackoffQueueEntry {
public:
BackoffQueueEntry(float &entry, const util::stream::ChainPosition &position)
: entry_(entry), stream_(position) {
entry_ = 0.0;
}
operator bool() const { return stream_; }
NGramHeader operator*() const { return *stream_; }
const NGramHeader *operator->() const { return &*stream_; }
void Enter() {
entry_ = stream_->Value().backoff;
}
BackoffQueueEntry &Next() {
entry_ = 0.0;
++stream_;
return *this;
}
private:
float &entry_;
NGramStream<ProbBackoff> stream_;
};
struct PtrGreater : public std::binary_function<const BackoffQueueEntry *, const BackoffQueueEntry *, bool> {
bool operator()(const BackoffQueueEntry *first, const BackoffQueueEntry *second) const {
return SuffixLexicographicLess<NGramHeader>()(**second, **first);
}
};
class EntryOwner : public util::FixedArray<BackoffQueueEntry> {
public:
void push_back(float &entry, const util::stream::ChainPosition &position) {
new (end()) BackoffQueueEntry(entry, position);
Constructed();
}
};
std::size_t MaxOrder(const util::FixedArray<util::stream::ChainPositions> &model) {
std::size_t ret = 0;
for (const util::stream::ChainPositions *m = model.begin(); m != model.end(); ++m) {
ret = std::max(ret, m->size());
}
return ret;
}
class BackoffManager {
public:
explicit BackoffManager(const util::FixedArray<util::stream::ChainPositions> &models)
: entered_(MaxOrder(models)), matrix_(models.size(), MaxOrder(models)), skip_write_(MaxOrder(models)) {
std::size_t total = 0;
for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) {
total += m->size();
}
for (std::size_t i = 0; i < MaxOrder(models); ++i) {
entered_.push_back(models.size());
}
owner_.Init(total);
for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) {
for (const util::stream::ChainPosition *j = m->begin(); j != m->end(); ++j) {
owner_.push_back(matrix_.Backoff(m - models.begin(), j - m->begin()), *j);
if (owner_.back()) {
queue_.push(&owner_.back());
}
}
}
}
void SetupSkip(std::size_t order, util::stream::Stream &stream) {
skip_write_[order - 2] = &stream;
}
// Move up the backoffs for the given n-gram. The n-grams must be provided
// in suffix lexicographic order.
void Enter(const NGramHeader &to) {
// Check that we exited properly.
for (std::size_t i = to.Order() - 1; i < entered_.size(); ++i) {
assert(entered_[i].empty());
}
SuffixLexicographicLess<NGramHeader> less;
while (!queue_.empty() && less(**queue_.top(), to))
SkipRecord();
while (TopMatches(to)) {
BackoffQueueEntry *matches = queue_.top();
entered_[to.Order() - 1].push_back(matches);
matches->Enter();
queue_.pop();
}
}
void Exit(std::size_t order_minus_1) {
for (BackoffQueueEntry **i = entered_[order_minus_1].begin(); i != entered_[order_minus_1].end(); ++i) {
if ((*i)->Next())
queue_.push(*i);
}
entered_[order_minus_1].clear();
}
float Get(std::size_t model, std::size_t order_minus_1) const {
return matrix_.Backoff(model, order_minus_1);
}
void Finish() {
while (!queue_.empty())
SkipRecord();
}
private:
void SkipRecord() {
BackoffQueueEntry *top = queue_.top();
queue_.pop();
// Is this the last instance of the n-gram?
if (!TopMatches(**top)) {
// An n-gram is being skipped. Called once per skipped n-gram,
// regardless of how many models it comes from.
*reinterpret_cast<float*>(skip_write_[(*top)->Order() - 1]->Get()) = 0.0;
++*skip_write_[(*top)->Order() - 1];
}
if (top->Next())
queue_.push(top);
}
bool TopMatches(const NGramHeader &header) const {
return !queue_.empty() && (*queue_.top())->Order() == header.Order() && std::equal(header.begin(), header.end(), (*queue_.top())->begin());
}
EntryOwner owner_;
std::priority_queue<BackoffQueueEntry*, std::vector<BackoffQueueEntry*>, PtrGreater> queue_;
// Indexed by order then just all the matching models.
util::FixedArray<util::FixedArray<BackoffQueueEntry*> > entered_;
BackoffMatrix matrix_;
std::vector<util::stream::Stream*> skip_write_;
};
typedef long double Accum;
// Handles n-grams of the same order, using recursion to call another instance
// for higher orders.
class Recurse {
public:
Recurse(
const InterpolateInfo &info, // Must stay alive the entire time.
std::size_t order,
const util::stream::ChainPosition &merged_probs,
const util::stream::ChainPosition &prob_out,
const util::stream::ChainPosition &backoff_out,
BackoffManager &backoffs,
Recurse *higher) // higher is null for the highest order.
: order_(order),
encoding_(MakeEncoder(info, order)),
input_(merged_probs, PartialProbGamma(order, encoding_.EncodedLength())),
prob_out_(prob_out),
backoff_out_(backoff_out),
backoffs_(backoffs),
lambdas_(&*info.lambdas.begin()),
higher_(higher),
decoded_backoffs_(info.Models()),
extended_context_(order - 1) {
// This is only for bigrams and above. Summing unigrams is a much easier case.
assert(order >= 2);
}
// context = w_1^{n-1}
// z_lower = Z(w_2^{n-1})
// Input:
// Merged probabilities without backoff applied in input_.
// Backoffs via backoffs_.
// Calculates:
// Z(w_1^{n-1}): intermediate only.
// p_I(x | w_1^{n-1}) for all x: w_1^{n-1}x exists: Written to prob_out_.
// b_I(w_1^{n-1}): Written to backoff_out_.
void SameContext(const NGramHeader &context, Accum z_lower) {
assert(context.size() == order_ - 1);
backoffs_.Enter(context);
prob_out_.Mark();
// This is the backoff term that applies when one assumes everything backs off:
// \prod_i b_i(w_1^{n-1})^{\lambda_i}.
Accum backoff_once = 0.0;
for (std::size_t m = 0; m < decoded_backoffs_.size(); ++m) {
backoff_once += lambdas_[m] * backoffs_.Get(m, order_ - 2);
}
Accum z_delta = 0.0;
std::size_t count = 0;
for (; input_ && std::equal(context.begin(), context.end(), input_->begin()); ++input_, ++prob_out_, ++count) {
// Apply backoffs to probabilities.
// TODO: change bounded sequence encoding to have an iterator for decoding instead of doing a copy here.
encoding_.Decode(input_->FromBegin(), &*decoded_backoffs_.begin());
for (std::size_t m = 0; m < NumModels(); ++m) {
// Apply the backoffs as instructed for model m.
float accumulated = 0.0;
// Change backoffs for [order it backed off to, order - 1) except
// with 0-indexing. There is still the potential to charge backoff
// for order - 1, which is done later. The backoffs charged here
// are b_m(w_{n-1}^{n-1}) ... b_m(w_2^{n-1})
for (unsigned char backed_to = decoded_backoffs_[m]; backed_to < order_ - 2; ++backed_to) {
accumulated += backoffs_.Get(m, backed_to);
}
float lambda = lambdas_[m];
// Lower p(x | w_2^{n-1}) gets all the backoffs except the highest.
input_->LowerProb() += accumulated * lambda;
// Charge the backoff b(w_1^{n-1}) if applicable, but only to attain p(x | w_1^{n-1})
if (decoded_backoffs_[m] < order_ - 1) {
accumulated += backoffs_.Get(m, order_ - 2);
}
input_->Prob() += accumulated * lambda;
}
// TODO: better precision/less operations here.
z_delta += pow(10.0, input_->Prob()) - pow(10.0, input_->LowerProb() + backoff_once);
// Write unnormalized probability record.
std::copy(input_->begin(), input_->end(), reinterpret_cast<WordIndex*>(prob_out_.Get()));
ProbWrite() = input_->Prob();
}
// TODO numerical precision.
Accum z = log10(pow(10.0, z_lower + backoff_once) + z_delta);
// Normalize.
prob_out_.Rewind();
for (std::size_t i = 0; i < count; ++i, ++prob_out_) {
ProbWrite() -= z;
}
// This allows the stream to release data.
prob_out_.Mark();
// Output backoff.
*reinterpret_cast<float*>(backoff_out_.Get()) = z_lower + backoff_once - z;
++backoff_out_;
if (higher_.get())
higher_->ExtendContext(context, z);
backoffs_.Exit(order_ - 2);
}
// Call is given a context and z(context).
// Evaluates y context x for all y,x.
void ExtendContext(const NGramHeader &middle, Accum z_lower) {
assert(middle.size() == order_ - 2);
// Copy because the input will advance. TODO avoid this copy by sharing amongst classes.
std::copy(middle.begin(), middle.end(), extended_context_.begin() + 1);
while (input_ && std::equal(middle.begin(), middle.end(), input_->begin() + 1)) {
*extended_context_.begin() = *input_->begin();
SameContext(NGramHeader(&*extended_context_.begin(), order_ - 1), z_lower);
}
}
void Finish() {
assert(!input_);
prob_out_.Poison();
backoff_out_.Poison();
if (higher_.get())
higher_->Finish();
}
// The BackoffManager class also injects backoffs when it skips ahead e.g. b(</s>) = 1
util::stream::Stream &BackoffStream() { return backoff_out_; }
private:
// Write the probability to the correct place in prob_out_. Should use a proxy but currently incompatible with RewindableStream.
float &ProbWrite() {
return *reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(prob_out_.Get()) + order_ * sizeof(WordIndex));
}
std::size_t NumModels() const { return decoded_backoffs_.size(); }
const std::size_t order_;
const BoundedSequenceEncoding encoding_;
ProxyStream<PartialProbGamma> input_;
util::stream::RewindableStream prob_out_;
util::stream::Stream backoff_out_;
BackoffManager &backoffs_;
const float *const lambdas_;
// Higher order instance of this same class.
util::scoped_ptr<Recurse> higher_;
// Temporary in SameContext.
std::vector<unsigned char> decoded_backoffs_;
// Temporary in ExtendContext.
std::vector<WordIndex> extended_context_;
};
class Thread {
public:
Thread(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order, util::stream::Chains &prob_out, util::stream::Chains &backoff_out)
: info_(info), models_by_order_(models_by_order), prob_out_(prob_out), backoff_out_(backoff_out) {}
void Run(const util::stream::ChainPositions &merged_probabilities) {
// Unigrams do not have encoded backoff info.
ProxyStream<PartialProbGamma> in(merged_probabilities[0], PartialProbGamma(1, 0));
util::stream::RewindableStream prob_write(prob_out_[0]);
Accum z = 0.0;
prob_write.Mark();
WordIndex count = 0;
for (; in; ++in, ++prob_write, ++count) {
// Note assumption that probability comes first
memcpy(prob_write.Get(), in.Get(), sizeof(WordIndex) + sizeof(float));
z += pow(10.0, in->Prob());
}
// TODO HACK TODO: lmplz outputs p(<s>) = 1 to get q to compute nicely. That always inflates z by exactly 1.0, so subtract it.
z -= 1.0;
float log_z = log10(z);
prob_write.Rewind();
// Normalize unigram probabilities.
for (WordIndex i = 0; i < count; ++i, ++prob_write) {
*reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(prob_write.Get()) + sizeof(WordIndex)) -= log_z;
}
prob_write.Poison();
// Now setup the higher orders.
util::scoped_ptr<Recurse> higher_order;
BackoffManager backoffs(models_by_order_);
std::size_t max_order = merged_probabilities.size();
for (std::size_t order = max_order; order >= 2; --order) {
higher_order.reset(new Recurse(info_, order, merged_probabilities[order - 1], prob_out_[order - 1], backoff_out_[order - 2], backoffs, higher_order.release()));
backoffs.SetupSkip(order, higher_order->BackoffStream());
}
if (max_order > 1) {
higher_order->ExtendContext(NGramHeader(NULL, 0), log_z);
backoffs.Finish();
higher_order->Finish();
}
}
private:
const InterpolateInfo info_;
util::FixedArray<util::stream::ChainPositions> &models_by_order_;
util::stream::ChainPositions prob_out_;
util::stream::ChainPositions backoff_out_;
};
} // namespace
void Normalize(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order, util::stream::Chains &merged_probabilities, util::stream::Chains &prob_out, util::stream::Chains &backoff_out) {
assert(prob_out.size() == backoff_out.size() + 1);
// Arbitrarily put the thread on the merged_probabilities Chains.
merged_probabilities >> Thread(info, models_by_order, prob_out, backoff_out);
}
}} // namespaces
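// A standalone sketch (plain C++) of the normalization arithmetic in
// Thread::Run above: values arrive as unnormalized log10 probabilities, Z
// accumulates in linear space, and a second (rewind) pass subtracts
// log10(Z). The real code additionally subtracts the lmplz p(<s>) = 1
// placeholder from Z before taking the log.
#include <cmath>
#include <cstddef>
#include <vector>

inline void NormalizeLog10(std::vector<float> &logp) {
  long double z = 0.0;  // same width as the Accum typedef above
  for (std::size_t i = 0; i < logp.size(); ++i)
    z += std::pow(10.0L, static_cast<long double>(logp[i]));
  const float log_z = static_cast<float>(std::log10(z));
  for (std::size_t i = 0; i < logp.size(); ++i)
    logp[i] -= log_z;  // afterwards the entries sum to 1 in linear space
}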
#ifndef LM_INTERPOLATE_NORMALIZE_H
#define LM_INTERPOLATE_NORMALIZE_H
#include "../../util/fixed_array.hh"
/* Pass 2:
* - Multiply backoff weights by the backed off probabilities from pass 1.
* - Compute the normalization factor Z.
* - Send Z to the next highest order.
* - Rewind and divide by Z.
*/
namespace util { namespace stream {
class ChainPositions;
class Chains;
}} // namespaces
namespace lm { namespace interpolate {
struct InterpolateInfo;
void Normalize(
const InterpolateInfo &info,
// Input full models for backoffs. Assumes that renumbering has been done. Suffix order.
util::FixedArray<util::stream::ChainPositions> &models_by_order,
// Input PartialProbGamma from MergeProbabilities. Context order.
util::stream::Chains &merged_probabilities,
// Output NGram<float> with normalized probabilities. Context order.
util::stream::Chains &probabilities_out,
// Output bare floats with backoffs. Note backoffs.size() == order - 1. Suffix order.
util::stream::Chains &backoffs_out);
}} // namespaces
#endif // LM_INTERPOLATE_NORMALIZE_H
#include "normalize.hh"
#include "interpolate_info.hh"
#include "merge_probabilities.hh"
#include "../common/ngram_stream.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/multi_stream.hh"
#define BOOST_TEST_MODULE NormalizeTest
#include <boost/test/unit_test.hpp>
namespace lm { namespace interpolate { namespace {
// log without backoff
const float kInputs[] = {-0.3, 1.2, -9.8, 4.0, -7.0, 0.0};
class WriteInput {
public:
WriteInput() {}
void Run(const util::stream::ChainPosition &to) {
util::stream::Stream out(to);
for (WordIndex i = 0; i < sizeof(kInputs) / sizeof(float); ++i, ++out) {
memcpy(out.Get(), &i, sizeof(WordIndex));
memcpy((uint8_t*)out.Get() + sizeof(WordIndex), &kInputs[i], sizeof(float));
}
out.Poison();
}
};
void CheckOutput(const util::stream::ChainPosition &from) {
NGramStream<float> in(from);
float sum = 0.0;
for (WordIndex i = 0; i < sizeof(kInputs) / sizeof(float) - 1 /* <s> at the end */; ++i) {
sum += pow(10.0, kInputs[i]);
}
sum = log10(sum);
BOOST_REQUIRE(in);
BOOST_CHECK_CLOSE(kInputs[0] - sum, in->Value(), 0.0001);
BOOST_REQUIRE(++in);
BOOST_CHECK_CLOSE(kInputs[1] - sum, in->Value(), 0.0001);
BOOST_REQUIRE(++in);
BOOST_CHECK_CLOSE(kInputs[2] - sum, in->Value(), 0.0001);
BOOST_REQUIRE(++in);
BOOST_CHECK_CLOSE(kInputs[3] - sum, in->Value(), 0.0001);
BOOST_REQUIRE(++in);
BOOST_CHECK_CLOSE(kInputs[4] - sum, in->Value(), 0.0001);
BOOST_REQUIRE(++in);
BOOST_CHECK_CLOSE(kInputs[5] - sum, in->Value(), 0.0001);
BOOST_CHECK(!++in);
}
BOOST_AUTO_TEST_CASE(Unigrams) {
InterpolateInfo info;
info.lambdas.push_back(2.0);
info.lambdas.push_back(-0.1);
info.orders.push_back(1);
info.orders.push_back(1);
BOOST_CHECK_EQUAL(0, MakeEncoder(info, 1).EncodedLength());
// No backoffs.
util::stream::Chains blank(0);
util::FixedArray<util::stream::ChainPositions> models_by_order(2);
models_by_order.push_back(blank);
models_by_order.push_back(blank);
util::stream::Chains merged_probabilities(1);
util::stream::Chains probabilities_out(1);
util::stream::Chains backoffs_out(0);
merged_probabilities.push_back(util::stream::ChainConfig(sizeof(WordIndex) + sizeof(float) + sizeof(float), 2, 24));
probabilities_out.push_back(util::stream::ChainConfig(sizeof(WordIndex) + sizeof(float), 2, 100));
merged_probabilities[0] >> WriteInput();
Normalize(info, models_by_order, merged_probabilities, probabilities_out, backoffs_out);
util::stream::ChainPosition checker(probabilities_out[0].Add());
merged_probabilities >> util::stream::kRecycle;
probabilities_out >> util::stream::kRecycle;
CheckOutput(checker);
probabilities_out.Wait();
}
}}} // namespaces
#include "pipeline.hh"
#include "../common/compare.hh"
#include "../common/print.hh"
#include "../common/renumber.hh"
#include "../vocab.hh"
#include "backoff_reunification.hh"
#include "interpolate_info.hh"
#include "merge_probabilities.hh"
#include "merge_vocab.hh"
#include "normalize.hh"
#include "universal_vocab.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/count_records.hh"
#include "../../util/stream/io.hh"
#include "../../util/stream/multi_stream.hh"
#include "../../util/stream/sort.hh"
#include "../../util/fixed_array.hh"
namespace lm { namespace interpolate { namespace {
/* Put the original input files on chains and renumber them */
void SetupInputs(std::size_t buffer_size, const UniversalVocab &vocab, util::FixedArray<ModelBuffer> &models, bool exclude_highest, util::FixedArray<util::stream::Chains> &chains, util::FixedArray<util::stream::ChainPositions> &positions) {
chains.clear();
positions.clear();
// TODO: much better memory sizing heuristics e.g. not making the chain larger than it will use.
util::stream::ChainConfig config(0, 2, buffer_size);
for (std::size_t i = 0; i < models.size(); ++i) {
chains.push_back(models[i].Order() - exclude_highest);
for (std::size_t j = 0; j < models[i].Order() - exclude_highest; ++j) {
config.entry_size = sizeof(WordIndex) * (j + 1) + sizeof(float) * 2; // TODO do not include wasteful backoff for highest.
chains.back().push_back(config);
}
if (i == models.size() - 1)
chains.back().back().ActivateProgress();
models[i].Source(chains.back());
for (std::size_t j = 0; j < models[i].Order() - exclude_highest; ++j) {
chains[i][j] >> Renumber(vocab.Mapping(i), j + 1);
}
}
for (std::size_t i = 0; i < chains.size(); ++i) {
positions.push_back(chains[i]);
}
}
template <class Compare> void SinkSort(const util::stream::SortConfig &config, util::stream::Chains &chains, util::stream::Sorts<Compare> &sorts) {
for (std::size_t i = 0; i < chains.size(); ++i) {
sorts.push_back(chains[i], config, Compare(i + 1));
}
}
template <class Compare> void SourceSort(util::stream::Chains &chains, util::stream::Sorts<Compare> &sorts) {
// TODO memory management
for (std::size_t i = 0; i < sorts.size(); ++i) {
sorts[i].Merge(sorts[i].DefaultLazy());
}
for (std::size_t i = 0; i < sorts.size(); ++i) {
sorts[i].Output(chains[i], sorts[i].DefaultLazy());
}
}
} // namespace
void Pipeline(util::FixedArray<ModelBuffer> &models, const Config &config, int write_file) {
// Setup InterpolateInfo and UniversalVocab.
InterpolateInfo info;
info.lambdas = config.lambdas;
std::vector<WordIndex> vocab_sizes;
util::scoped_fd vocab_null(util::MakeTemp(config.sort.temp_prefix));
std::size_t max_order = 0;
util::FixedArray<int> vocab_files(models.size());
for (ModelBuffer *i = models.begin(); i != models.end(); ++i) {
info.orders.push_back(i->Order());
vocab_sizes.push_back(i->Counts()[0]);
vocab_files.push_back(i->VocabFile());
max_order = std::max(max_order, i->Order());
}
util::scoped_ptr<UniversalVocab> vocab(new UniversalVocab(vocab_sizes));
{
ngram::ImmediateWriteWordsWrapper writer(NULL, vocab_null.get(), 0);
MergeVocab(vocab_files, *vocab, writer);
}
std::cerr << "Merging probabilities." << std::endl;
// Pass 1: merge probabilities
util::FixedArray<util::stream::Chains> input_chains(models.size());
util::FixedArray<util::stream::ChainPositions> models_by_order(models.size());
SetupInputs(config.BufferSize(), *vocab, models, false, input_chains, models_by_order);
util::stream::Chains merged_probs(max_order);
for (std::size_t i = 0; i < max_order; ++i) {
merged_probs.push_back(util::stream::ChainConfig(PartialProbGamma::TotalSize(info, i + 1), 2, config.BufferSize())); // TODO: not buffer_size
}
merged_probs >> MergeProbabilities(info, models_by_order);
std::vector<uint64_t> counts(max_order);
for (std::size_t i = 0; i < max_order; ++i) {
merged_probs[i] >> util::stream::CountRecords(&counts[i]);
}
for (util::stream::Chains *i = input_chains.begin(); i != input_chains.end(); ++i) {
*i >> util::stream::kRecycle;
}
// Pass 2: normalize.
{
util::stream::Sorts<ContextOrder> sorts(merged_probs.size());
SinkSort(config.sort, merged_probs, sorts);
merged_probs.Wait(true);
for (util::stream::Chains *i = input_chains.begin(); i != input_chains.end(); ++i) {
i->Wait(true);
}
SourceSort(merged_probs, sorts);
}
std::cerr << "Normalizing" << std::endl;
SetupInputs(config.BufferSize(), *vocab, models, true, input_chains, models_by_order);
util::stream::Chains probabilities(max_order), backoffs(max_order - 1);
std::size_t block_count = 2;
for (std::size_t i = 0; i < max_order; ++i) {
// Careful accounting to ensure RewindableStream can fit the entire vocabulary.
block_count = std::max<std::size_t>(block_count, 2);
// This much needs to fit in RewindableStream.
std::size_t fit = NGram<float>::TotalSize(i + 1) * counts[0];
// fit / (block_count - 1) rounded up
std::size_t min_block = (fit + block_count - 2) / (block_count - 1);
std::size_t specify = std::max(config.BufferSize(), min_block * block_count);
probabilities.push_back(util::stream::ChainConfig(NGram<float>::TotalSize(i + 1), block_count, specify));
}
for (std::size_t i = 0; i < max_order - 1; ++i) {
backoffs.push_back(util::stream::ChainConfig(sizeof(float), 2, config.BufferSize()));
}
Normalize(info, models_by_order, merged_probs, probabilities, backoffs);
util::FixedArray<util::stream::FileBuffer> backoff_buffers(backoffs.size());
for (std::size_t i = 0; i < max_order - 1; ++i) {
backoff_buffers.push_back(util::MakeTemp(config.sort.temp_prefix));
backoffs[i] >> backoff_buffers.back().Sink() >> util::stream::kRecycle;
}
for (util::stream::Chains *i = input_chains.begin(); i != input_chains.end(); ++i) {
*i >> util::stream::kRecycle;
}
merged_probs >> util::stream::kRecycle;
// Pass 3: backoffs in the right place.
{
util::stream::Sorts<SuffixOrder> sorts(probabilities.size());
SinkSort(config.sort, probabilities, sorts);
probabilities.Wait(true);
for (util::stream::Chains *i = input_chains.begin(); i != input_chains.end(); ++i) {
i->Wait(true);
}
backoffs.Wait(true);
merged_probs.Wait(true);
// destroy universal vocab to save RAM.
vocab.reset();
SourceSort(probabilities, sorts);
}
std::cerr << "Reunifying backoffs" << std::endl;
util::stream::ChainPositions prob_pos(max_order - 1);
util::stream::Chains combined(max_order - 1);
for (std::size_t i = 0; i < max_order - 1; ++i) {
if (i == max_order - 2)
backoffs[i].ActivateProgress();
backoffs[i].SetProgressTarget(backoff_buffers[i].Size());
backoffs[i] >> backoff_buffers[i].Source(true);
prob_pos.push_back(probabilities[i].Add());
combined.push_back(util::stream::ChainConfig(NGram<ProbBackoff>::TotalSize(i + 1), 2, config.BufferSize()));
}
util::stream::ChainPositions backoff_pos(backoffs);
ReunifyBackoff(prob_pos, backoff_pos, combined);
util::stream::ChainPositions output_pos(max_order);
for (std::size_t i = 0; i < max_order - 1; ++i) {
output_pos.push_back(combined[i].Add());
}
output_pos.push_back(probabilities.back().Add());
probabilities >> util::stream::kRecycle;
backoffs >> util::stream::kRecycle;
combined >> util::stream::kRecycle;
// TODO genericize to ModelBuffer etc.
PrintARPA(vocab_null.get(), write_file, counts).Run(output_pos);
}
}} // namespaces
#ifndef LM_INTERPOLATE_PIPELINE_H
#define LM_INTERPOLATE_PIPELINE_H
#include "../common/model_buffer.hh"
#include "../../util/fixed_array.hh"
#include "../../util/stream/config.hh"
#include <cstddef>
#include <string>
namespace lm { namespace interpolate {
struct Config {
std::vector<float> lambdas;
util::stream::SortConfig sort;
std::size_t BufferSize() const { return sort.buffer_size; }
};
void Pipeline(util::FixedArray<ModelBuffer> &models, const Config &config, int write_file);
}} // namespaces
#endif // LM_INTERPOLATE_PIPELINE_H
#include "split_worker.hh"
#include "../common/ngram.hh"
namespace lm {
namespace interpolate {
SplitWorker::SplitWorker(std::size_t order, util::stream::Chain &backoff_chain,
util::stream::Chain &sort_chain)
: order_(order) {
backoff_chain >> backoff_input_;
sort_chain >> sort_input_;
}
void SplitWorker::Run(const util::stream::ChainPosition &position) {
// input: ngram record (id, prob, and backoff)
// output: a float to the backoff_input stream
// an ngram id and a float to the sort_input stream
for (util::stream::Stream stream(position); stream; ++stream) {
NGram<ProbBackoff> ngram(stream.Get(), order_);
// write id and prob to the sort stream
float prob = ngram.Value().prob;
lm::WordIndex *out = reinterpret_cast<lm::WordIndex *>(sort_input_.Get());
for (const lm::WordIndex *it = ngram.begin(); it != ngram.end(); ++it) {
*out++ = *it;
}
*reinterpret_cast<float *>(out) = prob;
++sort_input_;
// write backoff to the backoff output stream
float boff = ngram.Value().backoff;
*reinterpret_cast<float *>(backoff_input_.Get()) = boff;
++backoff_input_;
}
sort_input_.Poison();
backoff_input_.Poison();
}
}
}
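// Record flow through SplitWorker::Run above, shown for an order-2 ngram:
//
//   input (suffix order) : [WordIndex w0][WordIndex w1][float prob][float backoff]
//   sort_input_ record   : [WordIndex w0][WordIndex w1][float prob]
//   backoff_input_ record: [float backoff]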
#ifndef KENLM_INTERPOLATE_SPLIT_WORKER_H_
#define KENLM_INTERPOLATE_SPLIT_WORKER_H_
#include "../../util/stream/chain.hh"
#include "../../util/stream/stream.hh"
namespace lm {
namespace interpolate {
class SplitWorker {
public:
/**
* Constructs a split worker for a particular order. It writes the
* split-off backoff values to the backoff chain and the ngram id and
* probability to the sort chain for each ngram in the input.
*/
SplitWorker(std::size_t order, util::stream::Chain &backoff_chain,
util::stream::Chain &sort_chain);
/**
* The callback invoked to handle the input from the ngram intermediate
* files.
*/
void Run(const util::stream::ChainPosition& position);
private:
/**
* The ngram order we are reading/writing for.
*/
std::size_t order_;
/**
* The stream to write to for the backoff values.
*/
util::stream::Stream backoff_input_;
/**
* The stream to write to for the ngram id + probability values.
*/
util::stream::Stream sort_input_;
};
}
}
#endif
#include "../common/compare.hh"
#include "../common/model_buffer.hh"
#include "../common/ngram.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/multi_stream.hh"
#include "../../util/stream/sort.hh"
#include "split_worker.hh"
#include <boost/program_options.hpp>
#include <boost/version.hpp>
#if defined(_WIN32) || defined(_WIN64)
// Windows doesn't define <unistd.h>
//
// So we define what we need here instead:
//
#define STDIN_FILENO 0
#define STDOUT_FILENO 1
#else // Huzzah for POSIX!
#include <unistd.h>
#endif
/*
* This is a simple example program that takes in intermediate
* suffix-sorted ngram files and outputs two sets of files: one for backoff
* probability values (raw numbers, in suffix order) and one for
* probability values (ngram id and probability, in *context* order)
*/
int main(int argc, char *argv[]) {
using namespace lm::interpolate;
const std::size_t ONE_GB = 1 << 30;
const std::size_t SIXTY_FOUR_MB = 1 << 26;
const std::size_t NUMBER_OF_BLOCKS = 2;
std::string FILE_NAME = "ngrams";
std::string CONTEXT_SORTED_FILENAME = "csorted-ngrams";
std::string BACKOFF_FILENAME = "backoffs";
std::string TMP_DIR = "/tmp/";
try {
namespace po = boost::program_options;
po::options_description options("canhazinterp Pass-3 options");
options.add_options()
("help,h", po::bool_switch(), "Show this help message")
("ngrams,n", po::value<std::string>(&FILE_NAME), "ngrams file")
("csortngrams,c", po::value<std::string>(&CONTEXT_SORTED_FILENAME), "context sorted ngrams file")
("backoffs,b", po::value<std::string>(&BACKOFF_FILENAME), "backoffs file")
("tmpdir,t", po::value<std::string>(&TMP_DIR), "tmp dir");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, options), vm);
// Without notify, the variables bound via po::value (FILE_NAME etc.)
// would never be updated from the command line.
po::notify(vm);
// Display help
if(vm["help"].as<bool>()) {
std::cerr << "Usage: " << options << std::endl;
return 1;
}
}
catch(const std::exception &e) {
std::cerr << e.what() << std::endl;
return 1;
}
// The basic strategy here is to have three chains:
// - The first reads the ngram order inputs using ModelBuffer. Those are
// then stripped of their backoff values and fed into the third chain;
// the backoff values *themselves* are written to the second chain.
//
// - The second chain takes the backoff values and writes them out to a
// file (one for each order).
//
// - The third chain takes just the probability values and ngrams and
// writes them out, sorted in context-order, to a file (one for each
// order).
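//
// Schematically, for each ngram order N (a sketch, not from the source):
//
//   ngrams.N -> ngram_inputs[N-1] -> SplitWorker -> backoff_chains[N-1] -> backoffs.N
//                                        |
//                                        +-> prob_chains[N-1] -> sort -> csorted-ngrams.N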
// This will be used to read in the binary intermediate files. There is
// one file per order (e.g. ngrams.1, ngrams.2, ...)
lm::ModelBuffer buffer(FILE_NAME);
// Create separate chains for each ngram order for:
// - Input from the intermediate files
// - Output to the backoff file
// - Output to the (context-sorted) probability file
util::stream::Chains ngram_inputs(buffer.Order());
util::stream::Chains backoff_chains(buffer.Order());
util::stream::Chains prob_chains(buffer.Order());
for (std::size_t i = 0; i < buffer.Order(); ++i) {
ngram_inputs.push_back(util::stream::ChainConfig(
lm::NGram<lm::ProbBackoff>::TotalSize(i + 1), NUMBER_OF_BLOCKS, ONE_GB));
backoff_chains.push_back(
util::stream::ChainConfig(sizeof(float), NUMBER_OF_BLOCKS, ONE_GB));
prob_chains.push_back(util::stream::ChainConfig(
sizeof(lm::WordIndex) * (i + 1) + sizeof(float), NUMBER_OF_BLOCKS,
ONE_GB));
}
// This sets the input for each of the ngram order chains to the
// appropriate file
buffer.Source(ngram_inputs);
util::FixedArray<util::scoped_ptr<SplitWorker> > workers(buffer.Order());
for (std::size_t i = 0; i < buffer.Order(); ++i) {
// Attach a SplitWorker to each of the ngram input chains, writing to the
// corresponding order's backoff and probability chains
workers.push_back(
new SplitWorker(i + 1, backoff_chains[i], prob_chains[i]));
ngram_inputs[i] >> boost::ref(*workers.back());
}
util::stream::SortConfig sort_cfg;
sort_cfg.temp_prefix = TMP_DIR;
sort_cfg.buffer_size = SIXTY_FOUR_MB;
sort_cfg.total_memory = ONE_GB;
// This will parallel merge sort the individual order files, putting
// them in context-order instead of suffix-order.
//
// Two new threads will be running, each owned by the prob_chains[i] object.
// - The first executes BlockSorter.Run() to sort the n-gram entries
// - The second executes WriteAndRecycle.Run() to write each sorted
// block to disk as a temporary file
util::stream::Sorts<lm::ContextOrder> sorts(buffer.Order());
for (std::size_t i = 0; i < prob_chains.size(); ++i) {
sorts.push_back(prob_chains[i], sort_cfg, lm::ContextOrder(i + 1));
}
// Set the sort output to be on the same chain
for (std::size_t i = 0; i < prob_chains.size(); ++i) {
// The following call to Chain::Wait()
// joins the threads owned by prob_chains[i].
//
// As such, the call won't return
// until all threads owned by prob_chains[i] have completed.
//
// The call also resets prob_chains[i]
// so that it can be reused
// (including freeing the memory previously used by the chain).
prob_chains[i].Wait();
// In an ideal world (without memory restrictions)
// we could merge all of the previously sorted blocks
// by reading them all completely into memory
// and then running merge sort over them.
//
// In the real world, we have memory restrictions;
// depending on how many blocks we have,
// and how much memory we can use to read from each block
// (sort_config.buffer_size)
// it may be the case that we have insufficient memory
// to read sort_config.buffer_size of data from each block from disk.
//
// If this occurs, then it will be necessary to perform one or more rounds
// of merge sort on disk;
// doing so will reduce the number of blocks that we will eventually
// need to read from
// when performing the final round of merge sort in memory.
//
// So, the following call determines whether it is necessary
// to perform one or more rounds of merge sort on disk;
// if such on-disk merge sorting is required, such sorting is performed.
//
// Finally, the following method launches a thread that calls
// OwningMergingReader.Run()
// to perform the final round of merge sort in memory.
//
// Merge sort could have been invoked directly
// so that merge sort memory doesn't coexist with Chain memory.
sorts[i].Output(prob_chains[i]);
}
// Create another model buffer for our output on e.g. csorted-ngrams.1,
// csorted-ngrams.2, ...
lm::ModelBuffer output_buf(CONTEXT_SORTED_FILENAME, true, false);
output_buf.Sink(prob_chains, buffer.Counts());
// Create a third model buffer for our backoff output on e.g. backoffs.1,
// backoffs.2, ...
lm::ModelBuffer boff_buf(BACKOFF_FILENAME, true, false);
boff_buf.Sink(backoff_chains, buffer.Counts());
// Wait(true) loops over every chain in the Chains object,
// calling Chain::Wait() on each to join all the threads it owns.
ngram_inputs.Wait(true);
backoff_chains.Wait(true);
prob_chains.Wait(true);
return 0;
}
#include "tune_derivatives.hh"
#include "tune_instances.hh"
#include "tune_matrix.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/typed_stream.hh"
#include <Eigen/Core>
namespace lm { namespace interpolate {
Accum Derivatives(Instances &in, const Vector &weights, Vector &gradient, Matrix &hessian) {
gradient = in.CorrectGradientTerm();
hessian = Matrix::Zero(weights.rows(), weights.rows());
// TODO: loop instead to force low-memory evaluation?
// Compute p_I(x)*Z_{\epsilon} i.e. the unnormalized probabilities
Vector weighted_uni((in.LNUnigrams() * weights).array().exp());
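// Concretely, weighted_uni(x) = exp(\sum_i w_i ln p_i(x)) = p_I(x) * Z_epsilon.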
// Even -inf doesn't work for <s> because weights can be negative. Manually set it to zero.
weighted_uni(in.BOS()) = 0.0;
Accum Z_epsilon = weighted_uni.sum();
// unigram_cross(i) = \sum_{all x} p_I(x) ln p_i(x)
Vector unigram_cross(in.LNUnigrams().transpose() * weighted_uni / Z_epsilon);
Accum sum_B_I = 0.0;
Accum sum_ln_Z_context = 0.0;
// Temporaries used each cycle of the loop.
Matrix convolve;
Vector full_cross;
Matrix hessian_missing_Z_context;
// Backed off ln p_i(x)B_i(context)
Vector ln_p_i_backed;
// Full ln p_i(x | context)
Vector ln_p_i_full;
// TODO make configurable memory size.
util::stream::Chain chain(util::stream::ChainConfig(in.ReadExtensionsEntrySize(), 2, 64 << 20));
chain.ActivateProgress();
in.ReadExtensions(chain);
util::stream::TypedStream<Extension> extensions(chain.Add());
chain >> util::stream::kRecycle;
// Loop over instances (words in the tuning data).
for (InstanceIndex n = 0; n < in.NumInstances(); ++n) {
assert(extensions);
Accum weighted_backoffs = exp(in.LNBackoffs(n).dot(weights));
// Compute \sum_{x: model does not back off to unigram} p_I(x)Z(epsilon)
Accum unnormalized_sum_x_p_I = 0.0;
// Compute \sum_{x: model does not back off to unigram} p_I(x | context)Z(context)
Accum unnormalized_sum_x_p_I_full = 0.0;
// This should be divided by Z_context then added to the Hessian.
hessian_missing_Z_context = Matrix::Zero(weights.rows(), weights.rows());
full_cross = Vector::Zero(weights.rows());
// Loop over words within an instance for which an extension exists. An extension happens when any model matches more than a unigram in the tuning instance.
while (extensions && extensions->instance == n) {
const WordIndex word = extensions->word;
unnormalized_sum_x_p_I += weighted_uni(word);
ln_p_i_backed = in.LNUnigrams().row(word) + in.LNBackoffs(n);
// Calculate ln_p_i_full(i) = ln p_i(word | context) by filling in unigrams then overwriting with extensions.
ln_p_i_full = ln_p_i_backed;
// Loop over all models that have an extension for the same word, i.e. where p_i(word | context) matches at least a bigram.
for (; extensions && extensions->word == word && extensions->instance == n; ++extensions) {
ln_p_i_full(extensions->model) = extensions->ln_prob;
}
// This is the weighted product of probabilities. In other words, p_I(word | context) * Z(context) = exp(\sum_i w_i * ln p_i(word | context)).
Accum weighted = exp(ln_p_i_full.dot(weights));
unnormalized_sum_x_p_I_full += weighted;
// These aren't normalized by Z_context (happens later)
full_cross.noalias() +=
weighted * ln_p_i_full
- weighted_uni(word) * weighted_backoffs /* we'll divide by Z_context later to form B_I */ * in.LNUnigrams().row(word).transpose();
// This will get multiplied by Z_context then added to the Hessian.
hessian_missing_Z_context.noalias() +=
// Replacement terms.
weighted * ln_p_i_full * ln_p_i_full.transpose()
// Presumed unigrams. Z_epsilon * weighted_backoffs will turn into B_I once all of this is divided by Z_context.
- weighted_uni(word) * weighted_backoffs * ln_p_i_backed * ln_p_i_backed.transpose();
}
Accum Z_context =
weighted_backoffs * (Z_epsilon - unnormalized_sum_x_p_I) // Back off and unnormalize the unigrams for which there is no extension.
+ unnormalized_sum_x_p_I_full; // Add the extensions.
sum_ln_Z_context += log(Z_context);
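// Since weighted_backoffs = \prod_i b_i(context)^{w_i}, the next line forms
// B_I(context) = Z_epsilon * \prod_i b_i(context)^{w_i} / Z_context.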
Accum B_I = Z_epsilon / Z_context * weighted_backoffs;
sum_B_I += B_I;
// This is the gradient term for this instance except for -log p_i(w_n | w_1^{n-1}) which was accounted for as part of neg_correct_sum_.
// full_cross(i) is \sum_{all x} p_I(x | context) log p_i(x | context)
// Prior terms excluded dividing by Z_context because it wasn't known at the time.
full_cross /= Z_context;
full_cross +=
// Uncorrected term
B_I * (in.LNBackoffs(n).transpose() + unigram_cross)
// Subtract values that should not have been charged.
- unnormalized_sum_x_p_I / Z_epsilon * B_I * in.LNBackoffs(n).transpose();
gradient += full_cross;
convolve = unigram_cross * in.LNBackoffs(n);
// There's one missing term here, which is independent of context and done at the end.
hessian.noalias() +=
// First term of Hessian, assuming all models back off to unigram.
B_I * (convolve + convolve.transpose() + in.LNBackoffs(n).transpose() * in.LNBackoffs(n))
// Error in the first term, correcting from unigram to full probabilities.
+ hessian_missing_Z_context / Z_context
// Second term of Hessian, with correct full probabilities.
- full_cross * full_cross.transpose();
}
for (Matrix::Index x = 0; x < weighted_uni.rows(); ++x) {
// \sum_{contexts} B_I(context) \sum_x p_I(x) log p_i(x) log p_j(x)
// TODO can this be optimized? It's summing over the entire vocab which should be a matrix operation.
hessian.noalias() += sum_B_I * weighted_uni(x) / Z_epsilon * in.LNUnigrams().row(x).transpose() * in.LNUnigrams().row(x);
}
return exp((in.CorrectGradientTerm().dot(weights) + sum_ln_Z_context) / static_cast<double>(in.NumInstances()));
}
}} // namespaces
#ifndef LM_INTERPOLATE_TUNE_DERIVATIVES_H
#define LM_INTERPOLATE_TUNE_DERIVATIVES_H
#include "tune_matrix.hh"
#include <Eigen/Core>
#include <cmath>
namespace lm { namespace interpolate {
class Instances;
// Given tuning instances and model weights, computes the objective function (log probability), gradient, and Hessian.
// Returns the perplexity of the tuning data under the interpolated model, i.e. exp(average negative log probability per instance).
Accum Derivatives(Instances &instances /* Doesn't modify but ReadExtensions is lazy */, const Vector &weights, Vector &gradient, Matrix &hessian);
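//
// A minimal calling sketch (hypothetical setup; "models" is the model count):
//   Vector weights = Vector::Constant(models, 1.0);
//   Vector gradient(models);
//   Matrix hessian(models, models);
//   Accum perplexity = Derivatives(instances, weights, gradient, hessian);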
}} // namespaces
#endif // LM_INTERPOLATE_TUNE_DERIVATIVES_H
#include "tune_derivatives.hh"
#include "tune_instances.hh"
#include "../../util/stream/config.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/io.hh"
#include "../../util/stream/typed_stream.hh"
#define BOOST_TEST_MODULE DerivativeTest
#include <boost/test/unit_test.hpp>
namespace lm { namespace interpolate {
class MockInstances : public Instances {
public:
MockInstances() : chain_(util::stream::ChainConfig(ReadExtensionsEntrySize(), 2, 100)), write_(chain_.Add()) {
extensions_subsequent_.reset(new util::stream::FileBuffer(util::MakeTemp("/tmp/")));
chain_ >> extensions_subsequent_->Sink() >> util::stream::kRecycle;
}
Matrix &LNUnigrams() { return ln_unigrams_; }
BackoffMatrix &LNBackoffs() { return ln_backoffs_; }
WordIndex &BOS() { return bos_; }
Vector &NegLNCorrectSum() { return neg_ln_correct_sum_; }
// Extensions must be provided sorted!
void AddExtension(const Extension &extension) {
*write_ = extension;
++write_;
}
void DoneExtending() {
write_.Poison();
chain_.Wait(true);
}
private:
util::stream::Chain chain_;
util::stream::TypedStream<Extension> write_;
};
namespace {
BOOST_AUTO_TEST_CASE(Small) {
MockInstances mock;
{
// Three vocabulary words plus <s>, two models.
Matrix unigrams(4, 2);
unigrams <<
0.1, 0.6,
0.4, 0.3,
0.5, 0.1,
// <s>
1.0, 1.0;
mock.LNUnigrams() = unigrams.array().log();
}
mock.BOS() = 3;
// One instance
mock.LNBackoffs().resize(1, 2);
mock.LNBackoffs() << 0.2, 0.4;
mock.LNBackoffs() = mock.LNBackoffs().array().log();
// Sparse extensions: model 0 word 2 and model 1 word 1.
// Assuming that model 1 only matches word 1, this is p_1(1 | context)
Accum model_1_word_1 = 1.0 - .6 * .4 - .1 * .4;
mock.NegLNCorrectSum().resize(2);
// We'll suppose correct has WordIndex 1, which backs off in model 0, and matches in model 1
mock.NegLNCorrectSum() << (0.4 * 0.2), model_1_word_1;
mock.NegLNCorrectSum() = -mock.NegLNCorrectSum().array().log();
Accum model_0_word_2 = 1.0 - .1 * .2 - .4 * .2;
Extension ext;
ext.instance = 0;
ext.word = 1;
ext.model = 1;
ext.ln_prob = log(model_1_word_1);
mock.AddExtension(ext);
ext.instance = 0;
ext.word = 2;
ext.model = 0;
ext.ln_prob = log(model_0_word_2);
mock.AddExtension(ext);
mock.DoneExtending();
Vector weights(2);
weights << 0.9, 1.2;
Vector gradient(2);
Matrix hessian(2,2);
Derivatives(mock, weights, gradient, hessian);
// TODO: check perplexity value coming out.
// p_I(x | context)
Vector p_I(3);
p_I <<
pow(0.1 * 0.2, 0.9) * pow(0.6 * 0.4, 1.2),
pow(0.4 * 0.2, 0.9) * pow(model_1_word_1, 1.2),
pow(model_0_word_2, 0.9) * pow(0.1 * 0.4, 1.2);
p_I /= p_I.sum();
Vector expected_gradient = mock.NegLNCorrectSum();
expected_gradient(0) += p_I(0) * log(0.1 * 0.2);
expected_gradient(0) += p_I(1) * log(0.4 * 0.2);
expected_gradient(0) += p_I(2) * log(model_0_word_2);
BOOST_CHECK_CLOSE(expected_gradient(0), gradient(0), 0.01);
expected_gradient(1) += p_I(0) * log(0.6 * 0.4);
expected_gradient(1) += p_I(1) * log(model_1_word_1);
expected_gradient(1) += p_I(2) * log(0.1 * 0.4);
BOOST_CHECK_CLOSE(expected_gradient(1), gradient(1), 0.01);
Matrix expected_hessian(2, 2);
expected_hessian(1, 0) =
// First term
p_I(0) * log(0.1 * 0.2) * log(0.6 * 0.4) +
p_I(1) * log(0.4 * 0.2) * log(model_1_word_1) +
p_I(2) * log(model_0_word_2) * log(0.1 * 0.4);
expected_hessian(1, 0) -=
(p_I(0) * log(0.1 * 0.2) + p_I(1) * log(0.4 * 0.2) + p_I(2) * log(model_0_word_2)) *
(p_I(0) * log(0.6 * 0.4) + p_I(1) * log(model_1_word_1) + p_I(2) * log(0.1 * 0.4));
expected_hessian(0, 1) = expected_hessian(1, 0);
BOOST_CHECK_CLOSE(expected_hessian(1, 0), hessian(1, 0), 0.01);
BOOST_CHECK_CLOSE(expected_hessian(0, 1), hessian(0, 1), 0.01);
}
}}} // namespaces
/* Load tuning instances and filter underlying models to them. A tuning
* instance is an n-gram in the tuning file. To tune towards these, we want
* the correct probability p_i(w_n | w_1^{n-1}) from each model as well as
* all the denominators p_i(v | w_1^{n-1}) that appear in normalization.
*
* In other words, we filter the models to only those n-grams whose context
* appears in the tuning data. This can be divided into two categories:
* - All unigrams. This goes into Instances::ln_unigrams_
* - Bigrams and above whose context appears in the tuning data. These are
* known as extensions. We only care about the longest extension for each
* w_1^{n-1}v since that is what will be used for the probability.
* Because there is a large number of extensions (we tried keeping them in RAM
* and ran out), the streaming framework is used to keep track of extensions
* and sort them so they can be streamed in. Downstream code
* (tune_derivatives.hh) takes a stream of extensions ordered by tuning
* instance, the word v, and the model the extension came from.
*/
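/* Illustrative example (not in the original comment): if the tuning data
 * contains "a b" and some model contains the bigram "a b", then every word's
 * unigram probability goes into Instances::ln_unigrams_, while p(b | a) is
 * recorded as an extension for the tuning instance whose context ends in "a".
 */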
#include "tune_instances.hh"
#include "../common/compare.hh"
#include "../common/joint_order.hh"
#include "../common/model_buffer.hh"
#include "../common/ngram_stream.hh"
#include "../common/renumber.hh"
#include "../enumerate_vocab.hh"
#include "merge_vocab.hh"
#include "universal_vocab.hh"
#include "../lm_exception.hh"
#include "../../util/file_piece.hh"
#include "../../util/murmur_hash.hh"
#include "../../util/stream/chain.hh"
#include "../../util/stream/io.hh"
#include "../../util/stream/sort.hh"
#include "../../util/tokenize_piece.hh"
#include <boost/shared_ptr.hpp>
#include <boost/unordered_map.hpp>
#include <cmath>
#include <limits>
#include <vector>
namespace lm { namespace interpolate {
// gcc 4.6 complains about uninitialized values when sort code is generated for a 4-byte POD. But that sort code is never used.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
bool Extension::operator<(const Extension &other) const {
if (instance != other.instance)
return instance < other.instance;
if (word != other.word)
return word < other.word;
if (model != other.model)
return model < other.model;
return false;
}
#pragma GCC diagnostic pop
namespace {
// An extension without backoff weights applied yet.
#pragma pack(push)
#pragma pack(1)
struct InitialExtension {
Extension ext;
// Order from which it came.
uint8_t order;
};
#pragma pack(pop)
struct InitialExtensionCompare {
bool operator()(const void *first, const void *second) const {
return reinterpret_cast<const InitialExtension *>(first)->ext < reinterpret_cast<const InitialExtension *>(second)->ext;
}
};
// Intended use
// For each model:
// stream through orders jointly in suffix order:
// Call MatchedBackoff for full matches.
// Call Exit when the context matches.
// Call FinishModel with the unigram probability of the correct word, get full
// probability in return.
// Use backoffs_out to adjust records that were written to the stream.
// backoffs_out(model, order - 1) is the penalty for matching order.
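//
// A hedged sketch of that sequence (variable names hypothetical):
//   InstanceMatch match(backoffs, correct_word);
//   match.MatchedBackoff(model, order, ln_backoff);  // full n-gram matched
//   match.Exit(initial_extension, out);              // context matched
//   float ln_correct = match.FinishModel(model, correct_ln_unigram);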
class InstanceMatch {
public:
InstanceMatch(Matrix &backoffs_out, const WordIndex correct)
: seen_(std::numeric_limits<WordIndex>::max()),
backoffs_(backoffs_out),
correct_(correct), correct_from_(1), correct_ln_prob_(std::numeric_limits<float>::quiet_NaN()) {}
void MatchedBackoff(ModelIndex model, uint8_t order, float ln_backoff) {
backoffs_(model, order - 1) = ln_backoff;
}
// We only want the highest-order matches, which are the first to be exited for a given word.
void Exit(const InitialExtension &from, util::stream::Stream &out) {
if (from.ext.word == seen_) return;
seen_ = from.ext.word;
*static_cast<InitialExtension*>(out.Get()) = from;
++out;
if (UTIL_UNLIKELY(correct_ == from.ext.word)) {
correct_from_ = from.order;
correct_ln_prob_ = from.ext.ln_prob;
}
}
WordIndex Correct() const { return correct_; }
// Call this after each model has been passed through. Provide the unigram
// probability of the correct word (which follows the given context).
// This function will return the fully-backed-off probability of the correct
// word.
float FinishModel(ModelIndex model, float correct_ln_unigram) {
seen_ = std::numeric_limits<WordIndex>::max();
// Turn backoffs into multiplied values (added in log space).
// So backoffs_(model, order - 1) is the penalty for matching order.
float accum = 0.0;
for (int order = backoffs_.cols() - 1; order >= 0; --order) {
accum += backoffs_(model, order);
backoffs_(model, order) = accum;
}
if (correct_from_ == 1) {
correct_ln_prob_ = correct_ln_unigram;
}
if (correct_from_ - 1 < backoffs_.cols()) {
correct_ln_prob_ += backoffs_(model, correct_from_ - 1);
}
correct_from_ = 1;
return correct_ln_prob_;
}
private:
// What's the last word we've seen? Used to act only on exiting the longest match.
WordIndex seen_;
Matrix &backoffs_;
const WordIndex correct_;
// These only apply to the most recent model.
uint8_t correct_from_;
float correct_ln_prob_;
};
// Forward information to multiple instances of a context. So if the tuning
// set contains
// a b c d e
// a b c d e
// there's one DispatchContext for a b c d which calls two InstanceMatch, one
// for each tuning instance. This might be to inform them about a b c d g in
// one of the models.
class DispatchContext {
public:
void Register(InstanceMatch &context) {
registered_.push_back(&context);
}
void MatchedBackoff(ModelIndex model, uint8_t order, float ln_backoff) {
for (std::vector<InstanceMatch*>::iterator i = registered_.begin(); i != registered_.end(); ++i)
(*i)->MatchedBackoff(model, order, ln_backoff);
}
void Exit(InitialExtension &from, util::stream::Stream &out, const InstanceMatch *base_instance) {
for (std::vector<InstanceMatch*>::iterator i = registered_.begin(); i != registered_.end(); ++i) {
from.ext.instance = *i - base_instance;
(*i)->Exit(from, out);
}
}
private:
// TODO make these offsets in a big array rather than separately allocated.
std::vector<InstanceMatch*> registered_;
};
// Map from n-gram hash to contexts in the tuning data. TODO: probing hash table?
typedef boost::unordered_map<uint64_t, DispatchContext> ContextMap;
// Handle all the orders of a single model at once.
class JointOrderCallback {
public:
JointOrderCallback(
std::size_t model,
std::size_t full_order_minus_1,
ContextMap &contexts,
util::stream::Stream &out,
const InstanceMatch *base_instance)
: full_order_minus_1_(full_order_minus_1),
contexts_(contexts),
out_(out),
base_instance_(base_instance) {
ext_.ext.model = model;
}
void Enter(std::size_t order_minus_1, const void *data) {}
void Exit(std::size_t order_minus_1, void *data) {
// Match the full n-gram for backoffs.
if (order_minus_1 != full_order_minus_1_) {
NGram<ProbBackoff> gram(data, order_minus_1 + 1);
ContextMap::iterator i = contexts_.find(util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex)));
if (UTIL_UNLIKELY(i != contexts_.end())) {
i->second.MatchedBackoff(ext_.ext.model, gram.Order(), gram.Value().backoff * M_LN10);
}
}
// Match the context of the n-gram to indicate it's an extension.
ContextMap::iterator i = contexts_.find(util::MurmurHashNative(data, order_minus_1 * sizeof(WordIndex)));
if (UTIL_UNLIKELY(i != contexts_.end())) {
NGram<Prob> gram(data, order_minus_1 + 1);
// model was set in the constructor and instance is set by DispatchContext
// inside the Exit call. That leaves word, ln_prob, and order.
ext_.ext.word = *(gram.end() - 1);
ext_.ext.ln_prob = gram.Value().prob * M_LN10;
ext_.order = order_minus_1 + 1;
i->second.Exit(ext_, out_, base_instance_);
}
}
void Run(const util::stream::ChainPositions &positions) {
JointOrder<JointOrderCallback, SuffixOrder>(positions, *this);
}
private:
const std::size_t full_order_minus_1_;
// Mapping is constant but values are being manipulated to tell them about
// n-grams.
ContextMap &contexts_;
// Reused variable. model is set correctly.
InitialExtension ext_;
util::stream::Stream &out_;
const InstanceMatch *const base_instance_;
};
// This populates the ln_unigrams_ matrix. It can (and should for efficiency)
// be run in the same scan as JointOrderCallback.
class ReadUnigrams {
public:
explicit ReadUnigrams(Matrix::ColXpr out) : out_(out) {}
// Read renumbered unigrams, fill with <unk> otherwise.
void Run(const util::stream::ChainPosition &position) {
NGramStream<ProbBackoff> stream(position);
assert(stream);
Accum unk = stream->Value().prob * M_LN10;
WordIndex previous = 0;
for (; stream; ++stream) {
WordIndex word = *stream->begin();
out_.segment(previous, word - previous) = Vector::Constant(word - previous, unk);
out_(word) = stream->Value().prob * M_LN10;
// Backoffs are used by JointOrderCallback.
previous = word + 1;
}
out_.segment(previous, out_.rows() - previous) = Vector::Constant(out_.rows() - previous, unk);
}
private:
Matrix::ColXpr out_;
};
// Read tuning data into an array of vocab ids. The vocab ids are agreed with MergeVocab.
class IdentifyTuning : public EnumerateVocab {
public:
IdentifyTuning(int tuning_file, std::vector<WordIndex> &out) : indices_(out) {
indices_.clear();
StringPiece line;
std::size_t counter = 0;
std::vector<std::size_t> &eos = words_[util::MurmurHashNative("</s>", 4)];
for (util::FilePiece f(tuning_file); f.ReadLineOrEOF(line);) {
for (util::TokenIter<util::BoolCharacter, true> word(line, util::kSpaces); word; ++word) {
UTIL_THROW_IF(*word == "<s>" || *word == "</s>", FormatLoadException, "Illegal word in tuning data: " << *word);
words_[util::MurmurHashNative(word->data(), word->size())].push_back(counter++);
}
eos.push_back(counter++);
}
// Also get <s>
indices_.resize(counter + 1);
words_[util::MurmurHashNative("<s>", 3)].push_back(indices_.size() - 1);
}
// Apply ids as they come out of MergeVocab if they match.
void Add(WordIndex id, const StringPiece &str) {
boost::unordered_map<uint64_t, std::vector<std::size_t> >::iterator i = words_.find(util::MurmurHashNative(str.data(), str.size()));
if (i != words_.end()) {
for (std::vector<std::size_t>::iterator j = i->second.begin(); j != i->second.end(); ++j) {
indices_[*j] = id;
}
}
}
WordIndex FinishGetBOS() {
WordIndex ret = indices_.back();
indices_.pop_back();
return ret;
}
private:
// array of words in tuning data.
std::vector<WordIndex> &indices_;
// map from hash(string) to offsets in indices_.
boost::unordered_map<uint64_t, std::vector<std::size_t> > words_;
};
} // namespace
// Store information about the first iteration.
class ExtensionsFirstIteration {
public:
explicit ExtensionsFirstIteration(std::size_t instances, std::size_t models, std::size_t max_order, util::stream::Chain &extension_input, const util::stream::SortConfig &config)
: backoffs_by_instance_(new std::vector<Matrix>(instances)), sort_(extension_input, config) {
// Initialize all the backoff matrices to zeros.
for (std::vector<Matrix>::iterator i = backoffs_by_instance_->begin(); i != backoffs_by_instance_->end(); ++i) {
*i = Matrix::Zero(models, max_order);
}
}
Matrix &WriteBackoffs(std::size_t instance) {
return (*backoffs_by_instance_)[instance];
}
// Get the backoff all the way to unigram for a particular tuning instance and model.
Accum FullBackoff(std::size_t instance, std::size_t model) const {
return (*backoffs_by_instance_)[instance](model, 0);
}
void Merge(std::size_t lazy_memory) {
sort_.Merge(lazy_memory);
lazy_memory_ = lazy_memory;
}
void Output(util::stream::Chain &chain) {
sort_.Output(chain, lazy_memory_);
chain >> ApplyBackoffs(backoffs_by_instance_);
}
private:
class ApplyBackoffs {
public:
explicit ApplyBackoffs(boost::shared_ptr<std::vector<Matrix> > backoffs_by_instance)
: backoffs_by_instance_(backoffs_by_instance) {}
void Run(const util::stream::ChainPosition &position) {
// There should always be tuning instances.
const std::vector<Matrix> &backoffs = *backoffs_by_instance_;
assert(!backoffs.empty());
uint8_t max_order = backoffs.front().cols();
for (util::stream::Stream stream(position); stream; ++stream) {
InitialExtension &ini = *reinterpret_cast<InitialExtension*>(stream.Get());
assert(ini.order > 1); // If it's an extension, it should be higher than a unigram.
if (ini.order != max_order) {
ini.ext.ln_prob += backoffs[ini.ext.instance](ini.ext.model, ini.order - 1);
}
}
}
private:
boost::shared_ptr<std::vector<Matrix> > backoffs_by_instance_;
};
// Array of complete backoff matrices by instance.
// Each matrix is by model, then by order.
// Would have liked to use a tensor but it's not that well supported.
// This is a shared pointer so that ApplyBackoffs can run after this class is gone.
boost::shared_ptr<std::vector<Matrix> > backoffs_by_instance_;
// This sorts and stores all the InitialExtensions.
util::stream::Sort<InitialExtensionCompare> sort_;
std::size_t lazy_memory_;
};
Instances::Instances(int tune_file, const std::vector<StringPiece> &model_names, const InstancesConfig &config) : temp_prefix_(config.sort.temp_prefix) {
// All the memory from stack variables here should go away before merge sort of the instances.
{
util::FixedArray<ModelBuffer> models(model_names.size());
// Load tuning set and join vocabulary.
std::vector<WordIndex> vocab_sizes;
vocab_sizes.reserve(model_names.size());
util::FixedArray<int> vocab_files(model_names.size());
std::size_t max_order = 0;
for (std::vector<StringPiece>::const_iterator i = model_names.begin(); i != model_names.end(); ++i) {
models.push_back(*i);
vocab_sizes.push_back(models.back().Counts()[0]);
vocab_files.push_back(models.back().VocabFile());
max_order = std::max(max_order, models.back().Order());
}
UniversalVocab vocab(vocab_sizes);
std::vector<WordIndex> tuning_words;
WordIndex combined_vocab_size;
{
IdentifyTuning identify(tune_file, tuning_words);
combined_vocab_size = MergeVocab(vocab_files, vocab, identify);
bos_ = identify.FinishGetBOS();
}
// Setup the initial extensions storage: a chain going to a sort with a stream in the middle for writing.
util::stream::Chain extensions_chain(util::stream::ChainConfig(sizeof(InitialExtension), 2, config.extension_write_chain_mem));
util::stream::Stream extensions_write(extensions_chain.Add());
extensions_first_.reset(new ExtensionsFirstIteration(tuning_words.size(), model_names.size(), max_order, extensions_chain, config.sort));
// Populate the ContextMap from contexts to instances.
ContextMap cmap;
util::FixedArray<InstanceMatch> instances(tuning_words.size());
{
UTIL_THROW_IF2(tuning_words.empty(), "Empty tuning data");
const WordIndex eos = tuning_words.back();
std::vector<WordIndex> context;
context.push_back(bos_);
for (std::size_t i = 0; i < tuning_words.size(); ++i) {
instances.push_back(boost::ref(extensions_first_->WriteBackoffs(i)), tuning_words[i]);
for (std::size_t j = 0; j < context.size(); ++j) {
cmap[util::MurmurHashNative(&context[j], sizeof(WordIndex) * (context.size() - j))].Register(instances.back());
}
// Prepare for next word by starting a new sentence or shifting context.
if (tuning_words[i] == eos) {
context.clear();
context.push_back(bos_);
} else {
if (context.size() == max_order) {
context.erase(context.begin());
}
context.push_back(tuning_words[i]);
}
}
}
// Go through each model. Populate:
// ln_backoffs_
ln_backoffs_.resize(instances.size(), models.size());
// neg_ln_correct_sum_
neg_ln_correct_sum_.resize(models.size());
// ln_unigrams_
ln_unigrams_.resize(combined_vocab_size, models.size());
// The backoffs in extensions_first_
for (std::size_t m = 0; m < models.size(); ++m) {
std::cerr << "Processing model " << m << '/' << models.size() << ": " << model_names[m] << std::endl;
util::stream::Chains chains(models[m].Order());
for (std::size_t i = 0; i < models[m].Order(); ++i) {
// TODO: stop wasting space for backoffs of highest order.
chains.push_back(util::stream::ChainConfig(NGram<ProbBackoff>::TotalSize(i + 1), 2, config.model_read_chain_mem));
}
chains.back().ActivateProgress();
models[m].Source(chains);
for (std::size_t i = 0; i < models[m].Order(); ++i) {
chains[i] >> Renumber(vocab.Mapping(m), i + 1);
}
// Populate ln_unigrams_.
chains[0] >> ReadUnigrams(ln_unigrams_.col(m));
// Send extensions into extensions_first_ and give data to the instances about backoffs/extensions.
chains >> JointOrderCallback(m, models[m].Order() - 1, cmap, extensions_write, instances.begin());
chains >> util::stream::kRecycle;
chains.Wait(true);
neg_ln_correct_sum_(m) = 0.0;
for (InstanceMatch *i = instances.begin(); i != instances.end(); ++i) {
neg_ln_correct_sum_(m) -= i->FinishModel(m, ln_unigrams_(i->Correct(), m));
ln_backoffs_(i - instances.begin(), m) = extensions_first_->FullBackoff(i - instances.begin(), m);
}
ln_unigrams_(bos_, m) = 0; // Does not matter as long as it does not produce NaNs, since tune_derivatives will overwrite the output.
}
extensions_write.Poison();
}
extensions_first_->Merge(config.lazy_memory);
}
Instances::~Instances() {}
// TODO: size reduction by excluding order for subsequent passes.
std::size_t Instances::ReadExtensionsEntrySize() const {
return sizeof(InitialExtension);
}
void Instances::ReadExtensions(util::stream::Chain &on) {
if (extensions_first_.get()) {
// Lazy sort and save a sorted copy to disk. TODO: cut down on record size by stripping out order information.
extensions_first_->Output(on);
extensions_first_.reset(); // Relevant data will continue to live in workers.
extensions_subsequent_.reset(new util::stream::FileBuffer(util::MakeTemp(temp_prefix_)));
on >> extensions_subsequent_->Sink();
} else {
on.SetProgressTarget(extensions_subsequent_->Size());
on >> extensions_subsequent_->Source();
}
}
// Back door.
Instances::Instances() {}
}} // namespaces