("read_vocab_table",po::value<std::string>(&vocab_table),"Vocabulary hash table to read. This should be a probing hash table with size at the beginning.")
("write_vocab_list",po::value<std::string>(&vocab_list),"Vocabulary list to write as null-delimited strings.");
UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?");
std::cout << vocab.Lookup(*i) << ' ';
}
// TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FileStream.
// throw if more pruning thresholds were specified than the model has n-gram orders
UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
// threshold for unigram can only be 0 (no pruning)
UTIL_THROW_IF(lower_threshold > *it, util::Exception, "Pruning thresholds should be in non-decreasing order. Otherwise substrings would be removed, which is bad for query-time data structures.");
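// For example, --prune 0 0 1 passes this check (0, 0, 1 is non-decreasing), while
// --prune 1 0 would throw: a unigram could be pruned while a bigram containing it
// survives, leaving a query-time data structure with a missing substring.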
UTIL_THROW_IF(discount < 0.0 || discount > static_cast<float>(i + 1), util::Exception, "The discount for count " << (i + 1) << " was parsed as " << discount << " which is not in the range [0, " << (i + 1) << "].");
ret.amount[i + 1] = discount;
}
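// For example, parsing "0.5 1 1.5" yields ret.amount[1] = 0.5, ret.amount[2] = 1.0,
// and ret.amount[3] = 1.5, i.e. the fallback discounts for adjusted counts 1, 2, and 3+.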
return ret;
}
} // namespace
int main(int argc, char *argv[]) {
try {
namespace po = boost::program_options;
po::options_description options("Language model building options");
("interpolate_unigrams",po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true),"Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI. If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.")
("skip_symbols",po::bool_switch(),"Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
("vocab_estimate",po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000),"Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
("vocab_pad",po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0),"If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
("verbose_header",po::bool_switch(&verbose_header),"Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
("text",po::value<std::string>(&text),"Read text from a file instead of stdin")
("arpa",po::value<std::string>(&arpa),"Write ARPA to a file instead of stdout")
("intermediate",po::value<std::string>(&intermediate),"Write ngrams to intermediate files. Turns off ARPA output (which can be reactivated by --arpa file). Forces --renumber on.")
("renumber",po::bool_switch(&pipeline.renumber_vocabulary),"Renumber the vocabulary identifiers so that they are monotone with the hash of each string. This is consistent with the ordering used by the trie data structure.")
("collapse_values",po::bool_switch(&pipeline.output_q),"Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
("prune",po::value<std::vector<std::string>>(&pruning)->multitoken(),"Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.")
("limit_vocab_file",po::value<std::string>(&pipeline.prune_vocab_file)->default_value(""),"Read allowed vocabulary separated by whitespace. N-grams that contain vocabulary items not in this list will be pruned. Can be combined with --prune arg")
("discount_fallback",po::value<std::vector<std::string>>(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default,"0.5 1 1.5"),"The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons. It can also fail if these values are out of range. This option falls back to user-specified discounts when the closed-form estimate fails. Note that this option is generally a bad idea: you should deduplicate your corpus instead. However, class-based models need custom discounts because they lack singleton unigrams. Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail.");
/* Outputs from lmplz: ARPA, sharded files, etc */
namespace lm { namespace builder {
// These are different types of hooks. Values should be consecutive to enable a vector lookup.
enum HookType {
// TODO: counts.
PROB_PARALLEL_HOOK, // Probability and backoff (or just q). Output must process the orders in parallel or there will be a deadlock.
PROB_SEQUENTIAL_HOOK, // Probability and backoff (or just q). Output can process orders any way it likes. This requires writing the data to disk then reading. Useful for ARPA files, which put unigrams first etc.
NUMBER_OF_HOOKS // Keep this last so we know how many values there are.
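// Since the values above are consecutive and NUMBER_OF_HOOKS comes last, hooks can be
// kept in a plain array indexed by HookType, e.g. (element type assumed for illustration):
//   std::vector<SomeHook> hooks[NUMBER_OF_HOOKS];
//   hooks[PROB_SEQUENTIAL_HOOK].push_back(hook);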
master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), util::stream::ChainPositions(gamma_chains), config.prune_thresholds, config.prune_vocab, config.output_q, specials);
"Not enough memory to fit " << (config.order * config.block_count) << " blocks with minimum size " << config.minimum_block << ". Increase memory to " << (config.minimum_block * config.order * config.block_count) << " bytes or decrease the minimum block size.");
Master master(config, output.Steps());
// master's destructor will wait for chains. But they might be deadlocked if
// this thread dies because e.g. it ran out of memory.