gaoqiong / RapidASR · Commits

Commit 688b6eac, authored Apr 07, 2023 by SWHL
Update files

Showing 20 changed files with 3173 additions and 0 deletions (+3173, -0).
cpp/thirdpart/kenlm/util/have.hh (+13, -0)
cpp/thirdpart/kenlm/util/integer_to_string.cc (+667, -0)
cpp/thirdpart/kenlm/util/integer_to_string.hh (+66, -0)
cpp/thirdpart/kenlm/util/integer_to_string_test.cc (+81, -0)
cpp/thirdpart/kenlm/util/joint_sort.hh (+146, -0)
cpp/thirdpart/kenlm/util/joint_sort_test.cc (+62, -0)
cpp/thirdpart/kenlm/util/mmap.cc (+405, -0)
cpp/thirdpart/kenlm/util/mmap.hh (+239, -0)
cpp/thirdpart/kenlm/util/multi_intersection.hh (+80, -0)
cpp/thirdpart/kenlm/util/multi_intersection_test.cc (+63, -0)
cpp/thirdpart/kenlm/util/murmur_hash.cc (+175, -0)
cpp/thirdpart/kenlm/util/murmur_hash.hh (+18, -0)
cpp/thirdpart/kenlm/util/parallel_read.cc (+69, -0)
cpp/thirdpart/kenlm/util/parallel_read.hh (+16, -0)
cpp/thirdpart/kenlm/util/pcqueue.hh (+156, -0)
cpp/thirdpart/kenlm/util/pcqueue_test.cc (+20, -0)
cpp/thirdpart/kenlm/util/pool.cc (+38, -0)
cpp/thirdpart/kenlm/util/pool.hh (+122, -0)
cpp/thirdpart/kenlm/util/probing_hash_table.hh (+421, -0)
cpp/thirdpart/kenlm/util/probing_hash_table_benchmark_main.cc (+316, -0)
Too many changes to show: to preserve performance, only 364 of 364+ files are displayed.
cpp/thirdpart/kenlm/util/have.hh (new file, mode 100644)

/* Optional packages. You might want to integrate this with your build system e.g. config.h from ./configure. */
#ifndef UTIL_HAVE_H
#define UTIL_HAVE_H

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifndef HAVE_ICU
//#define HAVE_ICU
#endif

#endif // UTIL_HAVE_H
cpp/thirdpart/kenlm/util/integer_to_string.cc (new file, mode 100644)

#include <iostream>

/* Fast integer to string conversion.
Source: https://github.com/miloyip/itoa-benchmark
Local modifications:
1. Return end of buffer instead of null terminating
2. Collapse to single file
3. Namespace
4. Remove test hook
5. Non-x86 support from the branch_lut code
6. Rename functions
7. Require __SSE2__ on i386

Copyright (C) 2014 Milo Yip

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

Which is based on: http://0x80.pl/snippets/asm/sse-utoa.c

SSE: conversion integers to decimal representation

Author: Wojciech Muła
e-mail: wojciech_mula@poczta.onet.pl
www: http://0x80.pl/

License: BSD

initial release 2011-10-21
$Id$
*/

#include "integer_to_string.hh"
#include <cassert>
#include <stdint.h>

namespace util {

namespace {
const char gDigitsLut[200] = {
  '0','0','0','1','0','2','0','3','0','4','0','5','0','6','0','7','0','8','0','9',
  '1','0','1','1','1','2','1','3','1','4','1','5','1','6','1','7','1','8','1','9',
  '2','0','2','1','2','2','2','3','2','4','2','5','2','6','2','7','2','8','2','9',
  '3','0','3','1','3','2','3','3','3','4','3','5','3','6','3','7','3','8','3','9',
  '4','0','4','1','4','2','4','3','4','4','4','5','4','6','4','7','4','8','4','9',
  '5','0','5','1','5','2','5','3','5','4','5','5','5','6','5','7','5','8','5','9',
  '6','0','6','1','6','2','6','3','6','4','6','5','6','6','6','7','6','8','6','9',
  '7','0','7','1','7','2','7','3','7','4','7','5','7','6','7','7','7','8','7','9',
  '8','0','8','1','8','2','8','3','8','4','8','5','8','6','8','7','8','8','8','9',
  '9','0','9','1','9','2','9','3','9','4','9','5','9','6','9','7','9','8','9','9'
};
} // namespace

// SSE2 implementation according to http://0x80.pl/articles/sse-itoa.html
// Modifications: (1) fix incorrect digits (2) accept all ranges (3) write to user provided buffer.

#if defined(__amd64) || defined(_M_X64) || (defined(__SSE2__) && (defined(_M_IX86) || defined(i386)))

#include <emmintrin.h>

#ifdef _MSC_VER
#include "intrin.h"
#endif

#ifdef _MSC_VER
#define ALIGN_PRE __declspec(align(16))
#define ALIGN_SUF
#else
#define ALIGN_PRE
#define ALIGN_SUF __attribute__ ((aligned(16)))
#endif

namespace {

static const uint32_t kDiv10000 = 0xd1b71759;
ALIGN_PRE static const uint32_t kDiv10000Vector[4] ALIGN_SUF = { kDiv10000, kDiv10000, kDiv10000, kDiv10000 };
ALIGN_PRE static const uint32_t k10000Vector[4] ALIGN_SUF = { 10000, 10000, 10000, 10000 };
ALIGN_PRE static const uint16_t kDivPowersVector[8] ALIGN_SUF = { 8389, 5243, 13108, 32768, 8389, 5243, 13108, 32768 }; // 10^3, 10^2, 10^1, 10^0
ALIGN_PRE static const uint16_t kShiftPowersVector[8] ALIGN_SUF = {
  1 << (16 - (23 + 2 - 16)),
  1 << (16 - (19 + 2 - 16)),
  1 << (16 - 1 - 2),
  1 << (15),
  1 << (16 - (23 + 2 - 16)),
  1 << (16 - (19 + 2 - 16)),
  1 << (16 - 1 - 2),
  1 << (15)
};
ALIGN_PRE static const uint16_t k10Vector[8] ALIGN_SUF = { 10, 10, 10, 10, 10, 10, 10, 10 };
ALIGN_PRE static const char kAsciiZero[16] ALIGN_SUF = { '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0' };

inline __m128i Convert8DigitsSSE2(uint32_t value) {
  assert(value <= 99999999);

  // abcd, efgh = abcdefgh divmod 10000
  const __m128i abcdefgh = _mm_cvtsi32_si128(value);
  const __m128i abcd = _mm_srli_epi64(_mm_mul_epu32(abcdefgh, reinterpret_cast<const __m128i*>(kDiv10000Vector)[0]), 45);
  const __m128i efgh = _mm_sub_epi32(abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast<const __m128i*>(k10000Vector)[0]));

  // v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ]
  const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh);

  // v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ]
  const __m128i v1a = _mm_slli_epi64(v1, 2);

  // v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ]
  const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a);
  const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a);

  // v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ]
  const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast<const __m128i*>(kDivPowersVector)[0]);
  const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast<const __m128i*>(kShiftPowersVector)[0]);

  // v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ]
  const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast<const __m128i*>(k10Vector)[0]);

  // v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ]
  const __m128i v6 = _mm_slli_epi64(v5, 16);

  // v7 = v4 - v6 = { a, b, c, d, e, f, g, h }
  const __m128i v7 = _mm_sub_epi16(v4, v6);

  return v7;
}

inline __m128i ShiftDigits_SSE2(__m128i a, unsigned digit) {
  assert(digit <= 8);
  switch (digit) {
    case 0: return a;
    case 1: return _mm_srli_si128(a, 1);
    case 2: return _mm_srli_si128(a, 2);
    case 3: return _mm_srli_si128(a, 3);
    case 4: return _mm_srli_si128(a, 4);
    case 5: return _mm_srli_si128(a, 5);
    case 6: return _mm_srli_si128(a, 6);
    case 7: return _mm_srli_si128(a, 7);
    case 8: return _mm_srli_si128(a, 8);
  }
  return a; // should not execute here.
}

} // namespace

// Original name: u32toa_sse2
char *ToString(uint32_t value, char *buffer) {
  if (value < 10000) {
    const uint32_t d1 = (value / 100) << 1;
    const uint32_t d2 = (value % 100) << 1;

    if (value >= 1000) *buffer++ = gDigitsLut[d1];
    if (value >= 100) *buffer++ = gDigitsLut[d1 + 1];
    if (value >= 10) *buffer++ = gDigitsLut[d2];
    *buffer++ = gDigitsLut[d2 + 1];
    //*buffer++ = '\0';
    return buffer;
  } else if (value < 100000000) {
    // Experiment shows that this case SSE2 is slower
#if 0
    const __m128i a = Convert8DigitsSSE2(value);

    // Convert to bytes, add '0'
    const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);

    // Count number of digit
    const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0]));
    unsigned long digit;
#ifdef _MSC_VER
    _BitScanForward(&digit, ~mask | 0x8000);
#else
    digit = __builtin_ctz(~mask | 0x8000);
#endif

    // Shift digits to the beginning
    __m128i result = ShiftDigits_SSE2(va, digit);
    //__m128i result = _mm_srl_epi64(va, _mm_cvtsi32_si128(digit * 8));
    _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result);
    buffer[8 - digit] = '\0';
#else
    // value = bbbbcccc
    const uint32_t b = value / 10000;
    const uint32_t c = value % 10000;

    const uint32_t d1 = (b / 100) << 1;
    const uint32_t d2 = (b % 100) << 1;
    const uint32_t d3 = (c / 100) << 1;
    const uint32_t d4 = (c % 100) << 1;

    if (value >= 10000000) *buffer++ = gDigitsLut[d1];
    if (value >= 1000000) *buffer++ = gDigitsLut[d1 + 1];
    if (value >= 100000) *buffer++ = gDigitsLut[d2];
    *buffer++ = gDigitsLut[d2 + 1];

    *buffer++ = gDigitsLut[d3];
    *buffer++ = gDigitsLut[d3 + 1];
    *buffer++ = gDigitsLut[d4];
    *buffer++ = gDigitsLut[d4 + 1];
    // *buffer++ = '\0';
    return buffer;
#endif
  } else {
    // value = aabbbbbbbb in decimal
    const uint32_t a = value / 100000000; // 1 to 42
    value %= 100000000;

    if (a >= 10) {
      const unsigned i = a << 1;
      *buffer++ = gDigitsLut[i];
      *buffer++ = gDigitsLut[i + 1];
    } else
      *buffer++ = '0' + static_cast<char>(a);

    const __m128i b = Convert8DigitsSSE2(value);
    const __m128i ba = _mm_add_epi8(_mm_packus_epi16(_mm_setzero_si128(), b), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);
    const __m128i result = _mm_srli_si128(ba, 8);
    _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result);
    // buffer[8] = '\0';
    return buffer + 8;
  }
}

// Original name: u64toa_sse2
char *ToString(uint64_t value, char *buffer) {
  if (value < 100000000) {
    uint32_t v = static_cast<uint32_t>(value);
    if (v < 10000) {
      const uint32_t d1 = (v / 100) << 1;
      const uint32_t d2 = (v % 100) << 1;

      if (v >= 1000) *buffer++ = gDigitsLut[d1];
      if (v >= 100) *buffer++ = gDigitsLut[d1 + 1];
      if (v >= 10) *buffer++ = gDigitsLut[d2];
      *buffer++ = gDigitsLut[d2 + 1];
      //*buffer++ = '\0';
      return buffer;
    } else {
      // Experiment shows that this case SSE2 is slower
#if 0
      const __m128i a = Convert8DigitsSSE2(v);

      // Convert to bytes, add '0'
      const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);

      // Count number of digit
      const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0]));
      unsigned long digit;
#ifdef _MSC_VER
      _BitScanForward(&digit, ~mask | 0x8000);
#else
      digit = __builtin_ctz(~mask | 0x8000);
#endif

      // Shift digits to the beginning
      __m128i result = ShiftDigits_SSE2(va, digit);
      _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result);
      buffer[8 - digit] = '\0';
#else
      // value = bbbbcccc
      const uint32_t b = v / 10000;
      const uint32_t c = v % 10000;

      const uint32_t d1 = (b / 100) << 1;
      const uint32_t d2 = (b % 100) << 1;
      const uint32_t d3 = (c / 100) << 1;
      const uint32_t d4 = (c % 100) << 1;

      if (value >= 10000000) *buffer++ = gDigitsLut[d1];
      if (value >= 1000000) *buffer++ = gDigitsLut[d1 + 1];
      if (value >= 100000) *buffer++ = gDigitsLut[d2];
      *buffer++ = gDigitsLut[d2 + 1];

      *buffer++ = gDigitsLut[d3];
      *buffer++ = gDigitsLut[d3 + 1];
      *buffer++ = gDigitsLut[d4];
      *buffer++ = gDigitsLut[d4 + 1];
      //*buffer++ = '\0';
      return buffer;
#endif
    }
  } else if (value < 10000000000000000) {
    const uint32_t v0 = static_cast<uint32_t>(value / 100000000);
    const uint32_t v1 = static_cast<uint32_t>(value % 100000000);

    const __m128i a0 = Convert8DigitsSSE2(v0);
    const __m128i a1 = Convert8DigitsSSE2(v1);

    // Convert to bytes, add '0'
    const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);

    // Count number of digit
    const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0]));
#ifdef _MSC_VER
    unsigned long digit;
    _BitScanForward(&digit, ~mask | 0x8000);
#else
    unsigned digit = __builtin_ctz(~mask | 0x8000);
#endif

    // Shift digits to the beginning
    __m128i result = ShiftDigits_SSE2(va, digit);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
    // buffer[16 - digit] = '\0';
    return &buffer[16 - digit];
  } else {
    const uint32_t a = static_cast<uint32_t>(value / 10000000000000000); // 1 to 1844
    value %= 10000000000000000;

    if (a < 10)
      *buffer++ = '0' + static_cast<char>(a);
    else if (a < 100) {
      const uint32_t i = a << 1;
      *buffer++ = gDigitsLut[i];
      *buffer++ = gDigitsLut[i + 1];
    } else if (a < 1000) {
      *buffer++ = '0' + static_cast<char>(a / 100);

      const uint32_t i = (a % 100) << 1;
      *buffer++ = gDigitsLut[i];
      *buffer++ = gDigitsLut[i + 1];
    } else {
      const uint32_t i = (a / 100) << 1;
      const uint32_t j = (a % 100) << 1;
      *buffer++ = gDigitsLut[i];
      *buffer++ = gDigitsLut[i + 1];
      *buffer++ = gDigitsLut[j];
      *buffer++ = gDigitsLut[j + 1];
    }

    const uint32_t v0 = static_cast<uint32_t>(value / 100000000);
    const uint32_t v1 = static_cast<uint32_t>(value % 100000000);

    const __m128i a0 = Convert8DigitsSSE2(v0);
    const __m128i a1 = Convert8DigitsSSE2(v1);

    // Convert to bytes, add '0'
    const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), va);
    // buffer[16] = '\0';
    return &buffer[16];
  }
}

#else // Generic Non-x86 case

// Original name: u32toa_branchlut
char *ToString(uint32_t value, char *buffer) {
  if (value < 10000) {
    const uint32_t d1 = (value / 100) << 1;
    const uint32_t d2 = (value % 100) << 1;

    if (value >= 1000) *buffer++ = gDigitsLut[d1];
    if (value >= 100) *buffer++ = gDigitsLut[d1 + 1];
    if (value >= 10) *buffer++ = gDigitsLut[d2];
    *buffer++ = gDigitsLut[d2 + 1];
  } else if (value < 100000000) {
    // value = bbbbcccc
    const uint32_t b = value / 10000;
    const uint32_t c = value % 10000;

    const uint32_t d1 = (b / 100) << 1;
    const uint32_t d2 = (b % 100) << 1;
    const uint32_t d3 = (c / 100) << 1;
    const uint32_t d4 = (c % 100) << 1;

    if (value >= 10000000) *buffer++ = gDigitsLut[d1];
    if (value >= 1000000) *buffer++ = gDigitsLut[d1 + 1];
    if (value >= 100000) *buffer++ = gDigitsLut[d2];
    *buffer++ = gDigitsLut[d2 + 1];

    *buffer++ = gDigitsLut[d3];
    *buffer++ = gDigitsLut[d3 + 1];
    *buffer++ = gDigitsLut[d4];
    *buffer++ = gDigitsLut[d4 + 1];
  } else {
    // value = aabbbbcccc in decimal
    const uint32_t a = value / 100000000; // 1 to 42
    value %= 100000000;

    if (a >= 10) {
      const unsigned i = a << 1;
      *buffer++ = gDigitsLut[i];
      *buffer++ = gDigitsLut[i + 1];
    } else
      *buffer++ = '0' + static_cast<char>(a);

    const uint32_t b = value / 10000; // 0 to 9999
    const uint32_t c = value % 10000; // 0 to 9999

    const uint32_t d1 = (b / 100) << 1;
    const uint32_t d2 = (b % 100) << 1;
    const uint32_t d3 = (c / 100) << 1;
    const uint32_t d4 = (c % 100) << 1;

    *buffer++ = gDigitsLut[d1];
    *buffer++ = gDigitsLut[d1 + 1];
    *buffer++ = gDigitsLut[d2];
    *buffer++ = gDigitsLut[d2 + 1];
    *buffer++ = gDigitsLut[d3];
    *buffer++ = gDigitsLut[d3 + 1];
    *buffer++ = gDigitsLut[d4];
    *buffer++ = gDigitsLut[d4 + 1];
  }
  return buffer; //*buffer++ = '\0';
}

// Original name: u64toa_branchlut
char *ToString(uint64_t value, char *buffer) {
  if (value < 100000000) {
    uint32_t v = static_cast<uint32_t>(value);
    if (v < 10000) {
      const uint32_t d1 = (v / 100) << 1;
      const uint32_t d2 = (v % 100) << 1;

      if (v >= 1000) *buffer++ = gDigitsLut[d1];
      if (v >= 100) *buffer++ = gDigitsLut[d1 + 1];
      if (v >= 10) *buffer++ = gDigitsLut[d2];
      *buffer++ = gDigitsLut[d2 + 1];
    } else {
      // value = bbbbcccc
      const uint32_t b = v / 10000;
      const uint32_t c = v % 10000;

      const uint32_t d1 = (b / 100) << 1;
      const uint32_t d2 = (b % 100) << 1;
      const uint32_t d3 = (c / 100) << 1;
      const uint32_t d4 = (c % 100) << 1;

      if (value >= 10000000) *buffer++ = gDigitsLut[d1];
      if (value >= 1000000) *buffer++ = gDigitsLut[d1 + 1];
      if (value >= 100000) *buffer++ = gDigitsLut[d2];
      *buffer++ = gDigitsLut[d2 + 1];

      *buffer++ = gDigitsLut[d3];
      *buffer++ = gDigitsLut[d3 + 1];
      *buffer++ = gDigitsLut[d4];
      *buffer++ = gDigitsLut[d4 + 1];
    }
  } else if (value < 10000000000000000) {
    const uint32_t v0 = static_cast<uint32_t>(value / 100000000);
    const uint32_t v1 = static_cast<uint32_t>(value % 100000000);

    const uint32_t b0 = v0 / 10000;
    const uint32_t c0 = v0 % 10000;

    const uint32_t d1 = (b0 / 100) << 1;
    const uint32_t d2 = (b0 % 100) << 1;
    const uint32_t d3 = (c0 / 100) << 1;
    const uint32_t d4 = (c0 % 100) << 1;

    const uint32_t b1 = v1 / 10000;
    const uint32_t c1 = v1 % 10000;

    const uint32_t d5 = (b1 / 100) << 1;
    const uint32_t d6 = (b1 % 100) << 1;
    const uint32_t d7 = (c1 / 100) << 1;
    const uint32_t d8 = (c1 % 100) << 1;

    if (value >= 1000000000000000) *buffer++ = gDigitsLut[d1];
    if (value >= 100000000000000) *buffer++ = gDigitsLut[d1 + 1];
    if (value >= 10000000000000) *buffer++ = gDigitsLut[d2];
    if (value >= 1000000000000) *buffer++ = gDigitsLut[d2 + 1];
    if (value >= 100000000000) *buffer++ = gDigitsLut[d3];
    if (value >= 10000000000) *buffer++ = gDigitsLut[d3 + 1];
    if (value >= 1000000000) *buffer++ = gDigitsLut[d4];
    if (value >= 100000000) *buffer++ = gDigitsLut[d4 + 1];

    *buffer++ = gDigitsLut[d5];
    *buffer++ = gDigitsLut[d5 + 1];
    *buffer++ = gDigitsLut[d6];
    *buffer++ = gDigitsLut[d6 + 1];
    *buffer++ = gDigitsLut[d7];
    *buffer++ = gDigitsLut[d7 + 1];
    *buffer++ = gDigitsLut[d8];
    *buffer++ = gDigitsLut[d8 + 1];
  } else {
    const uint32_t a = static_cast<uint32_t>(value / 10000000000000000); // 1 to 1844
    value %= 10000000000000000;

    if (a < 10)
      *buffer++ = '0' + static_cast<char>(a);
    else if (a < 100) {
      const uint32_t i = a << 1;
      *buffer++ = gDigitsLut[i];
      *buffer++ = gDigitsLut[i + 1];
    } else if (a < 1000) {
      *buffer++ = '0' + static_cast<char>(a / 100);

      const uint32_t i = (a % 100) << 1;
      *buffer++ = gDigitsLut[i];
      *buffer++ = gDigitsLut[i + 1];
    } else {
      const uint32_t i = (a / 100) << 1;
      const uint32_t j = (a % 100) << 1;
      *buffer++ = gDigitsLut[i];
      *buffer++ = gDigitsLut[i + 1];
      *buffer++ = gDigitsLut[j];
      *buffer++ = gDigitsLut[j + 1];
    }

    const uint32_t v0 = static_cast<uint32_t>(value / 100000000);
    const uint32_t v1 = static_cast<uint32_t>(value % 100000000);

    const uint32_t b0 = v0 / 10000;
    const uint32_t c0 = v0 % 10000;

    const uint32_t d1 = (b0 / 100) << 1;
    const uint32_t d2 = (b0 % 100) << 1;
    const uint32_t d3 = (c0 / 100) << 1;
    const uint32_t d4 = (c0 % 100) << 1;

    const uint32_t b1 = v1 / 10000;
    const uint32_t c1 = v1 % 10000;

    const uint32_t d5 = (b1 / 100) << 1;
    const uint32_t d6 = (b1 % 100) << 1;
    const uint32_t d7 = (c1 / 100) << 1;
    const uint32_t d8 = (c1 % 100) << 1;

    *buffer++ = gDigitsLut[d1];
    *buffer++ = gDigitsLut[d1 + 1];
    *buffer++ = gDigitsLut[d2];
    *buffer++ = gDigitsLut[d2 + 1];
    *buffer++ = gDigitsLut[d3];
    *buffer++ = gDigitsLut[d3 + 1];
    *buffer++ = gDigitsLut[d4];
    *buffer++ = gDigitsLut[d4 + 1];
    *buffer++ = gDigitsLut[d5];
    *buffer++ = gDigitsLut[d5 + 1];
    *buffer++ = gDigitsLut[d6];
    *buffer++ = gDigitsLut[d6 + 1];
    *buffer++ = gDigitsLut[d7];
    *buffer++ = gDigitsLut[d7 + 1];
    *buffer++ = gDigitsLut[d8];
    *buffer++ = gDigitsLut[d8 + 1];
  }
  return buffer;
}

#endif // End of architecture if statement.

// Signed wrappers. The negation is done on the unsigned version because
// doing so has defined behavior for INT_MIN.
char *ToString(int32_t value, char *to) {
  uint32_t un = static_cast<uint32_t>(value);
  if (value < 0) {
    *to++ = '-';
    un = -un;
  }
  return ToString(un, to);
}
char *ToString(int64_t value, char *to) {
  uint64_t un = static_cast<uint64_t>(value);
  if (value < 0) {
    *to++ = '-';
    un = -un;
  }
  return ToString(un, to);
}

// No optimization for this case yet.
char *ToString(int16_t value, char *to) {
  return ToString((int32_t)value, to);
}
char *ToString(uint16_t value, char *to) {
  return ToString((uint32_t)value, to);
}

// void * to string. This hasn't been optimized at all really.
namespace {
const char kHexDigits[] = "0123456789abcdef";
} // namespace

char *ToString(const void *v, char *to) {
  *to++ = '0';
  *to++ = 'x';
  // Fun fact: gcc/clang boost::lexical_cast on Linux do just "0" while clang on OS X does "0x0"
  // I happen to prefer 0x0.
  if (!v) {
    *to++ = '0';
    return to;
  }
  uintptr_t value = reinterpret_cast<uintptr_t>(v);
  uint8_t shift = sizeof(void*) * 8 - 4;
  for (; !(value >> shift); shift -= 4) {}
  for (; ; shift -= 4) {
    *to++ = kHexDigits[(value >> shift) & 0xf];
    if (!shift) break;
  }
  return to;
}

} // namespace util
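As an aside (not part of the commit), a scalar sketch of the two-digits-at-a-time trick that gDigitsLut enables: an index of (n % 100) << 1 selects a pre-rendered pair of ASCII digits, halving the number of divisions compared to emitting one digit at a time.

#include <cstdint>

// Hypothetical helper mirroring how the functions above use gDigitsLut:
// write the two decimal digits of n (0..99) from a 200-entry pair table.
inline char *AppendTwoDigits(uint32_t n, const char *pair_table, char *out) {
  const uint32_t idx = (n % 100) << 1;  // each table entry is two chars wide
  *out++ = pair_table[idx];
  *out++ = pair_table[idx + 1];
  return out;
}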
cpp/thirdpart/kenlm/util/integer_to_string.hh (new file, mode 100644)

#ifndef UTIL_INTEGER_TO_STRING_H
#define UTIL_INTEGER_TO_STRING_H
#include <cstddef>
#include <stdint.h>

namespace util {

/* These functions convert integers to strings and return the end pointer.
 */
char *ToString(uint32_t value, char *to);
char *ToString(uint64_t value, char *to);

// Implemented as wrappers to above
char *ToString(int32_t value, char *to);
char *ToString(int64_t value, char *to);

// Calls the 32-bit versions for now.
char *ToString(uint16_t value, char *to);
char *ToString(int16_t value, char *to);

char *ToString(const void *value, char *to);

inline char *ToString(bool value, char *to) {
  *to++ = '0' + value;
  return to;
}

// How many bytes to reserve in the buffer for these strings:
// g++ 4.9.1 doesn't work with this:
//   static const std::size_t kBytes = 5;
// So use enum.
template <class T> struct ToStringBuf;
template <> struct ToStringBuf<bool> {
  enum { kBytes = 1 };
};
template <> struct ToStringBuf<uint16_t> {
  enum { kBytes = 5 };
};
template <> struct ToStringBuf<int16_t> {
  enum { kBytes = 6 };
};
template <> struct ToStringBuf<uint32_t> {
  enum { kBytes = 10 };
};
template <> struct ToStringBuf<int32_t> {
  enum { kBytes = 11 };
};
template <> struct ToStringBuf<uint64_t> {
  enum { kBytes = 20 };
};
template <> struct ToStringBuf<int64_t> {
  // Not a typo.  2^63 has 19 digits.
  enum { kBytes = 20 };
};
template <> struct ToStringBuf<const void*> {
  // Either 18 on 64-bit or 10 on 32-bit.
  enum { kBytes = sizeof(const void*) * 2 + 2 };
};

// Maximum over this and float.
enum { kToStringMaxBytes = 20 };

} // namespace util

#endif // UTIL_INTEGER_TO_STRING_H
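A minimal usage sketch (mine, not part of the commit): ToStringBuf<T>::kBytes sizes a worst-case buffer and ToString returns the end pointer without writing a terminator, so the length is the pointer difference.

#include "integer_to_string.hh"
#include <string>

// Assumed example: format a uint64_t using the declarations above.
std::string FormatU64(uint64_t value) {
  char buf[util::ToStringBuf<uint64_t>::kBytes];
  char *end = util::ToString(value, buf);  // no '\0' is appended
  return std::string(buf, end - buf);
}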
cpp/thirdpart/kenlm/util/integer_to_string_test.cc (new file, mode 100644)

#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
#include "integer_to_string.hh"
#include "string_piece.hh"

#define BOOST_TEST_MODULE IntegerToStringTest
#include <boost/test/unit_test.hpp>
#include <boost/lexical_cast.hpp>

#include <limits>

namespace util {
namespace {

template <class T> void TestValue(const T value) {
  char buf[ToStringBuf<T>::kBytes];
  StringPiece result(buf, ToString(value, buf) - buf);
  BOOST_REQUIRE_GE(static_cast<std::size_t>(ToStringBuf<T>::kBytes), result.size());
  if (value) {
    BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), result);
  } else {
    // Platforms can do void * as 0x0 or 0.
    BOOST_CHECK(result == "0x0" || result == "0");
  }
}

template <class T> void TestCorners() {
  TestValue(std::numeric_limits<T>::min());
  TestValue(std::numeric_limits<T>::max());
  TestValue((T)0);
  TestValue((T)-1);
  TestValue((T)1);
}

BOOST_AUTO_TEST_CASE(Corners) {
  TestCorners<uint16_t>();
  TestCorners<uint32_t>();
  TestCorners<uint64_t>();
  TestCorners<int16_t>();
  TestCorners<int32_t>();
  TestCorners<int64_t>();
  TestCorners<const void*>();
}

template <class T> void TestAll() {
  for (T i = std::numeric_limits<T>::min(); i < std::numeric_limits<T>::max(); ++i) {
    TestValue(i);
  }
  TestValue(std::numeric_limits<T>::max());
}

BOOST_AUTO_TEST_CASE(Short) {
  TestAll<uint16_t>();
  TestAll<int16_t>();
}

template <class T> void Test10s() {
  for (T i = 1; i < std::numeric_limits<T>::max() / 10; i *= 10) {
    TestValue(i);
    TestValue(i - 1);
    TestValue(i + 1);
  }
}

BOOST_AUTO_TEST_CASE(Tens) {
  Test10s<uint64_t>();
  Test10s<int64_t>();
  Test10s<uint32_t>();
  Test10s<int32_t>();
}

BOOST_AUTO_TEST_CASE(Pointers) {
  for (uintptr_t i = 1; i < std::numeric_limits<uintptr_t>::max() / 10; i *= 10) {
    TestValue((const void*)i);
  }
  for (uintptr_t i = 0; i < 256; ++i) {
    TestValue((const void*)i);
    TestValue((const void*)(i + 0xf00));
  }
}

}} // namespaces
cpp/thirdpart/kenlm/util/joint_sort.hh (new file, mode 100644)

#ifndef UTIL_JOINT_SORT_H
#define UTIL_JOINT_SORT_H

/* A terrifying amount of C++ to coax std::sort into sorting one range while
 * also permuting another range the same way.
 */

#include "proxy_iterator.hh"

#include <algorithm>
#include <functional>

namespace util {

namespace detail {

template <class KeyIter, class ValueIter> class JointProxy;

template <class KeyIter, class ValueIter> class JointIter {
  public:
    JointIter() {}

    JointIter(const KeyIter &key_iter, const ValueIter &value_iter) : key_(key_iter), value_(value_iter) {}

    bool operator==(const JointIter<KeyIter, ValueIter> &other) const { return key_ == other.key_; }

    bool operator<(const JointIter<KeyIter, ValueIter> &other) const { return (key_ < other.key_); }

    std::ptrdiff_t operator-(const JointIter<KeyIter, ValueIter> &other) const { return key_ - other.key_; }

    JointIter<KeyIter, ValueIter> &operator+=(std::ptrdiff_t amount) {
      key_ += amount;
      value_ += amount;
      return *this;
    }

    friend void swap(JointIter &first, JointIter &second) {
      using std::swap;
      swap(first.key_, second.key_);
      swap(first.value_, second.value_);
    }

    void DeepSwap(JointIter &other) {
      using std::swap;
      swap(*key_, *other.key_);
      swap(*value_, *other.value_);
    }

  private:
    friend class JointProxy<KeyIter, ValueIter>;
    KeyIter key_;
    ValueIter value_;
};

template <class KeyIter, class ValueIter> class JointProxy {
  private:
    typedef JointIter<KeyIter, ValueIter> InnerIterator;

  public:
    typedef struct {
      typename std::iterator_traits<KeyIter>::value_type key;
      typename std::iterator_traits<ValueIter>::value_type value;
      const typename std::iterator_traits<KeyIter>::value_type &GetKey() const { return key; }
    } value_type;

    JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {}
    JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {}

    operator value_type() const {
      value_type ret;
      ret.key = *inner_.key_;
      ret.value = *inner_.value_;
      return ret;
    }

    JointProxy &operator=(const JointProxy &other) {
      *inner_.key_ = *other.inner_.key_;
      *inner_.value_ = *other.inner_.value_;
      return *this;
    }

    JointProxy &operator=(const value_type &other) {
      *inner_.key_ = other.key;
      *inner_.value_ = other.value;
      return *this;
    }

    typename std::iterator_traits<KeyIter>::reference GetKey() const {
      return *(inner_.key_);
    }

    friend void swap(JointProxy<KeyIter, ValueIter> first, JointProxy<KeyIter, ValueIter> second) {
      first.Inner().DeepSwap(second.Inner());
    }

  private:
    friend class ProxyIterator<JointProxy<KeyIter, ValueIter> >;

    InnerIterator &Inner() { return inner_; }
    const InnerIterator &Inner() const { return inner_; }
    InnerIterator inner_;
};

template <class Proxy, class Less> class LessWrapper : public std::binary_function<const typename Proxy::value_type &, const typename Proxy::value_type &, bool> {
  public:
    explicit LessWrapper(const Less &less) : less_(less) {}

    bool operator()(const Proxy &left, const Proxy &right) const {
      return less_(left.GetKey(), right.GetKey());
    }
    bool operator()(const Proxy &left, const typename Proxy::value_type &right) const {
      return less_(left.GetKey(), right.GetKey());
    }
    bool operator()(const typename Proxy::value_type &left, const Proxy &right) const {
      return less_(left.GetKey(), right.GetKey());
    }
    bool operator()(const typename Proxy::value_type &left, const typename Proxy::value_type &right) const {
      return less_(left.GetKey(), right.GetKey());
    }

  private:
    const Less less_;
};

} // namespace detail

template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > {
  public:
    PairedIterator(const KeyIter &key, const ValueIter &value) :
      ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {}
};

template <class KeyIter, class ValueIter, class Less> void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin, const Less &less) {
  ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > full_begin(detail::JointProxy<KeyIter, ValueIter>(key_begin, value_begin));
  detail::LessWrapper<detail::JointProxy<KeyIter, ValueIter>, Less> less_wrap(less);
  std::sort(full_begin, full_begin + (key_end - key_begin), less_wrap);
}

template <class KeyIter, class ValueIter> void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin) {
  JointSort(key_begin, key_end, value_begin, std::less<typename std::iterator_traits<KeyIter>::value_type>());
}

} // namespace util

#endif // UTIL_JOINT_SORT_H
cpp/thirdpart/kenlm/util/joint_sort_test.cc (new file, mode 100644)

#include "joint_sort.hh"

#define BOOST_TEST_MODULE JointSortTest
#include <boost/test/unit_test.hpp>

namespace util { namespace {

BOOST_AUTO_TEST_CASE(just_flip) {
  char keys[2];
  int values[2];
  keys[0] = 1; values[0] = 327;
  keys[1] = 0; values[1] = 87897;
  JointSort<char *, int *>(keys + 0, keys + 2, values + 0);
  BOOST_CHECK_EQUAL(0, keys[0]);
  BOOST_CHECK_EQUAL(87897, values[0]);
  BOOST_CHECK_EQUAL(1, keys[1]);
  BOOST_CHECK_EQUAL(327, values[1]);
}

BOOST_AUTO_TEST_CASE(three) {
  char keys[3];
  int values[3];
  keys[0] = 1; values[0] = 327;
  keys[1] = 2; values[1] = 87897;
  keys[2] = 0; values[2] = 10;
  JointSort<char *, int *>(keys + 0, keys + 3, values + 0);
  BOOST_CHECK_EQUAL(0, keys[0]);
  BOOST_CHECK_EQUAL(1, keys[1]);
  BOOST_CHECK_EQUAL(2, keys[2]);
}

BOOST_AUTO_TEST_CASE(char_int) {
  char keys[4];
  int values[4];
  keys[0] = 3; values[0] = 327;
  keys[1] = 1; values[1] = 87897;
  keys[2] = 2; values[2] = 10;
  keys[3] = 0; values[3] = 24347;
  JointSort<char *, int *>(keys + 0, keys + 4, values + 0);
  BOOST_CHECK_EQUAL(0, keys[0]);
  BOOST_CHECK_EQUAL(24347, values[0]);
  BOOST_CHECK_EQUAL(1, keys[1]);
  BOOST_CHECK_EQUAL(87897, values[1]);
  BOOST_CHECK_EQUAL(2, keys[2]);
  BOOST_CHECK_EQUAL(10, values[2]);
  BOOST_CHECK_EQUAL(3, keys[3]);
  BOOST_CHECK_EQUAL(327, values[3]);
}

BOOST_AUTO_TEST_CASE(swap_proxy) {
  char keys[2] = {0, 1};
  int values[2] = {2, 3};
  detail::JointProxy<char *, int *> first(keys, values);
  detail::JointProxy<char *, int *> second(keys + 1, values + 1);
  swap(first, second);
  BOOST_CHECK_EQUAL(1, keys[0]);
  BOOST_CHECK_EQUAL(0, keys[1]);
  BOOST_CHECK_EQUAL(3, values[0]);
  BOOST_CHECK_EQUAL(2, values[1]);
}

}} // namespace anonymous util
cpp/thirdpart/kenlm/util/mmap.cc (new file, mode 100644)

/* Memory mapping wrappers.
 * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at
 * NICT.
 */
#include "mmap.hh"

#include "exception.hh"
#include "file.hh"
#include "scoped.hh"

#include <iostream>

#include <cassert>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <cstdlib>

#if defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#include <io.h>
#else
#include <sys/mman.h>
#include <unistd.h>
#endif

namespace util {

std::size_t SizePage() {
#if defined(_WIN32) || defined(_WIN64)
  SYSTEM_INFO si;
  GetSystemInfo(&si);
  return si.dwAllocationGranularity;
#else
  return sysconf(_SC_PAGE_SIZE);
#endif
}

scoped_mmap::~scoped_mmap() {
  if (data_ != (void*)-1) {
    try {
      // Thanks Denis Filimonov for pointing out NFS likes msync first.
      SyncOrThrow(data_, size_);
      UnmapOrThrow(data_, size_);
    } catch (const util::ErrnoException &e) {
      std::cerr << e.what();
      abort();
    }
  }
}

namespace {
template <class T> T RoundUpPow2(T value, T mult) {
  return ((value - 1) & ~(mult - 1)) + mult;
}

std::size_t RoundUpSize(const scoped_memory &mem) {
  switch (mem.source()) {
    case scoped_memory::MMAP_ROUND_1G_ALLOCATED:
      return RoundUpPow2<std::size_t>(mem.size(), 1ULL << 30);
    case scoped_memory::MMAP_ROUND_2M_ALLOCATED:
      return RoundUpPow2<std::size_t>(mem.size(), 1ULL << 21);
    case scoped_memory::MMAP_ROUND_PAGE_ALLOCATED:
      return RoundUpPow2<std::size_t>(mem.size(), static_cast<std::size_t>(SizePage()));
    default:
      return mem.size();
  }
}
} // namespace

scoped_memory::scoped_memory(std::size_t size, bool zeroed) : data_(NULL), size_(0), source_(NONE_ALLOCATED) {
  HugeMalloc(size, zeroed, *this);
}

void scoped_memory::reset(void *data, std::size_t size, Alloc source) {
  switch (source_) {
    case MMAP_ROUND_1G_ALLOCATED:
    case MMAP_ROUND_2M_ALLOCATED:
    case MMAP_ROUND_PAGE_ALLOCATED:
    case MMAP_ALLOCATED:
      scoped_mmap(data_, RoundUpSize(*this));
      break;
    case MALLOC_ALLOCATED:
      free(data_);
      break;
    case NONE_ALLOCATED:
      break;
  }
  data_ = data;
  size_ = size;
  source_ = source;
}

const int kFileFlags =
#if defined(_WIN32) || defined(_WIN64)
  0 // MapOrThrow ignores flags on windows
#elif defined(MAP_FILE)
  MAP_FILE | MAP_SHARED
#else
  MAP_SHARED
#endif
  ;

void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) {
#ifdef MAP_POPULATE // Linux specific
  if (prefault) {
    flags |= MAP_POPULATE;
  }
#endif
#if defined(_WIN32) || defined(_WIN64)
  int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY;
  int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ;
  uint64_t total_size = size + offset;
  HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, total_size >> 32, static_cast<DWORD>(total_size), NULL);
  UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed");
  LPVOID ret = MapViewOfFile(hMapping, protectM, offset >> 32, offset, size);
  CloseHandle(hMapping);
  UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed");
#else
  int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
  void *ret;
  UTIL_THROW_IF((ret = mmap(NULL, size, protect, flags, fd, offset)) == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset);
#  ifdef MADV_HUGEPAGE
  /* We like huge pages but it's fine if we can't have them.  Note that huge
   * pages are not supported for file-backed mmap on linux.
   */
  madvise(ret, size, MADV_HUGEPAGE);
#  endif
#endif
  return ret;
}

void SyncOrThrow(void *start, size_t length) {
#if defined(_WIN32) || defined(_WIN64)
  UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap");
#else
  UTIL_THROW_IF(length && msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap");
#endif
}

void UnmapOrThrow(void *start, size_t length) {
#if defined(_WIN32) || defined(_WIN64)
  UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file");
#else
  UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed with " << start << " for length " << length);
#endif
}

// Linux huge pages.
#ifdef __linux__

namespace {

bool TryHuge(std::size_t size, bool populate, uint8_t alignment_bits, scoped_memory::Alloc huge_scheme, scoped_memory &to) {
  // Don't bother with these cases.
  if (size < (1ULL << alignment_bits) || (1ULL << alignment_bits) < SizePage())
    return false;

  // First try: Linux >= 3.8 with manually configured hugetlb pages available.
  int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | (alignment_bits << 26 /* This is MAP_HUGE_SHIFT but some headers are too old. */);
  if (populate) flags |= MAP_POPULATE;
  void *ret = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
  if (ret != MAP_FAILED) {
    to.reset(ret, size, huge_scheme);
    return true;
  }

  // There weren't pages in a sysadmin-created pool.  Let's get aligned memory
  // and hope transparent huge pages kicks in.  Align to a multiple of the huge
  // page size by overallocating.  I feel bad about doing this, but it's also how
  // posix_memalign is implemented.  And the memory is virtual.

  // Round up requested size to multiple of page size.  This will allow the pages after to be munmapped.
  std::size_t size_up = RoundUpPow2(size, SizePage());
  std::size_t ask = size_up + (1 << alignment_bits) - SizePage();
  // Don't populate because this is asking for more than we will use.
  scoped_mmap larger(mmap(NULL, ask, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), ask);
  if (larger.get() == MAP_FAILED) return false;

  // Throw out pages before the alignment point.
  uintptr_t base = reinterpret_cast<uintptr_t>(larger.get());
  // Round up to next multiple of alignment.
  uintptr_t rounded_up = RoundUpPow2(base, static_cast<uintptr_t>(1) << alignment_bits);
  if (base != rounded_up) {
    // If this throws an exception (which it shouldn't) then we want to unmap the whole thing by keeping it in larger.
    UnmapOrThrow(larger.get(), rounded_up - base);
    larger.steal();
    larger.reset(reinterpret_cast<void*>(rounded_up), ask - (rounded_up - base));
  }

  // Throw out pages after the requested size.
  assert(larger.size() >= size_up);
  if (larger.size() > size_up) {
    // This is where we assume size_up is a multiple of page size.
    UnmapOrThrow(static_cast<uint8_t*>(larger.get()) + size_up, larger.size() - size_up);
    larger.reset(larger.steal(), size_up);
  }
#ifdef MADV_HUGEPAGE
  madvise(larger.get(), size_up, MADV_HUGEPAGE);
#endif
  to.reset(larger.steal(), size, scoped_memory::MMAP_ROUND_PAGE_ALLOCATED);
  return true;
}

} // namespace
#endif

void HugeMalloc(std::size_t size, bool zeroed, scoped_memory &to) {
  to.reset();
#ifdef __linux__
  // TODO: architectures/page sizes other than 2^21 and 2^30.
  // Attempt 1 GB pages.
  // If the user asked for zeroed memory, assume they want it populated.
  if (size >= (1ULL << 30) && TryHuge(size, zeroed, 30, scoped_memory::MMAP_ROUND_1G_ALLOCATED, to))
    return;
  // Attempt 2 MB pages.
  if (size >= (1ULL << 21) && TryHuge(size, zeroed, 21, scoped_memory::MMAP_ROUND_2M_ALLOCATED, to))
    return;
#endif // __linux__
  // Non-linux will always do this, as will small allocations on Linux.
  to.reset(zeroed ? calloc(1, size) : malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
  UTIL_THROW_IF(!to.get(), ErrnoException, "Failed to allocate " << size << " bytes");
}

namespace {
#ifdef __linux__
const std::size_t kTransitionHuge = std::max<std::size_t>(1ULL << 21, SizePage());
#endif // __linux__

void ReplaceAndCopy(std::size_t to, bool zero_new, scoped_memory &mem) {
  scoped_memory replacement;
  HugeMalloc(to, zero_new, replacement);
  memcpy(replacement.get(), mem.get(), mem.size());
  // This can't throw.
  mem.reset(replacement.get(), replacement.size(), replacement.source());
  replacement.steal();
}
} // namespace

void HugeRealloc(std::size_t to, bool zero_new, scoped_memory &mem) {
  if (!to) {
    mem.reset();
    return;
  }
  switch (mem.source()) {
    case scoped_memory::NONE_ALLOCATED:
      HugeMalloc(to, zero_new, mem);
      return;
#ifdef __linux__
    // TODO really need to collapse these cases with a number.
    case scoped_memory::MMAP_ROUND_1G_ALLOCATED:
    case scoped_memory::MMAP_ROUND_2M_ALLOCATED:
    case scoped_memory::MMAP_ROUND_PAGE_ALLOCATED:
    case scoped_memory::MMAP_ALLOCATED:
      // Downsizing below barrier?
      if (to <= SizePage()) {
        scoped_malloc replacement(malloc(to));
        memcpy(replacement.get(), mem.get(), std::min(to, mem.size()));
        if (zero_new && to > mem.size())
          memset(static_cast<uint8_t*>(replacement.get()) + mem.size(), 0, to - mem.size());
        mem.reset(replacement.release(), to, scoped_memory::MALLOC_ALLOCATED);
      } else {
        // main path: try to mremap.
        void *new_addr = mremap(mem.get(), RoundUpSize(mem), to, MREMAP_MAYMOVE);
        if (new_addr != MAP_FAILED) {
          scoped_memory::Alloc source(mem.source()); // steal resets mem.source()
          mem.steal(); // let go otherwise reset() will free it first
          mem.reset(new_addr, to, source);
        } else {
          // Reallocating huge pages can fail with EINVAL.
          // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/mremap.c?id=refs/tags/v3.19#n346
          ReplaceAndCopy(to, zero_new, mem);
        }
      }
      return;
#endif // __linux__
    case scoped_memory::MALLOC_ALLOCATED:
#ifdef __linux__
      // Transition larger allocations to huge pages, but don't keep trying if we're still malloc allocated.
      if (to >= kTransitionHuge && mem.size() < kTransitionHuge) {
        ReplaceAndCopy(to, zero_new, mem);
        return;
      }
#endif // __linux__
      {
        void *new_addr = std::realloc(mem.get(), to);
        UTIL_THROW_IF(!new_addr, ErrnoException, "realloc to " << to << " bytes failed.");
        if (zero_new && to > mem.size())
          memset(static_cast<uint8_t*>(new_addr) + mem.size(), 0, to - mem.size());
        mem.steal();
        mem.reset(new_addr, to, scoped_memory::MALLOC_ALLOCATED);
      }
      return;
    default:
      UTIL_THROW(Exception, "HugeRealloc called with type " << mem.source());
  }
}

void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) {
  switch (method) {
    case LAZY:
      out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
      break;
    case POPULATE_OR_LAZY:
#ifdef MAP_POPULATE
    case POPULATE_OR_READ:
#endif
      out.reset(MapOrThrow(size, false, kFileFlags, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
      break;
#ifndef MAP_POPULATE
    case POPULATE_OR_READ:
#endif
    case READ:
      HugeMalloc(size, false, out);
      SeekOrThrow(fd, offset);
      ReadOrThrow(fd, out.get(), size);
      break;
    case PARALLEL_READ:
      UTIL_THROW(Exception, "Parallel read was removed from this repo.");
      break;
  }
}

void *MapZeroedWrite(int fd, std::size_t size) {
  ResizeOrThrow(fd, 0);
  ResizeOrThrow(fd, size);
  return MapOrThrow(size, true, kFileFlags, false, fd, 0);
}

void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
  file.reset(CreateOrThrow(name));
  try {
    return MapZeroedWrite(file.get(), size);
  } catch (ErrnoException &e) {
    e << " in file " << name;
    throw;
  }
}

Rolling::Rolling(const Rolling &copy_from, uint64_t increase) {
  *this = copy_from;
  IncreaseBase(increase);
}

Rolling &Rolling::operator=(const Rolling &copy_from) {
  fd_ = copy_from.fd_;
  file_begin_ = copy_from.file_begin_;
  file_end_ = copy_from.file_end_;
  for_write_ = copy_from.for_write_;
  block_ = copy_from.block_;
  read_bound_ = copy_from.read_bound_;

  current_begin_ = 0;
  if (copy_from.IsPassthrough()) {
    current_end_ = copy_from.current_end_;
    ptr_ = copy_from.ptr_;
  } else {
    // Force call on next mmap.
    current_end_ = 0;
    ptr_ = NULL;
  }
  return *this;
}

Rolling::Rolling(int fd, bool for_write, std::size_t block, std::size_t read_bound, uint64_t offset, uint64_t amount) {
  current_begin_ = 0;
  current_end_ = 0;
  fd_ = fd;
  file_begin_ = offset;
  file_end_ = offset + amount;
  for_write_ = for_write;
  block_ = block;
  read_bound_ = read_bound;
}

void *Rolling::ExtractNonRolling(scoped_memory &out, uint64_t index, std::size_t size) {
  out.reset();
  if (IsPassthrough()) return static_cast<uint8_t*>(get()) + index;
  uint64_t offset = index + file_begin_;
  // Round down to multiple of page size.
  uint64_t cruft = offset % static_cast<uint64_t>(SizePage());
  std::size_t map_size = static_cast<std::size_t>(size + cruft);
  out.reset(MapOrThrow(map_size, for_write_, kFileFlags, true, fd_, offset - cruft), map_size, scoped_memory::MMAP_ALLOCATED);
  return static_cast<uint8_t*>(out.get()) + static_cast<std::size_t>(cruft);
}

void Rolling::Roll(uint64_t index) {
  assert(!IsPassthrough());
  std::size_t amount;
  if (file_end_ - (index + file_begin_) > static_cast<uint64_t>(block_)) {
    amount = block_;
    current_end_ = index + amount - read_bound_;
  } else {
    amount = file_end_ - (index + file_begin_);
    current_end_ = index + amount;
  }
  ptr_ = static_cast<uint8_t*>(ExtractNonRolling(mem_, index, amount)) - index;
  current_begin_ = index;
}

} // namespace util
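A small worked check (not from the commit) of the RoundUpPow2 formula used above: ((value - 1) & ~(mult - 1)) + mult rounds a nonzero value up to the next multiple of the power-of-two mult, leaving exact multiples unchanged.

#include <cassert>
#include <cstddef>

// Same arithmetic as RoundUpPow2 in mmap.cc; mult must be a power of two.
static std::size_t RoundUpPow2Demo(std::size_t value, std::size_t mult) {
  return ((value - 1) & ~(mult - 1)) + mult;
}

int main() {
  assert(RoundUpPow2Demo(4096, 4096) == 4096);  // already aligned: unchanged
  assert(RoundUpPow2Demo(4097, 4096) == 8192);  // otherwise round up to the next multiple
  assert(RoundUpPow2Demo(1, 4096) == 4096);
  return 0;
}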
cpp/thirdpart/kenlm/util/mmap.hh (new file, mode 100644)

#ifndef UTIL_MMAP_H
#define UTIL_MMAP_H
// Utilities for mmaped files.

#include <cstddef>
#include <limits>

#include <stdint.h>
#include <sys/types.h>

namespace util {

class scoped_fd;

std::size_t SizePage();

// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here.
class scoped_mmap {
  public:
    scoped_mmap() : data_((void*)-1), size_(0) {}
    scoped_mmap(void *data, std::size_t size) : data_(data), size_(size) {}
    ~scoped_mmap();

    void *get() const { return data_; }

    const char *begin() const { return reinterpret_cast<char*>(data_); }
    char *begin() { return reinterpret_cast<char*>(data_); }
    const char *end() const { return reinterpret_cast<char*>(data_) + size_; }
    char *end() { return reinterpret_cast<char*>(data_) + size_; }
    std::size_t size() const { return size_; }

    void reset(void *data, std::size_t size) {
      scoped_mmap other(data_, size_);
      data_ = data;
      size_ = size;
    }

    void reset() { reset((void*)-1, 0); }

    void *steal() {
      void *ret = data_;
      data_ = (void*)-1;
      size_ = 0;
      return ret;
    }

  private:
    void *data_;
    std::size_t size_;

    scoped_mmap(const scoped_mmap &);
    scoped_mmap &operator=(const scoped_mmap &);
};

/* For when the memory might come from mmap or malloc.  Uses NULL and 0 for
 * blanks even though mmap signals errors with (void*)-1.
 */
class scoped_memory {
  public:
    typedef enum {
      // TODO: store rounded up size instead?
      MMAP_ROUND_1G_ALLOCATED, // The size was rounded up for a 1GB page.  Do the same before munmap.
      MMAP_ROUND_2M_ALLOCATED, // The size was rounded up for a 2MB page.  Do the same before munmap.
      MMAP_ROUND_PAGE_ALLOCATED, // The size was rounded up to a multiple of the default page size.  Do the same before munmap.
      MMAP_ALLOCATED, // munmap
      MALLOC_ALLOCATED, // free
      NONE_ALLOCATED // nothing to free (though there can be something here if it's owned by somebody else).
    } Alloc;

    scoped_memory(void *data, std::size_t size, Alloc source)
      : data_(data), size_(size), source_(source) {}

    scoped_memory() : data_(NULL), size_(0), source_(NONE_ALLOCATED) {}

    // Calls HugeMalloc
    scoped_memory(std::size_t to, bool zero_new);

#if __cplusplus >= 201103L
    scoped_memory(scoped_memory &&from) noexcept
      : data_(from.data_), size_(from.size_), source_(from.source_) {
      from.steal();
    }
#endif

    ~scoped_memory() { reset(); }

    void *get() const { return data_; }

    const char *begin() const { return reinterpret_cast<char*>(data_); }
    char *begin() { return reinterpret_cast<char*>(data_); }
    const char *end() const { return reinterpret_cast<char*>(data_) + size_; }
    char *end() { return reinterpret_cast<char*>(data_) + size_; }
    std::size_t size() const { return size_; }

    Alloc source() const { return source_; }

    void reset() { reset(NULL, 0, NONE_ALLOCATED); }

    void reset(void *data, std::size_t size, Alloc from);

    void *steal() {
      void *ret = data_;
      data_ = NULL;
      size_ = 0;
      source_ = NONE_ALLOCATED;
      return ret;
    }

  private:
    void *data_;
    std::size_t size_;

    Alloc source_;

    scoped_memory(const scoped_memory &);
    scoped_memory &operator=(const scoped_memory &);
};

extern const int kFileFlags;

// Cross-platform, error-checking wrapper for mmap().
void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0);

// msync wrapper
void SyncOrThrow(void *start, size_t length);

// Cross-platform, error-checking wrapper for munmap().
void UnmapOrThrow(void *start, size_t length);

// Allocate memory, promising that all/vast majority of it will be used.  Tries
// hard to use huge pages on Linux.
// If you want zeroed memory, pass zeroed = true.
void HugeMalloc(std::size_t size, bool zeroed, scoped_memory &to);

// Reallocates memory ala realloc but with option to zero the new memory.
// On Linux, the memory can come from anonymous mmap or malloc/calloc.
// On non-Linux, only malloc/calloc is supported.
//
// To summarize, any memory from HugeMalloc or HugeRealloc can be resized with
// this.
void HugeRealloc(std::size_t size, bool new_zeroed, scoped_memory &mem);

enum LoadMethod {
  // mmap with no prepopulate
  LAZY,
  // On linux, pass MAP_POPULATE to mmap.
  POPULATE_OR_LAZY,
  // Populate on Linux.  malloc and read on non-Linux.
  POPULATE_OR_READ,
  // malloc and read.
  READ,
  // malloc and read in parallel (recommended for Lustre)
  PARALLEL_READ,
};

void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out);

// Open file name with mmap of size bytes, all of which are initially zero.
void *MapZeroedWrite(int fd, std::size_t size);
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file);

// Forward rolling memory map with no overlap.
class Rolling {
  public:
    Rolling() {}

    explicit Rolling(void *data) { Init(data); }

    Rolling(const Rolling &copy_from, uint64_t increase = 0);
    Rolling &operator=(const Rolling &copy_from);

    // For an actual rolling mmap.
    explicit Rolling(int fd, bool for_write, std::size_t block, std::size_t read_bound, uint64_t offset, uint64_t amount);

    // For a static mapping
    void Init(void *data) {
      ptr_ = data;
      current_end_ = std::numeric_limits<uint64_t>::max();
      current_begin_ = 0;
      // Mark as a pass-through.
      fd_ = -1;
    }

    void IncreaseBase(uint64_t by) {
      file_begin_ += by;
      ptr_ = static_cast<uint8_t*>(ptr_) + by;
      if (!IsPassthrough()) current_end_ = 0;
    }

    void DecreaseBase(uint64_t by) {
      file_begin_ -= by;
      ptr_ = static_cast<uint8_t*>(ptr_) - by;
      if (!IsPassthrough()) current_end_ = 0;
    }

    void *ExtractNonRolling(scoped_memory &out, uint64_t index, std::size_t size);

    // Returns base pointer
    void *get() const { return ptr_; }

    // Returns base pointer.
    void *CheckedBase(uint64_t index) {
      if (index >= current_end_ || index < current_begin_) {
        Roll(index);
      }
      return ptr_;
    }

    // Returns indexed pointer.
    void *CheckedIndex(uint64_t index) {
      return static_cast<uint8_t*>(CheckedBase(index)) + index;
    }

  private:
    void Roll(uint64_t index);

    // True if this is just a thin wrapper on a pointer.
    bool IsPassthrough() const { return fd_ == -1; }

    void *ptr_;
    uint64_t current_begin_;
    uint64_t current_end_;

    scoped_memory mem_;

    int fd_;
    uint64_t file_begin_;
    uint64_t file_end_;
    bool for_write_;
    std::size_t block_;
    std::size_t read_bound_;
};

} // namespace util

#endif // UTIL_MMAP_H
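A usage sketch under my own assumptions (not part of the commit): scoped_memory owns whatever HugeMalloc hands it, and HugeRealloc can grow that allocation later, preferring huge pages on Linux.

#include "mmap.hh"

void GrowExample() {
  util::scoped_memory mem;
  util::HugeMalloc(1 << 22, /*zeroed=*/true, mem);     // 4 MB, zero-filled
  // ... use mem.get() and mem.size() ...
  util::HugeRealloc(1 << 23, /*zero_new=*/true, mem);  // grow to 8 MB, new tail zeroed
}  // mem's destructor unmaps or frees according to how it was allocated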
cpp/thirdpart/kenlm/util/multi_intersection.hh
0 → 100644
View file @
688b6eac
#ifndef UTIL_MULTI_INTERSECTION_H
#define UTIL_MULTI_INTERSECTION_H

#include <boost/optional.hpp>
#include <boost/range/iterator_range.hpp>

#include <algorithm>
#include <functional>
#include <vector>

namespace util {

namespace detail {
template <class Range> struct RangeLessBySize : public std::binary_function<const Range &, const Range &, bool> {
  bool operator()(const Range &left, const Range &right) const {
    return left.size() < right.size();
  }
};

/* Takes sets specified by their iterators and a boost::optional containing
 * the lowest intersection if any.  Each set must be sorted in increasing
 * order.  sets is changed to truncate the beginning of each sequence to the
 * location of the match or an empty set.  Precondition: sets is not empty
 * since the intersection over null is the universe and this function does not
 * know the universe.
 */
template <class Iterator, class Less>
boost::optional<typename std::iterator_traits<Iterator>::value_type> FirstIntersectionSorted(std::vector<boost::iterator_range<Iterator> > &sets, const Less &less = std::less<typename std::iterator_traits<Iterator>::value_type>()) {
  typedef std::vector<boost::iterator_range<Iterator> > Sets;
  typedef typename std::iterator_traits<Iterator>::value_type Value;

  assert(!sets.empty());

  if (sets.front().empty()) return boost::optional<Value>();
  // Possibly suboptimal to copy for general Value; makes unsigned int go slightly faster.
  Value highest(sets.front().front());
  for (typename Sets::iterator i(sets.begin()); i != sets.end(); ) {
    i->advance_begin(std::lower_bound(i->begin(), i->end(), highest, less) - i->begin());
    if (i->empty()) return boost::optional<Value>();
    if (less(highest, i->front())) {
      highest = i->front();
      // start over
      i = sets.begin();
    } else {
      ++i;
    }
  }
  return boost::optional<Value>(highest);
}

} // namespace detail

template <class Iterator, class Less>
boost::optional<typename std::iterator_traits<Iterator>::value_type> FirstIntersection(std::vector<boost::iterator_range<Iterator> > &sets, const Less less) {
  assert(!sets.empty());

  std::sort(sets.begin(), sets.end(), detail::RangeLessBySize<boost::iterator_range<Iterator> >());
  return detail::FirstIntersectionSorted(sets, less);
}

template <class Iterator>
boost::optional<typename std::iterator_traits<Iterator>::value_type> FirstIntersection(std::vector<boost::iterator_range<Iterator> > &sets) {
  return FirstIntersection(sets, std::less<typename std::iterator_traits<Iterator>::value_type>());
}

template <class Iterator, class Output, class Less>
void AllIntersection(std::vector<boost::iterator_range<Iterator> > &sets, Output &out, const Less less) {
  typedef typename std::iterator_traits<Iterator>::value_type Value;
  assert(!sets.empty());

  std::sort(sets.begin(), sets.end(), detail::RangeLessBySize<boost::iterator_range<Iterator> >());
  boost::optional<Value> ret;
  for (boost::optional<Value> ret; (ret = detail::FirstIntersectionSorted(sets, less)); sets.front().advance_begin(1)) {
    out(*ret);
  }
}

template <class Iterator, class Output>
void AllIntersection(std::vector<boost::iterator_range<Iterator> > &sets, Output &out) {
  AllIntersection(sets, out, std::less<typename std::iterator_traits<Iterator>::value_type>());
}

} // namespace util

#endif // UTIL_MULTI_INTERSECTION_H
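A minimal usage sketch of the helpers above, not part of the header: the Collect functor and the sample arrays are illustrative assumptions. FirstIntersection returns the lowest common value (and truncates the input ranges as documented), while AllIntersection reports every common value through an output functor.

// Illustrative only; assumes multi_intersection.hh is on the include path.
#include "multi_intersection.hh"
#include <iostream>
#include <vector>

struct Collect {  // hypothetical output functor
  void operator()(unsigned int v) const { std::cout << v << '\n'; }
};

int main() {
  unsigned int a[] = {1, 3, 4, 17, 22};
  unsigned int b[] = {2, 4, 12, 17};
  std::vector<boost::iterator_range<const unsigned int*> > sets;
  sets.push_back(boost::iterator_range<const unsigned int*>(a, a + 5));
  sets.push_back(boost::iterator_range<const unsigned int*>(b, b + 4));

  // Lowest common value; note the call truncates the ranges held in sets.
  boost::optional<unsigned int> first = util::FirstIntersection(sets);
  if (first) std::cout << "lowest: " << *first << '\n';  // 4

  // Rebuild the ranges, then report every common value (4 then 17).
  sets.clear();
  sets.push_back(boost::iterator_range<const unsigned int*>(a, a + 5));
  sets.push_back(boost::iterator_range<const unsigned int*>(b, b + 4));
  Collect out;
  util::AllIntersection(sets, out);
  return 0;
}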
cpp/thirdpart/kenlm/util/multi_intersection_test.cc
0 → 100644
View file @
688b6eac
#include "multi_intersection.hh"
#define BOOST_TEST_MODULE MultiIntersectionTest
#include <boost/test/unit_test.hpp>
namespace
util
{
namespace
{
BOOST_AUTO_TEST_CASE
(
Empty
)
{
std
::
vector
<
boost
::
iterator_range
<
const
unsigned
int
*>
>
sets
;
sets
.
push_back
(
boost
::
iterator_range
<
const
unsigned
int
*>
(
static_cast
<
const
unsigned
int
*>
(
NULL
),
static_cast
<
const
unsigned
int
*>
(
NULL
)));
BOOST_CHECK
(
!
FirstIntersection
(
sets
));
}
BOOST_AUTO_TEST_CASE
(
Single
)
{
std
::
vector
<
unsigned
int
>
nums
;
nums
.
push_back
(
1
);
nums
.
push_back
(
4
);
nums
.
push_back
(
100
);
std
::
vector
<
boost
::
iterator_range
<
std
::
vector
<
unsigned
int
>::
const_iterator
>
>
sets
;
sets
.
push_back
(
nums
);
boost
::
optional
<
unsigned
int
>
ret
(
FirstIntersection
(
sets
));
BOOST_REQUIRE
(
ret
);
BOOST_CHECK_EQUAL
(
static_cast
<
unsigned
int
>
(
1
),
*
ret
);
}
template
<
class
T
,
unsigned
int
len
>
boost
::
iterator_range
<
const
T
*>
RangeFromArray
(
const
T
(
&
arr
)[
len
])
{
return
boost
::
iterator_range
<
const
T
*>
(
arr
,
arr
+
len
);
}
BOOST_AUTO_TEST_CASE
(
MultiNone
)
{
unsigned
int
nums0
[]
=
{
1
,
3
,
4
,
22
};
unsigned
int
nums1
[]
=
{
2
,
5
,
12
};
unsigned
int
nums2
[]
=
{
4
,
17
};
std
::
vector
<
boost
::
iterator_range
<
const
unsigned
int
*>
>
sets
;
sets
.
push_back
(
RangeFromArray
(
nums0
));
sets
.
push_back
(
RangeFromArray
(
nums1
));
sets
.
push_back
(
RangeFromArray
(
nums2
));
BOOST_CHECK
(
!
FirstIntersection
(
sets
));
}
BOOST_AUTO_TEST_CASE
(
MultiOne
)
{
unsigned
int
nums0
[]
=
{
1
,
3
,
4
,
17
,
22
};
unsigned
int
nums1
[]
=
{
2
,
5
,
12
,
17
};
unsigned
int
nums2
[]
=
{
4
,
17
};
std
::
vector
<
boost
::
iterator_range
<
const
unsigned
int
*>
>
sets
;
sets
.
push_back
(
RangeFromArray
(
nums0
));
sets
.
push_back
(
RangeFromArray
(
nums1
));
sets
.
push_back
(
RangeFromArray
(
nums2
));
boost
::
optional
<
unsigned
int
>
ret
(
FirstIntersection
(
sets
));
BOOST_REQUIRE
(
ret
);
BOOST_CHECK_EQUAL
(
static_cast
<
unsigned
int
>
(
17
),
*
ret
);
}
}
// namespace
}
// namespace util
cpp/thirdpart/kenlm/util/murmur_hash.cc
0 → 100644
View file @
688b6eac
/* Downloaded from http://sites.google.com/site/murmurhash/ which says "All
 * code is released to the public domain. For business purposes, Murmurhash is
 * under the MIT license."
 * This is modified from the original:
 * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.
 * length changed to unsigned int.
 * placed in namespace util
 * add MurmurHashNative
 * default option = 0 for seed
 * ARM port from NICT
 */
#include "murmur_hash.hh"
#include <cstring>

namespace util {

//-----------------------------------------------------------------------------
// MurmurHash2, 64-bit versions, by Austin Appleby

// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
// and endian-ness issues if used across multiple platforms.

// 64-bit hash for 64-bit platforms

uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed) {
  const uint64_t m = 0xc6a4a7935bd1e995ULL;
  const int r = 47;

  uint64_t h = seed ^ (len * m);

#if defined(__arm) || defined(__arm__)
  const size_t ksize = sizeof(uint64_t);
  const unsigned char * data = (const unsigned char *)key;
  const unsigned char * end = data + (std::size_t)(len/8) * ksize;
#else
  const uint64_t * data = (const uint64_t *)key;
  const uint64_t * end = data + (len/8);
#endif

  while (data != end) {
#if defined(__arm) || defined(__arm__)
    uint64_t k;
    memcpy(&k, data, ksize);
    data += ksize;
#else
    uint64_t k = *data++;
#endif

    k *= m;
    k ^= k >> r;
    k *= m;

    h ^= k;
    h *= m;
  }

  const unsigned char * data2 = (const unsigned char*)data;

  switch (len & 7) {
  case 7: h ^= uint64_t(data2[6]) << 48;
  case 6: h ^= uint64_t(data2[5]) << 40;
  case 5: h ^= uint64_t(data2[4]) << 32;
  case 4: h ^= uint64_t(data2[3]) << 24;
  case 3: h ^= uint64_t(data2[2]) << 16;
  case 2: h ^= uint64_t(data2[1]) << 8;
  case 1: h ^= uint64_t(data2[0]);
          h *= m;
  };

  h ^= h >> r;
  h *= m;
  h ^= h >> r;

  return h;
}

// 64-bit hash for 32-bit platforms

uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed) {
  const unsigned int m = 0x5bd1e995;
  const int r = 24;

  unsigned int h1 = seed ^ len;
  unsigned int h2 = 0;

#if defined(__arm) || defined(__arm__)
  size_t ksize = sizeof(unsigned int);
  const unsigned char * data = (const unsigned char *)key;
#else
  const unsigned int * data = (const unsigned int *)key;
#endif

  unsigned int k1, k2;
  while (len >= 8) {
#if defined(__arm) || defined(__arm__)
    memcpy(&k1, data, ksize);
    data += ksize;
    memcpy(&k2, data, ksize);
    data += ksize;
#else
    k1 = *data++;
    k2 = *data++;
#endif

    k1 *= m; k1 ^= k1 >> r; k1 *= m;
    h1 *= m; h1 ^= k1;
    len -= 4;

    k2 *= m; k2 ^= k2 >> r; k2 *= m;
    h2 *= m; h2 ^= k2;
    len -= 4;
  }

  if (len >= 4) {
#if defined(__arm) || defined(__arm__)
    memcpy(&k1, data, ksize);
    data += ksize;
#else
    k1 = *data++;
#endif
    k1 *= m; k1 ^= k1 >> r; k1 *= m;
    h1 *= m; h1 ^= k1;
    len -= 4;
  }

  switch (len) {
  case 3: h2 ^= ((unsigned char*)data)[2] << 16;
  case 2: h2 ^= ((unsigned char*)data)[1] << 8;
  case 1: h2 ^= ((unsigned char*)data)[0];
      h2 *= m;
  };

  h1 ^= h2 >> 18; h1 *= m;
  h2 ^= h1 >> 22; h2 *= m;
  h1 ^= h2 >> 17; h1 *= m;
  h2 ^= h1 >> 19; h2 *= m;

  uint64_t h = h1;

  h = (h << 32) | h2;

  return h;
}

// Trick to test for 64-bit architecture at compile time.
namespace {
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-function"
#endif
template <unsigned L> inline uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, uint64_t seed) {
  return MurmurHash64A(key, len, seed);
}
template <> inline uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, uint64_t seed) {
  return MurmurHash64B(key, len, seed);
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif
} // namespace

uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed) {
  return MurmurHashNativeBackend<sizeof(void*)>(key, len, seed);
}

} // namespace util
cpp/thirdpart/kenlm/util/murmur_hash.hh
0 → 100644
View file @
688b6eac
#ifndef UTIL_MURMUR_HASH_H
#define UTIL_MURMUR_HASH_H

#include <cstddef>
#include <stdint.h>

namespace util {

// 64-bit machine version
uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0);
// 32-bit machine version (not the same function as above)
uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0);
// Use the version for this arch.  Because the values differ across
// architectures, really only use it for in-memory structures.
uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0);

} // namespace util

#endif // UTIL_MURMUR_HASH_H
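A minimal sketch of calling the header above; the sample string is an illustrative assumption. MurmurHashNative dispatches to the 64-bit or 32-bit variant at compile time, so the value is only stable within one architecture.

// Illustrative only; hashes a byte buffer with the architecture-native variant.
#include "murmur_hash.hh"
#include <cstring>
#include <iostream>

int main() {
  const char text[] = "language model";
  uint64_t h = util::MurmurHashNative(text, std::strlen(text));  // seed defaults to 0
  std::cout << std::hex << h << std::endl;
  return 0;
}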
cpp/thirdpart/kenlm/util/parallel_read.cc
0 → 100644
View file @
688b6eac
#include "parallel_read.hh"
#include "file.hh"
#ifdef WITH_THREADS
#include "thread_pool.hh"
namespace
util
{
namespace
{
class
Reader
{
public:
explicit
Reader
(
int
fd
)
:
fd_
(
fd
)
{}
struct
Request
{
void
*
to
;
std
::
size_t
size
;
uint64_t
offset
;
bool
operator
==
(
const
Request
&
other
)
const
{
return
(
to
==
other
.
to
)
&&
(
size
==
other
.
size
)
&&
(
offset
==
other
.
offset
);
}
};
void
operator
()(
const
Request
&
request
)
{
util
::
ErsatzPRead
(
fd_
,
request
.
to
,
request
.
size
,
request
.
offset
);
}
private:
int
fd_
;
};
}
// namespace
void
ParallelRead
(
int
fd
,
void
*
to
,
std
::
size_t
amount
,
uint64_t
offset
)
{
Reader
::
Request
poison
;
poison
.
to
=
NULL
;
poison
.
size
=
0
;
poison
.
offset
=
0
;
unsigned
threads
=
boost
::
thread
::
hardware_concurrency
();
if
(
!
threads
)
threads
=
2
;
ThreadPool
<
Reader
>
pool
(
2
/* don't need much of a queue */
,
threads
,
fd
,
poison
);
const
std
::
size_t
kBatch
=
1ULL
<<
25
;
// 32 MB
Reader
::
Request
request
;
request
.
to
=
to
;
request
.
size
=
kBatch
;
request
.
offset
=
offset
;
for
(;
amount
>
kBatch
;
amount
-=
kBatch
)
{
pool
.
Produce
(
request
);
request
.
to
=
reinterpret_cast
<
uint8_t
*>
(
request
.
to
)
+
kBatch
;
request
.
offset
+=
kBatch
;
}
request
.
size
=
amount
;
if
(
request
.
size
)
{
pool
.
Produce
(
request
);
}
}
}
// namespace util
#else // WITH_THREADS
namespace
util
{
void
ParallelRead
(
int
fd
,
void
*
to
,
std
::
size_t
amount
,
uint64_t
offset
)
{
util
::
ErsatzPRead
(
fd
,
to
,
amount
,
offset
);
}
}
// namespace util
#endif
cpp/thirdpart/kenlm/util/parallel_read.hh
0 → 100644
View file @
688b6eac
#ifndef UTIL_PARALLEL_READ__
#define UTIL_PARALLEL_READ__

/* Read pieces of a file in parallel.  This has a very specific use case:
 * reading files from Lustre is CPU bound so multiple threads actually
 * increases throughput.  Speed matters when an LM takes a terabyte.
 */

#include <cstddef>
#include <stdint.h>

namespace util {
void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset);
} // namespace util

#endif // UTIL_PARALLEL_READ__
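A minimal sketch of driving ParallelRead, assuming the OpenReadOrThrow and SizeOrThrow helpers from util/file.hh and a hypothetical file name; when built without WITH_THREADS the call simply falls back to a single ErsatzPRead.

// Illustrative only; loads an entire file into a buffer.
#include "parallel_read.hh"
#include "file.hh"
#include <vector>

int main() {
  util::scoped_fd fd(util::OpenReadOrThrow("model.bin"));   // hypothetical file
  std::vector<char> buffer(util::SizeOrThrow(fd.get()));
  if (!buffer.empty())
    util::ParallelRead(fd.get(), &buffer[0], buffer.size(), 0 /* offset */);
  return 0;
}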
cpp/thirdpart/kenlm/util/pcqueue.hh
0 → 100644
View file @
688b6eac
#ifndef UTIL_PCQUEUE_H
#define UTIL_PCQUEUE_H

#include "exception.hh"

#include <boost/interprocess/sync/interprocess_semaphore.hpp>
#include <boost/scoped_array.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/utility.hpp>

#include <cerrno>

#ifdef __APPLE__
#include <mach/semaphore.h>
#include <mach/task.h>
#include <mach/mach_traps.h>
#include <mach/mach.h>
#endif // __APPLE__

namespace util {

/* OS X Maverick and Boost interprocess were doing "Function not implemented."
 * So this is my own wrapper around the mach kernel APIs.
 */
#ifdef __APPLE__

#define MACH_CALL(call) UTIL_THROW_IF(KERN_SUCCESS != (call), Exception, "Mach call failure")

class Semaphore {
  public:
    explicit Semaphore(int value) : task_(mach_task_self()) {
      MACH_CALL(semaphore_create(task_, &back_, SYNC_POLICY_FIFO, value));
    }

    ~Semaphore() {
      MACH_CALL(semaphore_destroy(task_, back_));
    }

    void wait() {
      MACH_CALL(semaphore_wait(back_));
    }

    void post() {
      MACH_CALL(semaphore_signal(back_));
    }

  private:
    semaphore_t back_;
    task_t task_;
};

inline void WaitSemaphore(Semaphore &semaphore) {
  semaphore.wait();
}

#else
typedef boost::interprocess::interprocess_semaphore Semaphore;

inline void WaitSemaphore(Semaphore &on) {
  while (1) {
    try {
      on.wait();
      break;
    }
    catch (boost::interprocess::interprocess_exception &e) {
      if (e.get_native_error() != EINTR) {
        throw;
      }
    }
  }
}

#endif // __APPLE__

/**
 * Producer consumer queue safe for multiple producers and multiple consumers.
 * T must be default constructable and have operator=.
 * The value is copied twice for Consume(T &out) or three times for Consume(),
 * so larger objects should be passed via pointer.
 * Strong exception guarantee if operator= throws.  Undefined if semaphores throw.
 */
template <class T> class PCQueue : boost::noncopyable {
  public:
    explicit PCQueue(size_t size)
      : empty_(size), used_(0),
        storage_(new T[size]),
        end_(storage_.get() + size),
        produce_at_(storage_.get()),
        consume_at_(storage_.get()) {}

    // Add a value to the queue.
    void Produce(const T &val) {
      WaitSemaphore(empty_);
      {
        boost::unique_lock<boost::mutex> produce_lock(produce_at_mutex_);
        try {
          *produce_at_ = val;
        }
        catch (...) {
          empty_.post();
          throw;
        }
        if (++produce_at_ == end_) produce_at_ = storage_.get();
      }
      used_.post();
    }

    // Consume a value, assigning it to out.
    T& Consume(T &out) {
      WaitSemaphore(used_);
      {
        boost::unique_lock<boost::mutex> consume_lock(consume_at_mutex_);
        try {
          out = *consume_at_;
        }
        catch (...) {
          used_.post();
          throw;
        }
        if (++consume_at_ == end_) consume_at_ = storage_.get();
      }
      empty_.post();
      return out;
    }

    // Convenience version of Consume that copies the value to return.
    // The other version is faster.
    T Consume() {
      T ret;
      Consume(ret);
      return ret;
    }

  private:
    // Number of empty spaces in storage_.
    Semaphore empty_;
    // Number of occupied spaces in storage_.
    Semaphore used_;

    boost::scoped_array<T> storage_;

    T *const end_;

    // Index for next write in storage_.
    T *produce_at_;
    boost::mutex produce_at_mutex_;

    // Index for next read from storage_.
    T *consume_at_;
    boost::mutex consume_at_mutex_;
};

} // namespace util

#endif // UTIL_PCQUEUE_H
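A minimal sketch of the queue used across two threads; the Produce helper, the queue capacity, and the -1 poison value are illustrative assumptions, not part of the header. Produce blocks when the ring is full and Consume blocks when it is empty.

// Illustrative only; one producer thread feeding the main thread.
#include "pcqueue.hh"
#include <boost/thread/thread.hpp>
#include <iostream>

namespace {
void Produce(util::PCQueue<int> *queue) {
  for (int i = 0; i < 100; ++i) queue->Produce(i);
  queue->Produce(-1);  // poison value signalling the end
}
} // namespace

int main() {
  util::PCQueue<int> queue(16);            // 16 slots in the ring buffer
  boost::thread producer(Produce, &queue);
  for (int got; (got = queue.Consume()) != -1; ) {
    std::cout << got << '\n';
  }
  producer.join();
  return 0;
}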
cpp/thirdpart/kenlm/util/pcqueue_test.cc
0 → 100644
View file @
688b6eac
#include "pcqueue.hh"
#define BOOST_TEST_MODULE PCQueueTest
#include <boost/test/unit_test.hpp>
namespace
util
{
namespace
{
BOOST_AUTO_TEST_CASE
(
SingleThread
)
{
PCQueue
<
int
>
queue
(
10
);
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
queue
.
Produce
(
i
);
}
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
BOOST_CHECK_EQUAL
(
i
,
queue
.
Consume
());
}
}
}
}
// namespace util
cpp/thirdpart/kenlm/util/pool.cc
0 → 100644
View file @
688b6eac
#include "pool.hh"
#include "scoped.hh"
#include <cstdlib>
#include <algorithm>
namespace
util
{
Pool
::
Pool
()
{
current_
=
NULL
;
current_end_
=
NULL
;
}
Pool
::~
Pool
()
{
FreeAll
();
}
void
Pool
::
FreeAll
()
{
for
(
std
::
vector
<
void
*>::
const_iterator
i
(
free_list_
.
begin
());
i
!=
free_list_
.
end
();
++
i
)
{
free
(
*
i
);
}
free_list_
.
clear
();
current_
=
NULL
;
current_end_
=
NULL
;
}
void
*
Pool
::
More
(
std
::
size_t
size
)
{
std
::
size_t
amount
=
std
::
max
(
static_cast
<
size_t
>
(
32
)
<<
free_list_
.
size
(),
size
);
uint8_t
*
ret
=
static_cast
<
uint8_t
*>
(
MallocOrThrow
(
amount
));
free_list_
.
push_back
(
ret
);
current_
=
ret
+
size
;
current_end_
=
ret
+
amount
;
return
ret
;
}
}
// namespace util
cpp/thirdpart/kenlm/util/pool.hh
0 → 100644
View file @
688b6eac
#ifndef UTIL_POOL_H
#define UTIL_POOL_H

#include <cassert>
#include <cstring>
#include <vector>
#include <stdint.h>

namespace util {

/* Very simple pool.  It can only allocate memory.  And all of the memory it
 * allocates must be freed at the same time.
 */
class Pool {
  public:
    Pool();

    ~Pool();

    void *Allocate(std::size_t size) {
      void *ret = current_;
      current_ += size;
      if (current_ > current_end_) {
        ret = More(size);
      }
#ifdef DEBUG
      base_check_ = ret;
#endif
      return ret;
    }

    /** Extend (or contract) the most recent allocation.
     * @param base The base pointer of the allocation.  This must have been
     * returned by the MOST RECENT call to Allocate or Continue.
     * @param additional Change in the size.
     *
     * In most cases, more memory from the same page is used, in which case
     * base is unchanged and the function returns false.
     * If the page runs out, a new page is created and the memory (from base)
     * is copied.  The function returns true.
     *
     * @return Whether the base had to be changed due to allocating a page.
     */
    bool Continue(void *&base, std::ptrdiff_t additional) {
#ifdef DEBUG
      assert(base == base_check_);
#endif
      current_ += additional;
      if (current_ > current_end_) {
        std::size_t new_total = current_ - static_cast<uint8_t*>(base);
        void *new_base = More(new_total);
        std::memcpy(new_base, base, new_total - additional);
        base = new_base;
#ifdef DEBUG
        base_check_ = base;
#endif
        return true;
      }
      return false;
    }

    void FreeAll();

  private:
    void *More(std::size_t size);

    std::vector<void *> free_list_;

    uint8_t *current_, *current_end_;

#ifdef DEBUG
    // For debugging, check that Continue came from the most recent call.
    void *base_check_;
#endif // DEBUG

    // no copying
    Pool(const Pool &);
    Pool &operator=(const Pool &);
};

/**
 * Pool designed to allow limited freeing.
 * Keeps a linked list of free elements in the free spaces.
 * Will not reduce in size until FreeAll is called.
 */
class FreePool {
  public:
    explicit FreePool(std::size_t element_size)
      : free_list_(NULL),
        element_size_(element_size),
        padded_size_(std::max(element_size_, sizeof(void*))) {}

    void *Allocate() {
      if (free_list_) {
        void *ret = free_list_;
        free_list_ = *reinterpret_cast<void**>(free_list_);
        return ret;
      } else {
        return backing_.Allocate(padded_size_);
      }
    }

    void Free(void *ptr) {
      *reinterpret_cast<void**>(ptr) = free_list_;
      free_list_ = ptr;
    }

    std::size_t ElementSize() const { return element_size_; }

  private:
    void *free_list_;

    Pool backing_;

    const std::size_t element_size_;
    const std::size_t padded_size_;
};

} // namespace util

#endif // UTIL_POOL_H
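A minimal sketch of both allocators above; the sizes and string are illustrative assumptions. Pool is a bump allocator whose memory is released all at once by FreeAll, while FreePool recycles fixed-size slots through an intrusive free list.

// Illustrative only; bump allocation with Pool, recycling with FreePool.
#include "pool.hh"
#include <cstring>

int main() {
  util::Pool pool;
  char *word = static_cast<char*>(pool.Allocate(6));
  std::memcpy(word, "hello", 6);
  // Grow the most recent allocation; returns true if it had to move to a new page.
  void *base = word;
  pool.Continue(base, 4);
  pool.FreeAll();                        // releases every allocation at once

  util::FreePool recycler(sizeof(double));
  void *a = recycler.Allocate();
  recycler.Free(a);                      // slot goes back on the free list
  void *b = recycler.Allocate();         // typically reuses the same slot
  (void)b;
  return 0;
}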
cpp/thirdpart/kenlm/util/probing_hash_table.hh
0 → 100644
View file @
688b6eac
#ifndef UTIL_PROBING_HASH_TABLE_H
#define UTIL_PROBING_HASH_TABLE_H

#include "exception.hh"
#include "mmap.hh"

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

#include <cassert>
#include <stdint.h>

namespace util {

/* Thrown when table grows too large */
class ProbingSizeException : public Exception {
  public:
    ProbingSizeException() throw() {}
    ~ProbingSizeException() throw() {}
};

// std::identity is an SGI extension :-(
struct IdentityHash {
  template <class T> T operator()(T arg) const { return arg; }
};

class DivMod {
  public:
    explicit DivMod(std::size_t buckets) : buckets_(buckets) {}

    static uint64_t RoundBuckets(uint64_t from) {
      return from;
    }

    template <class It> It Ideal(It begin, uint64_t hash) const {
      return begin + (hash % buckets_);
    }

    template <class BaseIt, class OutIt> void Next(BaseIt begin, BaseIt end, OutIt &it) const {
      if (++it == end) it = begin;
    }

    void Double() {
      buckets_ *= 2;
    }

  private:
    std::size_t buckets_;
};

class Power2Mod {
  public:
    explicit Power2Mod(std::size_t buckets) {
      UTIL_THROW_IF(!buckets || (((buckets - 1) & buckets)), ProbingSizeException, "Size " << buckets << " is not a power of 2.");
      mask_ = buckets - 1;
    }

    // Round up to next power of 2.
    static uint64_t RoundBuckets(uint64_t from) {
      --from;
      from |= from >> 1;
      from |= from >> 2;
      from |= from >> 4;
      from |= from >> 8;
      from |= from >> 16;
      from |= from >> 32;
      return from + 1;
    }

    template <class It> It Ideal(It begin, uint64_t hash) const {
      return begin + (hash & mask_);
    }

    template <class BaseIt, class OutIt> void Next(BaseIt begin, BaseIt /*end*/, OutIt &it) const {
      it = begin + ((it - begin + 1) & mask_);
    }

    void Double() {
      mask_ = (mask_ << 1) | 1;
    }

  private:
    std::size_t mask_;
};

template <class EntryT, class HashT, class EqualT> class AutoProbing;

/* Non-standard hash table
 * Buckets must be set at the beginning and must be greater than maximum number
 * of elements, else it throws ProbingSizeException.
 * Memory management and initialization is externalized to make it easier to
 * serialize these to disk and load them quickly.
 * Uses linear probing to find value.
 * Only insert and lookup operations.
 */
template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key>, class ModT = DivMod> class ProbingHashTable {
  public:
    typedef EntryT Entry;
    typedef typename Entry::Key Key;
    typedef const Entry *ConstIterator;
    typedef Entry *MutableIterator;
    typedef HashT Hash;
    typedef EqualT Equal;
    typedef ModT Mod;

    static uint64_t Size(uint64_t entries, float multiplier) {
      uint64_t buckets = Mod::RoundBuckets(std::max(entries + 1, static_cast<uint64_t>(multiplier * static_cast<float>(entries))));
      return buckets * sizeof(Entry);
    }

    // Must be assigned to later.
    ProbingHashTable() : mod_(1), entries_(0)
#ifdef DEBUG
      , initialized_(false)
#endif
    {}

    ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal())
      : begin_(reinterpret_cast<MutableIterator>(start)),
        end_(begin_ + allocated / sizeof(Entry)),
        buckets_(end_ - begin_),
        invalid_(invalid),
        hash_(hash_func),
        equal_(equal_func),
        mod_(end_ - begin_),
        entries_(0)
#ifdef DEBUG
        , initialized_(true)
#endif
    {}

    void Relocate(void *new_base) {
      begin_ = reinterpret_cast<MutableIterator>(new_base);
      end_ = begin_ + buckets_;
    }

    MutableIterator Ideal(const Key key) {
      return mod_.Ideal(begin_, hash_(key));
    }
    ConstIterator Ideal(const Key key) const {
      return mod_.Ideal(begin_, hash_(key));
    }

    template <class T> MutableIterator Insert(const T &t) {
#ifdef DEBUG
      assert(initialized_);
#endif
      UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
      return UncheckedInsert(t);
    }

    // Return true if the value was found (and not inserted).  This is consistent with Find but the opposite of hash_map!
    template <class T> bool FindOrInsert(const T &t, MutableIterator &out) {
#ifdef DEBUG
      assert(initialized_);
#endif
      for (MutableIterator i = Ideal(t.GetKey());; mod_.Next(begin_, end_, i)) {
        Key got(i->GetKey());
        if (equal_(got, t.GetKey())) { out = i; return true; }
        if (equal_(got, invalid_)) {
          UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
          *i = t;
          out = i;
          return false;
        }
      }
    }

    void FinishedInserting() {}

    // Don't change anything related to GetKey,
    template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) {
#ifdef DEBUG
      assert(initialized_);
#endif
      for (MutableIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) {
        Key got(i->GetKey());
        if (equal_(got, key)) { out = i; return true; }
        if (equal_(got, invalid_)) return false;
      }
    }

    // Like UnsafeMutableFind, but the key must be there.
    template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) {
      for (MutableIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) {
        Key got(i->GetKey());
        if (equal_(got, key)) { return i; }
        assert(!equal_(got, invalid_));
      }
    }

    // Iterator is both input and output.
    template <class Key> bool FindFromIdeal(const Key key, ConstIterator &i) const {
#ifdef DEBUG
      assert(initialized_);
#endif
      for (;; mod_.Next(begin_, end_, i)) {
        Key got(i->GetKey());
        if (equal_(got, key)) return true;
        if (equal_(got, invalid_)) return false;
      }
    }

    template <class Key> bool Find(const Key key, ConstIterator &out) const {
      out = Ideal(key);
      return FindFromIdeal(key, out);
    }

    // Like Find but we're sure it must be there.
    template <class Key> ConstIterator MustFind(const Key key) const {
      for (ConstIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) {
        Key got(i->GetKey());
        if (equal_(got, key)) { return i; }
        assert(!equal_(got, invalid_));
      }
    }

    void Clear() {
      Entry invalid;
      invalid.SetKey(invalid_);
      std::fill(begin_, end_, invalid);
      entries_ = 0;
    }

    // Return number of entries assuming no serialization went on.
    std::size_t SizeNoSerialization() const {
      return entries_;
    }

    // Return memory size expected by Double.
    std::size_t DoubleTo() const {
      return buckets_ * 2 * sizeof(Entry);
    }

    // Inform the table that it has double the amount of memory.
    // Pass clear_new = false if you are sure the new memory is initialized
    // properly (to invalid_) i.e. by mremap.
    void Double(void *new_base, bool clear_new = true) {
      begin_ = static_cast<MutableIterator>(new_base);
      MutableIterator old_end = begin_ + buckets_;
      buckets_ *= 2;
      end_ = begin_ + buckets_;
      mod_.Double();
      if (clear_new) {
        Entry invalid;
        invalid.SetKey(invalid_);
        std::fill(old_end, end_, invalid);
      }
      std::vector<Entry> rolled_over;
      // Move roll-over entries to a buffer because they might not roll over anymore.  This should be small.
      for (MutableIterator i = begin_; i != old_end && !equal_(i->GetKey(), invalid_); ++i) {
        rolled_over.push_back(*i);
        i->SetKey(invalid_);
      }
      /* Re-insert everything.  Entries might go backwards to take over a
       * recently opened gap, stay, move to new territory, or wrap around.  If
       * an entry wraps around, it might go to a pointer greater than i (which
       * can happen at the beginning) and it will be revisited to possibly fill
       * in a gap created later.
       */
      Entry temp;
      for (MutableIterator i = begin_; i != old_end; ++i) {
        if (!equal_(i->GetKey(), invalid_)) {
          temp = *i;
          i->SetKey(invalid_);
          UncheckedInsert(temp);
        }
      }
      // Put the roll-over entries back in.
      for (typename std::vector<Entry>::const_iterator i(rolled_over.begin()); i != rolled_over.end(); ++i) {
        UncheckedInsert(*i);
      }
    }

    // Mostly for tests, check consistency of every entry.
    void CheckConsistency() {
      MutableIterator last;
      for (last = end_ - 1; last >= begin_ && !equal_(last->GetKey(), invalid_); --last) {}
      UTIL_THROW_IF(last == begin_, ProbingSizeException, "Completely full");
      MutableIterator i;
      // Beginning can be wrap-arounds.
      for (i = begin_; !equal_(i->GetKey(), invalid_); ++i) {
        MutableIterator ideal = Ideal(i->GetKey());
        UTIL_THROW_IF(ideal > i && ideal <= last, Exception, "Inconsistency at position " << (i - begin_) << " should be at " << (ideal - begin_));
      }
      MutableIterator pre_gap = i;
      for (; i != end_; ++i) {
        if (equal_(i->GetKey(), invalid_)) {
          pre_gap = i;
          continue;
        }
        MutableIterator ideal = Ideal(i->GetKey());
        UTIL_THROW_IF(ideal > i || ideal <= pre_gap, Exception, "Inconsistency at position " << (i - begin_) << " with ideal " << (ideal - begin_));
      }
    }

    ConstIterator RawBegin() const {
      return begin_;
    }
    ConstIterator RawEnd() const {
      return end_;
    }

  private:
    friend class AutoProbing<Entry, Hash, Equal>;

    template <class T> MutableIterator UncheckedInsert(const T &t) {
      for (MutableIterator i(Ideal(t.GetKey()));; mod_.Next(begin_, end_, i)) {
        if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
      }
    }

    MutableIterator begin_;
    MutableIterator end_;
    std::size_t buckets_;
    Key invalid_;
    Hash hash_;
    Equal equal_;
    Mod mod_;

    std::size_t entries_;
#ifdef DEBUG
    bool initialized_;
#endif
};

// Resizable linear probing hash table.  This owns the memory.
template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class AutoProbing {
  private:
    typedef ProbingHashTable<EntryT, HashT, EqualT, Power2Mod> Backend;
  public:
    static std::size_t MemUsage(std::size_t size, float multiplier = 1.5) {
      return Backend::Size(size, multiplier);
    }

    typedef EntryT Entry;
    typedef typename Entry::Key Key;
    typedef const Entry *ConstIterator;
    typedef Entry *MutableIterator;
    typedef HashT Hash;
    typedef EqualT Equal;

    AutoProbing(std::size_t initial_size = 5, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) :
      allocated_(Backend::Size(initial_size, 1.2)), mem_(allocated_, KeyIsRawZero(invalid)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) {
      threshold_ = std::min<std::size_t>(backend_.buckets_ - 1, backend_.buckets_ * 0.9);
      if (!KeyIsRawZero(invalid)) {
        Clear();
      }
    }

    // Assumes that the key is unique.  Multiple insertions won't cause a failure, just inconsistent lookup.
    template <class T> MutableIterator Insert(const T &t) {
      ++backend_.entries_;
      DoubleIfNeeded();
      return backend_.UncheckedInsert(t);
    }

    template <class T> bool FindOrInsert(const T &t, MutableIterator &out) {
      DoubleIfNeeded();
      return backend_.FindOrInsert(t, out);
    }

    template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) {
      return backend_.UnsafeMutableFind(key, out);
    }

    template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) {
      return backend_.UnsafeMutableMustFind(key);
    }

    template <class Key> bool Find(const Key key, ConstIterator &out) const {
      return backend_.Find(key, out);
    }

    template <class Key> ConstIterator MustFind(const Key key) const {
      return backend_.MustFind(key);
    }

    std::size_t Size() const {
      return backend_.SizeNoSerialization();
    }

    void Clear() {
      backend_.Clear();
    }

    ConstIterator RawBegin() const {
      return backend_.RawBegin();
    }
    ConstIterator RawEnd() const {
      return backend_.RawEnd();
    }

  private:
    void DoubleIfNeeded() {
      if (UTIL_LIKELY(Size() < threshold_))
        return;
      HugeRealloc(backend_.DoubleTo(), KeyIsRawZero(backend_.invalid_), mem_);
      allocated_ = backend_.DoubleTo();
      backend_.Double(mem_.get(), !KeyIsRawZero(backend_.invalid_));
      threshold_ = std::min<std::size_t>(backend_.buckets_ - 1, backend_.buckets_ * 0.9);
    }

    bool KeyIsRawZero(const Key &key) {
      for (const uint8_t *i = reinterpret_cast<const uint8_t*>(&key); i < reinterpret_cast<const uint8_t*>(&key) + sizeof(Key); ++i) {
        if (*i) return false;
      }
      return true;
    }

    std::size_t allocated_;
    util::scoped_memory mem_;
    Backend backend_;
    std::size_t threshold_;
};

} // namespace util

#endif // UTIL_PROBING_HASH_TABLE_H
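A minimal sketch of the resizable AutoProbing wrapper above; the CountEntry type and the sample string are illustrative assumptions. The entry type must provide Key, GetKey(), and SetKey(), and the default invalid key of 0 means real keys must be non-zero.

// Illustrative only; counts keys in a resizable linear probing table.
#include "probing_hash_table.hh"
#include "murmur_hash.hh"
#include <iostream>

struct CountEntry {
  typedef uint64_t Key;
  uint64_t key;
  unsigned count;
  uint64_t GetKey() const { return key; }
  void SetKey(uint64_t to) { key = to; }
};

int main() {
  util::AutoProbing<CountEntry, util::IdentityHash> table;
  CountEntry entry;
  entry.key = util::MurmurHashNative("example", 7);  // assumed non-zero in practice
  entry.count = 1;
  util::AutoProbing<CountEntry, util::IdentityHash>::MutableIterator it;
  if (table.FindOrInsert(entry, it)) {
    ++it->count;                                     // key was already present
  }
  std::cout << table.Size() << std::endl;
  return 0;
}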
cpp/thirdpart/kenlm/util/probing_hash_table_benchmark_main.cc
0 → 100644
View file @
688b6eac
#include "file.hh"
#include "probing_hash_table.hh"
#include "mmap.hh"
#include "usage.hh"
#include "thread_pool.hh"
#include <boost/thread/mutex.hpp>
#include <boost/thread/locks.hpp>
#ifdef WIN32
#include <windows.h>
#include <processthreadsapi.h>
#else
#include <sys/resource.h>
#include <sys/time.h>
#endif
#include <iostream>
namespace
util
{
namespace
{
struct
Entry
{
typedef
uint64_t
Key
;
Key
key
;
Key
GetKey
()
const
{
return
key
;
}
};
// I don't care if this doesn't run on Windows. Empirically /dev/urandom was faster than boost::random's Mersenne Twister.
class
URandom
{
public:
URandom
()
:
it_
(
buf_
+
1024
),
end_
(
buf_
+
1024
),
file_
(
util
::
OpenReadOrThrow
(
"/dev/urandom"
))
{}
uint64_t
Get
()
{
if
(
it_
==
end_
)
{
it_
=
buf_
;
util
::
ReadOrThrow
(
file_
.
get
(),
buf_
,
sizeof
(
buf_
));
it_
=
buf_
;
}
return
*
it_
++
;
}
void
Batch
(
uint64_t
*
begin
,
uint64_t
*
end
)
{
util
::
ReadOrThrow
(
file_
.
get
(),
begin
,
(
end
-
begin
)
*
sizeof
(
uint64_t
));
}
private:
uint64_t
buf_
[
1024
];
uint64_t
*
it_
,
*
end_
;
util
::
scoped_fd
file_
;
};
struct
PrefetchEntry
{
uint64_t
key
;
const
Entry
*
pointer
;
};
template
<
class
TableT
,
unsigned
PrefetchSize
>
class
PrefetchQueue
{
public:
typedef
TableT
Table
;
explicit
PrefetchQueue
(
Table
&
table
)
:
table_
(
table
),
cur_
(
0
),
twiddle_
(
false
)
{
for
(
PrefetchEntry
*
i
=
entries_
;
i
!=
entries_
+
PrefetchSize
;
++
i
)
i
->
pointer
=
NULL
;
}
void
Add
(
uint64_t
key
)
{
if
(
Cur
().
pointer
)
{
twiddle_
^=
table_
.
FindFromIdeal
(
Cur
().
key
,
Cur
().
pointer
);
}
Cur
().
key
=
key
;
Cur
().
pointer
=
table_
.
Ideal
(
key
);
__builtin_prefetch
(
Cur
().
pointer
,
0
,
0
);
Next
();
}
bool
Drain
()
{
if
(
Cur
().
pointer
)
{
for
(
PrefetchEntry
*
i
=
&
Cur
();
i
<
entries_
+
PrefetchSize
;
++
i
)
{
twiddle_
^=
table_
.
FindFromIdeal
(
i
->
key
,
i
->
pointer
);
}
}
for
(
PrefetchEntry
*
i
=
entries_
;
i
<
&
Cur
();
++
i
)
{
twiddle_
^=
table_
.
FindFromIdeal
(
i
->
key
,
i
->
pointer
);
}
return
twiddle_
;
}
private:
PrefetchEntry
&
Cur
()
{
return
entries_
[
cur_
];
}
void
Next
()
{
++
cur_
;
cur_
=
cur_
%
PrefetchSize
;
}
Table
&
table_
;
PrefetchEntry
entries_
[
PrefetchSize
];
std
::
size_t
cur_
;
bool
twiddle_
;
PrefetchQueue
(
const
PrefetchQueue
&
);
void
operator
=
(
const
PrefetchQueue
&
);
};
template
<
class
TableT
>
class
Immediate
{
public:
typedef
TableT
Table
;
explicit
Immediate
(
Table
&
table
)
:
table_
(
table
),
twiddle_
(
false
)
{}
void
Add
(
uint64_t
key
)
{
typename
Table
::
ConstIterator
it
;
twiddle_
^=
table_
.
Find
(
key
,
it
);
}
bool
Drain
()
const
{
return
twiddle_
;
}
private:
Table
&
table_
;
bool
twiddle_
;
};
std
::
size_t
Size
(
uint64_t
entries
,
float
multiplier
=
1.5
)
{
typedef
util
::
ProbingHashTable
<
Entry
,
util
::
IdentityHash
,
std
::
equal_to
<
Entry
::
Key
>
,
Power2Mod
>
Table
;
// Always round up to power of 2 for fair comparison.
return
Power2Mod
::
RoundBuckets
(
Table
::
Size
(
entries
,
multiplier
)
/
sizeof
(
Entry
))
*
sizeof
(
Entry
);
}
template
<
class
Queue
>
bool
Test
(
URandom
&
rn
,
uint64_t
entries
,
const
uint64_t
*
const
queries_begin
,
const
uint64_t
*
const
queries_end
,
bool
ordinary_malloc
,
float
multiplier
=
1.5
)
{
std
::
size_t
size
=
Size
(
entries
,
multiplier
);
scoped_memory
backing
;
if
(
ordinary_malloc
)
{
backing
.
reset
(
util
::
CallocOrThrow
(
size
),
size
,
scoped_memory
::
MALLOC_ALLOCATED
);
}
else
{
util
::
HugeMalloc
(
size
,
true
,
backing
);
}
typename
Queue
::
Table
table
(
backing
.
get
(),
size
);
double
start
=
CPUTime
();
for
(
uint64_t
i
=
0
;
i
<
entries
;
++
i
)
{
Entry
entry
;
entry
.
key
=
rn
.
Get
();
table
.
Insert
(
entry
);
}
double
inserted
=
CPUTime
()
-
start
;
double
before_lookup
=
CPUTime
();
Queue
queue
(
table
);
for
(
const
uint64_t
*
i
=
queries_begin
;
i
!=
queries_end
;
++
i
)
{
queue
.
Add
(
*
i
);
}
bool
meaningless
=
queue
.
Drain
();
std
::
cout
<<
' '
<<
(
inserted
/
static_cast
<
double
>
(
entries
))
<<
' '
<<
(
CPUTime
()
-
before_lookup
)
/
static_cast
<
double
>
(
queries_end
-
queries_begin
)
<<
std
::
flush
;
return
meaningless
;
}
bool
TestRun
(
uint64_t
lookups
=
20000000
,
float
multiplier
=
1.5
)
{
URandom
rn
;
util
::
scoped_memory
queries
;
HugeMalloc
(
lookups
*
sizeof
(
uint64_t
),
true
,
queries
);
rn
.
Batch
(
static_cast
<
uint64_t
*>
(
queries
.
get
()),
static_cast
<
uint64_t
*>
(
queries
.
get
())
+
lookups
);
uint64_t
physical_mem_limit
=
util
::
GuessPhysicalMemory
()
/
2
;
bool
meaningless
=
true
;
for
(
uint64_t
i
=
4
;
Size
(
i
/
multiplier
)
<
physical_mem_limit
;
i
*=
4
)
{
std
::
cout
<<
static_cast
<
std
::
size_t
>
(
i
/
multiplier
)
<<
' '
<<
Size
(
i
/
multiplier
);
typedef
util
::
ProbingHashTable
<
Entry
,
util
::
IdentityHash
,
std
::
equal_to
<
Entry
::
Key
>
,
Power2Mod
>
Table
;
typedef
util
::
ProbingHashTable
<
Entry
,
util
::
IdentityHash
,
std
::
equal_to
<
Entry
::
Key
>
,
DivMod
>
TableDiv
;
const
uint64_t
*
const
queries_begin
=
static_cast
<
const
uint64_t
*>
(
queries
.
get
());
meaningless
^=
util
::
Test
<
Immediate
<
TableDiv
>
>
(
rn
,
i
/
multiplier
,
queries_begin
,
queries_begin
+
lookups
,
true
,
multiplier
);
meaningless
^=
util
::
Test
<
Immediate
<
Table
>
>
(
rn
,
i
/
multiplier
,
queries_begin
,
queries_begin
+
lookups
,
true
,
multiplier
);
meaningless
^=
util
::
Test
<
PrefetchQueue
<
Table
,
4
>
>
(
rn
,
i
/
multiplier
,
queries_begin
,
queries_begin
+
lookups
,
true
,
multiplier
);
meaningless
^=
util
::
Test
<
Immediate
<
Table
>
>
(
rn
,
i
/
multiplier
,
queries_begin
,
queries_begin
+
lookups
,
false
,
multiplier
);
meaningless
^=
util
::
Test
<
PrefetchQueue
<
Table
,
2
>
>
(
rn
,
i
/
multiplier
,
queries_begin
,
queries_begin
+
lookups
,
false
,
multiplier
);
meaningless
^=
util
::
Test
<
PrefetchQueue
<
Table
,
4
>
>
(
rn
,
i
/
multiplier
,
queries_begin
,
queries_begin
+
lookups
,
false
,
multiplier
);
meaningless
^=
util
::
Test
<
PrefetchQueue
<
Table
,
8
>
>
(
rn
,
i
/
multiplier
,
queries_begin
,
queries_begin
+
lookups
,
false
,
multiplier
);
meaningless
^=
util
::
Test
<
PrefetchQueue
<
Table
,
16
>
>
(
rn
,
i
/
multiplier
,
queries_begin
,
queries_begin
+
lookups
,
false
,
multiplier
);
std
::
cout
<<
std
::
endl
;
}
return
meaningless
;
}
template
<
class
Table
>
struct
ParallelTestRequest
{
ParallelTestRequest
()
:
queries_begin_
(
NULL
),
queries_end_
(
NULL
),
table_
(
NULL
)
{}
ParallelTestRequest
(
const
uint64_t
*
queries_begin
,
const
uint64_t
*
queries_end
,
Table
*
table
)
:
queries_begin_
(
queries_begin
),
queries_end_
(
queries_end
),
table_
(
table
)
{}
bool
operator
==
(
const
ParallelTestRequest
&
rhs
)
const
{
return
this
->
queries_begin_
==
rhs
.
queries_begin_
&&
this
->
queries_end_
==
rhs
.
queries_end_
;
}
const
uint64_t
*
queries_begin_
;
const
uint64_t
*
queries_end_
;
Table
*
table_
;
};
template
<
class
TableT
>
struct
ParallelTestConstruct
{
ParallelTestConstruct
(
boost
::
mutex
&
lock
,
const
uint64_t
*
const
burn_begin
,
const
uint64_t
*
const
burn_end
,
TableT
*
table
)
:
lock_
(
lock
),
burn_begin_
(
burn_begin
),
burn_end_
(
burn_end
),
table_
(
table
){}
boost
::
mutex
&
lock_
;
const
uint64_t
*
const
burn_begin_
;
const
uint64_t
*
const
burn_end_
;
TableT
*
table_
;
};
template
<
class
Queue
>
struct
ParallelTestHandler
{
typedef
ParallelTestRequest
<
typename
Queue
::
Table
>
Request
;
explicit
ParallelTestHandler
(
const
ParallelTestConstruct
<
typename
Queue
::
Table
>&
construct
)
:
lock_
(
construct
.
lock_
),
totalTime_
(
0.0
),
nRequests_
(
0
),
nQueries_
(
0
),
error_
(
false
),
twiddle_
(
false
){
//perform initial burn
for
(
const
uint64_t
*
i
=
construct
.
burn_begin_
;
i
<
construct
.
burn_end_
;
i
++
){
typename
Queue
::
Table
::
ConstIterator
it
;
twiddle_
^=
construct
.
table_
->
Find
(
*
i
,
it
);
}
}
void
operator
()(
Request
request
){
if
(
error_
)
return
;
Queue
queue
(
*
request
.
table_
);
double
start
=
ThreadTime
();
if
(
start
<
0.0
){
error_
=
true
;
return
;
}
for
(
const
uint64_t
*
i
=
request
.
queries_begin_
;
i
!=
request
.
queries_end_
;
++
i
){
queue
.
Add
(
*
i
);
}
twiddle_
^=
queue
.
Drain
();
double
end
=
ThreadTime
();
if
(
end
<
0.0
){
error_
=
true
;
return
;
}
totalTime_
+=
end
-
start
;
nQueries_
+=
request
.
queries_end_
-
request
.
queries_begin_
;
++
nRequests_
;
}
virtual
~
ParallelTestHandler
()
{
boost
::
unique_lock
<
boost
::
mutex
>
produce_lock
(
lock_
);
if
(
error_
){
std
::
cout
<<
"Error "
;
}
else
{
std
::
cout
<<
nRequests_
<<
' '
<<
' '
<<
nQueries_
<<
' '
<<
totalTime_
<<
std
::
endl
;
}
std
::
cerr
<<
"Meaningless "
<<
twiddle_
<<
std
::
endl
;
}
private:
boost
::
mutex
&
lock_
;
double
totalTime_
;
std
::
size_t
nRequests_
;
std
::
size_t
nQueries_
;
bool
error_
;
bool
twiddle_
;
};
template
<
class
Queue
>
void
ParallelTest
(
typename
Queue
::
Table
*
table
,
const
uint64_t
*
const
queries_begin
,
const
uint64_t
*
const
queries_end
,
std
::
size_t
num_threads
,
std
::
size_t
tasks_per_thread
,
std
::
size_t
burn
){
boost
::
mutex
lock
;
ParallelTestConstruct
<
typename
Queue
::
Table
>
construct
(
lock
,
queries_begin
,
queries_begin
+
burn
,
table
);
ParallelTestRequest
<
typename
Queue
::
Table
>
poison
(
NULL
,
NULL
,
NULL
);
{
util
::
ThreadPool
<
ParallelTestHandler
<
Queue
>
>
pool
(
num_threads
,
num_threads
,
construct
,
poison
);
const
uint64_t
queries_per_thread
=
(
static_cast
<
uint64_t
>
(
queries_end
-
queries_begin
-
burn
)
/
num_threads
)
/
tasks_per_thread
;
for
(
const
uint64_t
*
i
=
queries_begin
+
burn
;
i
+
queries_per_thread
<=
queries_end
;
i
+=
queries_per_thread
){
ParallelTestRequest
<
typename
Queue
::
Table
>
request
(
i
,
i
+
queries_per_thread
,
table
);
pool
.
Produce
(
request
);
}
}
// pool gets deallocated and all jobs finish
std
::
cout
<<
std
::
endl
;
}
void
ParallelTestRun
(
std
::
size_t
tasks_per_thread
=
1
,
std
::
size_t
burn
=
4000
,
uint64_t
lookups
=
20000000
,
float
multiplier
=
1.5
)
{
URandom
rn
;
util
::
scoped_memory
queries
;
HugeMalloc
((
lookups
+
burn
)
*
sizeof
(
uint64_t
),
true
,
queries
);
rn
.
Batch
(
static_cast
<
uint64_t
*>
(
queries
.
get
()),
static_cast
<
uint64_t
*>
(
queries
.
get
())
+
lookups
+
burn
);
const
uint64_t
*
const
queries_begin
=
static_cast
<
const
uint64_t
*>
(
queries
.
get
());
const
uint64_t
*
const
queries_end
=
queries_begin
+
lookups
+
burn
;
typedef
util
::
ProbingHashTable
<
Entry
,
util
::
IdentityHash
,
std
::
equal_to
<
Entry
::
Key
>
,
Power2Mod
>
Table
;
uint64_t
physical_mem_limit
=
util
::
GuessPhysicalMemory
()
/
2
;
for
(
uint64_t
i
=
4
;
Size
(
i
/
multiplier
,
multiplier
)
<
physical_mem_limit
;
i
*=
4
)
{
std
::
size_t
entries
=
static_cast
<
std
::
size_t
>
(
i
/
multiplier
);
std
::
size_t
size
=
Size
(
i
/
multiplier
,
multiplier
);
scoped_memory
backing
;
util
::
HugeMalloc
(
size
,
true
,
backing
);
Table
table
(
backing
.
get
(),
size
);
for
(
uint64_t
j
=
0
;
j
<
entries
;
++
j
)
{
Entry
entry
;
entry
.
key
=
rn
.
Get
();
table
.
Insert
(
entry
);
}
for
(
std
::
size_t
num_threads
=
1
;
num_threads
<=
16
;
num_threads
*=
2
){
std
::
cout
<<
entries
<<
' '
<<
size
<<
' '
<<
num_threads
<<
' '
<<
std
::
endl
;
util
::
ParallelTest
<
Immediate
<
Table
>
>
(
&
table
,
queries_begin
,
queries_end
,
num_threads
,
tasks_per_thread
,
burn
);
util
::
ParallelTest
<
PrefetchQueue
<
Table
,
2
>
>
(
&
table
,
queries_begin
,
queries_end
,
num_threads
,
tasks_per_thread
,
burn
);
util
::
ParallelTest
<
PrefetchQueue
<
Table
,
4
>
>
(
&
table
,
queries_begin
,
queries_end
,
num_threads
,
tasks_per_thread
,
burn
);
util
::
ParallelTest
<
PrefetchQueue
<
Table
,
8
>
>
(
&
table
,
queries_begin
,
queries_end
,
num_threads
,
tasks_per_thread
,
burn
);
util
::
ParallelTest
<
PrefetchQueue
<
Table
,
16
>
>
(
&
table
,
queries_begin
,
queries_end
,
num_threads
,
tasks_per_thread
,
burn
);
}
}
}
}
// namespace
}
// namespace util
int
main
()
{
//bool meaningless = false;
std
::
cout
<<
"#CPU time
\n
"
;
//meaningless ^= util::TestRun();
util
::
ParallelTestRun
(
10
,
4000
);
//std::cerr << "Meaningless: " << meaningless << '\n';
}