Commit 688b6eac authored by SWHL

Update files
/* Optional packages. You might want to integrate this with your build system e.g. config.h from ./configure. */
#ifndef UTIL_HAVE_H
#define UTIL_HAVE_H
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#ifndef HAVE_ICU
//#define HAVE_ICU
#endif
#endif // UTIL_HAVE_H
#include <iostream>
/* Fast integer to string conversion.
Source: https://github.com/miloyip/itoa-benchmark
Local modifications:
1. Return end of buffer instead of null terminating
2. Collapse to single file
3. Namespace
4. Remove test hook
5. Non-x86 support from the branch_lut code
6. Rename functions
7. Require __SSE2__ on i386
Copyright (C) 2014 Milo Yip
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
Which is based on: http://0x80.pl/snippets/asm/sse-utoa.c
SSE: conversion integers to decimal representation
Author: Wojciech Muła
e-mail: wojciech_mula@poczta.onet.pl
www: http://0x80.pl/
License: BSD
initial release 2011-10-21
$Id$
*/
#include "integer_to_string.hh"
#include <cassert>
#include <stdint.h>
namespace util {
namespace {
const char gDigitsLut[200] = {
'0','0','0','1','0','2','0','3','0','4','0','5','0','6','0','7','0','8','0','9',
'1','0','1','1','1','2','1','3','1','4','1','5','1','6','1','7','1','8','1','9',
'2','0','2','1','2','2','2','3','2','4','2','5','2','6','2','7','2','8','2','9',
'3','0','3','1','3','2','3','3','3','4','3','5','3','6','3','7','3','8','3','9',
'4','0','4','1','4','2','4','3','4','4','4','5','4','6','4','7','4','8','4','9',
'5','0','5','1','5','2','5','3','5','4','5','5','5','6','5','7','5','8','5','9',
'6','0','6','1','6','2','6','3','6','4','6','5','6','6','6','7','6','8','6','9',
'7','0','7','1','7','2','7','3','7','4','7','5','7','6','7','7','7','8','7','9',
'8','0','8','1','8','2','8','3','8','4','8','5','8','6','8','7','8','8','8','9',
'9','0','9','1','9','2','9','3','9','4','9','5','9','6','9','7','9','8','9','9'
};
} // namespace
// SSE2 implementation according to http://0x80.pl/articles/sse-itoa.html
// Modifications: (1) fix incorrect digits (2) accept all ranges (3) write to user provided buffer.
#if defined(__amd64) || defined(_M_X64) || (defined(__SSE2__) && (defined(_M_IX86) || defined(i386)))
#include <emmintrin.h>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifdef _MSC_VER
#define ALIGN_PRE __declspec(align(16))
#define ALIGN_SUF
#else
#define ALIGN_PRE
#define ALIGN_SUF __attribute__ ((aligned(16)))
#endif
namespace {
static const uint32_t kDiv10000 = 0xd1b71759;
ALIGN_PRE static const uint32_t kDiv10000Vector[4] ALIGN_SUF = { kDiv10000, kDiv10000, kDiv10000, kDiv10000 };
ALIGN_PRE static const uint32_t k10000Vector[4] ALIGN_SUF = { 10000, 10000, 10000, 10000 };
ALIGN_PRE static const uint16_t kDivPowersVector[8] ALIGN_SUF = { 8389, 5243, 13108, 32768, 8389, 5243, 13108, 32768 }; // 10^3, 10^2, 10^1, 10^0
ALIGN_PRE static const uint16_t kShiftPowersVector[8] ALIGN_SUF = {
1 << (16 - (23 + 2 - 16)),
1 << (16 - (19 + 2 - 16)),
1 << (16 - 1 - 2),
1 << (15),
1 << (16 - (23 + 2 - 16)),
1 << (16 - (19 + 2 - 16)),
1 << (16 - 1 - 2),
1 << (15)
};
ALIGN_PRE static const uint16_t k10Vector[8] ALIGN_SUF = { 10, 10, 10, 10, 10, 10, 10, 10 };
ALIGN_PRE static const char kAsciiZero[16] ALIGN_SUF = { '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0' };
inline __m128i Convert8DigitsSSE2(uint32_t value) {
assert(value <= 99999999);
// abcd, efgh = abcdefgh divmod 10000
const __m128i abcdefgh = _mm_cvtsi32_si128(value);
const __m128i abcd = _mm_srli_epi64(_mm_mul_epu32(abcdefgh, reinterpret_cast<const __m128i*>(kDiv10000Vector)[0]), 45);
const __m128i efgh = _mm_sub_epi32(abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast<const __m128i*>(k10000Vector)[0]));
// v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ]
const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh);
// v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ]
const __m128i v1a = _mm_slli_epi64(v1, 2);
// v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ]
const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a);
const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a);
// v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ]
const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast<const __m128i*>(kDivPowersVector)[0]);
const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast<const __m128i*>(kShiftPowersVector)[0]);
// v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ]
const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast<const __m128i*>(k10Vector)[0]);
// v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ]
const __m128i v6 = _mm_slli_epi64(v5, 16);
// v7 = v4 - v6 = { a, b, c, d, e, f, g, h }
const __m128i v7 = _mm_sub_epi16(v4, v6);
return v7;
}
inline __m128i ShiftDigits_SSE2(__m128i a, unsigned digit) {
assert(digit <= 8);
switch (digit) {
case 0: return a;
case 1: return _mm_srli_si128(a, 1);
case 2: return _mm_srli_si128(a, 2);
case 3: return _mm_srli_si128(a, 3);
case 4: return _mm_srli_si128(a, 4);
case 5: return _mm_srli_si128(a, 5);
case 6: return _mm_srli_si128(a, 6);
case 7: return _mm_srli_si128(a, 7);
case 8: return _mm_srli_si128(a, 8);
}
return a; // should not execute here.
}
} // namespace
// Original name: u32toa_sse2
char *ToString(uint32_t value, char* buffer) {
if (value < 10000) {
const uint32_t d1 = (value / 100) << 1;
const uint32_t d2 = (value % 100) << 1;
if (value >= 1000)
*buffer++ = gDigitsLut[d1];
if (value >= 100)
*buffer++ = gDigitsLut[d1 + 1];
if (value >= 10)
*buffer++ = gDigitsLut[d2];
*buffer++ = gDigitsLut[d2 + 1];
//*buffer++ = '\0';
return buffer;
}
else if (value < 100000000) {
// Experiments show that SSE2 is slower for this case, so it is disabled below.
#if 0
const __m128i a = Convert8DigitsSSE2(value);
// Convert to bytes, add '0'
const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);
// Count the number of digits
const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0]));
unsigned long digit;
#ifdef _MSC_VER
_BitScanForward(&digit, ~mask | 0x8000);
#else
digit = __builtin_ctz(~mask | 0x8000);
#endif
// Shift digits to the beginning
__m128i result = ShiftDigits_SSE2(va, digit);
//__m128i result = _mm_srl_epi64(va, _mm_cvtsi32_si128(digit * 8));
_mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result);
buffer[8 - digit] = '\0';
#else
// value = bbbbcccc
const uint32_t b = value / 10000;
const uint32_t c = value % 10000;
const uint32_t d1 = (b / 100) << 1;
const uint32_t d2 = (b % 100) << 1;
const uint32_t d3 = (c / 100) << 1;
const uint32_t d4 = (c % 100) << 1;
if (value >= 10000000)
*buffer++ = gDigitsLut[d1];
if (value >= 1000000)
*buffer++ = gDigitsLut[d1 + 1];
if (value >= 100000)
*buffer++ = gDigitsLut[d2];
*buffer++ = gDigitsLut[d2 + 1];
*buffer++ = gDigitsLut[d3];
*buffer++ = gDigitsLut[d3 + 1];
*buffer++ = gDigitsLut[d4];
*buffer++ = gDigitsLut[d4 + 1];
// *buffer++ = '\0';
return buffer;
#endif
}
else {
// value = aabbbbbbbb in decimal
const uint32_t a = value / 100000000; // 1 to 42
value %= 100000000;
if (a >= 10) {
const unsigned i = a << 1;
*buffer++ = gDigitsLut[i];
*buffer++ = gDigitsLut[i + 1];
}
else
*buffer++ = '0' + static_cast<char>(a);
const __m128i b = Convert8DigitsSSE2(value);
const __m128i ba = _mm_add_epi8(_mm_packus_epi16(_mm_setzero_si128(), b), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);
const __m128i result = _mm_srli_si128(ba, 8);
_mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result);
// buffer[8] = '\0';
return buffer + 8;
}
}
// Original name: u64toa_sse2
char *ToString(uint64_t value, char* buffer) {
if (value < 100000000) {
uint32_t v = static_cast<uint32_t>(value);
if (v < 10000) {
const uint32_t d1 = (v / 100) << 1;
const uint32_t d2 = (v % 100) << 1;
if (v >= 1000)
*buffer++ = gDigitsLut[d1];
if (v >= 100)
*buffer++ = gDigitsLut[d1 + 1];
if (v >= 10)
*buffer++ = gDigitsLut[d2];
*buffer++ = gDigitsLut[d2 + 1];
//*buffer++ = '\0';
return buffer;
}
else {
// Experiments show that SSE2 is slower for this case, so it is disabled below.
#if 0
const __m128i a = Convert8DigitsSSE2(v);
// Convert to bytes, add '0'
const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);
// Count the number of digits
const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0]));
unsigned long digit;
#ifdef _MSC_VER
_BitScanForward(&digit, ~mask | 0x8000);
#else
digit = __builtin_ctz(~mask | 0x8000);
#endif
// Shift digits to the beginning
__m128i result = ShiftDigits_SSE2(va, digit);
_mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result);
buffer[8 - digit] = '\0';
#else
// value = bbbbcccc
const uint32_t b = v / 10000;
const uint32_t c = v % 10000;
const uint32_t d1 = (b / 100) << 1;
const uint32_t d2 = (b % 100) << 1;
const uint32_t d3 = (c / 100) << 1;
const uint32_t d4 = (c % 100) << 1;
if (value >= 10000000)
*buffer++ = gDigitsLut[d1];
if (value >= 1000000)
*buffer++ = gDigitsLut[d1 + 1];
if (value >= 100000)
*buffer++ = gDigitsLut[d2];
*buffer++ = gDigitsLut[d2 + 1];
*buffer++ = gDigitsLut[d3];
*buffer++ = gDigitsLut[d3 + 1];
*buffer++ = gDigitsLut[d4];
*buffer++ = gDigitsLut[d4 + 1];
//*buffer++ = '\0';
return buffer;
#endif
}
}
else if (value < 10000000000000000) {
const uint32_t v0 = static_cast<uint32_t>(value / 100000000);
const uint32_t v1 = static_cast<uint32_t>(value % 100000000);
const __m128i a0 = Convert8DigitsSSE2(v0);
const __m128i a1 = Convert8DigitsSSE2(v1);
// Convert to bytes, add '0'
const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);
// Count the number of digits
const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0]));
#ifdef _MSC_VER
unsigned long digit;
_BitScanForward(&digit, ~mask | 0x8000);
#else
unsigned digit = __builtin_ctz(~mask | 0x8000);
#endif
// Shift digits to the beginning
__m128i result = ShiftDigits_SSE2(va, digit);
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
// buffer[16 - digit] = '\0';
return &buffer[16 - digit];
}
else {
const uint32_t a = static_cast<uint32_t>(value / 10000000000000000); // 1 to 1844
value %= 10000000000000000;
if (a < 10)
*buffer++ = '0' + static_cast<char>(a);
else if (a < 100) {
const uint32_t i = a << 1;
*buffer++ = gDigitsLut[i];
*buffer++ = gDigitsLut[i + 1];
}
else if (a < 1000) {
*buffer++ = '0' + static_cast<char>(a / 100);
const uint32_t i = (a % 100) << 1;
*buffer++ = gDigitsLut[i];
*buffer++ = gDigitsLut[i + 1];
}
else {
const uint32_t i = (a / 100) << 1;
const uint32_t j = (a % 100) << 1;
*buffer++ = gDigitsLut[i];
*buffer++ = gDigitsLut[i + 1];
*buffer++ = gDigitsLut[j];
*buffer++ = gDigitsLut[j + 1];
}
const uint32_t v0 = static_cast<uint32_t>(value / 100000000);
const uint32_t v1 = static_cast<uint32_t>(value % 100000000);
const __m128i a0 = Convert8DigitsSSE2(v0);
const __m128i a1 = Convert8DigitsSSE2(v1);
// Convert to bytes, add '0'
const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), va);
// buffer[16] = '\0';
return &buffer[16];
}
}
#else // Generic Non-x86 case
// Original name: u32toa_branchlut
char *ToString(uint32_t value, char* buffer) {
if (value < 10000) {
const uint32_t d1 = (value / 100) << 1;
const uint32_t d2 = (value % 100) << 1;
if (value >= 1000)
*buffer++ = gDigitsLut[d1];
if (value >= 100)
*buffer++ = gDigitsLut[d1 + 1];
if (value >= 10)
*buffer++ = gDigitsLut[d2];
*buffer++ = gDigitsLut[d2 + 1];
}
else if (value < 100000000) {
// value = bbbbcccc
const uint32_t b = value / 10000;
const uint32_t c = value % 10000;
const uint32_t d1 = (b / 100) << 1;
const uint32_t d2 = (b % 100) << 1;
const uint32_t d3 = (c / 100) << 1;
const uint32_t d4 = (c % 100) << 1;
if (value >= 10000000)
*buffer++ = gDigitsLut[d1];
if (value >= 1000000)
*buffer++ = gDigitsLut[d1 + 1];
if (value >= 100000)
*buffer++ = gDigitsLut[d2];
*buffer++ = gDigitsLut[d2 + 1];
*buffer++ = gDigitsLut[d3];
*buffer++ = gDigitsLut[d3 + 1];
*buffer++ = gDigitsLut[d4];
*buffer++ = gDigitsLut[d4 + 1];
}
else {
// value = aabbbbcccc in decimal
const uint32_t a = value / 100000000; // 1 to 42
value %= 100000000;
if (a >= 10) {
const unsigned i = a << 1;
*buffer++ = gDigitsLut[i];
*buffer++ = gDigitsLut[i + 1];
}
else
*buffer++ = '0' + static_cast<char>(a);
const uint32_t b = value / 10000; // 0 to 9999
const uint32_t c = value % 10000; // 0 to 9999
const uint32_t d1 = (b / 100) << 1;
const uint32_t d2 = (b % 100) << 1;
const uint32_t d3 = (c / 100) << 1;
const uint32_t d4 = (c % 100) << 1;
*buffer++ = gDigitsLut[d1];
*buffer++ = gDigitsLut[d1 + 1];
*buffer++ = gDigitsLut[d2];
*buffer++ = gDigitsLut[d2 + 1];
*buffer++ = gDigitsLut[d3];
*buffer++ = gDigitsLut[d3 + 1];
*buffer++ = gDigitsLut[d4];
*buffer++ = gDigitsLut[d4 + 1];
}
return buffer; //*buffer++ = '\0';
}
// Original name: u64toa_branchlut
char *ToString(uint64_t value, char* buffer) {
if (value < 100000000) {
uint32_t v = static_cast<uint32_t>(value);
if (v < 10000) {
const uint32_t d1 = (v / 100) << 1;
const uint32_t d2 = (v % 100) << 1;
if (v >= 1000)
*buffer++ = gDigitsLut[d1];
if (v >= 100)
*buffer++ = gDigitsLut[d1 + 1];
if (v >= 10)
*buffer++ = gDigitsLut[d2];
*buffer++ = gDigitsLut[d2 + 1];
}
else {
// value = bbbbcccc
const uint32_t b = v / 10000;
const uint32_t c = v % 10000;
const uint32_t d1 = (b / 100) << 1;
const uint32_t d2 = (b % 100) << 1;
const uint32_t d3 = (c / 100) << 1;
const uint32_t d4 = (c % 100) << 1;
if (value >= 10000000)
*buffer++ = gDigitsLut[d1];
if (value >= 1000000)
*buffer++ = gDigitsLut[d1 + 1];
if (value >= 100000)
*buffer++ = gDigitsLut[d2];
*buffer++ = gDigitsLut[d2 + 1];
*buffer++ = gDigitsLut[d3];
*buffer++ = gDigitsLut[d3 + 1];
*buffer++ = gDigitsLut[d4];
*buffer++ = gDigitsLut[d4 + 1];
}
}
else if (value < 10000000000000000) {
const uint32_t v0 = static_cast<uint32_t>(value / 100000000);
const uint32_t v1 = static_cast<uint32_t>(value % 100000000);
const uint32_t b0 = v0 / 10000;
const uint32_t c0 = v0 % 10000;
const uint32_t d1 = (b0 / 100) << 1;
const uint32_t d2 = (b0 % 100) << 1;
const uint32_t d3 = (c0 / 100) << 1;
const uint32_t d4 = (c0 % 100) << 1;
const uint32_t b1 = v1 / 10000;
const uint32_t c1 = v1 % 10000;
const uint32_t d5 = (b1 / 100) << 1;
const uint32_t d6 = (b1 % 100) << 1;
const uint32_t d7 = (c1 / 100) << 1;
const uint32_t d8 = (c1 % 100) << 1;
if (value >= 1000000000000000)
*buffer++ = gDigitsLut[d1];
if (value >= 100000000000000)
*buffer++ = gDigitsLut[d1 + 1];
if (value >= 10000000000000)
*buffer++ = gDigitsLut[d2];
if (value >= 1000000000000)
*buffer++ = gDigitsLut[d2 + 1];
if (value >= 100000000000)
*buffer++ = gDigitsLut[d3];
if (value >= 10000000000)
*buffer++ = gDigitsLut[d3 + 1];
if (value >= 1000000000)
*buffer++ = gDigitsLut[d4];
if (value >= 100000000)
*buffer++ = gDigitsLut[d4 + 1];
*buffer++ = gDigitsLut[d5];
*buffer++ = gDigitsLut[d5 + 1];
*buffer++ = gDigitsLut[d6];
*buffer++ = gDigitsLut[d6 + 1];
*buffer++ = gDigitsLut[d7];
*buffer++ = gDigitsLut[d7 + 1];
*buffer++ = gDigitsLut[d8];
*buffer++ = gDigitsLut[d8 + 1];
}
else {
const uint32_t a = static_cast<uint32_t>(value / 10000000000000000); // 1 to 1844
value %= 10000000000000000;
if (a < 10)
*buffer++ = '0' + static_cast<char>(a);
else if (a < 100) {
const uint32_t i = a << 1;
*buffer++ = gDigitsLut[i];
*buffer++ = gDigitsLut[i + 1];
}
else if (a < 1000) {
*buffer++ = '0' + static_cast<char>(a / 100);
const uint32_t i = (a % 100) << 1;
*buffer++ = gDigitsLut[i];
*buffer++ = gDigitsLut[i + 1];
}
else {
const uint32_t i = (a / 100) << 1;
const uint32_t j = (a % 100) << 1;
*buffer++ = gDigitsLut[i];
*buffer++ = gDigitsLut[i + 1];
*buffer++ = gDigitsLut[j];
*buffer++ = gDigitsLut[j + 1];
}
const uint32_t v0 = static_cast<uint32_t>(value / 100000000);
const uint32_t v1 = static_cast<uint32_t>(value % 100000000);
const uint32_t b0 = v0 / 10000;
const uint32_t c0 = v0 % 10000;
const uint32_t d1 = (b0 / 100) << 1;
const uint32_t d2 = (b0 % 100) << 1;
const uint32_t d3 = (c0 / 100) << 1;
const uint32_t d4 = (c0 % 100) << 1;
const uint32_t b1 = v1 / 10000;
const uint32_t c1 = v1 % 10000;
const uint32_t d5 = (b1 / 100) << 1;
const uint32_t d6 = (b1 % 100) << 1;
const uint32_t d7 = (c1 / 100) << 1;
const uint32_t d8 = (c1 % 100) << 1;
*buffer++ = gDigitsLut[d1];
*buffer++ = gDigitsLut[d1 + 1];
*buffer++ = gDigitsLut[d2];
*buffer++ = gDigitsLut[d2 + 1];
*buffer++ = gDigitsLut[d3];
*buffer++ = gDigitsLut[d3 + 1];
*buffer++ = gDigitsLut[d4];
*buffer++ = gDigitsLut[d4 + 1];
*buffer++ = gDigitsLut[d5];
*buffer++ = gDigitsLut[d5 + 1];
*buffer++ = gDigitsLut[d6];
*buffer++ = gDigitsLut[d6 + 1];
*buffer++ = gDigitsLut[d7];
*buffer++ = gDigitsLut[d7 + 1];
*buffer++ = gDigitsLut[d8];
*buffer++ = gDigitsLut[d8 + 1];
}
return buffer;
}
#endif // End of architecture if statement.
// Signed wrappers. The negation is done on the unsigned version because
// doing so has defined behavior for INT_MIN.
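// For example, with int32_t INT_MIN the cast gives un == 2147483648u and the
// unsigned negation -un wraps back to 2147483648u, the correct magnitude,
// whereas negating INT_MIN as a signed value would be undefined behavior.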
char *ToString(int32_t value, char *to) {
uint32_t un = static_cast<uint32_t>(value);
if (value < 0) {
*to++ = '-';
un = -un;
}
return ToString(un, to);
}
char *ToString(int64_t value, char *to) {
uint64_t un = static_cast<uint64_t>(value);
if (value < 0) {
*to++ = '-';
un = -un;
}
return ToString(un, to);
}
// No optimization for this case yet.
char *ToString(int16_t value, char *to) {
return ToString((int32_t)value, to);
}
char *ToString(uint16_t value, char *to) {
return ToString((uint32_t)value, to);
}
// void * to string. This hasn't been optimized at all really.
namespace {
const char kHexDigits[] = "0123456789abcdef";
} // namespace
char *ToString(const void *v, char *to) {
*to++ = '0';
*to++ = 'x';
// Fun fact: gcc/clang's boost::lexical_cast on Linux does just "0" while clang on OS X does "0x0".
// I happen to prefer 0x0.
if (!v) {
*to++ = '0';
return to;
}
uintptr_t value = reinterpret_cast<uintptr_t>(v);
uint8_t shift = sizeof(void*) * 8 - 4;
for (; !(value >> shift); shift -= 4) {}
for (; ; shift -= 4) {
*to++ = kHexDigits[(value >> shift) & 0xf];
if (!shift) break;
}
return to;
}
} // namespace util
#ifndef UTIL_INTEGER_TO_STRING_H
#define UTIL_INTEGER_TO_STRING_H
#include <cstddef>
#include <stdint.h>
namespace util {
/* These functions convert integers to strings and return the end pointer.
*/
char *ToString(uint32_t value, char *to);
char *ToString(uint64_t value, char *to);
// Implemented as wrappers to above
char *ToString(int32_t value, char *to);
char *ToString(int64_t value, char *to);
// Calls the 32-bit versions for now.
char *ToString(uint16_t value, char *to);
char *ToString(int16_t value, char *to);
char *ToString(const void *value, char *to);
inline char *ToString(bool value, char *to) {
*to++ = '0' + value;
return to;
}
// How many bytes to reserve in the buffer for these strings:
// g++ 4.9.1 doesn't work with this:
// static const std::size_t kBytes = 5;
// So use enum.
template <class T> struct ToStringBuf;
template <> struct ToStringBuf<bool> {
enum { kBytes = 1 };
};
template <> struct ToStringBuf<uint16_t> {
enum { kBytes = 5 };
};
template <> struct ToStringBuf<int16_t> {
enum { kBytes = 6 };
};
template <> struct ToStringBuf<uint32_t> {
enum { kBytes = 10 };
};
template <> struct ToStringBuf<int32_t> {
enum { kBytes = 11 };
};
template <> struct ToStringBuf<uint64_t> {
enum { kBytes = 20 };
};
template <> struct ToStringBuf<int64_t> {
// Not a typo. 2^63 has 19 digits.
enum { kBytes = 20 };
};
template <> struct ToStringBuf<const void*> {
// Either 18 on 64-bit or 10 on 32-bit.
enum { kBytes = sizeof(const void*) * 2 + 2 };
};
// Maximum over this and float.
enum { kToStringMaxBytes = 20 };
} // namespace util
#endif // UTIL_INTEGER_TO_STRING_H
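// Usage sketch (illustrative addition, not part of the original commit): convert
// an integer with ToString() and size the buffer with ToStringBuf. ToString()
// returns the end pointer and does not null terminate; the ExampleToString name
// below is hypothetical.
#include "integer_to_string.hh"
#include <stdint.h>
#include <string>

inline std::string ExampleToString(uint64_t value) {
  char buf[util::ToStringBuf<uint64_t>::kBytes];
  char *end = util::ToString(value, buf);
  return std::string(buf, end - buf);
}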
#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
#include "integer_to_string.hh"
#include "string_piece.hh"
#define BOOST_TEST_MODULE IntegerToStringTest
#include <boost/test/unit_test.hpp>
#include <boost/lexical_cast.hpp>
#include <limits>
namespace util {
namespace {
template <class T> void TestValue(const T value) {
char buf[ToStringBuf<T>::kBytes];
StringPiece result(buf, ToString(value, buf) - buf);
BOOST_REQUIRE_GE(static_cast<std::size_t>(ToStringBuf<T>::kBytes), result.size());
if (value) {
BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), result);
} else {
// Platforms can do void * as 0x0 or 0.
BOOST_CHECK(result == "0x0" || result == "0");
}
}
template <class T> void TestCorners() {
TestValue(std::numeric_limits<T>::min());
TestValue(std::numeric_limits<T>::max());
TestValue((T)0);
TestValue((T)-1);
TestValue((T)1);
}
BOOST_AUTO_TEST_CASE(Corners) {
TestCorners<uint16_t>();
TestCorners<uint32_t>();
TestCorners<uint64_t>();
TestCorners<int16_t>();
TestCorners<int32_t>();
TestCorners<int64_t>();
TestCorners<const void*>();
}
template <class T> void TestAll() {
for (T i = std::numeric_limits<T>::min(); i < std::numeric_limits<T>::max(); ++i) {
TestValue(i);
}
TestValue(std::numeric_limits<T>::max());
}
BOOST_AUTO_TEST_CASE(Short) {
TestAll<uint16_t>();
TestAll<int16_t>();
}
template <class T> void Test10s() {
for (T i = 1; i < std::numeric_limits<T>::max() / 10; i *= 10) {
TestValue(i);
TestValue(i - 1);
TestValue(i + 1);
}
}
BOOST_AUTO_TEST_CASE(Tens) {
Test10s<uint64_t>();
Test10s<int64_t>();
Test10s<uint32_t>();
Test10s<int32_t>();
}
BOOST_AUTO_TEST_CASE(Pointers) {
for (uintptr_t i = 1; i < std::numeric_limits<uintptr_t>::max() / 10; i *= 10) {
TestValue((const void*)i);
}
for (uintptr_t i = 0; i < 256; ++i) {
TestValue((const void*)i);
TestValue((const void*)(i + 0xf00));
}
}
}} // namespaces
#ifndef UTIL_JOINT_SORT_H
#define UTIL_JOINT_SORT_H
/* A terrifying amount of C++ to coax std::sort into sorting one range while
* also permuting another range the same way.
*/
#include "proxy_iterator.hh"
#include <algorithm>
#include <functional>
namespace util {
namespace detail {
template <class KeyIter, class ValueIter> class JointProxy;
template <class KeyIter, class ValueIter> class JointIter {
public:
JointIter() {}
JointIter(const KeyIter &key_iter, const ValueIter &value_iter) : key_(key_iter), value_(value_iter) {}
bool operator==(const JointIter<KeyIter, ValueIter> &other) const { return key_ == other.key_; }
bool operator<(const JointIter<KeyIter, ValueIter> &other) const { return (key_ < other.key_); }
std::ptrdiff_t operator-(const JointIter<KeyIter, ValueIter> &other) const { return key_ - other.key_; }
JointIter<KeyIter, ValueIter> &operator+=(std::ptrdiff_t amount) {
key_ += amount;
value_ += amount;
return *this;
}
friend void swap(JointIter &first, JointIter &second) {
using std::swap;
swap(first.key_, second.key_);
swap(first.value_, second.value_);
}
void DeepSwap(JointIter &other) {
using std::swap;
swap(*key_, *other.key_);
swap(*value_, *other.value_);
}
private:
friend class JointProxy<KeyIter, ValueIter>;
KeyIter key_;
ValueIter value_;
};
template <class KeyIter, class ValueIter> class JointProxy {
private:
typedef JointIter<KeyIter, ValueIter> InnerIterator;
public:
typedef struct {
typename std::iterator_traits<KeyIter>::value_type key;
typename std::iterator_traits<ValueIter>::value_type value;
const typename std::iterator_traits<KeyIter>::value_type &GetKey() const { return key; }
} value_type;
JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {}
JointProxy(const JointProxy<KeyIter, ValueIter> &other) : inner_(other.inner_) {}
operator value_type() const {
value_type ret;
ret.key = *inner_.key_;
ret.value = *inner_.value_;
return ret;
}
JointProxy &operator=(const JointProxy &other) {
*inner_.key_ = *other.inner_.key_;
*inner_.value_ = *other.inner_.value_;
return *this;
}
JointProxy &operator=(const value_type &other) {
*inner_.key_ = other.key;
*inner_.value_ = other.value;
return *this;
}
typename std::iterator_traits<KeyIter>::reference GetKey() const {
return *(inner_.key_);
}
friend void swap(JointProxy<KeyIter, ValueIter> first, JointProxy<KeyIter, ValueIter> second) {
first.Inner().DeepSwap(second.Inner());
}
private:
friend class ProxyIterator<JointProxy<KeyIter, ValueIter> >;
InnerIterator &Inner() { return inner_; }
const InnerIterator &Inner() const { return inner_; }
InnerIterator inner_;
};
template <class Proxy, class Less> class LessWrapper : public std::binary_function<const typename Proxy::value_type &, const typename Proxy::value_type &, bool> {
public:
explicit LessWrapper(const Less &less) : less_(less) {}
bool operator()(const Proxy &left, const Proxy &right) const {
return less_(left.GetKey(), right.GetKey());
}
bool operator()(const Proxy &left, const typename Proxy::value_type &right) const {
return less_(left.GetKey(), right.GetKey());
}
bool operator()(const typename Proxy::value_type &left, const Proxy &right) const {
return less_(left.GetKey(), right.GetKey());
}
bool operator()(const typename Proxy::value_type &left, const typename Proxy::value_type &right) const {
return less_(left.GetKey(), right.GetKey());
}
private:
const Less less_;
};
} // namespace detail
template <class KeyIter, class ValueIter> class PairedIterator : public ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > {
public:
PairedIterator(const KeyIter &key, const ValueIter &value) :
ProxyIterator<detail::JointProxy<KeyIter, ValueIter> >(detail::JointProxy<KeyIter, ValueIter>(key, value)) {}
};
template <class KeyIter, class ValueIter, class Less> void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin, const Less &less) {
ProxyIterator<detail::JointProxy<KeyIter, ValueIter> > full_begin(detail::JointProxy<KeyIter, ValueIter>(key_begin, value_begin));
detail::LessWrapper<detail::JointProxy<KeyIter, ValueIter>, Less> less_wrap(less);
std::sort(full_begin, full_begin + (key_end - key_begin), less_wrap);
}
template <class KeyIter, class ValueIter> void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin) {
JointSort(key_begin, key_end, value_begin, std::less<typename std::iterator_traits<KeyIter>::value_type>());
}
} // namespace util
#endif // UTIL_JOINT_SORT_H
#include "joint_sort.hh"
#define BOOST_TEST_MODULE JointSortTest
#include <boost/test/unit_test.hpp>
namespace util { namespace {
BOOST_AUTO_TEST_CASE(just_flip) {
char keys[2];
int values[2];
keys[0] = 1; values[0] = 327;
keys[1] = 0; values[1] = 87897;
JointSort<char *, int *>(keys + 0, keys + 2, values + 0);
BOOST_CHECK_EQUAL(0, keys[0]);
BOOST_CHECK_EQUAL(87897, values[0]);
BOOST_CHECK_EQUAL(1, keys[1]);
BOOST_CHECK_EQUAL(327, values[1]);
}
BOOST_AUTO_TEST_CASE(three) {
char keys[3];
int values[3];
keys[0] = 1; values[0] = 327;
keys[1] = 2; values[1] = 87897;
keys[2] = 0; values[2] = 10;
JointSort<char *, int *>(keys + 0, keys + 3, values + 0);
BOOST_CHECK_EQUAL(0, keys[0]);
BOOST_CHECK_EQUAL(1, keys[1]);
BOOST_CHECK_EQUAL(2, keys[2]);
}
BOOST_AUTO_TEST_CASE(char_int) {
char keys[4];
int values[4];
keys[0] = 3; values[0] = 327;
keys[1] = 1; values[1] = 87897;
keys[2] = 2; values[2] = 10;
keys[3] = 0; values[3] = 24347;
JointSort<char *, int *>(keys + 0, keys + 4, values + 0);
BOOST_CHECK_EQUAL(0, keys[0]);
BOOST_CHECK_EQUAL(24347, values[0]);
BOOST_CHECK_EQUAL(1, keys[1]);
BOOST_CHECK_EQUAL(87897, values[1]);
BOOST_CHECK_EQUAL(2, keys[2]);
BOOST_CHECK_EQUAL(10, values[2]);
BOOST_CHECK_EQUAL(3, keys[3]);
BOOST_CHECK_EQUAL(327, values[3]);
}
BOOST_AUTO_TEST_CASE(swap_proxy) {
char keys[2] = {0, 1};
int values[2] = {2, 3};
detail::JointProxy<char *, int *> first(keys, values);
detail::JointProxy<char *, int *> second(keys + 1, values + 1);
swap(first, second);
BOOST_CHECK_EQUAL(1, keys[0]);
BOOST_CHECK_EQUAL(0, keys[1]);
BOOST_CHECK_EQUAL(3, values[0]);
BOOST_CHECK_EQUAL(2, values[1]);
}
}} // namespace anonymous util
/* Memory mapping wrappers.
* ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at
* NICT.
*/
#include "mmap.hh"
#include "exception.hh"
#include "file.hh"
#include "scoped.hh"
#include <iostream>
#include <cassert>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <cstdlib>
#include <cstring>   // memcpy, memset
#include <algorithm> // std::max, std::min
#if defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#include <io.h>
#else
#include <sys/mman.h>
#include <unistd.h>
#endif
namespace util {
std::size_t SizePage() {
#if defined(_WIN32) || defined(_WIN64)
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwAllocationGranularity;
#else
return sysconf(_SC_PAGE_SIZE);
#endif
}
scoped_mmap::~scoped_mmap() {
if (data_ != (void*)-1) {
try {
// Thanks Denis Filimonov for pointing out NFS likes msync first.
SyncOrThrow(data_, size_);
UnmapOrThrow(data_, size_);
} catch (const util::ErrnoException &e) {
std::cerr << e.what();
abort();
}
}
}
namespace {
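// Round value up to the next multiple of mult, where mult must be a power of two.
// For example, RoundUpPow2(5, 4) == 8 and RoundUpPow2(8, 4) == 8.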
template <class T> T RoundUpPow2(T value, T mult) {
return ((value - 1) & ~(mult - 1)) + mult;
}
std::size_t RoundUpSize(const scoped_memory &mem) {
switch(mem.source()) {
case scoped_memory::MMAP_ROUND_1G_ALLOCATED:
return RoundUpPow2<std::size_t>(mem.size(), 1ULL << 30);
case scoped_memory::MMAP_ROUND_2M_ALLOCATED:
return RoundUpPow2<std::size_t>(mem.size(), 1ULL << 21);
case scoped_memory::MMAP_ROUND_PAGE_ALLOCATED:
return RoundUpPow2<std::size_t>(mem.size(), static_cast<std::size_t>(SizePage()));
default:
return mem.size();
}
}
} // namespace
scoped_memory::scoped_memory(std::size_t size, bool zeroed) : data_(NULL), size_(0), source_(NONE_ALLOCATED) {
HugeMalloc(size, zeroed, *this);
}
void scoped_memory::reset(void *data, std::size_t size, Alloc source) {
switch(source_) {
case MMAP_ROUND_1G_ALLOCATED:
case MMAP_ROUND_2M_ALLOCATED:
case MMAP_ROUND_PAGE_ALLOCATED:
case MMAP_ALLOCATED:
scoped_mmap(data_, RoundUpSize(*this));
break;
case MALLOC_ALLOCATED:
free(data_);
break;
case NONE_ALLOCATED:
break;
}
data_ = data;
size_ = size;
source_ = source;
}
const int kFileFlags =
#if defined(_WIN32) || defined(_WIN64)
0 // MapOrThrow ignores flags on windows
#elif defined(MAP_FILE)
MAP_FILE | MAP_SHARED
#else
MAP_SHARED
#endif
;
void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) {
#ifdef MAP_POPULATE // Linux specific
if (prefault) {
flags |= MAP_POPULATE;
}
#endif
#if defined(_WIN32) || defined(_WIN64)
int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY;
int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ;
uint64_t total_size = size + offset;
HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, total_size >> 32, static_cast<DWORD>(total_size), NULL);
UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed");
LPVOID ret = MapViewOfFile(hMapping, protectM, offset >> 32, offset, size);
CloseHandle(hMapping);
UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed");
#else
int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ;
void *ret;
UTIL_THROW_IF((ret = mmap(NULL, size, protect, flags, fd, offset)) == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset);
# ifdef MADV_HUGEPAGE
/* We like huge pages but it's fine if we can't have them. Note that huge
* pages are not supported for file-backed mmap on linux.
*/
madvise(ret, size, MADV_HUGEPAGE);
# endif
#endif
return ret;
}
void SyncOrThrow(void *start, size_t length) {
#if defined(_WIN32) || defined(_WIN64)
UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap");
#else
UTIL_THROW_IF(length && msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap");
#endif
}
void UnmapOrThrow(void *start, size_t length) {
#if defined(_WIN32) || defined(_WIN64)
UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file");
#else
UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed with " << start << " for length " << length);
#endif
}
// Linux huge pages.
#ifdef __linux__
namespace {
bool TryHuge(std::size_t size, bool populate, uint8_t alignment_bits, scoped_memory::Alloc huge_scheme, scoped_memory &to) {
// Don't bother with these cases.
if (size < (1ULL << alignment_bits) || (1ULL << alignment_bits) < SizePage())
return false;
// First try: Linux >= 3.8 with manually configured hugetlb pages available.
int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | (alignment_bits << 26 /* This is MAP_HUGE_SHIFT but some headers are too old. */);
if (populate) flags |= MAP_POPULATE;
void *ret = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
if (ret != MAP_FAILED) {
to.reset(ret, size, huge_scheme);
return true;
}
// There weren't pages in a sysadmin-created pool. Let's get aligned memory
// and hope transparent huge pages kicks in. Align to a multiple of the huge
// page size by overallocating. I feel bad about doing this, but it's also how
// posix_memalign is implemented. And the memory is virtual.
// Round up requested size to multiple of page size. This will allow the pages after to be munmapped.
std::size_t size_up = RoundUpPow2(size, SizePage());
std::size_t ask = size_up + (1 << alignment_bits) - SizePage();
// Don't populate because this is asking for more than we will use.
scoped_mmap larger(mmap(NULL, ask, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0), ask);
if (larger.get() == MAP_FAILED) return false;
// Throw out pages before the alignment point.
uintptr_t base = reinterpret_cast<uintptr_t>(larger.get());
// Round up to next multiple of alignment.
uintptr_t rounded_up = RoundUpPow2(base, static_cast<uintptr_t>(1) << alignment_bits);
if (base != rounded_up) {
// If this throws an exception (which it shouldn't) then we want to unmap the whole thing by keeping it in larger.
UnmapOrThrow(larger.get(), rounded_up - base);
larger.steal();
larger.reset(reinterpret_cast<void*>(rounded_up), ask - (rounded_up - base));
}
// Throw out pages after the requested size.
assert(larger.size() >= size_up);
if (larger.size() > size_up) {
// This is where we assume size_up is a multiple of page size.
UnmapOrThrow(static_cast<uint8_t*>(larger.get()) + size_up, larger.size() - size_up);
larger.reset(larger.steal(), size_up);
}
#ifdef MADV_HUGEPAGE
madvise(larger.get(), size_up, MADV_HUGEPAGE);
#endif
to.reset(larger.steal(), size, scoped_memory::MMAP_ROUND_PAGE_ALLOCATED);
return true;
}
} // namespace
#endif
void HugeMalloc(std::size_t size, bool zeroed, scoped_memory &to) {
to.reset();
#ifdef __linux__
// TODO: architectures/page sizes other than 2^21 and 2^30.
// Attempt 1 GB pages.
// If the user asked for zeroed memory, assume they want it populated.
if (size >= (1ULL << 30) && TryHuge(size, zeroed, 30, scoped_memory::MMAP_ROUND_1G_ALLOCATED, to))
return;
// Attempt 2 MB pages.
if (size >= (1ULL << 21) && TryHuge(size, zeroed, 21, scoped_memory::MMAP_ROUND_2M_ALLOCATED, to))
return;
#endif // __linux__
// Non-linux will always do this, as will small allocations on Linux.
to.reset(zeroed ? calloc(1, size) : malloc(size), size, scoped_memory::MALLOC_ALLOCATED);
UTIL_THROW_IF(!to.get(), ErrnoException, "Failed to allocate " << size << " bytes");
}
namespace {
#ifdef __linux__
const std::size_t kTransitionHuge = std::max<std::size_t>(1ULL << 21, SizePage());
#endif // __linux__
void ReplaceAndCopy(std::size_t to, bool zero_new, scoped_memory &mem) {
scoped_memory replacement;
HugeMalloc(to, zero_new, replacement);
memcpy(replacement.get(), mem.get(), mem.size());
// This can't throw.
mem.reset(replacement.get(), replacement.size(), replacement.source());
replacement.steal();
}
} // namespace
void HugeRealloc(std::size_t to, bool zero_new, scoped_memory &mem) {
if (!to) {
mem.reset();
return;
}
switch (mem.source()) {
case scoped_memory::NONE_ALLOCATED:
HugeMalloc(to, zero_new, mem);
return;
#ifdef __linux__
// TODO really need to collapse these cases with a number.
case scoped_memory::MMAP_ROUND_1G_ALLOCATED:
case scoped_memory::MMAP_ROUND_2M_ALLOCATED:
case scoped_memory::MMAP_ROUND_PAGE_ALLOCATED:
case scoped_memory::MMAP_ALLOCATED:
// Downsizing below barrier?
if (to <= SizePage()) {
scoped_malloc replacement(malloc(to));
memcpy(replacement.get(), mem.get(), std::min(to, mem.size()));
if (zero_new && to > mem.size())
memset(static_cast<uint8_t*>(replacement.get()) + mem.size(), 0, to - mem.size());
mem.reset(replacement.release(), to, scoped_memory::MALLOC_ALLOCATED);
} else {
// main path: try to mremap.
void *new_addr = mremap(mem.get(), RoundUpSize(mem), to, MREMAP_MAYMOVE);
if (new_addr != MAP_FAILED) {
scoped_memory::Alloc source(mem.source()); // steal resets mem.source()
mem.steal(); // let go otherwise reset() will free it first
mem.reset(new_addr, to, source);
} else {
// Reallocating huge pages can fail with EINVAL.
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/mremap.c?id=refs/tags/v3.19#n346
ReplaceAndCopy(to, zero_new, mem);
}
}
return;
#endif // __linux__
case scoped_memory::MALLOC_ALLOCATED:
#ifdef __linux__
// Transition larger allocations to huge pages, but don't keep trying if we're still malloc allocated.
if (to >= kTransitionHuge && mem.size() < kTransitionHuge) {
ReplaceAndCopy(to, zero_new, mem);
return;
}
#endif // __linux__
{
void *new_addr = std::realloc(mem.get(), to);
UTIL_THROW_IF(!new_addr, ErrnoException, "realloc to " << to << " bytes failed.");
if (zero_new && to > mem.size())
memset(static_cast<uint8_t*>(new_addr) + mem.size(), 0, to - mem.size());
mem.steal();
mem.reset(new_addr, to, scoped_memory::MALLOC_ALLOCATED);
}
return;
default:
UTIL_THROW(Exception, "HugeRealloc called with type " << mem.source());
}
}
void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) {
switch (method) {
case LAZY:
out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
break;
case POPULATE_OR_LAZY:
#ifdef MAP_POPULATE
case POPULATE_OR_READ:
#endif
out.reset(MapOrThrow(size, false, kFileFlags, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED);
break;
#ifndef MAP_POPULATE
case POPULATE_OR_READ:
#endif
case READ:
HugeMalloc(size, false, out);
SeekOrThrow(fd, offset);
ReadOrThrow(fd, out.get(), size);
break;
case PARALLEL_READ:
UTIL_THROW(Exception, "Parallel read was removed from this repo.");
break;
}
}
void *MapZeroedWrite(int fd, std::size_t size) {
ResizeOrThrow(fd, 0);
ResizeOrThrow(fd, size);
return MapOrThrow(size, true, kFileFlags, false, fd, 0);
}
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) {
file.reset(CreateOrThrow(name));
try {
return MapZeroedWrite(file.get(), size);
} catch (ErrnoException &e) {
e << " in file " << name;
throw;
}
}
Rolling::Rolling(const Rolling &copy_from, uint64_t increase) {
*this = copy_from;
IncreaseBase(increase);
}
Rolling &Rolling::operator=(const Rolling &copy_from) {
fd_ = copy_from.fd_;
file_begin_ = copy_from.file_begin_;
file_end_ = copy_from.file_end_;
for_write_ = copy_from.for_write_;
block_ = copy_from.block_;
read_bound_ = copy_from.read_bound_;
current_begin_ = 0;
if (copy_from.IsPassthrough()) {
current_end_ = copy_from.current_end_;
ptr_ = copy_from.ptr_;
} else {
// Force call on next mmap.
current_end_ = 0;
ptr_ = NULL;
}
return *this;
}
Rolling::Rolling(int fd, bool for_write, std::size_t block, std::size_t read_bound, uint64_t offset, uint64_t amount) {
current_begin_ = 0;
current_end_ = 0;
fd_ = fd;
file_begin_ = offset;
file_end_ = offset + amount;
for_write_ = for_write;
block_ = block;
read_bound_ = read_bound;
}
void *Rolling::ExtractNonRolling(scoped_memory &out, uint64_t index, std::size_t size) {
out.reset();
if (IsPassthrough()) return static_cast<uint8_t*>(get()) + index;
uint64_t offset = index + file_begin_;
// Round down to multiple of page size.
uint64_t cruft = offset % static_cast<uint64_t>(SizePage());
std::size_t map_size = static_cast<std::size_t>(size + cruft);
out.reset(MapOrThrow(map_size, for_write_, kFileFlags, true, fd_, offset - cruft), map_size, scoped_memory::MMAP_ALLOCATED);
return static_cast<uint8_t*>(out.get()) + static_cast<std::size_t>(cruft);
}
void Rolling::Roll(uint64_t index) {
assert(!IsPassthrough());
std::size_t amount;
if (file_end_ - (index + file_begin_) > static_cast<uint64_t>(block_)) {
amount = block_;
current_end_ = index + amount - read_bound_;
} else {
amount = file_end_ - (index + file_begin_);
current_end_ = index + amount;
}
ptr_ = static_cast<uint8_t*>(ExtractNonRolling(mem_, index, amount)) - index;
current_begin_ = index;
}
} // namespace util
#ifndef UTIL_MMAP_H
#define UTIL_MMAP_H
// Utilities for mmaped files.
#include <cstddef>
#include <limits>
#include <stdint.h>
#include <sys/types.h>
namespace util {
class scoped_fd;
std::size_t SizePage();
// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here.
class scoped_mmap {
public:
scoped_mmap() : data_((void*)-1), size_(0) {}
scoped_mmap(void *data, std::size_t size) : data_(data), size_(size) {}
~scoped_mmap();
void *get() const { return data_; }
const char *begin() const { return reinterpret_cast<char*>(data_); }
char *begin() { return reinterpret_cast<char*>(data_); }
const char *end() const { return reinterpret_cast<char*>(data_) + size_; }
char *end() { return reinterpret_cast<char*>(data_) + size_; }
std::size_t size() const { return size_; }
void reset(void *data, std::size_t size) {
scoped_mmap other(data_, size_);
data_ = data;
size_ = size;
}
void reset() {
reset((void*)-1, 0);
}
void *steal() {
void *ret = data_;
data_ = (void*)-1;
size_ = 0;
return ret;
}
private:
void *data_;
std::size_t size_;
scoped_mmap(const scoped_mmap &);
scoped_mmap &operator=(const scoped_mmap &);
};
/* For when the memory might come from mmap or malloc. Uses NULL and 0 for
* blanks even though mmap signals errors with (void*)-1.
*/
class scoped_memory {
public:
typedef enum {
// TODO: store rounded up size instead?
MMAP_ROUND_1G_ALLOCATED, // The size was rounded up for a 1GB page. Do the same before munmap.
MMAP_ROUND_2M_ALLOCATED, // The size was rounded up for a 2MB page. Do the same before munmap.
MMAP_ROUND_PAGE_ALLOCATED, // The size was rounded up to a multiple of the default page size. Do the same before munmap.
MMAP_ALLOCATED, // munmap
MALLOC_ALLOCATED, // free
NONE_ALLOCATED // nothing to free (though there can be something here if it's owned by somebody else).
} Alloc;
scoped_memory(void *data, std::size_t size, Alloc source)
: data_(data), size_(size), source_(source) {}
scoped_memory() : data_(NULL), size_(0), source_(NONE_ALLOCATED) {}
// Calls HugeMalloc
scoped_memory(std::size_t to, bool zero_new);
#if __cplusplus >= 201103L
scoped_memory(scoped_memory &&from) noexcept
: data_(from.data_), size_(from.size_), source_(from.source_) {
from.steal();
}
#endif
~scoped_memory() { reset(); }
void *get() const { return data_; }
const char *begin() const { return reinterpret_cast<char*>(data_); }
char *begin() { return reinterpret_cast<char*>(data_); }
const char *end() const { return reinterpret_cast<char*>(data_) + size_; }
char *end() { return reinterpret_cast<char*>(data_) + size_; }
std::size_t size() const { return size_; }
Alloc source() const { return source_; }
void reset() { reset(NULL, 0, NONE_ALLOCATED); }
void reset(void *data, std::size_t size, Alloc from);
void *steal() {
void *ret = data_;
data_ = NULL;
size_ = 0;
source_ = NONE_ALLOCATED;
return ret;
}
private:
void *data_;
std::size_t size_;
Alloc source_;
scoped_memory(const scoped_memory &);
scoped_memory &operator=(const scoped_memory &);
};
extern const int kFileFlags;
// Cross-platform, error-checking wrapper for mmap().
void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0);
// msync wrapper
void SyncOrThrow(void *start, size_t length);
// Cross-platform, error-checking wrapper for munmap().
void UnmapOrThrow(void *start, size_t length);
// Allocate memory, promising that all/vast majority of it will be used. Tries
// hard to use huge pages on Linux.
// If you want zeroed memory, pass zeroed = true.
void HugeMalloc(std::size_t size, bool zeroed, scoped_memory &to);
// Reallocates memory ala realloc but with option to zero the new memory.
// On Linux, the memory can come from anonymous mmap or malloc/calloc.
// On non-Linux, only malloc/calloc is supported.
//
// To summarize, any memory from HugeMalloc or HugeRealloc can be resized with
// this.
void HugeRealloc(std::size_t size, bool new_zeroed, scoped_memory &mem);
enum LoadMethod {
// mmap with no prepopulate
LAZY,
// On linux, pass MAP_POPULATE to mmap.
POPULATE_OR_LAZY,
// Populate on Linux. malloc and read on non-Linux.
POPULATE_OR_READ,
// malloc and read.
READ,
// malloc and read in parallel (recommended for Lustre)
PARALLEL_READ,
};
void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out);
// Open file name with mmap of size bytes, all of which are initially zero.
void *MapZeroedWrite(int fd, std::size_t size);
void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file);
// Forward rolling memory map with no overlap.
class Rolling {
public:
Rolling() {}
explicit Rolling(void *data) { Init(data); }
Rolling(const Rolling &copy_from, uint64_t increase = 0);
Rolling &operator=(const Rolling &copy_from);
// For an actual rolling mmap.
explicit Rolling(int fd, bool for_write, std::size_t block, std::size_t read_bound, uint64_t offset, uint64_t amount);
// For a static mapping
void Init(void *data) {
ptr_ = data;
current_end_ = std::numeric_limits<uint64_t>::max();
current_begin_ = 0;
// Mark as a pass-through.
fd_ = -1;
}
void IncreaseBase(uint64_t by) {
file_begin_ += by;
ptr_ = static_cast<uint8_t*>(ptr_) + by;
if (!IsPassthrough()) current_end_ = 0;
}
void DecreaseBase(uint64_t by) {
file_begin_ -= by;
ptr_ = static_cast<uint8_t*>(ptr_) - by;
if (!IsPassthrough()) current_end_ = 0;
}
void *ExtractNonRolling(scoped_memory &out, uint64_t index, std::size_t size);
// Returns base pointer
void *get() const { return ptr_; }
// Returns base pointer.
void *CheckedBase(uint64_t index) {
if (index >= current_end_ || index < current_begin_) {
Roll(index);
}
return ptr_;
}
// Returns indexed pointer.
void *CheckedIndex(uint64_t index) {
return static_cast<uint8_t*>(CheckedBase(index)) + index;
}
private:
void Roll(uint64_t index);
// True if this is just a thin wrapper on a pointer.
bool IsPassthrough() const { return fd_ == -1; }
void *ptr_;
uint64_t current_begin_;
uint64_t current_end_;
scoped_memory mem_;
int fd_;
uint64_t file_begin_;
uint64_t file_end_;
bool for_write_;
std::size_t block_;
std::size_t read_bound_;
};
} // namespace util
#endif // UTIL_MMAP_H
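// Usage sketch (illustrative addition, not part of the original commit): allocate
// a zeroed buffer with HugeMalloc and grow it with HugeRealloc. scoped_memory
// frees or unmaps in its destructor according to how the memory was obtained.
// The ExampleHugeBuffer name and the sizes are hypothetical.
#include "mmap.hh"
#include <cstring>

inline void ExampleHugeBuffer() {
  util::scoped_memory mem;
  util::HugeMalloc(1 << 20, true /* zeroed */, mem); // 1 MB, zero filled
  std::memset(mem.get(), 0xff, mem.size());
  // Grow to 4 MB; the added tail is zeroed because zero_new is true.
  util::HugeRealloc(1 << 22, true, mem);
}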
#ifndef UTIL_MULTI_INTERSECTION_H
#define UTIL_MULTI_INTERSECTION_H
#include <boost/optional.hpp>
#include <boost/range/iterator_range.hpp>
#include <algorithm>
#include <cassert>
#include <functional>
#include <vector>
namespace util {
namespace detail {
template <class Range> struct RangeLessBySize : public std::binary_function<const Range &, const Range &, bool> {
bool operator()(const Range &left, const Range &right) const {
return left.size() < right.size();
}
};
/* Takes sets specified by their iterators and returns a boost::optional
* containing the lowest intersection, if any. Each set must be sorted in increasing
* order. sets is changed to truncate the beginning of each sequence to the
* location of the match or an empty set. Precondition: sets is not empty
* since the intersection over null is the universe and this function does not
* know the universe.
*/
template <class Iterator, class Less> boost::optional<typename std::iterator_traits<Iterator>::value_type> FirstIntersectionSorted(std::vector<boost::iterator_range<Iterator> > &sets, const Less &less = std::less<typename std::iterator_traits<Iterator>::value_type>()) {
typedef std::vector<boost::iterator_range<Iterator> > Sets;
typedef typename std::iterator_traits<Iterator>::value_type Value;
assert(!sets.empty());
if (sets.front().empty()) return boost::optional<Value>();
// Possibly suboptimal to copy for general Value; makes unsigned int go slightly faster.
Value highest(sets.front().front());
for (typename Sets::iterator i(sets.begin()); i != sets.end(); ) {
i->advance_begin(std::lower_bound(i->begin(), i->end(), highest, less) - i->begin());
if (i->empty()) return boost::optional<Value>();
if (less(highest, i->front())) {
highest = i->front();
// start over
i = sets.begin();
} else {
++i;
}
}
return boost::optional<Value>(highest);
}
} // namespace detail
template <class Iterator, class Less> boost::optional<typename std::iterator_traits<Iterator>::value_type> FirstIntersection(std::vector<boost::iterator_range<Iterator> > &sets, const Less less) {
assert(!sets.empty());
std::sort(sets.begin(), sets.end(), detail::RangeLessBySize<boost::iterator_range<Iterator> >());
return detail::FirstIntersectionSorted(sets, less);
}
template <class Iterator> boost::optional<typename std::iterator_traits<Iterator>::value_type> FirstIntersection(std::vector<boost::iterator_range<Iterator> > &sets) {
return FirstIntersection(sets, std::less<typename std::iterator_traits<Iterator>::value_type>());
}
template <class Iterator, class Output, class Less> void AllIntersection(std::vector<boost::iterator_range<Iterator> > &sets, Output &out, const Less less) {
typedef typename std::iterator_traits<Iterator>::value_type Value;
assert(!sets.empty());
std::sort(sets.begin(), sets.end(), detail::RangeLessBySize<boost::iterator_range<Iterator> >());
for (boost::optional<Value> ret; (ret = detail::FirstIntersectionSorted(sets, less)); sets.front().advance_begin(1)) {
out(*ret);
}
}
template <class Iterator, class Output> void AllIntersection(std::vector<boost::iterator_range<Iterator> > &sets, Output &out) {
AllIntersection(sets, out, std::less<typename std::iterator_traits<Iterator>::value_type>());
}
} // namespace util
#endif // UTIL_MULTI_INTERSECTION_H
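// Usage sketch (illustrative addition, not part of the original commit): collect
// every element common to several sorted ranges with AllIntersection. The output
// functor is called once per common element, in increasing order. The example
// namespace, the Collect functor, and the sample arrays are hypothetical.
#include "multi_intersection.hh"
#include <vector>

namespace example {
struct Collect {
  void operator()(unsigned int value) { found.push_back(value); }
  std::vector<unsigned int> found;
};

inline std::vector<unsigned int> AllCommon() {
  const unsigned int first[] = {1, 4, 17, 22};
  const unsigned int second[] = {4, 12, 17};
  std::vector<boost::iterator_range<const unsigned int*> > sets;
  sets.push_back(boost::iterator_range<const unsigned int*>(first, first + 4));
  sets.push_back(boost::iterator_range<const unsigned int*>(second, second + 3));
  Collect out;
  util::AllIntersection(sets, out); // out.found becomes {4, 17}
  return out.found;
}
} // namespace example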
#include "multi_intersection.hh"
#define BOOST_TEST_MODULE MultiIntersectionTest
#include <boost/test/unit_test.hpp>
namespace util {
namespace {
BOOST_AUTO_TEST_CASE(Empty) {
std::vector<boost::iterator_range<const unsigned int*> > sets;
sets.push_back(boost::iterator_range<const unsigned int*>(static_cast<const unsigned int*>(NULL), static_cast<const unsigned int*>(NULL)));
BOOST_CHECK(!FirstIntersection(sets));
}
BOOST_AUTO_TEST_CASE(Single) {
std::vector<unsigned int> nums;
nums.push_back(1);
nums.push_back(4);
nums.push_back(100);
std::vector<boost::iterator_range<std::vector<unsigned int>::const_iterator> > sets;
sets.push_back(nums);
boost::optional<unsigned int> ret(FirstIntersection(sets));
BOOST_REQUIRE(ret);
BOOST_CHECK_EQUAL(static_cast<unsigned int>(1), *ret);
}
template <class T, unsigned int len> boost::iterator_range<const T*> RangeFromArray(const T (&arr)[len]) {
return boost::iterator_range<const T*>(arr, arr + len);
}
BOOST_AUTO_TEST_CASE(MultiNone) {
unsigned int nums0[] = {1, 3, 4, 22};
unsigned int nums1[] = {2, 5, 12};
unsigned int nums2[] = {4, 17};
std::vector<boost::iterator_range<const unsigned int*> > sets;
sets.push_back(RangeFromArray(nums0));
sets.push_back(RangeFromArray(nums1));
sets.push_back(RangeFromArray(nums2));
BOOST_CHECK(!FirstIntersection(sets));
}
BOOST_AUTO_TEST_CASE(MultiOne) {
unsigned int nums0[] = {1, 3, 4, 17, 22};
unsigned int nums1[] = {2, 5, 12, 17};
unsigned int nums2[] = {4, 17};
std::vector<boost::iterator_range<const unsigned int*> > sets;
sets.push_back(RangeFromArray(nums0));
sets.push_back(RangeFromArray(nums1));
sets.push_back(RangeFromArray(nums2));
boost::optional<unsigned int> ret(FirstIntersection(sets));
BOOST_REQUIRE(ret);
BOOST_CHECK_EQUAL(static_cast<unsigned int>(17), *ret);
}
} // namespace
} // namespace util
/* Downloaded from http://sites.google.com/site/murmurhash/ which says "All
* code is released to the public domain. For business purposes, Murmurhash is
* under the MIT license."
* This is modified from the original:
* ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit.
* length changed to unsigned int.
* placed in namespace util
* add MurmurHashNative
* default option = 0 for seed
* ARM port from NICT
*/
#include "murmur_hash.hh"
#include <cstring>
namespace util {
//-----------------------------------------------------------------------------
// MurmurHash2, 64-bit versions, by Austin Appleby
// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
// and endian-ness issues if used across multiple platforms.
// 64-bit hash for 64-bit platforms
uint64_t MurmurHash64A ( const void * key, std::size_t len, uint64_t seed )
{
const uint64_t m = 0xc6a4a7935bd1e995ULL;
const int r = 47;
uint64_t h = seed ^ (len * m);
#if defined(__arm) || defined(__arm__)
const size_t ksize = sizeof(uint64_t);
const unsigned char * data = (const unsigned char *)key;
const unsigned char * end = data + (std::size_t)(len/8) * ksize;
#else
const uint64_t * data = (const uint64_t *)key;
const uint64_t * end = data + (len/8);
#endif
while(data != end)
{
#if defined(__arm) || defined(__arm__)
uint64_t k;
memcpy(&k, data, ksize);
data += ksize;
#else
uint64_t k = *data++;
#endif
k *= m;
k ^= k >> r;
k *= m;
h ^= k;
h *= m;
}
const unsigned char * data2 = (const unsigned char*)data;
switch(len & 7)
{
case 7: h ^= uint64_t(data2[6]) << 48;
case 6: h ^= uint64_t(data2[5]) << 40;
case 5: h ^= uint64_t(data2[4]) << 32;
case 4: h ^= uint64_t(data2[3]) << 24;
case 3: h ^= uint64_t(data2[2]) << 16;
case 2: h ^= uint64_t(data2[1]) << 8;
case 1: h ^= uint64_t(data2[0]);
h *= m;
};
h ^= h >> r;
h *= m;
h ^= h >> r;
return h;
}
// 64-bit hash for 32-bit platforms
uint64_t MurmurHash64B ( const void * key, std::size_t len, uint64_t seed )
{
const unsigned int m = 0x5bd1e995;
const int r = 24;
unsigned int h1 = seed ^ len;
unsigned int h2 = 0;
#if defined(__arm) || defined(__arm__)
size_t ksize = sizeof(unsigned int);
const unsigned char * data = (const unsigned char *)key;
#else
const unsigned int * data = (const unsigned int *)key;
#endif
unsigned int k1, k2;
while(len >= 8)
{
#if defined(__arm) || defined(__arm__)
memcpy(&k1, data, ksize);
data += ksize;
memcpy(&k2, data, ksize);
data += ksize;
#else
k1 = *data++;
k2 = *data++;
#endif
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
k2 *= m; k2 ^= k2 >> r; k2 *= m;
h2 *= m; h2 ^= k2;
len -= 4;
}
if(len >= 4)
{
#if defined(__arm) || defined(__arm__)
memcpy(&k1, data, ksize);
data += ksize;
#else
k1 = *data++;
#endif
k1 *= m; k1 ^= k1 >> r; k1 *= m;
h1 *= m; h1 ^= k1;
len -= 4;
}
switch(len)
{
case 3: h2 ^= ((unsigned char*)data)[2] << 16;
case 2: h2 ^= ((unsigned char*)data)[1] << 8;
case 1: h2 ^= ((unsigned char*)data)[0];
h2 *= m;
};
h1 ^= h2 >> 18; h1 *= m;
h2 ^= h1 >> 22; h2 *= m;
h1 ^= h2 >> 17; h1 *= m;
h2 ^= h1 >> 19; h2 *= m;
uint64_t h = h1;
h = (h << 32) | h2;
return h;
}
// Trick to test for 64-bit architecture at compile time.
namespace {
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-function"
#endif
template <unsigned L> inline uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, uint64_t seed) {
return MurmurHash64A(key, len, seed);
}
template <> inline uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, uint64_t seed) {
return MurmurHash64B(key, len, seed);
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif
} // namespace
uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed) {
return MurmurHashNativeBackend<sizeof(void*)>(key, len, seed);
}
} // namespace util
#ifndef UTIL_MURMUR_HASH_H
#define UTIL_MURMUR_HASH_H
#include <cstddef>
#include <stdint.h>
namespace util {
// 64-bit machine version
uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0);
// 32-bit machine version (not the same function as above)
uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0);
// Use the version for this arch. Because the values differ across
// architectures, really only use it for in-memory structures.
uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0);
} // namespace util
#endif // UTIL_MURMUR_HASH_H
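// Usage sketch (illustrative addition, not part of the original commit): hash a
// string key with MurmurHashNative. The value differs between 32-bit and 64-bit
// machines (MurmurHash64B vs MurmurHash64A), so only use it for in-memory
// structures. The ExampleHashKey name is hypothetical.
#include "murmur_hash.hh"
#include <string>

inline uint64_t ExampleHashKey(const std::string &key, uint64_t seed = 0) {
  return util::MurmurHashNative(key.data(), key.size(), seed);
}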
#include "parallel_read.hh"
#include "file.hh"
#ifdef WITH_THREADS
#include "thread_pool.hh"
namespace util {
namespace {
class Reader {
public:
explicit Reader(int fd) : fd_(fd) {}
struct Request {
void *to;
std::size_t size;
uint64_t offset;
bool operator==(const Request &other) const {
return (to == other.to) && (size == other.size) && (offset == other.offset);
}
};
void operator()(const Request &request) {
util::ErsatzPRead(fd_, request.to, request.size, request.offset);
}
private:
int fd_;
};
} // namespace
void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset) {
Reader::Request poison;
poison.to = NULL;
poison.size = 0;
poison.offset = 0;
unsigned threads = boost::thread::hardware_concurrency();
if (!threads) threads = 2;
ThreadPool<Reader> pool(2 /* don't need much of a queue */, threads, fd, poison);
const std::size_t kBatch = 1ULL << 25; // 32 MB
Reader::Request request;
request.to = to;
request.size = kBatch;
request.offset = offset;
for (; amount > kBatch; amount -= kBatch) {
pool.Produce(request);
request.to = reinterpret_cast<uint8_t*>(request.to) + kBatch;
request.offset += kBatch;
}
request.size = amount;
if (request.size) {
pool.Produce(request);
}
}
} // namespace util
#else // WITH_THREADS
namespace util {
void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset) {
util::ErsatzPRead(fd, to, amount, offset);
}
} // namespace util
#endif
#ifndef UTIL_PARALLEL_READ__
#define UTIL_PARALLEL_READ__
/* Read pieces of a file in parallel. This has a very specific use case:
* reading files from Lustre is CPU bound, so multiple threads actually
* increase throughput. Speed matters when an LM takes a terabyte.
*/
#include <cstddef>
#include <stdint.h>
namespace util {
void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset);
} // namespace util
#endif // UTIL_PARALLEL_READ__
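// Illustrative usage sketch (not part of the original sources): read an entire
// file into memory with ParallelRead. scoped_fd and OpenReadOrThrow come from
// file.hh as used elsewhere in this code; SizeOrThrow is assumed to be the
// file.hh helper that returns the file's length. With WITH_THREADS defined the
// call fans out into 32 MB pread requests across a thread pool; otherwise it
// degrades to a single ErsatzPRead.
#if 0 // example only
#include "parallel_read.hh"
#include "file.hh"
#include <vector>

std::vector<char> ReadWholeFile(const char *name) {
  util::scoped_fd fd(util::OpenReadOrThrow(name));
  std::vector<char> buffer(static_cast<std::size_t>(util::SizeOrThrow(fd.get())));
  if (!buffer.empty())
    util::ParallelRead(fd.get(), &buffer[0], buffer.size(), 0 /* offset */);
  return buffer;
}
#endif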
#ifndef UTIL_PCQUEUE_H
#define UTIL_PCQUEUE_H
#include "exception.hh"
#include <boost/interprocess/sync/interprocess_semaphore.hpp>
#include <boost/scoped_array.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/utility.hpp>
#include <cerrno>
#ifdef __APPLE__
#include <mach/semaphore.h>
#include <mach/task.h>
#include <mach/mach_traps.h>
#include <mach/mach.h>
#endif // __APPLE__
namespace util {
/* On OS X Mavericks, Boost interprocess semaphores were failing with
* "Function not implemented", so this is my own wrapper around the Mach
* kernel APIs.
*/
#ifdef __APPLE__
#define MACH_CALL(call) UTIL_THROW_IF(KERN_SUCCESS != (call), Exception, "Mach call failure")
class Semaphore {
public:
explicit Semaphore(int value) : task_(mach_task_self()) {
MACH_CALL(semaphore_create(task_, &back_, SYNC_POLICY_FIFO, value));
}
~Semaphore() {
MACH_CALL(semaphore_destroy(task_, back_));
}
void wait() {
MACH_CALL(semaphore_wait(back_));
}
void post() {
MACH_CALL(semaphore_signal(back_));
}
private:
semaphore_t back_;
task_t task_;
};
inline void WaitSemaphore(Semaphore &semaphore) {
semaphore.wait();
}
#else
typedef boost::interprocess::interprocess_semaphore Semaphore;
inline void WaitSemaphore (Semaphore &on) {
while (1) {
try {
on.wait();
break;
}
catch (boost::interprocess::interprocess_exception &e) {
if (e.get_native_error() != EINTR) {
throw;
}
}
}
}
#endif // __APPLE__
/**
* Producer consumer queue safe for multiple producers and multiple consumers.
* T must be default constructible and have operator=.
* The value is copied twice for Consume(T &out) or three times for Consume(),
* so larger objects should be passed via pointer.
* Strong exception guarantee if operator= throws. Undefined if semaphores throw.
*/
template <class T> class PCQueue : boost::noncopyable {
public:
explicit PCQueue(size_t size)
: empty_(size), used_(0),
storage_(new T[size]),
end_(storage_.get() + size),
produce_at_(storage_.get()),
consume_at_(storage_.get()) {}
// Add a value to the queue.
void Produce(const T &val) {
WaitSemaphore(empty_);
{
boost::unique_lock<boost::mutex> produce_lock(produce_at_mutex_);
try {
*produce_at_ = val;
}
catch (...) {
empty_.post();
throw;
}
if (++produce_at_ == end_) produce_at_ = storage_.get();
}
used_.post();
}
// Consume a value, assigning it to out.
T& Consume(T &out) {
WaitSemaphore(used_);
{
boost::unique_lock<boost::mutex> consume_lock(consume_at_mutex_);
try {
out = *consume_at_;
}
catch (...) {
used_.post();
throw;
}
if (++consume_at_ == end_) consume_at_ = storage_.get();
}
empty_.post();
return out;
}
// Convenience version of Consume that copies the value to return.
// The other version is faster.
T Consume() {
T ret;
Consume(ret);
return ret;
}
private:
// Number of empty spaces in storage_.
Semaphore empty_;
// Number of occupied spaces in storage_.
Semaphore used_;
boost::scoped_array<T> storage_;
T *const end_;
// Index for next write in storage_.
T *produce_at_;
boost::mutex produce_at_mutex_;
// Index for next read from storage_.
T *consume_at_;
boost::mutex consume_at_mutex_;
};
} // namespace util
#endif // UTIL_PCQUEUE_H
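// Illustrative usage sketch (not part of the original sources): one producer
// thread feeding a consumer through PCQueue, with -1 used as a poison value to
// signal the end of input. Larger objects should be passed through the queue
// as pointers, per the comment above.
#if 0 // example only
#include "pcqueue.hh"
#include <boost/thread/thread.hpp>
#include <iostream>

namespace {
void Produce(util::PCQueue<int> *queue) {
  for (int i = 0; i < 100; ++i) queue->Produce(i);
  queue->Produce(-1); // poison value: tells the consumer to stop
}
} // namespace

int main() {
  util::PCQueue<int> queue(16 /* capacity */);
  boost::thread producer(Produce, &queue);
  for (int got; (got = queue.Consume()) != -1;) {
    std::cout << got << '\n';
  }
  producer.join();
  return 0;
}
#endif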
#include "pcqueue.hh"
#define BOOST_TEST_MODULE PCQueueTest
#include <boost/test/unit_test.hpp>
namespace util {
namespace {
BOOST_AUTO_TEST_CASE(SingleThread) {
PCQueue<int> queue(10);
for (int i = 0; i < 10; ++i) {
queue.Produce(i);
}
for (int i = 0; i < 10; ++i) {
BOOST_CHECK_EQUAL(i, queue.Consume());
}
}
}
} // namespace util
#include "pool.hh"
#include "scoped.hh"
#include <cstdlib>
#include <algorithm>
namespace util {
Pool::Pool() {
current_ = NULL;
current_end_ = NULL;
}
Pool::~Pool() {
FreeAll();
}
void Pool::FreeAll() {
for (std::vector<void *>::const_iterator i(free_list_.begin()); i != free_list_.end(); ++i) {
free(*i);
}
free_list_.clear();
current_ = NULL;
current_end_ = NULL;
}
void *Pool::More(std::size_t size) {
std::size_t amount = std::max(static_cast<size_t>(32) << free_list_.size(), size);
uint8_t *ret = static_cast<uint8_t*>(MallocOrThrow(amount));
free_list_.push_back(ret);
current_ = ret + size;
current_end_ = ret + amount;
return ret;
}
} // namespace util
#ifndef UTIL_POOL_H
#define UTIL_POOL_H
#include <cassert>
#include <cstring>
#include <vector>
#include <stdint.h>
namespace util {
/* Very simple pool: it can only allocate memory, and all of the memory it
* allocates must be freed at the same time (via FreeAll).
*/
class Pool {
public:
Pool();
~Pool();
void *Allocate(std::size_t size) {
void *ret = current_;
current_ += size;
if (current_ > current_end_) {
ret = More(size);
}
#ifdef DEBUG
base_check_ = ret;
#endif
return ret;
}
/** Extend (or contract) the most recent allocation.
* @param base The base pointer of the allocation. This must have been
* returned by the MOST RECENT call to Allocate or Continue.
* @param additional Change in the size.
*
* In most cases, more memory from the same page is used, in which case
* base is unchanged and the function returns false.
* If the page runs out, a new page is created and the memory (from base)
* is copied. The function returns true.
*
* @return Whether the base had to be changed due to allocating a page.
*/
bool Continue(void *&base, std::ptrdiff_t additional) {
#ifdef DEBUG
assert(base == base_check_);
#endif
current_ += additional;
if (current_ > current_end_) {
std::size_t new_total = current_ - static_cast<uint8_t*>(base);
void *new_base = More(new_total);
std::memcpy(new_base, base, new_total - additional);
base = new_base;
#ifdef DEBUG
base_check_ = base;
#endif
return true;
}
return false;
}
void FreeAll();
private:
void *More(std::size_t size);
std::vector<void *> free_list_;
uint8_t *current_, *current_end_;
#ifdef DEBUG
// For debugging, check that Continue came from the most recent call.
void *base_check_;
#endif // DEBUG
// no copying
Pool(const Pool &);
Pool &operator=(const Pool &);
};
/**
* Pool designed to allow limited freeing.
* Keeps a linked list of free elements in the free spaces.
* Will not reduce in size until FreeAll is called.
*/
class FreePool {
public:
explicit FreePool(std::size_t element_size)
: free_list_(NULL),
element_size_(element_size),
padded_size_(std::max(element_size_, sizeof(void*))) {}
void *Allocate() {
if (free_list_) {
void *ret = free_list_;
free_list_ = *reinterpret_cast<void**>(free_list_);
return ret;
} else {
return backing_.Allocate(padded_size_);
}
}
void Free(void *ptr) {
*reinterpret_cast<void**>(ptr) = free_list_;
free_list_ = ptr;
}
std::size_t ElementSize() const { return element_size_; }
private:
void *free_list_;
Pool backing_;
const std::size_t element_size_;
const std::size_t padded_size_;
};
} // namespace util
#endif // UTIL_POOL_H
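// Illustrative usage sketch (not part of the original sources): growing the
// most recent Pool allocation with Continue, then recycling fixed-size records
// with FreePool. Continue may move the allocation to a fresh page, which is
// why the base pointer is passed by reference and the return value reports
// whether it changed.
#if 0 // example only
#include "pool.hh"
#include <cstring>

void Example() {
  util::Pool pool;
  // Start an allocation, then extend the most recent allocation in place
  // (or have it copied to a new page if the current one runs out).
  void *base = pool.Allocate(4);
  std::memcpy(base, "abcd", 4);
  bool moved = pool.Continue(base, 4); // base now spans 8 bytes
  std::memcpy(static_cast<char*>(base) + 4, "efgh", 4);
  (void)moved;
  pool.FreeAll(); // releases every allocation at once

  // FreePool hands out fixed-size elements and reuses freed ones.
  util::FreePool recycler(sizeof(double));
  void *a = recycler.Allocate();
  recycler.Free(a);
  void *b = recycler.Allocate(); // reuses the slot that was just freed
  (void)b;
}
#endif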
#ifndef UTIL_PROBING_HASH_TABLE_H
#define UTIL_PROBING_HASH_TABLE_H
#include "exception.hh"
#include "mmap.hh"
#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>
#include <cassert>
#include <stdint.h>
namespace util {
/* Thrown when table grows too large */
class ProbingSizeException : public Exception {
public:
ProbingSizeException() throw() {}
~ProbingSizeException() throw() {}
};
// std::identity is an SGI extension :-(
struct IdentityHash {
template <class T> T operator()(T arg) const { return arg; }
};
class DivMod {
public:
explicit DivMod(std::size_t buckets) : buckets_(buckets) {}
static uint64_t RoundBuckets(uint64_t from) {
return from;
}
template <class It> It Ideal(It begin, uint64_t hash) const {
return begin + (hash % buckets_);
}
template <class BaseIt, class OutIt> void Next(BaseIt begin, BaseIt end, OutIt &it) const {
if (++it == end) it = begin;
}
void Double() {
buckets_ *= 2;
}
private:
std::size_t buckets_;
};
class Power2Mod {
public:
explicit Power2Mod(std::size_t buckets) {
UTIL_THROW_IF(!buckets || (((buckets - 1) & buckets)), ProbingSizeException, "Size " << buckets << " is not a power of 2.");
mask_ = buckets - 1;
}
// Round up to next power of 2.
static uint64_t RoundBuckets(uint64_t from) {
--from;
from |= from >> 1;
from |= from >> 2;
from |= from >> 4;
from |= from >> 8;
from |= from >> 16;
from |= from >> 32;
return from + 1;
}
template <class It> It Ideal(It begin, uint64_t hash) const {
return begin + (hash & mask_);
}
template <class BaseIt, class OutIt> void Next(BaseIt begin, BaseIt /*end*/, OutIt &it) const {
it = begin + ((it - begin + 1) & mask_);
}
void Double() {
mask_ = (mask_ << 1) | 1;
}
private:
std::size_t mask_;
};
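// Worked example (added for illustration, not in the original header):
// RoundBuckets(47) computes 47 - 1 = 46 = 0b101110; or-ing in the shifted
// copies fills every bit below the top set bit, giving 0b111111 = 63; adding
// one yields 64, the next power of two. Power2Mod(64) then keeps mask_ = 63,
// so Ideal() reduces a hash with a single AND instead of the modulo that
// DivMod uses.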
template <class EntryT, class HashT, class EqualT> class AutoProbing;
/* Non-standard hash table
* The number of buckets must be set at the beginning and must be greater than
* the maximum number of elements, else it throws ProbingSizeException.
* Memory management and initialization is externalized to make it easier to
* serialize these to disk and load them quickly.
* Uses linear probing to find value.
* Only insert and lookup operations.
*/
template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key>, class ModT = DivMod> class ProbingHashTable {
public:
typedef EntryT Entry;
typedef typename Entry::Key Key;
typedef const Entry *ConstIterator;
typedef Entry *MutableIterator;
typedef HashT Hash;
typedef EqualT Equal;
typedef ModT Mod;
static uint64_t Size(uint64_t entries, float multiplier) {
uint64_t buckets = Mod::RoundBuckets(std::max(entries + 1, static_cast<uint64_t>(multiplier * static_cast<float>(entries))));
return buckets * sizeof(Entry);
}
// Must be assigned to later.
ProbingHashTable() : mod_(1), entries_(0)
#ifdef DEBUG
, initialized_(false)
#endif
{}
ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal())
: begin_(reinterpret_cast<MutableIterator>(start)),
end_(begin_ + allocated / sizeof(Entry)),
buckets_(end_ - begin_),
invalid_(invalid),
hash_(hash_func),
equal_(equal_func),
mod_(end_ - begin_),
entries_(0)
#ifdef DEBUG
, initialized_(true)
#endif
{}
void Relocate(void *new_base) {
begin_ = reinterpret_cast<MutableIterator>(new_base);
end_ = begin_ + buckets_;
}
MutableIterator Ideal(const Key key) {
return mod_.Ideal(begin_, hash_(key));
}
ConstIterator Ideal(const Key key) const {
return mod_.Ideal(begin_, hash_(key));
}
template <class T> MutableIterator Insert(const T &t) {
#ifdef DEBUG
assert(initialized_);
#endif
UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
return UncheckedInsert(t);
}
// Return true if the value was found (and not inserted). This is consistent with Find but the opposite of hash_map!
template <class T> bool FindOrInsert(const T &t, MutableIterator &out) {
#ifdef DEBUG
assert(initialized_);
#endif
for (MutableIterator i = Ideal(t.GetKey());;mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, t.GetKey())) { out = i; return true; }
if (equal_(got, invalid_)) {
UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
*i = t;
out = i;
return false;
}
}
}
void FinishedInserting() {}
// Like Find, but the iterator is mutable; don't change anything related to GetKey.
template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) {
#ifdef DEBUG
assert(initialized_);
#endif
for (MutableIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, key)) { out = i; return true; }
if (equal_(got, invalid_)) return false;
}
}
// Like UnsafeMutableFind, but the key must be there.
template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) {
for (MutableIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, key)) { return i; }
assert(!equal_(got, invalid_));
}
}
// Iterator is both input and output.
template <class Key> bool FindFromIdeal(const Key key, ConstIterator &i) const {
#ifdef DEBUG
assert(initialized_);
#endif
for (;; mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, key)) return true;
if (equal_(got, invalid_)) return false;
}
}
template <class Key> bool Find(const Key key, ConstIterator &out) const {
out = Ideal(key);
return FindFromIdeal(key, out);
}
// Like Find but we're sure it must be there.
template <class Key> ConstIterator MustFind(const Key key) const {
for (ConstIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, key)) { return i; }
assert(!equal_(got, invalid_));
}
}
void Clear() {
Entry invalid;
invalid.SetKey(invalid_);
std::fill(begin_, end_, invalid);
entries_ = 0;
}
// Return number of entries assuming no serialization went on.
std::size_t SizeNoSerialization() const {
return entries_;
}
// Return memory size expected by Double.
std::size_t DoubleTo() const {
return buckets_ * 2 * sizeof(Entry);
}
// Inform the table that it has double the amount of memory.
// Pass clear_new = false if you are sure the new memory is already
// initialized properly (i.e. filled with invalid_), e.g. by mremap.
void Double(void *new_base, bool clear_new = true) {
begin_ = static_cast<MutableIterator>(new_base);
MutableIterator old_end = begin_ + buckets_;
buckets_ *= 2;
end_ = begin_ + buckets_;
mod_.Double();
if (clear_new) {
Entry invalid;
invalid.SetKey(invalid_);
std::fill(old_end, end_, invalid);
}
std::vector<Entry> rolled_over;
// Move roll-over entries to a buffer because they might not roll over anymore. This buffer should stay small.
for (MutableIterator i = begin_; i != old_end && !equal_(i->GetKey(), invalid_); ++i) {
rolled_over.push_back(*i);
i->SetKey(invalid_);
}
/* Re-insert everything. Entries might go backwards to take over a
* recently opened gap, stay, move to new territory, or wrap around. If
* an entry wraps around, it might go to a pointer greater than i (which
* can happen at the beginning) and it will be revisited to possibly fill
* in a gap created later.
*/
Entry temp;
for (MutableIterator i = begin_; i != old_end; ++i) {
if (!equal_(i->GetKey(), invalid_)) {
temp = *i;
i->SetKey(invalid_);
UncheckedInsert(temp);
}
}
// Put the roll-over entries back in.
for (typename std::vector<Entry>::const_iterator i(rolled_over.begin()); i != rolled_over.end(); ++i) {
UncheckedInsert(*i);
}
}
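// Worked example (added for illustration, not in the original header): with
// IdentityHash, 4 buckets and invalid key 0, keys 3 and 7 both have ideal
// bucket 3, so 7 wraps around into bucket 0. Doubling to 8 buckets first
// pulls the wrapped key 7 out of bucket 0 into rolled_over, re-inserts key 3
// at its ideal bucket 3, and finally re-inserts 7 at its new ideal bucket 7.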
// Mostly for tests, check consistency of every entry.
void CheckConsistency() {
MutableIterator last;
for (last = end_ - 1; last >= begin_ && !equal_(last->GetKey(), invalid_); --last) {}
UTIL_THROW_IF(last == begin_, ProbingSizeException, "Completely full");
MutableIterator i;
// Entries at the beginning can be wrap-arounds from the end.
for (i = begin_; !equal_(i->GetKey(), invalid_); ++i) {
MutableIterator ideal = Ideal(i->GetKey());
UTIL_THROW_IF(ideal > i && ideal <= last, Exception, "Inconsistency at position " << (i - begin_) << " should be at " << (ideal - begin_));
}
MutableIterator pre_gap = i;
for (; i != end_; ++i) {
if (equal_(i->GetKey(), invalid_)) {
pre_gap = i;
continue;
}
MutableIterator ideal = Ideal(i->GetKey());
UTIL_THROW_IF(ideal > i || ideal <= pre_gap, Exception, "Inconsistency at position " << (i - begin_) << " with ideal " << (ideal - begin_));
}
}
ConstIterator RawBegin() const {
return begin_;
}
ConstIterator RawEnd() const {
return end_;
}
private:
friend class AutoProbing<Entry, Hash, Equal>;
template <class T> MutableIterator UncheckedInsert(const T &t) {
for (MutableIterator i(Ideal(t.GetKey()));; mod_.Next(begin_, end_, i)) {
if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
}
}
MutableIterator begin_;
MutableIterator end_;
std::size_t buckets_;
Key invalid_;
Hash hash_;
Equal equal_;
Mod mod_;
std::size_t entries_;
#ifdef DEBUG
bool initialized_;
#endif
};
// Resizable linear probing hash table. This owns the memory.
template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class AutoProbing {
private:
typedef ProbingHashTable<EntryT, HashT, EqualT, Power2Mod> Backend;
public:
static std::size_t MemUsage(std::size_t size, float multiplier = 1.5) {
return Backend::Size(size, multiplier);
}
typedef EntryT Entry;
typedef typename Entry::Key Key;
typedef const Entry *ConstIterator;
typedef Entry *MutableIterator;
typedef HashT Hash;
typedef EqualT Equal;
AutoProbing(std::size_t initial_size = 5, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) :
allocated_(Backend::Size(initial_size, 1.2)), mem_(allocated_, KeyIsRawZero(invalid)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) {
threshold_ = std::min<std::size_t>(backend_.buckets_ - 1, backend_.buckets_ * 0.9);
if (!KeyIsRawZero(invalid)) {
Clear();
}
}
// Assumes that the key is unique. Multiple insertions won't cause a failure, just inconsistent lookup.
template <class T> MutableIterator Insert(const T &t) {
++backend_.entries_;
DoubleIfNeeded();
return backend_.UncheckedInsert(t);
}
template <class T> bool FindOrInsert(const T &t, MutableIterator &out) {
DoubleIfNeeded();
return backend_.FindOrInsert(t, out);
}
template <class Key> bool UnsafeMutableFind(const Key key, MutableIterator &out) {
return backend_.UnsafeMutableFind(key, out);
}
template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) {
return backend_.UnsafeMutableMustFind(key);
}
template <class Key> bool Find(const Key key, ConstIterator &out) const {
return backend_.Find(key, out);
}
template <class Key> ConstIterator MustFind(const Key key) const {
return backend_.MustFind(key);
}
std::size_t Size() const {
return backend_.SizeNoSerialization();
}
void Clear() {
backend_.Clear();
}
ConstIterator RawBegin() const {
return backend_.RawBegin();
}
ConstIterator RawEnd() const {
return backend_.RawEnd();
}
private:
void DoubleIfNeeded() {
if (UTIL_LIKELY(Size() < threshold_))
return;
HugeRealloc(backend_.DoubleTo(), KeyIsRawZero(backend_.invalid_), mem_);
allocated_ = backend_.DoubleTo();
backend_.Double(mem_.get(), !KeyIsRawZero(backend_.invalid_));
threshold_ = std::min<std::size_t>(backend_.buckets_ - 1, backend_.buckets_ * 0.9);
}
bool KeyIsRawZero(const Key &key) {
for (const uint8_t *i = reinterpret_cast<const uint8_t*>(&key); i < reinterpret_cast<const uint8_t*>(&key) + sizeof(Key); ++i) {
if (*i) return false;
}
return true;
}
std::size_t allocated_;
util::scoped_memory mem_;
Backend backend_;
std::size_t threshold_;
};
} // namespace util
#endif // UTIL_PROBING_HASH_TABLE_H
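// Illustrative usage sketch (not part of the original sources): a fixed-size
// ProbingHashTable over caller-owned memory plus the self-managing AutoProbing
// variant. The entry type only needs a Key typedef plus GetKey()/SetKey();
// key 0 serves as the invalid (empty bucket) marker here, and the backing
// memory is zeroed so every bucket starts out invalid, mirroring the
// allocation pattern in the benchmark below.
#if 0 // example only
#include "probing_hash_table.hh"
#include "murmur_hash.hh"
#include "mmap.hh"
#include "scoped.hh"

namespace {
struct CountEntry {
  typedef uint64_t Key;
  uint64_t key;
  unsigned count;
  uint64_t GetKey() const { return key; }
  void SetKey(uint64_t to) { key = to; }
};
} // namespace

void Example() {
  typedef util::ProbingHashTable<CountEntry, util::IdentityHash> Table;
  const uint64_t max_entries = 1000;
  const std::size_t bytes = static_cast<std::size_t>(Table::Size(max_entries, 1.5));
  util::scoped_memory backing;
  backing.reset(util::CallocOrThrow(bytes), bytes, util::scoped_memory::MALLOC_ALLOCATED);
  Table table(backing.get(), bytes); // invalid key defaults to Key() == 0

  CountEntry entry;
  entry.key = util::MurmurHash64A("hello", 5);
  entry.count = 1;
  Table::MutableIterator it;
  if (table.FindOrInsert(entry, it)) {
    ++it->count; // FindOrInsert returns true when the key was already present
  }

  // AutoProbing owns its memory and doubles it as entries are added.
  util::AutoProbing<CountEntry, util::IdentityHash> growing;
  growing.Insert(entry);
}
#endif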
#include "file.hh"
#include "probing_hash_table.hh"
#include "mmap.hh"
#include "usage.hh"
#include "thread_pool.hh"
#include <boost/thread/mutex.hpp>
#include <boost/thread/locks.hpp>
#ifdef WIN32
#include <windows.h>
#include <processthreadsapi.h>
#else
#include <sys/resource.h>
#include <sys/time.h>
#endif
#include <iostream>
namespace util {
namespace {
struct Entry {
typedef uint64_t Key;
Key key;
Key GetKey() const { return key; }
};
// I don't care if this doesn't run on Windows. Empirically /dev/urandom was faster than boost::random's Mersenne Twister.
class URandom {
public:
URandom() :
it_(buf_ + 1024), end_(buf_ + 1024),
file_(util::OpenReadOrThrow("/dev/urandom")) {}
uint64_t Get() {
if (it_ == end_) {
util::ReadOrThrow(file_.get(), buf_, sizeof(buf_));
it_ = buf_;
}
return *it_++;
}
void Batch(uint64_t *begin, uint64_t *end) {
util::ReadOrThrow(file_.get(), begin, (end - begin) * sizeof(uint64_t));
}
private:
uint64_t buf_[1024];
uint64_t *it_, *end_;
util::scoped_fd file_;
};
struct PrefetchEntry {
uint64_t key;
const Entry *pointer;
};
template <class TableT, unsigned PrefetchSize> class PrefetchQueue {
public:
typedef TableT Table;
explicit PrefetchQueue(Table &table) : table_(table), cur_(0), twiddle_(false) {
for (PrefetchEntry *i = entries_; i != entries_ + PrefetchSize; ++i)
i->pointer = NULL;
}
void Add(uint64_t key) {
if (Cur().pointer) {
twiddle_ ^= table_.FindFromIdeal(Cur().key, Cur().pointer);
}
Cur().key = key;
Cur().pointer = table_.Ideal(key);
__builtin_prefetch(Cur().pointer, 0, 0);
Next();
}
bool Drain() {
if (Cur().pointer) {
for (PrefetchEntry *i = &Cur(); i < entries_ + PrefetchSize; ++i) {
twiddle_ ^= table_.FindFromIdeal(i->key, i->pointer);
}
}
for (PrefetchEntry *i = entries_; i < &Cur(); ++i) {
twiddle_ ^= table_.FindFromIdeal(i->key, i->pointer);
}
return twiddle_;
}
private:
PrefetchEntry &Cur() { return entries_[cur_]; }
void Next() {
++cur_;
cur_ = cur_ % PrefetchSize;
}
Table &table_;
PrefetchEntry entries_[PrefetchSize];
std::size_t cur_;
bool twiddle_;
PrefetchQueue(const PrefetchQueue&);
void operator=(const PrefetchQueue&);
};
template <class TableT> class Immediate {
public:
typedef TableT Table;
explicit Immediate(Table &table) : table_(table), twiddle_(false) {}
void Add(uint64_t key) {
typename Table::ConstIterator it;
twiddle_ ^= table_.Find(key, it);
}
bool Drain() const { return twiddle_; }
private:
Table &table_;
bool twiddle_;
};
std::size_t Size(uint64_t entries, float multiplier = 1.5) {
typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Power2Mod> Table;
// Always round up to a power of 2 for fair comparison.
return Power2Mod::RoundBuckets(Table::Size(entries, multiplier) / sizeof(Entry)) * sizeof(Entry);
}
template <class Queue> bool Test(URandom &rn, uint64_t entries, const uint64_t *const queries_begin, const uint64_t *const queries_end, bool ordinary_malloc, float multiplier = 1.5) {
std::size_t size = Size(entries, multiplier);
scoped_memory backing;
if (ordinary_malloc) {
backing.reset(util::CallocOrThrow(size), size, scoped_memory::MALLOC_ALLOCATED);
} else {
util::HugeMalloc(size, true, backing);
}
typename Queue::Table table(backing.get(), size);
double start = CPUTime();
for (uint64_t i = 0; i < entries; ++i) {
Entry entry;
entry.key = rn.Get();
table.Insert(entry);
}
double inserted = CPUTime() - start;
double before_lookup = CPUTime();
Queue queue(table);
for (const uint64_t *i = queries_begin; i != queries_end; ++i) {
queue.Add(*i);
}
bool meaningless = queue.Drain();
std::cout << ' ' << (inserted / static_cast<double>(entries)) << ' ' << (CPUTime() - before_lookup) / static_cast<double>(queries_end - queries_begin) << std::flush;
return meaningless;
}
bool TestRun(uint64_t lookups = 20000000, float multiplier = 1.5) {
URandom rn;
util::scoped_memory queries;
HugeMalloc(lookups * sizeof(uint64_t), true, queries);
rn.Batch(static_cast<uint64_t*>(queries.get()), static_cast<uint64_t*>(queries.get()) + lookups);
uint64_t physical_mem_limit = util::GuessPhysicalMemory() / 2;
bool meaningless = true;
for (uint64_t i = 4; Size(i / multiplier) < physical_mem_limit; i *= 4) {
std::cout << static_cast<std::size_t>(i / multiplier) << ' ' << Size(i / multiplier);
typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Power2Mod> Table;
typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, DivMod> TableDiv;
const uint64_t *const queries_begin = static_cast<const uint64_t*>(queries.get());
meaningless ^= util::Test<Immediate<TableDiv> >(rn, i / multiplier, queries_begin, queries_begin + lookups, true, multiplier);
meaningless ^= util::Test<Immediate<Table> >(rn, i / multiplier, queries_begin, queries_begin + lookups, true, multiplier);
meaningless ^= util::Test<PrefetchQueue<Table, 4> >(rn, i / multiplier, queries_begin, queries_begin + lookups, true, multiplier);
meaningless ^= util::Test<Immediate<Table> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
meaningless ^= util::Test<PrefetchQueue<Table, 2> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
meaningless ^= util::Test<PrefetchQueue<Table, 4> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
meaningless ^= util::Test<PrefetchQueue<Table, 8> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
meaningless ^= util::Test<PrefetchQueue<Table, 16> >(rn, i / multiplier, queries_begin, queries_begin + lookups, false, multiplier);
std::cout << std::endl;
}
return meaningless;
}
template<class Table>
struct ParallelTestRequest{
ParallelTestRequest() : queries_begin_(NULL), queries_end_(NULL), table_(NULL) {}
ParallelTestRequest(const uint64_t *queries_begin, const uint64_t *queries_end, Table *table) :
queries_begin_(queries_begin),
queries_end_(queries_end),
table_(table) {}
bool operator==(const ParallelTestRequest &rhs) const {
return this->queries_begin_ == rhs.queries_begin_ && this->queries_end_ == rhs.queries_end_;
}
const uint64_t *queries_begin_;
const uint64_t *queries_end_;
Table * table_;
};
template <class TableT>
struct ParallelTestConstruct{
ParallelTestConstruct(boost::mutex& lock, const uint64_t* const burn_begin, const uint64_t* const burn_end, TableT* table) : lock_(lock), burn_begin_(burn_begin), burn_end_(burn_end), table_(table){}
boost::mutex& lock_;
const uint64_t* const burn_begin_;
const uint64_t* const burn_end_;
TableT* table_;
};
template<class Queue>
struct ParallelTestHandler{
typedef ParallelTestRequest<typename Queue::Table> Request;
explicit ParallelTestHandler(const ParallelTestConstruct<typename Queue::Table>& construct) : lock_(construct.lock_), totalTime_(0.0), nRequests_(0), nQueries_(0), error_(false), twiddle_(false){
// Perform the initial burn-in lookups before any timing starts.
for(const uint64_t* i = construct.burn_begin_; i < construct.burn_end_; i++){
typename Queue::Table::ConstIterator it;
twiddle_ ^= construct.table_->Find(*i, it);
}
}
void operator()(Request request){
if (error_) return;
Queue queue(*request.table_);
double start = ThreadTime();
if(start < 0.0){
error_ = true;
return;
}
for(const uint64_t *i = request.queries_begin_; i != request.queries_end_; ++i){
queue.Add(*i);
}
twiddle_ ^= queue.Drain();
double end = ThreadTime();
if(end < 0.0){
error_ = true;
return;
}
totalTime_ += end - start;
nQueries_ += request.queries_end_ - request.queries_begin_;
++nRequests_;
}
virtual ~ParallelTestHandler() {
boost::unique_lock<boost::mutex> produce_lock(lock_);
if (error_){
std::cout << "Error ";
}
else {
std::cout << nRequests_ << ' ' << nQueries_ << ' ' << totalTime_ << std::endl;
}
std::cerr << "Meaningless " << twiddle_ << std::endl;
}
private:
boost::mutex &lock_;
double totalTime_;
std::size_t nRequests_;
std::size_t nQueries_;
bool error_;
bool twiddle_;
};
template<class Queue>
void ParallelTest(typename Queue::Table* table, const uint64_t *const queries_begin,
const uint64_t *const queries_end, std::size_t num_threads,
std::size_t tasks_per_thread, std::size_t burn){
boost::mutex lock;
ParallelTestConstruct<typename Queue::Table> construct(lock, queries_begin, queries_begin + burn, table);
ParallelTestRequest<typename Queue::Table> poison(NULL, NULL, NULL);
{
util::ThreadPool<ParallelTestHandler<Queue> > pool(num_threads, num_threads, construct, poison);
const uint64_t queries_per_thread =(static_cast<uint64_t>(queries_end-queries_begin-burn)/num_threads)/tasks_per_thread;
for (const uint64_t *i = queries_begin+burn; i + queries_per_thread <= queries_end; i += queries_per_thread){
ParallelTestRequest<typename Queue::Table> request(i, i+queries_per_thread, table);
pool.Produce(request);
}
} // pool gets deallocated and all jobs finish
std::cout << std::endl;
}
void ParallelTestRun(std::size_t tasks_per_thread = 1, std::size_t burn = 4000, uint64_t lookups = 20000000, float multiplier = 1.5) {
URandom rn;
util::scoped_memory queries;
HugeMalloc((lookups + burn)* sizeof(uint64_t), true, queries);
rn.Batch(static_cast<uint64_t*>(queries.get()), static_cast<uint64_t*>(queries.get()) + lookups + burn);
const uint64_t *const queries_begin = static_cast<const uint64_t*>(queries.get());
const uint64_t *const queries_end = queries_begin + lookups + burn;
typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Power2Mod> Table;
uint64_t physical_mem_limit = util::GuessPhysicalMemory() / 2;
for (uint64_t i = 4; Size(i / multiplier, multiplier) < physical_mem_limit; i *= 4) {
std::size_t entries = static_cast<std::size_t>(i / multiplier);
std::size_t size = Size(i/multiplier, multiplier);
scoped_memory backing;
util::HugeMalloc(size, true, backing);
Table table(backing.get(), size);
for (uint64_t j = 0; j < entries; ++j) {
Entry entry;
entry.key = rn.Get();
table.Insert(entry);
}
for(std::size_t num_threads = 1; num_threads <= 16; num_threads*=2){
std::cout << entries << ' ' << size << ' ' << num_threads << ' ' << std::endl;
util::ParallelTest<Immediate<Table> >(&table, queries_begin, queries_end, num_threads, tasks_per_thread, burn);
util::ParallelTest<PrefetchQueue<Table, 2> >(&table, queries_begin, queries_end, num_threads, tasks_per_thread, burn);
util::ParallelTest<PrefetchQueue<Table, 4> >(&table, queries_begin, queries_end, num_threads, tasks_per_thread, burn);
util::ParallelTest<PrefetchQueue<Table, 8> >(&table, queries_begin, queries_end, num_threads, tasks_per_thread, burn);
util::ParallelTest<PrefetchQueue<Table, 16> >(&table, queries_begin, queries_end, num_threads, tasks_per_thread, burn);
}
}
}
} // namespace
} // namespace util
int main() {
//bool meaningless = false;
std::cout << "#CPU time\n";
//meaningless ^= util::TestRun();
util::ParallelTestRun(10, 4000);
//std::cerr << "Meaningless: " << meaningless << '\n';
}