Commit 688b6eac authored by SWHL's avatar SWHL
Browse files

Update files

parents
#include "probing_hash_table.hh"
#include "murmur_hash.hh"
#include "scoped.hh"
#define BOOST_TEST_MODULE ProbingHashTableTest
#include <boost/test/unit_test.hpp>
#include <boost/scoped_array.hpp>
#include <boost/functional/hash.hpp>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <stdint.h>
namespace util {
namespace {
// Minimal table entry for the test: a one-byte key with a 64-bit payload.
struct Entry {
  typedef unsigned char Key;

  // Data members keep their original relative order (key, then value).
  unsigned char key;
  uint64_t value;

  // Key accessors.
  unsigned char GetKey() const { return key; }
  void SetKey(unsigned char to) { key = to; }

  // Payload accessor.
  uint64_t GetValue() const { return value; }
};
// Table type under test: Entry records hashed with boost::hash on the one-byte key.
typedef ProbingHashTable<Entry, boost::hash<unsigned char> > Table;
// Basic insert/lookup check: an absent key is not found, an inserted key is
// found together with its value, and an unrelated key stays absent afterwards.
BOOST_AUTO_TEST_CASE(simple) {
// Ask the table how many bytes it needs (presumably 10 entries with a 1.2
// space multiplier -- confirm against ProbingHashTable::Size).
size_t size = Table::Size(10, 1.2);
boost::scoped_array<char> mem(new char[size]);
// The buffer is zeroed before being handed to the table.
memset(mem.get(), 0, size);
Table table(mem.get(), size);
const Entry *i = NULL;
// Nothing has been inserted yet.
BOOST_CHECK(!table.Find(2, i));
Entry to_ins;
to_ins.key = 3;
to_ins.value = 328920;
table.Insert(to_ins);
// REQUIRE rather than CHECK: the following lines dereference i.
BOOST_REQUIRE(table.Find(3, i));
BOOST_CHECK_EQUAL(3, i->GetKey());
BOOST_CHECK_EQUAL(static_cast<uint64_t>(328920), i->GetValue());
// A key that was never inserted is still missing.
BOOST_CHECK(!table.Find(2, i));
}
// Key-only table entry with a 64-bit key, used by the doubling test.
struct Entry64 {
  typedef uint64_t Key;

  uint64_t key;

  // Leaves key uninitialized, exactly like a plain POD.
  Entry64() {}

  // Build an entry holding the given key.
  explicit Entry64(uint64_t key_in) { key = key_in; }

  Key GetKey() const {
    return key;
  }

  void SetKey(uint64_t to) {
    key = to;
  }
};
// Hash functor for 64-bit keys: feeds the 8 bytes of the key to
// util::MurmurHash64A and returns the result as std::size_t.
struct MurmurHashEntry64 {
std::size_t operator()(uint64_t value) const {
// 8 is the byte length of the uint64_t being hashed.
return util::MurmurHash64A(&value, 8);
}
};
// Table of 64-bit key-only entries hashed with MurmurHashEntry64.
typedef ProbingHashTable<Entry64, MurmurHashEntry64> Table64;
// Exercises in-place doubling: for a range of initial capacities, fill the
// table, grow the backing allocation, call Double(), and verify consistency.
BOOST_AUTO_TEST_CASE(Double) {
for (std::size_t initial = 19; initial < 30; ++initial) {
size_t size = Table64::Size(initial, 1.2);
scoped_malloc mem(MallocOrThrow(size));
// The third argument is presumably the key value marking an empty slot --
// confirm against the ProbingHashTable constructor.
Table64 table(mem.get(), size, std::numeric_limits<uint64_t>::max());
table.Clear();
// Insert keys 0..18.
for (uint64_t i = 0; i < 19; ++i) {
table.Insert(Entry64(i));
}
table.CheckConsistency();
// Grow the allocation to the size the table asks for, then rehash into it.
mem.call_realloc(table.DoubleTo());
table.Double(mem.get());
table.CheckConsistency();
// Insert keys 20..39 (key 19 is never inserted).
for (uint64_t i = 20; i < 40 ; ++i) {
table.Insert(Entry64(i));
}
// Double a second time with the larger population.
mem.call_realloc(table.DoubleTo());
table.Double(mem.get());
table.CheckConsistency();
}
}
} // namespace
} // namespace util
#ifndef UTIL_PROXY_ITERATOR_H
#define UTIL_PROXY_ITERATOR_H
#include <cstddef>
#include <iterator>
/* This is a RandomAccessIterator that uses a proxy to access the underlying
* data. Useful for packing data at bit offsets but still using STL
* algorithms.
*
* Normally I would use boost::iterator_facade but some people are too lazy to
* install boost and still want to use my language model. It's amazing how
* many operators an iterator has.
*
* The Proxy needs to provide:
* class InnerIterator;
* InnerIterator &Inner();
* const InnerIterator &Inner() const;
*
* InnerIterator has to implement:
* operator==(InnerIterator)
* operator<(InnerIterator)
* operator+=(std::ptrdiff_t)
* operator-(InnerIterator)
* and of course whatever Proxy needs to dereference it.
*
* It's also a good idea to specialize std::swap for Proxy.
*/
namespace util {
// Random-access iterator built around a Proxy object. All positional
// operations (comparison, advance, distance) are forwarded to the proxy's
// InnerIterator; dereferencing yields a copy of the proxy itself.
template <class Proxy> class ProxyIterator {
private:
// Self.
typedef ProxyIterator<Proxy> S;
typedef typename Proxy::InnerIterator InnerIterator;
public:
// Standard iterator typedefs. Note that reference is the Proxy by value.
typedef std::random_access_iterator_tag iterator_category;
typedef typename Proxy::value_type value_type;
typedef std::ptrdiff_t difference_type;
typedef Proxy reference;
// NOTE(review): pointer names the iterator type while operator-> returns
// Proxy *; confirm this mismatch is intentional.
typedef ProxyIterator<Proxy> * pointer;
// Default construction leaves p_ default-constructed.
ProxyIterator() {}
// For cast from non const to const.
template <class AlternateProxy> ProxyIterator(const ProxyIterator<AlternateProxy> &in) : p_(*in) {}
// Wrap an existing proxy; the iterator points wherever the proxy points.
explicit ProxyIterator(const Proxy &p) : p_(p) {}
/* // p_'s swap does value swapping, but here we want iterator swapping
friend inline void swap(ProxyIterator<Proxy> &first, ProxyIterator<Proxy> &second) {
swap(first.I(), second.I());
}*/
// p_'s operator= does value copying, but here we want iterator copying.
S &operator=(const S &other) {
I() = other.I();
return *this;
}
// Comparisons: == and < go to the inner iterator, the rest derive from them.
bool operator==(const S &other) const { return I() == other.I(); }
bool operator!=(const S &other) const { return !(*this == other); }
bool operator<(const S &other) const { return I() < other.I(); }
bool operator>(const S &other) const { return other < *this; }
bool operator<=(const S &other) const { return !(*this > other); }
bool operator>=(const S &other) const { return !(*this < other); }
// Movement: everything is expressed through the inner iterator's +=.
S &operator++() { return *this += 1; }
S operator++(int) { S ret(*this); ++*this; return ret; }
S &operator+=(std::ptrdiff_t amount) { I() += amount; return *this; }
S operator+(std::ptrdiff_t amount) const { S ret(*this); ret += amount; return ret; }
S &operator--() { return *this -= 1; }
S operator--(int) { S ret(*this); --*this; return ret; }
// Subtraction is implemented as adding the negated amount.
S &operator-=(std::ptrdiff_t amount) { I() += (-amount); return *this; }
S operator-(std::ptrdiff_t amount) const { S ret(*this); ret -= amount; return ret; }
// Distance between two iterators, as reported by the inner iterator.
std::ptrdiff_t operator-(const S &other) const { return I() - other.I(); }
// Dereference returns the proxy by value.
Proxy operator*() const { return p_; }
Proxy *operator->() { return &p_; }
const Proxy *operator->() const { return &p_; }
// Indexing builds a temporary iterator at the offset and dereferences it.
Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); }
// Read-only access to the inner iterator (this member is not const-qualified).
const InnerIterator &Inner() { return p_.Inner(); }
private:
// Shorthand accessors for the proxy's inner iterator.
InnerIterator &I() { return p_.Inner(); }
const InnerIterator &I() const { return p_.Inner(); }
Proxy p_;
};
// Commutative form of iterator + offset: returns a copy of `it` advanced by `amount`.
template <class Proxy> ProxyIterator<Proxy> operator+(std::ptrdiff_t amount, const ProxyIterator<Proxy> &it) {
  ProxyIterator<Proxy> shifted(it);
  shifted += amount;
  return shifted;
}
} // namespace util
#endif // UTIL_PROXY_ITERATOR_H
#include "read_compressed.hh"
#include "file.hh"
#include "have.hh"
#include "scoped.hh"
#include <algorithm>
#include <iostream>
#include <cassert>
#include <climits>
#include <cstdlib>
#include <cstring>
#ifdef HAVE_ZLIB
#include <zlib.h>
#endif
#ifdef HAVE_BZLIB
#include <bzlib.h>
#endif
#ifdef HAVE_XZLIB
#include <lzma.h>
#endif
namespace util {
// Out-of-line, empty, non-throwing constructors and destructors for the
// compression exception classes.
CompressedException::CompressedException() throw() {}
CompressedException::~CompressedException() throw() {}
GZException::GZException() throw() {}
GZException::~GZException() throw() {}
BZException::BZException() throw() {}
BZException::~BZException() throw() {}
XZException::XZException() throw() {}
XZException::~XZException() throw() {}
// Install `with` as the thunk's active reader. scoped_ptr::reset releases the
// previously held reader, so a reader calling this on its own thunk must not
// touch its members afterwards.
void ReadBase::ReplaceThis(ReadBase *with, ReadCompressed &thunk) {
thunk.internal_.reset(with);
}
// Return the reader currently installed in the thunk.
ReadBase *ReadBase::Current(ReadCompressed &thunk) { return thunk.internal_.get(); }
// Mutable access to the thunk's raw byte counter so readers can add to it.
uint64_t &ReadBase::ReadCount(ReadCompressed &thunk) {
return thunk.raw_amount_;
}
namespace {
ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size, bool require_compressed);
// Completed file that other classes can thunk to.
// Every Read reports 0 bytes, i.e. end of input.
class Complete : public ReadBase {
public:
std::size_t Read(void *, std::size_t, ReadCompressed &) {
return 0;
}
};
// Pass-through reader for plain data: reads straight from the file
// descriptor and adds the bytes obtained to the thunk's raw counter.
class Uncompressed : public ReadBase {
public:
// Stores fd in a scoped_fd member.
explicit Uncompressed(int fd) : fd_(fd) {}
// Performs one PartialRead call and returns the number of bytes it reported.
std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
std::size_t got = PartialRead(fd_.get(), to, amount);
ReadCount(thunk) += got;
return got;
}
private:
scoped_fd fd_;
};
// Reader for plain data whose first bytes were already consumed (for magic
// detection). It serves those buffered bytes first and then swaps itself for
// a plain Uncompressed reader on the same descriptor.
class UncompressedWithHeader : public ReadBase {
public:
// Copies already_size bytes of already_data into a private malloc'd buffer.
// already_size must be non-zero (asserted). Throws std::bad_alloc if malloc fails.
UncompressedWithHeader(int fd, const void *already_data, std::size_t already_size) : fd_(fd) {
assert(already_size);
buf_.reset(malloc(already_size));
if (!buf_.get()) throw std::bad_alloc();
memcpy(buf_.get(), already_data, already_size);
remain_ = static_cast<uint8_t*>(buf_.get());
end_ = remain_ + already_size;
}
// Hands out up to `amount` buffered bytes. The raw counter is not touched here.
std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
assert(buf_.get());
assert(remain_ != end_);
std::size_t sending = std::min<std::size_t>(amount, end_ - remain_);
memcpy(to, remain_, sending);
remain_ += sending;
if (remain_ == end_) {
// Buffer exhausted: hand the descriptor to a plain reader. ReplaceThis resets
// the thunk's reader, which presumably destroys *this, so only the local
// `sending` is used after this call.
ReplaceThis(new Uncompressed(fd_.release()), thunk);
}
return sending;
}
private:
scoped_malloc buf_;
// [remain_, end_) is the part of buf_ not yet handed out.
uint8_t *remain_;
uint8_t *end_;
scoped_fd fd_;
};
// Size in bytes of the compressed-input staging buffer.
static const std::size_t kInputBuffer = 16384;
// Generic decompressing reader. Compression is a backend (GZip, BZip, XZip)
// exposing SetInput, SetOutput, Process and Stream().
template <class Compression> class StreamCompressed : public ReadBase {
public:
// Takes the descriptor, allocates the input buffer, copies the already-read
// header bytes into it and passes them to the backend as its first input.
// NOTE(review): no check that already_size <= kInputBuffer is visible here.
StreamCompressed(int fd, const void *already_data, std::size_t already_size)
: file_(fd),
in_buffer_(MallocOrThrow(kInputBuffer)),
back_(memcpy(in_buffer_.get(), already_data, already_size), already_size) {}
// Decompress into `to`, looping until at least one output byte is produced
// or the backend reports the end of its stream.
std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
if (amount == 0) return 0;
back_.SetOutput(to, amount);
do {
// Refill the input buffer only when the backend has consumed all of it.
if (!back_.Process_guard_placeholder_never_used) {}
} while (false);
return 0;
}
private:
scoped_fd file_;
scoped_malloc in_buffer_;
Compression back_;
};
#ifdef HAVE_ZLIB
// zlib backend for StreamCompressed: wraps a z_stream used for inflation.
class GZip {
public:
// Registers the initial input and initializes inflation.
// Throws GZException if inflateInit2 does not return Z_OK.
GZip(const void *base, std::size_t amount) {
SetInput(base, amount);
stream_.zalloc = Z_NULL;
stream_.zfree = Z_NULL;
stream_.opaque = Z_NULL;
stream_.msg = NULL;
// 32 for zlib and gzip decoding with automatic header detection.
// 15 for maximum window size.
UTIL_THROW_IF(Z_OK != inflateInit2(&stream_, 32 + 15), GZException, "Failed to initialize zlib.");
}
// Aborts the process if zlib fails to shut down cleanly.
~GZip() {
if (Z_OK != inflateEnd(&stream_)) {
std::cerr << "zlib could not close properly." << std::endl;
abort();
}
}
// Point the stream at the output buffer; avail_out is capped at the largest uInt.
void SetOutput(void *to, std::size_t amount) {
stream_.next_out = static_cast<Bytef*>(to);
stream_.avail_out = std::min<std::size_t>(std::numeric_limits<uInt>::max(), amount);
}
// Point the stream at new input; amount must be below the largest uInt (asserted).
void SetInput(const void *base, std::size_t amount) {
assert(amount < static_cast<std::size_t>(std::numeric_limits<uInt>::max()));
stream_.next_in = const_cast<Bytef*>(static_cast<const Bytef*>(base));
stream_.avail_in = amount;
}
const z_stream &Stream() const { return stream_; }
// Run one inflate step with flush argument 0. Returns true on Z_OK and false
// on Z_STREAM_END; Z_ERRNO throws ErrnoException and any other code throws
// GZException.
bool Process() {
int result = inflate(&stream_, 0);
switch (result) {
case Z_OK:
return true;
case Z_STREAM_END:
return false;
case Z_ERRNO:
UTIL_THROW(ErrnoException, "zlib error");
default:
UTIL_THROW(GZException, "zlib encountered " << (stream_.msg ? stream_.msg : "an error ") << " code " << result);
}
}
private:
z_stream stream_;
};
#endif // HAVE_ZLIB
#ifdef HAVE_BZLIB
// bzip2 backend for StreamCompressed: wraps a bz_stream used for decompression.
class BZip {
public:
// Zeroes the stream, registers the initial input and initializes
// decompression; failures are reported through HandleError.
BZip(const void *base, std::size_t amount) {
memset(&stream_, 0, sizeof(stream_));
SetInput(base, amount);
HandleError(BZ2_bzDecompressInit(&stream_, 0, 0));
}
// A failure while shutting down is printed to stderr and aborts the process,
// so no exception leaves the destructor.
~BZip() {
try {
HandleError(BZ2_bzDecompressEnd(&stream_));
} catch (const std::exception &e) {
std::cerr << e.what() << std::endl;
abort();
}
}
// Run one decompression step. Returns false on BZ_STREAM_END and true on
// BZ_OK; every other code throws via HandleError.
bool Process() {
int ret = BZ2_bzDecompress(&stream_);
if (ret == BZ_STREAM_END) return false;
HandleError(ret);
return true;
}
// Point the stream at the output buffer; avail_out is capped at the largest unsigned int.
void SetOutput(void *base, std::size_t amount) {
stream_.next_out = static_cast<char*>(base);
stream_.avail_out = std::min<std::size_t>(std::numeric_limits<unsigned int>::max(), amount);
}
// Point the stream at new input. The const_cast is needed because next_in is a char *.
void SetInput(const void *base, std::size_t amount) {
stream_.next_in = const_cast<char*>(static_cast<const char*>(base));
stream_.avail_in = amount;
}
const bz_stream &Stream() const { return stream_; }
private:
// Translate a bzip2 return code: BZ_OK returns normally, BZ_MEM_ERROR throws
// std::bad_alloc, every other code throws BZException.
void HandleError(int value) {
switch(value) {
case BZ_OK:
return;
case BZ_CONFIG_ERROR:
UTIL_THROW(BZException, "bzip2 seems to be miscompiled.");
case BZ_PARAM_ERROR:
UTIL_THROW(BZException, "bzip2 Parameter error");
case BZ_DATA_ERROR:
UTIL_THROW(BZException, "bzip2 detected a corrupt file");
case BZ_DATA_ERROR_MAGIC:
UTIL_THROW(BZException, "bzip2 detected bad magic bytes. Perhaps this was not a bzip2 file after all?");
case BZ_MEM_ERROR:
throw std::bad_alloc();
default:
UTIL_THROW(BZException, "Unknown bzip2 error code " << value);
}
}
bz_stream stream_;
};
#endif // HAVE_BZLIB
#ifdef HAVE_XZLIB
// liblzma backend for StreamCompressed: wraps an lzma_stream decoder.
class XZip {
public:
// Zeroes the stream, registers the initial input and creates a stream decoder
// with memory limit UINT64_MAX and flags 0; failures go through HandleError.
XZip(const void *base, std::size_t amount)
: stream_(), action_(LZMA_RUN) {
memset(&stream_, 0, sizeof(stream_));
SetInput(base, amount);
HandleError(lzma_stream_decoder(&stream_, UINT64_MAX, 0));
}
~XZip() {
lzma_end(&stream_);
}
// Point the stream at the output buffer.
void SetOutput(void *base, std::size_t amount) {
stream_.next_out = static_cast<uint8_t*>(base);
stream_.avail_out = amount;
}
// Point the stream at new input. An empty input switches the action to
// LZMA_FINISH for all later Process calls.
void SetInput(const void *base, std::size_t amount) {
stream_.next_in = static_cast<const uint8_t*>(base);
stream_.avail_in = amount;
if (!amount) action_ = LZMA_FINISH;
}
const lzma_stream &Stream() const { return stream_; }
// Run one decoding step. Returns false on LZMA_STREAM_END and true on
// LZMA_OK; every other status throws via HandleError.
bool Process() {
lzma_ret status = lzma_code(&stream_, action_);
if (status == LZMA_STREAM_END) return false;
HandleError(status);
return true;
}
private:
// Translate a liblzma status: LZMA_OK returns normally, LZMA_MEM_ERROR throws
// std::bad_alloc, every other status throws XZException.
void HandleError(lzma_ret value) {
switch (value) {
case LZMA_OK:
return;
case LZMA_MEM_ERROR:
throw std::bad_alloc();
case LZMA_FORMAT_ERROR:
UTIL_THROW(XZException, "xzlib says file format not recognized");
case LZMA_OPTIONS_ERROR:
UTIL_THROW(XZException, "xzlib says unsupported compression options");
case LZMA_DATA_ERROR:
UTIL_THROW(XZException, "xzlib says this file is corrupt");
case LZMA_BUF_ERROR:
UTIL_THROW(XZException, "xzlib says unexpected end of input");
default:
UTIL_THROW(XZException, "unrecognized xzlib error " << value);
}
}
lzma_stream stream_;
// Action passed to lzma_code: LZMA_RUN until an empty input is seen.
lzma_action action_;
};
#endif // HAVE_XZLIB
// Reader backed by a std::istream. It holds only a reference, so the stream
// must outlive the reader. No decompression is performed.
class IStreamReader : public ReadBase {
public:
explicit IStreamReader(std::istream &stream) : stream_(stream) {}
// Attempts to read exactly `amount` bytes. On a short read caused by EOF the
// count actually extracted is returned; a failure that is not EOF throws
// ErrnoException. The returned count is also added to the thunk's raw counter.
std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
if (!stream_.read(static_cast<char*>(to), amount)) {
UTIL_THROW_IF(!stream_.eof(), ErrnoException, "istream error");
amount = stream_.gcount();
}
ReadCount(thunk) += amount;
return amount;
}
private:
std::istream &stream_;
};
// Compression formats that can be recognized from the first bytes of a file.
enum MagicResult {
  UTIL_UNKNOWN, UTIL_GZIP, UTIL_BZIP, UTIL_XZIP
};

// Classify a buffer by its leading magic bytes.
//   from_void: start of the data to inspect.
//   length:    number of readable bytes at from_void.
// A signature only matches when `length` covers it completely; anything
// unrecognized yields UTIL_UNKNOWN. Checks run in the order gzip, bzip2, xz.
MagicResult DetectMagic(const void *from_void, std::size_t length) {
  static const uint8_t kGZMagic[2] = { 0x1f, 0x8b };
  static const uint8_t kBZMagic[3] = { 'B', 'Z', 'h' };
  static const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };

  if (length >= sizeof(kGZMagic) && memcmp(from_void, kGZMagic, sizeof(kGZMagic)) == 0)
    return UTIL_GZIP;
  if (length >= sizeof(kBZMagic) && memcmp(from_void, kBZMagic, sizeof(kBZMagic)) == 0)
    return UTIL_BZIP;
  if (length >= sizeof(kXZMagic) && memcmp(from_void, kXZMagic, sizeof(kXZMagic)) == 0)
    return UTIL_XZIP;
  return UTIL_UNKNOWN;
}
// Choose and construct the reader for a descriptor by inspecting its magic bytes.
//   fd:                 descriptor to read; owned by this function until it is
//                       released into the returned reader.
//   raw_amount:         raw byte counter, increased by any bytes read here.
//   already_data/size:  bytes previously read from fd that precede its
//                       current position.
//   require_compressed: when true, data with no recognized magic is an error.
// Returns a heap-allocated reader owned by the caller. Throws
// CompressedException for a recognized format whose support was not compiled
// in, or for unrecognized data when require_compressed is set.
ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, const std::size_t already_size, bool require_compressed) {
// Holds fd so it is closed if anything below throws.
scoped_fd hold(fd);
std::string header(reinterpret_cast<const char*>(already_data), already_size);
// Top the header up to kMagicSize bytes, fewer if the file ends first.
if (header.size() < ReadCompressed::kMagicSize) {
std::size_t original = header.size();
header.resize(ReadCompressed::kMagicSize);
std::size_t got = ReadOrEOF(fd, &header[original], ReadCompressed::kMagicSize - original);
raw_amount += got;
header.resize(original + got);
}
// Nothing buffered and nothing left to read: the input is finished.
if (header.empty()) {
return new Complete();
}
switch (DetectMagic(&header[0], header.size())) {
case UTIL_GZIP:
#ifdef HAVE_ZLIB
return new StreamCompressed<GZip>(hold.release(), header.data(), header.size());
#else
UTIL_THROW(CompressedException, "This looks like a gzip file but gzip support was not compiled in.");
#endif
case UTIL_BZIP:
#ifdef HAVE_BZLIB
return new StreamCompressed<BZip>(hold.release(), &header[0], header.size());
#else
UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZh), but bzip support was not compiled in.");
#endif
case UTIL_XZIP:
#ifdef HAVE_XZLIB
return new StreamCompressed<XZip>(hold.release(), header.data(), header.size());
#else
UTIL_THROW(CompressedException, "This looks like an xz file, but xz support was not compiled in.");
#endif
default:
// No magic recognized: treat as plain data, replaying the header bytes first.
UTIL_THROW_IF(require_compressed, CompressedException, "Uncompressed data detected after a compresssed file. This could be supported but usually indicates an error.");
return new UncompressedWithHeader(hold.release(), header.data(), header.size());
}
}
} // namespace
// True when the first kMagicSize bytes at from_void carry a gzip, bzip2 or xz
// signature. The caller must supply at least kMagicSize readable bytes.
bool ReadCompressed::DetectCompressedMagic(const void *from_void) {
return DetectMagic(from_void, kMagicSize) != UTIL_UNKNOWN;
}
// Construct a reader over a file descriptor; ownership of fd is taken by Reset.
// raw_amount_ is initialized explicitly so RawAmount() is well-defined on
// every construction path.
ReadCompressed::ReadCompressed(int fd) : raw_amount_(0) {
  Reset(fd);
}

// Construct a reader over an istream (no decompression on this path).
// The counter starts at zero here because the istream readers only ever add to it.
ReadCompressed::ReadCompressed(std::istream &in) : raw_amount_(0) {
  Reset(in);
}

// Construct an unattached reader; Reset must be called before reading.
// The counter is zeroed so RawAmount() does not return an indeterminate value.
ReadCompressed::ReadCompressed() : raw_amount_(0) {}
// Start reading from a new file descriptor. The raw byte counter restarts at 0.
void ReadCompressed::Reset(int fd) {
raw_amount_ = 0;
// Drop the previous reader before the factory runs, so it is gone even if
// ReadFactory throws.
internal_.reset();
internal_.reset(ReadFactory(fd, raw_amount_, NULL, 0, false));
}
void ReadCompressed::Reset(std::istream &in) {
internal_.reset();
internal_.reset(new IStreamReader(in));
}
// Forward a single read to the active reader and return the byte count it
// reports. Dereferences internal_, so a reader must have been installed via
// a constructor or Reset first.
std::size_t ReadCompressed::Read(void *to, std::size_t amount) {
return internal_->Read(to, amount, *this);
}
// Keep calling Read until `amount` bytes have been stored at to_in or a read
// returns 0. Returns the number of bytes actually stored.
std::size_t ReadCompressed::ReadOrEOF(void *const to_in, std::size_t amount) {
  uint8_t *const begin = reinterpret_cast<uint8_t*>(to_in);
  uint8_t *cursor = begin;
  while (amount != 0) {
    const std::size_t got = Read(cursor, amount);
    // A zero-length read ends the loop early.
    if (got == 0) return cursor - begin;
    cursor += got;
    amount -= got;
  }
  return cursor - begin;
}
} // namespace util
#ifndef UTIL_READ_COMPRESSED_H
#define UTIL_READ_COMPRESSED_H
#include "exception.hh"
#include "scoped.hh"
#include <cstddef>
#include <stdint.h>
namespace util {
// Base class for errors raised by the compressed-file readers.
class CompressedException : public Exception {
public:
CompressedException() throw();
virtual ~CompressedException() throw();
};
// Error specific to gzip/zlib input.
class GZException : public CompressedException {
public:
GZException() throw();
~GZException() throw();
};
// Error specific to bzip2 input.
class BZException : public CompressedException {
public:
BZException() throw();
~BZException() throw();
};
// Error specific to xz input.
class XZException : public CompressedException {
public:
XZException() throw();
~XZException() throw();
};
class ReadCompressed;
// Abstract reader installed inside a ReadCompressed. Concrete readers
// implement Read and may swap themselves out through the protected helpers.
class ReadBase {
  public:
    virtual ~ReadBase() {}

    // Read up to `amount` bytes into `to`; returns the number of bytes
    // produced. `thunk` is the owning ReadCompressed.
    virtual std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) = 0;

  protected:
    // Install `with` as the thunk's reader, replacing the current one.
    static void ReplaceThis(ReadBase *with, ReadCompressed &thunk);

    // Reader currently installed in the thunk. Declared static (like its
    // siblings) because readers call it right after ReplaceThis has destroyed
    // them; invoking a non-static member at that point would be undefined.
    static ReadBase *Current(ReadCompressed &thunk);

    // Mutable reference to the thunk's raw byte counter.
    static uint64_t &ReadCount(ReadCompressed &thunk);
};
// Front end for reading possibly-compressed input. It owns the active
// ReadBase implementation and counts the raw bytes consumed.
class ReadCompressed {
public:
// Number of leading bytes inspected when detecting the format.
static const std::size_t kMagicSize = 6;
// Must have at least kMagicSize bytes.
static bool DetectCompressedMagic(const void *from);
// Takes ownership of fd.
explicit ReadCompressed(int fd);
// Try to avoid using this. Use the fd instead.
// There is no decompression support for istreams.
explicit ReadCompressed(std::istream &in);
// Must call Reset later.
ReadCompressed();
// Takes ownership of fd.
void Reset(int fd);
// Same advice as the constructor.
void Reset(std::istream &in);
// Single read; returns the number of bytes the active reader produced.
std::size_t Read(void *to, std::size_t amount);
// Repeatedly call read to fill a buffer unless EOF is hit.
// Return number of bytes read.
std::size_t ReadOrEOF(void *const to, std::size_t amount);
// Raw byte count accumulated by the readers.
uint64_t RawAmount() const { return raw_amount_; }
private:
// ReadBase's helpers replace internal_ and update raw_amount_.
friend class ReadBase;
scoped_ptr<ReadBase> internal_;
uint64_t raw_amount_;
};
} // namespace util
#endif // UTIL_READ_COMPRESSED_H
#include "read_compressed.hh"
#include "file.hh"
#include "have.hh"
#define BOOST_TEST_MODULE ReadCompressedTest
#include <boost/test/unit_test.hpp>
#include <boost/scoped_ptr.hpp>
#include <fstream>
#include <string>
#include <cstdlib>
#if defined __MINGW32__
#include <ctime>
#include <fcntl.h>
#if !defined mkstemp
// TODO insecure
// Fallback mkstemp for MinGW builds where none is defined: generate a name
// with mktemp, then open it. The name is chosen before the file is opened and
// the open call does not pass O_EXCL.
// Returns the opened descriptor, or -1 when mktemp yields NULL (open's own
// result is returned otherwise).
int mkstemp(char * stemplate)
{
char *filename = mktemp(stemplate);
if (filename == NULL)
return -1;
return open(filename, O_RDWR | O_CREAT, 0600);
}
#endif
#endif // defined
namespace util {
namespace {
// Read exactly `amount` bytes from `reader` into `to_void`, failing the test
// (BOOST_REQUIRE) as soon as any individual read returns 0.
void ReadLoop(ReadCompressed &reader, void *to_void, std::size_t amount) {
  uint8_t *cursor = static_cast<uint8_t*>(to_void);
  for (std::size_t remaining = amount; remaining != 0; ) {
    std::size_t ret = reader.Read(cursor, remaining);
    BOOST_REQUIRE(ret);
    cursor += ret;
    remaining -= ret;
  }
}
// Number of 32-bit values written to each test file.
const uint32_t kSize4 = 100000 / 4;

// Create a temporary file containing the values 0 .. kSize4-1 as native
// uint32_t and return its name. Despite the name, the content is a counter.
std::string WriteRandom() {
  char name[] = "tempXXXXXX";
  scoped_fd original(mkstemp(name));
  // mkstemp signals failure with -1; descriptor 0 is a legal result, so the
  // check must accept it.
  BOOST_REQUIRE(original.get() >= 0);
  for (uint32_t i = 0; i < kSize4; ++i) {
    WriteOrThrow(original.get(), &i, sizeof(uint32_t));
  }
  return name;
}
// Check that `reader` yields exactly the sequence written by WriteRandom
// (0 .. kSize4-1 as uint32_t) and then reports end of input.
void VerifyRead(ReadCompressed &reader) {
for (uint32_t i = 0; i < kSize4; ++i) {
uint32_t got;
ReadLoop(reader, &got, sizeof(uint32_t));
BOOST_CHECK_EQUAL(i, got);
}
// After the payload, a read must return 0.
char ignored;
BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1));
// Test double EOF call.
BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1));
}
// Round-trip test: write the reference file, pipe it through the external
// `compressor` command into a second temp file, then read that file back with
// ReadCompressed and verify the content.
void TestRandom(const char *compressor) {
std::string name(WriteRandom());
char gzname[] = "tempXXXXXX";
// NOTE(review): the result of this mkstemp is not checked before use.
scoped_fd gzipped(mkstemp(gzname));
// Build the shell command: compressor <"name" >"gzname"
std::string command(compressor);
#ifdef __CYGWIN__
command += ".exe";
#endif
command += " <\"";
command += name;
command += "\" >\"";
command += gzname;
command += "\"";
BOOST_REQUIRE_EQUAL(0, system(command.c_str()));
// Both names are unlinked before reading; the already-open descriptor is what
// gets read below.
BOOST_CHECK_EQUAL(0, unlink(name.c_str()));
BOOST_CHECK_EQUAL(0, unlink(gzname));
// The reader takes ownership of the descriptor.
ReadCompressed reader(gzipped.release());
VerifyRead(reader);
}
// Plain data: "cat" copies the file unchanged, exercising the uncompressed path.
BOOST_AUTO_TEST_CASE(Uncompressed) {
TestRandom("cat");
}
#ifdef HAVE_ZLIB
// gzip round trip; only built when zlib support is compiled in.
BOOST_AUTO_TEST_CASE(ReadGZ) {
TestRandom("gzip");
}
#endif // HAVE_ZLIB
#ifdef HAVE_BZLIB
// bzip2 round trip; only built when bzip2 support is compiled in.
BOOST_AUTO_TEST_CASE(ReadBZ) {
TestRandom("bzip2");
}
#endif // HAVE_BZLIB
#ifdef HAVE_XZLIB
// xz round trip; only built when xz support is compiled in.
BOOST_AUTO_TEST_CASE(ReadXZ) {
TestRandom("xz");
}
#endif
#ifdef HAVE_ZLIB
// Placeholder: this test case has an empty body and checks nothing.
BOOST_AUTO_TEST_CASE(AppendGZ) {
}
#endif
// Reads the reference file through the std::istream path, using the default
// constructor followed by Reset(std::istream &).
BOOST_AUTO_TEST_CASE(IStream) {
std::string name(WriteRandom());
std::fstream stream(name.c_str(), std::ios::in);
// The file name is unlinked once the stream has been opened on it.
BOOST_CHECK_EQUAL(0, unlink(name.c_str()));
ReadCompressed reader;
reader.Reset(stream);
VerifyRead(reader);
}
} // namespace
} // namespace util
#include "scoped.hh"
#include <cstdlib>
#if !defined(_WIN32) && !defined(_WIN64)
#include <sys/mman.h>
#endif
namespace util {
// TODO: if we're really under memory pressure, don't allocate memory to
// display the error.
// Appends "for <requested> bytes " to the exception's message.
MallocException::MallocException(std::size_t requested) throw() {
*this << "for " << requested << " bytes ";
}
// Out-of-line empty destructor.
MallocException::~MallocException() throw() {}
namespace {
// Validate the result of an allocation call.
//   addr:      pointer returned by the allocator.
//   requested: number of bytes that were asked for.
//   func_name: allocator name, included in the error message.
// Throws MallocException when addr is NULL for a non-zero request; a NULL
// result for a zero-byte request is passed through. Returns addr otherwise.
void *InspectAddr(void *addr, std::size_t requested, const char *func_name) {
UTIL_THROW_IF_ARG(!addr && requested, MallocException, (requested), "in " << func_name);
return addr;
}
} // namespace
// std::malloc wrapper that throws MallocException instead of returning NULL
// for a non-zero request.
void *MallocOrThrow(std::size_t requested) {
return InspectAddr(std::malloc(requested), requested, "malloc");
}
// std::calloc wrapper (requested elements of 1 byte each) with the same
// failure behavior as MallocOrThrow.
void *CallocOrThrow(std::size_t requested) {
return InspectAddr(std::calloc(requested, 1), requested, "calloc");
}
// Resize the owned allocation to `requested` bytes via std::realloc.
// If realloc returns NULL for a non-zero request, InspectAddr throws before
// p_ is assigned, so the original pointer stays owned by this object.
void scoped_malloc::call_realloc(std::size_t requested) {
p_ = InspectAddr(std::realloc(p_, requested), requested, "realloc");
}
// Ask the kernel to back [addr, addr + size) with huge pages.
// The madvise call is only compiled when MADV_HUGEPAGE is defined to a
// non-zero value; otherwise this function does nothing. The return value of
// madvise is ignored, so this is a best-effort hint.
void AdviseHugePages(const void *addr, std::size_t size) {
#if MADV_HUGEPAGE
madvise((void*)addr, size, MADV_HUGEPAGE);
#endif
}
} // namespace util
#ifndef UTIL_SCOPED_H
#define UTIL_SCOPED_H
/* Other scoped objects in the style of scoped_ptr. */
#include "exception.hh"
#include <cstddef>
#include <cstdlib>
namespace util {
// Thrown by the allocation wrappers when an allocation fails; the constructor
// records the number of bytes that were requested.
class MallocException : public ErrnoException {
public:
explicit MallocException(std::size_t requested) throw();
~MallocException() throw();
};
// Allocation wrappers that throw MallocException instead of returning NULL
// for a non-zero request.
void *MallocOrThrow(std::size_t requested);
void *CallocOrThrow(std::size_t requested);
/* Shared core of the scoped holders. Declaring operator* on a void * makes
 * the compiler complain, so the dereferencing wrapper `scoped` is specialized
 * for void and everything that does not need a reference lives here.
 *
 * T is the pointee type; Closer must provide a static Close(T *) which is
 * called on the held pointer (including NULL) at destruction and on reset.
 */
template <class T, class Closer> class scoped_base {
  public:
    // Take ownership of p (may be NULL).
    explicit scoped_base(T *p = NULL) : p_(p) {}

    // Hand the held pointer to the Closer.
    ~scoped_base() { Closer::Close(p_); }

#if __cplusplus >= 201103L
    // Movable: steals the pointer and leaves the source empty.
    scoped_base(scoped_base &&from) noexcept : p_(from.p_) {
      from.p_ = nullptr;
    }

    // Not copyable.
    scoped_base(const scoped_base &) = delete;
    scoped_base &operator=(const scoped_base &) = delete;
#endif

    // Replace the held pointer with p. The previous pointer is closed when
    // the temporary holder goes out of scope, i.e. after p_ has been updated.
    void reset(T *p = NULL) {
      scoped_base previous(p_);
      p_ = p;
    }

    // Observers; ownership is retained.
    T *get() { return p_; }
    const T *get() const { return p_; }
    T *operator->() { return p_; }
    const T *operator->() const { return p_; }

    // Give up ownership: returns the pointer and leaves this object empty.
    T *release() {
      T *const released = p_;
      p_ = NULL;
      return released;
    }

  protected:
    T *p_;

#if __cplusplus < 201103L
  private:
    // Not copyable (declared but never defined).
    scoped_base(const scoped_base &);
    scoped_base &operator=(const scoped_base &);
#endif
};
// scoped_base plus dereferencing. operator* does not check for NULL.
template <class T, class Closer> class scoped : public scoped_base<T, Closer> {
public:
explicit scoped(T *p = NULL) : scoped_base<T, Closer>(p) {}
T &operator*() { return *scoped_base<T, Closer>::p_; }
const T&operator*() const { return *scoped_base<T, Closer>::p_; }
};
// Specialization for void pointers: same as scoped_base, without operator*.
template <class Closer> class scoped<void, Closer> : public scoped_base<void, Closer> {
public:
explicit scoped(void *p = NULL) : scoped_base<void, Closer>(p) {}
};
/* Closer for c functions like std::free and cmph cleanup functions */
// Adapts a plain function void clean(T *) to the Closer interface.
template <class T, void (*clean)(T*)> struct scoped_c_forward {
static void Close(T *p) { clean(p); }
};
// Call a C function to delete stuff
// The cleanup function is fixed at compile time as a template argument.
template <class T, void (*clean)(T*)> class scoped_c : public scoped<T, scoped_c_forward<T, clean> > {
public:
explicit scoped_c(T *p = NULL) : scoped<T, scoped_c_forward<T, clean> >(p) {}
};
// Owns a block obtained from malloc/calloc/realloc and releases it with std::free.
class scoped_malloc : public scoped_c<void, std::free> {
public:
// Adopt an existing allocation (or NULL).
explicit scoped_malloc(void *p = NULL) : scoped_c<void, std::free>(p) {}
// Allocate `size` bytes with MallocOrThrow and own the result.
explicit scoped_malloc(std::size_t size) : scoped_c<void, std::free>(MallocOrThrow(size)) {}
// Resize the owned block to `to` bytes; defined out of line.
void call_realloc(std::size_t to);
};
/* scoped_array using delete[] */
// Closer that releases with delete [].
struct scoped_delete_array_forward {
template <class T> static void Close(T *p) { delete [] p; }
};
// Hat tip to boost.
// Owns an array allocated with new[] and adds unchecked indexing.
template <class T> class scoped_array : public scoped<T, scoped_delete_array_forward> {
public:
explicit scoped_array(T *p = NULL) : scoped<T, scoped_delete_array_forward>(p) {}
T &operator[](std::size_t idx) { return scoped<T, scoped_delete_array_forward>::p_[idx]; }
const T &operator[](std::size_t idx) const { return scoped<T, scoped_delete_array_forward>::p_[idx]; }
};
/* scoped_ptr using delete. If only there were a template typedef. */
// Closer that releases with delete.
struct scoped_delete_forward {
template <class T> static void Close(T *p) { delete p; }
};
// Owns a single object allocated with new.
template <class T> class scoped_ptr : public scoped<T, scoped_delete_forward> {
public:
explicit scoped_ptr(T *p = NULL) : scoped<T, scoped_delete_forward>(p) {}
};
// Hint that [addr, addr + size) should use huge pages; defined in the .cc file.
void AdviseHugePages(const void *addr, std::size_t size);
} // namespace util
#endif // UTIL_SCOPED_H
#ifndef UTIL_SIZED_ITERATOR_H
#define UTIL_SIZED_ITERATOR_H
#include "pool.hh"
#include "proxy_iterator.hh"
#include <algorithm>
#include <functional>
#include <string>
#include <stdint.h>
#include <cstring>
#include <stdlib.h>
namespace util {
// Position within an array of fixed-size records whose size is only known at
// run time. It stores a byte pointer and the record size; advancing by n
// moves the pointer n * size bytes.
class SizedInnerIterator {
  public:
    // Leaves both members uninitialized; assign before use.
    SizedInnerIterator() {}

    // ptr: address of a record; size: record size in bytes.
    SizedInnerIterator(void *ptr, std::size_t size) : ptr_(static_cast<uint8_t*>(ptr)), size_(size) {}

    // Comparisons look at the address only.
    bool operator==(const SizedInnerIterator &other) const {
      return ptr_ == other.ptr_;
    }
    bool operator<(const SizedInnerIterator &other) const {
      return ptr_ < other.ptr_;
    }

    // Move by `amount` records (may be negative). The record size is converted
    // to a signed type first so a negative step is a genuine negative byte
    // offset rather than an unsigned wraparound.
    SizedInnerIterator &operator+=(std::ptrdiff_t amount) {
      ptr_ += amount * static_cast<std::ptrdiff_t>(size_);
      return *this;
    }

    // Distance in records from `other` to this position. The division is done
    // in signed arithmetic; dividing by the unsigned size_ directly would turn
    // a negative byte distance into a huge positive record count.
    std::ptrdiff_t operator-(const SizedInnerIterator &other) const {
      return (ptr_ - other.ptr_) / static_cast<std::ptrdiff_t>(size_);
    }

    // Address of the current record.
    const void *Data() const { return ptr_; }
    void *Data() { return ptr_; }

    // Record size in bytes.
    std::size_t EntrySize() const { return size_; }

    friend void swap(SizedInnerIterator &first, SizedInnerIterator &second);

  private:
    uint8_t *ptr_;
    std::size_t size_;
};
// Exchange two inner iterators, i.e. their positions and record sizes; the
// records they point at are not touched.
inline void swap(SizedInnerIterator &first, SizedInnerIterator &second) {
using std::swap;
swap(first.ptr_, second.ptr_);
swap(first.size_, second.size_);
}
// Owning copy of one record, stored in memory obtained from a FreePool.
// Serves as the iterator's value_type, so algorithms can hold a temporary.
class ValueBlock {
public:
// Allocate a slot from `pool` and copy pool.ElementSize() bytes from `from`.
explicit ValueBlock(const void *from, FreePool &pool)
: ptr_(std::memcpy(pool.Allocate(), from, pool.ElementSize())),
pool_(pool) {}
// Copy construction allocates a fresh slot from the same pool.
ValueBlock(const ValueBlock &from)
: ptr_(std::memcpy(from.pool_.Allocate(), from.ptr_, from.pool_.ElementSize())),
pool_(from.pool_) {}
// Assignment copies the record bytes into the existing slot; the pool
// reference is not rebound.
ValueBlock &operator=(const ValueBlock &from) {
std::memcpy(ptr_, from.ptr_, pool_.ElementSize());
return *this;
}
// Return the slot to the pool.
~ValueBlock() { pool_.Free(ptr_); }
const void *Data() const { return ptr_; }
void *Data() { return ptr_; }
private:
void *ptr_;
FreePool &pool_;
};
// Reference-like handle to one record inside the array being processed.
// Assigning to a SizedProxy overwrites the bytes of the record it refers to.
class SizedProxy {
public:
// Leaves members default-initialized; assign before use.
SizedProxy() {}
// Refer to the record at `ptr`; the record size comes from the pool's element size.
SizedProxy(void *ptr, FreePool &pool) : inner_(ptr, pool.ElementSize()), pool_(&pool) {}
// Convert to an owning copy of the record, allocated from the pool.
operator ValueBlock() const {
return ValueBlock(inner_.Data(), *pool_);
}
// Copy the other record's bytes over this record (value semantics).
SizedProxy &operator=(const SizedProxy &from) {
memcpy(inner_.Data(), from.inner_.Data(), inner_.EntrySize());
return *this;
}
// Copy a saved record's bytes over this record.
SizedProxy &operator=(const ValueBlock &from) {
memcpy(inner_.Data(), from.Data(), inner_.EntrySize());
return *this;
}
const void *Data() const { return inner_.Data(); }
void *Data() { return inner_.Data(); }
friend void swap(SizedProxy first, SizedProxy second);
private:
// ProxyIterator needs the private typedefs and Inner() below.
friend class util::ProxyIterator<SizedProxy>;
typedef ValueBlock value_type;
typedef SizedInnerIterator InnerIterator;
InnerIterator &Inner() { return inner_; }
const InnerIterator &Inner() const { return inner_; }
InnerIterator inner_;
FreePool *pool_;
};
// Swap the bytes of the two records the proxies refer to. The proxies are
// taken by value, so only the referenced records change. The byte count is
// taken from `first`.
inline void swap(SizedProxy first, SizedProxy second) {
std::swap_ranges(
static_cast<char*>(first.inner_.Data()),
static_cast<char*>(first.inner_.Data()) + first.inner_.EntrySize(),
static_cast<char*>(second.inner_.Data()));
}
// Random-access iterator over runtime-sized records.
typedef ProxyIterator<SizedProxy> SizedIterator;
// Useful wrapper for a comparison function i.e. sort.
// Delegate compares two records given as const void *; this adapter lets it
// be called with any mix of Proxy and ValueBlock arguments.
//
// The class used to derive from std::binary_function, which was deprecated in
// C++11 and removed in C++17. The typedefs that base provided are declared
// directly so code relying on them keeps working.
template <class Delegate, class Proxy = SizedProxy> class SizedCompare {
  public:
    typedef const Proxy &first_argument_type;
    typedef const Proxy &second_argument_type;
    typedef bool result_type;

    explicit SizedCompare(const Delegate &delegate = Delegate()) : delegate_(delegate) {}

    // Each overload forwards the two record addresses to the delegate.
    bool operator()(const Proxy &first, const Proxy &second) const {
      return delegate_(first.Data(), second.Data());
    }
    bool operator()(const Proxy &first, const ValueBlock &second) const {
      return delegate_(first.Data(), second.Data());
    }
    bool operator()(const ValueBlock &first, const Proxy &second) const {
      return delegate_(first.Data(), second.Data());
    }
    bool operator()(const ValueBlock &first, const ValueBlock &second) const {
      return delegate_(first.Data(), second.Data());
    }

    const Delegate &GetDelegate() const { return delegate_; }

  private:
    const Delegate delegate_;
};
// Opaque value type of exactly Size bytes, used so std::sort can move
// fixed-size records around as ordinary objects.
template <unsigned Size> class JustPOD {
  unsigned char data[Size];
};

// Adapts a const void * comparator so it can compare JustPOD<Size> objects.
// The previous private base std::binary_function (removed in C++17) exposed
// nothing to users because the inheritance was private, so it is dropped.
template <class Delegate, unsigned Size> class JustPODDelegate {
  public:
    explicit JustPODDelegate(const Delegate &compare) : delegate_(compare) {}

    // Pass the addresses of the two records to the wrapped comparator.
    bool operator()(const JustPOD<Size> &first, const JustPOD<Size> &second) const {
      return delegate_(&first, &second);
    }

  private:
    Delegate delegate_;
};
// Expands to one switch case that sorts [start, end) as an array of
// JustPOD<Size> using std::sort. It relies on start, end, compare and the
// Compare type being in scope at the expansion site.
#define UTIL_SORT_SPECIALIZE(Size) \
case Size: \
std::sort(static_cast<JustPOD<Size>*>(start), static_cast<JustPOD<Size>*>(end), JustPODDelegate<Compare, Size>(compare)); \
break;
// Sort an array of records whose size is only known at run time.
//   start, end:   bounds of the array, [start, end).
//   element_size: size of each record in bytes.
//   compare:      callable taking two const void * record addresses and
//                 returning whether the first orders before the second.
// Listed sizes dispatch to a statically sized std::sort; any other size goes
// through SizedIterator with a FreePool for temporaries.
template <class Compare> void SizedSort(void *start, void *end, std::size_t element_size, const Compare &compare) {
switch (element_size) {
// Benchmarking sort found it's about 2x faster with an explicitly sized type. So here goes :-(.
UTIL_SORT_SPECIALIZE(4);
UTIL_SORT_SPECIALIZE(8);
UTIL_SORT_SPECIALIZE(12);
UTIL_SORT_SPECIALIZE(16);
UTIL_SORT_SPECIALIZE(17); // This is used by interpolation.
UTIL_SORT_SPECIALIZE(20);
UTIL_SORT_SPECIALIZE(24);
UTIL_SORT_SPECIALIZE(28);
UTIL_SORT_SPECIALIZE(32);
default:
// Recent g++ versions create a temporary value_type then compare with it.
// Problem is that value_type in this case needs to be a runtime-sized array.
// Previously I had std::string serve this role. However, there were a lot
// of string new and delete calls.
//
// The temporary value is on the stack, so there will typically only be one
// at a time. But we can't guarantee that. So here is a pool optimized for
// the case where one element is allocated at any given time. It can
// allocate more, should the underlying C++ sort code change.
{
FreePool pool(element_size);
// TODO is this necessary anymore?
// The preprocessor picks the algorithm name; the argument list that follows
// the #endif is shared by both choices.
#if defined(_WIN32) || defined(_WIN64)
std::stable_sort
#else
std::sort
#endif
(SizedIterator(SizedProxy(start, pool)), SizedIterator(SizedProxy(end, pool)), SizedCompare<Compare>(compare));
}
}
}
} // namespace util
// Dirty hack because g++ 4.6 at least wants to do a bunch of copy operations.
namespace std {
// iter_swap overload for SizedIterator: swaps the bytes of the two referenced
// records via util::swap on the dereferenced proxies.
// NOTE(review): this adds an overload inside namespace std, which the
// standard does not generally permit for user code; confirm it is still needed.
inline void iter_swap(util::SizedIterator first, util::SizedIterator second) {
util::swap(*first, *second);
}
} // namespace std
#endif // UTIL_SIZED_ITERATOR_H
#include "sized_iterator.hh"
#define BOOST_TEST_MODULE SizedIteratorTest
#include <boost/test/unit_test.hpp>
namespace util { namespace {
// Orders two one-byte records by interpreting each as a char.
struct CompareChar {
  bool operator()(const void *first, const void *second) const {
    const char lhs = *static_cast<const char*>(first);
    const char rhs = *static_cast<const char*>(second);
    return lhs < rhs;
  }
};
// SizedSort with a run-time element size of one byte must leave three
// out-of-order records in ascending order.
BOOST_AUTO_TEST_CASE(sort) {
  char items[3] = {1, 2, 0};
  char *const first = items;
  char *const last = first + 3;
  SizedSort(first, last, 1, CompareChar());
  BOOST_CHECK_EQUAL(0, items[0]);
  BOOST_CHECK_EQUAL(1, items[1]);
  BOOST_CHECK_EQUAL(2, items[2]);
}
}} // namespace anonymous util
#ifndef UTIL_SORTED_UNIFORM_H
#define UTIL_SORTED_UNIFORM_H
#include <algorithm>
#include <cstddef>
#include <cassert>
#include <stdint.h>
namespace util {
// Accessor for ranges whose elements are themselves the search key: the key
// of the element at `in` is simply a copy of `*in`.
template <class T> class IdentityAccessor {
  public:
    typedef T Key;

    T operator()(const T *in) const {
      const T &element = *in;
      return element;
    }
};
// Pivot guess for keys that may need all 64 bits: scales `width` by the
// fraction off / range using single-precision floating point, then clamps the
// result into [0, width - 1].
struct Pivot64 {
  static inline std::size_t Calc(uint64_t off, uint64_t range, std::size_t width) {
    const std::size_t guess = static_cast<std::size_t>(static_cast<float>(off) / static_cast<float>(range) * static_cast<float>(width));
    // Floating point rounding can land exactly on width; pull it back inside.
    if (guess < width) {
      return guess;
    }
    return width - 1;
  }
};
// Integer-only pivot guess: off * width / (range + 1).
// Use when off * width is <2^64. This is guaranteed when each of them is
// actually a 32-bit value.
struct Pivot32 {
  static inline std::size_t Calc(uint64_t off, uint64_t range, uint64_t width) {
    const uint64_t scaled = off * width;
    const uint64_t divisor = range + 1;
    return static_cast<std::size_t>(scaled / divisor);
  }
};
// Maps a key width in bytes to the pivot policy to use for it.
// Usage: PivotSelect<sizeof(DataType)>::T
// Only the widths specialized below are defined: 8 bytes selects Pivot64,
// 4 and 2 bytes select Pivot32. Any other width fails to compile because the
// primary template is declared but never defined.
template <unsigned> struct PivotSelect;
template <> struct PivotSelect<8> { typedef Pivot64 T; };
template <> struct PivotSelect<4> { typedef Pivot32 T; };
template <> struct PivotSelect<2> { typedef Pivot32 T; };
/* Binary search over the sorted range [begin, end).
 *
 * accessor maps an iterator to the key stored at that position.
 * Returns true and sets out to the matching position when key is present;
 * returns false and leaves out untouched otherwise.
 */
template <class Iterator, class Accessor> bool BinaryFind(
    const Accessor &accessor,
    Iterator begin,
    Iterator end,
    const typename Accessor::Key key, Iterator &out) {
  for (;;) {
    if (!(end > begin)) {
      return false;
    }
    Iterator middle(begin + (end - begin) / 2);
    typename Accessor::Key probe(accessor(middle));
    if (probe < key) {
      // Everything up to and including middle is too small.
      begin = middle + 1;
      continue;
    }
    if (probe > key) {
      // middle and everything after it is too large.
      end = middle;
      continue;
    }
    out = middle;
    return true;
  }
}
// Interpolation search of the range [before_it + 1, after_it - 1] for key.
// Pivot::Calc guesses where key should sit, assuming keys are spread evenly
// between before_v and after_v.
// Preconditions:
//   before_v <= key <= after_v
//   before_v <= all values in the range [before_it + 1, after_it - 1] <= after_v
//   range is sorted.
// Returns true and sets out when key is found; false (out untouched) otherwise.
template <class Iterator, class Accessor, class Pivot> bool BoundedSortedUniformFind(
    const Accessor &accessor,
    Iterator before_it, typename Accessor::Key before_v,
    Iterator after_it, typename Accessor::Key after_v,
    const typename Accessor::Key key, Iterator &out) {
  for (;;) {
    // Stop once no position lies strictly between the two bounds.
    if (!(after_it - before_it > 1)) {
      return false;
    }
    Iterator guess(before_it + (1 + Pivot::Calc(key - before_v, after_v - before_v, after_it - before_it - 1)));
    typename Accessor::Key probe(accessor(guess));
    if (probe < key) {
      // Tighten the lower bound to the probed position.
      before_it = guess;
      before_v = probe;
      continue;
    }
    if (probe > key) {
      // Tighten the upper bound to the probed position.
      after_it = guess;
      after_v = probe;
      continue;
    }
    out = guess;
    return true;
  }
}
// Looks up key in the sorted range [begin, end).
// The two end elements are checked directly; anything strictly between them is
// delegated to BoundedSortedUniformFind with those end elements as bounds.
// Returns true and sets out on a hit; false otherwise.
template <class Iterator, class Accessor, class Pivot> bool SortedUniformFind(const Accessor &accessor, Iterator begin, Iterator end, const typename Accessor::Key key, Iterator &out) {
  if (begin == end) return false;

  typename Accessor::Key below(accessor(begin));
  if (key <= below) {
    if (!(key == below)) return false;
    out = begin;
    return true;
  }

  // Make the range [begin, end].
  --end;
  typename Accessor::Key above(accessor(end));
  if (key >= above) {
    if (!(key == above)) return false;
    out = end;
    return true;
  }

  return BoundedSortedUniformFind<Iterator, Accessor, Pivot>(accessor, begin, below, end, above, key, out);
}
} // namespace util
#endif // UTIL_SORTED_UNIFORM_H
#include "sorted_uniform.hh"
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int.hpp>
#include <boost/random/variate_generator.hpp>
#include <boost/scoped_array.hpp>
#include <boost/unordered_map.hpp>
#define BOOST_TEST_MODULE SortedUniformTest
#include <boost/test/unit_test.hpp>
#include <algorithm>
#include <limits>
#include <vector>
namespace util {
namespace {
// Key/value record used by the tests below; records order by key only.
template <class KeyT, class ValueT> struct Entry {
  typedef KeyT Key;
  typedef ValueT Value;

  Key key;
  Value value;

  Key GetKey() const { return this->key; }

  Value GetValue() const { return this->value; }

  // The value takes no part in the ordering.
  bool operator<(const Entry<Key,Value> &other) const {
    return this->key < other.key;
  }
};
template <class KeyT> struct Accessor {
typedef KeyT Key;
template <class Value> Key operator()(const Entry<Key, Value> *entry) const {
return entry->GetKey();
}
};
template <class Key, class Value> void Check(const Entry<Key, Value> *begin, const Entry<Key, Value> *end, const boost::unordered_map<Key, Value> &reference, const Key key) {
typename boost::unordered_map<Key, Value>::const_iterator ref = reference.find(key);
typedef const Entry<Key, Value> *It;
// g++ can't tell that require will crash and burn.
It i = NULL;
bool ret = SortedUniformFind<It, Accessor<Key>, Pivot64>(Accessor<Key>(), begin, end, key, i);
if (ref == reference.end()) {
BOOST_CHECK(!ret);
} else {
BOOST_REQUIRE(ret);
BOOST_CHECK_EQUAL(ref->second, i->GetValue());
}
}
// Searching an empty range (begin == end == NULL) must report a miss.
BOOST_AUTO_TEST_CASE(empty) {
  typedef const Entry<uint64_t, float> T;
  typedef const T *It;

  const It nowhere = NULL;
  const uint64_t wanted = 10;
  It i;
  bool ret = SortedUniformFind<It, Accessor<uint64_t>, Pivot64>(Accessor<uint64_t>(), nowhere, nowhere, wanted, i);
  BOOST_CHECK(!ret);
}
template <class Key> void RandomTest(Key upper, size_t entries, size_t queries) {
typedef unsigned char Value;
boost::mt19937 rng;
boost::uniform_int<Key> range_key(0, upper);
boost::uniform_int<Value> range_value(0, 255);
boost::variate_generator<boost::mt19937&, boost::uniform_int<Key> > gen_key(rng, range_key);
boost::variate_generator<boost::mt19937&, boost::uniform_int<unsigned char> > gen_value(rng, range_value);
typedef Entry<Key, Value> Ent;
std::vector<Ent> backing;
boost::unordered_map<Key, unsigned char> reference;
Ent ent;
for (size_t i = 0; i < entries; ++i) {
Key key = gen_key();
unsigned char value = gen_value();
if (reference.insert(std::make_pair(key, value)).second) {
ent.key = key;
ent.value = value;
backing.push_back(ent);
}
}
std::sort(backing.begin(), backing.end());
// Random queries.
for (size_t i = 0; i < queries; ++i) {
const Key key = gen_key();
Check<Key, unsigned char>(&*backing.begin(), &*backing.end(), reference, key);
}
typename boost::unordered_map<Key, unsigned char>::const_iterator it = reference.begin();
for (size_t i = 0; (i < queries) && (it != reference.end()); ++i, ++it) {
Check<Key, unsigned char>(&*backing.begin(), &*backing.end(), reference, it->second);
}
}
// Each case below runs RandomTest<Key>(upper, entries, queries): keys are
// drawn from [0, upper], `entries` insertions are attempted and `queries`
// random lookups are checked against the reference map.

// 8-bit keys in [0, 11], 10 insertion attempts.
BOOST_AUTO_TEST_CASE(basic) {
RandomTest<uint8_t>(11, 10, 200);
}
// More insertion attempts (50) than distinct keys (12), so most keys end up present.
BOOST_AUTO_TEST_CASE(tiny_dense_random) {
RandomTest<uint8_t>(11, 50, 200);
}
// 8-bit keys in [0, 100], 100 insertion attempts.
BOOST_AUTO_TEST_CASE(small_dense_random) {
RandomTest<uint8_t>(100, 100, 200);
}
// 8-bit keys in [0, 200] with only 15 insertion attempts.
BOOST_AUTO_TEST_CASE(small_sparse_random) {
RandomTest<uint8_t>(200, 15, 200);
}
// 16-bit keys in [0, 32000], 1000 insertion attempts, 2000 queries.
BOOST_AUTO_TEST_CASE(medium_sparse_random) {
RandomTest<uint16_t>(32000, 1000, 2000);
}
// 64-bit keys over the full uint64_t range, 100000 insertion attempts.
BOOST_AUTO_TEST_CASE(sparse_random) {
RandomTest<uint64_t>(std::numeric_limits<uint64_t>::max(), 100000, 2000);
}
} // namespace
} // namespace util
#include "spaces.hh"
namespace util {
// Constant lookup table indexed by unsigned byte value; an entry is true for
// ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale) and
// false for every other byte. Spelled out as a literal so that the array can
// be a genuinely const bool table. One row per 16 byte values.
const bool kSpaces[256] = {
  /* 0x00 */ 0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,  // 0x09-0x0d: \t \n \v \f \r
  /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x20 */ 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x20: ' '
  /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xa0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xb0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xc0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xd0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xe0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xf0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
} // namespace util
#ifndef UTIL_SPACES_H
#define UTIL_SPACES_H
// Lookup table with one bool per byte value (256 entries). Index it with the
// byte converted to an unsigned value; per the comment on its definition, an
// entry is true for ' ', '\f', '\n', '\r', '\t' and '\v'.
namespace util {
extern const bool kSpaces[256];
} // namespace util
#endif // UTIL_SPACES_H
# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
# Explicitly list the source files for this subdirectory
#
# If you add any source files to this subdirectory
# that should be included in the kenlm library,
# (this excludes any unit test files)
# you should add them to the following list:
#
# In order to allow CMake files in the parent directory
# to see this variable definition, we set PARENT_SCOPE.
#
# In order to set correct paths to these files
# when this variable is referenced by CMake files in the parent directory,
# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
#
# Library sources of this subdirectory (unit tests excluded), as absolute
# paths, exported to the parent directory's scope.
set(KENLM_UTIL_STREAM_SOURCE
${CMAKE_CURRENT_SOURCE_DIR}/chain.cc
${CMAKE_CURRENT_SOURCE_DIR}/count_records.cc
${CMAKE_CURRENT_SOURCE_DIR}/io.cc
${CMAKE_CURRENT_SOURCE_DIR}/line_input.cc
${CMAKE_CURRENT_SOURCE_DIR}/multi_progress.cc
${CMAKE_CURRENT_SOURCE_DIR}/rewindable_stream.cc
PARENT_SCOPE)
# Unit tests are only configured when BUILD_TESTING is on.
if(BUILD_TESTING)
# Explicitly list the Boost test files to be compiled
set(KENLM_BOOST_TESTS_LIST
io_test
sort_test
stream_test
rewindable_stream_test
)
# AddTests is a project-provided helper (not defined in this file); it is
# handed the test names plus the libraries they need: kenlm_util, Boost and
# the threading library.
AddTests(TESTS ${KENLM_BOOST_TESTS_LIST}
LIBRARIES kenlm_util ${Boost_LIBRARIES} Threads::Threads)
endif()
#ifndef UTIL_STREAM_BLOCK_H
#define UTIL_STREAM_BLOCK_H
#include <cstddef>
#include <stdint.h>
namespace util {
namespace stream {
/**
 * A non-owning view of a region of memory plus the number of leading bytes
 * that currently hold valid data.
 *
 * A block whose memory pointer is NULL is "poison"; it tests false in a
 * boolean context.
 */
class Block {
  public:
    /** Builds a block with no memory (NULL) and a valid size of zero. */
    Block() : mem_(NULL), valid_size_(0) {}

    /**
     * Builds a block over an existing memory segment.
     *
     * @param[in] mem  Start of the segment to wrap
     * @param[in] size Length of the segment in bytes; also the initial valid size
     */
    Block(void *mem, std::size_t size) : mem_(mem), valid_size_(size) {}

    /**
     * Records how many bytes of this block hold valid data.
     *
     * @param[in] to Number of valid bytes
     */
    void SetValidSize(std::size_t to) { valid_size_ = to; }

    /**
     * Number of bytes of this block that hold valid data.
     * This is important because read might fill in less than Allocated at EOF.
     */
    std::size_t ValidSize() const { return valid_size_; }

    /** Start of the wrapped memory, mutable. */
    void *Get() { return mem_; }

    /** Start of the wrapped memory, read-only. */
    const void *Get() const { return mem_; }

    /** One past the last valid byte: Get() advanced by ValidSize() bytes. */
    const void *ValidEnd() const {
      const uint8_t *const first = static_cast<const uint8_t*>(mem_);
      return first + valid_size_;
    }

    /**
     * True when the block wraps non-NULL memory.
     *
     * Being an implicit conversion, this lets a Block be used directly as the
     * condition of an if statement or loop.
     */
    operator bool() const { return NULL != mem_; }

    /** True when the block wraps no memory, i.e. Get() == NULL. */
    bool operator!() const { return NULL == mem_; }

  private:
    friend class Link;
    friend class RewindableStream;

    /**
     * Turns this block into poison by setting its memory pointer to NULL.
     * The valid size is left as it was.
     */
    void SetToPoison() {
      mem_ = NULL;
    }

    void *mem_;
    std::size_t valid_size_;
};
} // namespace stream
} // namespace util
#endif // UTIL_STREAM_BLOCK_H
#include "chain.hh"
#include "io.hh"
#include "../exception.hh"
#include "../pcqueue.hh"
#include <cstdlib>
#include <new>
#include <iostream>
#include <stdint.h>
namespace util {
namespace stream {
// Seeds the message with a fixed prefix; throw sites append what exactly was
// misconfigured.
ChainConfigException::ChainConfigException() throw() { *this << "Chain configured with "; }
// Out-of-line destructor; nothing to release.
ChainConfigException::~ChainConfigException() throw() {}
// Destroying a Thread waits for its worker thread to finish.
Thread::~Thread() {
thread_.join();
}
// Last-resort handler for an exception that escaped a worker's Run(): prints
// the exception's message to stderr and terminates the process.
void Thread::UnhandledException(const std::exception &e) {
  const char *const message = e.what();
  // std::endl flushes, so the message is written out before abort().
  std::cerr << message << std::endl;
  abort();
}
// Resets every block passing through this position to the chain's full block
// size, until the input runs out of valid blocks.
void Recycler::Run(const ChainPosition &position) {
  Link l(position);
  while (l) {
    l->SetValidSize(position.GetChain().BlockSize());
    ++l;
  }
}

// Shared instance, usable as the right-hand side of `chain >> kRecycle`.
const Recycler kRecycle = Recycler();
// Validates the configuration and derives the per-block size.
// Throws ChainConfigException when entry_size or block_count is zero, or when
// total_memory cannot hold at least one entry per block.
Chain::Chain(const ChainConfig &config) : config_(config), complete_called_(false) {
  UTIL_THROW_IF(!config.entry_size, ChainConfigException, "zero-size entries.");
  UTIL_THROW_IF(!config.block_count, ChainConfigException, "block count zero");
  UTIL_THROW_IF(config.total_memory < config.entry_size * config.block_count, ChainConfigException, config.total_memory << " total memory, too small for " << config.block_count << " blocks of containing entries of size " << config.entry_size);
  // Round down block size to a multiple of entry size: work out how many whole
  // entries fit in each block's share of the memory, then scale back to bytes.
  const std::size_t entries_per_block = config.total_memory / (config.block_count * config.entry_size);
  block_size_ = entries_per_block * config.entry_size;
}
// Finishes any running pass; Wait()'s default argument also releases the
// block memory.
Chain::~Chain() {
Wait();
}
// Appends a new position at the end of the chain, starting the chain first if
// it is not running. The position reads from what was the last queue and
// writes to a freshly created queue, which becomes the new last queue.
ChainPosition Chain::Add() {
  if (!Running()) {
    Start();
  }
  PCQueue<Block> &in = queues_.back();
  queues_.push_back(new PCQueue<Block>(config_.block_count));
  PCQueue<Block> &out = queues_.back();
  return ChainPosition(in, out, this, progress_);
}
// Closes the loop with `writer`: the worker runs in a new thread at the
// position returned by Complete(), i.e. reading the last queue and feeding the
// first one.
Chain &Chain::operator>>(const WriteAndRecycle &writer) {
threads_.push_back(new Thread(Complete(), writer));
return *this;
}
// Brings the current pass to an end and returns the chain to the not-running
// state. When release_memory is true the block memory is freed as well.
void Chain::Wait(bool release_memory) {
  if (queues_.empty()) {
    assert(threads_.empty());
    return; // Nothing to wait for.
  }
  // Make sure something recycles blocks back to the lead queue.
  if (!complete_called_) CompleteLoop();
  // Destroying the Thread objects joins the worker threads.
  threads_.clear();
  // Drain the lead queue until poison (a false block) comes out. Seeing more
  // valid blocks than exist in the chain means poison never arrived.
  std::size_t i = 0;
  while (queues_.front().Consume()) {
    if (i == config_.block_count) {
      std::cerr << "Chain ending without poison." << std::endl;
      abort();
    }
    ++i;
  }
  queues_.clear();
  progress_.Finished();
  complete_called_ = false;
  if (release_memory) memory_.reset();
}
// Begins a new pass: finishes the previous one (keeping its memory), makes
// sure block memory exists, and loads the lead queue with every block.
void Chain::Start() {
  Wait(false);
  if (!memory_.get()) {
    // Allocate memory.
    assert(threads_.empty());
    assert(queues_.empty());
    std::size_t malloc_size = block_size_ * config_.block_count;
    memory_.reset(MallocOrThrow(malloc_size));
  }
  // This queue can accommodate all blocks.
  queues_.push_back(new PCQueue<Block>(config_.block_count));
  // Populate the lead queue with blocks: block i covers the i-th
  // block_size_-byte slice of the allocation.
  uint8_t *const base = static_cast<uint8_t*>(memory_.get());
  for (std::size_t i = 0; i < config_.block_count; ++i) {
    queues_.front().Produce(Block(base + i * block_size_, block_size_));
  }
}
// Returns the position that closes the loop: it reads from the last queue and
// writes back into the first. May be used once per pass; a second call throws
// util::Exception.
ChainPosition Chain::Complete() {
  assert(Running());
  UTIL_THROW_IF(complete_called_, util::Exception, "CompleteLoop() called twice");
  complete_called_ = true;
  PCQueue<Block> &tail = queues_.back();
  PCQueue<Block> &head = queues_.front();
  return ChainPosition(tail, head, this, progress_);
}
// An unattached link: no queues, and marked poisoned so that the destructor
// does not try to produce into a NULL output queue. Init() attaches it.
Link::Link() : in_(NULL), out_(NULL), poisoned_(true) {}
// Attaches this link to `position` and fetches the first block from the input
// queue. Throws util::Exception if the link is already attached.
void Link::Init(const ChainPosition &position) {
  UTIL_THROW_IF(in_, util::Exception, "Link::Init twice");
  poisoned_ = false;
  in_ = position.in_;
  out_ = position.out_;
  progress_ = position.progress_;
  // Fetch the first block; it becomes the current block.
  in_->Consume(current_);
}
// in_ must be NULL before Init() runs, because Init() treats a non-NULL in_ as
// "already initialized"; Init() then sets the remaining members.
Link::Link(const ChainPosition &position) : in_(NULL) {
Init(position);
}
// On destruction, forwards poison downstream unless that already happened.
// A link destroyed while still holding a valid block only reports the problem.
Link::~Link() {
  if (current_) {
    // Probably an exception unwinding.
    std::cerr << "Last input should have been poison. The program should end soon with an error. If it doesn't, there's a bug." << std::endl;
    // abort();
    return;
  }
  // Poison is a block whose memory pointer is NULL, and from here on
  // current_ is known to be exactly that.
  if (poisoned_) {
    return;
  }
  // Pass the current (poison) block!
  out_->Produce(current_);
}
// Hands the current block to the output queue and fetches the next one from
// the input queue. If the fetched block is poison, it is forwarded right away
// and the link is marked poisoned.
Link &Link::operator++() {
  assert(current_);
  // Account for the bytes of the block being handed on.
  progress_ += current_.ValidSize();
  out_->Produce(current_);
  in_->Consume(current_);
  if (current_) {
    return *this;
  }
  poisoned_ = true;
  out_->Produce(current_);
  return *this;
}
// Ends the stream early from this link: the current block is turned into
// poison and sent downstream. Must not be called on an already poisoned link.
void Link::Poison() {
assert(!poisoned_);
current_.SetToPoison();
out_->Produce(current_);
// Recorded last so the destructor does not send poison a second time.
poisoned_ = true;
}
} // namespace stream
} // namespace util
#ifndef UTIL_STREAM_CHAIN_H
#define UTIL_STREAM_CHAIN_H
#include "block.hh"
#include "config.hh"
#include "multi_progress.hh"
#include "../scoped.hh"
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/thread/thread.hpp>
#include <cstddef>
#include <cassert>
namespace util {
template <class T> class PCQueue;
namespace stream {
// Thrown by Chain's constructor when the ChainConfig is unusable. The
// constructor seeds the message with "Chain configured with ".
class ChainConfigException : public Exception {
public:
ChainConfigException() throw();
~ChainConfigException() throw();
};
class Chain;
class RewindableStream;
/**
 * Encapsulates a @ref PCQueue "producer queue" and a @ref PCQueue "consumer queue" within a @ref Chain "chain".
 *
 * Specifies position in chain for Link constructor.
 *
 * Only Chain, Link and RewindableStream can construct one or read its queues;
 * other code can merely ask which chain it belongs to.
 */
class ChainPosition {
public:
/** The chain this position belongs to. */
const Chain &GetChain() const { return *chain_; }
private:
friend class Chain;
friend class Link;
friend class RewindableStream;
// Stores the addresses of the two queues and registers a new worker with
// `progress` (via progress.Add()).
ChainPosition(PCQueue<Block> &in, PCQueue<Block> &out, Chain *chain, MultiProgress &progress)
: in_(&in), out_(&out), chain_(chain), progress_(progress.Add()) {}
// Queue to consume blocks from, and queue to produce blocks into.
PCQueue<Block> *in_, *out_;
Chain *chain_;
WorkerProgress progress_;
};
/**
 * Encapsulates a worker thread processing data at a given position in the chain.
 *
 * Each instance of this class owns one boost thread in which the worker is Run().
 * The destructor joins that thread.
 */
class Thread {
public:
/**
 * Constructs a new Thread in which the provided Worker is Run().
 *
 * Position is usually ChainPosition but if there are multiple streams involved, this can be ChainPositions.
 *
 * After a call to this constructor, the provided worker will be running within a boost thread owned by the newly constructed Thread object.
 *
 * The boost thread is handed a reference to this object as its callable, so
 * the new thread enters operator()() below.
 */
template <class Position, class Worker> Thread(const Position &position, const Worker &worker)
: thread_(boost::ref(*this), position, worker) {}
~Thread();
/**
 * Launches the provided worker in this object's boost thread.
 *
 * This method is called automatically by this class's @ref Thread() "constructor".
 *
 * Any std::exception escaping worker.Run() is passed to UnhandledException().
 */
template <class Position, class Worker> void operator()(const Position &position, Worker &worker) {
try {
worker.Run(position);
} catch (const std::exception &e) {
UnhandledException(e);
}
}
private:
// Prints e.what() to stderr and aborts the process.
void UnhandledException(const std::exception &e);
boost::thread thread_;
};
/**
 * This resets blocks to full valid size. Used to close the loop in Chain by recycling blocks.
 */
class Recycler {
public:
/**
 * Resets the blocks in the chain such that the blocks' respective valid sizes match the chain's block size.
 *
 * Runs until the link at this position stops yielding valid blocks.
 *
 * @see Block::SetValidSize()
 * @see Chain::BlockSize()
 */
void Run(const ChainPosition &position);
};
/** Shared Recycler instance, used by Chain::CompleteLoop(). */
extern const Recycler kRecycle;
class WriteAndRecycle;
/**
 * Represents a sequence of workers, through which @ref Block "blocks" can pass.
 *
 * Start() loads the first queue with every block; each added worker sits
 * between two consecutive queues; the loop is closed by a final worker that
 * reads the last queue and feeds the first one again.
 */
class Chain {
private:
/**
 * Names Chain as `type`, but can only be instantiated for a T that has a
 * member `void Run(const ChainPosition &)`. Used as the return type of the
 * templated operator>> overloads below.
 */
template <class T, void (T::*ptr)(const ChainPosition &) = &T::Run> struct CheckForRun {
typedef Chain type;
};
public:
/**
 * Constructs a configured Chain.
 *
 * @param config Specifies how to configure the Chain.
 * @throws ChainConfigException if entry_size or block_count is zero, or if
 *         total_memory is too small for one entry per block.
 */
explicit Chain(const ChainConfig &config);
/**
 * Destructs a Chain.
 *
 * This method waits for the chain's threads to complete,
 * and frees the memory held by this chain.
 */
~Chain();
/**
 * Activates progress reporting. Must be called while the chain is not running.
 */
void ActivateProgress() {
assert(!Running());
progress_.Activate();
}
/**
 * Forwards the progress target to the chain's MultiProgress.
 */
void SetProgressTarget(uint64_t target) {
progress_.SetTarget(target);
}
/**
 * Gets the number of bytes in each record of a Block.
 *
 * @see ChainConfig::entry_size
 */
std::size_t EntrySize() const {
return config_.entry_size;
}
/**
 * Gets the initial @ref Block::ValidSize "valid size" for @ref Block "blocks" in this chain.
 *
 * @see Block::ValidSize
 */
std::size_t BlockSize() const {
return block_size_;
}
/**
 * Number of blocks going through the Chain.
 */
std::size_t BlockCount() const {
return config_.block_count;
}
/** Two ways to add to the chain: Add() or operator>>. */
ChainPosition Add();
/**
 * Adds a new worker to this chain,
 * and runs that worker in a new Thread owned by this chain.
 *
 * The worker must have a Run method that accepts a position argument.
 *
 * @see Thread::operator()()
 */
template <class Worker> typename CheckForRun<Worker>::type &operator>>(const Worker &worker) {
assert(!complete_called_);
threads_.push_back(new Thread(Add(), worker));
return *this;
}
/**
 * Adds a new worker to this chain (but avoids copying that worker),
 * and runs that worker in a new Thread owned by this chain.
 *
 * The worker must have a Run method that accepts a position argument.
 *
 * @see Thread::operator()()
 */
template <class Worker> typename CheckForRun<Worker>::type &operator>>(const boost::reference_wrapper<Worker> &worker) {
assert(!complete_called_);
threads_.push_back(new Thread(Add(), worker));
return *this;
}
// Note that Link and Stream also define operator>> outside this class.
// To complete the loop, call CompleteLoop(), >> kRecycle, or the destructor.
/**
 * Closes the loop by running kRecycle in a new Thread at the position
 * returned by Complete().
 */
void CompleteLoop() {
threads_.push_back(new Thread(Complete(), kRecycle));
}
/**
 * Adds a Recycler worker to this chain,
 * and runs that worker in a new Thread owned by this chain.
 */
Chain &operator>>(const Recycler &) {
CompleteLoop();
return *this;
}
/**
 * Adds a WriteAndRecycle worker to this chain,
 * and runs that worker in a new Thread owned by this chain.
 */
Chain &operator>>(const WriteAndRecycle &writer);
// Chains are reusable. Call Wait to wait for everything to finish and free memory.
// Pass release_memory = false to keep the block memory for the next pass.
void Wait(bool release_memory = true);
// Waits for the current chain to complete (if any) then starts again.
void Start();
// A chain is running from Start() until Wait(): that is, while it has queues.
bool Running() const { return !queues_.empty(); }
private:
// Position that reads the last queue and writes the first; once per pass.
ChainPosition Complete();
ChainConfig config_;
// Size of each block in bytes: a multiple of config_.entry_size.
std::size_t block_size_;
// Backing storage for all blocks.
scoped_malloc memory_;
// queues_.front() is the lead queue; workers sit between consecutive queues.
boost::ptr_vector<PCQueue<Block> > queues_;
// True once the loop has been closed for the current pass.
bool complete_called_;
boost::ptr_vector<Thread> threads_;
MultiProgress progress_;
};
// Create the link in the worker thread using the position token.
/**
 * Represents a C++ style iterator over @ref Block "blocks".
 *
 * A link holds one current block at a time; incrementing it passes that block
 * to the output queue and takes the next one from the input queue.
 */
class Link {
public:
// Either default construct and Init or just construct all at once.
/**
 * Constructs an @ref Init "initialized" link.
 *
 * @see Init
 */
explicit Link(const ChainPosition &position);
/**
 * Constructs a link that must subsequently be @ref Init "initialized".
 *
 * @see Init
 */
Link();
/**
 * Initializes the link with the input @ref PCQueue "consumer queue" and output @ref PCQueue "producer queue" at a given @ref ChainPosition "position" in the @ref Chain "chain".
 *
 * Also consumes the first block from the input queue. Throws util::Exception
 * when called on a link that is already initialized.
 *
 * @see Link()
 */
void Init(const ChainPosition &position);
/**
 * Destructs the link object.
 *
 * If necessary, this method will pass a poison block
 * to this link's output @ref PCQueue "producer queue".
 *
 * @see Block::SetToPoison()
 */
~Link();
/**
 * Gets a reference to the @ref Block "block" at this link.
 */
Block &operator*() { return current_; }
/**
 * Gets a const reference to the @ref Block "block" at this link.
 */
const Block &operator*() const { return current_; }
/**
 * Gets a pointer to the @ref Block "block" at this link.
 */
Block *operator->() { return &current_; }
/**
 * Gets a const pointer to the @ref Block "block" at this link.
 */
const Block *operator->() const { return &current_; }
/**
 * Advances to the next @ref Block "block": the current block is produced to
 * the output queue and a new one is consumed from the input queue. If the new
 * block is poison, it is forwarded immediately.
 */
Link &operator++();
/**
 * Returns true if the @ref Block "block" at this link encapsulates a valid (non-NULL) block of memory.
 *
 * This method is a user-defined implicit conversion function to boolean;
 * among other things, this method enables bare instances of this class
 * to be used as the condition of an if statement.
 */
operator bool() const { return current_; }
/**
 * @ref Block::SetToPoison() "Poisons" the @ref Block "block" at this link,
 * and passes this now-poisoned block to this link's output @ref PCQueue "producer queue".
 *
 * @see Block::SetToPoison()
 */
void Poison();
private:
// The block currently held by this link.
Block current_;
// Queue blocks are consumed from, and queue blocks are produced into.
PCQueue<Block> *in_, *out_;
// True once poison has been sent downstream (or the link was never attached).
bool poisoned_;
WorkerProgress progress_;
};
// `chain >> link` attaches an existing (default-constructed) link at a new
// position at the end of the chain, without starting a thread.
inline Chain &operator>>(Chain &chain, Link &link) {
link.Init(chain.Add());
return chain;
}
} // namespace stream
} // namespace util
#endif // UTIL_STREAM_CHAIN_H
#ifndef UTIL_STREAM_CONFIG_H
#define UTIL_STREAM_CONFIG_H
#include <cstddef>
#include <string>
namespace util { namespace stream {
/**
 * Represents how a chain should be configured.
 */
struct ChainConfig {
  /**
   * Constructs a configuration with every field set to zero.
   *
   * The fields are meant to be filled in afterwards. Starting from zero
   * rather than indeterminate values means a configuration that was never
   * filled in is rejected by Chain's constructor (zero entry_size or
   * block_count) instead of being read uninitialized.
   */
  ChainConfig() : entry_size(0), block_count(0), total_memory(0) {}

  /**
   * Constructs a chain configuration object.
   *
   * @param [in] in_entry_size   Number of bytes in each record.
   * @param [in] in_block_count  Number of blocks in the chain.
   * @param [in] in_total_memory Total number of bytes available to the chain.
   *             This value will be divided amongst the blocks in the chain.
   */
  ChainConfig(std::size_t in_entry_size, std::size_t in_block_count, std::size_t in_total_memory)
    : entry_size(in_entry_size), block_count(in_block_count), total_memory(in_total_memory) {}

  /**
   * Number of bytes in each record.
   */
  std::size_t entry_size;

  /**
   * Number of blocks in the chain.
   */
  std::size_t block_count;

  /**
   * Total number of bytes available to the chain.
   * This value will be divided amongst the blocks in the chain.
   * Chain's constructor rounds each block's size down to a multiple of
   * entry_size, so slightly less than this may actually be used.
   */
  std::size_t total_memory;
};
/**
 * Represents how a sorter should be configured.
 *
 * Plain aggregate with no constructor: the two size fields hold indeterminate
 * values until the caller assigns them.
 */
struct SortConfig {
/** Filename prefix where temporary files should be placed. */
std::string temp_prefix;
/** Size of each input/output buffer. */
std::size_t buffer_size;
/** Total memory to use when running alone. */
std::size_t total_memory;
};
}} // namespaces
#endif // UTIL_STREAM_CONFIG_H
#include "count_records.hh"
#include "chain.hh"
namespace util { namespace stream {
// Adds, for every block passing through this position, the number of whole
// records in the block's valid bytes to the caller's counter.
void CountRecords::Run(const ChainPosition &position) {
  Link link(position);
  while (link) {
    *count_ += link->ValidSize() / position.GetChain().EntrySize();
    ++link;
  }
}
}} // namespaces
#ifndef UTIL_STREAM_COUNT_RECORDS_H
#define UTIL_STREAM_COUNT_RECORDS_H
#include <stdint.h>
namespace util { namespace stream {

class ChainPosition;

/**
 * Chain worker that counts the records flowing past its position.
 *
 * The count lives in caller-owned storage, which must stay alive for as long
 * as the worker (or any copy of it) may run.
 */
class CountRecords {
  public:
    /**
     * @param out Where the running count is kept; reset to zero here.
     */
    explicit CountRecords(uint64_t *out)
      : count_(out) {
      *count_ = 0;
    }

    /**
     * Adds the number of records in every block seen at this position to the
     * counter given at construction.
     */
    void Run(const ChainPosition &position);

  private:
    uint64_t *count_;
};

}} // namespaces
#endif // UTIL_STREAM_COUNT_RECORDS_H
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment