Unverified Commit f30dbe87 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

fix address alignment, required by cran (#3415)

* fix dataset binary file alignment

* many fixes

* fix warnings

* fix bug

* Update file_io.cpp

* Update file_io.cpp

* simplify code

* Apply suggestions from code review

* general

* remove unneeded alignment

* Update file_io.h

* int32 to byte8 alignment

* Apply suggestions from code review

* Apply suggestions from code review
parent 3c0e12dc
...@@ -93,11 +93,11 @@ class FeatureGroup { ...@@ -93,11 +93,11 @@ class FeatureGroup {
const char* memory_ptr = reinterpret_cast<const char*>(memory); const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get is_multi_val, is_sparse and num_feature // get is_multi_val, is_sparse and num_feature
is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr)); is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_multi_val_); memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_multi_val_));
is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr)); is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_sparse_); memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_));
num_feature_ = *(reinterpret_cast<const int*>(memory_ptr)); num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += sizeof(num_feature_); memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
// get bin mapper // get bin mapper
bin_mappers_.clear(); bin_mappers_.clear();
bin_offsets_.clear(); bin_offsets_.clear();
...@@ -301,9 +301,9 @@ class FeatureGroup { ...@@ -301,9 +301,9 @@ class FeatureGroup {
* \param writer Writer that the binary data is written to * \param writer Writer that the binary data is written to
*/ */
void SaveBinaryToFile(const VirtualFileWriter* writer) const { void SaveBinaryToFile(const VirtualFileWriter* writer) const {
writer->Write(&is_multi_val_, sizeof(is_multi_val_)); writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
writer->Write(&is_sparse_, sizeof(is_sparse_)); writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
writer->Write(&num_feature_, sizeof(num_feature_)); writer->AlignedWrite(&num_feature_, sizeof(num_feature_));
for (int i = 0; i < num_feature_; ++i) { for (int i = 0; i < num_feature_; ++i) {
bin_mappers_[i]->SaveBinaryToFile(writer); bin_mappers_[i]->SaveBinaryToFile(writer);
} }
...@@ -320,7 +320,9 @@ class FeatureGroup { ...@@ -320,7 +320,9 @@ class FeatureGroup {
* \brief Get sizes in byte of this object * \brief Get sizes in byte of this object
*/ */
size_t SizesInByte() const { size_t SizesInByte() const {
size_t ret = sizeof(is_multi_val_) + sizeof(is_sparse_) + sizeof(num_feature_); size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
VirtualFileWriter::AlignedSize(sizeof(num_feature_));
for (int i = 0; i < num_feature_; ++i) { for (int i = 0; i < num_feature_; ++i) {
ret += bin_mappers_[i]->SizesInByte(); ret += bin_mappers_[i]->SizesInByte();
} }
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include <cstring> #include <cstring>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <vector>
namespace LightGBM { namespace LightGBM {
...@@ -31,6 +32,16 @@ struct VirtualFileWriter { ...@@ -31,6 +32,16 @@ struct VirtualFileWriter {
* \return Number of bytes written * \return Number of bytes written
*/ */
virtual size_t Write(const void* data, size_t bytes) const = 0; virtual size_t Write(const void* data, size_t bytes) const = 0;
size_t AlignedWrite(const void* data, size_t bytes, size_t alignment = 8) const {
auto ret = Write(data, bytes);
if (bytes % alignment != 0) {
size_t padding = AlignedSize(bytes, alignment) - bytes;
std::vector<char> tmp(padding, 0);
ret += Write(tmp.data(), padding);
}
return ret;
}
/*! /*!
* \brief Create appropriate writer for filename * \brief Create appropriate writer for filename
* \param filename Filename of the data * \param filename Filename of the data
...@@ -43,6 +54,14 @@ struct VirtualFileWriter { ...@@ -43,6 +54,14 @@ struct VirtualFileWriter {
* \return True when the file exists * \return True when the file exists
*/ */
static bool Exists(const std::string& filename); static bool Exists(const std::string& filename);
/*!
 * \brief Round a byte count up to the next multiple of alignment
 * \param bytes Unpadded size in bytes
 * \param alignment Boundary in bytes; 0 means "no alignment" and returns bytes unchanged
 * \return Smallest multiple of alignment that is not less than bytes
 */
static size_t AlignedSize(size_t bytes, size_t alignment = 8) {
  // A zero alignment would make the modulo below divide by zero.
  if (alignment == 0) {
    return bytes;
  }
  const size_t remainder = bytes % alignment;
  // Only add padding when bytes is not already on a boundary.
  return remainder == 0 ? bytes : bytes + (alignment - remainder);
}
}; };
/** /**
......
...@@ -522,36 +522,37 @@ namespace LightGBM { ...@@ -522,36 +522,37 @@ namespace LightGBM {
int BinMapper::SizeForSpecificBin(int bin) { int BinMapper::SizeForSpecificBin(int bin) {
int size = 0; int size = 0;
size += sizeof(int); size += static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(int)));
size += sizeof(MissingType); size +=
size += sizeof(bool); static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(MissingType)));
size += static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(bool)));
size += sizeof(double); size += sizeof(double);
size += sizeof(BinType); size += static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(BinType)));
size += 2 * sizeof(double); size += 2 * sizeof(double);
size += bin * sizeof(double); size += bin * sizeof(double);
size += sizeof(uint32_t) * 2; size += static_cast<int>(VirtualFileWriter::AlignedSize(sizeof(uint32_t))) * 2;
return size; return size;
} }
void BinMapper::CopyTo(char * buffer) const { void BinMapper::CopyTo(char * buffer) const {
std::memcpy(buffer, &num_bin_, sizeof(num_bin_)); std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
buffer += sizeof(num_bin_); buffer += VirtualFileWriter::AlignedSize(sizeof(num_bin_));
std::memcpy(buffer, &missing_type_, sizeof(missing_type_)); std::memcpy(buffer, &missing_type_, sizeof(missing_type_));
buffer += sizeof(missing_type_); buffer += VirtualFileWriter::AlignedSize(sizeof(missing_type_));
std::memcpy(buffer, &is_trivial_, sizeof(is_trivial_)); std::memcpy(buffer, &is_trivial_, sizeof(is_trivial_));
buffer += sizeof(is_trivial_); buffer += VirtualFileWriter::AlignedSize(sizeof(is_trivial_));
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_)); std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_); buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_)); std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += sizeof(bin_type_); buffer += VirtualFileWriter::AlignedSize(sizeof(bin_type_));
std::memcpy(buffer, &min_val_, sizeof(min_val_)); std::memcpy(buffer, &min_val_, sizeof(min_val_));
buffer += sizeof(min_val_); buffer += sizeof(min_val_);
std::memcpy(buffer, &max_val_, sizeof(max_val_)); std::memcpy(buffer, &max_val_, sizeof(max_val_));
buffer += sizeof(max_val_); buffer += sizeof(max_val_);
std::memcpy(buffer, &default_bin_, sizeof(default_bin_)); std::memcpy(buffer, &default_bin_, sizeof(default_bin_));
buffer += sizeof(default_bin_); buffer += VirtualFileWriter::AlignedSize(sizeof(default_bin_));
std::memcpy(buffer, &most_freq_bin_, sizeof(most_freq_bin_)); std::memcpy(buffer, &most_freq_bin_, sizeof(most_freq_bin_));
buffer += sizeof(most_freq_bin_); buffer += VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) { if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double)); std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else { } else {
...@@ -561,23 +562,23 @@ namespace LightGBM { ...@@ -561,23 +562,23 @@ namespace LightGBM {
void BinMapper::CopyFrom(const char * buffer) { void BinMapper::CopyFrom(const char * buffer) {
std::memcpy(&num_bin_, buffer, sizeof(num_bin_)); std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
buffer += sizeof(num_bin_); buffer += VirtualFileWriter::AlignedSize(sizeof(num_bin_));
std::memcpy(&missing_type_, buffer, sizeof(missing_type_)); std::memcpy(&missing_type_, buffer, sizeof(missing_type_));
buffer += sizeof(missing_type_); buffer += VirtualFileWriter::AlignedSize(sizeof(missing_type_));
std::memcpy(&is_trivial_, buffer, sizeof(is_trivial_)); std::memcpy(&is_trivial_, buffer, sizeof(is_trivial_));
buffer += sizeof(is_trivial_); buffer += VirtualFileWriter::AlignedSize(sizeof(is_trivial_));
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_)); std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_); buffer += sizeof(sparse_rate_);
std::memcpy(&bin_type_, buffer, sizeof(bin_type_)); std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
buffer += sizeof(bin_type_); buffer += VirtualFileWriter::AlignedSize(sizeof(bin_type_));
std::memcpy(&min_val_, buffer, sizeof(min_val_)); std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_); buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_)); std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_); buffer += sizeof(max_val_);
std::memcpy(&default_bin_, buffer, sizeof(default_bin_)); std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
buffer += sizeof(default_bin_); buffer += VirtualFileWriter::AlignedSize(sizeof(default_bin_));
std::memcpy(&most_freq_bin_, buffer, sizeof(most_freq_bin_)); std::memcpy(&most_freq_bin_, buffer, sizeof(most_freq_bin_));
buffer += sizeof(most_freq_bin_); buffer += VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) { if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = std::vector<double>(num_bin_); bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double)); std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
...@@ -592,15 +593,15 @@ namespace LightGBM { ...@@ -592,15 +593,15 @@ namespace LightGBM {
} }
void BinMapper::SaveBinaryToFile(const VirtualFileWriter* writer) const { void BinMapper::SaveBinaryToFile(const VirtualFileWriter* writer) const {
writer->Write(&num_bin_, sizeof(num_bin_)); writer->AlignedWrite(&num_bin_, sizeof(num_bin_));
writer->Write(&missing_type_, sizeof(missing_type_)); writer->AlignedWrite(&missing_type_, sizeof(missing_type_));
writer->Write(&is_trivial_, sizeof(is_trivial_)); writer->AlignedWrite(&is_trivial_, sizeof(is_trivial_));
writer->Write(&sparse_rate_, sizeof(sparse_rate_)); writer->Write(&sparse_rate_, sizeof(sparse_rate_));
writer->Write(&bin_type_, sizeof(bin_type_)); writer->AlignedWrite(&bin_type_, sizeof(bin_type_));
writer->Write(&min_val_, sizeof(min_val_)); writer->Write(&min_val_, sizeof(min_val_));
writer->Write(&max_val_, sizeof(max_val_)); writer->Write(&max_val_, sizeof(max_val_));
writer->Write(&default_bin_, sizeof(default_bin_)); writer->AlignedWrite(&default_bin_, sizeof(default_bin_));
writer->Write(&most_freq_bin_, sizeof(most_freq_bin_)); writer->AlignedWrite(&most_freq_bin_, sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) { if (bin_type_ == BinType::NumericalBin) {
writer->Write(bin_upper_bound_.data(), sizeof(double) * num_bin_); writer->Write(bin_upper_bound_.data(), sizeof(double) * num_bin_);
} else { } else {
...@@ -609,8 +610,14 @@ namespace LightGBM { ...@@ -609,8 +610,14 @@ namespace LightGBM {
} }
size_t BinMapper::SizesInByte() const { size_t BinMapper::SizesInByte() const {
size_t ret = sizeof(num_bin_) + sizeof(missing_type_) + sizeof(is_trivial_) + sizeof(sparse_rate_) size_t ret = VirtualFileWriter::AlignedSize(sizeof(num_bin_)) +
+ sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_) + sizeof(most_freq_bin_); VirtualFileWriter::AlignedSize(sizeof(missing_type_)) +
VirtualFileWriter::AlignedSize(sizeof(is_trivial_)) +
sizeof(sparse_rate_) +
VirtualFileWriter::AlignedSize(sizeof(bin_type_)) +
sizeof(min_val_) + sizeof(max_val_) +
VirtualFileWriter::AlignedSize(sizeof(default_bin_)) +
VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) { if (bin_type_ == BinType::NumericalBin) {
ret += sizeof(double) * num_bin_; ret += sizeof(double) * num_bin_;
} else { } else {
......
...@@ -924,47 +924,61 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { ...@@ -924,47 +924,61 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
} }
Log::Info("Saving data to binary file %s", bin_filename); Log::Info("Saving data to binary file %s", bin_filename);
size_t size_of_token = std::strlen(binary_file_token); size_t size_of_token = std::strlen(binary_file_token);
writer->Write(binary_file_token, size_of_token); writer->AlignedWrite(binary_file_token, size_of_token);
// get size of header // get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_) size_t size_of_header =
+ sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_) VirtualFileWriter::AlignedSize(sizeof(num_data_)) +
+ 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ VirtualFileWriter::AlignedSize(sizeof(num_features_)) +
+ sizeof(int32_t) * num_total_features_ + sizeof(int) * 3 + sizeof(bool) * 2; VirtualFileWriter::AlignedSize(sizeof(num_total_features_)) +
VirtualFileWriter::AlignedSize(sizeof(int) * num_total_features_) +
VirtualFileWriter::AlignedSize(sizeof(label_idx_)) +
VirtualFileWriter::AlignedSize(sizeof(num_groups_)) +
3 * VirtualFileWriter::AlignedSize(sizeof(int) * num_features_) +
sizeof(uint64_t) * (num_groups_ + 1) +
2 * VirtualFileWriter::AlignedSize(sizeof(int) * num_groups_) +
VirtualFileWriter::AlignedSize(sizeof(int32_t) * num_total_features_) +
VirtualFileWriter::AlignedSize(sizeof(int)) * 3 +
VirtualFileWriter::AlignedSize(sizeof(bool)) * 2;
// size of feature names // size of feature names
for (int i = 0; i < num_total_features_; ++i) { for (int i = 0; i < num_total_features_; ++i) {
size_of_header += feature_names_[i].size() + sizeof(int); size_of_header +=
VirtualFileWriter::AlignedSize(feature_names_[i].size()) +
VirtualFileWriter::AlignedSize(sizeof(int));
} }
// size of forced bins // size of forced bins
for (int i = 0; i < num_total_features_; ++i) { for (int i = 0; i < num_total_features_; ++i) {
size_of_header += size_of_header += forced_bin_bounds_[i].size() * sizeof(double) +
forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int); VirtualFileWriter::AlignedSize(sizeof(int));
} }
writer->Write(&size_of_header, sizeof(size_of_header)); writer->Write(&size_of_header, sizeof(size_of_header));
// write header // write header
writer->Write(&num_data_, sizeof(num_data_)); writer->AlignedWrite(&num_data_, sizeof(num_data_));
writer->Write(&num_features_, sizeof(num_features_)); writer->AlignedWrite(&num_features_, sizeof(num_features_));
writer->Write(&num_total_features_, sizeof(num_total_features_)); writer->AlignedWrite(&num_total_features_, sizeof(num_total_features_));
writer->Write(&label_idx_, sizeof(label_idx_)); writer->AlignedWrite(&label_idx_, sizeof(label_idx_));
writer->Write(&max_bin_, sizeof(max_bin_)); writer->AlignedWrite(&max_bin_, sizeof(max_bin_));
writer->Write(&bin_construct_sample_cnt_, writer->AlignedWrite(&bin_construct_sample_cnt_,
sizeof(bin_construct_sample_cnt_)); sizeof(bin_construct_sample_cnt_));
writer->Write(&min_data_in_bin_, sizeof(min_data_in_bin_)); writer->AlignedWrite(&min_data_in_bin_, sizeof(min_data_in_bin_));
writer->Write(&use_missing_, sizeof(use_missing_)); writer->AlignedWrite(&use_missing_, sizeof(use_missing_));
writer->Write(&zero_as_missing_, sizeof(zero_as_missing_)); writer->AlignedWrite(&zero_as_missing_, sizeof(zero_as_missing_));
writer->Write(used_feature_map_.data(), sizeof(int) * num_total_features_); writer->AlignedWrite(used_feature_map_.data(),
writer->Write(&num_groups_, sizeof(num_groups_)); sizeof(int) * num_total_features_);
writer->Write(real_feature_idx_.data(), sizeof(int) * num_features_); writer->AlignedWrite(&num_groups_, sizeof(num_groups_));
writer->Write(feature2group_.data(), sizeof(int) * num_features_); writer->AlignedWrite(real_feature_idx_.data(), sizeof(int) * num_features_);
writer->Write(feature2subfeature_.data(), sizeof(int) * num_features_); writer->AlignedWrite(feature2group_.data(), sizeof(int) * num_features_);
writer->AlignedWrite(feature2subfeature_.data(),
sizeof(int) * num_features_);
writer->Write(group_bin_boundaries_.data(), writer->Write(group_bin_boundaries_.data(),
sizeof(uint64_t) * (num_groups_ + 1)); sizeof(uint64_t) * (num_groups_ + 1));
writer->Write(group_feature_start_.data(), sizeof(int) * num_groups_); writer->AlignedWrite(group_feature_start_.data(),
writer->Write(group_feature_cnt_.data(), sizeof(int) * num_groups_); sizeof(int) * num_groups_);
writer->AlignedWrite(group_feature_cnt_.data(), sizeof(int) * num_groups_);
if (max_bin_by_feature_.empty()) { if (max_bin_by_feature_.empty()) {
ArrayArgs<int32_t>::Assign(&max_bin_by_feature_, -1, num_total_features_); ArrayArgs<int32_t>::Assign(&max_bin_by_feature_, -1, num_total_features_);
} }
writer->Write(max_bin_by_feature_.data(), writer->AlignedWrite(max_bin_by_feature_.data(),
sizeof(int32_t) * num_total_features_); sizeof(int32_t) * num_total_features_);
if (ArrayArgs<int32_t>::CheckAll(max_bin_by_feature_, -1)) { if (ArrayArgs<int32_t>::CheckAll(max_bin_by_feature_, -1)) {
max_bin_by_feature_.clear(); max_bin_by_feature_.clear();
...@@ -972,14 +986,14 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { ...@@ -972,14 +986,14 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
// write feature names // write feature names
for (int i = 0; i < num_total_features_; ++i) { for (int i = 0; i < num_total_features_; ++i) {
int str_len = static_cast<int>(feature_names_[i].size()); int str_len = static_cast<int>(feature_names_[i].size());
writer->Write(&str_len, sizeof(int)); writer->AlignedWrite(&str_len, sizeof(int));
const char* c_str = feature_names_[i].c_str(); const char* c_str = feature_names_[i].c_str();
writer->Write(c_str, sizeof(char) * str_len); writer->AlignedWrite(c_str, sizeof(char) * str_len);
} }
// write forced bins // write forced bins
for (int i = 0; i < num_total_features_; ++i) { for (int i = 0; i < num_total_features_; ++i) {
int num_bounds = static_cast<int>(forced_bin_bounds_[i].size()); int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
writer->Write(&num_bounds, sizeof(int)); writer->AlignedWrite(&num_bounds, sizeof(int));
for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) { for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
writer->Write(&forced_bin_bounds_[i][j], sizeof(double)); writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
......
...@@ -286,8 +286,10 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b ...@@ -286,8 +286,10 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
// check token // check token
size_t size_of_token = std::strlen(Dataset::binary_file_token); size_t size_of_token = std::strlen(Dataset::binary_file_token);
size_t read_cnt = reader->Read(buffer.data(), sizeof(char) * size_of_token); size_t read_cnt = reader->Read(
if (read_cnt != sizeof(char) * size_of_token) { buffer.data(),
VirtualFileWriter::AlignedSize(sizeof(char) * size_of_token));
if (read_cnt < sizeof(char) * size_of_token) {
Log::Fatal("Binary file error: token has the wrong size"); Log::Fatal("Binary file error: token has the wrong size");
} }
if (std::string(buffer.data()) != std::string(Dataset::binary_file_token)) { if (std::string(buffer.data()) != std::string(Dataset::binary_file_token)) {
...@@ -317,53 +319,59 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b ...@@ -317,53 +319,59 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
// get header // get header
const char* mem_ptr = buffer.data(); const char* mem_ptr = buffer.data();
dataset->num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr)); dataset->num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
mem_ptr += sizeof(dataset->num_data_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_data_));
dataset->num_features_ = *(reinterpret_cast<const int*>(mem_ptr)); dataset->num_features_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->num_features_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_features_));
dataset->num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr)); dataset->num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->num_total_features_); mem_ptr +=
VirtualFileWriter::AlignedSize(sizeof(dataset->num_total_features_));
dataset->label_idx_ = *(reinterpret_cast<const int*>(mem_ptr)); dataset->label_idx_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->label_idx_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->label_idx_));
dataset->max_bin_ = *(reinterpret_cast<const int*>(mem_ptr)); dataset->max_bin_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->max_bin_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->max_bin_));
dataset->bin_construct_sample_cnt_ = *(reinterpret_cast<const int*>(mem_ptr)); dataset->bin_construct_sample_cnt_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->bin_construct_sample_cnt_); mem_ptr += VirtualFileWriter::AlignedSize(
sizeof(dataset->bin_construct_sample_cnt_));
dataset->min_data_in_bin_ = *(reinterpret_cast<const int*>(mem_ptr)); dataset->min_data_in_bin_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->min_data_in_bin_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->min_data_in_bin_));
dataset->use_missing_ = *(reinterpret_cast<const bool*>(mem_ptr)); dataset->use_missing_ = *(reinterpret_cast<const bool*>(mem_ptr));
mem_ptr += sizeof(dataset->use_missing_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->use_missing_));
dataset->zero_as_missing_ = *(reinterpret_cast<const bool*>(mem_ptr)); dataset->zero_as_missing_ = *(reinterpret_cast<const bool*>(mem_ptr));
mem_ptr += sizeof(dataset->zero_as_missing_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->zero_as_missing_));
const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr); const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
dataset->used_feature_map_.clear(); dataset->used_feature_map_.clear();
for (int i = 0; i < dataset->num_total_features_; ++i) { for (int i = 0; i < dataset->num_total_features_; ++i) {
dataset->used_feature_map_.push_back(tmp_feature_map[i]); dataset->used_feature_map_.push_back(tmp_feature_map[i]);
} }
mem_ptr += sizeof(int) * dataset->num_total_features_; mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) *
dataset->num_total_features_);
// num_groups // num_groups
dataset->num_groups_ = *(reinterpret_cast<const int*>(mem_ptr)); dataset->num_groups_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->num_groups_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_groups_));
// real_feature_idx_ // real_feature_idx_
const int* tmp_ptr_real_feature_idx_ = reinterpret_cast<const int*>(mem_ptr); const int* tmp_ptr_real_feature_idx_ = reinterpret_cast<const int*>(mem_ptr);
dataset->real_feature_idx_.clear(); dataset->real_feature_idx_.clear();
for (int i = 0; i < dataset->num_features_; ++i) { for (int i = 0; i < dataset->num_features_; ++i) {
dataset->real_feature_idx_.push_back(tmp_ptr_real_feature_idx_[i]); dataset->real_feature_idx_.push_back(tmp_ptr_real_feature_idx_[i]);
} }
mem_ptr += sizeof(int) * dataset->num_features_; mem_ptr +=
VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_);
// feature2group // feature2group
const int* tmp_ptr_feature2group = reinterpret_cast<const int*>(mem_ptr); const int* tmp_ptr_feature2group = reinterpret_cast<const int*>(mem_ptr);
dataset->feature2group_.clear(); dataset->feature2group_.clear();
for (int i = 0; i < dataset->num_features_; ++i) { for (int i = 0; i < dataset->num_features_; ++i) {
dataset->feature2group_.push_back(tmp_ptr_feature2group[i]); dataset->feature2group_.push_back(tmp_ptr_feature2group[i]);
} }
mem_ptr += sizeof(int) * dataset->num_features_; mem_ptr +=
VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_);
// feature2subfeature // feature2subfeature
const int* tmp_ptr_feature2subfeature = reinterpret_cast<const int*>(mem_ptr); const int* tmp_ptr_feature2subfeature = reinterpret_cast<const int*>(mem_ptr);
dataset->feature2subfeature_.clear(); dataset->feature2subfeature_.clear();
for (int i = 0; i < dataset->num_features_; ++i) { for (int i = 0; i < dataset->num_features_; ++i) {
dataset->feature2subfeature_.push_back(tmp_ptr_feature2subfeature[i]); dataset->feature2subfeature_.push_back(tmp_ptr_feature2subfeature[i]);
} }
mem_ptr += sizeof(int) * dataset->num_features_; mem_ptr +=
VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_);
// group_bin_boundaries // group_bin_boundaries
const uint64_t* tmp_ptr_group_bin_boundaries = reinterpret_cast<const uint64_t*>(mem_ptr); const uint64_t* tmp_ptr_group_bin_boundaries = reinterpret_cast<const uint64_t*>(mem_ptr);
dataset->group_bin_boundaries_.clear(); dataset->group_bin_boundaries_.clear();
...@@ -378,7 +386,8 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b ...@@ -378,7 +386,8 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
for (int i = 0; i < dataset->num_groups_; ++i) { for (int i = 0; i < dataset->num_groups_; ++i) {
dataset->group_feature_start_.push_back(tmp_ptr_group_feature_start[i]); dataset->group_feature_start_.push_back(tmp_ptr_group_feature_start[i]);
} }
mem_ptr += sizeof(int) * (dataset->num_groups_); mem_ptr +=
VirtualFileWriter::AlignedSize(sizeof(int) * (dataset->num_groups_));
// group_feature_cnt_ // group_feature_cnt_
const int* tmp_ptr_group_feature_cnt = reinterpret_cast<const int*>(mem_ptr); const int* tmp_ptr_group_feature_cnt = reinterpret_cast<const int*>(mem_ptr);
...@@ -386,7 +395,8 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b ...@@ -386,7 +395,8 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
for (int i = 0; i < dataset->num_groups_; ++i) { for (int i = 0; i < dataset->num_groups_; ++i) {
dataset->group_feature_cnt_.push_back(tmp_ptr_group_feature_cnt[i]); dataset->group_feature_cnt_.push_back(tmp_ptr_group_feature_cnt[i]);
} }
mem_ptr += sizeof(int) * (dataset->num_groups_); mem_ptr +=
VirtualFileWriter::AlignedSize(sizeof(int) * (dataset->num_groups_));
if (!config_.max_bin_by_feature.empty()) { if (!config_.max_bin_by_feature.empty()) {
CHECK_EQ(static_cast<size_t>(dataset->num_total_features_), config_.max_bin_by_feature.size()); CHECK_EQ(static_cast<size_t>(dataset->num_total_features_), config_.max_bin_by_feature.size());
...@@ -400,7 +410,8 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b ...@@ -400,7 +410,8 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
dataset->max_bin_by_feature_.push_back(tmp_ptr_max_bin_by_feature[i]); dataset->max_bin_by_feature_.push_back(tmp_ptr_max_bin_by_feature[i]);
} }
} }
mem_ptr += sizeof(int32_t) * (dataset->num_total_features_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int32_t) *
(dataset->num_total_features_));
if (ArrayArgs<int32_t>::CheckAll(dataset->max_bin_by_feature_, -1)) { if (ArrayArgs<int32_t>::CheckAll(dataset->max_bin_by_feature_, -1)) {
dataset->max_bin_by_feature_.clear(); dataset->max_bin_by_feature_.clear();
} }
...@@ -410,22 +421,24 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b ...@@ -410,22 +421,24 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
// read feature names // read feature names
for (int i = 0; i < dataset->num_total_features_; ++i) { for (int i = 0; i < dataset->num_total_features_; ++i) {
int str_len = *(reinterpret_cast<const int*>(mem_ptr)); int str_len = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(int); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int));
std::stringstream str_buf; std::stringstream str_buf;
auto tmp_arr = reinterpret_cast<const char*>(mem_ptr);
for (int j = 0; j < str_len; ++j) { for (int j = 0; j < str_len; ++j) {
char tmp_char = *(reinterpret_cast<const char*>(mem_ptr)); char tmp_char = tmp_arr[j];
mem_ptr += sizeof(char);
str_buf << tmp_char; str_buf << tmp_char;
} }
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(char) * str_len);
dataset->feature_names_.emplace_back(str_buf.str()); dataset->feature_names_.emplace_back(str_buf.str());
} }
// get forced_bin_bounds_ // get forced_bin_bounds_
dataset->forced_bin_bounds_ = std::vector<std::vector<double>>(dataset->num_total_features_, std::vector<double>()); dataset->forced_bin_bounds_ = std::vector<std::vector<double>>(dataset->num_total_features_, std::vector<double>());
for (int i = 0; i < dataset->num_total_features_; ++i) { for (int i = 0; i < dataset->num_total_features_; ++i) {
int num_bounds = *(reinterpret_cast<const int*>(mem_ptr)); int num_bounds = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(int); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int));
dataset->forced_bin_bounds_[i] = std::vector<double>(); dataset->forced_bin_bounds_[i] = std::vector<double>();
const double* tmp_ptr_forced_bounds = reinterpret_cast<const double*>(mem_ptr); const double* tmp_ptr_forced_bounds =
reinterpret_cast<const double*>(mem_ptr);
for (int j = 0; j < num_bounds; ++j) { for (int j = 0; j < num_bounds; ++j) {
double bound = tmp_ptr_forced_bounds[j]; double bound = tmp_ptr_forced_bounds[j];
dataset->forced_bin_bounds_[i].push_back(bound); dataset->forced_bin_bounds_[i].push_back(bound);
......
...@@ -452,10 +452,12 @@ class DenseBin : public Bin { ...@@ -452,10 +452,12 @@ class DenseBin : public Bin {
} }
void SaveBinaryToFile(const VirtualFileWriter* writer) const override { void SaveBinaryToFile(const VirtualFileWriter* writer) const override {
writer->Write(data_.data(), sizeof(VAL_T) * data_.size()); writer->AlignedWrite(data_.data(), sizeof(VAL_T) * data_.size());
} }
size_t SizesInByte() const override { return sizeof(VAL_T) * data_.size(); } size_t SizesInByte() const override {
return VirtualFileWriter::AlignedSize(sizeof(VAL_T) * data_.size());
}
DenseBin<VAL_T, IS_4BIT>* Clone() override; DenseBin<VAL_T, IS_4BIT>* Clone() override;
......
...@@ -472,44 +472,46 @@ void Metadata::LoadFromMemory(const void* memory) { ...@@ -472,44 +472,46 @@ void Metadata::LoadFromMemory(const void* memory) {
const char* mem_ptr = reinterpret_cast<const char*>(memory); const char* mem_ptr = reinterpret_cast<const char*>(memory);
num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr)); num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
mem_ptr += sizeof(num_data_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(num_data_));
num_weights_ = *(reinterpret_cast<const data_size_t*>(mem_ptr)); num_weights_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
mem_ptr += sizeof(num_weights_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(num_weights_));
num_queries_ = *(reinterpret_cast<const data_size_t*>(mem_ptr)); num_queries_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
mem_ptr += sizeof(num_queries_); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(num_queries_));
if (!label_.empty()) { label_.clear(); } if (!label_.empty()) { label_.clear(); }
label_ = std::vector<label_t>(num_data_); label_ = std::vector<label_t>(num_data_);
std::memcpy(label_.data(), mem_ptr, sizeof(label_t) * num_data_); std::memcpy(label_.data(), mem_ptr, sizeof(label_t) * num_data_);
mem_ptr += sizeof(label_t) * num_data_; mem_ptr += VirtualFileWriter::AlignedSize(sizeof(label_t) * num_data_);
if (num_weights_ > 0) { if (num_weights_ > 0) {
if (!weights_.empty()) { weights_.clear(); } if (!weights_.empty()) { weights_.clear(); }
weights_ = std::vector<label_t>(num_weights_); weights_ = std::vector<label_t>(num_weights_);
std::memcpy(weights_.data(), mem_ptr, sizeof(label_t) * num_weights_); std::memcpy(weights_.data(), mem_ptr, sizeof(label_t) * num_weights_);
mem_ptr += sizeof(label_t) * num_weights_; mem_ptr += VirtualFileWriter::AlignedSize(sizeof(label_t) * num_weights_);
weight_load_from_file_ = true; weight_load_from_file_ = true;
} }
if (num_queries_ > 0) { if (num_queries_ > 0) {
if (!query_boundaries_.empty()) { query_boundaries_.clear(); } if (!query_boundaries_.empty()) { query_boundaries_.clear(); }
query_boundaries_ = std::vector<data_size_t>(num_queries_ + 1); query_boundaries_ = std::vector<data_size_t>(num_queries_ + 1);
std::memcpy(query_boundaries_.data(), mem_ptr, sizeof(data_size_t) * (num_queries_ + 1)); std::memcpy(query_boundaries_.data(), mem_ptr, sizeof(data_size_t) * (num_queries_ + 1));
mem_ptr += sizeof(data_size_t) * (num_queries_ + 1); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(data_size_t) *
(num_queries_ + 1));
query_load_from_file_ = true; query_load_from_file_ = true;
} }
LoadQueryWeights(); LoadQueryWeights();
} }
void Metadata::SaveBinaryToFile(const VirtualFileWriter* writer) const { void Metadata::SaveBinaryToFile(const VirtualFileWriter* writer) const {
writer->Write(&num_data_, sizeof(num_data_)); writer->AlignedWrite(&num_data_, sizeof(num_data_));
writer->Write(&num_weights_, sizeof(num_weights_)); writer->AlignedWrite(&num_weights_, sizeof(num_weights_));
writer->Write(&num_queries_, sizeof(num_queries_)); writer->AlignedWrite(&num_queries_, sizeof(num_queries_));
writer->Write(label_.data(), sizeof(label_t) * num_data_); writer->AlignedWrite(label_.data(), sizeof(label_t) * num_data_);
if (!weights_.empty()) { if (!weights_.empty()) {
writer->Write(weights_.data(), sizeof(label_t) * num_weights_); writer->AlignedWrite(weights_.data(), sizeof(label_t) * num_weights_);
} }
if (!query_boundaries_.empty()) { if (!query_boundaries_.empty()) {
writer->Write(query_boundaries_.data(), sizeof(data_size_t) * (num_queries_ + 1)); writer->AlignedWrite(query_boundaries_.data(),
sizeof(data_size_t) * (num_queries_ + 1));
} }
if (num_init_score_ > 0) { if (num_init_score_ > 0) {
Log::Warning("Please note that `init_score` is not saved in binary file.\n" Log::Warning("Please note that `init_score` is not saved in binary file.\n"
...@@ -518,14 +520,16 @@ void Metadata::SaveBinaryToFile(const VirtualFileWriter* writer) const { ...@@ -518,14 +520,16 @@ void Metadata::SaveBinaryToFile(const VirtualFileWriter* writer) const {
} }
size_t Metadata::SizesInByte() const { size_t Metadata::SizesInByte() const {
size_t size = sizeof(num_data_) + sizeof(num_weights_) size_t size = VirtualFileWriter::AlignedSize(sizeof(num_data_)) +
+ sizeof(num_queries_); VirtualFileWriter::AlignedSize(sizeof(num_weights_)) +
size += sizeof(label_t) * num_data_; VirtualFileWriter::AlignedSize(sizeof(num_queries_));
size += VirtualFileWriter::AlignedSize(sizeof(label_t) * num_data_);
if (!weights_.empty()) { if (!weights_.empty()) {
size += sizeof(label_t) * num_weights_; size += VirtualFileWriter::AlignedSize(sizeof(label_t) * num_weights_);
} }
if (!query_boundaries_.empty()) { if (!query_boundaries_.empty()) {
size += sizeof(data_size_t) * (num_queries_ + 1); size += VirtualFileWriter::AlignedSize(sizeof(data_size_t) *
(num_queries_ + 1));
} }
return size; return size;
} }
......
...@@ -503,14 +503,15 @@ class SparseBin : public Bin { ...@@ -503,14 +503,15 @@ class SparseBin : public Bin {
} }
void SaveBinaryToFile(const VirtualFileWriter* writer) const override { void SaveBinaryToFile(const VirtualFileWriter* writer) const override {
writer->Write(&num_vals_, sizeof(num_vals_)); writer->AlignedWrite(&num_vals_, sizeof(num_vals_));
writer->Write(deltas_.data(), sizeof(uint8_t) * (num_vals_ + 1)); writer->AlignedWrite(deltas_.data(), sizeof(uint8_t) * (num_vals_ + 1));
writer->Write(vals_.data(), sizeof(VAL_T) * num_vals_); writer->AlignedWrite(vals_.data(), sizeof(VAL_T) * num_vals_);
} }
size_t SizesInByte() const override { size_t SizesInByte() const override {
return sizeof(num_vals_) + sizeof(uint8_t) * (num_vals_ + 1) + return VirtualFileWriter::AlignedSize(sizeof(num_vals_)) +
sizeof(VAL_T) * num_vals_; VirtualFileWriter::AlignedSize(sizeof(uint8_t) * (num_vals_ + 1)) +
VirtualFileWriter::AlignedSize(sizeof(VAL_T) * num_vals_);
} }
void LoadFromMemory( void LoadFromMemory(
...@@ -518,9 +519,9 @@ class SparseBin : public Bin { ...@@ -518,9 +519,9 @@ class SparseBin : public Bin {
const std::vector<data_size_t>& local_used_indices) override { const std::vector<data_size_t>& local_used_indices) override {
const char* mem_ptr = reinterpret_cast<const char*>(memory); const char* mem_ptr = reinterpret_cast<const char*>(memory);
data_size_t tmp_num_vals = *(reinterpret_cast<const data_size_t*>(mem_ptr)); data_size_t tmp_num_vals = *(reinterpret_cast<const data_size_t*>(mem_ptr));
mem_ptr += sizeof(tmp_num_vals); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(tmp_num_vals));
const uint8_t* tmp_delta = reinterpret_cast<const uint8_t*>(mem_ptr); const uint8_t* tmp_delta = reinterpret_cast<const uint8_t*>(mem_ptr);
mem_ptr += sizeof(uint8_t) * (tmp_num_vals + 1); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(uint8_t) * (tmp_num_vals + 1));
const VAL_T* tmp_vals = reinterpret_cast<const VAL_T*>(mem_ptr); const VAL_T* tmp_vals = reinterpret_cast<const VAL_T*>(mem_ptr);
deltas_.clear(); deltas_.clear();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment