Unverified commit 0f7983b6 authored by Scott Votaw, committed by GitHub

feature: Add serialization of reference dataset (#5427)

* Add serialization of reference dataset

* lint and missing file

* Fixes from reviewers

* responded to comments

* revert sdk change
parent 74dfd905
......@@ -104,7 +104,7 @@ class BinMapper {
* \brief Save binary data to file
 * \param writer The writer to write to
*/
void SaveBinaryToFile(const VirtualFileWriter* writer) const;
void SaveBinaryToFile(BinaryWriter* writer) const;
/*!
* \brief Mapping bin into feature value
......@@ -286,7 +286,7 @@ class Bin {
* \brief Save binary data to file
 * \param writer The writer to write to
*/
virtual void SaveBinaryToFile(const VirtualFileWriter* writer) const = 0;
virtual void SaveBinaryToFile(BinaryWriter* writer) const = 0;
/*!
* \brief Load from memory
......
......@@ -29,6 +29,7 @@
typedef void* DatasetHandle; /*!< \brief Handle of dataset. */
typedef void* BoosterHandle; /*!< \brief Handle of booster. */
typedef void* FastConfigHandle; /*!< \brief Handle of FastConfig. */
typedef void* ByteBufferHandle; /*!< \brief Handle of ByteBuffer. */
#define C_API_DTYPE_FLOAT32 (0) /*!< \brief float32 (single precision float). */
#define C_API_DTYPE_FLOAT64 (1) /*!< \brief float64 (double precision float). */
......@@ -96,6 +97,22 @@ LIGHTGBM_C_EXPORT int LGBM_SampleIndices(int32_t num_total_row,
void* out,
int32_t* out_len);
/*!
* \brief Get a ByteBuffer value at an index.
* \param handle Handle of byte buffer to be read
* \param index Index of value to return
* \param[out] out_val Byte value at index to return
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_ByteBufferGetAt(ByteBufferHandle handle, int32_t index, uint8_t* out_val);
/*!
* \brief Free space for byte buffer.
* \param handle Handle of byte buffer to be freed
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_ByteBufferFree(ByteBufferHandle handle);
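Since the buffer contents are only reachable one byte at a time through this API, a typical caller copies them into its own array before freeing the handle. A minimal sketch, assuming the handle and length come from LGBM_DatasetSerializeReferenceToBinary declared further below (error codes ignored for brevity; the helper name is illustrative, not part of the API):

#include <cstdint>
#include <vector>
#include <LightGBM/c_api.h>

// Copy `len` bytes out of a ByteBufferHandle, then release the native buffer.
static std::vector<uint8_t> CopyAndFreeByteBuffer(ByteBufferHandle handle, int32_t len) {
  std::vector<uint8_t> bytes(static_cast<size_t>(len));
  for (int32_t i = 0; i < len; ++i) {
    LGBM_ByteBufferGetAt(handle, i, &bytes[static_cast<size_t>(i)]);  // one byte per call
  }
  LGBM_ByteBufferFree(handle);  // the caller owns the handle and must free it
  return bytes;
}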
/* --- start Dataset interface */
/*!
......@@ -164,6 +181,23 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetInitStreaming(DatasetHandle dataset,
int32_t nthreads,
int32_t omp_max_threads);
/*!
 * \brief Allocate space for the dataset and bucket feature bins according to a serialized reference dataset.
* \param ref_buffer A binary representation of the dataset schema (feature groups, bins, etc.)
* \param ref_buffer_size The size of the reference array in bytes
* \param num_row Number of total rows the dataset will contain
 * \param num_classes Number of classes (used only for multiclass objectives when specifying initial scores)
* \param parameters Additional parameters
* \param[out] out Created dataset
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSerializedReference(const void* ref_buffer,
int32_t ref_buffer_size,
int64_t num_row,
int32_t num_classes,
const char* parameters,
DatasetHandle* out);
/*!
* \brief Push data to existing dataset, if ``nrow + start_row == num_total_row``, will call ``dataset->FinishLoad``.
* \param dataset Handle of dataset
......@@ -464,6 +498,17 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetFree(DatasetHandle handle);
LIGHTGBM_C_EXPORT int LGBM_DatasetSaveBinary(DatasetHandle handle,
const char* filename);
/*!
* \brief Create a dataset schema representation as a binary byte array (excluding data).
* \param handle Handle of dataset
* \param[out] out The output byte array
* \param[out] out_len The length of the output byte array (returned for convenience)
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetSerializeReferenceToBinary(DatasetHandle handle,
ByteBufferHandle* out,
int32_t* out_len);
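Taken together with LGBM_DatasetCreateFromSerializedReference above, these calls support a schema round trip: serialize a dataset's reference, ship the bytes elsewhere, and create an empty, identically-binned dataset sized for new rows. A hedged sketch of that flow (return codes unchecked; the helper name is illustrative):

#include <cstdint>
#include <vector>
#include <LightGBM/c_api.h>

// Create an empty dataset with the same schema (feature groups, bins) as `train`,
// sized for `num_row` rows.
static DatasetHandle CreateDatasetLike(DatasetHandle train, int64_t num_row, const char* parameters) {
  ByteBufferHandle buffer = nullptr;
  int32_t buffer_len = 0;
  LGBM_DatasetSerializeReferenceToBinary(train, &buffer, &buffer_len);
  // Copy the schema bytes out of the native buffer, then free it.
  std::vector<uint8_t> bytes(static_cast<size_t>(buffer_len));
  for (int32_t i = 0; i < buffer_len; ++i) {
    LGBM_ByteBufferGetAt(buffer, i, &bytes[static_cast<size_t>(i)]);
  }
  LGBM_ByteBufferFree(buffer);
  // Re-create an empty dataset with the same schema, sized for num_row rows.
  DatasetHandle out = nullptr;
  LGBM_DatasetCreateFromSerializedReference(bytes.data(), buffer_len, num_row,
                                            /* num_classes */ 0, parameters, &out);
  return out;
}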
/*!
* \brief Save dataset to text file, intended for debugging use only.
* \param handle Handle of dataset
......
......@@ -9,6 +9,7 @@
#include <LightGBM/feature_group.h>
#include <LightGBM/meta.h>
#include <LightGBM/train_share_states.h>
#include <LightGBM/utils/byte_buffer.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>
......@@ -124,7 +125,7 @@ class Metadata {
* \brief Save binary data to file
 * \param writer The writer to write to
*/
void SaveBinaryToFile(const VirtualFileWriter* writer) const;
void SaveBinaryToFile(BinaryWriter* writer) const;
/*!
* \brief Get sizes in byte of this object
......@@ -621,6 +622,11 @@ class Dataset {
*/
LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename);
/*!
* \brief Serialize the overall Dataset definition/schema to a binary buffer (i.e., without data)
*/
LIGHTGBM_EXPORT void SerializeReference(ByteBuffer* out);
LIGHTGBM_EXPORT void DumpTextFile(const char* text_filename);
LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset);
......@@ -919,6 +925,10 @@ class Dataset {
#endif // USE_CUDA
private:
void SerializeHeader(BinaryWriter* serializer);
size_t GetSerializedHeaderSize();
void CreateCUDAColumnData();
std::string data_filename_;
......@@ -938,8 +948,11 @@ class Dataset {
int label_idx_ = 0;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
/*! \brief store feature names */
/*! \brief serialized versions */
static const int kSerializedReferenceVersionLength;
static const char* serialized_reference_version;
static const char* binary_file_token;
static const char* binary_serialized_reference_token;
int num_groups_;
std::vector<int> real_feature_idx_;
std::vector<int> feature2group_;
......
......@@ -28,6 +28,8 @@ class DatasetLoader {
LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
LIGHTGBM_EXPORT Dataset* LoadFromSerializedReference(const char* buffer, size_t buffer_size, data_size_t num_data, int32_t num_classes);
LIGHTGBM_EXPORT Dataset* ConstructFromSampleData(double** sample_values,
int** sample_indices,
int num_col,
......@@ -45,6 +47,8 @@ class DatasetLoader {
const std::unordered_set<int>& categorical_features);
private:
void LoadHeaderFromMemory(Dataset* dataset, const char* buffer);
Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
void SetHeader(const char* filename);
......
......@@ -110,14 +110,56 @@ class FeatureGroup {
}
/*!
* \brief Constructor from memory
* \brief Constructor from memory when data is present
* \param memory Pointer of memory
* \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data
* \param group_id Id of group
*/
FeatureGroup(const void* memory, data_size_t num_all_data,
FeatureGroup(const void* memory,
data_size_t num_all_data,
const std::vector<data_size_t>& local_used_indices,
int group_id) {
// Load the definition schema first
const char* memory_ptr = LoadDefinitionFromMemory(memory, group_id);
// Allocate memory for the data
data_size_t num_data = num_all_data;
if (!local_used_indices.empty()) {
num_data = static_cast<data_size_t>(local_used_indices.size());
}
AllocateBins(num_data);
// Now load the actual data
if (is_multi_val_) {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->LoadFromMemory(memory_ptr, local_used_indices);
memory_ptr += multi_bin_data_[i]->SizesInByte();
}
} else {
bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
}
}
/*!
* \brief Constructor from definition in memory (without data)
* \param memory Pointer of memory
 * \param num_data Number of data rows to allocate bins for
 * \param group_id Id of group
*/
FeatureGroup(const void* memory, data_size_t num_data, int group_id) {
LoadDefinitionFromMemory(memory, group_id);
AllocateBins(num_data);
}
/*! \brief Destructor */
~FeatureGroup() {}
/*!
* \brief Load the overall definition of the feature group from binary serialized data
* \param memory Pointer of memory
 * \param group_id Id of group
 * \return Pointer to the memory just past the loaded definition
 */
const char* LoadDefinitionFromMemory(const void* memory, int group_id) {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get is_sparse
is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
......@@ -128,9 +170,9 @@ class FeatureGroup {
memory_ptr += VirtualFileWriter::AlignedSize(sizeof(is_sparse_));
num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += VirtualFileWriter::AlignedSize(sizeof(num_feature_));
// get bin mapper
bin_mappers_.clear();
// get bin mapper(s)
bin_mappers_.clear();
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(new BinMapper(memory_ptr));
memory_ptr += bin_mappers_[i]->SizesInByte();
......@@ -158,22 +200,23 @@ class FeatureGroup {
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
}
data_size_t num_data = num_all_data;
if (!local_used_indices.empty()) {
num_data = static_cast<data_size_t>(local_used_indices.size());
}
return memory_ptr;
}
/*!
* \brief Allocate the bins
 * \param num_data Number of data rows to allocate
*/
inline void AllocateBins(data_size_t num_data) {
if (is_multi_val_) {
for (int i = 0; i < num_feature_; ++i) {
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
multi_bin_data_.emplace_back(Bin::CreateSparseBin(
num_data, bin_mappers_[i]->num_bin() + addi));
multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
} else {
multi_bin_data_.emplace_back(
Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
}
multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
memory_ptr += multi_bin_data_.back()->SizesInByte();
}
} else {
if (is_sparse_) {
......@@ -181,14 +224,9 @@ class FeatureGroup {
} else {
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
}
// get bin data
bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
}
}
/*! \brief Destructor */
~FeatureGroup() {}
/*!
* \brief Initialize for pushing in a streaming fashion. By default, no action needed.
* \param num_thread The number of external threads that will be calling the push APIs
......@@ -414,10 +452,11 @@ class FeatureGroup {
}
/*!
* \brief Save binary data to file
* \param file File want to write
* \brief Write to binary stream
* \param writer Writer
* \param include_data Whether to write data (true) or just header information (false)
*/
void SaveBinaryToFile(const VirtualFileWriter* writer) const {
void SerializeToBinary(BinaryWriter* writer, bool include_data = true) const {
writer->AlignedWrite(&is_multi_val_, sizeof(is_multi_val_));
writer->AlignedWrite(&is_dense_multi_val_, sizeof(is_dense_multi_val_));
writer->AlignedWrite(&is_sparse_, sizeof(is_sparse_));
......@@ -425,19 +464,22 @@ class FeatureGroup {
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_[i]->SaveBinaryToFile(writer);
}
if (is_multi_val_) {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->SaveBinaryToFile(writer);
if (include_data) {
if (is_multi_val_) {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->SaveBinaryToFile(writer);
}
} else {
bin_data_->SaveBinaryToFile(writer);
}
} else {
bin_data_->SaveBinaryToFile(writer);
}
}
/*!
* \brief Get sizes in byte of this object
 * \param include_data Whether to include the data size (true) or only the definition size (false)
 */
size_t SizesInByte() const {
size_t SizesInByte(bool include_data = true) const {
size_t ret = VirtualFileWriter::AlignedSize(sizeof(is_multi_val_)) +
VirtualFileWriter::AlignedSize(sizeof(is_dense_multi_val_)) +
VirtualFileWriter::AlignedSize(sizeof(is_sparse_)) +
......@@ -445,11 +487,13 @@ class FeatureGroup {
for (int i = 0; i < num_feature_; ++i) {
ret += bin_mappers_[i]->SizesInByte();
}
if (!is_multi_val_) {
ret += bin_data_->SizesInByte();
} else {
for (int i = 0; i < num_feature_; ++i) {
ret += multi_bin_data_[i]->SizesInByte();
if (include_data) {
if (!is_multi_val_) {
ret += bin_data_->SizesInByte();
} else {
for (int i = 0; i < num_feature_; ++i) {
ret += multi_bin_data_[i]->SizesInByte();
}
}
}
return ret;
......
/*!
* Copyright (c) 2022 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_UTILS_BINARY_WRITER_H_
#define LIGHTGBM_UTILS_BINARY_WRITER_H_
#include <cstdlib>
#include <vector>
namespace LightGBM {
/*!
* \brief An interface for serializing binary data to a buffer
*/
struct BinaryWriter {
/*!
* \brief Append data to this binary target
* \param data Buffer to write from
* \param bytes Number of bytes to write from buffer
* \return Number of bytes written
*/
virtual size_t Write(const void* data, size_t bytes) = 0;
/*!
* \brief Append data to this binary target aligned on a given byte size boundary
* \param data Buffer to write from
* \param bytes Number of bytes to write from buffer
* \param alignment The size of bytes to align to in whole increments
* \return Number of bytes written
*/
size_t AlignedWrite(const void* data, size_t bytes, size_t alignment = 8) {
auto ret = Write(data, bytes);
if (bytes % alignment != 0) {
size_t padding = AlignedSize(bytes, alignment) - bytes;
std::vector<char> tmp(padding, 0);
ret += Write(tmp.data(), padding);
}
return ret;
}
/*!
* \brief The aligned size of a buffer length.
* \param bytes The number of bytes in a buffer
* \param alignment The size of bytes to align to in whole increments
* \return Number of aligned bytes
*/
static size_t AlignedSize(size_t bytes, size_t alignment = 8) {
if (bytes % alignment == 0) {
return bytes;
} else {
return bytes / alignment * alignment + alignment;
}
}
};
} // namespace LightGBM
#endif // LIGHTGBM_UTILS_BINARY_WRITER_H_
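The alignment helpers above round every write up to an 8-byte boundary by default. A small sketch of the expected values, assuming only the interface defined in this header:

#include <cassert>
#include <LightGBM/utils/binary_writer.h>

int main() {
  using LightGBM::BinaryWriter;
  assert(BinaryWriter::AlignedSize(8) == 8);    // already a multiple of 8, unchanged
  assert(BinaryWriter::AlignedSize(5) == 8);    // rounded up; AlignedWrite appends 3 zero bytes
  assert(BinaryWriter::AlignedSize(13) == 16);  // rounded up to the next 8-byte boundary
  return 0;
}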
/*!
* Copyright (c) 2022 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_UTILS_BYTE_BUFFER_H_
#define LIGHTGBM_UTILS_BYTE_BUFFER_H_
#include <LightGBM/export.h>
#include <LightGBM/utils/binary_writer.h>
#include <string>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <vector>
namespace LightGBM {
/*!
* \brief An implementation for serializing binary data to an auto-expanding memory buffer
*/
struct ByteBuffer final : public BinaryWriter {
ByteBuffer() {}
explicit ByteBuffer(size_t initial_size) {
buffer_.reserve(initial_size);
}
size_t Write(const void* data, size_t bytes) {
const char* mem_ptr = static_cast<const char*>(data);
for (size_t i = 0; i < bytes; ++i) {
buffer_.push_back(mem_ptr[i]);
}
return bytes;
}
LIGHTGBM_EXPORT void Reserve(size_t capacity) {
buffer_.reserve(capacity);
}
LIGHTGBM_EXPORT size_t GetSize() {
return buffer_.size();
}
LIGHTGBM_EXPORT char GetAt(size_t index) {
return buffer_.at(index);
}
LIGHTGBM_EXPORT char* Data() {
return buffer_.data();
}
private:
std::vector<char> buffer_;
};
} // namespace LightGBM
#endif  // LIGHTGBM_UTILS_BYTE_BUFFER_H_
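A brief usage sketch of ByteBuffer as a BinaryWriter, showing the padding added by the inherited AlignedWrite (the final byte check assumes a little-endian machine):

#include <cassert>
#include <cstdint>
#include <LightGBM/utils/byte_buffer.h>

int main() {
  LightGBM::ByteBuffer buffer;
  int32_t value = 42;
  buffer.Write(&value, sizeof(value));          // appends 4 raw bytes
  assert(buffer.GetSize() == 4);
  buffer.AlignedWrite(&value, sizeof(value));   // appends 4 data bytes + 4 zero padding bytes
  assert(buffer.GetSize() == 12);
  assert(buffer.GetAt(0) == 42);                // first byte of the little-endian int32
  return 0;
}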
......@@ -5,6 +5,8 @@
#ifndef LIGHTGBM_UTILS_FILE_IO_H_
#define LIGHTGBM_UTILS_FILE_IO_H_
#include <LightGBM/utils/binary_writer.h>
#include <string>
#include <cstdio>
#include <cstdlib>
......@@ -18,50 +20,28 @@ namespace LightGBM {
/*!
* \brief An interface for writing files from buffers
*/
struct VirtualFileWriter {
struct VirtualFileWriter : BinaryWriter {
virtual ~VirtualFileWriter() {}
/*!
* \brief Initialize the writer
* \return True when the file is available for writes
*/
virtual bool Init() = 0;
/*!
* \brief Append buffer to file
* \param data Buffer to write from
* \param bytes Number of bytes to write from buffer
* \return Number of bytes written
*/
virtual size_t Write(const void* data, size_t bytes) const = 0;
size_t AlignedWrite(const void* data, size_t bytes, size_t alignment = 8) const {
auto ret = Write(data, bytes);
if (bytes % alignment != 0) {
size_t padding = AlignedSize(bytes, alignment) - bytes;
std::vector<char> tmp(padding, 0);
ret += Write(tmp.data(), padding);
}
return ret;
}
/*!
* \brief Create appropriate writer for filename
* \param filename Filename of the data
* \return File writer instance
*/
static std::unique_ptr<VirtualFileWriter> Make(const std::string& filename);
/*!
* \brief Check filename existence
* \param filename Filename of the data
* \return True when the file exists
*/
static bool Exists(const std::string& filename);
static size_t AlignedSize(size_t bytes, size_t alignment = 8) {
if (bytes % alignment == 0) {
return bytes;
} else {
return bytes / alignment * alignment + alignment;
}
}
};
/**
......
......@@ -12,6 +12,7 @@
#include <LightGBM/network.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/prediction_early_stop.h>
#include <LightGBM/utils/byte_buffer.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>
......@@ -951,6 +952,19 @@ int LGBM_SampleIndices(int32_t num_total_row,
API_END();
}
int LGBM_ByteBufferGetAt(ByteBufferHandle handle, int32_t index, uint8_t* out_val) {
API_BEGIN();
LightGBM::ByteBuffer* byteBuffer = reinterpret_cast<LightGBM::ByteBuffer*>(handle);
*out_val = byteBuffer->GetAt(index);
API_END();
}
int LGBM_ByteBufferFree(ByteBufferHandle handle) {
API_BEGIN();
delete reinterpret_cast<LightGBM::ByteBuffer*>(handle);
API_END();
}
int LGBM_DatasetCreateFromFile(const char* filename,
const char* parameters,
const DatasetHandle reference,
......@@ -1013,6 +1027,25 @@ int LGBM_DatasetCreateByReference(const DatasetHandle reference,
API_END();
}
int LGBM_DatasetCreateFromSerializedReference(const void* ref_buffer,
int32_t ref_buffer_size,
int64_t num_row,
int32_t num_classes,
const char* parameters,
DatasetHandle* out) {
API_BEGIN();
auto param = Config::Str2Map(parameters);
Config config;
config.Set(param);
OMP_SET_NUM_THREADS(config.num_threads);
DatasetLoader loader(config, nullptr, 1, nullptr);
*out = loader.LoadFromSerializedReference(static_cast<const char*>(ref_buffer),
static_cast<size_t>(ref_buffer_size),
static_cast<data_size_t>(num_row),
num_classes);
API_END();
}
int LGBM_DatasetInitStreaming(DatasetHandle dataset,
int32_t has_weights,
int32_t has_init_scores,
......@@ -1613,6 +1646,19 @@ int LGBM_DatasetSaveBinary(DatasetHandle handle,
API_END();
}
int LGBM_DatasetSerializeReferenceToBinary(DatasetHandle handle,
ByteBufferHandle* out,
int32_t* out_len) {
API_BEGIN();
auto dataset = reinterpret_cast<Dataset*>(handle);
std::unique_ptr<LightGBM::ByteBuffer> ret;
ret.reset(new LightGBM::ByteBuffer());
dataset->SerializeReference(ret.get());
*out_len = static_cast<int32_t>(ret->GetSize());
*out = ret.release();
API_END();
}
int LGBM_DatasetDumpText(DatasetHandle handle,
const char* filename) {
API_BEGIN();
......
......@@ -577,7 +577,7 @@ namespace LightGBM {
}
}
void BinMapper::SaveBinaryToFile(const VirtualFileWriter* writer) const {
void BinMapper::SaveBinaryToFile(BinaryWriter* writer) const {
writer->AlignedWrite(&num_bin_, sizeof(num_bin_));
writer->AlignedWrite(&missing_type_, sizeof(missing_type_));
writer->AlignedWrite(&is_trivial_, sizeof(is_trivial_));
......
......@@ -19,8 +19,13 @@
namespace LightGBM {
const int Dataset::kSerializedReferenceVersionLength = 2;
const char* Dataset::serialized_reference_version = "v1";
const char* Dataset::binary_file_token =
"______LightGBM_Binary_File_Token______\n";
const char* Dataset::binary_serialized_reference_token =
"______LightGBM_Binary_Serialized_Token______\n";
Dataset::Dataset() {
data_filename_ = "noname";
......@@ -994,80 +999,9 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
Log::Info("Saving data to binary file %s", bin_filename);
size_t size_of_token = std::strlen(binary_file_token);
writer->AlignedWrite(binary_file_token, size_of_token);
// get size of header
size_t size_of_header =
VirtualFileWriter::AlignedSize(sizeof(num_data_)) +
VirtualFileWriter::AlignedSize(sizeof(num_features_)) +
VirtualFileWriter::AlignedSize(sizeof(num_total_features_)) +
VirtualFileWriter::AlignedSize(sizeof(int) * num_total_features_) +
VirtualFileWriter::AlignedSize(sizeof(label_idx_)) +
VirtualFileWriter::AlignedSize(sizeof(num_groups_)) +
3 * VirtualFileWriter::AlignedSize(sizeof(int) * num_features_) +
sizeof(uint64_t) * (num_groups_ + 1) +
2 * VirtualFileWriter::AlignedSize(sizeof(int) * num_groups_) +
VirtualFileWriter::AlignedSize(sizeof(int32_t) * num_total_features_) +
VirtualFileWriter::AlignedSize(sizeof(int)) * 3 +
VirtualFileWriter::AlignedSize(sizeof(bool)) * 3;
// size of feature names
for (int i = 0; i < num_total_features_; ++i) {
size_of_header +=
VirtualFileWriter::AlignedSize(feature_names_[i].size()) +
VirtualFileWriter::AlignedSize(sizeof(int));
}
// size of forced bins
for (int i = 0; i < num_total_features_; ++i) {
size_of_header += forced_bin_bounds_[i].size() * sizeof(double) +
VirtualFileWriter::AlignedSize(sizeof(int));
}
writer->Write(&size_of_header, sizeof(size_of_header));
// write header
writer->AlignedWrite(&num_data_, sizeof(num_data_));
writer->AlignedWrite(&num_features_, sizeof(num_features_));
writer->AlignedWrite(&num_total_features_, sizeof(num_total_features_));
writer->AlignedWrite(&label_idx_, sizeof(label_idx_));
writer->AlignedWrite(&max_bin_, sizeof(max_bin_));
writer->AlignedWrite(&bin_construct_sample_cnt_,
sizeof(bin_construct_sample_cnt_));
writer->AlignedWrite(&min_data_in_bin_, sizeof(min_data_in_bin_));
writer->AlignedWrite(&use_missing_, sizeof(use_missing_));
writer->AlignedWrite(&zero_as_missing_, sizeof(zero_as_missing_));
writer->AlignedWrite(&has_raw_, sizeof(has_raw_));
writer->AlignedWrite(used_feature_map_.data(),
sizeof(int) * num_total_features_);
writer->AlignedWrite(&num_groups_, sizeof(num_groups_));
writer->AlignedWrite(real_feature_idx_.data(), sizeof(int) * num_features_);
writer->AlignedWrite(feature2group_.data(), sizeof(int) * num_features_);
writer->AlignedWrite(feature2subfeature_.data(),
sizeof(int) * num_features_);
writer->Write(group_bin_boundaries_.data(),
sizeof(uint64_t) * (num_groups_ + 1));
writer->AlignedWrite(group_feature_start_.data(),
sizeof(int) * num_groups_);
writer->AlignedWrite(group_feature_cnt_.data(), sizeof(int) * num_groups_);
if (max_bin_by_feature_.empty()) {
ArrayArgs<int32_t>::Assign(&max_bin_by_feature_, -1, num_total_features_);
}
writer->AlignedWrite(max_bin_by_feature_.data(),
sizeof(int32_t) * num_total_features_);
if (ArrayArgs<int32_t>::CheckAll(max_bin_by_feature_, -1)) {
max_bin_by_feature_.clear();
}
// write feature names
for (int i = 0; i < num_total_features_; ++i) {
int str_len = static_cast<int>(feature_names_[i].size());
writer->AlignedWrite(&str_len, sizeof(int));
const char* c_str = feature_names_[i].c_str();
writer->AlignedWrite(c_str, sizeof(char) * str_len);
}
// write forced bins
for (int i = 0; i < num_total_features_; ++i) {
int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
writer->AlignedWrite(&num_bounds, sizeof(int));
for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
}
}
// Write the basic header information for the dataset
SerializeHeader(writer.get());
// get size of meta data
size_t size_of_metadata = metadata_.SizesInByte();
......@@ -1081,7 +1015,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
size_t size_of_feature = feature_groups_[i]->SizesInByte();
writer->Write(&size_of_feature, sizeof(size_of_feature));
// write feature
feature_groups_[i]->SaveBinaryToFile(writer.get());
feature_groups_[i]->SerializeToBinary(writer.get());
}
// write raw data; use row-major order so we can read row-by-row
......@@ -1098,6 +1032,117 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
}
}
void Dataset::SerializeReference(ByteBuffer* buffer) {
Log::Info("Saving data reference to binary buffer");
// Calculate approximate size of output and reserve space
size_t size_of_token = std::strlen(binary_serialized_reference_token);
size_t initial_capacity = size_of_token + GetSerializedHeaderSize();
// write feature group definitions
for (int i = 0; i < num_groups_; ++i) {
initial_capacity += feature_groups_[i]->SizesInByte(/* include_data */ false);
}
// Give a little extra just in case, to avoid unnecessary resizes
buffer->Reserve(static_cast<size_t>(1.1 * static_cast<double>(initial_capacity)));
// Write token that marks the data as binary reference, and the version
buffer->AlignedWrite(binary_serialized_reference_token, size_of_token);
buffer->AlignedWrite(serialized_reference_version, kSerializedReferenceVersionLength);
// Write the basic definition of the overall dataset
SerializeHeader(buffer);
// write feature group definitions
for (int i = 0; i < num_groups_; ++i) {
// get size of feature
size_t size_of_feature = feature_groups_[i]->SizesInByte(false);
buffer->Write(&size_of_feature, sizeof(size_of_feature));
// write feature
feature_groups_[i]->SerializeToBinary(buffer, /* include_data */ false);
}
}
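For orientation, the buffer written by SerializeReference is laid out roughly as follows (field sizes are subject to the 8-byte padding applied by AlignedWrite):

// [ binary_serialized_reference_token ]
// [ serialized_reference_version ("v1") ]
// [ header size (size_t) ][ dataset header: counts, feature maps, group boundaries, feature names, forced bins ]
// [ group 0 size (size_t) ][ group 0 definition: flags + BinMapper(s), no bin data ]
//   ...
// [ group N-1 size (size_t) ][ group N-1 definition ]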
size_t Dataset::GetSerializedHeaderSize() {
size_t size_of_header =
VirtualFileWriter::AlignedSize(sizeof(num_data_)) +
VirtualFileWriter::AlignedSize(sizeof(num_features_)) +
VirtualFileWriter::AlignedSize(sizeof(num_total_features_)) +
VirtualFileWriter::AlignedSize(sizeof(int) * num_total_features_) +
VirtualFileWriter::AlignedSize(sizeof(label_idx_)) +
VirtualFileWriter::AlignedSize(sizeof(num_groups_)) +
3 * VirtualFileWriter::AlignedSize(sizeof(int) * num_features_) +
sizeof(uint64_t) * (num_groups_ + 1) +
2 * VirtualFileWriter::AlignedSize(sizeof(int) * num_groups_) +
VirtualFileWriter::AlignedSize(sizeof(int32_t) * num_total_features_) +
VirtualFileWriter::AlignedSize(sizeof(int)) * 3 +
VirtualFileWriter::AlignedSize(sizeof(bool)) * 3;
// size of feature names and forced bins
for (int i = 0; i < num_total_features_; ++i) {
size_of_header +=
VirtualFileWriter::AlignedSize(feature_names_[i].size()) +
VirtualFileWriter::AlignedSize(sizeof(int)) +
forced_bin_bounds_[i].size() * sizeof(double) +
VirtualFileWriter::AlignedSize(sizeof(int));
}
return size_of_header;
}
void Dataset::SerializeHeader(BinaryWriter* writer) {
size_t size_of_header = GetSerializedHeaderSize();
writer->Write(&size_of_header, sizeof(size_of_header));
// write header
writer->AlignedWrite(&num_data_, sizeof(num_data_));
writer->AlignedWrite(&num_features_, sizeof(num_features_));
writer->AlignedWrite(&num_total_features_, sizeof(num_total_features_));
writer->AlignedWrite(&label_idx_, sizeof(label_idx_));
writer->AlignedWrite(&max_bin_, sizeof(max_bin_));
writer->AlignedWrite(&bin_construct_sample_cnt_,
sizeof(bin_construct_sample_cnt_));
writer->AlignedWrite(&min_data_in_bin_, sizeof(min_data_in_bin_));
writer->AlignedWrite(&use_missing_, sizeof(use_missing_));
writer->AlignedWrite(&zero_as_missing_, sizeof(zero_as_missing_));
writer->AlignedWrite(&has_raw_, sizeof(has_raw_));
writer->AlignedWrite(used_feature_map_.data(),
sizeof(int) * num_total_features_);
writer->AlignedWrite(&num_groups_, sizeof(num_groups_));
writer->AlignedWrite(real_feature_idx_.data(), sizeof(int) * num_features_);
writer->AlignedWrite(feature2group_.data(), sizeof(int) * num_features_);
writer->AlignedWrite(feature2subfeature_.data(),
sizeof(int) * num_features_);
writer->Write(group_bin_boundaries_.data(),
sizeof(uint64_t) * (num_groups_ + 1));
writer->AlignedWrite(group_feature_start_.data(),
sizeof(int) * num_groups_);
writer->AlignedWrite(group_feature_cnt_.data(), sizeof(int) * num_groups_);
if (max_bin_by_feature_.empty()) {
ArrayArgs<int32_t>::Assign(&max_bin_by_feature_, -1, num_total_features_);
}
writer->AlignedWrite(max_bin_by_feature_.data(),
sizeof(int32_t) * num_total_features_);
if (ArrayArgs<int32_t>::CheckAll(max_bin_by_feature_, -1)) {
max_bin_by_feature_.clear();
}
// write feature names
for (int i = 0; i < num_total_features_; ++i) {
int str_len = static_cast<int>(feature_names_[i].size());
writer->AlignedWrite(&str_len, sizeof(int));
const char* c_str = feature_names_[i].c_str();
writer->AlignedWrite(c_str, sizeof(char) * str_len);
}
// write forced bins
for (int i = 0; i < num_total_features_; ++i) {
int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
writer->AlignedWrite(&num_bounds, sizeof(int));
for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
}
}
}
void Dataset::DumpTextFile(const char* text_filename) {
FILE* file = NULL;
#if _MSC_VER
......
......@@ -353,6 +353,67 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
return dataset.release();
}
Dataset* DatasetLoader::LoadFromSerializedReference(const char* binary_data, size_t buffer_size, data_size_t num_data, int32_t num_classes) {
auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
auto mem_ptr = binary_data;
// check token
const size_t size_of_token = std::strlen(Dataset::binary_serialized_reference_token);
size_t size_of_token_in_input = VirtualFileWriter::AlignedSize(sizeof(char) * size_of_token);
if (buffer_size < size_of_token_in_input) {
Log::Fatal("Binary definition file error: token has the wrong size");
}
if (std::string(mem_ptr, size_of_token) != std::string(Dataset::binary_serialized_reference_token)) {
Log::Fatal("Input file is not LightGBM binary reference file");
}
mem_ptr += size_of_token_in_input;
size_t size_of_version = VirtualFileWriter::AlignedSize(Dataset::kSerializedReferenceVersionLength);
std::string version(mem_ptr, Dataset::kSerializedReferenceVersionLength);
if (version != std::string(Dataset::serialized_reference_version)) {
Log::Fatal("Unexpected version of serialized binary data: %s", version.c_str());
}
mem_ptr += size_of_version;
size_t size_of_header = *(reinterpret_cast<const size_t*>(mem_ptr));
mem_ptr += sizeof(size_t);
LoadHeaderFromMemory(dataset.get(), mem_ptr);
dataset->num_data_ = num_data; // update to the given num_data
mem_ptr += size_of_header;
// read feature group definitions
for (int i = 0; i < dataset->num_groups_; ++i) {
// read feature size
const size_t size_of_feature = *(reinterpret_cast<const size_t*>(mem_ptr));
mem_ptr += sizeof(size_t);
dataset->feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(new FeatureGroup(mem_ptr, num_data, i)));
mem_ptr += size_of_feature;
}
dataset->feature_groups_.shrink_to_fit();
dataset->numeric_feature_map_ = std::vector<int>(dataset->num_features_, false);
dataset->num_numeric_features_ = 0;
for (int i = 0; i < dataset->num_features_; ++i) {
if (dataset->FeatureBinMapper(i)->bin_type() == BinType::CategoricalBin) {
dataset->numeric_feature_map_[i] = -1;
} else {
dataset->numeric_feature_map_[i] = dataset->num_numeric_features_;
++dataset->num_numeric_features_;
}
}
int has_weights = config_.weight_column.size() > 0;
int has_init_scores = num_classes > 0;
int has_queries = config_.group_column.size() > 0;
dataset->metadata_.Init(num_data, has_weights, has_init_scores, has_queries, num_classes);
Log::Info("Loaded reference dataset: %d features, %d num_data", dataset->num_features_, num_data);
return dataset.release();
}
Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* bin_filename,
int rank, int num_machines, int* num_global_data,
std::vector<data_size_t>* used_data_indices) {
......@@ -388,7 +449,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
size_t size_of_head = *(reinterpret_cast<size_t*>(buffer.data()));
// re-allocmate space if not enough
// re-allocate space if not enough
if (size_of_head > buffer_size) {
buffer_size = size_of_head;
buffer.resize(buffer_size);
......@@ -401,135 +462,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
}
// get header
const char* mem_ptr = buffer.data();
dataset->num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_data_));
dataset->num_features_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_features_));
dataset->num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr +=
VirtualFileWriter::AlignedSize(sizeof(dataset->num_total_features_));
dataset->label_idx_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->label_idx_));
dataset->max_bin_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->max_bin_));
dataset->bin_construct_sample_cnt_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(
sizeof(dataset->bin_construct_sample_cnt_));
dataset->min_data_in_bin_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->min_data_in_bin_));
dataset->use_missing_ = *(reinterpret_cast<const bool*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->use_missing_));
dataset->zero_as_missing_ = *(reinterpret_cast<const bool*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->zero_as_missing_));
dataset->has_raw_ = *(reinterpret_cast<const bool*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->has_raw_));
const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
dataset->used_feature_map_.clear();
for (int i = 0; i < dataset->num_total_features_; ++i) {
dataset->used_feature_map_.push_back(tmp_feature_map[i]);
}
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) *
dataset->num_total_features_);
// num_groups
dataset->num_groups_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_groups_));
// real_feature_idx_
const int* tmp_ptr_real_feature_idx_ = reinterpret_cast<const int*>(mem_ptr);
dataset->real_feature_idx_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->real_feature_idx_.push_back(tmp_ptr_real_feature_idx_[i]);
}
mem_ptr +=
VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_);
// feature2group
const int* tmp_ptr_feature2group = reinterpret_cast<const int*>(mem_ptr);
dataset->feature2group_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->feature2group_.push_back(tmp_ptr_feature2group[i]);
}
mem_ptr +=
VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_);
// feature2subfeature
const int* tmp_ptr_feature2subfeature = reinterpret_cast<const int*>(mem_ptr);
dataset->feature2subfeature_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->feature2subfeature_.push_back(tmp_ptr_feature2subfeature[i]);
}
mem_ptr +=
VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_);
// group_bin_boundaries
const uint64_t* tmp_ptr_group_bin_boundaries = reinterpret_cast<const uint64_t*>(mem_ptr);
dataset->group_bin_boundaries_.clear();
for (int i = 0; i < dataset->num_groups_ + 1; ++i) {
dataset->group_bin_boundaries_.push_back(tmp_ptr_group_bin_boundaries[i]);
}
mem_ptr += sizeof(uint64_t) * (dataset->num_groups_ + 1);
// group_feature_start_
const int* tmp_ptr_group_feature_start = reinterpret_cast<const int*>(mem_ptr);
dataset->group_feature_start_.clear();
for (int i = 0; i < dataset->num_groups_; ++i) {
dataset->group_feature_start_.push_back(tmp_ptr_group_feature_start[i]);
}
mem_ptr +=
VirtualFileWriter::AlignedSize(sizeof(int) * (dataset->num_groups_));
// group_feature_cnt_
const int* tmp_ptr_group_feature_cnt = reinterpret_cast<const int*>(mem_ptr);
dataset->group_feature_cnt_.clear();
for (int i = 0; i < dataset->num_groups_; ++i) {
dataset->group_feature_cnt_.push_back(tmp_ptr_group_feature_cnt[i]);
}
mem_ptr +=
VirtualFileWriter::AlignedSize(sizeof(int) * (dataset->num_groups_));
if (!config_.max_bin_by_feature.empty()) {
CHECK_EQ(static_cast<size_t>(dataset->num_total_features_), config_.max_bin_by_feature.size());
CHECK_GT(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())), 1);
dataset->max_bin_by_feature_.resize(dataset->num_total_features_);
dataset->max_bin_by_feature_.assign(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end());
} else {
const int32_t* tmp_ptr_max_bin_by_feature = reinterpret_cast<const int32_t*>(mem_ptr);
dataset->max_bin_by_feature_.clear();
for (int i = 0; i < dataset->num_total_features_; ++i) {
dataset->max_bin_by_feature_.push_back(tmp_ptr_max_bin_by_feature[i]);
}
}
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int32_t) *
(dataset->num_total_features_));
if (ArrayArgs<int32_t>::CheckAll(dataset->max_bin_by_feature_, -1)) {
dataset->max_bin_by_feature_.clear();
}
// get feature names
dataset->feature_names_.clear();
// write feature names
for (int i = 0; i < dataset->num_total_features_; ++i) {
int str_len = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int));
std::stringstream str_buf;
auto tmp_arr = reinterpret_cast<const char*>(mem_ptr);
for (int j = 0; j < str_len; ++j) {
char tmp_char = tmp_arr[j];
str_buf << tmp_char;
}
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(char) * str_len);
dataset->feature_names_.emplace_back(str_buf.str());
}
// get forced_bin_bounds_
dataset->forced_bin_bounds_ = std::vector<std::vector<double>>(dataset->num_total_features_, std::vector<double>());
for (int i = 0; i < dataset->num_total_features_; ++i) {
int num_bounds = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int));
dataset->forced_bin_bounds_[i] = std::vector<double>();
const double* tmp_ptr_forced_bounds =
reinterpret_cast<const double*>(mem_ptr);
for (int j = 0; j < num_bounds; ++j) {
double bound = tmp_ptr_forced_bounds[j];
dataset->forced_bin_bounds_[i].push_back(bound);
}
mem_ptr += num_bounds * sizeof(double);
}
LoadHeaderFromMemory(dataset.get(), mem_ptr);
// read size of meta data
read_cnt = reader->Read(buffer.data(), sizeof(size_t));
......@@ -821,6 +754,131 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
// ---- private functions ----
void DatasetLoader::LoadHeaderFromMemory(Dataset* dataset, const char* buffer) {
// get header
const char* mem_ptr = buffer;
dataset->num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_data_));
dataset->num_features_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_features_));
dataset->num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_total_features_));
dataset->label_idx_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->label_idx_));
dataset->max_bin_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->max_bin_));
dataset->bin_construct_sample_cnt_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->bin_construct_sample_cnt_));
dataset->min_data_in_bin_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->min_data_in_bin_));
dataset->use_missing_ = *(reinterpret_cast<const bool*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->use_missing_));
dataset->zero_as_missing_ = *(reinterpret_cast<const bool*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->zero_as_missing_));
dataset->has_raw_ = *(reinterpret_cast<const bool*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->has_raw_));
const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
dataset->used_feature_map_.clear();
for (int i = 0; i < dataset->num_total_features_; ++i) {
dataset->used_feature_map_.push_back(tmp_feature_map[i]);
}
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_total_features_);
// num_groups
dataset->num_groups_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(dataset->num_groups_));
// real_feature_idx_
const int* tmp_ptr_real_feature_idx_ = reinterpret_cast<const int*>(mem_ptr);
dataset->real_feature_idx_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->real_feature_idx_.push_back(tmp_ptr_real_feature_idx_[i]);
}
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_);
// feature2group
const int* tmp_ptr_feature2group = reinterpret_cast<const int*>(mem_ptr);
dataset->feature2group_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->feature2group_.push_back(tmp_ptr_feature2group[i]);
}
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_);
// feature2subfeature
const int* tmp_ptr_feature2subfeature = reinterpret_cast<const int*>(mem_ptr);
dataset->feature2subfeature_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->feature2subfeature_.push_back(tmp_ptr_feature2subfeature[i]);
}
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * dataset->num_features_);
// group_bin_boundaries
const uint64_t* tmp_ptr_group_bin_boundaries = reinterpret_cast<const uint64_t*>(mem_ptr);
dataset->group_bin_boundaries_.clear();
for (int i = 0; i < dataset->num_groups_ + 1; ++i) {
dataset->group_bin_boundaries_.push_back(tmp_ptr_group_bin_boundaries[i]);
}
mem_ptr += sizeof(uint64_t) * (dataset->num_groups_ + 1);
// group_feature_start_
const int* tmp_ptr_group_feature_start = reinterpret_cast<const int*>(mem_ptr);
dataset->group_feature_start_.clear();
for (int i = 0; i < dataset->num_groups_; ++i) {
dataset->group_feature_start_.push_back(tmp_ptr_group_feature_start[i]);
}
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * (dataset->num_groups_));
// group_feature_cnt_
const int* tmp_ptr_group_feature_cnt = reinterpret_cast<const int*>(mem_ptr);
dataset->group_feature_cnt_.clear();
for (int i = 0; i < dataset->num_groups_; ++i) {
dataset->group_feature_cnt_.push_back(tmp_ptr_group_feature_cnt[i]);
}
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int) * (dataset->num_groups_));
if (!config_.max_bin_by_feature.empty()) {
CHECK_EQ(static_cast<size_t>(dataset->num_total_features_), config_.max_bin_by_feature.size());
CHECK_GT(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())), 1);
dataset->max_bin_by_feature_.resize(dataset->num_total_features_);
dataset->max_bin_by_feature_.assign(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end());
} else {
const int32_t* tmp_ptr_max_bin_by_feature = reinterpret_cast<const int32_t*>(mem_ptr);
dataset->max_bin_by_feature_.clear();
for (int i = 0; i < dataset->num_total_features_; ++i) {
dataset->max_bin_by_feature_.push_back(tmp_ptr_max_bin_by_feature[i]);
}
}
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int32_t) * (dataset->num_total_features_));
if (ArrayArgs<int32_t>::CheckAll(dataset->max_bin_by_feature_, -1)) {
dataset->max_bin_by_feature_.clear();
}
// get feature names
dataset->feature_names_.clear();
for (int i = 0; i < dataset->num_total_features_; ++i) {
int str_len = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int));
std::stringstream str_buf;
auto tmp_arr = reinterpret_cast<const char*>(mem_ptr);
for (int j = 0; j < str_len; ++j) {
char tmp_char = tmp_arr[j];
str_buf << tmp_char;
}
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(char) * str_len);
dataset->feature_names_.emplace_back(str_buf.str());
}
// get forced_bin_bounds_
dataset->forced_bin_bounds_ = std::vector<std::vector<double>>(dataset->num_total_features_, std::vector<double>());
for (int i = 0; i < dataset->num_total_features_; ++i) {
int num_bounds = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += VirtualFileWriter::AlignedSize(sizeof(int));
dataset->forced_bin_bounds_[i] = std::vector<double>();
const double* tmp_ptr_forced_bounds =
reinterpret_cast<const double*>(mem_ptr);
for (int j = 0; j < num_bounds; ++j) {
double bound = tmp_ptr_forced_bounds[j];
dataset->forced_bin_bounds_[i].push_back(bound);
}
mem_ptr += num_bounds * sizeof(double);
}
}
void DatasetLoader::CheckDataset(const Dataset* dataset, bool is_load_from_binary) {
if (dataset->num_data_ <= 0) {
Log::Fatal("Data file %s is empty", dataset->data_filename_.c_str());
......
......@@ -451,7 +451,7 @@ class DenseBin : public Bin {
}
}
void SaveBinaryToFile(const VirtualFileWriter* writer) const override {
void SaveBinaryToFile(BinaryWriter* writer) const override {
writer->AlignedWrite(data_.data(), sizeof(VAL_T) * data_.size());
}
......
......@@ -46,7 +46,7 @@ struct LocalFile : VirtualFileReader, VirtualFileWriter {
return fread(buffer, 1, bytes, file_);
}
size_t Write(const void* buffer, size_t bytes) const {
size_t Write(const void* buffer, size_t bytes) {
return fwrite(buffer, bytes, 1, file_) == 1 ? bytes : 0;
}
......
......@@ -675,7 +675,7 @@ void Metadata::LoadFromMemory(const void* memory) {
CalculateQueryWeights();
}
void Metadata::SaveBinaryToFile(const VirtualFileWriter* writer) const {
void Metadata::SaveBinaryToFile(BinaryWriter* writer) const {
writer->AlignedWrite(&num_data_, sizeof(num_data_));
writer->AlignedWrite(&num_weights_, sizeof(num_weights_));
writer->AlignedWrite(&num_queries_, sizeof(num_queries_));
......
......@@ -508,7 +508,7 @@ class SparseBin : public Bin {
fast_index_.shrink_to_fit();
}
void SaveBinaryToFile(const VirtualFileWriter* writer) const override {
void SaveBinaryToFile(BinaryWriter* writer) const override {
writer->AlignedWrite(&num_vals_, sizeof(num_vals_));
writer->AlignedWrite(deltas_.data(), sizeof(uint8_t) * (num_vals_ + 1));
writer->AlignedWrite(vals_.data(), sizeof(VAL_T) * num_vals_);
......
/*!
* Copyright (c) 2022 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#include <gtest/gtest.h>
#include <LightGBM/utils/byte_buffer.h>
#include <random>
using LightGBM::ByteBuffer;
TEST(ByteBuffer, JustWorks) {
std::unique_ptr<ByteBuffer> buffer;
buffer.reset(new ByteBuffer());
int cumulativeSize = 0;
EXPECT_EQ(cumulativeSize, buffer->GetSize());
int8_t int8Val = 34;
cumulativeSize += sizeof(int8_t);
buffer->Write(&int8Val, sizeof(int8_t));
EXPECT_EQ(cumulativeSize, buffer->GetSize());
EXPECT_EQ(int8Val, buffer->GetAt(cumulativeSize - 1));
int16_t int16Val = 33;
cumulativeSize += sizeof(int16_t);
buffer->Write(&int16Val, sizeof(int16_t));
EXPECT_EQ(cumulativeSize, buffer->GetSize());
int16_t serializedInt16 = 0;
char* int16Ptr = reinterpret_cast<char*>(&serializedInt16);
for (int i = 0; i < sizeof(int16_t); i++) {
int16Ptr[i] = buffer->GetAt(cumulativeSize - (sizeof(int16_t) - i));
}
EXPECT_EQ(int16Val, serializedInt16);
int64_t int64Val = 35;
cumulativeSize += sizeof(int64_t);
buffer->Write(&int64Val, sizeof(int64_t));
EXPECT_EQ(cumulativeSize, buffer->GetSize());
int64_t serializedInt64 = 0;
char* int64Ptr = reinterpret_cast<char*>(&serializedInt64);
for (int i = 0; i < sizeof(int64_t); i++) {
int64Ptr[i] = buffer->GetAt(cumulativeSize - (sizeof(int64_t) - i));
}
EXPECT_EQ(int64Val, serializedInt64);
double doubleVal = 36.6;
cumulativeSize += sizeof(double);
buffer->Write(&doubleVal, sizeof(doubleVal));
EXPECT_EQ(cumulativeSize, buffer->GetSize());
double serializedDouble = 0;
char* doublePtr = reinterpret_cast<char*>(&serializedDouble);
for (int i = 0; i < sizeof(double); i++) {
doublePtr[i] = buffer->GetAt(cumulativeSize - (sizeof(double) - i));
}
EXPECT_EQ(doubleVal, serializedDouble);
const int charSize = 3;
char charArrayVal[charSize] = { 'a', 'b', 'c' };
cumulativeSize += charSize;
buffer->Write(charArrayVal, charSize);
EXPECT_EQ(cumulativeSize, buffer->GetSize());
for (int i = 0; i < charSize; i++) {
EXPECT_EQ(charArrayVal[i], buffer->GetAt(cumulativeSize - (charSize - i)));
}
// Test that Data() points to first value written
EXPECT_EQ(int8Val, *buffer->Data());
}
/*!
* Copyright (c) 2022 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#include <gtest/gtest.h>
#include <testutils.h>
#include <LightGBM/utils/byte_buffer.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/c_api.h>
#include <LightGBM/dataset.h>
#include <iostream>
using LightGBM::ByteBuffer;
using LightGBM::Dataset;
using LightGBM::Log;
using LightGBM::TestUtils;
TEST(Serialization, JustWorks) {
// Load some test data
DatasetHandle dataset_handle;
const char* params = "max_bin=15";
int result = TestUtils::LoadDatasetFromExamples("binary_classification/binary.test", params, &dataset_handle);
EXPECT_EQ(0, result) << "LoadDatasetFromExamples result code: " << result;
Dataset* dataset;
bool succeeded = true;
std::string exceptionText("");
try {
dataset = static_cast<Dataset*>(dataset_handle);
// Serialize the reference
ByteBufferHandle buffer_handle;
int32_t buffer_len;
result = LGBM_DatasetSerializeReferenceToBinary(dataset_handle, &buffer_handle, &buffer_len);
EXPECT_EQ(0, result) << "LGBM_DatasetSerializeReferenceToBinary result code: " << result;
ByteBuffer* buffer = nullptr;
Dataset* deserialized_dataset = nullptr;
try {
buffer = static_cast<ByteBuffer*>(buffer_handle);
// Deserialize the reference
DatasetHandle deserialized_dataset_handle;
result = LGBM_DatasetCreateFromSerializedReference(buffer->Data(),
static_cast<int32_t>(buffer->GetSize()),
dataset->num_data(),
0, // num_classes
params,
&deserialized_dataset_handle);
EXPECT_EQ(0, result) << "LGBM_DatasetCreateFromSerializedReference result code: " << result;
// Confirm the deserialized dataset matches the original row count
deserialized_dataset = static_cast<Dataset*>(deserialized_dataset_handle);
EXPECT_EQ(dataset->num_data(), deserialized_dataset->num_data());
} catch (std::exception& ex) {
succeeded = false;
exceptionText = std::string(ex.what());
}
// Free memory
if (buffer) {
result = LGBM_ByteBufferFree(buffer);
EXPECT_EQ(0, result) << "LGBM_ByteBufferFree result code: " << result;
}
if (deserialized_dataset) {
result = LGBM_DatasetFree(deserialized_dataset);
EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result;
}
} catch (std::exception& ex) {
succeeded = false;
exceptionText = std::string(ex.what());
}
if (dataset) {
result = LGBM_DatasetFree(dataset);
EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result;
}
if (!succeeded) {
FAIL() << "Test Serialization failed with exception: " << exceptionText;
}
}
......@@ -258,6 +258,8 @@
<ClInclude Include="..\include\LightGBM\tree_learner.h" />
<ClInclude Include="..\include\LightGBM\utils\yamc\alternate_shared_mutex.hpp" />
<ClInclude Include="..\include\LightGBM\utils\array_args.h" />
<ClInclude Include="..\include\LightGBM\utils\binary_writer.h" />
<ClInclude Include="..\include\LightGBM\utils\byte_buffer.h" />
<ClInclude Include="..\include\LightGBM\utils\common.h" />
<ClInclude Include="..\include\LightGBM\utils\file_io.h" />
<ClInclude Include="..\include\LightGBM\utils\json11.h" />
......
......@@ -231,6 +231,12 @@
<ClInclude Include="..\src\treelearner\linear_tree_learner.h">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\byte_buffer.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\binary_writer.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp">
......