Commit 4f77bd28 authored by Guolin Ke

update to v2.

parent 13d4581b
#include <LightGBM/dataset.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/threading.h>
#include <LightGBM/utils/array_args.h>
#include <cstdio>
#include <unordered_map>
@@ -16,6 +16,8 @@ namespace LightGBM {
const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";

Dataset::Dataset() {
data_filename_ = "noname";
num_data_ = 0;
@@ -24,50 +26,189 @@ Dataset::Dataset() {
Dataset::Dataset(data_size_t num_data) {
data_filename_ = "noname";
num_data_ = num_data;
metadata_.Init(num_data_, NO_SPECIFIC, NO_SPECIFIC);
}

Dataset::~Dataset() {
}
std::vector<std::vector<int>> NoGroup(
const std::vector<int>& used_features) {
std::vector<std::vector<int>> features_in_group;
features_in_group.resize(used_features.size());
for (size_t i = 0; i < used_features.size(); ++i) {
features_in_group[i].emplace_back(used_features[i]);
}
return features_in_group;
}
void Dataset::Construct(
std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const std::vector<std::vector<int>>& sample_indices,
size_t total_sample_cnt,
const IOConfig& io_config) {
num_total_features_ = static_cast<int>(bin_mappers.size());
// get num_features
std::vector<int> used_features;
for (int i = 0; i < static_cast<int>(bin_mappers.size()); ++i) {
if (bin_mappers[i] != nullptr && !bin_mappers[i]->is_trival()) {
used_features.emplace_back(i);
}
}
auto features_in_group = NoGroup(used_features);
num_features_ = 0;
for (const auto& fs : features_in_group) {
num_features_ += static_cast<int>(fs.size());
}
int cur_fidx = 0;
used_feature_map_ = std::vector<int>(num_total_features_, -1);
num_groups_ = static_cast<int>(features_in_group.size());
real_feature_idx_.resize(num_features_);
feature2group_.resize(num_features_);
feature2subfeature_.resize(num_features_);
for (int i = 0; i < num_groups_; ++i) {
auto cur_features = features_in_group[i];
int cur_cnt_features = static_cast<int>(cur_features.size());
// get bin_mappers
std::vector<std::unique_ptr<BinMapper>> cur_bin_mappers;
for (int j = 0; j < cur_cnt_features; ++j) {
int real_fidx = cur_features[j];
used_feature_map_[real_fidx] = cur_fidx;
real_feature_idx_[cur_fidx] = real_fidx;
feature2group_[cur_fidx] = i;
feature2subfeature_[cur_fidx] = j;
cur_bin_mappers.emplace_back(bin_mappers[real_fidx].release());
++cur_fidx;
}
feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(cur_cnt_features, cur_bin_mappers, num_data_, io_config.is_enable_sparse)));
}
feature_groups_.shrink_to_fit();
group_bin_boundaries_.clear();
uint64_t num_total_bin = 0;
group_bin_boundaries_.push_back(num_total_bin);
for (int i = 0; i < num_groups_; ++i) {
num_total_bin += feature_groups_[i]->num_total_bin_;
group_bin_boundaries_.push_back(num_total_bin);
}
int last_group = 0;
group_feature_start_.reserve(num_groups_);
group_feature_cnt_.reserve(num_groups_);
group_feature_start_.push_back(0);
group_feature_cnt_.push_back(1);
for (int i = 1; i < num_features_; ++i) {
const int group = feature2group_[i];
if (group == last_group) {
group_feature_cnt_.back() = group_feature_cnt_.back() + 1;
} else {
group_feature_start_.push_back(i);
group_feature_cnt_.push_back(1);
last_group = group;
}
}
}
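The bookkeeping built in Construct is easiest to see on a toy example. The following standalone sketch (hypothetical group sizes and bin counts, not values from this commit) reproduces the same mapping from a flat feature index to its (group, sub-feature) pair, and shows that group_bin_boundaries_ is a prefix sum of per-group bin counts, so the histogram of group g occupies [group_bin_boundaries_[g], group_bin_boundaries_[g+1]):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Assume 3 groups holding 2, 1, 3 features and 8, 4, 16 total bins each
  // (hypothetical numbers).
  std::vector<int> group_feature_cnt = {2, 1, 3};
  std::vector<std::uint64_t> num_total_bin = {8, 4, 16};

  std::vector<int> feature2group, feature2subfeature, group_feature_start;
  std::vector<std::uint64_t> group_bin_boundaries = {0};
  int fidx = 0;
  for (int g = 0; g < 3; ++g) {
    group_feature_start.push_back(fidx);
    for (int j = 0; j < group_feature_cnt[g]; ++j, ++fidx) {
      feature2group.push_back(g);
      feature2subfeature.push_back(j);
    }
    // prefix sum: group g's histogram lives at [boundaries[g], boundaries[g+1])
    group_bin_boundaries.push_back(group_bin_boundaries.back() + num_total_bin[g]);
  }
  // feature 3 is the first (sub-feature 0) of group 2, whose bins start at 8 + 4 = 12
  assert(feature2group[3] == 2 && feature2subfeature[3] == 0);
  assert(group_bin_boundaries[2] == 12);
  return 0;
}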
void Dataset::FinishLoad() {
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_groups_; ++i) {
feature_groups_[i]->bin_data_->FinishLoad();
}
}
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
feature_groups_.clear();
num_features_ = dataset->num_features_;
num_groups_ = dataset->num_groups_;
bool is_enable_sparse = false;
for (int i = 0; i < num_groups_; ++i) {
if (dataset->feature_groups_[i]->is_sparse_) {
is_enable_sparse = true;
break;
}
}
// copy feature bin mapper data
for (int i = 0; i < num_groups_; ++i) {
std::vector<std::unique_ptr<BinMapper>> bin_mappers;
for (int j = 0; j < dataset->feature_groups_[i]->num_feature_; ++j) {
bin_mappers.emplace_back(new BinMapper(*(dataset->feature_groups_[i]->bin_mappers_[j])));
}
feature_groups_.emplace_back(new FeatureGroup(
dataset->feature_groups_[i]->num_feature_,
bin_mappers,
num_data_,
is_enable_sparse));
}
feature_groups_.shrink_to_fit();
used_feature_map_ = dataset->used_feature_map_;
num_total_features_ = dataset->num_total_features_;
feature_names_ = dataset->feature_names_;
label_idx_ = dataset->label_idx_;
real_feature_idx_ = dataset->real_feature_idx_;
feature2group_ = dataset->feature2group_;
feature2subfeature_ = dataset->feature2subfeature_;
group_bin_boundaries_ = dataset->group_bin_boundaries_;
group_feature_start_ = dataset->group_feature_start_;
group_feature_cnt_ = dataset->group_feature_cnt_;
}
void Dataset::CreateValid(const Dataset* dataset) {
feature_groups_.clear();
num_features_ = dataset->num_features_;
num_groups_ = num_features_;
bool is_enable_sparse = true;
feature2group_.clear();
feature2subfeature_.clear();
// copy feature bin mapper data
for (int i = 0; i < num_features_; ++i) {
std::vector<std::unique_ptr<BinMapper>> bin_mappers;
bin_mappers.emplace_back(new BinMapper(*(dataset->FeatureBinMapper(i))));
feature_groups_.emplace_back(new FeatureGroup(
1,
bin_mappers,
num_data_,
is_enable_sparse));
feature2group_.push_back(i);
feature2subfeature_.push_back(0);
}
feature_groups_.shrink_to_fit();
used_feature_map_ = dataset->used_feature_map_;
num_total_features_ = dataset->num_total_features_;
feature_names_ = dataset->feature_names_;
label_idx_ = dataset->label_idx_;
real_feature_idx_ = dataset->real_feature_idx_;
group_bin_boundaries_.clear();
uint64_t num_total_bin = 0;
group_bin_boundaries_.push_back(num_total_bin);
for (int i = 0; i < num_groups_; ++i) {
num_total_bin += feature_groups_[i]->num_total_bin_;
group_bin_boundaries_.push_back(num_total_bin);
}
int last_group = 0;
group_feature_start_.reserve(num_groups_);
group_feature_cnt_.reserve(num_groups_);
group_feature_start_.push_back(0);
group_feature_cnt_.push_back(1);
for (int i = 1; i < num_features_; ++i) {
const int group = feature2group_[i];
if (group == last_group) {
group_feature_cnt_.back() = group_feature_cnt_.back() + 1;
} else {
group_feature_start_.push_back(i);
group_feature_cnt_.push_back(1);
last_group = group;
}
}
}

void Dataset::ReSize(data_size_t num_data) {
if (num_data_ != num_data) {
num_data_ = num_data;
#pragma omp parallel for schedule(guided)
for (int group = 0; group < num_groups_; ++group) {
feature_groups_[group]->bin_data_->ReSize(num_data_);
}
}
}
@@ -75,8 +216,8 @@ void Dataset::ReSize(data_size_t num_data) {
void Dataset::CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) {
CHECK(num_used_indices == num_data_);
#pragma omp parallel for schedule(guided)
for (int group = 0; group < num_groups_; ++group) {
feature_groups_[group]->CopySubset(fullset->feature_groups_[group].get(), used_indices, num_used_indices);
}
if (need_meta_data) {
metadata_.Init(fullset->metadata_, used_indices, num_used_indices);
@@ -158,8 +299,8 @@ bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const in
}

void Dataset::SaveBinaryFile(const char* bin_filename) {
if (bin_filename != nullptr
&& std::string(bin_filename) == std::string(data_filename_)) {
Log::Warning("Binary file %s already exists", bin_filename);
return;
}
@@ -196,8 +337,9 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
size_t size_of_token = std::strlen(binary_file_token);
fwrite(binary_file_token, sizeof(char), size_of_token, file);
// get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(int) * num_total_features_ + sizeof(num_groups_)
+ 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_;
// size of feature names
for (int i = 0; i < num_total_features_; ++i) {
size_of_header += feature_names_[i].size() + sizeof(int);
@@ -206,10 +348,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
// write header
fwrite(&num_data_, sizeof(num_data_), 1, file);
fwrite(&num_features_, sizeof(num_features_), 1, file);
fwrite(&num_total_features_, sizeof(num_total_features_), 1, file);
fwrite(used_feature_map_.data(), sizeof(int), num_total_features_, file);
fwrite(&num_groups_, sizeof(num_groups_), 1, file);
fwrite(real_feature_idx_.data(), sizeof(int), num_features_, file);
fwrite(feature2group_.data(), sizeof(int), num_features_, file);
fwrite(feature2subfeature_.data(), sizeof(int), num_features_, file);
fwrite(group_bin_boundaries_.data(), sizeof(uint64_t), num_groups_ + 1, file);
fwrite(group_feature_start_.data(), sizeof(int), num_groups_, file);
fwrite(group_feature_cnt_.data(), sizeof(int), num_groups_, file);
// write feature names
for (int i = 0; i < num_total_features_; ++i) {
@@ -226,15 +373,94 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
metadata_.SaveBinaryToFile(file);
// write feature data
for (int i = 0; i < num_groups_; ++i) {
// get size of feature
size_t size_of_feature = feature_groups_[i]->SizesInByte();
fwrite(&size_of_feature, sizeof(size_of_feature), 1, file);
// write feature
feature_groups_[i]->SaveBinaryToFile(file);
}
fclose(file);
}
}
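The header size accumulated above must track the fwrite sequence byte for byte, or LoadFromBinFile will read misaligned fields. A minimal sketch of the same accounting, assuming data_size_t and int are 4 bytes, uint64_t is 8, and the hypothetical counts below:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Sketch: mirrors the size_of_header arithmetic with made-up counts.
int main() {
  const int num_total_features = 10, num_features = 7, num_groups = 7;
  std::size_t size_of_header =
      sizeof(int)                                // num_data_
      + sizeof(int) + sizeof(int)                // num_features_, num_total_features_
      + sizeof(int) * num_total_features         // used_feature_map_
      + sizeof(int)                              // num_groups_
      + 3 * sizeof(int) * num_features           // real_feature_idx_, feature2group_, feature2subfeature_
      + sizeof(std::uint64_t) * (num_groups + 1) // group_bin_boundaries_
      + 2 * sizeof(int) * num_groups;            // group_feature_start_, group_feature_cnt_
  std::printf("header bytes before feature names: %zu\n", size_of_header);  // prints 260
  return 0;
}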
void Dataset::ConstructHistograms(
const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
HistogramBinEntry* hist_data) const {
if (leaf_idx < 0 || num_data <= 0 || hist_data == nullptr) {
return;
}
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
}
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
}
#pragma omp parallel for schedule(guided)
for (int group = 0; group < num_groups_; ++group) {
bool is_group_used = false;
const int f_cnt = group_feature_cnt_[group];
for (int j = 0; j < f_cnt; ++j) {
const int fidx = group_feature_start_[group] + j;
if (is_feature_used[fidx]) {
is_group_used = true;
break;
}
}
// skip the group if none of its features is used
if (!is_group_used) { continue; }
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if not using an ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// using an ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
}
}
}
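Before the per-group kernels run, gradients for a partial leaf are gathered into dense, leaf-ordered buffers (ordered_gradients/ordered_hessians above) so the histogram loop can stream through them sequentially. A minimal sketch of that gather, with score_t simplified to float and data_size_t to int:

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<float> gradients = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f};
  std::vector<int> data_indices = {4, 1, 3};  // rows belonging to the current leaf
  std::vector<float> ordered_gradients(data_indices.size());
  for (std::size_t i = 0; i < data_indices.size(); ++i) {
    ordered_gradients[i] = gradients[data_indices[i]];  // gather into leaf order
  }
  assert(ordered_gradients[0] == 0.5f);  // row 4's gradient now sits first
  return 0;
}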
void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const {
const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get();
const int default_bin = bin_mapper->GetDefaultBin();
if (default_bin > 0) {
const int num_bin = bin_mapper->num_bin();
data[default_bin].sum_gradients = sum_gradient;
data[default_bin].sum_hessians = sum_hessian;
data[default_bin].cnt = num_data;
for (int i = 0; i < num_bin; ++i) {
if (i != default_bin) {
data[default_bin].sum_gradients -= data[i].sum_gradients;
data[default_bin].sum_hessians -= data[i].sum_hessians;
data[default_bin].cnt -= data[i].cnt;
}
}
}
}
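FixHistogram works because the bins of one feature partition the rows of the leaf: whatever the untouched default bin should contain equals the leaf totals minus every other bin. A worked sketch with made-up counts:

#include <cassert>

// Sketch of the subtraction trick with hypothetical per-bin counts.
int main() {
  // Leaf holds 100 rows. Non-default bins observed 30 + 45 rows,
  // so the unfilled default bin must hold the remaining 25.
  int cnt[3] = {0, 30, 45};  // bin 0 is the default bin, never filled directly
  const int default_bin = 0, num_data = 100;
  cnt[default_bin] = num_data;
  for (int i = 0; i < 3; ++i) {
    if (i != default_bin) cnt[default_bin] -= cnt[i];
  }
  assert(cnt[default_bin] == 25);
  return 0;
}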
}  // namespace LightGBM
@@ -2,7 +2,6 @@
#include <LightGBM/utils/log.h>
#include <LightGBM/dataset_loader.h>
#include <LightGBM/network.h>
@@ -132,31 +131,6 @@ void DatasetLoader::SetHeader(const char* filename) {
ignore_features_.emplace(group_idx_);
}
}
}

Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_machines) {
@@ -238,7 +212,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
dataset->num_data_ = static_cast<data_size_t>(text_data.size());
// initialize label
dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);
dataset->CreateValid(train_data);
// extract features
ExtractFeaturesFromMemory(text_data, parser.get(), dataset.get());
text_data.clear();
@@ -249,7 +223,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
num_global_data = dataset->num_data_;
// initialize label
dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);
dataset->CreateValid(train_data);
// extract features
ExtractFeaturesFromFile(filename, parser.get(), used_data_indices, dataset.get());
}
@@ -318,14 +292,60 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
mem_ptr += sizeof(dataset->num_features_);
dataset->num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->num_total_features_);
const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
dataset->used_feature_map_.clear();
for (int i = 0; i < dataset->num_total_features_; ++i) {
dataset->used_feature_map_.push_back(tmp_feature_map[i]);
}
mem_ptr += sizeof(int) * dataset->num_total_features_;
// num_groups
dataset->num_groups_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->num_groups_);
// real_feature_idx_
const int* tmp_ptr_real_feature_idx_ = reinterpret_cast<const int*>(mem_ptr);
dataset->real_feature_idx_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->real_feature_idx_.push_back(tmp_ptr_real_feature_idx_[i]);
}
mem_ptr += sizeof(int) * dataset->num_features_;
// feature2group
const int* tmp_ptr_feature2group = reinterpret_cast<const int*>(mem_ptr);
dataset->feature2group_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->feature2group_.push_back(tmp_ptr_feature2group[i]);
}
mem_ptr += sizeof(int) * dataset->num_features_;
// feature2subfeature
const int* tmp_ptr_feature2subfeature = reinterpret_cast<const int*>(mem_ptr);
dataset->feature2subfeature_.clear();
for (int i = 0; i < dataset->num_features_; ++i) {
dataset->feature2subfeature_.push_back(tmp_ptr_feature2subfeature[i]);
}
mem_ptr += sizeof(int) * dataset->num_features_;
// group_bin_boundaries
const uint64_t* tmp_ptr_group_bin_boundaries = reinterpret_cast<const uint64_t*>(mem_ptr);
dataset->group_bin_boundaries_.clear();
for (int i = 0; i < dataset->num_groups_ + 1; ++i) {
dataset->group_bin_boundaries_.push_back(tmp_ptr_group_bin_boundaries[i]);
}
mem_ptr += sizeof(uint64_t) * (dataset->num_groups_ + 1);
// group_feature_start_
const int* tmp_ptr_group_feature_start = reinterpret_cast<const int*>(mem_ptr);
dataset->group_feature_start_.clear();
for (int i = 0; i < dataset->num_groups_ ; ++i) {
dataset->group_feature_start_.push_back(tmp_ptr_group_feature_start[i]);
}
mem_ptr += sizeof(int) * (dataset->num_groups_);
// group_feature_cnt_
const int* tmp_ptr_group_feature_cnt = reinterpret_cast<const int*>(mem_ptr);
dataset->group_feature_cnt_.clear();
for (int i = 0; i < dataset->num_groups_; ++i) {
dataset->group_feature_cnt_.push_back(tmp_ptr_group_feature_cnt[i]);
}
mem_ptr += sizeof(int) * (dataset->num_groups_);
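Every block above follows the same cursor pattern: reinterpret mem_ptr, copy out the values, then advance by exactly the bytes consumed, in the order SaveBinaryFile wrote them. A generic helper in that spirit (a sketch, not part of the commit; ReadAndAdvance is a hypothetical name):

#include <cstring>
#include <vector>

// Sketch: copy `count` plain-old-data values from a byte cursor and advance it.
template <typename T>
std::vector<T> ReadAndAdvance(const char** mem_ptr, int count) {
  std::vector<T> out(count);
  std::memcpy(out.data(), *mem_ptr, sizeof(T) * count);
  *mem_ptr += sizeof(T) * count;  // consume exactly the bytes read
  return out;
}
// hypothetical usage: auto feature2group = ReadAndAdvance<int>(&mem_ptr, num_features);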
// get feature names
dataset->feature_names_.clear();
// read feature names
@@ -372,7 +392,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
if (query_boundaries == nullptr) {
// if there is no query file, the minimal sample unit is one record
for (data_size_t i = 0; i < dataset->num_data_; ++i) {
if (random_.NextShort(0, num_machines) == rank) {
used_data_indices->push_back(i);
}
}
@@ -388,7 +408,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
if (i >= query_boundaries[qid + 1]) {
// if this is a new query
is_query_used = false;
if (random_.NextShort(0, num_machines) == rank) {
is_query_used = true;
}
++qid;
@@ -420,18 +440,20 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
if (read_cnt != size_of_feature) {
Log::Fatal("Binary file error: feature %d is incorrect, read count: %d", i, read_cnt);
}
dataset->feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(buffer.data(),
*num_global_data,
*used_data_indices)
));
}
dataset->feature_groups_.shrink_to_fit();
fclose(file);
return dataset.release();
}
Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>& sample_values,
std::vector<std::vector<int>>& sample_indices,
size_t total_sample_size, data_size_t num_data) {
std::vector<std::unique_ptr<BinMapper>> bin_mappers(sample_values.size());
// fill feature_names_ if there is no header
if (feature_names_.empty()) {
@@ -441,43 +463,21 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
feature_names_.push_back(str_buf.str());
}
}
const data_size_t filter_cnt = static_cast<data_size_t>(static_cast<double>(0.95 * io_config_.min_data_in_leaf) / num_data * sample_values.size());
#pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
if (ignore_features_.count(i) > 0) {
bin_mappers[i] = nullptr;
continue;
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[i], total_sample_size,
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt);
}
auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
dataset->feature_names_ = feature_names_;
dataset->Construct(bin_mappers, sample_indices, total_sample_size, io_config_);
dataset->metadata_.Init(dataset->num_data_, NO_SPECIFIC, NO_SPECIFIC);
return dataset.release();
}
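CostructFromSampleData only ever sees a sample of each column: sample_values[i] holds the non-zero values observed for feature i, and FindBin derives bin boundaries from them. A toy equal-frequency sketch of that idea (deliberately simplified and not LightGBM's actual FindBin, which also honors min_data_in_bin, the filter count, and sparsity):

#include <algorithm>
#include <cstddef>
#include <vector>

// Toy sketch: derive at most max_bin equal-frequency upper bounds from a
// sampled column. The real BinMapper::FindBin is considerably smarter.
std::vector<double> ToyFindBin(std::vector<double> sample, int max_bin) {
  std::sort(sample.begin(), sample.end());
  std::vector<double> upper_bounds;
  const std::size_t step =
      std::max<std::size_t>(1, sample.size() / static_cast<std::size_t>(max_bin));
  for (std::size_t i = step - 1; i < sample.size(); i += step) {
    if (upper_bounds.empty() || sample[i] > upper_bounds.back()) {
      upper_bounds.push_back(sample[i]);  // one boundary per ~step samples
    }
  }
  return upper_bounds;
}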
@@ -488,13 +488,34 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) {
if (dataset->num_data_ <= 0) {
Log::Fatal("Data file %s is empty", dataset->data_filename_);
}
if (dataset->feature_groups_.empty()) {
Log::Fatal("No usable features in data file %s", dataset->data_filename_);
}
if (dataset->feature_names_.size() != static_cast<size_t>(dataset->num_total_features_)) {
Log::Fatal("Size of feature name error, should be %d, got %d", dataset->num_total_features_,
static_cast<int>(dataset->feature_names_.size()));
}
bool is_feature_order_by_group = true;
int last_group = -1;
int last_sub_feature = -1;
// if features are ordered by group, there is no need to use hist_buf
for (int i = 0; i < dataset->num_features_; ++i) {
int group = dataset->feature2group_[i];
int sub_feature = dataset->feature2subfeature_[i];
if (group < last_group) {
is_feature_order_by_group = false;
} else if (group == last_group) {
if (sub_feature <= last_sub_feature) {
is_feature_order_by_group = false;
break;
}
}
last_group = group;
last_sub_feature = sub_feature;
}
if (!is_feature_order_by_group) {
Log::Fatal("feature in dataset should order by group");
}
}

std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filename, const Metadata& metadata,
@@ -512,7 +533,7 @@ std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filenam
if (query_boundaries == nullptr) {
// if there is no query data, the minimal sample unit is one record
*num_global_data = text_reader.ReadAndFilterLines([this, rank, num_machines](data_size_t) {
if (random_.NextShort(0, num_machines) == rank) {
return true;
} else {
return false;
@@ -532,7 +553,7 @@ std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filenam
if (line_idx >= query_boundaries[qid + 1]) {
// if this is a new query
is_query_used = false;
if (random_.NextShort(0, num_machines) == rank) {
is_query_used = true;
}
++qid;
@@ -571,7 +592,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
// if there is no query file, the minimal sample unit is one record
*num_global_data = text_reader.SampleAndFilterFromFile([this, rank, num_machines]
(data_size_t) {
if (random_.NextShort(0, num_machines) == rank) {
return true;
} else {
return false;
@@ -592,7 +613,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
if (line_idx >= query_boundaries[qid + 1]) {
// if this is a new query
is_query_used = false;
if (random_.NextShort(0, num_machines) == rank) {
is_query_used = true;
}
++qid;
@@ -605,30 +626,28 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
}

void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset) {
std::vector<std::vector<double>> sample_values;
std::vector<std::vector<int>> sample_indices;
std::vector<std::pair<int, double>> oneline_features;
double label;
for (int i = 0; i < static_cast<int>(sample_data.size()); ++i) {
oneline_features.clear();
// parse features
parser->ParseOneLine(sample_data[i].c_str(), &oneline_features, &label);
for (std::pair<int, double>& inner_data : oneline_features) {
if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
sample_values.resize(inner_data.first + 1);
sample_indices.resize(inner_data.first + 1);
}
if (std::fabs(inner_data.second) > kEpsilon) {
sample_values[inner_data.first].emplace_back(inner_data.second);
sample_indices[inner_data.first].emplace_back(i);
}
}
}
dataset->feature_groups_.clear();
if (feature_names_.empty()) {
// -1 means this feature is not used
@@ -653,41 +672,21 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
}
}
dataset->feature_names_ = feature_names_;
std::vector<std::unique_ptr<BinMapper>> bin_mappers(sample_values.size());
const data_size_t filter_cnt = static_cast<data_size_t>(static_cast<double>(0.95 * io_config_.min_data_in_leaf) / dataset->num_data_ * sample_values.size());
// start finding bins
if (num_machines == 1) {
// if there is only one machine, find bins locally
#pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
if (ignore_features_.count(i) > 0) {
bin_mappers[i] = nullptr;
continue;
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->FindBin(sample_values[i], sample_data.size(),
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt);
}
} else {
// with multiple machines, bins need to be found in a distributed way
@@ -718,11 +717,8 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
#pragma omp parallel for schedule(guided)
for (int i = 0; i < len[rank]; ++i) {
BinMapper bin_mapper;
bin_mapper.FindBin(sample_values[start[rank] + i], sample_data.size(),
io_config_.max_bin, io_config_.min_data_in_bin, filter_cnt);
bin_mapper.CopyTo(input_buffer.data() + i * type_size);
}
// convert to binary size
@@ -735,26 +731,15 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
// restore feature bins from buffer
for (int i = 0; i < total_num_feature; ++i) {
if (ignore_features_.count(i) > 0) {
bin_mappers[i] = nullptr;
continue;
}
bin_mappers[i].reset(new BinMapper());
bin_mappers[i]->CopyFrom(output_buffer.data() + i * type_size);
}
}
sample_values.clear();
dataset->Construct(bin_mappers, sample_indices, sample_data.size(), io_config_);
}
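In the distributed branch above, each rank bins only its slice of the features, start[rank] through start[rank] + len[rank], and the serialized BinMappers are then gathered across machines (the gather call itself lies in the elided part of this hunk). A sketch of the slicing arithmetic, assuming features are split as evenly as possible:

#include <vector>

// Sketch: contiguous, nearly equal feature slices per machine.
void SliceFeatures(int num_total_features, int num_machines,
                   std::vector<int>* start, std::vector<int>* len) {
  start->assign(num_machines, 0);
  len->assign(num_machines, 0);
  int base = num_total_features / num_machines;
  int rest = num_total_features % num_machines;
  for (int r = 0, s = 0; r < num_machines; ++r) {
    (*len)[r] = base + (r < rest ? 1 : 0);  // spread the remainder over low ranks
    (*start)[r] = s;
    s += (*len)[r];
  }
}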
/*! \brief Extract local features from memory */
@@ -781,7 +766,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
int feature_idx = dataset->used_feature_map_[inner_data.first];
if (feature_idx >= 0) {
// if this feature is used
int group = dataset->feature2group_[feature_idx];
int sub_feature = dataset->feature2subfeature_[feature_idx];
dataset->feature_groups_[group]->PushData(tid, sub_feature, i, inner_data.second);
} else {
if (inner_data.first == weight_idx_) {
dataset->metadata_.SetWeightAt(i, static_cast<float>(inner_data.second));
@@ -817,7 +804,9 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
int feature_idx = dataset->used_feature_map_[inner_data.first];
if (feature_idx >= 0) {
// if this feature is used
int group = dataset->feature2group_[feature_idx];
int sub_feature = dataset->feature2subfeature_[feature_idx];
dataset->feature_groups_[group]->PushData(tid, sub_feature, i, inner_data.second);
} else {
if (inner_data.first == weight_idx_) {
dataset->metadata_.SetWeightAt(i, static_cast<float>(inner_data.second));
@@ -867,7 +856,9 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
int feature_idx = dataset->used_feature_map_[inner_data.first];
if (feature_idx >= 0) {
// if this feature is used
int group = dataset->feature2group_[feature_idx];
int sub_feature = dataset->feature2subfeature_[feature_idx];
dataset->feature_groups_[group]->PushData(tid, sub_feature, start_idx + i, inner_data.second);
} else {
if (inner_data.first == weight_idx_) {
dataset->metadata_.SetWeightAt(start_idx + i, static_cast<float>(inner_data.second));
@@ -9,15 +9,41 @@
namespace LightGBM {
template <typename VAL_T>
class DenseBin;
template <typename VAL_T>
class DenseBinIterator : public BinIterator {
public:
explicit DenseBinIterator(const DenseBin<VAL_T>* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
: bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
max_bin_(static_cast<VAL_T>(max_bin)),
default_bin_(static_cast<uint8_t>(default_bin)) {
if (default_bin_ == 0) {
bias_ = 1;
} else {
bias_ = 0;
}
}
inline uint32_t Get(data_size_t idx) override;
inline void Reset(data_size_t) override { }
private:
const DenseBin<VAL_T>* bin_data_;
VAL_T min_bin_;
VAL_T max_bin_;
VAL_T default_bin_;
uint8_t bias_;
};
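The new iterator maps a group-wide bin value back to a feature-local one: values inside [min_bin, max_bin] shift down by min_bin (plus a bias of 1 when the feature's default bin is 0, since local bin 0 is then reserved for it), and everything outside collapses to the default bin. A compilable sketch with hypothetical bounds:

#include <cassert>
#include <cstdint>

// Sketch of the remapping in DenseBinIterator::Get for one hypothetical
// feature occupying group-wide bins [3, 6] with default_bin == 0 (bias = 1).
std::uint32_t RemapSketch(std::uint32_t stored, std::uint32_t min_bin, std::uint32_t max_bin,
                          std::uint32_t default_bin, std::uint32_t bias) {
  if (stored >= min_bin && stored <= max_bin) return stored - min_bin + bias;
  return default_bin;
}

int main() {
  assert(RemapSketch(0, 3, 6, 0, 1) == 0);  // out of range -> default bin
  assert(RemapSketch(3, 3, 6, 0, 1) == 1);  // lowest group bin -> local bin 1
  assert(RemapSketch(6, 3, 6, 0, 1) == 4);  // highest group bin -> local bin 4
  return 0;
}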
/*!
* \brief Used to store bins for dense feature
* Use template to reduce memory cost
*/
template <typename VAL_T>
class DenseBin : public Bin {
public:
friend DenseBinIterator<VAL_T>;
DenseBin(data_size_t num_data)
: num_data_(num_data), data_(num_data_, static_cast<VAL_T>(0)) {
}
~DenseBin() {
@@ -34,24 +60,20 @@ public:
}
}

BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;

void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster
if (data_indices != nullptr) {  // if using only part of the data
const data_size_t rest = num_data % 4;
data_size_t i = 0;
for (; i < num_data - rest; i += 4) {
const VAL_T bin0 = data_[data_indices[i]];
const VAL_T bin1 = data_[data_indices[i + 1]];
const VAL_T bin2 = data_[data_indices[i + 2]];
const VAL_T bin3 = data_[data_indices[i + 3]];
out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
@@ -69,19 +91,19 @@ public:
++out[bin3].cnt;
}
for (; i < num_data; ++i) {
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
} else {  // using the full data
const data_size_t rest = num_data % 4;
data_size_t i = 0;
for (; i < num_data - rest; i += 4) {
const VAL_T bin0 = data_[i];
const VAL_T bin1 = data_[i + 1];
const VAL_T bin2 = data_[i + 2];
const VAL_T bin3 = data_[i + 3];
out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
@@ -99,7 +121,7 @@ public:
++out[bin3].cnt;
}
for (; i < num_data; ++i) {
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
@@ -107,13 +129,31 @@ public:
}
}
virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin);
VAL_T minb = static_cast<VAL_T>(min_bin);
VAL_T maxb = static_cast<VAL_T>(max_bin);
if (default_bin == 0) {
th -= 1;
}
data_size_t lte_count = 0;
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx];
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
@@ -162,45 +202,19 @@ protected:
};
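Split above routes rows whose stored value lies outside the feature's [min_bin, max_bin] range to whichever side owns the default bin (default_bin <= threshold sends them left). A worked sketch of that routing with made-up values:

#include <cassert>
#include <vector>

// Sketch of the default-direction routing in the new Split: the feature
// occupies group-wide bins [3, 6], the feature-local threshold is 1, and
// default_bin is 0, so out-of-range rows go to the <= side.
int main() {
  const unsigned min_bin = 3, max_bin = 6, default_bin = 0, threshold = 1;
  unsigned th = threshold + min_bin;  // threshold in group-wide units: 4
  if (default_bin == 0) th -= 1;      // account for the reserved zero bin: 3
  std::vector<unsigned> stored = {0, 3, 5, 7};
  int lte = 0, gt = 0;
  for (unsigned bin : stored) {
    if (bin > max_bin || bin < min_bin) {
      (default_bin <= threshold ? lte : gt)++;  // default direction
    } else if (bin > th) {
      ++gt;
    } else {
      ++lte;
    }
  }
  assert(lte == 3 && gt == 1);  // rows 0 and 7 follow the default bin to lte
  return 0;
}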
template <typename VAL_T>
uint32_t DenseBinIterator<VAL_T>::Get(data_size_t idx) {
auto ret = bin_data_->data_[idx];
if (ret >= min_bin_ && ret <= max_bin_) {
return ret - min_bin_ + bias_;
} else {
return default_bin_;
}
}

template <typename VAL_T>
BinIterator* DenseBin<VAL_T>::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const {
return new DenseBinIterator<VAL_T>(this, min_bin, max_bin, default_bin);
}
}  // namespace LightGBM
#endif  // LightGBM_IO_DENSE_BIN_HPP_
@@ -41,6 +41,7 @@ public:
++non_zero_cnt;
}
ordered_pair_.resize(non_zero_cnt);
leaf_cnt_.push_back(non_zero_cnt);
}

~OrderedSparseBin() {
@@ -92,7 +93,7 @@ public:
}
}

void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) override {
// get current leaf boundary
const data_size_t l_start = leaf_start_[leaf];
const data_size_t l_end = l_start + leaf_cnt_[leaf];
@@ -100,7 +101,7 @@ public:
data_size_t new_left_end = l_start;

for (data_size_t i = l_start; i < l_end; ++i) {
if (is_in_leaf[ordered_pair_[i].ridx] == mark) {
std::swap(ordered_pair_[new_left_end], ordered_pair_[i]);
++new_left_end;
}
@@ -110,7 +111,9 @@ public:
leaf_cnt_[leaf] = new_left_end - l_start;
leaf_cnt_[right_leaf] = l_end - new_left_end;
}

data_size_t NonZeroCount(int leaf) const override {
return static_cast<data_size_t>(leaf_cnt_[leaf]);
}
/*! \brief Disable copy */
OrderedSparseBin<VAL_T>& operator=(const OrderedSparseBin<VAL_T>&) = delete;
/*! \brief Disable copy */
@@ -23,22 +23,43 @@ const uint8_t kMaxDelta = 255;
template <typename VAL_T>
class SparseBinIterator: public BinIterator {
public:
SparseBinIterator(const SparseBin<VAL_T>* bin_data,
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
: bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
max_bin_(static_cast<VAL_T>(max_bin)),
default_bin_(static_cast<uint8_t>(default_bin)) {
if (default_bin_ == 0) {
bias_ = 1;
} else {
bias_ = 0;
}
Reset(0);
}
SparseBinIterator(const SparseBin<VAL_T>* bin_data, data_size_t start_idx)
: bin_data_(bin_data) {
Reset(start_idx);
}

inline VAL_T RawGet(data_size_t idx);
inline uint32_t Get(data_size_t idx) override {
VAL_T ret = RawGet(idx);
if (ret >= min_bin_ && ret <= max_bin_) {
return ret - min_bin_ + bias_;
} else {
return default_bin_;
}
}
inline void Reset(data_size_t idx) override;

private:
const SparseBin<VAL_T>* bin_data_;
data_size_t cur_pos_;
data_size_t i_delta_;
VAL_T min_bin_;
VAL_T max_bin_;
VAL_T default_bin_;
uint8_t bias_;
};
template <typename VAL_T>
@@ -50,17 +71,15 @@ public:
friend class SparseBinIterator<VAL_T>;
friend class OrderedSparseBin<VAL_T>;

SparseBin(data_size_t num_data)
: num_data_(num_data) {
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
push_buffers_.resize(num_threads);
}
~SparseBin() {
@@ -73,12 +92,12 @@ public:
void Push(int tid, data_size_t idx, uint32_t value) override {
auto cur_bin = static_cast<VAL_T>(value);
if (cur_bin != 0) {
push_buffers_[tid].emplace_back(idx, cur_bin);
}
}

BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;

void ConstructHistogram(const data_size_t*, data_size_t, const score_t*,
const score_t*, HistogramBinEntry*) const override {
@@ -88,11 +107,10 @@ public:
inline bool NextNonzero(data_size_t* i_delta,
data_size_t* cur_pos) const {
++(*i_delta);
*cur_pos += deltas_[*i_delta];
data_size_t factor = 1;
while (*i_delta < num_vals_ && vals_[*i_delta] == 0) {
++(*i_delta);
factor *= kMaxDelta;
*cur_pos += deltas_[*i_delta] * factor;
@@ -104,17 +122,33 @@ public:
}
}

virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
// no need to split
if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin);
VAL_T minb = static_cast<VAL_T>(min_bin);
VAL_T maxb = static_cast<VAL_T>(max_bin);
if (default_bin == 0) {
th -= 1;
}
SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
data_size_t lte_count = 0;
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (default_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.RawGet(idx);
if (bin > maxb || bin < minb) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
@@ -133,16 +167,14 @@ public:
for (size_t i = 0; i < push_buffers_.size(); ++i) {
pair_cnt += push_buffers_[i].size();
}
std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs = push_buffers_[0];
idx_val_pairs.reserve(pair_cnt);

for (size_t i = 1; i < push_buffers_.size(); ++i) {
idx_val_pairs.insert(idx_val_pairs.end(), push_buffers_[i].begin(), push_buffers_[i].end());
push_buffers_[i].clear();
push_buffers_[i].shrink_to_fit();
}
// sort by data index
std::sort(idx_val_pairs.begin(), idx_val_pairs.end(),
[](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) {
...@@ -155,7 +187,6 @@ public: ...@@ -155,7 +187,6 @@ public:
void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs) { void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs) {
deltas_.clear(); deltas_.clear();
vals_.clear(); vals_.clear();
const VAL_T non_data_flag = std::numeric_limits<VAL_T>::max();
// transform to delta array // transform to delta array
data_size_t last_idx = 0; data_size_t last_idx = 0;
for (size_t i = 0; i < idx_val_pairs.size(); ++i) { for (size_t i = 0; i < idx_val_pairs.size(); ++i) {
...@@ -164,7 +195,7 @@ public: ...@@ -164,7 +195,7 @@ public:
data_size_t cur_delta = cur_idx - last_idx; data_size_t cur_delta = cur_idx - last_idx;
while (cur_delta > kMaxDelta) { while (cur_delta > kMaxDelta) {
deltas_.push_back(cur_delta % kMaxDelta); deltas_.push_back(cur_delta % kMaxDelta);
vals_.push_back(non_data_flag); vals_.push_back(0);
cur_delta /= kMaxDelta; cur_delta /= kMaxDelta;
} }
deltas_.push_back(static_cast<uint8_t>(cur_delta)); deltas_.push_back(static_cast<uint8_t>(cur_delta));
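The pair stream is delta-encoded: deltas_ holds the gap to the previously stored row, and a gap larger than kMaxDelta is emitted as several base-kMaxDelta digits, least significant first, whose paired values are 0. Because Push never stores a zero bin (the default bin now maps to 0 and is skipped), value 0 is free to serve as the continuation flag that std::numeric_limits<VAL_T>::max() used to provide. A self-contained sketch of the encoder, assuming kMaxDelta = 255 since the deltas are uint8_t:

#include <cstdint>
#include <vector>

const int kMaxDelta = 255;  // assumed; the deltas are stored as uint8_t

// Append one (gap, value) pair in the delta format read by NextNonzero.
void EncodeGap(int gap, uint8_t value,
               std::vector<uint8_t>* deltas, std::vector<uint8_t>* vals) {
  while (gap > kMaxDelta) {              // emit low-order digits first
    deltas->push_back(static_cast<uint8_t>(gap % kMaxDelta));
    vals->push_back(0);                  // 0 == continuation, never a real bin
    gap /= kMaxDelta;
  }
  deltas->push_back(static_cast<uint8_t>(gap));
  vals->push_back(value);                // nonzero: the row's actual bin
}
// Example: a gap of 300 becomes (45, flag 0) then (1, value); the reader
// accumulates 45 + 1 * 255 = 300, matching NextNonzero's growing 'factor'.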
...@@ -269,8 +300,8 @@ public: ...@@ -269,8 +300,8 @@ public:
SparseBinIterator<VAL_T> iterator(other_bin, used_indices[0]); SparseBinIterator<VAL_T> iterator(other_bin, used_indices[0]);
std::vector<std::pair<data_size_t, VAL_T>> tmp_pair; std::vector<std::pair<data_size_t, VAL_T>> tmp_pair;
for (data_size_t i = 0; i < num_used_indices; ++i) { for (data_size_t i = 0; i < num_used_indices; ++i) {
VAL_T bin = iterator.InnerGet(used_indices[i]); VAL_T bin = iterator.RawGet(used_indices[i]);
if (bin != default_bin_) { if (bin > 0) {
tmp_pair.emplace_back(i, bin); tmp_pair.emplace_back(i, bin);
} }
} }
...@@ -282,22 +313,20 @@ protected: ...@@ -282,22 +313,20 @@ protected:
std::vector<uint8_t> deltas_; std::vector<uint8_t> deltas_;
std::vector<VAL_T> vals_; std::vector<VAL_T> vals_;
data_size_t num_vals_; data_size_t num_vals_;
int num_threads_;
std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_; std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_;
std::vector<std::pair<data_size_t, data_size_t>> fast_index_; std::vector<std::pair<data_size_t, data_size_t>> fast_index_;
data_size_t fast_index_shift_; data_size_t fast_index_shift_;
VAL_T default_bin_;
}; };
template <typename VAL_T> template <typename VAL_T>
inline VAL_T SparseBinIterator<VAL_T>::InnerGet(data_size_t idx) { inline VAL_T SparseBinIterator<VAL_T>::RawGet(data_size_t idx) {
while (cur_pos_ < idx && i_delta_ < bin_data_->num_vals_) { while (cur_pos_ < idx && i_delta_ < bin_data_->num_vals_) {
bin_data_->NextNonzero(&i_delta_, &cur_pos_); bin_data_->NextNonzero(&i_delta_, &cur_pos_);
} }
if (cur_pos_ == idx && i_delta_ < bin_data_->num_vals_ && i_delta_ >= 0) { if (cur_pos_ == idx && i_delta_ < bin_data_->num_vals_ && i_delta_ >= 0) {
return bin_data_->vals_[i_delta_]; return bin_data_->vals_[i_delta_];
} else { } else {
return bin_data_->default_bin_; return 0;
} }
} }
...@@ -309,38 +338,9 @@ inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) { ...@@ -309,38 +338,9 @@ inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
} }
template <typename VAL_T> template <typename VAL_T>
BinIterator* SparseBin<VAL_T>::GetIterator(data_size_t start_idx) const { BinIterator* SparseBin<VAL_T>::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const {
return new SparseBinIterator<VAL_T>(this, start_idx); return new SparseBinIterator<VAL_T>(this, min_bin, max_bin, default_bin);
} }
template <typename VAL_T>
class SparseCategoricalBin: public SparseBin<VAL_T> {
public:
SparseCategoricalBin(data_size_t num_data, uint32_t default_bin)
: SparseBin<VAL_T>(num_data, default_bin) {
}
virtual data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
// no need to split
if (num_data <= 0) { return 0; }
SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
data_size_t lte_count = 0;
data_size_t gt_count = 0;
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
VAL_T bin = iterator.InnerGet(idx);
if (bin != threshold) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
return lte_count;
}
};
} // namespace LightGBM } // namespace LightGBM
#endif // LightGBM_IO_SPARSE_BIN_HPP_ #endif // LightGBM_IO_SPARSE_BIN_HPP_
\ No newline at end of file
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
#include <LightGBM/utils/common.h> #include <LightGBM/utils/common.h>
#include <LightGBM/dataset.h> #include <LightGBM/dataset.h>
#include <LightGBM/feature.h>
#include <sstream> #include <sstream>
#include <unordered_map> #include <unordered_map>
...@@ -16,22 +15,16 @@ ...@@ -16,22 +15,16 @@
namespace LightGBM { namespace LightGBM {
std::vector<bool(*)(unsigned int, unsigned int)> Tree::inner_decision_funs =
{Tree::NumericalDecision<unsigned int>, Tree::CategoricalDecision<unsigned int> };
std::vector<bool(*)(double, double)> Tree::decision_funs =
{ Tree::NumericalDecision<double>, Tree::CategoricalDecision<double> };
Tree::Tree(int max_leaves) Tree::Tree(int max_leaves)
:max_leaves_(max_leaves) { :max_leaves_(max_leaves) {
num_leaves_ = 0; num_leaves_ = 0;
left_child_ = std::vector<int>(max_leaves_ - 1); left_child_ = std::vector<int>(max_leaves_ - 1);
right_child_ = std::vector<int>(max_leaves_ - 1); right_child_ = std::vector<int>(max_leaves_ - 1);
split_feature_inner = std::vector<int>(max_leaves_ - 1);
split_feature_ = std::vector<int>(max_leaves_ - 1); split_feature_ = std::vector<int>(max_leaves_ - 1);
split_feature_real_ = std::vector<int>(max_leaves_ - 1); threshold_in_bin_ = std::vector<uint32_t>(max_leaves_ - 1);
threshold_in_bin_ = std::vector<unsigned int>(max_leaves_ - 1);
threshold_ = std::vector<double>(max_leaves_ - 1); threshold_ = std::vector<double>(max_leaves_ - 1);
decision_type_ = std::vector<int8_t>(max_leaves_ - 1);
split_gain_ = std::vector<double>(max_leaves_ - 1); split_gain_ = std::vector<double>(max_leaves_ - 1);
leaf_parent_ = std::vector<int>(max_leaves_); leaf_parent_ = std::vector<int>(max_leaves_);
leaf_value_ = std::vector<double>(max_leaves_); leaf_value_ = std::vector<double>(max_leaves_);
...@@ -48,7 +41,7 @@ Tree::~Tree() { ...@@ -48,7 +41,7 @@ Tree::~Tree() {
} }
int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_bin, int real_feature, int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature,
double threshold_double, double left_value, double threshold_double, double left_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain) { double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain) {
int new_node_idx = num_leaves_ - 1; int new_node_idx = num_leaves_ - 1;
...@@ -63,15 +56,10 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_ ...@@ -63,15 +56,10 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_
} }
} }
// add new node // add new node
split_feature_[new_node_idx] = feature; split_feature_inner[new_node_idx] = feature;
split_feature_real_[new_node_idx] = real_feature; split_feature_[new_node_idx] = real_feature;
threshold_in_bin_[new_node_idx] = threshold_bin; threshold_in_bin_[new_node_idx] = threshold_bin;
threshold_[new_node_idx] = threshold_double; threshold_[new_node_idx] = threshold_double;
if (bin_type == BinType::NumericalBin) {
decision_type_[new_node_idx] = 0;
} else {
decision_type_[new_node_idx] = 1;
}
split_gain_[new_node_idx] = gain; split_gain_[new_node_idx] = gain;
// add two new leaves // add two new leaves
left_child_[new_node_idx] = ~leaf; left_child_[new_node_idx] = ~leaf;
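The child arrays pack both node kinds into one int: internal nodes are stored as non-negative indices, while a leaf is stored as the bitwise NOT of its index, so any negative child c denotes leaf ~c and ~(~leaf) round-trips. A two-line illustrative helper:

// Decode an entry of left_child_/right_child_ (illustrative helper).
inline int DecodeChild(int child, bool* is_leaf) {
  *is_leaf = child < 0;               // negative entries are encoded leaves
  return *is_leaf ? ~child : child;   // ~(~leaf) recovers the leaf index
}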
...@@ -95,42 +83,74 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_ ...@@ -95,42 +83,74 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_
} }
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const { void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
Threading::For<data_size_t>(0, num_data, [this, data, score](int, data_size_t start, data_size_t end) { if (data->num_features() > num_leaves_ - 1) {
std::vector<std::unique_ptr<BinIterator>> iterators(data->num_features()); Threading::For<data_size_t>(0, num_data,
for (int i = 0; i < data->num_features(); ++i) { [this, &data, score](int, data_size_t start, data_size_t end) {
iterators[i].reset(data->FeatureAt(i)->bin_data()->GetIterator(start)); std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
} for (int i = 0; i < num_leaves_ - 1; ++i) {
for (data_size_t i = start; i < end; ++i) { const int fidx = split_feature_inner[i];
score[i] += static_cast<double>(leaf_value_[GetLeaf(iterators, i)]); iter[i].reset(data->FeatureIterator(fidx));
} iter[i]->Reset(start);
}); }
for (data_size_t i = start; i < end; ++i) {
score[i] += static_cast<double>(leaf_value_[GetLeaf(iter, i)]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
score[i] += static_cast<double>(leaf_value_[GetLeafRaw(iter, i)]);
}
});
}
} }
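Both branches end in the same walk: start at the root, read the row's bin through that node's iterator, and descend until a negative (leaf) index appears; the first branch indexes iterators by internal node, the second (GetLeafRaw) by feature. GetLeaf itself is elided from this hunk, so the following is an assumed reconstruction using a hypothetical BinIterator::Get(row) accessor:

#include <cstdint>
#include <memory>
#include <vector>

// Assumed shape of GetLeaf; BinIterator and data_size_t are LightGBM types,
// iter[i] is the iterator for internal node i, Get(row) is hypothetical.
int GetLeafSketch(const std::vector<std::unique_ptr<BinIterator>>& iter,
                  const std::vector<int>& left_child,
                  const std::vector<int>& right_child,
                  const std::vector<uint32_t>& threshold_in_bin,
                  data_size_t row) {
  int node = 0;
  while (node >= 0) {                      // negative entries encode leaves
    const uint32_t bin = iter[node]->Get(row);
    node = (bin <= threshold_in_bin[node]) ? left_child[node] : right_child[node];
  }
  return ~node;                            // undo the ~leaf encoding
}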
void Tree::AddPredictionToScore(const Dataset* data, const data_size_t* used_data_indices, void Tree::AddPredictionToScore(const Dataset* data,
data_size_t num_data, double* score) const { const data_size_t* used_data_indices,
Threading::For<data_size_t>(0, num_data, data_size_t num_data, double* score) const {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) { [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iterators(data->num_features()); std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < data->num_features(); ++i) { for (int i = 0; i < num_leaves_ - 1; ++i) {
iterators[i].reset(data->FeatureAt(i)->bin_data()->GetIterator(used_data_indices[start])); const int fidx = split_feature_inner[i];
} iter[i].reset(data->FeatureIterator(fidx));
for (data_size_t i = start; i < end; ++i) { iter[i]->Reset(used_data_indices[start]);
score[used_data_indices[i]] += static_cast<double>(leaf_value_[GetLeaf(iterators, used_data_indices[i])]); }
} for (data_size_t i = start; i < end; ++i) {
}); score[used_data_indices[i]] += static_cast<double>(leaf_value_[GetLeaf(iter, used_data_indices[i])]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
score[used_data_indices[i]] += static_cast<double>(leaf_value_[GetLeafRaw(iter, used_data_indices[i])]);
}
});
}
} }
std::string Tree::ToString() { std::string Tree::ToString() {
std::stringstream str_buf; std::stringstream str_buf;
str_buf << "num_leaves=" << num_leaves_ << std::endl; str_buf << "num_leaves=" << num_leaves_ << std::endl;
str_buf << "split_feature=" str_buf << "split_feature="
<< Common::ArrayToString<int>(split_feature_real_, num_leaves_ - 1, ' ') << std::endl; << Common::ArrayToString<int>(split_feature_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "split_gain=" str_buf << "split_gain="
<< Common::ArrayToString<double>(split_gain_, num_leaves_ - 1, ' ') << std::endl; << Common::ArrayToString<double>(split_gain_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "threshold=" str_buf << "threshold="
<< Common::ArrayToString<double>(threshold_, num_leaves_ - 1, ' ') << std::endl; << Common::ArrayToString<double>(threshold_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "decision_type="
<< Common::ArrayToString<int>(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1, ' ') << std::endl;
str_buf << "left_child=" str_buf << "left_child="
<< Common::ArrayToString<int>(left_child_, num_leaves_ - 1, ' ') << std::endl; << Common::ArrayToString<int>(left_child_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "right_child=" str_buf << "right_child="
...@@ -166,10 +186,9 @@ std::string Tree::NodeToJSON(int index) { ...@@ -166,10 +186,9 @@ std::string Tree::NodeToJSON(int index) {
// non-leaf // non-leaf
str_buf << "{" << std::endl; str_buf << "{" << std::endl;
str_buf << "\"split_index\":" << index << "," << std::endl; str_buf << "\"split_index\":" << index << "," << std::endl;
str_buf << "\"split_feature\":" << split_feature_real_[index] << "," << std::endl; str_buf << "\"split_feature\":" << split_feature_[index] << "," << std::endl;
str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl; str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl; str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl;
str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl;
str_buf << "\"internal_value\":" << internal_value_[index] << "," << std::endl; str_buf << "\"internal_value\":" << internal_value_[index] << "," << std::endl;
str_buf << "\"internal_count\":" << internal_count_[index] << "," << std::endl; str_buf << "\"internal_count\":" << internal_count_[index] << "," << std::endl;
str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl; str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl;
...@@ -207,7 +226,7 @@ Tree::Tree(const std::string& str) { ...@@ -207,7 +226,7 @@ Tree::Tree(const std::string& str) {
|| key_vals.count("left_child") <= 0 || key_vals.count("right_child") <= 0 || key_vals.count("left_child") <= 0 || key_vals.count("right_child") <= 0
|| key_vals.count("leaf_parent") <= 0 || key_vals.count("leaf_value") <= 0 || key_vals.count("leaf_parent") <= 0 || key_vals.count("leaf_value") <= 0
|| key_vals.count("internal_value") <= 0 || key_vals.count("internal_count") <= 0 || key_vals.count("internal_value") <= 0 || key_vals.count("internal_count") <= 0
|| key_vals.count("leaf_count") <= 0 || key_vals.count("decision_type") <= 0 || key_vals.count("leaf_count") <= 0
) { ) {
Log::Fatal("Tree model string format error"); Log::Fatal("Tree model string format error");
} }
...@@ -216,12 +235,11 @@ Tree::Tree(const std::string& str) { ...@@ -216,12 +235,11 @@ Tree::Tree(const std::string& str) {
left_child_ = Common::StringToArray<int>(key_vals["left_child"], ' ', num_leaves_ - 1); left_child_ = Common::StringToArray<int>(key_vals["left_child"], ' ', num_leaves_ - 1);
right_child_ = Common::StringToArray<int>(key_vals["right_child"], ' ', num_leaves_ - 1); right_child_ = Common::StringToArray<int>(key_vals["right_child"], ' ', num_leaves_ - 1);
split_feature_real_ = Common::StringToArray<int>(key_vals["split_feature"], ' ', num_leaves_ - 1); split_feature_ = Common::StringToArray<int>(key_vals["split_feature"], ' ', num_leaves_ - 1);
threshold_ = Common::StringToArray<double>(key_vals["threshold"], ' ', num_leaves_ - 1); threshold_ = Common::StringToArray<double>(key_vals["threshold"], ' ', num_leaves_ - 1);
split_gain_ = Common::StringToArray<double>(key_vals["split_gain"], ' ', num_leaves_ - 1); split_gain_ = Common::StringToArray<double>(key_vals["split_gain"], ' ', num_leaves_ - 1);
internal_count_ = Common::StringToArray<data_size_t>(key_vals["internal_count"], ' ', num_leaves_ - 1); internal_count_ = Common::StringToArray<data_size_t>(key_vals["internal_count"], ' ', num_leaves_ - 1);
internal_value_ = Common::StringToArray<double>(key_vals["internal_value"], ' ', num_leaves_ - 1); internal_value_ = Common::StringToArray<double>(key_vals["internal_value"], ' ', num_leaves_ - 1);
decision_type_ = Common::StringToArray<int8_t>(key_vals["decision_type"], ' ', num_leaves_ - 1);
leaf_count_ = Common::StringToArray<data_size_t>(key_vals["leaf_count"], ' ', num_leaves_); leaf_count_ = Common::StringToArray<data_size_t>(key_vals["leaf_count"], ' ', num_leaves_);
leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_); leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_);
......
...@@ -103,7 +103,7 @@ public: ...@@ -103,7 +103,7 @@ public:
explicit BinaryLoglossMetric(const MetricConfig& config) :BinaryMetric<BinaryLoglossMetric>(config) {} explicit BinaryLoglossMetric(const MetricConfig& config) :BinaryMetric<BinaryLoglossMetric>(config) {}
inline static double LossOnPoint(float label, double prob) { inline static double LossOnPoint(float label, double prob) {
if (label == 0) { if (label <= 0) {
if (1.0f - prob > kEpsilon) { if (1.0f - prob > kEpsilon) {
return -std::log(1.0f - prob); return -std::log(1.0f - prob);
} }
...@@ -128,9 +128,9 @@ public: ...@@ -128,9 +128,9 @@ public:
inline static double LossOnPoint(float label, double prob) { inline static double LossOnPoint(float label, double prob) {
if (prob <= 0.5f) { if (prob <= 0.5f) {
return label; return label > 0;
} else { } else {
return 1.0f - label; return label <= 0;
} }
} }
...@@ -207,8 +207,8 @@ public: ...@@ -207,8 +207,8 @@ public:
// reset // reset
cur_neg = cur_pos = 0.0f; cur_neg = cur_pos = 0.0f;
} }
cur_neg += 1.0f - cur_label; cur_neg += (cur_label <= 0);
cur_pos += cur_label; cur_pos += (cur_label > 0);
} }
} else { // has weights } else { // has weights
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
...@@ -224,8 +224,8 @@ public: ...@@ -224,8 +224,8 @@ public:
// reset // reset
cur_neg = cur_pos = 0.0f; cur_neg = cur_pos = 0.0f;
} }
cur_neg += (1.0f - cur_label)*cur_weight; cur_neg += (cur_label <= 0)*cur_weight;
cur_pos += cur_label*cur_weight; cur_pos += (cur_label > 0)*cur_weight;
} }
} }
accum += cur_neg*(cur_pos * 0.5f + sum_pos); accum += cur_neg*(cur_pos * 0.5f + sum_pos);
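With data sorted by prediction and ties grouped between resets, each flush credits every negative in the tie group with all positives ranked strictly above it (sum_pos) plus half of the tied positives (cur_pos * 0.5), i.e. the Mann-Whitney pair count. A standalone sketch of the same accumulation for the unweighted case (the sort order and final normalization are this sketch's assumptions, not a quote of the metric):

#include <algorithm>
#include <utility>
#include <vector>

// AUC via grouped pair counting, mirroring the accumulation above (sketch).
double AucSketch(std::vector<std::pair<double, int>> score_label) {
  // sort by prediction, highest first, so sum_pos counts higher-ranked positives
  std::sort(score_label.begin(), score_label.end(),
            [](const std::pair<double, int>& a, const std::pair<double, int>& b) {
              return a.first > b.first;
            });
  double accum = 0.0, sum_pos = 0.0, cur_pos = 0.0, cur_neg = 0.0;
  for (size_t i = 0; i < score_label.size(); ++i) {
    if (i > 0 && score_label[i].first != score_label[i - 1].first) {
      accum += cur_neg * (cur_pos * 0.5 + sum_pos);  // flush one tie group
      sum_pos += cur_pos;
      cur_neg = cur_pos = 0.0;
    }
    cur_neg += (score_label[i].second <= 0);
    cur_pos += (score_label[i].second > 0);
  }
  accum += cur_neg * (cur_pos * 0.5 + sum_pos);      // last group
  sum_pos += cur_pos;
  const double sum_neg = static_cast<double>(score_label.size()) - sum_pos;
  return accum / (sum_pos * sum_neg);  // concordant + 0.5 * tied pairs
}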
......
...@@ -28,8 +28,9 @@ public: ...@@ -28,8 +28,9 @@ public:
data_size_t cnt_positive = 0; data_size_t cnt_positive = 0;
data_size_t cnt_negative = 0; data_size_t cnt_negative = 0;
// count for positive and negative samples // count for positive and negative samples
#pragma omp parallel for schedule(static) reduction(+:cnt_positive, cnt_negative)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
if (label_[i] == 1) { if (label_[i] > 0) {
++cnt_positive; ++cnt_positive;
} else { } else {
++cnt_negative; ++cnt_negative;
...@@ -64,8 +65,9 @@ public: ...@@ -64,8 +65,9 @@ public:
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
// get label and label weights // get label and label weights
const int label = label_val_[static_cast<int>(label_[i])]; const int is_pos = label_[i] > 0;
const double label_weight = label_weights_[static_cast<int>(label_[i])]; const int label = label_val_[is_pos];
const double label_weight = label_weights_[is_pos];
// calculate gradients and hessians // calculate gradients and hessians
const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i])); const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i]));
const double abs_response = fabs(response); const double abs_response = fabs(response);
...@@ -76,8 +78,9 @@ public: ...@@ -76,8 +78,9 @@ public:
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
// get label and label weights // get label and label weights
const int label = label_val_[static_cast<int>(label_[i])]; const int is_pos = label_[i] > 0;
const double label_weight = label_weights_[static_cast<int>(label_[i])]; const int label = label_val_[is_pos];
const double label_weight = label_weights_[is_pos];
// calculate gradients and hessians // calculate gradients and hessians
const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i])); const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i]));
const double abs_response = fabs(response); const double abs_response = fabs(response);
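The hessian lines are elided by this hunk, but they follow from differentiating the response: for loss(s) = log(1 + exp(-y * sigmoid * s)) with y in {-1, +1} (the value label_val_[is_pos] is assumed to yield), dLoss/ds is exactly the response above, and d2Loss/ds2 = |response| * (sigmoid - |response|); the label and sample weights then multiply in. A pointwise sketch of the derivation:

#include <cmath>

// Gradient/hessian of log(1 + exp(-y * sigmoid * s)), y in {-1, +1} (sketch).
void PointwiseGradHess(double score, int y, double sigmoid,
                       double* grad, double* hess) {
  const double response = -y * sigmoid / (1.0 + std::exp(y * sigmoid * score));
  const double abs_response = std::fabs(response);
  *grad = response;                                 // first derivative
  *hess = abs_response * (sigmoid - abs_response);  // second derivative
}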
......
...@@ -52,6 +52,7 @@ public: ...@@ -52,6 +52,7 @@ public:
num_queries_ = metadata.num_queries(); num_queries_ = metadata.num_queries();
// cache inverse max DCG to avoid repeated computation // cache inverse max DCG to avoid repeated computation
inverse_max_dcgs_.resize(num_queries_); inverse_max_dcgs_.resize(num_queries_);
#pragma omp parallel for schedule(guided)
for (data_size_t i = 0; i < num_queries_; ++i) { for (data_size_t i = 0; i < num_queries_; ++i) {
inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK(optimize_pos_at_, inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK(optimize_pos_at_,
label_ + query_boundaries_[i], label_ + query_boundaries_[i],
......
...@@ -259,14 +259,14 @@ public: ...@@ -259,14 +259,14 @@ public:
if (weights_ == nullptr) { if (weights_ == nullptr) {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = score[i] - label_[i]; gradients[i] = static_cast<score_t>(score[i] - label_[i]);
hessians[i] = score[i] + max_delta_step_; hessians[i] = static_cast<score_t>(score[i] + max_delta_step_);
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) { for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = (score[i] - label_[i]) * weights_[i]; gradients[i] = static_cast<score_t>((score[i] - label_[i]) * weights_[i]);
hessians[i] = (score[i] + max_delta_step_) * weights_[i]; hessians[i] = static_cast<score_t>((score[i] + max_delta_step_) * weights_[i]);
} }
} }
} }
......
...@@ -24,7 +24,7 @@ void DataParallelTreeLearner::Init(const Dataset* train_data) { ...@@ -24,7 +24,7 @@ void DataParallelTreeLearner::Init(const Dataset* train_data) {
// allocate buffer for communication // allocate buffer for communication
size_t buffer_size = 0; size_t buffer_size = 0;
for (int i = 0; i < num_features_; ++i) { for (int i = 0; i < num_features_; ++i) {
buffer_size += train_data_->FeatureAt(i)->num_bin() * sizeof(HistogramBinEntry); buffer_size += train_data_->FeatureNumBin(i) * sizeof(HistogramBinEntry);
} }
input_buffer_.resize(buffer_size); input_buffer_.resize(buffer_size);
...@@ -54,7 +54,7 @@ void DataParallelTreeLearner::BeforeTrain() { ...@@ -54,7 +54,7 @@ void DataParallelTreeLearner::BeforeTrain() {
if (is_feature_used_[i]) { if (is_feature_used_[i]) {
int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed)); int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
feature_distribution[cur_min_machine].push_back(i); feature_distribution[cur_min_machine].push_back(i);
num_bins_distributed[cur_min_machine] += train_data_->FeatureAt(i)->num_bin(); num_bins_distributed[cur_min_machine] += train_data_->FeatureNumBin(i);
} }
is_feature_aggregated_[i] = false; is_feature_aggregated_[i] = false;
} }
...@@ -68,7 +68,7 @@ void DataParallelTreeLearner::BeforeTrain() { ...@@ -68,7 +68,7 @@ void DataParallelTreeLearner::BeforeTrain() {
for (int i = 0; i < num_machines_; ++i) { for (int i = 0; i < num_machines_; ++i) {
block_len_[i] = 0; block_len_[i] = 0;
for (auto fid : feature_distribution[i]) { for (auto fid : feature_distribution[i]) {
block_len_[i] += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry); block_len_[i] += train_data_->FeatureNumBin(fid) * sizeof(HistogramBinEntry);
} }
reduce_scatter_size_ += block_len_[i]; reduce_scatter_size_ += block_len_[i];
} }
...@@ -83,7 +83,7 @@ void DataParallelTreeLearner::BeforeTrain() { ...@@ -83,7 +83,7 @@ void DataParallelTreeLearner::BeforeTrain() {
for (int i = 0; i < num_machines_; ++i) { for (int i = 0; i < num_machines_; ++i) {
for (auto fid : feature_distribution[i]) { for (auto fid : feature_distribution[i]) {
buffer_write_start_pos_[fid] = bin_size; buffer_write_start_pos_[fid] = bin_size;
bin_size += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry); bin_size += train_data_->FeatureNumBin(fid) * sizeof(HistogramBinEntry);
} }
} }
...@@ -91,7 +91,7 @@ void DataParallelTreeLearner::BeforeTrain() { ...@@ -91,7 +91,7 @@ void DataParallelTreeLearner::BeforeTrain() {
bin_size = 0; bin_size = 0;
for (auto fid : feature_distribution[rank_]) { for (auto fid : feature_distribution[rank_]) {
buffer_read_start_pos_[fid] = bin_size; buffer_read_start_pos_[fid] = bin_size;
bin_size += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry); bin_size += train_data_->FeatureNumBin(fid) * sizeof(HistogramBinEntry);
} }
// sync global data sumup info // sync global data sumup info
...@@ -125,49 +125,51 @@ void DataParallelTreeLearner::BeforeTrain() { ...@@ -125,49 +125,51 @@ void DataParallelTreeLearner::BeforeTrain() {
} }
void DataParallelTreeLearner::FindBestThresholds() { void DataParallelTreeLearner::FindBestThresholds() {
train_data_->ConstructHistograms(is_feature_used_,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
smaller_leaf_histogram_array_[0].RawData() - 1);
// construct local histograms // construct local histograms
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) { for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if ((!is_feature_used_.empty() && is_feature_used_[feature_index] == false)) continue; if ((!is_feature_used_.empty() && is_feature_used_[feature_index] == false)) continue;
// construct histograms for smaller leaf
if (ordered_bins_[feature_index] == nullptr) {
// if not use ordered bin
train_data_->FeatureAt(feature_index)->bin_data()->ConstructHistogram(
smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(),
ptr_to_ordered_gradients_smaller_leaf_,
ptr_to_ordered_hessians_smaller_leaf_,
smaller_leaf_histogram_array_[feature_index].GetData());
} else {
// use the ordered bin
ordered_bins_[feature_index]->ConstructHistogram(smaller_leaf_splits_->LeafIndex(),
gradients_,
hessians_,
smaller_leaf_histogram_array_[feature_index].GetData());
}
// copy to buffer // copy to buffer
std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index], std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index],
smaller_leaf_histogram_array_[feature_index].HistogramData(), smaller_leaf_histogram_array_[feature_index].RawData(),
smaller_leaf_histogram_array_[feature_index].SizeOfHistgram()); smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
} }
// Reduce scatter for histogram // Reduce scatter for histogram
Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, block_start_.data(), Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, block_start_.data(),
block_len_.data(), output_buffer_.data(), &HistogramBinEntry::SumReducer); block_len_.data(), output_buffer_.data(), &HistogramBinEntry::SumReducer);
std::vector<SplitInfo> smaller_best(num_threads_, SplitInfo());
std::vector<SplitInfo> larger_best(num_threads_, SplitInfo());
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) { for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_aggregated_[feature_index]) continue; if (!is_feature_aggregated_[feature_index]) continue;
const int tid = omp_get_thread_num();
// restore global histograms from buffer // restore global histograms from buffer
smaller_leaf_histogram_array_[feature_index].FromMemory( smaller_leaf_histogram_array_[feature_index].FromMemory(
output_buffer_.data() + buffer_read_start_pos_[feature_index]); output_buffer_.data() + buffer_read_start_pos_[feature_index]);
train_data_->FixHistogram(feature_index,
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_histogram_array_[feature_index].RawData());
SplitInfo smaller_split;
// find best threshold for smaller child // find best threshold for smaller child
smaller_leaf_histogram_array_[feature_index].FindBestThreshold( smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(), smaller_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()), GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
&smaller_leaf_splits_->BestSplitPerFeature()[feature_index]); &smaller_split);
if (smaller_split.gain > smaller_best[tid].gain) {
smaller_best[tid] = smaller_split;
}
// only root leaf // only root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue; if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
...@@ -175,35 +177,36 @@ void DataParallelTreeLearner::FindBestThresholds() { ...@@ -175,35 +177,36 @@ void DataParallelTreeLearner::FindBestThresholds() {
// construct histograms for the larger leaf: we initialize the larger leaf as the parent, so we can simply subtract the smaller leaf's histograms // construct histograms for the larger leaf: we initialize the larger leaf as the parent, so we can simply subtract the smaller leaf's histograms
larger_leaf_histogram_array_[feature_index].Subtract( larger_leaf_histogram_array_[feature_index].Subtract(
smaller_leaf_histogram_array_[feature_index]); smaller_leaf_histogram_array_[feature_index]);
SplitInfo larger_split;
// find best threshold for larger child // find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold( larger_leaf_histogram_array_[feature_index].FindBestThreshold(
larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(), larger_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(larger_leaf_splits_->LeafIndex()), GetGlobalDataCountInLeaf(larger_leaf_splits_->LeafIndex()),
&larger_leaf_splits_->BestSplitPerFeature()[feature_index]); &larger_split);
if (larger_split.gain > larger_best[tid].gain) {
larger_best[tid] = larger_split;
}
} }
auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
int leaf = smaller_leaf_splits_->LeafIndex();
best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { return; }
leaf = larger_leaf_splits_->LeafIndex();
auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
} }
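This removes the old per-feature BestSplitPerFeature arrays: every OpenMP thread keeps a single running best in smaller_best[tid]/larger_best[tid], and one ArgMax over the thread slots fills best_split_per_leaf_. The reduction pattern in isolation, assuming SplitInfo default-constructs with gain = kMinScore (FindBestForFeature is a hypothetical stand-in for the histogram call above):

#include <LightGBM/utils/array_args.h>
#include <omp.h>
#include <vector>
#include "split_info.hpp"

SplitInfo FindBestAcrossFeatures(int num_features, int num_threads) {
  std::vector<SplitInfo> best(num_threads, SplitInfo());
  #pragma omp parallel for schedule(guided)
  for (int f = 0; f < num_features; ++f) {
    const int tid = omp_get_thread_num();
    SplitInfo cand = FindBestForFeature(f);   // hypothetical per-feature search
    if (cand.gain > best[tid].gain) best[tid] = cand;
  }
  // one sequential pass over num_threads entries replaces a per-feature scan
  return best[ArrayArgs<SplitInfo>::ArgMax(best)];
}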
void DataParallelTreeLearner::FindBestSplitsForLeaves() { void DataParallelTreeLearner::FindBestSplitsForLeaves() {
int smaller_best_feature = -1, larger_best_feature = -1;
SplitInfo smaller_best, larger_best; SplitInfo smaller_best, larger_best;
std::vector<double> gains; smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()];
// find local best split for smaller leaf
for (size_t i = 0; i < smaller_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(smaller_leaf_splits_->BestSplitPerFeature()[i].gain);
}
smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
smaller_best = smaller_leaf_splits_->BestSplitPerFeature()[smaller_best_feature];
// find local best split for larger leaf // find local best split for larger leaf
if (larger_leaf_splits_->LeafIndex() >= 0) { if (larger_leaf_splits_->LeafIndex() >= 0) {
gains.clear(); larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()];
for (size_t i = 0; i < larger_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(larger_leaf_splits_->BestSplitPerFeature()[i].gain);
}
larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
larger_best = larger_leaf_splits_->BestSplitPerFeature()[larger_best_feature];
} }
// sync global best info // sync global best info
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_ #define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
#include <LightGBM/feature.h> #include <LightGBM/dataset.h>
#include <LightGBM/utils/openmp_wrapper.h> #include <LightGBM/utils/openmp_wrapper.h>
...@@ -93,7 +93,7 @@ public: ...@@ -93,7 +93,7 @@ public:
* \param threshold threshold to split at * \param threshold threshold to split at
* \param right_leaf index of right leaf * \param right_leaf index of right leaf
*/ */
void Split(int leaf, const Bin* feature_bins, unsigned int threshold, int right_leaf) { void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, int right_leaf) {
const data_size_t min_inner_size = 1000; const data_size_t min_inner_size = 1000;
// get leaf boundary // get leaf boundary
const data_size_t begin = leaf_begin_[leaf]; const data_size_t begin = leaf_begin_[leaf];
...@@ -111,7 +111,7 @@ public: ...@@ -111,7 +111,7 @@ public:
data_size_t cur_cnt = inner_size; data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; } if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
// split data in inner-sized chunks to reduce the number of function calls // split data in inner-sized chunks to reduce the number of function calls
data_size_t cur_left_count = feature_bins->Split(threshold, indices_.data() + begin + cur_start, cur_cnt, data_size_t cur_left_count = dataset->Split(feature, threshold, indices_.data() + begin + cur_start, cur_cnt,
temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start); temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
offsets_buf_[i] = cur_start; offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count; left_cnts_buf_[i] = cur_left_count;
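Each worker thread partitions one inner_size-sized chunk of the leaf's index range into its own slice of temp_left_indices_/temp_right_indices_, recording the chunk's offset and left count, so no locking is needed; the chunks are then stitched back so the leaf range reads [all lefts][all rights]. A self-contained sketch of the stitch step (names are illustrative, not the class's members):

#include <algorithm>
#include <vector>

// Copy per-chunk left halves, then per-chunk right halves, back into the
// leaf's slice of the global index array (sketch of the merge after Split).
void StitchPartition(std::vector<int>* indices, int begin,
                     const std::vector<int>& offsets,
                     const std::vector<int>& left_cnts,
                     const std::vector<int>& right_cnts,
                     const std::vector<int>& temp_left,
                     const std::vector<int>& temp_right) {
  int write = begin;
  for (size_t i = 0; i < offsets.size(); ++i) {   // lefts, in chunk order
    std::copy(temp_left.begin() + offsets[i],
              temp_left.begin() + offsets[i] + left_cnts[i],
              indices->begin() + write);
    write += left_cnts[i];
  }
  for (size_t i = 0; i < offsets.size(); ++i) {   // then rights
    std::copy(temp_right.begin() + offsets[i],
              temp_right.begin() + offsets[i] + right_cnts[i],
              indices->begin() + write);
    write += right_cnts[i];
  }
}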
......
...@@ -2,19 +2,32 @@ ...@@ -2,19 +2,32 @@
#define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ #define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#include "split_info.hpp" #include "split_info.hpp"
#include <LightGBM/feature.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/dataset.h>
#include <cstring> #include <cstring>
namespace LightGBM { namespace LightGBM
{
class FeatureMetainfo {
public:
int feature_idx;
int num_bin;
int bias = 0;
/*! \brief pointer of tree config */
const TreeConfig* tree_config;
};
/*! /*!
* \brief FeatureHistogram is used to construct and store a histogram for a feature. * \brief FeatureHistogram is used to construct and store a histogram for a feature.
*/ */
class FeatureHistogram { class FeatureHistogram {
public: public:
FeatureHistogram() { FeatureHistogram() {
data_ = nullptr;
} }
~FeatureHistogram() { ~FeatureHistogram() {
} }
...@@ -28,123 +41,80 @@ public: ...@@ -28,123 +41,80 @@ public:
* \param feature the feature data for this histogram * \param feature the feature data for this histogram
* \param min_num_data_one_leaf minimum number of data points in one leaf * \param min_num_data_one_leaf minimum number of data points in one leaf
*/ */
void Init(const Feature* feature, int feature_idx, const TreeConfig* tree_config) { void Init(HistogramBinEntry* data, const FeatureMetainfo* meta) {
feature_idx_ = feature_idx; meta_ = meta;
tree_config_ = tree_config; data_ = data;
feature_ = feature;
data_.resize(feature_->num_bin());
if (feature->bin_type() == BinType::NumericalBin) {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdForNumerical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
} else {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdForCategorical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
}
} }
HistogramBinEntry* GetData() { HistogramBinEntry* RawData() {
std::memset(data_.data(), 0, feature_->num_bin() * sizeof(HistogramBinEntry)); return data_;
return data_.data();
} }
/*! /*!
* \brief Subtract current histograms with other * \brief Subtract current histograms with other
* \param other The histogram that want to subtract * \param other The histogram that want to subtract
*/ */
void Subtract(const FeatureHistogram& other) { void Subtract(const FeatureHistogram& other) {
for (int i = 0; i < feature_->num_bin(); ++i) { for (int i = 0; i < meta_->num_bin - meta_->bias; ++i) {
data_[i].cnt -= other.data_[i].cnt; data_[i].cnt -= other.data_[i].cnt;
data_[i].sum_gradients -= other.data_[i].sum_gradients; data_[i].sum_gradients -= other.data_[i].sum_gradients;
data_[i].sum_hessians -= other.data_[i].sum_hessians; data_[i].sum_hessians -= other.data_[i].sum_hessians;
} }
} }
void FixIgnoreBin(double sum_gradient, double sum_hessian, data_size_t num_data) {
if (feature_->is_sparse()) {
// no need to fix if the most frequent (default) bin is 0
if (feature_->bin_type() == BinType::NumericalBin
&& feature_->bin_mapper()->GetDefaultBin() == 0) {
return;
}
int default_bin = static_cast<int>(feature_->bin_mapper()->GetDefaultBin());
data_[default_bin].sum_gradients = sum_gradient;
data_[default_bin].sum_hessians = sum_hessian;
data_[default_bin].cnt = num_data;
for (int t = feature_->num_bin() - 1; t >= 0; --t) {
if (t != default_bin) {
data_[default_bin].sum_gradients -= data_[t].sum_gradients;
data_[default_bin].sum_hessians -= data_[t].sum_hessians;
data_[default_bin].cnt -= data_[t].cnt;
}
}
}
}
/*!
* \brief Find best threshold for this histogram
* \param output The best split result
*/
void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) { SplitInfo* output) {
FixIgnoreBin(sum_gradient, sum_hessian, num_data); sum_hessian += 2 * kEpsilon;
find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
if (output->gain > kMinScore) {
is_splittable_ = true;
} else {
is_splittable_ = false;
}
}
void FindBestThresholdForNumerical(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
double best_sum_left_gradient = NAN; double best_sum_left_gradient = NAN;
double best_sum_left_hessian = NAN; double best_sum_left_hessian = NAN;
double best_gain = kMinScore; double best_gain = kMinScore;
data_size_t best_left_count = 0; data_size_t best_left_count = 0;
unsigned int best_threshold = static_cast<unsigned int>(feature_->num_bin()); uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
double sum_right_gradient = 0.0f; double sum_right_gradient = 0.0f;
double sum_right_hessian = kEpsilon; double sum_right_hessian = kEpsilon;
data_size_t right_count = 0; data_size_t right_count = 0;
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian); double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
double min_gain_shift = gain_shift + tree_config_->min_gain_to_split; double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
bool is_splittable = false; is_splittable_ = false;
const int bias = meta_->bias;
int t = meta_->num_bin - 1 - bias;
const int t_end = 1 - bias;
// from right to left, and we don't need data in bin0 // from right to left, and we don't need data in bin0
for (int t = feature_->num_bin() - 1; t > 0; --t) { for (; t >= t_end; --t) {
sum_right_gradient += data_[t].sum_gradients; sum_right_gradient += data_[t].sum_gradients;
sum_right_hessian += data_[t].sum_hessians; sum_right_hessian += data_[t].sum_hessians;
right_count += data_[t].cnt; right_count += data_[t].cnt;
// if data not enough, or sum hessian too small // if data not enough, or sum hessian too small
if (right_count < tree_config_->min_data_in_leaf if (right_count < meta_->tree_config->min_data_in_leaf
|| sum_right_hessian < tree_config_->min_sum_hessian_in_leaf) continue; || sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t left_count = num_data - right_count; data_size_t left_count = num_data - right_count;
// if data not enough // if data not enough
if (left_count < tree_config_->min_data_in_leaf) break; if (left_count < meta_->tree_config->min_data_in_leaf) break;
double sum_left_hessian = sum_hessian - sum_right_hessian; double sum_left_hessian = sum_hessian - sum_right_hessian;
// if sum hessian too small // if sum hessian too small
if (sum_left_hessian < tree_config_->min_sum_hessian_in_leaf) break; if (sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
double sum_left_gradient = sum_gradient - sum_right_gradient; double sum_left_gradient = sum_gradient - sum_right_gradient;
// current split gain // current split gain
double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian) double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian)
+ GetLeafSplitGain(sum_right_gradient, sum_right_hessian); + GetLeafSplitGain(sum_right_gradient, sum_right_hessian);
// gain with split is worse than without split // gain with split is worse than without split
if (current_gain < min_gain_shift) continue; if (current_gain <= min_gain_shift) continue;
// mark to is splittable // mark to is splittable
is_splittable = true; is_splittable_ = true;
// better split point // better split point
if (current_gain > best_gain) { if (current_gain > best_gain) {
best_left_count = left_count; best_left_count = left_count;
best_sum_left_gradient = sum_left_gradient; best_sum_left_gradient = sum_left_gradient;
best_sum_left_hessian = sum_left_hessian; best_sum_left_hessian = sum_left_hessian;
// left is <= threshold, right is > threshold. so this is t-1 // left is <= threshold, right is > threshold. so this is t-1
best_threshold = static_cast<unsigned int>(t - 1); best_threshold = static_cast<uint32_t>(t - 1 + bias);
best_gain = current_gain; best_gain = current_gain;
} }
} }
if (is_splittable) { if (is_splittable_) {
// update split information // update split information
output->feature = feature_idx_; output->feature = meta_->feature_idx;
output->threshold = best_threshold; output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian); output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
output->left_count = best_left_count; output->left_count = best_left_count;
...@@ -157,72 +127,7 @@ public: ...@@ -157,72 +127,7 @@ public:
output->right_sum_hessian = sum_hessian - best_sum_left_hessian; output->right_sum_hessian = sum_hessian - best_sum_left_hessian;
output->gain = best_gain - gain_shift; output->gain = best_gain - gain_shift;
} else { } else {
output->feature = feature_idx_; output->feature = meta_->feature_idx;
output->gain = kMinScore;
}
}
/*!
* \brief Find best threshold for this histogram
* \param output The best split result
*/
void FindBestThresholdForCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
double best_gain = kMinScore;
unsigned int best_threshold = static_cast<unsigned int>(feature_->num_bin());
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
double min_gain_shift = gain_shift + tree_config_->min_gain_to_split;
bool is_splittable = false;
for (int t = feature_->num_bin() - 1; t >= 0; --t) {
double sum_current_gradient = data_[t].sum_gradients;
double sum_current_hessian = data_[t].sum_hessians;
data_size_t current_count = data_[t].cnt;
// if data not enough, or sum hessian too small
if (current_count < tree_config_->min_data_in_leaf
|| sum_current_hessian < tree_config_->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data - current_count;
// if data not enough
if (other_count < tree_config_->min_data_in_leaf) continue;
double sum_other_hessian = sum_hessian - sum_current_hessian;
// if sum hessian too small
if (sum_other_hessian < tree_config_->min_sum_hessian_in_leaf) continue;
double sum_other_gradient = sum_gradient - sum_current_gradient;
// current split gain
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+ GetLeafSplitGain(sum_current_gradient, sum_current_hessian);
// gain with split is worse than without split
if (current_gain < min_gain_shift) continue;
// mark to is splittable
is_splittable = true;
// better split point
if (current_gain > best_gain) {
best_threshold = static_cast<unsigned int>(t);
best_gain = current_gain;
}
}
// update split information
if (is_splittable) {
output->feature = feature_idx_;
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(data_[best_threshold].sum_gradients,
data_[best_threshold].sum_hessians);
output->left_count = data_[best_threshold].cnt;
output->left_sum_gradient = data_[best_threshold].sum_gradients;
output->left_sum_hessian = data_[best_threshold].sum_hessians;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - data_[best_threshold].sum_gradients,
sum_hessian - data_[best_threshold].sum_hessians);
output->right_count = num_data - data_[best_threshold].cnt;
output->right_sum_gradient = sum_gradient - data_[best_threshold].sum_gradients;
output->right_sum_hessian = sum_hessian - data_[best_threshold].sum_hessians;
output->gain = best_gain - gain_shift;
} else {
output->feature = feature_idx_;
output->gain = kMinScore; output->gain = kMinScore;
} }
} }
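The bias field introduced above is what lets a histogram drop its default bin: when a feature's default bin is 0, the stored array omits it (bias = 1), so array slot t holds real bin t + bias, the scan floor becomes t_end = 1 - bias, and the winning slot is reported as threshold t - 1 + bias in real-bin coordinates. A worked mapping under that assumption:

// Histogram slot -> real bin under the bias scheme (illustrative).
// Example: num_bin = 4, default bin = 0  =>  bias = 1, 3 stored entries:
//   array slot:  0  1  2
//   real bin:    1  2  3      (real bin 0 is recovered globally by FixHistogram)
// The scan runs t = num_bin - 1 - bias down to 1 - bias; a best split found
// at slot t separates real bins <= t - 1 + bias from those above it.
inline int SlotToRealBin(int slot, int bias) { return slot + bias; }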
...@@ -230,21 +135,14 @@ public: ...@@ -230,21 +135,14 @@ public:
* \brief Binary size of this histogram * \brief Binary size of this histogram
*/ */
int SizeOfHistgram() const { int SizeOfHistgram() const {
return feature_->num_bin() * sizeof(HistogramBinEntry); return (meta_->num_bin - meta_->bias) * sizeof(HistogramBinEntry);
}
/*!
* \brief Memory pointer to histogram data
*/
const HistogramBinEntry* HistogramData() const {
return data_.data();
} }
/*! /*!
* \brief Restore histogram from memory * \brief Restore histogram from memory
*/ */
void FromMemory(char* memory_data) { void FromMemory(char* memory_data) {
std::memcpy(data_.data(), memory_data, feature_->num_bin() * sizeof(HistogramBinEntry)); std::memcpy(data_, memory_data, (meta_->num_bin - meta_->bias) * sizeof(HistogramBinEntry));
} }
/*! /*!
...@@ -257,10 +155,6 @@ public: ...@@ -257,10 +155,6 @@ public:
*/ */
void set_is_splittable(bool val) { is_splittable_ = val; } void set_is_splittable(bool val) { is_splittable_ = val; }
void ResetConfig(const TreeConfig* tree_config) {
tree_config_ = tree_config;
}
private: private:
/*! /*!
* \brief Calculate the split gain based on regularized sum_gradients and sum_hessians * \brief Calculate the split gain based on regularized sum_gradients and sum_hessians
...@@ -270,12 +164,10 @@ private: ...@@ -270,12 +164,10 @@ private:
*/ */
double GetLeafSplitGain(double sum_gradients, double sum_hessians) const { double GetLeafSplitGain(double sum_gradients, double sum_hessians) const {
double abs_sum_gradients = std::fabs(sum_gradients); double abs_sum_gradients = std::fabs(sum_gradients);
if (abs_sum_gradients > tree_config_->lambda_l1) { double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - meta_->tree_config->lambda_l1);
double reg_abs_sum_gradients = abs_sum_gradients - tree_config_->lambda_l1; return (reg_abs_sum_gradients * reg_abs_sum_gradients)
return (reg_abs_sum_gradients * reg_abs_sum_gradients) / (sum_hessians + meta_->tree_config->lambda_l2);
/ (sum_hessians + tree_config_->lambda_l2);
}
return 0.0f;
} }
/*! /*!
...@@ -286,26 +178,17 @@ private: ...@@ -286,26 +178,17 @@ private:
*/ */
double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians) const { double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians) const {
double abs_sum_gradients = std::fabs(sum_gradients); double abs_sum_gradients = std::fabs(sum_gradients);
if (abs_sum_gradients > tree_config_->lambda_l1) { double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - meta_->tree_config->lambda_l1);
return -std::copysign(abs_sum_gradients - tree_config_->lambda_l1, sum_gradients) return -std::copysign(reg_abs_sum_gradients, sum_gradients)
/ (sum_hessians + tree_config_->lambda_l2); / (sum_hessians + meta_->tree_config->lambda_l2);
}
return 0.0f;
} }
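Both helpers now fold the removed if/else into an L1 soft threshold, which is equivalent: when |G| <= lambda_l1 the max() clamps to 0, giving gain 0 and output 0 exactly as the old early return did. Writing G for the summed gradients and H for the summed hessians:

  GetLeafSplitGain(G, H)            = max(0, |G| - lambda_l1)^2 / (H + lambda_l2)
  CalculateSplittedLeafOutput(G, H) = -sign(G) * max(0, |G| - lambda_l1) / (H + lambda_l2)

and FindBestThreshold accepts a split only if gain(G_L, H_L) + gain(G_R, H_R) exceeds gain(G, H) + min_gain_to_split.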
const FeatureMetainfo* meta_;
int feature_idx_;
const Feature* feature_;
/*! \brief pointer of tree config */
const TreeConfig* tree_config_;
/*! \brief sum of gradient of each bin */ /*! \brief sum of gradient of each bin */
std::vector<HistogramBinEntry> data_; HistogramBinEntry* data_;
//std::vector<HistogramBinEntry> data_;
/*! \brief False if this histogram cannot split */ /*! \brief False if this histogram cannot split */
bool is_splittable_ = true; bool is_splittable_ = true;
/*! \brief function that used to find best threshold */
std::function<void(double, double, data_size_t, SplitInfo*)> find_best_threshold_fun_;
}; };
class HistogramPool { class HistogramPool {
public: public:
/*! /*!
...@@ -315,7 +198,6 @@ public: ...@@ -315,7 +198,6 @@ public:
cache_size_ = 0; cache_size_ = 0;
total_size_ = 0; total_size_ = 0;
} }
/*! /*!
* \brief Destructor * \brief Destructor
*/ */
...@@ -342,7 +224,6 @@ public: ...@@ -342,7 +224,6 @@ public:
ResetMap(); ResetMap();
} }
} }
/*! /*!
* \brief Reset mapper * \brief Reset mapper
*/ */
...@@ -355,34 +236,49 @@ public: ...@@ -355,34 +236,49 @@ public:
} }
} }
/*! void DynamicChangeSize(const Dataset* train_data, const TreeConfig* tree_config, int cache_size, int total_size) {
* \brief Fill the pool if (feature_metas_.empty()) {
* \param obj_create_fun that used to generate object feature_metas_.resize(train_data->num_features());
*/ #pragma omp parallel for schedule(static)
void Fill(std::function<FeatureHistogram*()> obj_create_fun) { for (int i = 0; i < train_data->num_features(); ++i) {
fill_func_ = obj_create_fun; feature_metas_[i].feature_idx = i;
pool_.clear(); feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
pool_.resize(cache_size_); if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
for (int i = 0; i < cache_size_; ++i) { feature_metas_[i].bias = 1;
pool_[i].reset(obj_create_fun()); } else {
feature_metas_[i].bias = 0;
}
feature_metas_[i].tree_config = tree_config;
}
} }
} uint64_t num_total_bin = train_data->NumTotalBin();
Log::Info("Total Bins %d", static_cast<int>(num_total_bin));
void DynamicChangeSize(int cache_size, int total_size) {
int old_cache_size = cache_size_; int old_cache_size = cache_size_;
Reset(cache_size, total_size); Reset(cache_size, total_size);
pool_.resize(cache_size_); pool_.resize(cache_size);
data_.resize(cache_size);
#pragma omp parallel for schedule(static)
for (int i = old_cache_size; i < cache_size_; ++i) { for (int i = old_cache_size; i < cache_size_; ++i) {
pool_[i].reset(fill_func_()); pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
data_[i].resize(num_total_bin);
uint64_t offset = 0;
for (int j = 0; j < train_data->num_features(); ++j) {
offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j]);
auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) {
num_bin -= 1;
}
offset += static_cast<uint64_t>(num_bin);
}
CHECK(offset == num_total_bin);
} }
} }
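FeatureHistogram no longer owns its bins: each cached slot i is one contiguous HistogramBinEntry slab (data_[i]) of NumTotalBin() entries, and every feature's histogram is pointed at its offset inside the slab, with bias-1 features contributing one entry fewer. A minimal sketch of the layout computation, assuming SubFeatureBinOffset(j) returns the small per-feature shift used above:

// One pool slot: [ f0 bins | f1 bins | ... ]  totalling NumTotalBin() entries
// (sketch; 'slab', 'metas' and 'bias' are illustrative local names).
uint64_t offset = 0;
for (int j = 0; j < num_features; ++j) {
  offset += SubFeatureBinOffset(j);        // assumed per-feature shift
  histograms[j].Init(slab + offset, &metas[j]);
  offset += FeatureNumBin(j) - bias[j];    // bias 1 drops default bin 0
}
// The CHECK(offset == num_total_bin) above guards that the spans tile exactly.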
void ResetConfig(const TreeConfig* tree_config, int array_size) { void ResetConfig(const TreeConfig* tree_config) {
for (int i = 0; i < cache_size_; ++i) { #pragma omp parallel for schedule(static)
auto data_ptr = pool_[i].get(); for (int i = 0; i < static_cast<int>(feature_metas_.size()); ++i) {
for (int j = 0; j < array_size; ++j) { feature_metas_[i].tree_config = tree_config;
data_ptr[j].ResetConfig(tree_config);
}
} }
} }
/*! /*!
...@@ -440,9 +336,9 @@ public: ...@@ -440,9 +336,9 @@ public:
inverse_mapper_[slot] = dst_idx; inverse_mapper_[slot] = dst_idx;
} }
private: private:
std::vector<std::unique_ptr<FeatureHistogram[]>> pool_; std::vector<std::unique_ptr<FeatureHistogram[]>> pool_;
std::function<FeatureHistogram*()> fill_func_; std::vector<std::vector<HistogramBinEntry>> data_;
std::vector<FeatureMetainfo> feature_metas_;
int cache_size_; int cache_size_;
int total_size_; int total_size_;
bool is_enough_ = false; bool is_enough_ = false;
...@@ -452,7 +348,5 @@ private: ...@@ -452,7 +348,5 @@ private:
int cur_time_ = 0; int cur_time_ = 0;
}; };
} // namespace LightGBM } // namespace LightGBM
#endif // LightGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ #endif // LightGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
...@@ -32,7 +32,7 @@ void FeatureParallelTreeLearner::BeforeTrain() { ...@@ -32,7 +32,7 @@ void FeatureParallelTreeLearner::BeforeTrain() {
if (is_feature_used_[i]) { if (is_feature_used_[i]) {
int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed)); int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
feature_distribution[cur_min_machine].push_back(i); feature_distribution[cur_min_machine].push_back(i);
num_bins_distributed[cur_min_machine] += train_data_->FeatureAt(i)->num_bin(); num_bins_distributed[cur_min_machine] += train_data_->FeatureNumBin(i);
is_feature_used_[i] = false; is_feature_used_[i] = false;
} }
} }
...@@ -43,23 +43,12 @@ void FeatureParallelTreeLearner::BeforeTrain() { ...@@ -43,23 +43,12 @@ void FeatureParallelTreeLearner::BeforeTrain() {
} }
void FeatureParallelTreeLearner::FindBestSplitsForLeaves() { void FeatureParallelTreeLearner::FindBestSplitsForLeaves() {
int smaller_best_feature = -1, larger_best_feature = -1;
SplitInfo smaller_best, larger_best; SplitInfo smaller_best, larger_best;
// get best split at smaller leaf // get best split at smaller leaf
std::vector<double> gains; smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()];
for (size_t i = 0; i < smaller_leaf_splits_->BestSplitPerFeature().size(); ++i) { // find local best split for larger leaf
gains.push_back(smaller_leaf_splits_->BestSplitPerFeature()[i].gain);
}
smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
smaller_best = smaller_leaf_splits_->BestSplitPerFeature()[smaller_best_feature];
// get best split at larger leaf
if (larger_leaf_splits_->LeafIndex() >= 0) { if (larger_leaf_splits_->LeafIndex() >= 0) {
gains.clear(); larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()];
for (size_t i = 0; i < larger_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(larger_leaf_splits_->BestSplitPerFeature()[i].gain);
}
larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
larger_best = larger_leaf_splits_->BestSplitPerFeature()[larger_best_feature];
} }
// sync global best info // sync global best info
std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo)); std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo));
......
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
#define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_ #define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
#include "data_partition.hpp"
#include "split_info.hpp" #include "split_info.hpp"
#include "data_partition.hpp"
#include <vector> #include <vector>
...@@ -17,10 +17,6 @@ public: ...@@ -17,10 +17,6 @@ public:
LeafSplits(int num_feature, data_size_t num_data) LeafSplits(int num_feature, data_size_t num_data)
:num_data_in_leaf_(num_data), num_data_(num_data), num_features_(num_feature), :num_data_in_leaf_(num_data), num_data_(num_data), num_features_(num_feature),
data_indices_(nullptr) { data_indices_(nullptr) {
best_split_per_feature_.resize(num_features_);
for (int i = 0; i < num_features_; ++i) {
best_split_per_feature_[i].feature = i;
}
} }
void ResetNumData(data_size_t num_data) { void ResetNumData(data_size_t num_data) {
num_data_ = num_data; num_data_ = num_data;
...@@ -42,9 +38,6 @@ public: ...@@ -42,9 +38,6 @@ public:
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
sum_gradients_ = sum_gradients; sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians; sum_hessians_ = sum_hessians;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
} }
/*! /*!
...@@ -65,9 +58,6 @@ public: ...@@ -65,9 +58,6 @@ public:
} }
sum_gradients_ = tmp_sum_gradients; sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians; sum_hessians_ = tmp_sum_hessians;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
} }
/*! /*!
...@@ -90,9 +80,6 @@ public: ...@@ -90,9 +80,6 @@ public:
} }
sum_gradients_ = tmp_sum_gradients; sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians; sum_hessians_ = tmp_sum_hessians;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
} }
...@@ -105,9 +92,6 @@ public: ...@@ -105,9 +92,6 @@ public:
leaf_index_ = 0; leaf_index_ = 0;
sum_gradients_ = sum_gradients; sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians; sum_hessians_ = sum_hessians;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
} }
/*! /*!
...@@ -115,13 +99,10 @@ public: ...@@ -115,13 +99,10 @@ public:
*/ */
void Init() { void Init() {
leaf_index_ = -1; leaf_index_ = -1;
for (SplitInfo& split_info : best_split_per_feature_) { data_indices_ = nullptr;
split_info.Reset(); num_data_in_leaf_ = 0;
}
} }
/*! \brief Get best splits on all features */
std::vector<SplitInfo>& BestSplitPerFeature() { return best_split_per_feature_;}
/*! \brief Get current leaf index */ /*! \brief Get current leaf index */
int LeafIndex() const { return leaf_index_; } int LeafIndex() const { return leaf_index_; }
...@@ -140,8 +121,6 @@ public: ...@@ -140,8 +121,6 @@ public:
private: private:
/*! \brief store best splits of all feature on current leaf */
std::vector<SplitInfo> best_split_per_feature_;
/*! \brief current leaf index */ /*! \brief current leaf index */
int leaf_index_; int leaf_index_;
/*! \brief number of data on current leaf */ /*! \brief number of data on current leaf */
......
...@@ -170,6 +170,10 @@ private: ...@@ -170,6 +170,10 @@ private:
std::unique_ptr<FeatureHistogram[]> smaller_leaf_histogram_array_global_; std::unique_ptr<FeatureHistogram[]> smaller_leaf_histogram_array_global_;
/*! \brief Store global histogram for larger leaf */ /*! \brief Store global histogram for larger leaf */
std::unique_ptr<FeatureHistogram[]> larger_leaf_histogram_array_global_; std::unique_ptr<FeatureHistogram[]> larger_leaf_histogram_array_global_;
std::vector<HistogramBinEntry> smaller_leaf_histogram_data_;
std::vector<HistogramBinEntry> larger_leaf_histogram_data_;
std::vector<FeatureMetainfo> feature_metas_;
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -10,10 +10,14 @@ namespace LightGBM { ...@@ -10,10 +10,14 @@ namespace LightGBM {
SerialTreeLearner::SerialTreeLearner(const TreeConfig* tree_config) SerialTreeLearner::SerialTreeLearner(const TreeConfig* tree_config)
:tree_config_(tree_config){ :tree_config_(tree_config){
random_ = Random(tree_config_->feature_fraction_seed); random_ = Random(tree_config_->feature_fraction_seed);
#pragma omp parallel
#pragma omp master
{
num_threads_ = omp_get_num_threads();
}
} }
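The parallel/master pair the constructor now uses is the portable way to learn how many threads an upcoming parallel region will get: omp_get_num_threads() returns 1 outside a parallel region, so one must be opened first. A self-contained demonstration of the idiom:

#include <cstdio>
#include <omp.h>

int main() {
  int num_threads = 1;
  // outside a parallel region omp_get_num_threads() reports 1,
  // so open a region and let only the master thread record the count
#pragma omp parallel
#pragma omp master
  {
    num_threads = omp_get_num_threads();
  }
  std::printf("parallel regions will use %d threads\n", num_threads);
  return 0;
}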
SerialTreeLearner::~SerialTreeLearner() { SerialTreeLearner::~SerialTreeLearner() {
} }
void SerialTreeLearner::Init(const Dataset* train_data) { void SerialTreeLearner::Init(const Dataset* train_data) {
...@@ -27,38 +31,23 @@ void SerialTreeLearner::Init(const Dataset* train_data) { ...@@ -27,38 +31,23 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
} else { } else {
size_t total_histogram_size = 0; size_t total_histogram_size = 0;
for (int i = 0; i < train_data_->num_features(); ++i) { for (int i = 0; i < train_data_->num_features(); ++i) {
total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureAt(i)->num_bin(); total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i);
} }
max_cache_size = static_cast<int>(tree_config_->histogram_pool_size * 1024 * 1024 / total_histogram_size); max_cache_size = static_cast<int>(tree_config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
} }
// need at least 2 leaves // need at least 2 leaves
max_cache_size = std::max(2, max_cache_size); max_cache_size = std::max(2, max_cache_size);
max_cache_size = std::min(max_cache_size, tree_config_->num_leaves); max_cache_size = std::min(max_cache_size, tree_config_->num_leaves);
histogram_pool_.Reset(max_cache_size, tree_config_->num_leaves);
auto histogram_create_function = [this]() {
auto tmp_histogram_array = std::unique_ptr<FeatureHistogram[]>(new FeatureHistogram[train_data_->num_features()]);
for (int j = 0; j < train_data_->num_features(); ++j) {
tmp_histogram_array[j].Init(train_data_->FeatureAt(j),
j, tree_config_);
}
return tmp_histogram_array.release();
};
histogram_pool_.Fill(histogram_create_function);
histogram_pool_.DynamicChangeSize(train_data_, tree_config_, max_cache_size, tree_config_->num_leaves);
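The cache size works out to roughly histogram_pool_size megabytes divided by the bytes one full histogram set occupies, clamped to [2, num_leaves]. A small worked sketch of that arithmetic; the entry size and per-feature bin counts below are illustrative, not taken from the commit:

#include <algorithm>
#include <cstdio>
#include <cstddef>

int main() {
  // illustrative stand-ins for the real configuration
  const double histogram_pool_size_mb = 256.0;
  const int num_leaves = 127;
  const std::size_t entry_size = 20;  // e.g. two doubles plus a count
  const int bins_per_feature[] = {255, 255, 63, 15};

  std::size_t total_histogram_size = 0;
  for (int bins : bins_per_feature) {
    total_histogram_size += entry_size * bins;
  }

  int max_cache_size = static_cast<int>(
      histogram_pool_size_mb * 1024 * 1024 / total_histogram_size);
  // need at least 2 cached leaves, and never more slots than leaves
  max_cache_size = std::max(2, max_cache_size);
  max_cache_size = std::min(max_cache_size, num_leaves);

  std::printf("one histogram set: %zu bytes, cached sets: %d\n",
              total_histogram_size, max_cache_size);
  return 0;
}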
// push split information for all leaves // push split information for all leaves
best_split_per_leaf_.resize(tree_config_->num_leaves); best_split_per_leaf_.resize(tree_config_->num_leaves);
// initialize ordered_bins_ with nullptr
ordered_bins_.resize(num_features_);
// get ordered bin // get ordered bin
#pragma omp parallel for schedule(guided) train_data_->CreateOrderedBins(&ordered_bins_);
for (int i = 0; i < num_features_; ++i) {
ordered_bins_[i].reset(train_data_->FeatureAt(i)->bin_data()->CreateOrderedBin());
}
// check whether any ordered bin exists // check whether any ordered bin exists
for (int i = 0; i < num_features_; ++i) { for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
if (ordered_bins_[i] != nullptr) { if (ordered_bins_[i] != nullptr) {
has_ordered_bin_ = true; has_ordered_bin_ = true;
break; break;
...@@ -70,17 +59,16 @@ void SerialTreeLearner::Init(const Dataset* train_data) { ...@@ -70,17 +59,16 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
// initialize data partition // initialize data partition
data_partition_.reset(new DataPartition(num_data_, tree_config_->num_leaves)); data_partition_.reset(new DataPartition(num_data_, tree_config_->num_leaves));
is_feature_used_.resize(num_features_); is_feature_used_.resize(num_features_);
// initialize ordered gradients and hessians // initialize ordered gradients and hessians
ordered_gradients_.resize(num_data_); ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_); ordered_hessians_.resize(num_data_);
// if there are ordered bins, allocate a buffer for fast splitting // if there are ordered bins, allocate a buffer for fast splitting
if (has_ordered_bin_) { if (has_ordered_bin_) {
is_data_in_leaf_.resize(num_data_); is_data_in_leaf_.resize(num_data_);
std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), 0);
} }
Log::Info("Number of data: %d, number of features: %d", num_data_, num_features_); Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_);
} }
void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
...@@ -88,17 +76,12 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { ...@@ -88,17 +76,12 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
num_data_ = train_data_->num_data(); num_data_ = train_data_->num_data();
num_features_ = train_data_->num_features(); num_features_ = train_data_->num_features();
// initialize ordered_bins_ with nullptr
ordered_bins_.resize(num_features_);
// get ordered bin // get ordered bin
#pragma omp parallel for schedule(guided) train_data_->CreateOrderedBins(&ordered_bins_);
for (int i = 0; i < num_features_; ++i) {
ordered_bins_[i].reset(train_data_->FeatureAt(i)->bin_data()->CreateOrderedBin());
}
has_ordered_bin_ = false; has_ordered_bin_ = false;
// check whether any ordered bin exists // check whether any ordered bin exists
for (int i = 0; i < num_features_; ++i) { for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
if (ordered_bins_[i] != nullptr) { if (ordered_bins_[i] != nullptr) {
has_ordered_bin_ = true; has_ordered_bin_ = true;
break; break;
...@@ -119,6 +102,7 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) { ...@@ -119,6 +102,7 @@ void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
// if there are ordered bins, allocate a buffer for fast splitting // if there are ordered bins, allocate a buffer for fast splitting
if (has_ordered_bin_) { if (has_ordered_bin_) {
is_data_in_leaf_.resize(num_data_); is_data_in_leaf_.resize(num_data_);
std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), 0);
} }
} }
...@@ -133,14 +117,14 @@ void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) { ...@@ -133,14 +117,14 @@ void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) {
} else { } else {
size_t total_histogram_size = 0; size_t total_histogram_size = 0;
for (int i = 0; i < train_data_->num_features(); ++i) { for (int i = 0; i < train_data_->num_features(); ++i) {
total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureAt(i)->num_bin(); total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i);
} }
max_cache_size = static_cast<int>(tree_config_->histogram_pool_size * 1024 * 1024 / total_histogram_size); max_cache_size = static_cast<int>(tree_config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
} }
// need at least 2 leaves // need at least 2 leaves
max_cache_size = std::max(2, max_cache_size); max_cache_size = std::max(2, max_cache_size);
max_cache_size = std::min(max_cache_size, tree_config_->num_leaves); max_cache_size = std::min(max_cache_size, tree_config_->num_leaves);
histogram_pool_.DynamicChangeSize(max_cache_size, tree_config_->num_leaves); histogram_pool_.DynamicChangeSize(train_data_, tree_config_, max_cache_size, tree_config_->num_leaves);
// push split information for all leaves // push split information for all leaves
best_split_per_leaf_.resize(tree_config_->num_leaves); best_split_per_leaf_.resize(tree_config_->num_leaves);
...@@ -149,7 +133,7 @@ void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) { ...@@ -149,7 +133,7 @@ void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) {
tree_config_ = tree_config; tree_config_ = tree_config;
} }
histogram_pool_.ResetConfig(tree_config_, train_data_->num_features()); histogram_pool_.ResetConfig(tree_config_);
} }
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) { Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) {
...@@ -164,7 +148,7 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians ...@@ -164,7 +148,7 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
int left_leaf = 0; int left_leaf = 0;
// only the root leaf can be split the first time // only the root leaf can be split the first time
int right_leaf = -1; int right_leaf = -1;
for (int split = 0; split < tree_config_->num_leaves - 1; split++) { for (int split = 0; split < tree_config_->num_leaves - 1; ++split) {
// some initial work before finding the best split // some initial work before finding the best split
if (BeforeFindBestSplit(left_leaf, right_leaf)) { if (BeforeFindBestSplit(left_leaf, right_leaf)) {
// find best threshold for every feature // find best threshold for every feature
...@@ -192,15 +176,22 @@ void SerialTreeLearner::BeforeTrain() { ...@@ -192,15 +176,22 @@ void SerialTreeLearner::BeforeTrain() {
// reset histogram pool // reset histogram pool
histogram_pool_.ResetMap(); histogram_pool_.ResetMap();
// initialize used features
for (int i = 0; i < num_features_; ++i) {
is_feature_used_[i] = false;
}
// Get used feature at current tree
int used_feature_cnt = static_cast<int>(num_features_*tree_config_->feature_fraction); int used_feature_cnt = static_cast<int>(num_features_*tree_config_->feature_fraction);
auto used_feature_indices = random_.Sample(num_features_, used_feature_cnt);
for (auto idx : used_feature_indices) { if (used_feature_cnt < num_features_) {
is_feature_used_[idx] = true; // initialize used features
std::memset(is_feature_used_.data(), 0, sizeof(int8_t) * num_features_);
// Get used feature at current tree
auto used_feature_indices = random_.Sample(num_features_, used_feature_cnt);
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(used_feature_indices.size()); ++i) {
is_feature_used_[used_feature_indices[i]] = 1;
}
} else {
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_features_; ++i) {
is_feature_used_[i] = 1;
}
} }
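Feature-fraction sampling now skips the whole sampling pass when every feature is used; otherwise it zeroes the flags and marks a random subset. A sketch of the same flow, using a shuffle-and-take-k draw in place of random_.Sample (LightGBM's own helper):

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <random>
#include <vector>

int main() {
  const int num_features = 10;
  const double feature_fraction = 0.5;
  std::vector<char> is_feature_used(num_features, 0);

  int used_feature_cnt = static_cast<int>(num_features * feature_fraction);
  if (used_feature_cnt < num_features) {
    // sample a subset without replacement: shuffle indices, take the first k
    std::vector<int> indices(num_features);
    std::iota(indices.begin(), indices.end(), 0);
    std::mt19937 rng(42);  // stand-in for feature_fraction_seed
    std::shuffle(indices.begin(), indices.end(), rng);
    for (int i = 0; i < used_feature_cnt; ++i) {
      is_feature_used[indices[i]] = 1;
    }
  } else {
    std::fill(is_feature_used.begin(), is_feature_used.end(), 1);
  }

  for (int i = 0; i < num_features; ++i) {
    std::printf("feature %d used: %d\n", i, is_feature_used[i]);
  }
  return 0;
}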
// initialize data partition // initialize data partition
...@@ -215,28 +206,12 @@ void SerialTreeLearner::BeforeTrain() { ...@@ -215,28 +206,12 @@ void SerialTreeLearner::BeforeTrain() {
if (data_partition_->leaf_count(0) == num_data_) { if (data_partition_->leaf_count(0) == num_data_) {
// use all data // use all data
smaller_leaf_splits_->Init(gradients_, hessians_); smaller_leaf_splits_->Init(gradients_, hessians_);
// point to gradients, avoid copy
ptr_to_ordered_gradients_smaller_leaf_ = gradients_;
ptr_to_ordered_hessians_smaller_leaf_ = hessians_;
} else { } else {
// use bagging, only use part of data // use bagging, only use part of data
smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_); smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_);
// copy used gradients and hessians to ordered buffer
const data_size_t* indices = data_partition_->indices();
data_size_t cnt = data_partition_->leaf_count(0);
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < cnt; ++i) {
ordered_gradients_[i] = gradients_[indices[i]];
ordered_hessians_[i] = hessians_[indices[i]];
}
// point to ordered_gradients_ and ordered_hessians_
ptr_to_ordered_gradients_smaller_leaf_ = ordered_gradients_.data();
ptr_to_ordered_hessians_smaller_leaf_ = ordered_hessians_.data();
} }
ptr_to_ordered_gradients_larger_leaf_ = nullptr;
ptr_to_ordered_hessians_larger_leaf_ = nullptr;
larger_leaf_splits_->Init(); larger_leaf_splits_->Init();
// if there are ordered bins, initialize them // if there are ordered bins, initialize them
...@@ -244,16 +219,16 @@ void SerialTreeLearner::BeforeTrain() { ...@@ -244,16 +219,16 @@ void SerialTreeLearner::BeforeTrain() {
if (data_partition_->leaf_count(0) == num_data_) { if (data_partition_->leaf_count(0) == num_data_) {
// use all data, pass nullptr // use all data, pass nullptr
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < num_features_; ++i) { for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
if (ordered_bins_[i] != nullptr) { auto ptr = ordered_bins_[i].get();
ordered_bins_[i]->Init(nullptr, tree_config_->num_leaves); if (ptr != nullptr) {
ptr->Init(nullptr, tree_config_->num_leaves);
} }
} }
} else { } else {
// bagging, only use part of data // bagging, only use part of data
// mark used data // mark used data
std::memset(is_data_in_leaf_.data(), 0, sizeof(char)*num_data_);
const data_size_t* indices = data_partition_->indices(); const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(0); data_size_t begin = data_partition_->leaf_begin(0);
data_size_t end = begin + data_partition_->leaf_count(0); data_size_t end = begin + data_partition_->leaf_count(0);
...@@ -263,11 +238,16 @@ void SerialTreeLearner::BeforeTrain() { ...@@ -263,11 +238,16 @@ void SerialTreeLearner::BeforeTrain() {
} }
// initialize ordered bin // initialize ordered bin
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < num_features_; ++i) { for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
if (ordered_bins_[i] != nullptr) { auto ptr = ordered_bins_[i].get();
ordered_bins_[i]->Init(is_data_in_leaf_.data(), tree_config_->num_leaves); if (ptr != nullptr) {
ptr->Init(is_data_in_leaf_.data(), tree_config_->num_leaves);
} }
} }
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 0;
}
} }
} }
} }
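Instead of memset-ing the whole is_data_in_leaf_ buffer on every call, the new code clears only the indices it previously set, which is O(leaf size) rather than O(num_data). A stripped-down sketch of that mark/use/unmark pattern:

#include <cstdio>
#include <vector>

int main() {
  const int num_data = 1000;
  std::vector<char> is_data_in_leaf(num_data, 0);  // zeroed once, up front

  // indices of the data points in the leaf being processed (illustrative)
  std::vector<int> indices = {3, 42, 7, 999, 512};

  // mark
  for (int idx : indices) is_data_in_leaf[idx] = 1;

  // ... consumers (e.g. ordered-bin Init/Split) read the marks here ...
  int marked = 0;
  for (int idx : indices) marked += is_data_in_leaf[idx];
  std::printf("marked %d of %zu indices\n", marked, indices.size());

  // unmark only what we touched, so the buffer is clean for the next leaf
  for (int idx : indices) is_data_in_leaf[idx] = 0;
  return 0;
}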
...@@ -296,174 +276,164 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) { ...@@ -296,174 +276,164 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
return false; return false;
} }
parent_leaf_histogram_array_ = nullptr; parent_leaf_histogram_array_ = nullptr;
// -1 if only has one leaf. else equal the index of smaller leaf
int smaller_leaf = -1;
int larger_leaf = -1;
// only have root // only have root
if (right_leaf < 0) { if (right_leaf < 0) {
histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_); histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
larger_leaf_histogram_array_ = nullptr; larger_leaf_histogram_array_ = nullptr;
} else if (num_data_in_left_child < num_data_in_right_child) { } else if (num_data_in_left_child < num_data_in_right_child) {
smaller_leaf = left_leaf;
larger_leaf = right_leaf;
// put parent(left) leaf's histograms into larger leaf's histograms // put parent(left) leaf's histograms into larger leaf's histograms
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; } if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Move(left_leaf, right_leaf); histogram_pool_.Move(left_leaf, right_leaf);
histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_); histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
} else { } else {
smaller_leaf = right_leaf;
larger_leaf = left_leaf;
// put parent(left) leaf's histograms to larger leaf's histograms // put parent(left) leaf's histograms to larger leaf's histograms
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; } if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_); histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_);
} }
// init for the ordered gradients, only initialize when have 2 leaves
if (smaller_leaf >= 0) {
// only need to initialize for smaller leaf
// Get leaf boundary
const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(smaller_leaf);
data_size_t end = begin + data_partition_->leaf_count(smaller_leaf);
// copy
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
ordered_gradients_[i - begin] = gradients_[indices[i]];
ordered_hessians_[i - begin] = hessians_[indices[i]];
}
// assign pointer
ptr_to_ordered_gradients_smaller_leaf_ = ordered_gradients_.data();
ptr_to_ordered_hessians_smaller_leaf_ = ordered_hessians_.data();
if (parent_leaf_histogram_array_ == nullptr) {
// need order gradient for larger leaf
data_size_t smaller_size = end - begin;
data_size_t larger_begin = data_partition_->leaf_begin(larger_leaf);
data_size_t larger_end = larger_begin + data_partition_->leaf_count(larger_leaf);
// copy
#pragma omp parallel for schedule(static)
for (data_size_t i = larger_begin; i < larger_end; ++i) {
ordered_gradients_[smaller_size + i - larger_begin] = gradients_[indices[i]];
ordered_hessians_[smaller_size + i - larger_begin] = hessians_[indices[i]];
}
ptr_to_ordered_gradients_larger_leaf_ = ordered_gradients_.data() + smaller_size;
ptr_to_ordered_hessians_larger_leaf_ = ordered_hessians_.data() + smaller_size;
}
}
// split for the ordered bin // split for the ordered bin
if (has_ordered_bin_ && right_leaf >= 0) { if (has_ordered_bin_ && right_leaf >= 0) {
// mark the data in the left leaf // mark the data in the left leaf
std::memset(is_data_in_leaf_.data(), 0, sizeof(char)*num_data_);
const data_size_t* indices = data_partition_->indices(); const data_size_t* indices = data_partition_->indices();
const auto left_cnt = data_partition_->leaf_count(left_leaf);
const auto right_cnt = data_partition_->leaf_count(right_leaf);
char mark = 1;
data_size_t begin = data_partition_->leaf_begin(left_leaf); data_size_t begin = data_partition_->leaf_begin(left_leaf);
data_size_t end = begin + data_partition_->leaf_count(left_leaf); data_size_t end = begin + left_cnt;
if (left_cnt > right_cnt) {
begin = data_partition_->leaf_begin(right_leaf);
end = begin + right_cnt;
mark = 0;
}
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) { for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1; is_data_in_leaf_[indices[i]] = 1;
} }
// split the ordered bin // split the ordered bin
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < num_features_; ++i) { for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
if (ordered_bins_[i] != nullptr) { auto ptr = ordered_bins_[i].get();
ordered_bins_[i]->Split(left_leaf, right_leaf, is_data_in_leaf_.data()); if (ptr != nullptr) {
ptr->Split(left_leaf, right_leaf, is_data_in_leaf_.data(), mark);
} }
} }
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 0;
}
} }
return true; return true;
} }
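Marking the smaller of the two children keeps the marking cost proportional to the smaller leaf, and the `mark` flag tells the consumer which polarity the marks carry (1 means a marked entry is in the left leaf, 0 means marked entries are in the right leaf). A sketch of just the selection logic, with illustrative leaf sizes:

#include <cstdio>

int main() {
  // illustrative leaf sizes and offsets after a split
  int left_cnt = 900, right_cnt = 100;
  int left_begin = 0, right_begin = 900;

  // mark == 1: marked indices belong to the left leaf
  // mark == 0: marked indices belong to the right leaf, so an unmarked
  //            entry means "left" when the consumer splits the bins
  char mark = 1;
  int begin = left_begin, count = left_cnt;
  if (left_cnt > right_cnt) {
    begin = right_begin;
    count = right_cnt;
    mark = 0;
  }
  std::printf("marking %d entries starting at %d, mark=%d\n",
              count, begin, mark);
  return 0;
}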
void SerialTreeLearner::FindBestThresholds() { void SerialTreeLearner::FindBestThresholds() {
#pragma omp parallel for schedule(guided) std::vector<int8_t> is_feature_used(num_features_, 0);
for (int feature_index = 0; feature_index < num_features_; feature_index++) { #pragma omp parallel for schedule(guided)
// feature is not used for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if ((!is_feature_used_.empty() && is_feature_used_[feature_index] == false)) continue; if (!is_feature_used_[feature_index]) continue;
// if parent(larger) leaf cannot split at current feature if (parent_leaf_histogram_array_ != nullptr
if (parent_leaf_histogram_array_ != nullptr && !parent_leaf_histogram_array_[feature_index].is_splittable()) { && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
smaller_leaf_histogram_array_[feature_index].set_is_splittable(false); smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
continue; continue;
} }
is_feature_used[feature_index] = 1;
}
bool use_subtract = true;
if (parent_leaf_histogram_array_ == nullptr) {
use_subtract = false;
}
// construct smaller leaf
HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
train_data_->ConstructHistograms(is_feature_used,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
ptr_smaller_leaf_hist_data);
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf
HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
train_data_->ConstructHistograms(is_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
ptr_larger_leaf_hist_data);
}
std::vector<SplitInfo> smaller_best(num_threads_);
std::vector<SplitInfo> larger_best(num_threads_);
// find splits
#pragma omp parallel for schedule(guided)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used[feature_index]) { continue; }
const int tid = omp_get_thread_num();
SplitInfo smaller_split;
train_data_->FixHistogram(feature_index,
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_histogram_array_[feature_index].RawData());
// construct histograms for smaller leaf
if (ordered_bins_[feature_index] == nullptr) {
// if not use ordered bin
train_data_->FeatureAt(feature_index)->bin_data()->ConstructHistogram(
smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(),
ptr_to_ordered_gradients_smaller_leaf_,
ptr_to_ordered_hessians_smaller_leaf_,
smaller_leaf_histogram_array_[feature_index].GetData());
} else {
// used ordered bin
ordered_bins_[feature_index]->ConstructHistogram(smaller_leaf_splits_->LeafIndex(),
gradients_,
hessians_,
smaller_leaf_histogram_array_[feature_index].GetData());
}
// find best threshold for smaller child
smaller_leaf_histogram_array_[feature_index].FindBestThreshold( smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(), smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(), smaller_leaf_splits_->num_data_in_leaf(),
&smaller_leaf_splits_->BestSplitPerFeature()[feature_index]); &smaller_split);
if (smaller_split.gain > smaller_best[tid].gain) {
smaller_best[tid] = smaller_split;
}
// only the root leaf exists so far // only the root leaf exists so far
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue; if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
if (parent_leaf_histogram_array_ != nullptr) { if (use_subtract) {
// construct histograms for the larger leaf; we initialize the larger leaf as the parent,
// so we can just subtract the smaller leaf's histograms
larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]); larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
} else { } else {
if (ordered_bins_[feature_index] == nullptr) { train_data_->FixHistogram(feature_index, larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(),
// if not use ordered bin larger_leaf_splits_->num_data_in_leaf(),
train_data_->FeatureAt(feature_index)->bin_data()->ConstructHistogram( larger_leaf_histogram_array_[feature_index].RawData());
larger_leaf_splits_->data_indices(),
larger_leaf_splits_->num_data_in_leaf(),
ptr_to_ordered_gradients_larger_leaf_,
ptr_to_ordered_hessians_larger_leaf_,
larger_leaf_histogram_array_[feature_index].GetData());
} else {
// used ordered bin
ordered_bins_[feature_index]->ConstructHistogram(larger_leaf_splits_->LeafIndex(),
gradients_,
hessians_,
larger_leaf_histogram_array_[feature_index].GetData());
}
} }
SplitInfo larger_split;
// find best threshold for larger child // find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold( larger_leaf_histogram_array_[feature_index].FindBestThreshold(
larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(), larger_leaf_splits_->sum_hessians(),
larger_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf(),
&larger_leaf_splits_->BestSplitPerFeature()[feature_index]); &larger_split);
if (larger_split.gain > larger_best[tid].gain) {
larger_best[tid] = larger_split;
}
} }
auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
int leaf = smaller_leaf_splits_->LeafIndex();
best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];
if (larger_leaf_splits_ != nullptr && larger_leaf_splits_->LeafIndex() >= 0) {
leaf = larger_leaf_splits_->LeafIndex();
auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
}
}
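The rewritten loop finds a global argmax under OpenMP without locks: each thread keeps its own best split, and a serial pass over the per-thread slots picks the winner afterwards. A minimal sketch of the pattern, with a gain-only stand-in for SplitInfo and a deterministic fake gain:

#include <cstdio>
#include <vector>
#include <omp.h>

struct Split { int feature = -1; double gain = -1e300; };

int main() {
  const int num_features = 64;
  int num_threads = 1;
#pragma omp parallel
#pragma omp master
  { num_threads = omp_get_num_threads(); }

  std::vector<Split> best_per_thread(num_threads);

#pragma omp parallel for schedule(guided)
  for (int fid = 0; fid < num_features; ++fid) {
    const int tid = omp_get_thread_num();
    // fake gain: in the learner this comes from FindBestThreshold
    Split cur{fid, static_cast<double>((fid * 37) % 101)};
    if (cur.gain > best_per_thread[tid].gain) {
      best_per_thread[tid] = cur;  // thread-local slot, so no race
    }
  }

  // serial reduction over the per-thread bests
  Split best;
  for (const Split& s : best_per_thread) {
    if (s.gain > best.gain) best = s;
  }
  std::printf("best feature %d with gain %f\n", best.feature, best.gain);
  return 0;
}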
void SerialTreeLearner::FindBestSplitsForLeaves() {
} }
void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf]; const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
// left = parent // left = parent
*left_leaf = best_Leaf; *left_leaf = best_Leaf;
// split tree, will return right leaf // split tree, will return right leaf
*right_leaf = tree->Split(best_Leaf, best_split_info.feature, *right_leaf = tree->Split(best_Leaf, best_split_info.feature,
train_data_->FeatureAt(best_split_info.feature)->bin_type(),
best_split_info.threshold, best_split_info.threshold,
train_data_->FeatureAt(best_split_info.feature)->feature_index(), train_data_->RealFeatureIndex(best_split_info.feature),
train_data_->FeatureAt(best_split_info.feature)->BinToValue(best_split_info.threshold), train_data_->RealThreshold(best_split_info.feature, best_split_info.threshold),
static_cast<double>(best_split_info.left_output), static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output), static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count), static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count), static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.gain)); static_cast<double>(best_split_info.gain));
// split data partition // split data partition
data_partition_->Split(best_Leaf, train_data_->FeatureAt(best_split_info.feature)->bin_data(), data_partition_->Split(best_Leaf, train_data_, best_split_info.feature,
best_split_info.threshold, *right_leaf); best_split_info.threshold, *right_leaf);
// init the leaves that used on next iteration // init the leaves that used on next iteration
...@@ -480,4 +450,5 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri ...@@ -480,4 +450,5 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
} }
} }
} // namespace LightGBM } // namespace LightGBM
...@@ -7,10 +7,10 @@ ...@@ -7,10 +7,10 @@
#include <LightGBM/tree_learner.h> #include <LightGBM/tree_learner.h>
#include <LightGBM/dataset.h> #include <LightGBM/dataset.h>
#include <LightGBM/tree.h> #include <LightGBM/tree.h>
#include <LightGBM/feature.h>
#include "feature_histogram.hpp" #include "feature_histogram.hpp"
#include "data_partition.hpp"
#include "split_info.hpp" #include "split_info.hpp"
#include "data_partition.hpp"
#include "leaf_splits.hpp" #include "leaf_splits.hpp"
#include <cstdio> #include <cstdio>
...@@ -77,7 +77,7 @@ protected: ...@@ -77,7 +77,7 @@ protected:
* \brief Find best splits for leaves from smaller_leaf_splits_ and larger_leaf_splits_. * \brief Find best splits for leaves from smaller_leaf_splits_ and larger_leaf_splits_.
* This function will be called after FindBestThresholds. * This function will be called after FindBestThresholds.
*/ */
inline virtual void FindBestSplitsForLeaves(); virtual void FindBestSplitsForLeaves();
/*! /*!
* \brief Partition tree and data according to the best split. * \brief Partition tree and data according to the best split.
...@@ -95,12 +95,6 @@ protected: ...@@ -95,12 +95,6 @@ protected:
*/ */
inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const; inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;
/*!
* \brief Find best features for leaf from leaf_splits
* \param leaf_splits
*/
inline void FindBestSplitForLeaf(LeafSplits* leaf_splits);
/*! \brief Last trained decision tree */ /*! \brief Last trained decision tree */
const Tree* last_trained_tree_; const Tree* last_trained_tree_;
/*! \brief number of data */ /*! \brief number of data */
...@@ -118,7 +112,7 @@ protected: ...@@ -118,7 +112,7 @@ protected:
/*! \brief used to generate the used-feature subset */ /*! \brief used to generate the used-feature subset */
Random random_; Random random_;
/*! \brief used for sub feature training, is_feature_used_[i] = false means don't use feature i */ /*! \brief used for sub feature training, is_feature_used_[i] = false means don't use feature i */
std::vector<bool> is_feature_used_; std::vector<int8_t> is_feature_used_;
/*! \brief pointer to histograms array of parent of current leaves */ /*! \brief pointer to histograms array of parent of current leaves */
FeatureHistogram* parent_leaf_histogram_array_; FeatureHistogram* parent_leaf_histogram_array_;
/*! \brief pointer to histograms array of smaller leaf */ /*! \brief pointer to histograms array of smaller leaf */
...@@ -139,15 +133,6 @@ protected: ...@@ -139,15 +133,6 @@ protected:
/*! \brief hessians of current iteration, ordered for cache optimization */ /*! \brief hessians of current iteration, ordered for cache optimization */
std::vector<score_t> ordered_hessians_; std::vector<score_t> ordered_hessians_;
/*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
const score_t* ptr_to_ordered_gradients_smaller_leaf_;
/*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain*/
const score_t* ptr_to_ordered_hessians_smaller_leaf_;
/*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
const score_t* ptr_to_ordered_gradients_larger_leaf_;
/*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain*/
const score_t* ptr_to_ordered_hessians_larger_leaf_;
/*! \brief Store ordered bin */ /*! \brief Store ordered bin */
std::vector<std::unique_ptr<OrderedBin>> ordered_bins_; std::vector<std::unique_ptr<OrderedBin>> ordered_bins_;
/*! \brief True if has ordered bin */ /*! \brief True if has ordered bin */
...@@ -158,15 +143,9 @@ protected: ...@@ -158,15 +143,9 @@ protected:
HistogramPool histogram_pool_; HistogramPool histogram_pool_;
/*! \brief config of tree learner*/ /*! \brief config of tree learner*/
const TreeConfig* tree_config_; const TreeConfig* tree_config_;
int num_threads_;
}; };
inline void SerialTreeLearner::FindBestSplitsForLeaves() {
FindBestSplitForLeaf(smaller_leaf_splits_.get());
FindBestSplitForLeaf(larger_leaf_splits_.get());
}
inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) const { inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) const {
if (leafIdx >= 0) { if (leafIdx >= 0) {
return data_partition_->leaf_count(leafIdx); return data_partition_->leaf_count(leafIdx);
...@@ -175,19 +154,5 @@ inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) cons ...@@ -175,19 +154,5 @@ inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) cons
} }
} }
inline void SerialTreeLearner::FindBestSplitForLeaf(LeafSplits* leaf_splits) {
if (leaf_splits == nullptr || leaf_splits->LeafIndex() < 0) {
return;
}
std::vector<double> gains;
for (size_t i = 0; i < leaf_splits->BestSplitPerFeature().size(); ++i) {
gains.push_back(leaf_splits->BestSplitPerFeature()[i].gain);
}
int best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
int leaf = leaf_splits->LeafIndex();
best_split_per_leaf_[leaf] = leaf_splits->BestSplitPerFeature()[best_feature];
best_split_per_leaf_[leaf].feature = best_feature;
}
} // namespace LightGBM } // namespace LightGBM
#endif // LightGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_ #endif // LightGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
...@@ -53,6 +53,8 @@ public: ...@@ -53,6 +53,8 @@ public:
inline bool operator > (const SplitInfo &si) const; inline bool operator > (const SplitInfo &si) const;
inline bool operator == (const SplitInfo &si) const;
inline static void MaxReducer(const char* src, char* dst, int len) { inline static void MaxReducer(const char* src, char* dst, int len) {
const int type_size = sizeof(SplitInfo); const int type_size = sizeof(SplitInfo);
int used_size = 0; int used_size = 0;
...@@ -103,5 +105,34 @@ inline bool SplitInfo::operator > (const SplitInfo& si) const { ...@@ -103,5 +105,34 @@ inline bool SplitInfo::operator > (const SplitInfo& si) const {
} }
} }
inline bool SplitInfo::operator == (const SplitInfo& si) const {
double local_gain = this->gain;
double other_gain = si.gain;
// replace nan with -inf (x == NAN is always false, so test with std::isnan)
if (std::isnan(local_gain)) {
local_gain = kMinScore;
}
// replace nan with -inf
if (std::isnan(other_gain)) {
other_gain = kMinScore;
}
int local_feature = this->feature;
int other_feature = si.feature;
// replace -1 with max int
if (local_feature == -1) {
local_feature = INT32_MAX;
}
// replace -1 with max int
if (other_feature == -1) {
other_feature = INT32_MAX;
}
if (local_gain != other_gain) {
return false;
} else {
// if the gains are equal, equality reduces to comparing features
return local_feature == other_feature;
}
}
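Comparisons against NAN are always false in IEEE arithmetic, which is why the guards above must use std::isnan rather than ==. A two-line demonstration:

#include <cmath>
#include <cstdio>

int main() {
  double x = std::nan("");
  std::printf("x == x: %d\n", x == x);                // 0: NaN compares unequal to everything, itself included
  std::printf("std::isnan(x): %d\n", std::isnan(x));  // 1: the correct test
  return 0;
}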
} // namespace LightGBM } // namespace LightGBM
#endif // LightGBM_TREELEARNER_SPLIT_INFO_HPP_ #endif // LightGBM_TREELEARNER_SPLIT_INFO_HPP_
...@@ -26,8 +26,8 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) { ...@@ -26,8 +26,8 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
// get max bin // get max bin
int max_bin = 0; int max_bin = 0;
for (int i = 0; i < num_features_; ++i) { for (int i = 0; i < num_features_; ++i) {
if (max_bin < train_data_->FeatureAt(i)->num_bin()) { if (max_bin < train_data_->FeatureNumBin(i)) {
max_bin = train_data_->FeatureAt(i)->num_bin(); max_bin = train_data_->FeatureNumBin(i);
} }
} }
// calculate buffer size // calculate buffer size
...@@ -53,14 +53,39 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) { ...@@ -53,14 +53,39 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
local_tree_config_.min_data_in_leaf /= num_machines_; local_tree_config_.min_data_in_leaf /= num_machines_;
local_tree_config_.min_sum_hessian_in_leaf /= num_machines_; local_tree_config_.min_sum_hessian_in_leaf /= num_machines_;
histogram_pool_.ResetConfig(&local_tree_config_, train_data_->num_features()); histogram_pool_.ResetConfig(&local_tree_config_);
// initialize histograms for global // initialize histograms for global
smaller_leaf_histogram_array_global_.reset(new FeatureHistogram[num_features_]); smaller_leaf_histogram_array_global_.reset(new FeatureHistogram[num_features_]);
larger_leaf_histogram_array_global_.reset(new FeatureHistogram[num_features_]); larger_leaf_histogram_array_global_.reset(new FeatureHistogram[num_features_]);
for (int j = 0; j < num_features_; ++j) { int num_total_bin = 0;
smaller_leaf_histogram_array_global_[j].Init(train_data_->FeatureAt(j), j, tree_config_); for (int i = 0; i < num_features_; ++i) {
larger_leaf_histogram_array_global_[j].Init(train_data_->FeatureAt(j), j, tree_config_); num_total_bin += train_data_->FeatureNumBin(i);
}
smaller_leaf_histogram_data_.resize(num_total_bin);
larger_leaf_histogram_data_.resize(num_total_bin);
feature_metas_.resize(train_data->num_features());
#pragma omp parallel for schedule(static)
for (int i = 0; i < train_data->num_features(); ++i) {
feature_metas_[i].feature_idx = i;
feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
feature_metas_[i].bias = 1;
} else {
feature_metas_[i].bias = 0;
}
feature_metas_[i].tree_config = tree_config_;
}
uint64_t offset = 0;
for (int j = 0; j < train_data->num_features(); ++j) {
offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
smaller_leaf_histogram_array_global_[j].Init(smaller_leaf_histogram_data_.data() + offset, &feature_metas_[j]);
larger_leaf_histogram_array_global_[j].Init(larger_leaf_histogram_data_.data() + offset, &feature_metas_[j]);
auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) {
num_bin -= 1;
}
offset += static_cast<uint64_t>(num_bin);
} }
} }
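The global histogram arrays now live in one contiguous backing vector; each feature's FeatureHistogram is just a view at a running offset, and a feature whose default bin is 0 contributes one bin fewer (the bias). A sketch of that offset layout under those assumptions; the bin counts and default bins below are illustrative:

#include <cstdio>
#include <vector>
#include <cstddef>

int main() {
  // illustrative per-feature bin counts and default bins
  struct FeatureMeta { int num_bin; int default_bin; };
  std::vector<FeatureMeta> metas = {{255, 0}, {63, 5}, {15, 0}};

  // the total bin count sizes the shared backing buffer
  int num_total_bin = 0;
  for (const auto& m : metas) num_total_bin += m.num_bin;
  std::vector<double> histogram_data(num_total_bin, 0.0);

  // each feature's histogram is a view at a running offset; features whose
  // default bin is 0 skip that bin, so they occupy num_bin - 1 entries
  std::size_t offset = 0;
  for (const auto& m : metas) {
    double* view = histogram_data.data() + offset;
    int used_bins = m.num_bin - (m.default_bin == 0 ? 1 : 0);
    std::printf("feature view at offset %zu, %d bins\n", offset, used_bins);
    offset += used_bins;
    (void)view;  // a real learner would hand this pointer to the histogram
  }
  return 0;
}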
...@@ -71,12 +96,11 @@ void VotingParallelTreeLearner::ResetConfig(const TreeConfig* tree_config) { ...@@ -71,12 +96,11 @@ void VotingParallelTreeLearner::ResetConfig(const TreeConfig* tree_config) {
local_tree_config_.min_data_in_leaf /= num_machines_; local_tree_config_.min_data_in_leaf /= num_machines_;
local_tree_config_.min_sum_hessian_in_leaf /= num_machines_; local_tree_config_.min_sum_hessian_in_leaf /= num_machines_;
histogram_pool_.ResetConfig(&local_tree_config_, train_data_->num_features()); histogram_pool_.ResetConfig(&local_tree_config_);
global_data_count_in_leaf_.resize(tree_config_->num_leaves); global_data_count_in_leaf_.resize(tree_config_->num_leaves);
for (int j = 0; j < num_features_; ++j) { for (size_t i = 0; i < feature_metas_.size(); ++i) {
smaller_leaf_histogram_array_global_[j].ResetConfig(tree_config_); feature_metas_[i].tree_config = tree_config_;
larger_leaf_histogram_array_global_[j].ResetConfig(tree_config_);
} }
} }
...@@ -191,7 +215,7 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small ...@@ -191,7 +215,7 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small
smaller_buffer_read_start_pos_[fid] = static_cast<int>(cur_size); smaller_buffer_read_start_pos_[fid] = static_cast<int>(cur_size);
} }
// copy // copy
std::memcpy(input_buffer_.data() + reduce_scatter_size_, smaller_leaf_histogram_array_[fid].HistogramData(), smaller_leaf_histogram_array_[fid].SizeOfHistgram()); std::memcpy(input_buffer_.data() + reduce_scatter_size_, smaller_leaf_histogram_array_[fid].RawData(), smaller_leaf_histogram_array_[fid].SizeOfHistgram());
cur_size += smaller_leaf_histogram_array_[fid].SizeOfHistgram(); cur_size += smaller_leaf_histogram_array_[fid].SizeOfHistgram();
reduce_scatter_size_ += smaller_leaf_histogram_array_[fid].SizeOfHistgram(); reduce_scatter_size_ += smaller_leaf_histogram_array_[fid].SizeOfHistgram();
++smaller_idx; ++smaller_idx;
...@@ -209,7 +233,7 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small ...@@ -209,7 +233,7 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small
larger_buffer_read_start_pos_[fid] = static_cast<int>(cur_size); larger_buffer_read_start_pos_[fid] = static_cast<int>(cur_size);
} }
// copy // copy
std::memcpy(input_buffer_.data() + reduce_scatter_size_, larger_leaf_histogram_array_[fid].HistogramData(), larger_leaf_histogram_array_[fid].SizeOfHistgram()); std::memcpy(input_buffer_.data() + reduce_scatter_size_, larger_leaf_histogram_array_[fid].RawData(), larger_leaf_histogram_array_[fid].SizeOfHistgram());
cur_size += larger_leaf_histogram_array_[fid].SizeOfHistgram(); cur_size += larger_leaf_histogram_array_[fid].SizeOfHistgram();
reduce_scatter_size_ += larger_leaf_histogram_array_[fid].SizeOfHistgram(); reduce_scatter_size_ += larger_leaf_histogram_array_[fid].SizeOfHistgram();
++larger_idx; ++larger_idx;
...@@ -225,11 +249,80 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small ...@@ -225,11 +249,80 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small
void VotingParallelTreeLearner::FindBestThresholds() { void VotingParallelTreeLearner::FindBestThresholds() {
// use local data to find local best splits // use local data to find local best splits
SerialTreeLearner::FindBestThresholds(); std::vector<int8_t> is_feature_used(num_features_, 0);
#pragma omp parallel for schedule(guided)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used_[feature_index]) continue;
if (parent_leaf_histogram_array_ != nullptr
&& !parent_leaf_histogram_array_[feature_index].is_splittable()) {
smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
continue;
}
is_feature_used[feature_index] = 1;
}
bool use_subtract = true;
if (parent_leaf_histogram_array_ == nullptr) {
use_subtract = false;
}
// construct smaller leaf
HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
train_data_->ConstructHistograms(is_feature_used,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
ptr_smaller_leaf_hist_data);
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf
HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
train_data_->ConstructHistograms(is_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
ptr_larger_leaf_hist_data);
}
std::vector<SplitInfo> smaller_bestsplit_per_features(num_features_);
std::vector<SplitInfo> larger_bestsplit_per_features(num_features_);
// find splits
#pragma omp parallel for schedule(guided)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used[feature_index]) { continue; }
train_data_->FixHistogram(feature_index,
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_histogram_array_[feature_index].RawData());
smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
&smaller_bestsplit_per_features[feature_index]);
// only the root leaf exists so far
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
if (use_subtract) {
larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
} else {
train_data_->FixHistogram(feature_index, larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(),
larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_histogram_array_[feature_index].RawData());
}
// find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold(
larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(),
larger_leaf_splits_->num_data_in_leaf(),
&larger_bestsplit_per_features[feature_index]);
}
std::vector<SplitInfo> smaller_top_k_splits, larger_top_k_splits; std::vector<SplitInfo> smaller_top_k_splits, larger_top_k_splits;
// local voting // local voting
ArrayArgs<SplitInfo>::MaxK(smaller_leaf_splits_->BestSplitPerFeature(), top_k_, &smaller_top_k_splits); ArrayArgs<SplitInfo>::MaxK(smaller_bestsplit_per_features, top_k_, &smaller_top_k_splits);
ArrayArgs<SplitInfo>::MaxK(larger_leaf_splits_->BestSplitPerFeature(), top_k_, &larger_top_k_splits); ArrayArgs<SplitInfo>::MaxK(larger_bestsplit_per_features, top_k_, &larger_top_k_splits);
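Local voting keeps only the top-k candidate splits by gain before anything goes over the network. MaxK is LightGBM's own helper; std::partial_sort gives the same effect in a standalone sketch with a gain-only stand-in for SplitInfo:

#include <algorithm>
#include <cstdio>
#include <vector>

struct Split { int feature; double gain; };

int main() {
  std::vector<Split> candidates = {
      {0, 0.5}, {1, 3.2}, {2, 1.1}, {3, 2.7}, {4, 0.9}};
  const int top_k = 2;

  // move the k largest gains to the front, in descending order
  std::partial_sort(candidates.begin(), candidates.begin() + top_k,
                    candidates.end(),
                    [](const Split& a, const Split& b) {
                      return a.gain > b.gain;
                    });
  candidates.resize(top_k);

  for (const Split& s : candidates) {
    std::printf("voted feature %d (gain %.1f)\n", s.feature, s.gain);
  }
  return 0;
}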
// gather // gather
int offset = 0; int offset = 0;
for (int i = 0; i < top_k_; ++i) { for (int i = 0; i < top_k_; ++i) {
...@@ -263,11 +356,15 @@ void VotingParallelTreeLearner::FindBestThresholds() { ...@@ -263,11 +356,15 @@ void VotingParallelTreeLearner::FindBestThresholds() {
// Reduce scatter for histogram // Reduce scatter for histogram
Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, block_start_.data(), block_len_.data(), Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, block_start_.data(), block_len_.data(),
output_buffer_.data(), &HistogramBinEntry::SumReducer); output_buffer_.data(), &HistogramBinEntry::SumReducer);
std::vector<SplitInfo> smaller_best(num_threads_);
std::vector<SplitInfo> larger_best(num_threads_);
// find best split from local aggregated histograms // find best split from local aggregated histograms
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) { for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
const int tid = omp_get_thread_num();
if (smaller_is_feature_aggregated_[feature_index]) { if (smaller_is_feature_aggregated_[feature_index]) {
SplitInfo smaller_split;
// restore from buffer // restore from buffer
smaller_leaf_histogram_array_global_[feature_index].FromMemory( smaller_leaf_histogram_array_global_[feature_index].FromMemory(
output_buffer_.data() + smaller_buffer_read_start_pos_[feature_index]); output_buffer_.data() + smaller_buffer_read_start_pos_[feature_index]);
...@@ -276,10 +373,14 @@ void VotingParallelTreeLearner::FindBestThresholds() { ...@@ -276,10 +373,14 @@ void VotingParallelTreeLearner::FindBestThresholds() {
smaller_leaf_splits_global_->sum_gradients(), smaller_leaf_splits_global_->sum_gradients(),
smaller_leaf_splits_global_->sum_hessians(), smaller_leaf_splits_global_->sum_hessians(),
GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()), GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()),
&smaller_leaf_splits_global_->BestSplitPerFeature()[feature_index]); &smaller_split);
if (smaller_split.gain > smaller_best[tid].gain) {
smaller_best[tid] = smaller_split;
}
} }
if (larger_is_feature_aggregated_[feature_index]) { if (larger_is_feature_aggregated_[feature_index]) {
SplitInfo larger_split;
// restore from buffer // restore from buffer
larger_leaf_histogram_array_global_[feature_index].FromMemory(output_buffer_.data() + larger_buffer_read_start_pos_[feature_index]); larger_leaf_histogram_array_global_[feature_index].FromMemory(output_buffer_.data() + larger_buffer_read_start_pos_[feature_index]);
// find best threshold // find best threshold
...@@ -287,30 +388,31 @@ void VotingParallelTreeLearner::FindBestThresholds() { ...@@ -287,30 +388,31 @@ void VotingParallelTreeLearner::FindBestThresholds() {
larger_leaf_splits_global_->sum_gradients(), larger_leaf_splits_global_->sum_gradients(),
larger_leaf_splits_global_->sum_hessians(), larger_leaf_splits_global_->sum_hessians(),
GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()), GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()),
&larger_leaf_splits_global_->BestSplitPerFeature()[feature_index]); &larger_split);
if (larger_split.gain > larger_best[tid].gain) {
larger_best[tid] = larger_split;
}
} }
} }
auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
int leaf = smaller_leaf_splits_->LeafIndex();
best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];
if (larger_leaf_splits_ != nullptr && larger_leaf_splits_->LeafIndex() >= 0) {
leaf = larger_leaf_splits_->LeafIndex();
auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
}
} }
void VotingParallelTreeLearner::FindBestSplitsForLeaves() { void VotingParallelTreeLearner::FindBestSplitsForLeaves() {
int smaller_best_feature = -1, larger_best_feature = -1;
// find local best // find local best
SplitInfo smaller_best, larger_best; SplitInfo smaller_best, larger_best;
std::vector<double> gains; smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()];
for (size_t i = 0; i < smaller_leaf_splits_global_->BestSplitPerFeature().size(); ++i) { // find local best split for larger leaf
gains.push_back(smaller_leaf_splits_global_->BestSplitPerFeature()[i].gain); if (larger_leaf_splits_->LeafIndex() >= 0) {
} larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()];
smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
smaller_best = smaller_leaf_splits_global_->BestSplitPerFeature()[smaller_best_feature];
if (larger_leaf_splits_global_->LeafIndex() >= 0) {
gains.clear();
for (size_t i = 0; i < larger_leaf_splits_global_->BestSplitPerFeature().size(); ++i) {
gains.push_back(larger_leaf_splits_global_->BestSplitPerFeature()[i].gain);
}
larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
larger_best = larger_leaf_splits_global_->BestSplitPerFeature()[larger_best_feature];
} }
// sync global best info // sync global best info
std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo)); std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo));
......