Unverified Commit 509c2e50 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

Support both row-wise and col-wise multi-threading (#2699)



* commit

* fix a bug

* fix bug

* reset to track changes

* refine the auto choose logic

* sort the time stats output

* fix include

* change  multi_val_bin_sparse_threshold

* add cmake

* add _mm_malloc and _mm_free for cross platform

* fix cmake bug

* timer for split

* try to fix cmake

* fix tests

* refactor DataPartition::Split

* fix test

* typo

* formating

* Revert "formating"

This reverts commit 5b8de4f7fb9d975ee23701d276a66d40ee6d4222.

* add document

* [R-package] Added tests on use of force_col_wise and force_row_wise in training (#2719)

* naming

* fix gpu code

* Update include/LightGBM/bin.h
Co-Authored-By: default avatarJames Lamb <jaylamb20@gmail.com>

* Update src/treelearner/ocl/histogram16.cl

* test: swap compilers for CI

* fix omp

* not avx2

* no aligned for feature histogram

* Revert "refactor DataPartition::Split"

This reverts commit 256e6d9641ade966a1f54da1752e998a1149b6f8.

* slightly refactor data partition

* reduce the memory cost
Co-authored-by: default avatarJames Lamb <jaylamb20@gmail.com>
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>
parent bc7bc4a1
...@@ -1065,7 +1065,7 @@ int LGBM_DatasetAddFeaturesFrom(DatasetHandle target, ...@@ -1065,7 +1065,7 @@ int LGBM_DatasetAddFeaturesFrom(DatasetHandle target,
API_BEGIN(); API_BEGIN();
auto target_d = reinterpret_cast<Dataset*>(target); auto target_d = reinterpret_cast<Dataset*>(target);
auto source_d = reinterpret_cast<Dataset*>(source); auto source_d = reinterpret_cast<Dataset*>(source);
target_d->addFeaturesFrom(source_d); target_d->AddFeaturesFrom(source_d);
API_END(); API_END();
} }
......
...@@ -15,7 +15,8 @@ ...@@ -15,7 +15,8 @@
#include "dense_bin.hpp" #include "dense_bin.hpp"
#include "dense_nbits_bin.hpp" #include "dense_nbits_bin.hpp"
#include "ordered_sparse_bin.hpp" #include "multi_val_dense_bin.hpp"
#include "multi_val_sparse_bin.hpp"
#include "sparse_bin.hpp" #include "sparse_bin.hpp"
namespace LightGBM { namespace LightGBM {
...@@ -636,21 +637,10 @@ namespace LightGBM { ...@@ -636,21 +637,10 @@ namespace LightGBM {
template class SparseBin<uint16_t>; template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>; template class SparseBin<uint32_t>;
template class OrderedSparseBin<uint8_t>; template class MultiValDenseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>; template class MultiValDenseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>; template class MultiValDenseBin<uint32_t>;
// Factory that picks a sparse or dense bin representation for one feature.
// The choice is made from the observed sparse_rate: when it reaches
// sparse_threshold (and sparsity is enabled) a sparse bin is created,
// otherwise a dense one. *is_sparse is set to report which path was taken.
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
bool is_enable_sparse, double sparse_threshold, bool* is_sparse) {
// sparse threshold: only switch to the sparse layout when it is both
// enabled and the feature is sparse enough to benefit from it
if (sparse_rate >= sparse_threshold && is_enable_sparse) {
*is_sparse = true;
return CreateSparseBin(num_data, num_bin);
} else {
*is_sparse = false;
return CreateDenseBin(num_data, num_bin);
}
}
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) { Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 16) { if (num_bin <= 16) {
...@@ -674,4 +664,25 @@ namespace LightGBM { ...@@ -674,4 +664,25 @@ namespace LightGBM {
} }
} }
// Factory for multi-value bins (several features bundled into one bin blob).
// Chooses a sparse layout when the overall sparse rate reaches the
// threshold, otherwise a dense layout, and picks the narrowest unsigned
// index type (8/16/32 bit) that can address num_bin distinct bins.
//
// num_data     : number of rows the bin must hold.
// num_bin      : total number of bins across the bundled features.
// num_feature  : number of bundled features (used by the dense layout only).
// sparse_rate  : average fraction of default-bin entries, in [0, 1].
MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate) {
  // Above this rate the sparse representation is cheaper than the dense one.
  // (Was written as `const double ... = 0.25f;` — a float literal narrowing
  // into a double; 0.25 is exactly representable so behavior is unchanged.)
  constexpr double multi_val_bin_sparse_threshold = 0.25;
  if (sparse_rate >= multi_val_bin_sparse_threshold) {
    if (num_bin <= 256) {
      return new MultiValSparseBin<uint8_t>(num_data, num_bin);
    } else if (num_bin <= 65536) {
      return new MultiValSparseBin<uint16_t>(num_data, num_bin);
    } else {
      return new MultiValSparseBin<uint32_t>(num_data, num_bin);
    }
  } else {
    if (num_bin <= 256) {
      return new MultiValDenseBin<uint8_t>(num_data, num_bin, num_feature);
    } else if (num_bin <= 65536) {
      return new MultiValDenseBin<uint16_t>(num_data, num_bin, num_feature);
    } else {
      return new MultiValDenseBin<uint32_t>(num_data, num_bin, num_feature);
    }
  }
}
} // namespace LightGBM } // namespace LightGBM
...@@ -312,6 +312,11 @@ void Config::CheckParamConflict() { ...@@ -312,6 +312,11 @@ void Config::CheckParamConflict() {
num_leaves = static_cast<int>(full_num_leaves); num_leaves = static_cast<int>(full_num_leaves);
} }
} }
// force col-wise for gpu
if (device_type == std::string("gpu")) {
force_col_wise = true;
force_row_wise = false;
}
} }
std::string Config::ToString() const { std::string Config::ToString() const {
......
...@@ -116,9 +116,6 @@ std::unordered_map<std::string, std::string> Config::alias_table({ ...@@ -116,9 +116,6 @@ std::unordered_map<std::string, std::string> Config::alias_table({
{"is_pre_partition", "pre_partition"}, {"is_pre_partition", "pre_partition"},
{"is_enable_bundle", "enable_bundle"}, {"is_enable_bundle", "enable_bundle"},
{"bundle", "enable_bundle"}, {"bundle", "enable_bundle"},
{"is_sparse", "is_enable_sparse"},
{"enable_sparse", "is_enable_sparse"},
{"sparse", "is_enable_sparse"},
{"two_round_loading", "two_round"}, {"two_round_loading", "two_round"},
{"use_two_round_loading", "two_round"}, {"use_two_round_loading", "two_round"},
{"is_save_binary", "save_binary"}, {"is_save_binary", "save_binary"},
...@@ -181,6 +178,8 @@ std::unordered_set<std::string> Config::parameter_set({ ...@@ -181,6 +178,8 @@ std::unordered_set<std::string> Config::parameter_set({
"num_threads", "num_threads",
"device_type", "device_type",
"seed", "seed",
"force_col_wise",
"force_row_wise",
"max_depth", "max_depth",
"min_data_in_leaf", "min_data_in_leaf",
"min_sum_hessian_in_leaf", "min_sum_hessian_in_leaf",
...@@ -236,9 +235,6 @@ std::unordered_set<std::string> Config::parameter_set({ ...@@ -236,9 +235,6 @@ std::unordered_set<std::string> Config::parameter_set({
"valid_data_initscores", "valid_data_initscores",
"pre_partition", "pre_partition",
"enable_bundle", "enable_bundle",
"max_conflict_rate",
"is_enable_sparse",
"sparse_threshold",
"use_missing", "use_missing",
"zero_as_missing", "zero_as_missing",
"two_round", "two_round",
...@@ -309,6 +305,10 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str ...@@ -309,6 +305,10 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
GetInt(params, "num_threads", &num_threads); GetInt(params, "num_threads", &num_threads);
GetBool(params, "force_col_wise", &force_col_wise);
GetBool(params, "force_row_wise", &force_row_wise);
GetInt(params, "max_depth", &max_depth); GetInt(params, "max_depth", &max_depth);
GetInt(params, "min_data_in_leaf", &min_data_in_leaf); GetInt(params, "min_data_in_leaf", &min_data_in_leaf);
...@@ -467,16 +467,6 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str ...@@ -467,16 +467,6 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
GetBool(params, "enable_bundle", &enable_bundle); GetBool(params, "enable_bundle", &enable_bundle);
GetDouble(params, "max_conflict_rate", &max_conflict_rate);
CHECK(max_conflict_rate >=0.0);
CHECK(max_conflict_rate <1.0);
GetBool(params, "is_enable_sparse", &is_enable_sparse);
GetDouble(params, "sparse_threshold", &sparse_threshold);
CHECK(sparse_threshold >0.0);
CHECK(sparse_threshold <=1.0);
GetBool(params, "use_missing", &use_missing); GetBool(params, "use_missing", &use_missing);
GetBool(params, "zero_as_missing", &zero_as_missing); GetBool(params, "zero_as_missing", &zero_as_missing);
...@@ -600,6 +590,8 @@ std::string Config::SaveMembersToString() const { ...@@ -600,6 +590,8 @@ std::string Config::SaveMembersToString() const {
str_buf << "[learning_rate: " << learning_rate << "]\n"; str_buf << "[learning_rate: " << learning_rate << "]\n";
str_buf << "[num_leaves: " << num_leaves << "]\n"; str_buf << "[num_leaves: " << num_leaves << "]\n";
str_buf << "[num_threads: " << num_threads << "]\n"; str_buf << "[num_threads: " << num_threads << "]\n";
str_buf << "[force_col_wise: " << force_col_wise << "]\n";
str_buf << "[force_row_wise: " << force_row_wise << "]\n";
str_buf << "[max_depth: " << max_depth << "]\n"; str_buf << "[max_depth: " << max_depth << "]\n";
str_buf << "[min_data_in_leaf: " << min_data_in_leaf << "]\n"; str_buf << "[min_data_in_leaf: " << min_data_in_leaf << "]\n";
str_buf << "[min_sum_hessian_in_leaf: " << min_sum_hessian_in_leaf << "]\n"; str_buf << "[min_sum_hessian_in_leaf: " << min_sum_hessian_in_leaf << "]\n";
...@@ -655,9 +647,6 @@ std::string Config::SaveMembersToString() const { ...@@ -655,9 +647,6 @@ std::string Config::SaveMembersToString() const {
str_buf << "[valid_data_initscores: " << Common::Join(valid_data_initscores, ",") << "]\n"; str_buf << "[valid_data_initscores: " << Common::Join(valid_data_initscores, ",") << "]\n";
str_buf << "[pre_partition: " << pre_partition << "]\n"; str_buf << "[pre_partition: " << pre_partition << "]\n";
str_buf << "[enable_bundle: " << enable_bundle << "]\n"; str_buf << "[enable_bundle: " << enable_bundle << "]\n";
str_buf << "[max_conflict_rate: " << max_conflict_rate << "]\n";
str_buf << "[is_enable_sparse: " << is_enable_sparse << "]\n";
str_buf << "[sparse_threshold: " << sparse_threshold << "]\n";
str_buf << "[use_missing: " << use_missing << "]\n"; str_buf << "[use_missing: " << use_missing << "]\n";
str_buf << "[zero_as_missing: " << zero_as_missing << "]\n"; str_buf << "[zero_as_missing: " << zero_as_missing << "]\n";
str_buf << "[two_round: " << two_round << "]\n"; str_buf << "[two_round: " << two_round << "]\n";
......
...@@ -36,6 +36,7 @@ Dataset::Dataset(data_size_t num_data) { ...@@ -36,6 +36,7 @@ Dataset::Dataset(data_size_t num_data) {
} }
Dataset::~Dataset() { Dataset::~Dataset() {
} }
std::vector<std::vector<int>> NoGroup( std::vector<std::vector<int>> NoGroup(
...@@ -48,19 +49,20 @@ std::vector<std::vector<int>> NoGroup( ...@@ -48,19 +49,20 @@ std::vector<std::vector<int>> NoGroup(
return features_in_group; return features_in_group;
} }
int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_indices, int max_cnt) { int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_indices, data_size_t max_cnt) {
int ret = 0; int ret = 0;
for (int i = 0; i < num_indices; ++i) { for (int i = 0; i < num_indices; ++i) {
if (mark[indices[i]]) { if (mark[indices[i]]) {
++ret; ++ret;
if (ret > max_cnt) { }
return -1; if (ret > max_cnt) {
} return -1;
} }
} }
return ret; return ret;
} }
void MarkUsed(std::vector<bool>* mark, const int* indices, int num_indices) {
void MarkUsed(std::vector<bool>* mark, const int* indices, data_size_t num_indices) {
auto& ref_mark = *mark; auto& ref_mark = *mark;
for (int i = 0; i < num_indices; ++i) { for (int i = 0; i < num_indices; ++i) {
ref_mark[indices[i]] = true; ref_mark[indices[i]] = true;
...@@ -93,29 +95,31 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa ...@@ -93,29 +95,31 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa
int** sample_indices, int** sample_indices,
const int* num_per_col, const int* num_per_col,
int num_sample_col, int num_sample_col,
size_t total_sample_cnt, data_size_t total_sample_cnt,
data_size_t max_error_cnt,
data_size_t filter_cnt,
data_size_t num_data, data_size_t num_data,
bool is_use_gpu) { bool is_use_gpu,
std::vector<int8_t>* multi_val_group) {
const int max_search_group = 100; const int max_search_group = 100;
const int gpu_max_bin_per_group = 256; const int max_bin_per_group = 256;
const data_size_t single_val_max_conflict_cnt = static_cast<data_size_t>(total_sample_cnt / 10000);
multi_val_group->clear();
Random rand(num_data); Random rand(num_data);
std::vector<std::vector<int>> features_in_group; std::vector<std::vector<int>> features_in_group;
std::vector<std::vector<bool>> conflict_marks; std::vector<std::vector<bool>> conflict_marks;
std::vector<int> group_conflict_cnt; std::vector<data_size_t> group_used_row_cnt;
std::vector<size_t> group_non_zero_cnt; std::vector<data_size_t> group_total_data_cnt;
std::vector<int> group_num_bin; std::vector<int> group_num_bin;
// first round: fill the single val group
for (auto fidx : find_order) { for (auto fidx : find_order) {
bool is_filtered_feature = fidx >= num_sample_col; bool is_filtered_feature = fidx >= num_sample_col;
const size_t cur_non_zero_cnt = is_filtered_feature ? 0: num_per_col[fidx]; const data_size_t cur_non_zero_cnt = is_filtered_feature ? 0 : num_per_col[fidx];
bool need_new_group = true;
std::vector<int> available_groups; std::vector<int> available_groups;
for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) { for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
if (group_non_zero_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + max_error_cnt) { auto cur_num_bin = group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
if (!is_use_gpu || group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0) if (group_total_data_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + single_val_max_conflict_cnt) {
<= gpu_max_bin_per_group) { if (!is_use_gpu || cur_num_bin <= max_bin_per_group) {
available_groups.push_back(gid); available_groups.push_back(gid);
} }
} }
...@@ -124,44 +128,82 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa ...@@ -124,44 +128,82 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa
if (!available_groups.empty()) { if (!available_groups.empty()) {
int last = static_cast<int>(available_groups.size()) - 1; int last = static_cast<int>(available_groups.size()) - 1;
auto indices = rand.Sample(last, std::min(last, max_search_group - 1)); auto indices = rand.Sample(last, std::min(last, max_search_group - 1));
// always push the last group
search_groups.push_back(available_groups.back()); search_groups.push_back(available_groups.back());
for (auto idx : indices) { for (auto idx : indices) {
search_groups.push_back(available_groups[idx]); search_groups.push_back(available_groups[idx]);
} }
} }
int best_gid = -1;
int best_conflict_cnt = -1;
for (auto gid : search_groups) { for (auto gid : search_groups) {
const int rest_max_cnt = max_error_cnt - group_conflict_cnt[gid]; const data_size_t rest_max_cnt = single_val_max_conflict_cnt - group_total_data_cnt[gid] + group_used_row_cnt[gid];
const int cnt = is_filtered_feature ? 0 : GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt); const data_size_t cnt = is_filtered_feature ? 0 : GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
if (cnt >= 0 && cnt <= rest_max_cnt) { if (cnt >= 0 && cnt <= rest_max_cnt && cnt <= cur_non_zero_cnt / 2) {
data_size_t rest_non_zero_data = static_cast<data_size_t>( best_gid = gid;
static_cast<double>(cur_non_zero_cnt - cnt) * num_data / total_sample_cnt); best_conflict_cnt = cnt;
if (rest_non_zero_data < filter_cnt) { continue; }
need_new_group = false;
features_in_group[gid].push_back(fidx);
group_conflict_cnt[gid] += cnt;
group_non_zero_cnt[gid] += cur_non_zero_cnt - cnt;
if (!is_filtered_feature) {
MarkUsed(&conflict_marks[gid], sample_indices[fidx], num_per_col[fidx]);
}
if (is_use_gpu) {
group_num_bin[gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
}
break; break;
} }
} }
if (need_new_group) { if (best_gid >= 0) {
features_in_group[best_gid].push_back(fidx);
group_total_data_cnt[best_gid] += cur_non_zero_cnt;
group_used_row_cnt[best_gid] += cur_non_zero_cnt - best_conflict_cnt;
if (!is_filtered_feature) {
MarkUsed(&conflict_marks[best_gid], sample_indices[fidx], num_per_col[fidx]);
}
group_num_bin[best_gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
} else {
features_in_group.emplace_back(); features_in_group.emplace_back();
features_in_group.back().push_back(fidx); features_in_group.back().push_back(fidx);
group_conflict_cnt.push_back(0);
conflict_marks.emplace_back(total_sample_cnt, false); conflict_marks.emplace_back(total_sample_cnt, false);
if (!is_filtered_feature) { if (!is_filtered_feature) {
MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]); MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]);
} }
group_non_zero_cnt.emplace_back(cur_non_zero_cnt); group_total_data_cnt.emplace_back(cur_non_zero_cnt);
if (is_use_gpu) { group_used_row_cnt.emplace_back(cur_non_zero_cnt);
group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0)); group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
}
}
std::vector<int> second_round_features;
std::vector<std::vector<int>> features_in_group2;
std::vector<std::vector<bool>> conflict_marks2;
const double dense_threshold = 0.4;
for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
const double dense_rate = static_cast<double>(group_used_row_cnt[gid]) / total_sample_cnt;
if (dense_rate >= dense_threshold) {
features_in_group2.push_back(std::move(features_in_group[gid]));
conflict_marks2.push_back(std::move(conflict_marks[gid]));
} else {
for (auto fidx : features_in_group[gid]) {
second_round_features.push_back(fidx);
}
}
}
features_in_group = features_in_group2;
conflict_marks = conflict_marks2;
multi_val_group->resize(features_in_group.size(), false);
if (!second_round_features.empty()) {
features_in_group.emplace_back();
conflict_marks.emplace_back(total_sample_cnt, false);
bool is_multi_val = is_use_gpu ? true : false;
int conflict_cnt = 0;
for (auto fidx : second_round_features) {
features_in_group.back().push_back(fidx);
if (!is_multi_val) {
const int rest_max_cnt = single_val_max_conflict_cnt - conflict_cnt;
const auto cnt = GetConfilctCount(conflict_marks.back(), sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
conflict_cnt += cnt;
if (cnt < 0 || conflict_cnt > single_val_max_conflict_cnt) {
is_multi_val = true;
continue;
}
MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]);
} }
} }
multi_val_group->push_back(is_multi_val);
} }
return features_in_group; return features_in_group;
} }
...@@ -171,17 +213,12 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_ ...@@ -171,17 +213,12 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
double** sample_values, double** sample_values,
const int* num_per_col, const int* num_per_col,
int num_sample_col, int num_sample_col,
size_t total_sample_cnt, data_size_t total_sample_cnt,
const std::vector<int>& used_features, const std::vector<int>& used_features,
double max_conflict_rate,
data_size_t num_data, data_size_t num_data,
data_size_t min_data, bool is_use_gpu,
double sparse_threshold, std::vector<int8_t>* multi_val_group) {
bool is_enable_sparse, Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer);
bool is_use_gpu) {
// filter is based on sampling data, so decrease its range
const data_size_t filter_cnt = static_cast<data_size_t>(static_cast<double>(0.95 * min_data) / num_data * total_sample_cnt);
const data_size_t max_error_cnt = static_cast<data_size_t>(total_sample_cnt * max_conflict_rate);
std::vector<size_t> feature_non_zero_cnt; std::vector<size_t> feature_non_zero_cnt;
feature_non_zero_cnt.reserve(used_features.size()); feature_non_zero_cnt.reserve(used_features.size());
// put dense feature first // put dense feature first
...@@ -209,6 +246,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_ ...@@ -209,6 +246,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
for (auto sidx : sorted_idx) { for (auto sidx : sorted_idx) {
feature_order_by_cnt.push_back(used_features[sidx]); feature_order_by_cnt.push_back(used_features[sidx]);
} }
std::vector<std::vector<int>> tmp_indices; std::vector<std::vector<int>> tmp_indices;
std::vector<int> tmp_num_per_col(num_sample_col, 0); std::vector<int> tmp_num_per_col(num_sample_col, 0);
for (auto fidx : used_features) { for (auto fidx : used_features) {
...@@ -224,42 +262,25 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_ ...@@ -224,42 +262,25 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
tmp_num_per_col[fidx] = num_per_col[fidx]; tmp_num_per_col[fidx] = num_per_col[fidx];
} }
} }
auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data, is_use_gpu); std::vector<int8_t> group_is_multi_val, group_is_multi_val2;
auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data, is_use_gpu); auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val);
auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val2);
if (features_in_group.size() > group2.size()) { if (features_in_group.size() > group2.size()) {
features_in_group = group2; features_in_group = group2;
} group_is_multi_val = group_is_multi_val2;
std::vector<std::vector<int>> ret;
for (size_t i = 0; i < features_in_group.size(); ++i) {
if (features_in_group[i].size() <= 1 || features_in_group[i].size() >= 5) {
ret.push_back(features_in_group[i]);
} else {
int cnt_non_zero = 0;
for (size_t j = 0; j < features_in_group[i].size(); ++j) {
const int fidx = features_in_group[i][j];
cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers[fidx]->sparse_rate()));
}
double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
// take apart small sparse group, due it will not gain on speed
if (sparse_rate >= sparse_threshold && is_enable_sparse) {
for (size_t j = 0; j < features_in_group[i].size(); ++j) {
const int fidx = features_in_group[i][j];
ret.emplace_back();
ret.back().push_back(fidx);
}
} else {
ret.push_back(features_in_group[i]);
}
}
} }
// shuffle groups // shuffle groups
int num_group = static_cast<int>(ret.size()); int num_group = static_cast<int>(features_in_group.size());
Random tmp_rand(12); Random tmp_rand(num_data);
for (int i = 0; i < num_group - 1; ++i) { for (int i = 0; i < num_group - 1; ++i) {
int j = tmp_rand.NextShort(i + 1, num_group); int j = tmp_rand.NextShort(i + 1, num_group);
std::swap(ret[i], ret[j]); std::swap(features_in_group[i], features_in_group[j]);
// Use std::swap for vector<bool> will cause the wrong result..
std::swap(group_is_multi_val[i], group_is_multi_val[j]);
} }
return ret; *multi_val_group = group_is_multi_val;
return features_in_group;
} }
void Dataset::Construct( void Dataset::Construct(
...@@ -274,7 +295,6 @@ void Dataset::Construct( ...@@ -274,7 +295,6 @@ void Dataset::Construct(
const Config& io_config) { const Config& io_config) {
num_total_features_ = num_total_features; num_total_features_ = num_total_features;
CHECK(num_total_features_ == static_cast<int>(bin_mappers->size())); CHECK(num_total_features_ == static_cast<int>(bin_mappers->size()));
sparse_threshold_ = io_config.sparse_threshold;
// get num_features // get num_features
std::vector<int> used_features; std::vector<int> used_features;
auto& ref_bin_mappers = *bin_mappers; auto& ref_bin_mappers = *bin_mappers;
...@@ -287,13 +307,11 @@ void Dataset::Construct( ...@@ -287,13 +307,11 @@ void Dataset::Construct(
Log::Warning("There are no meaningful features, as all feature values are constant."); Log::Warning("There are no meaningful features, as all feature values are constant.");
} }
auto features_in_group = NoGroup(used_features); auto features_in_group = NoGroup(used_features);
std::vector<int8_t> group_is_multi_val(used_features.size(), 0);
if (io_config.enable_bundle && !used_features.empty()) { if (io_config.enable_bundle && !used_features.empty()) {
features_in_group = FastFeatureBundling(*bin_mappers, features_in_group = FastFeatureBundling(*bin_mappers,
sample_non_zero_indices, sample_values, num_per_col, num_sample_col, total_sample_cnt, sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast<data_size_t>(total_sample_cnt),
used_features, io_config.max_conflict_rate, used_features, num_data_, io_config.device_type == std::string("gpu"), &group_is_multi_val);
num_data_, io_config.min_data_in_leaf,
sparse_threshold_, io_config.is_enable_sparse, io_config.device_type == std::string("gpu"));
} }
num_features_ = 0; num_features_ = 0;
...@@ -306,10 +324,14 @@ void Dataset::Construct( ...@@ -306,10 +324,14 @@ void Dataset::Construct(
real_feature_idx_.resize(num_features_); real_feature_idx_.resize(num_features_);
feature2group_.resize(num_features_); feature2group_.resize(num_features_);
feature2subfeature_.resize(num_features_); feature2subfeature_.resize(num_features_);
int num_multi_val_group = 0;
feature_need_push_zeros_.clear(); feature_need_push_zeros_.clear();
for (int i = 0; i < num_groups_; ++i) { for (int i = 0; i < num_groups_; ++i) {
auto cur_features = features_in_group[i]; auto cur_features = features_in_group[i];
int cur_cnt_features = static_cast<int>(cur_features.size()); int cur_cnt_features = static_cast<int>(cur_features.size());
if (group_is_multi_val[i]) {
++num_multi_val_group;
}
// get bin_mappers // get bin_mappers
std::vector<std::unique_ptr<BinMapper>> cur_bin_mappers; std::vector<std::unique_ptr<BinMapper>> cur_bin_mappers;
for (int j = 0; j < cur_cnt_features; ++j) { for (int j = 0; j < cur_cnt_features; ++j) {
...@@ -325,8 +347,7 @@ void Dataset::Construct( ...@@ -325,8 +347,7 @@ void Dataset::Construct(
++cur_fidx; ++cur_fidx;
} }
feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>( feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(cur_cnt_features, &cur_bin_mappers, num_data_, sparse_threshold_, new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_)));
io_config.is_enable_sparse)));
} }
feature_groups_.shrink_to_fit(); feature_groups_.shrink_to_fit();
group_bin_boundaries_.clear(); group_bin_boundaries_.clear();
...@@ -414,9 +435,6 @@ void Dataset::ResetConfig(const char* parameters) { ...@@ -414,9 +435,6 @@ void Dataset::ResetConfig(const char* parameters) {
if (param.count("zero_as_missing") && io_config.zero_as_missing != zero_as_missing_) { if (param.count("zero_as_missing") && io_config.zero_as_missing != zero_as_missing_) {
Log::Warning("Cannot change zero_as_missing after constructed Dataset handle."); Log::Warning("Cannot change zero_as_missing after constructed Dataset handle.");
} }
if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
}
if (param.count("forcedbins_filename")) { if (param.count("forcedbins_filename")) {
Log::Warning("Cannot change forced bins after constructed Dataset handle."); Log::Warning("Cannot change forced bins after constructed Dataset handle.");
} }
...@@ -452,23 +470,229 @@ void Dataset::ResetConfig(const char* parameters) { ...@@ -452,23 +470,229 @@ void Dataset::ResetConfig(const char* parameters) {
void Dataset::FinishLoad() { void Dataset::FinishLoad() {
if (is_finish_load_) { return; } if (is_finish_load_) { return; }
if (num_groups_ > 0) { if (num_groups_ > 0) {
OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_groups_; ++i) { for (int i = 0; i < num_groups_; ++i) {
OMP_LOOP_EX_BEGIN(); feature_groups_[i]->FinishLoad();
feature_groups_[i]->bin_data_->FinishLoad();
OMP_LOOP_EX_END();
} }
OMP_THROW_EX();
} }
is_finish_load_ = true; is_finish_load_ = true;
} }
// Pushes every row of the bundled features into `ret`, splitting the rows
// into contiguous blocks processed by OpenMP threads in parallel.
//
// num_threads    : upper bound on the number of parallel blocks.
// num_data       : number of rows to push.
// most_freq_bins : per-feature most frequent (default) bin value.
// offsets        : per-feature offset into the global bin index space.
// iters          : per-thread, per-feature bin iterators (mutated via Reset/Get).
// ret            : destination multi-value bin; PushOneRow is called per row.
//
// Fix: `most_freq_bins` and `offsets` were previously taken by value
// (`const std::vector<uint32_t>`), copying both vectors on every call
// (clang-tidy performance-unnecessary-value-param). They are read-only here,
// so they are now taken by const reference; call sites are unaffected.
void PushDataToMultiValBin(int num_threads, data_size_t num_data,
                           const std::vector<uint32_t>& most_freq_bins,
                           const std::vector<uint32_t>& offsets,
                           std::vector<std::vector<std::unique_ptr<BinIterator>>>& iters,
                           MultiValBin* ret) {
  Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin", global_timer);
  // Give each block at least min_block_size rows so tiny datasets do not
  // pay the parallelization overhead.
  const data_size_t min_block_size = 4096;
  const int n_block = std::min(num_threads, (num_data + min_block_size - 1) / min_block_size);
  const data_size_t block_size = (num_data + n_block - 1) / n_block;
  if (ret->IsSparse()) {
    // Sparse destination: only bins that differ from the feature's default
    // bin are stored for a row.
#pragma omp parallel for schedule(static)
    for (int tid = 0; tid < n_block; ++tid) {
      std::vector<uint32_t> cur_data;
      data_size_t start = tid * block_size;
      data_size_t end = std::min(num_data, start + block_size);
      for (size_t j = 0; j < most_freq_bins.size(); ++j) {
        iters[tid][j]->Reset(start);
      }
      for (data_size_t i = start; i < end; ++i) {
        cur_data.clear();
        for (size_t j = 0; j < most_freq_bins.size(); ++j) {
          auto cur_bin = iters[tid][j]->Get(i);
          if (cur_bin == most_freq_bins[j]) {
            continue;
          }
          cur_bin += offsets[j];
          // When the most frequent bin is 0 it is not given a slot, so the
          // remaining bins of this feature shift down by one.
          if (most_freq_bins[j] == 0) {
            cur_bin -= 1;
          }
          cur_data.push_back(cur_bin);
        }
        ret->PushOneRow(tid, i, cur_data);
      }
    }
  } else {
    // Dense destination: one value per feature per row; the default bin is
    // mapped to 0 and other bins are offset into the global bin space.
#pragma omp parallel for schedule(static)
    for (int tid = 0; tid < n_block; ++tid) {
      std::vector<uint32_t> cur_data;
      data_size_t start = tid * block_size;
      data_size_t end = std::min(num_data, start + block_size);
      for (size_t j = 0; j < most_freq_bins.size(); ++j) {
        iters[tid][j]->Reset(start);
      }
      for (data_size_t i = start; i < end; ++i) {
        cur_data.clear();
        for (size_t j = 0; j < most_freq_bins.size(); ++j) {
          auto cur_bin = iters[tid][j]->Get(i);
          if (cur_bin == most_freq_bins[j]) {
            cur_bin = 0;
          } else {
            cur_bin += offsets[j];
            if (most_freq_bins[j] == 0) {
              cur_bin -= 1;
            }
          }
          cur_data.push_back(cur_bin);
        }
        ret->PushOneRow(tid, i, cur_data);
      }
    }
  }
}
// Builds a MultiValBin from the single feature group flagged as multi-val.
// Returns nullptr when no group is flagged; a second flagged group is an
// internal invariant violation and aborts via Log::Fatal.
MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
Common::FunctionTimer fun_time("Dataset::GetMultiBinFromSparseFeatures", global_timer);
// Locate the unique multi-val group (at most one may exist).
int multi_group_id = -1;
for (int i = 0; i < num_groups_; ++i) {
if (feature_groups_[i]->is_multi_val_) {
if (multi_group_id < 0) {
multi_group_id = i;
} else {
Log::Fatal("Bug. There should be only one multi-val group.");
}
}
}
if (multi_group_id < 0) {
return nullptr;
}
const auto& offsets = feature_groups_[multi_group_id]->bin_offsets_;
const int num_feature = feature_groups_[multi_group_id]->num_feature_;
// Query the OpenMP thread count so one iterator set can be prepared per thread.
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
// One BinIterator per (thread, sub-feature); iterators are not shareable
// across threads, so each thread gets its own copy.
std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
std::vector<uint32_t> most_freq_bins;
double sum_sparse_rate = 0;
for (int i = 0; i < num_feature; ++i) {
for (int tid = 0; tid < num_threads; ++tid) {
iters[tid].emplace_back(feature_groups_[multi_group_id]->SubFeatureIterator(i));
}
most_freq_bins.push_back(feature_groups_[multi_group_id]->bin_mappers_[i]->GetMostFreqBin());
sum_sparse_rate += feature_groups_[multi_group_id]->bin_mappers_[i]->sparse_rate();
}
// Average sparse rate drives the sparse/dense choice in CreateMultiValBin.
sum_sparse_rate /= num_feature;
Log::Debug("GetMultiBinFromSparseFeatures:: sparse rate %f", sum_sparse_rate);
std::unique_ptr<MultiValBin> ret;
ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back(), num_feature, sum_sparse_rate));
PushDataToMultiValBin(num_threads, num_data_, most_freq_bins, offsets, iters, ret.get());
ret->FinishLoad();
// Ownership passes to the caller.
return ret.release();
}
// Builds one MultiValBin that covers every feature group (both multi-val
// and single-val groups), remapping all per-group bins into a single global
// bin index space. Ownership of the result passes to the caller.
MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures", global_timer);
// Query the OpenMP thread count so one iterator set can be prepared per thread.
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
double sum_dense_ratio = 0;
std::unique_ptr<MultiValBin> ret;
// One BinIterator per (thread, feature); iterators are not shareable across threads.
std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
std::vector<uint32_t> most_freq_bins;
std::vector<uint32_t> offsets;
// Bin 0 is reserved; real bins start at offset 1.
int num_total_bin = 1;
offsets.push_back(num_total_bin);
for (int gid = 0; gid < num_groups_; ++gid) {
if (feature_groups_[gid]->is_multi_val_) {
// Multi-val group: each sub-feature contributes its own bin range and
// its own iterator entry.
for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
sum_dense_ratio += 1.0f - bin_mapper->sparse_rate();
most_freq_bins.push_back(bin_mapper->GetMostFreqBin());
num_total_bin += bin_mapper->num_bin();
// A most-frequent bin of 0 gets no slot of its own, so the range shrinks by one.
if (most_freq_bins.back() == 0) {
num_total_bin -= 1;
}
offsets.push_back(num_total_bin);
for (int tid = 0; tid < num_threads; ++tid) {
iters[tid].emplace_back(feature_groups_[gid]->SubFeatureIterator(fid));
}
}
} else {
// Single-val group: treated as one pseudo-feature whose bins are the
// whole group's bin range (group offsets already exclude the shared 0 bin).
most_freq_bins.push_back(0);
num_total_bin += feature_groups_[gid]->bin_offsets_.back() - 1;
for (int tid = 0; tid < num_threads; ++tid) {
iters[tid].emplace_back(feature_groups_[gid]->FeatureGroupIterator());
}
offsets.push_back(num_total_bin);
// Still accumulate per-feature density so the average reflects all features.
for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
sum_dense_ratio += 1.0f - bin_mapper->sparse_rate();
}
}
}
// Average density over all pushed columns; 1 - density is the sparse rate
// used by CreateMultiValBin to choose the representation.
sum_dense_ratio /= static_cast<double>(most_freq_bins.size());
Log::Debug("GetMultiBinFromAllFeatures:: sparse rate %f", 1.0 - sum_dense_ratio);
ret.reset(MultiValBin::CreateMultiValBin(num_data_, num_total_bin, static_cast<int>(most_freq_bins.size()), 1.0 - sum_dense_ratio));
PushDataToMultiValBin(num_threads, num_data_, most_freq_bins, offsets, iters, ret.get());
ret->FinishLoad();
return ret.release();
}
/*!
 * \brief Chooses between col-wise and row-wise histogram construction.
 * If neither mode is forced, both strategies are benchmarked once on the
 * provided gradients/hessians and the faster one is kept.
 * \param gradients / hessians   first-/second-order statistics used for the
 *                               timing run
 * \param is_feature_used        per-feature usage mask for the col-wise run
 * \param is_constant_hessian    true if all hessians are identical
 * \param force_colwise / force_rowwise  user overrides (mutually exclusive)
 * \param[out] is_hist_col_wise  set to the chosen mode
 * \return Owning pointer to the MultiValBin for the chosen mode
 *         (sparse-features bin for col-wise, all-features bin for row-wise),
 *         or nullptr when there are no feature groups.
 */
MultiValBin* Dataset::TestMultiThreadingMethod(score_t* gradients, score_t* hessians, const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
                                               bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const {
  int num_threads = 1;
#pragma omp parallel
#pragma omp master
  { num_threads = omp_get_num_threads(); }
  Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod", global_timer);
  if (force_colwise && force_rowwise) {
    Log::Fatal("cannot set both `force_col_wise` and `force_row_wise` to `true`.");
  }
  if (num_groups_ <= 0) {
    return nullptr;
  }
  if (force_colwise) {
    *is_hist_col_wise = true;
    return GetMultiBinFromSparseFeatures();
  } else if (force_rowwise) {
    *is_hist_col_wise = false;
    auto ret = GetMultiBinFromAllFeatures();
    // Row-wise construction needs one aligned double-buffer per thread.
    const int num_bin_aligned =
        (ret->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
    hist_buf_.resize(static_cast<size_t>(num_bin_aligned) * 2 * num_threads);
    return ret;
  } else {
    // Benchmark both strategies once and keep the faster one.
    std::unique_ptr<MultiValBin> sparse_bin;
    std::unique_ptr<MultiValBin> all_bin;
    sparse_bin.reset(GetMultiBinFromSparseFeatures());
    all_bin.reset(GetMultiBinFromAllFeatures());
    std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>> hist_data(NumTotalBin() * 2);
    const int num_bin_aligned =
        (all_bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
    hist_buf_.resize(static_cast<size_t>(num_bin_aligned) * 2 * num_threads);
    std::chrono::duration<double, std::milli> col_wise_time, row_wise_time;
    auto start_time = std::chrono::steady_clock::now();
    ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, hessians, gradients, hessians, is_constant_hessian, sparse_bin.get(), true, hist_data.data());
    col_wise_time = std::chrono::steady_clock::now() - start_time;
    start_time = std::chrono::steady_clock::now();
    ConstructHistogramsMultiVal(all_bin.get(), nullptr, num_data_, gradients, hessians, is_constant_hessian, hist_data.data());
    row_wise_time = std::chrono::steady_clock::now() - start_time;
    // BUGFIX: the durations must be unwrapped with .count() before being
    // passed to the printf-style Log::Debug — passing a std::chrono::duration
    // (a non-trivial class type) through C varargs is undefined behavior and
    // cannot be formatted with %f.  The counts are in milliseconds, so scale
    // by 1e-3 to report seconds.
    Log::Debug("colwise cost %f seconds, rowwise cost %f seconds",
               col_wise_time.count() * 1e-3, row_wise_time.count() * 1e-3);
    if (col_wise_time < row_wise_time) {
      *is_hist_col_wise = true;
      // Row-wise buffers are not needed in col-wise mode.
      hist_buf_.clear();
      return sparse_bin.release();
    } else {
      *is_hist_col_wise = false;
      Log::Info("Use row-wise multi-threading, may increase memory usage. If memory is not enough, you can set `force_col_wise=true`.");
      if (all_bin->IsSparse()) {
        Log::Debug("Use Sparse Multi-Val Bin");
      } else {
        Log::Debug("Use Dense Multi-Val Bin");
      }
      return all_bin.release();
    }
  }
}
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
feature_groups_.clear(); feature_groups_.clear();
num_features_ = dataset->num_features_; num_features_ = dataset->num_features_;
num_groups_ = dataset->num_groups_; num_groups_ = dataset->num_groups_;
sparse_threshold_ = dataset->sparse_threshold_;
// copy feature bin mapper data // copy feature bin mapper data
for (int i = 0; i < num_groups_; ++i) { for (int i = 0; i < num_groups_; ++i) {
std::vector<std::unique_ptr<BinMapper>> bin_mappers; std::vector<std::unique_ptr<BinMapper>> bin_mappers;
...@@ -477,9 +701,9 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { ...@@ -477,9 +701,9 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
} }
feature_groups_.emplace_back(new FeatureGroup( feature_groups_.emplace_back(new FeatureGroup(
dataset->feature_groups_[i]->num_feature_, dataset->feature_groups_[i]->num_feature_,
dataset->feature_groups_[i]->is_multi_val_,
&bin_mappers, &bin_mappers,
num_data_, num_data_));
dataset->feature_groups_[i]->is_sparse_));
} }
feature_groups_.shrink_to_fit(); feature_groups_.shrink_to_fit();
used_feature_map_ = dataset->used_feature_map_; used_feature_map_ = dataset->used_feature_map_;
...@@ -502,8 +726,6 @@ void Dataset::CreateValid(const Dataset* dataset) { ...@@ -502,8 +726,6 @@ void Dataset::CreateValid(const Dataset* dataset) {
feature_groups_.clear(); feature_groups_.clear();
num_features_ = dataset->num_features_; num_features_ = dataset->num_features_;
num_groups_ = num_features_; num_groups_ = num_features_;
sparse_threshold_ = dataset->sparse_threshold_;
bool is_enable_sparse = true;
feature2group_.clear(); feature2group_.clear();
feature2subfeature_.clear(); feature2subfeature_.clear();
// copy feature bin mapper data // copy feature bin mapper data
...@@ -514,12 +736,8 @@ void Dataset::CreateValid(const Dataset* dataset) { ...@@ -514,12 +736,8 @@ void Dataset::CreateValid(const Dataset* dataset) {
if (bin_mappers.back()->GetDefaultBin() != bin_mappers.back()->GetMostFreqBin()) { if (bin_mappers.back()->GetDefaultBin() != bin_mappers.back()->GetMostFreqBin()) {
feature_need_push_zeros_.push_back(i); feature_need_push_zeros_.push_back(i);
} }
feature_groups_.emplace_back(new FeatureGroup( feature_groups_.emplace_back(new FeatureGroup(&bin_mappers,
1, num_data_));
&bin_mappers,
num_data_,
sparse_threshold_,
is_enable_sparse));
feature2group_.push_back(i); feature2group_.push_back(i);
feature2subfeature_.push_back(0); feature2subfeature_.push_back(0);
} }
...@@ -721,7 +939,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { ...@@ -721,7 +939,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
writer->Write(binary_file_token, size_of_token); writer->Write(binary_file_token, size_of_token);
// get size of header // get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_) size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_) + sizeof(sparse_threshold_) + sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_)
+ 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ + sizeof(int8_t) * num_features_ + 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ + sizeof(int8_t) * num_features_
+ sizeof(double) * num_features_ + sizeof(int32_t) * num_total_features_ + sizeof(int) * 3 + sizeof(bool) * 2; + sizeof(double) * num_features_ + sizeof(int32_t) * num_total_features_ + sizeof(int) * 3 + sizeof(bool) * 2;
// size of feature names // size of feature names
...@@ -743,7 +961,6 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { ...@@ -743,7 +961,6 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
writer->Write(&min_data_in_bin_, sizeof(min_data_in_bin_)); writer->Write(&min_data_in_bin_, sizeof(min_data_in_bin_));
writer->Write(&use_missing_, sizeof(use_missing_)); writer->Write(&use_missing_, sizeof(use_missing_));
writer->Write(&zero_as_missing_, sizeof(zero_as_missing_)); writer->Write(&zero_as_missing_, sizeof(zero_as_missing_));
writer->Write(&sparse_threshold_, sizeof(sparse_threshold_));
writer->Write(used_feature_map_.data(), sizeof(int) * num_total_features_); writer->Write(used_feature_map_.data(), sizeof(int) * num_total_features_);
writer->Write(&num_groups_, sizeof(num_groups_)); writer->Write(&num_groups_, sizeof(num_groups_));
writer->Write(real_feature_idx_.data(), sizeof(int) * num_features_); writer->Write(real_feature_idx_.data(), sizeof(int) * num_features_);
...@@ -866,20 +1083,110 @@ void Dataset::DumpTextFile(const char* text_filename) { ...@@ -866,20 +1083,110 @@ void Dataset::DumpTextFile(const char* text_filename) {
fclose(file); fclose(file);
} }
/*!
 * \brief Row-wise histogram construction over a MultiValBin.
 * Rows are split into up to `num_threads` blocks; block 0 accumulates
 * directly into `hist_data`, every other block accumulates into its own
 * aligned scratch region of `hist_buf_`, and a second parallel pass merges
 * the scratch regions back into `hist_data`.
 * \param multi_val_bin     packed bin to scan; no-op when nullptr
 * \param data_indices      subset of rows to use, or nullptr for all rows
 * \param num_data          number of rows (size of data_indices when given)
 * \param gradients / hessians  per-row statistics, indexed by raw row id
 * \param is_constant_hessian   when true, the hessian slots accumulate raw
 *                              counts and are scaled by hessians[0] at the end
 * \param[out] hist_data    output histogram, 2 entries (grad, hess) per bin
 */
void Dataset::ConstructHistogramsMultiVal(const MultiValBin* multi_val_bin, const data_size_t* data_indices, data_size_t num_data,
                                          const score_t* gradients, const score_t* hessians,
                                          bool is_constant_hessian,
                                          hist_t* hist_data) const {
  Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal", global_timer);
  if (multi_val_bin == nullptr) { return; }
  int num_threads = 1;
  #pragma omp parallel
  #pragma omp master
  {
    num_threads = omp_get_num_threads();
  }
  global_timer.Start("Dataset::sparse_bin_histogram");
  const int num_bin = multi_val_bin->num_bin();
  // Per-thread scratch regions are aligned so each starts on its own
  // cache/vector boundary.
  const int num_bin_aligned = (num_bin + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
  // Don't spawn more blocks than there is data to amortize per-block cost.
  const int min_data_block_size = 1024;
  const int n_data_block = std::min(num_threads, (num_data + min_data_block_size - 1) / min_data_block_size);
  const int data_block_size = (num_data + n_data_block - 1) / n_data_block;
  // One (grad, hess) histogram per extra block; block 0 uses hist_data itself.
  const size_t buf_size = static_cast<size_t>(n_data_block - 1)* num_bin_aligned * 2;
  if (hist_buf_.size() < buf_size) {
    hist_buf_.resize(buf_size);
  }
  #pragma omp parallel for schedule(static)
  for (int tid = 0; tid < n_data_block; ++tid) {
    data_size_t start = tid * data_block_size;
    data_size_t end = std::min(start + data_block_size, num_data);
    auto data_ptr = hist_data;
    if (tid > 0) {
      // Blocks after the first write to their private scratch region.
      data_ptr = hist_buf_.data() + static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
    }
    std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin* KHistEntrySize);
    // Four dispatch cases: with/without an index subset, with/without
    // per-row hessians (constant-hessian overloads skip the hessian array).
    if (data_indices != nullptr && num_data < num_data_) {
      if (!is_constant_hessian) {
        multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, hessians, data_ptr);
      } else {
        multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, data_ptr);
      }
    } else {
      if (!is_constant_hessian) {
        multi_val_bin->ConstructHistogram(start, end, gradients, hessians, data_ptr);
      } else {
        multi_val_bin->ConstructHistogram(start, end, gradients, data_ptr);
      }
    }
  }
  global_timer.Stop("Dataset::sparse_bin_histogram");
  global_timer.Start("Dataset::sparse_bin_histogram_merge");
  // Merge phase: parallelize over bin ranges so threads never touch the same
  // output entries.
  const int min_bin_block_size = 512;
  const int n_bin_block = std::min(num_threads, (num_bin + min_bin_block_size - 1) / min_bin_block_size);
  const int bin_block_size = (num_bin + n_bin_block - 1) / n_bin_block;
  if (!is_constant_hessian) {
    #pragma omp parallel for schedule(static)
    for (int t = 0; t < n_bin_block; ++t) {
      const int start = t * bin_block_size;
      const int end = std::min(start + bin_block_size, num_bin);
      for (int tid = 1; tid < n_data_block; ++tid) {
        auto src_ptr = hist_buf_.data() + static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
        // Each bin owns 2 consecutive entries (grad, hess), hence the *2.
        for (int i = start * 2; i < end * 2; ++i) {
          hist_data[i] += src_ptr[i];
        }
      }
    }
  } else {
    #pragma omp parallel for schedule(static)
    for (int t = 0; t < n_bin_block; ++t) {
      const int start = t * bin_block_size;
      const int end = std::min(start + bin_block_size, num_bin);
      for (int tid = 1; tid < n_data_block; ++tid) {
        auto src_ptr = hist_buf_.data() + static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
        for (int i = start * 2; i < end * 2; ++i) {
          hist_data[i] += src_ptr[i];
        }
      }
      // Constant hessian: the hess slots accumulated per-row unit weights,
      // so one multiply by hessians[0] yields the true hessian sums.
      for (int i = start; i < end; i++) {
        GET_HESS(hist_data, i) = GET_HESS(hist_data, i) * hessians[0];
      }
    }
  }
  global_timer.Stop("Dataset::sparse_bin_histogram_merge");
}
void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used, void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data, const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>* ordered_bins,
const score_t* gradients, const score_t* hessians, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians, score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian, bool is_constant_hessian,
HistogramBinEntry* hist_data) const { const MultiValBin* multi_val_bin, bool is_colwise,
if (leaf_idx < 0 || num_data < 0 || hist_data == nullptr) { hist_t* hist_data) const {
Common::FunctionTimer fun_timer("Dataset::ConstructHistograms", global_timer);
if (num_data < 0 || hist_data == nullptr) {
return; return;
} }
if (!is_colwise) {
std::vector<int> used_group; return ConstructHistogramsMultiVal(multi_val_bin, data_indices, num_data, gradients, hessians, is_constant_hessian, hist_data);
used_group.reserve(num_groups_); }
global_timer.Start("Dataset::Get used group");
std::vector<int> used_dense_group;
int multi_val_groud_id = -1;
used_dense_group.reserve(num_groups_);
for (int group = 0; group < num_groups_; ++group) { for (int group = 0; group < num_groups_; ++group) {
const int f_cnt = group_feature_cnt_[group]; const int f_cnt = group_feature_cnt_[group];
bool is_group_used = false; bool is_group_used = false;
...@@ -891,172 +1198,137 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used, ...@@ -891,172 +1198,137 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
} }
} }
if (is_group_used) { if (is_group_used) {
used_group.push_back(group); if (feature_groups_[group]->is_multi_val_) {
} multi_val_groud_id = group;
} } else {
int num_used_group = static_cast<int>(used_group.size()); used_dense_group.push_back(group);
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
auto& ref_ordered_bins = *ordered_bins;
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
} }
} }
ptr_ordered_grad = ordered_gradients; }
ptr_ordered_hess = ordered_hessians; int num_used_dense_group = static_cast<int>(used_dense_group.size());
if (!is_constant_hessian) { global_timer.Stop("Dataset::Get used group");
OMP_INIT_EX(); global_timer.Start("Dataset::dense_bin_histogram");
#pragma omp parallel for schedule(static) if (num_used_dense_group > 0) {
for (int gi = 0; gi < num_used_group; ++gi) { auto ptr_ordered_grad = gradients;
OMP_LOOP_EX_BEGIN(); auto ptr_ordered_hess = hessians;
int group = used_group[gi]; if (data_indices != nullptr && num_data < num_data_) {
// feature is not used if (!is_constant_hessian) {
auto data_ptr = hist_data + group_bin_boundaries_[group]; #pragma omp parallel for schedule(static)
const int num_bin = feature_groups_[group]->num_total_bin_; for (data_size_t i = 0; i < num_data; ++i) {
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry)); ordered_gradients[i] = gradients[data_indices[i]];
// construct histograms for smaller leaf ordered_hessians[i] = hessians[data_indices[i]];
if (ref_ordered_bins[group] == nullptr) { }
// if not use ordered bin } else {
feature_groups_[group]->bin_data_->ConstructHistogram( #pragma omp parallel for schedule(static)
data_indices, for (data_size_t i = 0; i < num_data; ++i) {
0, ordered_gradients[i] = gradients[data_indices[i]];
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// used ordered bin
ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
} }
OMP_LOOP_EX_END();
} }
OMP_THROW_EX(); ptr_ordered_grad = ordered_gradients;
} else { ptr_ordered_hess = ordered_hessians;
OMP_INIT_EX(); if (!is_constant_hessian) {
#pragma omp parallel for schedule(static) OMP_INIT_EX();
for (int gi = 0; gi < num_used_group; ++gi) { #pragma omp parallel for schedule(static)
OMP_LOOP_EX_BEGIN(); for (int gi = 0; gi < num_used_dense_group; ++gi) {
int group = used_group[gi]; OMP_LOOP_EX_BEGIN();
// feature is not used int group = used_dense_group[gi];
auto data_ptr = hist_data + group_bin_boundaries_[group]; // feature is not used
const int num_bin = feature_groups_[group]->num_total_bin_; auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry)); const int num_bin = feature_groups_[group]->num_total_bin_;
// construct histograms for smaller leaf std::memset(reinterpret_cast<void*>(data_ptr), 0,
if (ref_ordered_bins[group] == nullptr) { num_bin * KHistEntrySize);
// if not use ordered bin // construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram( feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices, data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
0, data_ptr);
num_data, OMP_LOOP_EX_END();
ptr_ordered_grad,
data_ptr);
} else {
// used ordered bin
ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
data_ptr);
}
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
} }
OMP_LOOP_EX_END(); OMP_THROW_EX();
}
OMP_THROW_EX(); } else {
} OMP_INIT_EX();
} else { #pragma omp parallel for schedule(static)
if (!is_constant_hessian) { for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_INIT_EX(); OMP_LOOP_EX_BEGIN();
#pragma omp parallel for schedule(static) int group = used_dense_group[gi];
for (int gi = 0; gi < num_used_group; ++gi) { // feature is not used
OMP_LOOP_EX_BEGIN(); auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
int group = used_group[gi]; const int num_bin = feature_groups_[group]->num_total_bin_;
// feature is not used std::memset(reinterpret_cast<void*>(data_ptr), 0,
auto data_ptr = hist_data + group_bin_boundaries_[group]; num_bin * KHistEntrySize);
const int num_bin = feature_groups_[group]->num_total_bin_; // construct histograms for smaller leaf
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ref_ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram( feature_groups_[group]->bin_data_->ConstructHistogram(
0, data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
num_data, // fixed hessian.
ptr_ordered_grad, for (int i = 0; i < num_bin; ++i) {
ptr_ordered_hess, GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
data_ptr); }
} else { OMP_LOOP_EX_END();
// used ordered bin
ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
} }
OMP_LOOP_EX_END(); OMP_THROW_EX();
} }
OMP_THROW_EX();
} else { } else {
OMP_INIT_EX(); if (!is_constant_hessian) {
#pragma omp parallel for schedule(static) OMP_INIT_EX();
for (int gi = 0; gi < num_used_group; ++gi) { #pragma omp parallel for schedule(static)
OMP_LOOP_EX_BEGIN(); for (int gi = 0; gi < num_used_dense_group; ++gi) {
int group = used_group[gi]; OMP_LOOP_EX_BEGIN();
// feature is not used int group = used_dense_group[gi];
auto data_ptr = hist_data + group_bin_boundaries_[group]; // feature is not used
const int num_bin = feature_groups_[group]->num_total_bin_; auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry)); const int num_bin = feature_groups_[group]->num_total_bin_;
// construct histograms for smaller leaf std::memset(reinterpret_cast<void*>(data_ptr), 0,
if (ref_ordered_bins[group] == nullptr) { num_bin * KHistEntrySize);
// if not use ordered bin // construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram( feature_groups_[group]->bin_data_->ConstructHistogram(
0, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
num_data, OMP_LOOP_EX_END();
ptr_ordered_grad,
data_ptr);
} else {
// used ordered bin
ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
data_ptr);
} }
// fixed hessian. OMP_THROW_EX();
for (int i = 0; i < num_bin; ++i) { } else {
data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0]; OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
0, num_data, ptr_ordered_grad, data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
}
OMP_LOOP_EX_END();
} }
OMP_LOOP_EX_END(); OMP_THROW_EX();
} }
OMP_THROW_EX();
} }
} }
global_timer.Stop("Dataset::dense_bin_histogram");
if (multi_val_groud_id >= 0) {
ConstructHistogramsMultiVal(multi_val_bin, data_indices, num_data, gradients, hessians, is_constant_hessian,
hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
}
} }
void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data, void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const {
HistogramBinEntry* data) const {
const int group = feature2group_[feature_idx]; const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx]; const int sub_feature = feature2subfeature_[feature_idx];
const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get(); const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get();
const int most_freq_bin = bin_mapper->GetMostFreqBin(); const int most_freq_bin = bin_mapper->GetMostFreqBin();
if (most_freq_bin > 0) { if (most_freq_bin > 0) {
const int num_bin = bin_mapper->num_bin(); const int num_bin = bin_mapper->num_bin();
data[most_freq_bin].sum_gradients = sum_gradient; GET_GRAD(data, most_freq_bin) = sum_gradient;
data[most_freq_bin].sum_hessians = sum_hessian; GET_HESS(data, most_freq_bin) = sum_hessian;
data[most_freq_bin].cnt = num_data;
for (int i = 0; i < num_bin; ++i) { for (int i = 0; i < num_bin; ++i) {
if (i != most_freq_bin) { if (i != most_freq_bin) {
data[most_freq_bin].sum_gradients -= data[i].sum_gradients; GET_GRAD(data, most_freq_bin) -= GET_GRAD(data, i);
data[most_freq_bin].sum_hessians -= data[i].sum_hessians; GET_HESS(data, most_freq_bin) -= GET_HESS(data, i);
data[most_freq_bin].cnt -= data[i].cnt;
} }
} }
} }
...@@ -1094,7 +1366,7 @@ void PushClearIfEmpty(std::vector<T>* dest, const size_t dest_len, const std::ve ...@@ -1094,7 +1366,7 @@ void PushClearIfEmpty(std::vector<T>* dest, const size_t dest_len, const std::ve
} }
} }
void Dataset::addFeaturesFrom(Dataset* other) { void Dataset::AddFeaturesFrom(Dataset* other) {
if (other->num_data_ != num_data_) { if (other->num_data_ != num_data_) {
throw std::runtime_error("Cannot add features from other Dataset with a different number of rows"); throw std::runtime_error("Cannot add features from other Dataset with a different number of rows");
} }
......
...@@ -335,8 +335,6 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b ...@@ -335,8 +335,6 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
mem_ptr += sizeof(dataset->use_missing_); mem_ptr += sizeof(dataset->use_missing_);
dataset->zero_as_missing_ = *(reinterpret_cast<const bool*>(mem_ptr)); dataset->zero_as_missing_ = *(reinterpret_cast<const bool*>(mem_ptr));
mem_ptr += sizeof(dataset->zero_as_missing_); mem_ptr += sizeof(dataset->zero_as_missing_);
dataset->sparse_threshold_ = *(reinterpret_cast<const double*>(mem_ptr));
mem_ptr += sizeof(dataset->sparse_threshold_);
const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr); const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
dataset->used_feature_map_.clear(); dataset->used_feature_map_.clear();
for (int i = 0; i < dataset->num_total_features_; ++i) { for (int i = 0; i < dataset->num_total_features_; ++i) {
......
...@@ -31,9 +31,9 @@ class DenseBinIterator: public BinIterator { ...@@ -31,9 +31,9 @@ class DenseBinIterator: public BinIterator {
} }
inline uint32_t RawGet(data_size_t idx) override; inline uint32_t RawGet(data_size_t idx) override;
inline uint32_t Get(data_size_t idx) override; inline uint32_t Get(data_size_t idx) override;
inline void Reset(data_size_t) override { } inline void Reset(data_size_t) override {}
private: private:
const DenseBin<VAL_T>* bin_data_; const DenseBin<VAL_T>* bin_data_;
VAL_T min_bin_; VAL_T min_bin_;
VAL_T max_bin_; VAL_T max_bin_;
...@@ -46,7 +46,7 @@ class DenseBinIterator: public BinIterator { ...@@ -46,7 +46,7 @@ class DenseBinIterator: public BinIterator {
*/ */
template <typename VAL_T> template <typename VAL_T>
class DenseBin: public Bin { class DenseBin: public Bin {
public: public:
friend DenseBinIterator<VAL_T>; friend DenseBinIterator<VAL_T>;
explicit DenseBin(data_size_t num_data) explicit DenseBin(data_size_t num_data)
: num_data_(num_data), data_(num_data_, static_cast<VAL_T>(0)) { : num_data_(num_data), data_(num_data_, static_cast<VAL_T>(0)) {
...@@ -68,84 +68,65 @@ class DenseBin: public Bin { ...@@ -68,84 +68,65 @@ class DenseBin: public Bin {
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, #define ACC_GH(hist, i, g, h) \
const score_t* ordered_gradients, const score_t* ordered_hessians, const auto ti = static_cast<int>(i) << 1; \
HistogramBinEntry* out) const override { hist[ti] += g; \
const data_size_t pf_offset = 64 / sizeof(VAL_T); hist[ti + 1] += h; \
const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
data_size_t i = start; data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + data_indices[i + pf_offset]); if (use_prefetch) {
const VAL_T bin = data_[data_indices[i]]; const data_size_t pf_offset = 64 / sizeof(VAL_T);
out[bin].sum_gradients += ordered_gradients[i]; const data_size_t pf_end = end - pf_offset;
out[bin].sum_hessians += ordered_hessians[i]; for (; i < pf_end; ++i) {
++out[bin].cnt; const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(data_.data() + pf_idx);
const VAL_T bin = data_[idx];
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
}
}
} }
for (; i < end; i++) { for (; i < end; ++i) {
const VAL_T bin = data_[data_indices[i]]; const auto idx = use_indices ? data_indices[i] : i;
out[bin].sum_gradients += ordered_gradients[i]; const VAL_T bin = data_[idx];
out[bin].sum_hessians += ordered_hessians[i]; if (use_hessians) {
++out[bin].cnt; ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
}
} }
} }
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, ordered_gradients, ordered_hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end, void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override { hist_t* out) const override {
const data_size_t pf_offset = 64 / sizeof(VAL_T); ConstructHistogramInner<false, false, true>(nullptr, start, end, ordered_gradients, ordered_hessians, out);
const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + i + pf_offset);
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
} }
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_gradients,
HistogramBinEntry* out) const override { hist_t* out) const override {
const data_size_t pf_offset = 64 / sizeof(VAL_T); ConstructHistogramInner<true, true, false>(data_indices, start, end, ordered_gradients, nullptr, out);
const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + data_indices[i + pf_offset]);
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
} }
void ConstructHistogram(data_size_t start, data_size_t end, void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_gradients,
HistogramBinEntry* out) const override { hist_t* out) const override {
const data_size_t pf_offset = 64 / sizeof(VAL_T); ConstructHistogramInner<false, false, false>(nullptr, start, end, ordered_gradients, nullptr, out);
const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + i + pf_offset);
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
} }
data_size_t Split( data_size_t Split(
...@@ -257,9 +238,6 @@ class DenseBin: public Bin { ...@@ -257,9 +238,6 @@ class DenseBin: public Bin {
data_size_t num_data() const override { return num_data_; } data_size_t num_data() const override { return num_data_; }
/*! \brief not ordered bin for dense feature */
OrderedBin* CreateOrderedBin() const override { return nullptr; }
void FinishLoad() override {} void FinishLoad() override {}
void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override { void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
...@@ -287,17 +265,18 @@ class DenseBin: public Bin { ...@@ -287,17 +265,18 @@ class DenseBin: public Bin {
} }
size_t SizesInByte() const override { size_t SizesInByte() const override {
return sizeof(VAL_T) * num_data_; return sizeof(VAL_T)* num_data_;
} }
DenseBin<VAL_T>* Clone() override; DenseBin<VAL_T>* Clone() override;
private: private:
data_size_t num_data_; data_size_t num_data_;
std::vector<VAL_T> data_; std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, kAlignedSize>> data_;
DenseBin<VAL_T>(const DenseBin<VAL_T>& other) DenseBin<VAL_T>(const DenseBin<VAL_T>& other)
: num_data_(other.num_data_), data_(other.data_){} : num_data_(other.num_data_), data_(other.data_) {
}
}; };
template<typename VAL_T> template<typename VAL_T>
......
...@@ -16,7 +16,7 @@ namespace LightGBM { ...@@ -16,7 +16,7 @@ namespace LightGBM {
class Dense4bitsBin; class Dense4bitsBin;
class Dense4bitsBinIterator : public BinIterator { class Dense4bitsBinIterator : public BinIterator {
public: public:
explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin)
: bin_data_(bin_data), min_bin_(static_cast<uint8_t>(min_bin)), : bin_data_(bin_data), min_bin_(static_cast<uint8_t>(min_bin)),
max_bin_(static_cast<uint8_t>(max_bin)), max_bin_(static_cast<uint8_t>(max_bin)),
...@@ -31,7 +31,7 @@ class Dense4bitsBinIterator : public BinIterator { ...@@ -31,7 +31,7 @@ class Dense4bitsBinIterator : public BinIterator {
inline uint32_t Get(data_size_t idx) override; inline uint32_t Get(data_size_t idx) override;
inline void Reset(data_size_t) override {} inline void Reset(data_size_t) override {}
private: private:
const Dense4bitsBin* bin_data_; const Dense4bitsBin* bin_data_;
uint8_t min_bin_; uint8_t min_bin_;
uint8_t max_bin_; uint8_t max_bin_;
...@@ -40,12 +40,12 @@ class Dense4bitsBinIterator : public BinIterator { ...@@ -40,12 +40,12 @@ class Dense4bitsBinIterator : public BinIterator {
}; };
class Dense4bitsBin : public Bin { class Dense4bitsBin : public Bin {
public: public:
friend Dense4bitsBinIterator; friend Dense4bitsBinIterator;
explicit Dense4bitsBin(data_size_t num_data) explicit Dense4bitsBin(data_size_t num_data)
: num_data_(num_data) { : num_data_(num_data) {
int len = (num_data_ + 1) / 2; int len = (num_data_ + 1) / 2;
data_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0)); data_.resize(len, static_cast<uint8_t>(0));
buf_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0)); buf_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
} }
...@@ -73,88 +73,65 @@ class Dense4bitsBin : public Bin { ...@@ -73,88 +73,65 @@ class Dense4bitsBin : public Bin {
inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, #define ACC_GH(hist, i, g, h) \
const score_t* ordered_gradients, const score_t* ordered_hessians, const auto ti = (i) << 1; \
HistogramBinEntry* out) const override { hist[ti] += g; \
const data_size_t pf_offset = 64; hist[ti + 1] += h; \
const data_size_t pf_end = end - pf_offset - kCacheLineSize;
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
data_size_t i = start; data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1)); if (use_prefetch) {
const data_size_t idx = data_indices[i]; const data_size_t pf_offset = 64;
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const data_size_t pf_end = end - pf_offset;
out[bin].sum_gradients += ordered_gradients[i]; for (; i < pf_end; ++i) {
out[bin].sum_hessians += ordered_hessians[i]; const auto idx = use_indices ? data_indices[i] : i;
++out[bin].cnt; const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(data_.data() + (pf_idx >> 1));
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
}
}
} }
for (; i < end; i++) { for (; i < end; ++i) {
const data_size_t idx = data_indices[i]; const auto idx = use_indices ? data_indices[i] : i;
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i]; if (use_hessians) {
out[bin].sum_hessians += ordered_hessians[i]; ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
++out[bin].cnt; } else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
}
} }
} }
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, ordered_gradients, ordered_hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end, void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override { hist_t* out) const override {
const data_size_t pf_offset = 64; ConstructHistogramInner<false, false, true>(nullptr, start, end, ordered_gradients, ordered_hessians, out);
const data_size_t pf_end = end - pf_offset - kCacheLineSize;
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + ((i + pf_offset) >> 1));
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
} }
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_gradients,
HistogramBinEntry* out) const override { hist_t* out) const override {
const data_size_t pf_offset = 64; ConstructHistogramInner<true, true, false>(data_indices, start, end, ordered_gradients, nullptr, out);
const data_size_t pf_end = end - pf_offset - kCacheLineSize;
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1));
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
} }
void ConstructHistogram(data_size_t start, data_size_t end, void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_gradients,
HistogramBinEntry* out) const override { hist_t* out) const override {
const data_size_t pf_offset = 64; ConstructHistogramInner<false, false, false>(nullptr, start, end, ordered_gradients, nullptr, out);
const data_size_t pf_end = end - pf_offset - kCacheLineSize;
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + ((i + pf_offset) >> 1));
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
} }
data_size_t Split( data_size_t Split(
...@@ -266,8 +243,6 @@ class Dense4bitsBin : public Bin { ...@@ -266,8 +243,6 @@ class Dense4bitsBin : public Bin {
data_size_t num_data() const override { return num_data_; } data_size_t num_data() const override { return num_data_; }
/*! \brief not ordered bin for dense feature */
OrderedBin* CreateOrderedBin() const override { return nullptr; }
void FinishLoad() override { void FinishLoad() override {
if (buf_.empty()) { return; } if (buf_.empty()) { return; }
...@@ -325,19 +300,20 @@ class Dense4bitsBin : public Bin { ...@@ -325,19 +300,20 @@ class Dense4bitsBin : public Bin {
} }
size_t SizesInByte() const override { size_t SizesInByte() const override {
return sizeof(uint8_t) * data_.size(); return sizeof(uint8_t)* data_.size();
} }
Dense4bitsBin* Clone() override { Dense4bitsBin* Clone() override {
return new Dense4bitsBin(*this); return new Dense4bitsBin(*this);
} }
protected: protected:
Dense4bitsBin(const Dense4bitsBin& other) Dense4bitsBin(const Dense4bitsBin& other)
: num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {} : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {
}
data_size_t num_data_; data_size_t num_data_;
std::vector<uint8_t> data_; std::vector<uint8_t, Common::AlignmentAllocator<uint8_t, kAlignedSize>> data_;
std::vector<uint8_t> buf_; std::vector<uint8_t> buf_;
}; };
......
/*!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_
#define LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstdint>
#include <cstring>
#include <vector>
namespace LightGBM {
/*!
 * \brief Dense multi-value bin: stores the bin values of several grouped
 *        features for every row in one flat, row-major VAL_T array of size
 *        num_data x num_feature. Used for row-wise histogram construction.
 */
template <typename VAL_T>
class MultiValDenseBin : public MultiValBin {
 public:
  explicit MultiValDenseBin(data_size_t num_data, int num_bin, int num_feature)
    : num_data_(num_data), num_bin_(num_bin), num_feature_(num_feature) {
    // One slot per (row, feature), zero-initialized.
    data_.resize(static_cast<size_t>(num_data_) * num_feature_, static_cast<VAL_T>(0));
  }

  ~MultiValDenseBin() {
  }

  /*! \brief Number of rows stored in this bin. */
  data_size_t num_data() const override {
    return num_data_;
  }

  /*! \brief Total number of histogram bins across all grouped features. */
  int num_bin() const override {
    return num_bin_;
  }

  /*!
   * \brief Write one full row of bin values.
   * The thread-id parameter is unused here: rows never overlap, so
   * concurrent writes to distinct rows are safe without staging buffers.
   * \param idx    row index
   * \param values one bin value per feature; size must equal num_feature_
   */
  void PushOneRow(int , data_size_t idx, const std::vector<uint32_t>& values) override {
    auto start = RowPtr(idx);
    CHECK(num_feature_ == static_cast<int>(values.size()));
    for (auto i = 0; i < num_feature_; ++i) {
      data_[start + i] = static_cast<VAL_T>(values[i]);
    }
  }

  // Nothing to finalize: data_ is written in place during loading.
  void FinishLoad() override {
  }

  bool IsSparse() override{
    return false;
  }

  // Only updates the logical row count; the backing buffer is left as-is.
  void ReSize(data_size_t num_data) override {
    if (num_data_ != num_data) {
      num_data_ = num_data;
    }
  }

// Accumulate gradient g and hessian h into the interleaved histogram layout:
// hist[2*i] is the gradient sum of bin i, hist[2*i + 1] is its hessian sum.
#define ACC_GH(hist, i, g, h) \
  const auto ti = static_cast<int>(i) << 1; \
  hist[ti] += g; \
  hist[ti + 1] += h; \

  /*!
   * \brief Core histogram kernel, specialized at compile time.
   * \tparam use_indices  gather rows through data_indices (bagging / leaf subset)
   * \tparam use_hessians accumulate real hessians; otherwise a constant 1.0f,
   *                      so the hessian slot acts as a data count
   * \tparam use_prefetch issue software prefetches a fixed distance ahead
   */
  template<bool use_indices, bool use_prefetch, bool use_hessians>
  void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* gradients, const score_t* hessians, hist_t* out) const {
    data_size_t i = start;
    if (use_prefetch) {
      const data_size_t pf_offset = 32 / sizeof(VAL_T);
      const data_size_t pf_end = end - pf_offset;
      // Main loop: prefetch the gradient/hessian entries and the row's bin
      // data pf_offset iterations ahead to hide memory latency.
      for (; i < pf_end; ++i) {
        const auto idx = use_indices ? data_indices[i] : i;
        const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
        PREFETCH_T0(gradients + pf_idx);
        if (use_hessians) {
          PREFETCH_T0(hessians + pf_idx);
        }
        PREFETCH_T0(data_.data() + RowPtr(pf_idx));
        const auto j_start = RowPtr(idx);
        for (auto j = j_start; j < j_start + num_feature_; ++j) {
          const VAL_T bin = data_[j];
          if (use_hessians) {
            ACC_GH(out, bin, gradients[idx], hessians[idx]);
          } else {
            ACC_GH(out, bin, gradients[idx], 1.0f);
          }
        }
      }
    }
    // Tail loop (or the whole range when prefetching is disabled).
    for (; i < end; ++i) {
      const auto idx = use_indices ? data_indices[i] : i;
      const auto j_start = RowPtr(idx);
      for (auto j = j_start; j < j_start + num_feature_; ++j) {
        const VAL_T bin = data_[j];
        if (use_hessians) {
          ACC_GH(out, bin, gradients[idx], hessians[idx]);
        } else {
          ACC_GH(out, bin, gradients[idx], 1.0f);
        }
      }
    }
  }
#undef ACC_GH

  // With row subset, with hessians.
  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* gradients, const score_t* hessians,
    hist_t* out) const override {
    ConstructHistogramInner<true, true, true>(data_indices, start, end, gradients, hessians, out);
  }

  // Contiguous range, with hessians.
  void ConstructHistogram(data_size_t start, data_size_t end,
    const score_t* gradients, const score_t* hessians,
    hist_t* out) const override {
    ConstructHistogramInner<false, false, true>(nullptr, start, end, gradients, hessians, out);
  }

  // With row subset, constant-hessian (counts) variant.
  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* gradients,
    hist_t* out) const override {
    ConstructHistogramInner<true, true, false>(data_indices, start, end, gradients, nullptr, out);
  }

  // Contiguous range, constant-hessian (counts) variant.
  void ConstructHistogram(data_size_t start, data_size_t end,
    const score_t* gradients,
    hist_t* out) const override {
    ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out);
  }

  /*!
   * \brief Rebuild data_ from the selected rows of another dense multi-val bin.
   * NOTE(review): num_data_/num_feature_ are not updated here — presumably the
   * caller constructs/ReSize()s this object accordingly; confirm with callers.
   */
  void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
    auto other_bin = dynamic_cast<const MultiValDenseBin<VAL_T>*>(full_bin);
    data_.clear();
    for (data_size_t i = 0; i < num_used_indices; ++i) {
      for (int64_t j = other_bin->RowPtr(used_indices[i]); j < other_bin->RowPtr(used_indices[i] + 1); ++j) {
        data_.push_back(other_bin->data_[j]);
      }
    }
  }

  /*! \brief Offset of row idx inside data_ (row-major layout). */
  inline int64_t RowPtr(data_size_t idx) const {
    return static_cast<int64_t>(idx) * num_feature_;
  }

  MultiValDenseBin<VAL_T>* Clone() override;

 private:
  data_size_t num_data_;
  int num_bin_;
  int num_feature_;
  // 32-byte aligned for vectorized access.
  std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, 32>> data_;

  // Private copy constructor, used by Clone().
  MultiValDenseBin<VAL_T>(const MultiValDenseBin<VAL_T>& other)
    : num_data_(other.num_data_), num_bin_(other.num_bin_), num_feature_(other.num_feature_), data_(other.data_) {
  }
};
// Deep-copies this bin through its private copy constructor.
template<typename VAL_T>
MultiValDenseBin<VAL_T>* MultiValDenseBin<VAL_T>::Clone() {
  auto* copy = new MultiValDenseBin<VAL_T>(*this);
  return copy;
}
} // namespace LightGBM
#endif // LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_
/*!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstdint>
#include <cstring>
#include <vector>
namespace LightGBM {
/*!
 * \brief Sparse multi-value bin: CSR-style storage. row_ptr_[i] is the offset
 *        of row i inside data_, which holds only the recorded bin values of
 *        each row. Rows are staged per-thread during loading and merged in
 *        FinishLoad().
 */
template <typename VAL_T>
class MultiValSparseBin : public MultiValBin {
 public:
  explicit MultiValSparseBin(data_size_t num_data, int num_bin)
    : num_data_(num_data), num_bin_(num_bin) {
    row_ptr_.resize(num_data_ + 1, 0);
    data_.reserve(num_data_);
    // Query the OpenMP thread count so every non-master thread gets its own
    // staging buffer for lock-free pushes during loading.
    int num_threads = 1;
    #pragma omp parallel
    #pragma omp master
    {
      num_threads = omp_get_num_threads();
    }
    if (num_threads > 1) {
      t_data_.resize(num_threads - 1);
    }
  }

  ~MultiValSparseBin() {
  }

  /*! \brief Number of rows stored in this bin. */
  data_size_t num_data() const override {
    return num_data_;
  }

  /*! \brief Total number of histogram bins across all grouped features. */
  int num_bin() const override {
    return num_bin_;
  }

  /*!
   * \brief Append one row's bin values.
   * row_ptr_[idx + 1] temporarily holds the row LENGTH; FinishLoad() turns
   * these lengths into prefix-sum offsets. Thread 0 appends directly into
   * data_, other threads into their own staging buffer.
   * NOTE(review): the tid-order concatenation in FinishLoad() assumes threads
   * load contiguous row ranges in tid order — confirm against the loader.
   */
  void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t> & values) override {
    row_ptr_[idx + 1] = static_cast<data_size_t>(values.size());
    if (tid == 0) {
      for (auto val : values) {
        data_.push_back(static_cast<VAL_T>(val));
      }
    } else {
      for (auto val : values) {
        t_data_[tid - 1].push_back(static_cast<VAL_T>(val));
      }
    }
  }

  /*!
   * \brief Finalize loading: prefix-sum the row lengths into offsets, append
   *        the per-thread staging buffers after thread 0's data, and release
   *        all spare memory.
   */
  void FinishLoad() override {
    // Turn per-row lengths into cumulative start offsets.
    for (data_size_t i = 0; i < num_data_; ++i) {
      row_ptr_[i + 1] += row_ptr_[i];
    }
    if (t_data_.size() > 0) {
      size_t offset = data_.size();
      data_.resize(row_ptr_[num_data_]);
      for (size_t tid = 0; tid < t_data_.size(); ++tid) {
        std::memcpy(data_.data() + offset, t_data_[tid].data(), t_data_[tid].size() * sizeof(VAL_T));
        offset += t_data_[tid].size();
        t_data_[tid].clear();
      }
    }
    // Staging buffers are no longer needed after the merge.
    row_ptr_.shrink_to_fit();
    data_.shrink_to_fit();
    t_data_.clear();
    t_data_.shrink_to_fit();
  }

  bool IsSparse() override {
    return true;
  }

  // Only updates the logical row count; storage is not reshaped here.
  void ReSize(data_size_t num_data) override {
    if (num_data_ != num_data) {
      num_data_ = num_data;
    }
  }

// Accumulate gradient g and hessian h into the interleaved histogram layout:
// hist[2*i] is the gradient sum of bin i, hist[2*i + 1] is its hessian sum.
#define ACC_GH(hist, i, g, h) \
  const auto ti = static_cast<int>(i) << 1; \
  hist[ti] += g; \
  hist[ti + 1] += h; \

  /*!
   * \brief Core histogram kernel, specialized at compile time.
   * \tparam use_indices  gather rows through data_indices (bagging / leaf subset)
   * \tparam use_hessians accumulate real hessians; otherwise a constant 1.0f,
   *                      so the hessian slot acts as a data count
   * \tparam use_prefetch issue software prefetches a fixed distance ahead
   */
  template<bool use_indices, bool use_prefetch, bool use_hessians>
  void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* gradients, const score_t* hessians, hist_t* out) const {
    data_size_t i = start;
    if (use_prefetch) {
      const data_size_t pf_offset = 32 / sizeof(VAL_T);
      const data_size_t pf_end = end - pf_offset;
      // Main loop: prefetch gradients/hessians, the row pointer, and the
      // row's bin data pf_offset iterations ahead.
      for (; i < pf_end; ++i) {
        const auto idx = use_indices ? data_indices[i] : i;
        const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
        PREFETCH_T0(gradients + pf_idx);
        if (use_hessians) {
          PREFETCH_T0(hessians + pf_idx);
        }
        PREFETCH_T0(row_ptr_.data() + pf_idx);
        PREFETCH_T0(data_.data() + row_ptr_[pf_idx]);
        const auto j_start = RowPtr(idx);
        const auto j_end = RowPtr(idx + 1);
        for (auto j = j_start; j < j_end; ++j) {
          const VAL_T bin = data_[j];
          if (use_hessians) {
            ACC_GH(out, bin, gradients[idx], hessians[idx]);
          } else {
            ACC_GH(out, bin, gradients[idx], 1.0f);
          }
        }
      }
    }
    // Tail loop (or the whole range when prefetching is disabled).
    for (; i < end; ++i) {
      const auto idx = use_indices ? data_indices[i] : i;
      const auto j_start = RowPtr(idx);
      const auto j_end = RowPtr(idx + 1);
      for (auto j = j_start; j < j_end; ++j) {
        const VAL_T bin = data_[j];
        if (use_hessians) {
          ACC_GH(out, bin, gradients[idx], hessians[idx]);
        } else {
          ACC_GH(out, bin, gradients[idx], 1.0f);
        }
      }
    }
  }
#undef ACC_GH

  // With row subset, with hessians.
  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* gradients, const score_t* hessians,
    hist_t* out) const override {
    ConstructHistogramInner<true, true, true>(data_indices, start, end, gradients, hessians, out);
  }

  // Contiguous range, with hessians.
  void ConstructHistogram(data_size_t start, data_size_t end,
    const score_t* gradients, const score_t* hessians,
    hist_t* out) const override {
    ConstructHistogramInner<false, false, true>(nullptr, start, end, gradients, hessians, out);
  }

  // With row subset, constant-hessian (counts) variant.
  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
    const score_t* gradients,
    hist_t* out) const override {
    ConstructHistogramInner<true, true, false>(data_indices, start, end, gradients, nullptr, out);
  }

  // Contiguous range, constant-hessian (counts) variant.
  void ConstructHistogram(data_size_t start, data_size_t end,
    const score_t* gradients,
    hist_t* out) const override {
    ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out);
  }

  /*!
   * \brief Rebuild data_ and row_ptr_ from the selected rows of another
   *        sparse multi-val bin; row i of this bin becomes row
   *        used_indices[i] of the source.
   */
  void CopySubset(const Bin * full_bin, const data_size_t * used_indices, data_size_t num_used_indices) override {
    auto other_bin = dynamic_cast<const MultiValSparseBin<VAL_T>*>(full_bin);
    row_ptr_.resize(num_data_ + 1, 0);
    data_.clear();
    for (data_size_t i = 0; i < num_used_indices; ++i) {
      for (data_size_t j = other_bin->row_ptr_[used_indices[i]]; j < other_bin->row_ptr_[used_indices[i] + 1]; ++j) {
        data_.push_back(other_bin->data_[j]);
      }
      row_ptr_[i + 1] = row_ptr_[i] + other_bin->row_ptr_[used_indices[i] + 1] - other_bin->row_ptr_[used_indices[i]];
    }
  }

  /*! \brief Start offset of row idx inside data_. */
  inline data_size_t RowPtr(data_size_t idx) const {
    return row_ptr_[idx];
  }

  MultiValSparseBin<VAL_T>* Clone() override;

 private:
  data_size_t num_data_;
  int num_bin_;
  // 32-byte aligned for vectorized access.
  std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, 32>> data_;
  std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, 32>> row_ptr_;
  /*! \brief Per-thread staging buffers, only used while loading. */
  std::vector<std::vector<VAL_T>> t_data_;

  // Private copy constructor, used by Clone(); the loading-time staging
  // buffers (t_data_) are intentionally not copied.
  MultiValSparseBin<VAL_T>(const MultiValSparseBin<VAL_T> & other)
    : num_data_(other.num_data_), num_bin_(other.num_bin_), data_(other.data_), row_ptr_(other.row_ptr_) {
  }
};
// Deep-copies this bin through its private copy constructor.
template<typename VAL_T>
MultiValSparseBin<VAL_T>* MultiValSparseBin<VAL_T>::Clone() {
  auto* copy = new MultiValSparseBin<VAL_T>(*this);
  return copy;
}
} // namespace LightGBM
#endif // LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_
/*!
* Copyright (c) 2016 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <mutex>
#include <utility>
#include <vector>
#include "sparse_bin.hpp"
namespace LightGBM {
/*!
* \brief Interface for ordered bin data; efficient for histogram construction, especially for sparse bins.
* Using an ordered bin has two advantages:
* 1. it groups the data by leaf, which improves the cache hit rate;
* 2. it only stores the non-zero bins, which speeds up histogram construction for sparse features.
* However, it brings an additional cost: the bins must be re-ordered after every split, which is expensive for dense features.
* So we only use ordered bins in sparse situations.
*/
/*!
 * \brief Ordered-bin view over a SparseBin: stores the (row, bin) pairs of
 *        all non-zero entries, kept contiguous per leaf so histogram
 *        construction walks memory sequentially.
 */
template <typename VAL_T>
class OrderedSparseBin: public OrderedBin {
 public:
  /*! \brief Pair to store one bin entry */
  struct SparsePair {
    data_size_t ridx;  // data(row) index
    VAL_T bin;  // bin for this data
    SparsePair() : ridx(0), bin(0) {}
  };

  explicit OrderedSparseBin(const SparseBin<VAL_T>* bin_data)
    :bin_data_(bin_data) {
    // Walk the delta-encoded sparse bin once just to count non-zero entries
    // and pre-size ordered_pair_.
    data_size_t cur_pos = 0;
    data_size_t i_delta = -1;
    int non_zero_cnt = 0;
    while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
      ++non_zero_cnt;
    }
    ordered_pair_.resize(non_zero_cnt);
    leaf_cnt_.push_back(non_zero_cnt);
  }

  ~OrderedSparseBin() {
  }

  /*!
   * \brief Re-fill ordered_pair_ from the underlying sparse bin at the start
   *        of an iteration; all pairs land in leaf 0 (the root).
   * \param used_idices per-row membership flags (nullptr means use all rows,
   *                    e.g. no bagging)
   * \param num_leaves  number of leaves to reserve boundary slots for
   */
  void Init(const char* used_idices, int num_leaves) override {
    // initialize the leaf information
    leaf_start_ = std::vector<data_size_t>(num_leaves, 0);
    leaf_cnt_ = std::vector<data_size_t>(num_leaves, 0);
    if (used_idices == nullptr) {
      // if using all data, copy all non-zero pair
      data_size_t j = 0;
      data_size_t cur_pos = 0;
      data_size_t i_delta = -1;
      while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
        ordered_pair_[j].ridx = cur_pos;
        ordered_pair_[j].bin = bin_data_->vals_[i_delta];
        ++j;
      }
      leaf_cnt_[0] = static_cast<data_size_t>(j);
    } else {
      // if using part of data(bagging)
      data_size_t j = 0;
      data_size_t cur_pos = 0;
      data_size_t i_delta = -1;
      while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
        if (used_idices[cur_pos]) {
          ordered_pair_[j].ridx = cur_pos;
          ordered_pair_[j].bin = bin_data_->vals_[i_delta];
          ++j;
        }
      }
      leaf_cnt_[0] = j;
    }
  }

  /*!
   * \brief Accumulate gradients, hessians and counts of one leaf's entries
   *        into per-bin histogram slots.
   */
  void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian,
    HistogramBinEntry* out) const override {
    // get current leaf boundary
    const data_size_t start = leaf_start_[leaf];
    const data_size_t end = start + leaf_cnt_[leaf];
    for (data_size_t i = start; i < end; ++i) {
      const VAL_T bin = ordered_pair_[i].bin;
      const auto g = gradient[ordered_pair_[i].ridx];
      const auto h = hessian[ordered_pair_[i].ridx];
      out[bin].sum_gradients += g;
      out[bin].sum_hessians += h;
      ++out[bin].cnt;
    }
  }

  /*! \brief Gradient-only variant: accumulates gradients and counts only. */
  void ConstructHistogram(int leaf, const score_t* gradient,
    HistogramBinEntry* out) const override {
    // get current leaf boundary
    const data_size_t start = leaf_start_[leaf];
    const data_size_t end = start + leaf_cnt_[leaf];
    for (data_size_t i = start; i < end; ++i) {
      const VAL_T bin = ordered_pair_[i].bin;
      const auto g = gradient[ordered_pair_[i].ridx];
      out[bin].sum_gradients += g;
      ++out[bin].cnt;
    }
  }

  /*!
   * \brief After a tree split, partition the parent leaf's pairs in place:
   *        rows whose flag equals `mark` are swapped to the front (they stay
   *        in `leaf`), the rest become the new `right_leaf` range.
   */
  void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) override {
    // get current leaf boundary
    const data_size_t l_start = leaf_start_[leaf];
    const data_size_t l_end = l_start + leaf_cnt_[leaf];
    // new left leaf end after split
    data_size_t new_left_end = l_start;

    for (data_size_t i = l_start; i < l_end; ++i) {
      if (is_in_leaf[ordered_pair_[i].ridx] == mark) {
        std::swap(ordered_pair_[new_left_end], ordered_pair_[i]);
        ++new_left_end;
      }
    }

    leaf_start_[right_leaf] = new_left_end;
    leaf_cnt_[leaf] = new_left_end - l_start;
    leaf_cnt_[right_leaf] = l_end - new_left_end;
  }

  /*! \brief Number of non-zero entries currently assigned to a leaf. */
  data_size_t NonZeroCount(int leaf) const override {
    return static_cast<data_size_t>(leaf_cnt_[leaf]);
  }

  /*! \brief Disable copy */
  OrderedSparseBin<VAL_T>& operator=(const OrderedSparseBin<VAL_T>&) = delete;
  /*! \brief Disable copy */
  OrderedSparseBin<VAL_T>(const OrderedSparseBin<VAL_T>&) = delete;

 private:
  const SparseBin<VAL_T>* bin_data_;
  /*! \brief Store non-zero pair , group by leaf */
  std::vector<SparsePair> ordered_pair_;
  /*! \brief leaf_start_[i] means data in i-th leaf start from */
  std::vector<data_size_t> leaf_start_;
  /*! \brief leaf_cnt_[i] means number of data in i-th leaf */
  std::vector<data_size_t> leaf_cnt_;
};
// Builds the leaf-ordered view over this sparse feature.
template <typename VAL_T>
OrderedBin* SparseBin<VAL_T>::CreateOrderedBin() const {
  auto* ordered = new OrderedSparseBin<VAL_T>(this);
  return ordered;
}
} // namespace LightGBM
#endif  // LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
...@@ -24,7 +24,7 @@ const size_t kNumFastIndex = 64; ...@@ -24,7 +24,7 @@ const size_t kNumFastIndex = 64;
template <typename VAL_T> template <typename VAL_T>
class SparseBinIterator: public BinIterator { class SparseBinIterator: public BinIterator {
public: public:
SparseBinIterator(const SparseBin<VAL_T>* bin_data, SparseBinIterator(const SparseBin<VAL_T>* bin_data,
uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin)
: bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)), : bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
...@@ -56,7 +56,7 @@ class SparseBinIterator: public BinIterator { ...@@ -56,7 +56,7 @@ class SparseBinIterator: public BinIterator {
inline void Reset(data_size_t idx) override; inline void Reset(data_size_t idx) override;
private: private:
const SparseBin<VAL_T>* bin_data_; const SparseBin<VAL_T>* bin_data_;
data_size_t cur_pos_; data_size_t cur_pos_;
data_size_t i_delta_; data_size_t i_delta_;
...@@ -66,20 +66,16 @@ class SparseBinIterator: public BinIterator { ...@@ -66,20 +66,16 @@ class SparseBinIterator: public BinIterator {
uint8_t offset_; uint8_t offset_;
}; };
template <typename VAL_T>
class OrderedSparseBin;
template <typename VAL_T> template <typename VAL_T>
class SparseBin: public Bin { class SparseBin: public Bin {
public: public:
friend class SparseBinIterator<VAL_T>; friend class SparseBinIterator<VAL_T>;
friend class OrderedSparseBin<VAL_T>;
explicit SparseBin(data_size_t num_data) explicit SparseBin(data_size_t num_data)
: num_data_(num_data) { : num_data_(num_data) {
int num_threads = 1; int num_threads = 1;
#pragma omp parallel #pragma omp parallel
#pragma omp master #pragma omp master
{ {
num_threads = omp_get_num_threads(); num_threads = omp_get_num_threads();
} }
...@@ -102,41 +98,97 @@ class SparseBin: public Bin { ...@@ -102,41 +98,97 @@ class SparseBin: public Bin {
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override; BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
void ConstructHistogram(const data_size_t*, data_size_t, data_size_t, const score_t*, #define ACC_GH(hist, i, g, h) \
const score_t*, HistogramBinEntry*) const override { const auto ti = static_cast<int>(i) << 1; \
// Will use OrderedSparseBin->ConstructHistogram() instead hist[ti] += g; \
Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); hist[ti + 1] += h; \
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
data_size_t i_delta, cur_pos;
InitIndex(data_indices[start], &i_delta, &cur_pos);
data_size_t i = start;
for (;;) {
if (cur_pos < data_indices[i]) {
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
} else if (cur_pos > data_indices[i]) {
if (++i >= end) { break; }
} else {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
if (++i >= end) { break; }
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
}
}
} }
void ConstructHistogram(data_size_t, data_size_t, const score_t*, void ConstructHistogram(data_size_t start, data_size_t end,
const score_t*, HistogramBinEntry*) const override { const score_t* ordered_gradients, const score_t* ordered_hessians,
// Will use OrderedSparseBin->ConstructHistogram() instead hist_t* out) const override {
Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); data_size_t i_delta, cur_pos;
InitIndex(start, &i_delta, &cur_pos);
while (cur_pos < start && i_delta < num_vals_) {
cur_pos += deltas_[++i_delta];
}
while (cur_pos < end && i_delta < num_vals_) {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[cur_pos], ordered_hessians[cur_pos]);
cur_pos += deltas_[++i_delta];
}
} }
void ConstructHistogram(const data_size_t*, data_size_t, data_size_t, const score_t*, void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
HistogramBinEntry*) const override { const score_t* ordered_gradients,
// Will use OrderedSparseBin->ConstructHistogram() instead hist_t* out) const override {
Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); data_size_t i_delta, cur_pos;
InitIndex(data_indices[start], &i_delta, &cur_pos);
data_size_t i = start;
for (;;) {
if (cur_pos < data_indices[i]) {
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
} else if (cur_pos > data_indices[i]) {
if (++i >= end) { break; }
} else {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
if (++i >= end) { break; }
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
}
}
} }
void ConstructHistogram(data_size_t, data_size_t, const score_t*, void ConstructHistogram(data_size_t start, data_size_t end,
HistogramBinEntry*) const override { const score_t* ordered_gradients,
// Will use OrderedSparseBin->ConstructHistogram() instead hist_t* out) const override {
Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); data_size_t i_delta, cur_pos;
InitIndex(start, &i_delta, &cur_pos);
while (cur_pos < start && i_delta < num_vals_) {
cur_pos += deltas_[++i_delta];
}
while (cur_pos < end && i_delta < num_vals_) {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[cur_pos], 1.0f);
cur_pos += deltas_[++i_delta];
}
} }
#undef ACC_GH
inline bool NextNonzero(data_size_t* i_delta, inline void NextNonzeroFast(data_size_t* i_delta,
data_size_t* cur_pos) const { data_size_t* cur_pos) const {
++(*i_delta); *cur_pos += deltas_[++(*i_delta)];
data_size_t shift = 0; if (*i_delta >= num_vals_) {
data_size_t delta = deltas_[*i_delta]; *cur_pos = num_data_;
while (*i_delta < num_vals_ && vals_[*i_delta] == 0) {
++(*i_delta);
shift += 8;
delta |= static_cast<data_size_t>(deltas_[*i_delta]) << shift;
} }
*cur_pos += delta; }
inline bool NextNonzero(data_size_t* i_delta,
data_size_t* cur_pos) const {
*cur_pos += deltas_[++(*i_delta)];
if (*i_delta < num_vals_) { if (*i_delta < num_vals_) {
return true; return true;
} else { } else {
...@@ -257,8 +309,6 @@ class SparseBin: public Bin { ...@@ -257,8 +309,6 @@ class SparseBin: public Bin {
data_size_t num_data() const override { return num_data_; } data_size_t num_data() const override { return num_data_; }
OrderedBin* CreateOrderedBin() const override;
void FinishLoad() override { void FinishLoad() override {
// get total non zero size // get total non zero size
size_t pair_cnt = 0; size_t pair_cnt = 0;
...@@ -276,8 +326,8 @@ class SparseBin: public Bin { ...@@ -276,8 +326,8 @@ class SparseBin: public Bin {
// sort by data index // sort by data index
std::sort(idx_val_pairs.begin(), idx_val_pairs.end(), std::sort(idx_val_pairs.begin(), idx_val_pairs.end(),
[](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) { [](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) {
return a.first < b.first; return a.first < b.first;
}); });
// load delta array // load delta array
LoadFromPair(idx_val_pairs); LoadFromPair(idx_val_pairs);
} }
...@@ -291,11 +341,12 @@ class SparseBin: public Bin { ...@@ -291,11 +341,12 @@ class SparseBin: public Bin {
const data_size_t cur_idx = idx_val_pairs[i].first; const data_size_t cur_idx = idx_val_pairs[i].first;
const VAL_T bin = idx_val_pairs[i].second; const VAL_T bin = idx_val_pairs[i].second;
data_size_t cur_delta = cur_idx - last_idx; data_size_t cur_delta = cur_idx - last_idx;
// disallow the multi-val in one row
if (i > 0 && cur_delta == 0) { continue; } if (i > 0 && cur_delta == 0) { continue; }
while (cur_delta >= 256) { while (cur_delta >= 256) {
deltas_.push_back(cur_delta & 0xff); deltas_.push_back(255);
vals_.push_back(0); vals_.push_back(0);
cur_delta >>= 8; cur_delta -= 255;
} }
deltas_.push_back(static_cast<uint8_t>(cur_delta)); deltas_.push_back(static_cast<uint8_t>(cur_delta));
vals_.push_back(bin); vals_.push_back(bin);
...@@ -384,7 +435,7 @@ class SparseBin: public Bin { ...@@ -384,7 +435,7 @@ class SparseBin: public Bin {
while (cur_pos < idx && j < num_vals_) { while (cur_pos < idx && j < num_vals_) {
NextNonzero(&j, &cur_pos); NextNonzero(&j, &cur_pos);
} }
if (cur_pos == idx && j < num_vals_) { if (cur_pos == idx && j < num_vals_ && vals_[j] > 0) {
// new row index is i // new row index is i
tmp_pair.emplace_back(i, vals_[j]); tmp_pair.emplace_back(i, vals_[j]);
} }
...@@ -405,13 +456,13 @@ class SparseBin: public Bin { ...@@ -405,13 +456,13 @@ class SparseBin: public Bin {
// transform to delta array // transform to delta array
data_size_t last_idx = 0; data_size_t last_idx = 0;
for (data_size_t i = 0; i < num_used_indices; ++i) { for (data_size_t i = 0; i < num_used_indices; ++i) {
VAL_T bin = iterator.InnerRawGet(used_indices[i]); auto bin = iterator.InnerRawGet(used_indices[i]);
if (bin > 0) { if (bin > 0) {
data_size_t cur_delta = i - last_idx; data_size_t cur_delta = i - last_idx;
while (cur_delta >= 256) { while (cur_delta >= 256) {
deltas_.push_back(cur_delta & 0xff); deltas_.push_back(255);
vals_.push_back(0); vals_.push_back(0);
cur_delta >>= 8; cur_delta -= 255;
} }
deltas_.push_back(static_cast<uint8_t>(cur_delta)); deltas_.push_back(static_cast<uint8_t>(cur_delta));
vals_.push_back(bin); vals_.push_back(bin);
...@@ -432,15 +483,29 @@ class SparseBin: public Bin { ...@@ -432,15 +483,29 @@ class SparseBin: public Bin {
SparseBin<VAL_T>* Clone() override; SparseBin<VAL_T>* Clone() override;
protected:
SparseBin<VAL_T>(const SparseBin<VAL_T>& other) SparseBin<VAL_T>(const SparseBin<VAL_T>& other)
: num_data_(other.num_data_), deltas_(other.deltas_), vals_(other.vals_), : num_data_(other.num_data_), deltas_(other.deltas_), vals_(other.vals_),
num_vals_(other.num_vals_), push_buffers_(other.push_buffers_), num_vals_(other.num_vals_), push_buffers_(other.push_buffers_),
fast_index_(other.fast_index_), fast_index_shift_(other.fast_index_shift_) {} fast_index_(other.fast_index_), fast_index_shift_(other.fast_index_shift_) {
}
void InitIndex(data_size_t start_idx, data_size_t * i_delta, data_size_t * cur_pos) const {
auto idx = start_idx >> fast_index_shift_;
if (static_cast<size_t>(idx) < fast_index_.size()) {
const auto fast_pair = fast_index_[start_idx >> fast_index_shift_];
*i_delta = fast_pair.first;
*cur_pos = fast_pair.second;
} else {
*i_delta = -1;
*cur_pos = 0;
}
}
private:
data_size_t num_data_; data_size_t num_data_;
std::vector<uint8_t> deltas_; std::vector<uint8_t, Common::AlignmentAllocator<uint8_t, kAlignedSize>> deltas_;
std::vector<VAL_T> vals_; std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, kAlignedSize>> vals_;
data_size_t num_vals_; data_size_t num_vals_;
std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_; std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_;
std::vector<std::pair<data_size_t, data_size_t>> fast_index_; std::vector<std::pair<data_size_t, data_size_t>> fast_index_;
...@@ -460,7 +525,7 @@ inline uint32_t SparseBinIterator<VAL_T>::RawGet(data_size_t idx) { ...@@ -460,7 +525,7 @@ inline uint32_t SparseBinIterator<VAL_T>::RawGet(data_size_t idx) {
template <typename VAL_T> template <typename VAL_T>
inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(data_size_t idx) { inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(data_size_t idx) {
while (cur_pos_ < idx) { while (cur_pos_ < idx) {
bin_data_->NextNonzero(&i_delta_, &cur_pos_); bin_data_->NextNonzeroFast(&i_delta_, &cur_pos_);
} }
if (cur_pos_ == idx) { if (cur_pos_ == idx) {
return bin_data_->vals_[i_delta_]; return bin_data_->vals_[i_delta_];
...@@ -471,15 +536,7 @@ inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(data_size_t idx) { ...@@ -471,15 +536,7 @@ inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(data_size_t idx) {
template <typename VAL_T> template <typename VAL_T>
inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) { inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
auto idx = start_idx >> bin_data_->fast_index_shift_; bin_data_->InitIndex(start_idx, &i_delta_, &cur_pos_);
if (static_cast<size_t>(idx) < bin_data_->fast_index_.size()) {
const auto fast_pair = bin_data_->fast_index_[start_idx >> bin_data_->fast_index_shift_];
i_delta_ = fast_pair.first;
cur_pos_ = fast_pair.second;
} else {
i_delta_ = -1;
cur_pos_ = 0;
}
} }
template <typename VAL_T> template <typename VAL_T>
......
...@@ -73,9 +73,9 @@ class RankXENDCG: public ObjectiveFunction { ...@@ -73,9 +73,9 @@ class RankXENDCG: public ObjectiveFunction {
// Skip query if sum of labels is 0. // Skip query if sum of labels is 0.
float sum_labels = 0; float sum_labels = 0;
for (data_size_t i = 0; i < cnt; ++i) { for (data_size_t i = 0; i < cnt; ++i) {
sum_labels += phi(label[i], gammas[i]); sum_labels += static_cast<float>(phi(label[i], gammas[i]));
} }
if (sum_labels == 0) { if (std::fabs(sum_labels) < kEpsilon) {
return; return;
} }
...@@ -111,7 +111,7 @@ class RankXENDCG: public ObjectiveFunction { ...@@ -111,7 +111,7 @@ class RankXENDCG: public ObjectiveFunction {
} }
double phi(const label_t l, double g) const { double phi(const label_t l, double g) const {
return Common::Pow(2, l) - g; return Common::Pow(2, static_cast<int>(l)) - g;
} }
const char* GetName() const override { const char* GetName() const override {
......
...@@ -27,7 +27,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, boo ...@@ -27,7 +27,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, boo
rank_ = Network::rank(); rank_ = Network::rank();
num_machines_ = Network::num_machines(); num_machines_ = Network::num_machines();
// allocate buffer for communication // allocate buffer for communication
size_t buffer_size = this->train_data_->NumTotalBin() * sizeof(HistogramBinEntry); size_t buffer_size = this->train_data_->NumTotalBin() * KHistEntrySize;
input_buffer_.resize(buffer_size); input_buffer_.resize(buffer_size);
output_buffer_.resize(buffer_size); output_buffer_.resize(buffer_size);
...@@ -82,7 +82,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() { ...@@ -82,7 +82,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
num_bin -= 1; num_bin -= 1;
} }
block_len_[i] += num_bin * sizeof(HistogramBinEntry); block_len_[i] += num_bin * KHistEntrySize;
} }
reduce_scatter_size_ += block_len_[i]; reduce_scatter_size_ += block_len_[i];
} }
...@@ -101,7 +101,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() { ...@@ -101,7 +101,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
num_bin -= 1; num_bin -= 1;
} }
bin_size += num_bin * sizeof(HistogramBinEntry); bin_size += num_bin * KHistEntrySize;
} }
} }
...@@ -113,7 +113,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() { ...@@ -113,7 +113,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
num_bin -= 1; num_bin -= 1;
} }
bin_size += num_bin * sizeof(HistogramBinEntry); bin_size += num_bin * KHistEntrySize;
} }
// sync global data sumup info // sync global data sumup info
...@@ -158,8 +158,8 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() { ...@@ -158,8 +158,8 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
this->smaller_leaf_histogram_array_[feature_index].SizeOfHistgram()); this->smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
} }
// Reduce scatter for histogram // Reduce scatter for histogram
Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(), Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(),
block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramBinEntry::SumReducer); block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramSumReducer);
this->FindBestSplitsFromHistograms(this->is_feature_used_, true); this->FindBestSplitsFromHistograms(this->is_feature_used_, true);
} }
...@@ -186,7 +186,6 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const ...@@ -186,7 +186,6 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
this->train_data_->FixHistogram(feature_index, this->train_data_->FixHistogram(feature_index,
this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()),
this->smaller_leaf_histogram_array_[feature_index].RawData()); this->smaller_leaf_histogram_array_[feature_index].RawData());
SplitInfo smaller_split; SplitInfo smaller_split;
// find best threshold for smaller child // find best threshold for smaller child
......
...@@ -108,58 +108,70 @@ class DataPartition { ...@@ -108,58 +108,70 @@ class DataPartition {
* \param threshold threshold that want to split * \param threshold threshold that want to split
* \param right_leaf index of right leaf * \param right_leaf index of right leaf
*/ */
void Split(int leaf, const Dataset* dataset, int feature, const uint32_t* threshold, int num_threshold, bool default_left, int right_leaf) { void Split(int leaf, const Dataset* dataset, int feature,
const uint32_t* threshold, int num_threshold, bool default_left,
int right_leaf) {
Common::FunctionTimer fun_timer("DataPartition::Split", global_timer);
const data_size_t min_inner_size = 512; const data_size_t min_inner_size = 512;
// get leaf boundary // get leaf boundary
const data_size_t begin = leaf_begin_[leaf]; const data_size_t begin = leaf_begin_[leaf];
const data_size_t cnt = leaf_count_[leaf]; const data_size_t cnt = leaf_count_[leaf];
data_size_t inner_size = (cnt + num_threads_ - 1) / num_threads_; const int nblock =
if (inner_size < min_inner_size) { inner_size = min_inner_size; } std::min(num_threads_, (cnt + min_inner_size - 1) / min_inner_size);
data_size_t inner_size = SIZE_ALIGNED((cnt + nblock - 1) / nblock);
auto left_start = indices_.data() + begin;
global_timer.Start("DataPartition::Split.MT");
// split data multi-threading // split data multi-threading
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) { for (int i = 0; i < nblock; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
data_size_t cur_start = i * inner_size; data_size_t cur_start = i * inner_size;
if (cur_start > cnt) { continue; } data_size_t cur_cnt = std::min(inner_size, cnt - cur_start);
data_size_t cur_cnt = inner_size; if (cur_cnt <= 0) {
if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; } left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
continue;
}
// split data inner, reduce the times of function called // split data inner, reduce the times of function called
data_size_t cur_left_count = dataset->Split(feature, threshold, num_threshold, default_left, indices_.data() + begin + cur_start, cur_cnt, data_size_t cur_left_count =
temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start); dataset->Split(feature, threshold, num_threshold, default_left,
left_start + cur_start, cur_cnt,
temp_left_indices_.data() + cur_start,
temp_right_indices_.data() + cur_start);
offsets_buf_[i] = cur_start; offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count; left_cnts_buf_[i] = cur_left_count;
right_cnts_buf_[i] = cur_cnt - cur_left_count; right_cnts_buf_[i] = cur_cnt - cur_left_count;
OMP_LOOP_EX_END(); OMP_LOOP_EX_END();
} }
OMP_THROW_EX(); OMP_THROW_EX();
data_size_t left_cnt = 0; global_timer.Stop("DataPartition::Split.MT");
global_timer.Start("DataPartition::Split.Merge");
left_write_pos_buf_[0] = 0; left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0; right_write_pos_buf_[0] = 0;
for (int i = 1; i < num_threads_; ++i) { for (int i = 1; i < nblock; ++i) {
left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1]; left_write_pos_buf_[i] =
right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1]; left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] =
right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
} }
left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1]; data_size_t left_cnt =
// copy back indices of right leaf to indices_ left_write_pos_buf_[nblock - 1] + left_cnts_buf_[nblock - 1];
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) { auto right_start = left_start + left_cnt;
if (left_cnts_buf_[i] > 0) { #pragma omp parallel for schedule(static)
std::memcpy(indices_.data() + begin + left_write_pos_buf_[i], for (int i = 0; i < nblock; ++i) {
temp_left_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t)); std::copy_n(temp_left_indices_.data() + offsets_buf_[i],
} left_cnts_buf_[i], left_start + left_write_pos_buf_[i]);
if (right_cnts_buf_[i] > 0) { std::copy_n(temp_right_indices_.data() + offsets_buf_[i],
std::memcpy(indices_.data() + begin + left_cnt + right_write_pos_buf_[i], right_cnts_buf_[i], right_start + right_write_pos_buf_[i]);
temp_right_indices_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
}
} }
// update leaf boundary // update leaf boundary
leaf_count_[leaf] = left_cnt; leaf_count_[leaf] = left_cnt;
leaf_begin_[right_leaf] = left_cnt + begin; leaf_begin_[right_leaf] = left_cnt + begin;
leaf_count_[right_leaf] = cnt - left_cnt; leaf_count_[right_leaf] = cnt - left_cnt;
global_timer.Stop("DataPartition::Split.Merge");
} }
/*! /*!
...@@ -201,11 +213,11 @@ class DataPartition { ...@@ -201,11 +213,11 @@ class DataPartition {
/*! \brief number of data on one leaf */ /*! \brief number of data on one leaf */
std::vector<data_size_t> leaf_count_; std::vector<data_size_t> leaf_count_;
/*! \brief Store all data's indices, order by leaf[data_in_leaf0,..,data_leaf1,..] */ /*! \brief Store all data's indices, order by leaf[data_in_leaf0,..,data_leaf1,..] */
std::vector<data_size_t> indices_; std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> indices_;
/*! \brief team indices buffer for split */ /*! \brief team indices buffer for split */
std::vector<data_size_t> temp_left_indices_; std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> temp_left_indices_;
/*! \brief team indices buffer for split */ /*! \brief team indices buffer for split */
std::vector<data_size_t> temp_right_indices_; std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> temp_right_indices_;
/*! \brief used data indices, used for bagging */ /*! \brief used data indices, used for bagging */
const data_size_t* used_data_indices_; const data_size_t* used_data_indices_;
/*! \brief used data count, used for bagging */ /*! \brief used data count, used for bagging */
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ #ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_ #define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/dataset.h> #include <LightGBM/dataset.h>
#include <LightGBM/utils/array_args.h> #include <LightGBM/utils/array_args.h>
...@@ -20,7 +21,7 @@ ...@@ -20,7 +21,7 @@
namespace LightGBM { namespace LightGBM {
class FeatureMetainfo { class FeatureMetainfo {
public: public:
int num_bin; int num_bin;
MissingType missing_type; MissingType missing_type;
int8_t offset = 0; int8_t offset = 0;
...@@ -35,7 +36,7 @@ class FeatureMetainfo { ...@@ -35,7 +36,7 @@ class FeatureMetainfo {
* \brief FeatureHistogram is used to construct and store a histogram for a feature. * \brief FeatureHistogram is used to construct and store a histogram for a feature.
*/ */
class FeatureHistogram { class FeatureHistogram {
public: public:
FeatureHistogram() { FeatureHistogram() {
data_ = nullptr; data_ = nullptr;
} }
...@@ -53,19 +54,19 @@ class FeatureHistogram { ...@@ -53,19 +54,19 @@ class FeatureHistogram {
* \param feature the feature data for this histogram * \param feature the feature data for this histogram
* \param min_num_data_one_leaf minimal number of data in one leaf * \param min_num_data_one_leaf minimal number of data in one leaf
*/ */
void Init(HistogramBinEntry* data, const FeatureMetainfo* meta) { void Init(hist_t* data, const FeatureMetainfo* meta) {
meta_ = meta; meta_ = meta;
data_ = data; data_ = data;
if (meta_->bin_type == BinType::NumericalBin) { if (meta_->bin_type == BinType::NumericalBin) {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1 find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6);
} else { } else {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1 find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6); , std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6);
} }
} }
HistogramBinEntry* RawData() { hist_t* RawData() {
return data_; return data_;
} }
/*! /*!
...@@ -73,15 +74,13 @@ class FeatureHistogram { ...@@ -73,15 +74,13 @@ class FeatureHistogram {
* \param other The histogram that want to subtract * \param other The histogram that want to subtract
*/ */
void Subtract(const FeatureHistogram& other) { void Subtract(const FeatureHistogram& other) {
for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) {
data_[i].cnt -= other.data_[i].cnt; data_[i] -= other.data_[i];
data_[i].sum_gradients -= other.data_[i].sum_gradients;
data_[i].sum_hessians -= other.data_[i].sum_hessians;
} }
} }
void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
SplitInfo* output) { SplitInfo* output) {
output->default_left = true; output->default_left = true;
output->gain = kMinScore; output->gain = kMinScore;
find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output); find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output);
...@@ -89,10 +88,10 @@ class FeatureHistogram { ...@@ -89,10 +88,10 @@ class FeatureHistogram {
} }
void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
SplitInfo* output) { SplitInfo* output) {
is_splittable_ = false; is_splittable_ = false;
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step); meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step);
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
if (meta_->missing_type == MissingType::Zero) { if (meta_->missing_type == MissingType::Zero) {
...@@ -116,8 +115,8 @@ class FeatureHistogram { ...@@ -116,8 +115,8 @@ class FeatureHistogram {
} }
void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data, void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
double min_constraint, double max_constraint, double min_constraint, double max_constraint,
SplitInfo* output) { SplitInfo* output) {
output->default_left = false; output->default_left = false;
double best_gain = kMinScore; double best_gain = kMinScore;
data_size_t best_left_count = 0; data_size_t best_left_count = 0;
...@@ -134,25 +133,28 @@ class FeatureHistogram { ...@@ -134,25 +133,28 @@ class FeatureHistogram {
bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot; bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot;
int best_threshold = -1; int best_threshold = -1;
int best_dir = 1; int best_dir = 1;
const double cnt_factor = num_data / sum_hessian;
if (use_onehot) { if (use_onehot) {
for (int t = 0; t < used_bin; ++t) { for (int t = 0; t < used_bin; ++t) {
const auto grad = GET_GRAD(data_, t);
const auto hess = GET_HESS(data_, t);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
// if data not enough, or sum hessian too small // if data not enough, or sum hessian too small
if (data_[t].cnt < meta_->config->min_data_in_leaf if (cnt < meta_->config->min_data_in_leaf
|| data_[t].sum_hessians < meta_->config->min_sum_hessian_in_leaf) continue; || hess < meta_->config->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data - data_[t].cnt; data_size_t other_count = num_data - cnt;
// if data not enough // if data not enough
if (other_count < meta_->config->min_data_in_leaf) continue; if (other_count < meta_->config->min_data_in_leaf) continue;
double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon; double sum_other_hessian = sum_hessian - hess - kEpsilon;
// if sum hessian too small // if sum hessian too small
if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) continue; if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
double sum_other_gradient = sum_gradient - data_[t].sum_gradients; double sum_other_gradient = sum_gradient - grad;
// current split gain // current split gain
double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon, double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon,
meta_->config->lambda_l1, l2, meta_->config->max_delta_step, meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
min_constraint, max_constraint, 0); min_constraint, max_constraint, 0);
// gain with split is worse than without split // gain with split is worse than without split
if (current_gain <= min_gain_shift) continue; if (current_gain <= min_gain_shift) continue;
...@@ -161,15 +163,15 @@ class FeatureHistogram { ...@@ -161,15 +163,15 @@ class FeatureHistogram {
// better split point // better split point
if (current_gain > best_gain) { if (current_gain > best_gain) {
best_threshold = t; best_threshold = t;
best_sum_left_gradient = data_[t].sum_gradients; best_sum_left_gradient = grad;
best_sum_left_hessian = data_[t].sum_hessians + kEpsilon; best_sum_left_hessian = hess + kEpsilon;
best_left_count = data_[t].cnt; best_left_count = cnt;
best_gain = current_gain; best_gain = current_gain;
} }
} }
} else { } else {
for (int i = 0; i < used_bin; ++i) { for (int i = 0; i < used_bin; ++i) {
if (data_[i].cnt >= meta_->config->cat_smooth) { if (Common::RoundInt(GET_HESS(data_, i) * cnt_factor) >= meta_->config->cat_smooth) {
sorted_idx.push_back(i); sorted_idx.push_back(i);
} }
} }
...@@ -181,9 +183,9 @@ class FeatureHistogram { ...@@ -181,9 +183,9 @@ class FeatureHistogram {
return (sum_grad) / (sum_hess + meta_->config->cat_smooth); return (sum_grad) / (sum_hess + meta_->config->cat_smooth);
}; };
std::sort(sorted_idx.begin(), sorted_idx.end(), std::sort(sorted_idx.begin(), sorted_idx.end(),
[this, &ctr_fun](int i, int j) { [this, &ctr_fun](int i, int j) {
return ctr_fun(data_[i].sum_gradients, data_[i].sum_hessians) < ctr_fun(data_[j].sum_gradients, data_[j].sum_hessians); return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) < ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j));
}); });
std::vector<int> find_direction(1, 1); std::vector<int> find_direction(1, 1);
std::vector<int> start_position(1, 0); std::vector<int> start_position(1, 0);
...@@ -203,14 +205,17 @@ class FeatureHistogram { ...@@ -203,14 +205,17 @@ class FeatureHistogram {
for (int i = 0; i < used_bin && i < max_num_cat; ++i) { for (int i = 0; i < used_bin && i < max_num_cat; ++i) {
auto t = sorted_idx[start_pos]; auto t = sorted_idx[start_pos];
start_pos += dir; start_pos += dir;
const auto grad = GET_GRAD(data_, t);
const auto hess = GET_HESS(data_, t);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
sum_left_gradient += data_[t].sum_gradients; sum_left_gradient += grad;
sum_left_hessian += data_[t].sum_hessians; sum_left_hessian += hess;
left_count += data_[t].cnt; left_count += cnt;
cnt_cur_group += data_[t].cnt; cnt_cur_group += cnt;
if (left_count < meta_->config->min_data_in_leaf if (left_count < meta_->config->min_data_in_leaf
|| sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue; || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
data_size_t right_count = num_data - left_count; data_size_t right_count = num_data - left_count;
if (right_count < meta_->config->min_data_in_leaf || right_count < min_data_per_group) break; if (right_count < meta_->config->min_data_in_leaf || right_count < min_data_per_group) break;
...@@ -223,8 +228,8 @@ class FeatureHistogram { ...@@ -223,8 +228,8 @@ class FeatureHistogram {
double sum_right_gradient = sum_gradient - sum_left_gradient; double sum_right_gradient = sum_gradient - sum_left_gradient;
double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, l2, meta_->config->max_delta_step, meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
min_constraint, max_constraint, 0); min_constraint, max_constraint, 0);
if (current_gain <= min_gain_shift) continue; if (current_gain <= min_gain_shift) continue;
is_splittable_ = true; is_splittable_ = true;
if (current_gain > best_gain) { if (current_gain > best_gain) {
...@@ -241,15 +246,15 @@ class FeatureHistogram { ...@@ -241,15 +246,15 @@ class FeatureHistogram {
if (is_splittable_) { if (is_splittable_) {
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
meta_->config->lambda_l1, l2, meta_->config->max_delta_step, meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
min_constraint, max_constraint); min_constraint, max_constraint);
output->left_count = best_left_count; output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient; output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon; output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
sum_hessian - best_sum_left_hessian, sum_hessian - best_sum_left_hessian,
meta_->config->lambda_l1, l2, meta_->config->max_delta_step, meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
min_constraint, max_constraint); min_constraint, max_constraint);
output->right_count = num_data - best_left_count; output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
...@@ -279,22 +284,22 @@ class FeatureHistogram { ...@@ -279,22 +284,22 @@ class FeatureHistogram {
} }
void GatherInfoForThreshold(double sum_gradient, double sum_hessian, void GatherInfoForThreshold(double sum_gradient, double sum_hessian,
uint32_t threshold, data_size_t num_data, SplitInfo *output) { uint32_t threshold, data_size_t num_data, SplitInfo* output) {
if (meta_->bin_type == BinType::NumericalBin) { if (meta_->bin_type == BinType::NumericalBin) {
GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold, GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold,
num_data, output); num_data, output);
} else { } else {
GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold, GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold,
num_data, output); num_data, output);
} }
} }
void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian, void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian,
uint32_t threshold, data_size_t num_data, uint32_t threshold, data_size_t num_data,
SplitInfo *output) { SplitInfo* output) {
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step); meta_->config->max_delta_step);
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
// do stuff here // do stuff here
...@@ -315,27 +320,29 @@ class FeatureHistogram { ...@@ -315,27 +320,29 @@ class FeatureHistogram {
int t = meta_->num_bin - 1 - offset - use_na_as_missing; int t = meta_->num_bin - 1 - offset - use_na_as_missing;
const int t_end = 1 - offset; const int t_end = 1 - offset;
const double cnt_factor = num_data / sum_hessian;
// from right to left, and we don't need data in bin0 // from right to left, and we don't need data in bin0
for (; t >= t_end; --t) { for (; t >= t_end; --t) {
if (static_cast<uint32_t>(t + offset) < threshold) { break; } if (static_cast<uint32_t>(t + offset) < threshold) { break; }
// need to skip default bin // need to skip default bin
if (skip_default_bin && (t + offset) == static_cast<int>(meta_->default_bin)) { continue; } if (skip_default_bin && (t + offset) == static_cast<int>(meta_->default_bin)) { continue; }
const auto grad = GET_GRAD(data_, t);
sum_right_gradient += data_[t].sum_gradients; const auto hess = GET_HESS(data_, t);
sum_right_hessian += data_[t].sum_hessians; data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
right_count += data_[t].cnt; sum_right_gradient += grad;
sum_right_hessian += hess;
right_count += cnt;
} }
double sum_left_gradient = sum_gradient - sum_right_gradient; double sum_left_gradient = sum_gradient - sum_right_gradient;
double sum_left_hessian = sum_hessian - sum_right_hessian; double sum_left_hessian = sum_hessian - sum_right_hessian;
data_size_t left_count = num_data - right_count; data_size_t left_count = num_data - right_count;
double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step) meta_->config->max_delta_step)
+ GetLeafSplitGain(sum_right_gradient, sum_right_hessian, + GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step); meta_->config->max_delta_step);
// gain with split is worse than without split // gain with split is worse than without split
if (std::isnan(current_gain) || current_gain <= min_gain_shift) { if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
...@@ -347,15 +354,15 @@ class FeatureHistogram { ...@@ -347,15 +354,15 @@ class FeatureHistogram {
// update split information // update split information
output->threshold = threshold; output->threshold = threshold;
output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian, output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step); meta_->config->max_delta_step);
output->left_count = left_count; output->left_count = left_count;
output->left_sum_gradient = sum_left_gradient; output->left_sum_gradient = sum_left_gradient;
output->left_sum_hessian = sum_left_hessian - kEpsilon; output->left_sum_hessian = sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - sum_left_gradient, output->right_output = CalculateSplittedLeafOutput(sum_gradient - sum_left_gradient,
sum_hessian - sum_left_hessian, sum_hessian - sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step); meta_->config->max_delta_step);
output->right_count = num_data - left_count; output->right_count = num_data - left_count;
output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_gradient = sum_gradient - sum_left_gradient;
output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon; output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon;
...@@ -365,13 +372,13 @@ class FeatureHistogram { ...@@ -365,13 +372,13 @@ class FeatureHistogram {
} }
void GatherInfoForThresholdCategorical(double sum_gradient, double sum_hessian, void GatherInfoForThresholdCategorical(double sum_gradient, double sum_hessian,
uint32_t threshold, data_size_t num_data, SplitInfo *output) { uint32_t threshold, data_size_t num_data, SplitInfo* output) {
// get SplitInfo for a given one-hot categorical split. // get SplitInfo for a given one-hot categorical split.
output->default_left = false; output->default_left = false;
double gain_shift = GetLeafSplitGain( double gain_shift = GetLeafSplitGain(
sum_gradient, sum_hessian, sum_gradient, sum_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step); meta_->config->max_delta_step);
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
bool is_full_categorical = meta_->missing_type == MissingType::None; bool is_full_categorical = meta_->missing_type == MissingType::None;
int used_bin = meta_->num_bin - 1 + is_full_categorical; int used_bin = meta_->num_bin - 1 + is_full_categorical;
...@@ -380,21 +387,25 @@ class FeatureHistogram { ...@@ -380,21 +387,25 @@ class FeatureHistogram {
Log::Warning("Invalid categorical threshold split"); Log::Warning("Invalid categorical threshold split");
return; return;
} }
const double cnt_factor = num_data / sum_hessian;
const auto grad = GET_GRAD(data_, threshold);
const auto hess = GET_HESS(data_, threshold);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
double l2 = meta_->config->lambda_l2; double l2 = meta_->config->lambda_l2;
data_size_t left_count = data_[threshold].cnt; data_size_t left_count = cnt;
data_size_t right_count = num_data - left_count; data_size_t right_count = num_data - left_count;
double sum_left_hessian = data_[threshold].sum_hessians + kEpsilon; double sum_left_hessian = hess + kEpsilon;
double sum_right_hessian = sum_hessian - sum_left_hessian; double sum_right_hessian = sum_hessian - sum_left_hessian;
double sum_left_gradient = data_[threshold].sum_gradients; double sum_left_gradient = grad;
double sum_right_gradient = sum_gradient - sum_left_gradient; double sum_right_gradient = sum_gradient - sum_left_gradient;
// current split gain // current split gain
double current_gain = GetLeafSplitGain(sum_right_gradient, sum_right_hessian, double current_gain = GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, l2, meta_->config->lambda_l1, l2,
meta_->config->max_delta_step) meta_->config->max_delta_step)
+ GetLeafSplitGain(sum_left_gradient, sum_left_hessian, + GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
meta_->config->lambda_l1, l2, meta_->config->lambda_l1, l2,
meta_->config->max_delta_step); meta_->config->max_delta_step);
if (std::isnan(current_gain) || current_gain <= min_gain_shift) { if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
output->gain = kMinScore; output->gain = kMinScore;
Log::Warning("'Forced Split' will be ignored since the gain getting worse. "); Log::Warning("'Forced Split' will be ignored since the gain getting worse. ");
...@@ -402,14 +413,14 @@ class FeatureHistogram { ...@@ -402,14 +413,14 @@ class FeatureHistogram {
} }
output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian, output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian,
meta_->config->lambda_l1, l2, meta_->config->lambda_l1, l2,
meta_->config->max_delta_step); meta_->config->max_delta_step);
output->left_count = left_count; output->left_count = left_count;
output->left_sum_gradient = sum_left_gradient; output->left_sum_gradient = sum_left_gradient;
output->left_sum_hessian = sum_left_hessian - kEpsilon; output->left_sum_hessian = sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_right_gradient, sum_right_hessian, output->right_output = CalculateSplittedLeafOutput(sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, l2, meta_->config->lambda_l1, l2,
meta_->config->max_delta_step); meta_->config->max_delta_step);
output->right_count = right_count; output->right_count = right_count;
output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_gradient = sum_gradient - sum_left_gradient;
output->right_sum_hessian = sum_right_hessian - kEpsilon; output->right_sum_hessian = sum_right_hessian - kEpsilon;
...@@ -423,14 +434,14 @@ class FeatureHistogram { ...@@ -423,14 +434,14 @@ class FeatureHistogram {
* \brief Binary size of this histogram * \brief Binary size of this histogram
*/ */
int SizeOfHistgram() const { int SizeOfHistgram() const {
return (meta_->num_bin - meta_->offset) * sizeof(HistogramBinEntry); return (meta_->num_bin - meta_->offset) * KHistEntrySize;
} }
/*! /*!
* \brief Restore histogram from memory * \brief Restore histogram from memory
*/ */
void FromMemory(char* memory_data) { void FromMemory(char* memory_data) {
std::memcpy(data_, memory_data, (meta_->num_bin - meta_->offset) * sizeof(HistogramBinEntry)); std::memcpy(data_, memory_data, (meta_->num_bin - meta_->offset) * KHistEntrySize);
} }
/*! /*!
...@@ -457,11 +468,11 @@ class FeatureHistogram { ...@@ -457,11 +468,11 @@ class FeatureHistogram {
} }
} }
private: private:
static double GetSplitGains(double sum_left_gradients, double sum_left_hessians, static double GetSplitGains(double sum_left_gradients, double sum_left_hessians,
double sum_right_gradients, double sum_right_hessians, double sum_right_gradients, double sum_right_hessians,
double l1, double l2, double max_delta_step, double l1, double l2, double max_delta_step,
double min_constraint, double max_constraint, int8_t monotone_constraint) { double min_constraint, double max_constraint, int8_t monotone_constraint) {
double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint, max_constraint); double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint, max_constraint);
double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint, max_constraint); double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint, max_constraint);
if (((monotone_constraint > 0) && (left_output > right_output)) || if (((monotone_constraint > 0) && (left_output > right_output)) ||
...@@ -479,7 +490,7 @@ class FeatureHistogram { ...@@ -479,7 +490,7 @@ class FeatureHistogram {
* \return leaf output * \return leaf output
*/ */
static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step, static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step,
double min_constraint, double max_constraint) { double min_constraint, double max_constraint) {
double ret = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step); double ret = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step);
if (ret < min_constraint) { if (ret < min_constraint) {
ret = min_constraint; ret = min_constraint;
...@@ -506,7 +517,7 @@ class FeatureHistogram { ...@@ -506,7 +517,7 @@ class FeatureHistogram {
} }
void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint, void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) { double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) {
const int8_t offset = meta_->offset; const int8_t offset = meta_->offset;
double best_sum_left_gradient = NAN; double best_sum_left_gradient = NAN;
...@@ -514,7 +525,7 @@ class FeatureHistogram { ...@@ -514,7 +525,7 @@ class FeatureHistogram {
double best_gain = kMinScore; double best_gain = kMinScore;
data_size_t best_left_count = 0; data_size_t best_left_count = 0;
uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin); uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
const double cnt_factor = num_data / sum_hessian;
if (dir == -1) { if (dir == -1) {
double sum_right_gradient = 0.0f; double sum_right_gradient = 0.0f;
double sum_right_hessian = kEpsilon; double sum_right_hessian = kEpsilon;
...@@ -528,12 +539,15 @@ class FeatureHistogram { ...@@ -528,12 +539,15 @@ class FeatureHistogram {
// need to skip default bin // need to skip default bin
if (skip_default_bin && (t + offset) == static_cast<int>(meta_->default_bin)) { continue; } if (skip_default_bin && (t + offset) == static_cast<int>(meta_->default_bin)) { continue; }
sum_right_gradient += data_[t].sum_gradients; const auto grad = GET_GRAD(data_, t);
sum_right_hessian += data_[t].sum_hessians; const auto hess = GET_HESS(data_, t);
right_count += data_[t].cnt; data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
sum_right_gradient += grad;
sum_right_hessian += hess;
right_count += cnt;
// if data not enough, or sum hessian too small // if data not enough, or sum hessian too small
if (right_count < meta_->config->min_data_in_leaf if (right_count < meta_->config->min_data_in_leaf
|| sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) continue; || sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
data_size_t left_count = num_data - right_count; data_size_t left_count = num_data - right_count;
// if data not enough // if data not enough
if (left_count < meta_->config->min_data_in_leaf) break; if (left_count < meta_->config->min_data_in_leaf) break;
...@@ -545,8 +559,8 @@ class FeatureHistogram { ...@@ -545,8 +559,8 @@ class FeatureHistogram {
double sum_left_gradient = sum_gradient - sum_right_gradient; double sum_left_gradient = sum_gradient - sum_right_gradient;
// current split gain // current split gain
double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
min_constraint, max_constraint, meta_->monotone_type); min_constraint, max_constraint, meta_->monotone_type);
// gain with split is worse than without split // gain with split is worse than without split
if (current_gain <= min_gain_shift) continue; if (current_gain <= min_gain_shift) continue;
...@@ -575,9 +589,12 @@ class FeatureHistogram { ...@@ -575,9 +589,12 @@ class FeatureHistogram {
sum_left_hessian = sum_hessian - kEpsilon; sum_left_hessian = sum_hessian - kEpsilon;
left_count = num_data; left_count = num_data;
for (int i = 0; i < meta_->num_bin - offset; ++i) { for (int i = 0; i < meta_->num_bin - offset; ++i) {
sum_left_gradient -= data_[i].sum_gradients; const auto grad = GET_GRAD(data_, i);
sum_left_hessian -= data_[i].sum_hessians; const auto hess = GET_HESS(data_, i);
left_count -= data_[i].cnt; data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
sum_left_gradient -= grad;
sum_left_hessian -= hess;
left_count -= cnt;
} }
t = -1; t = -1;
} }
...@@ -586,13 +603,13 @@ class FeatureHistogram { ...@@ -586,13 +603,13 @@ class FeatureHistogram {
// need to skip default bin // need to skip default bin
if (skip_default_bin && (t + offset) == static_cast<int>(meta_->default_bin)) { continue; } if (skip_default_bin && (t + offset) == static_cast<int>(meta_->default_bin)) { continue; }
if (t >= 0) { if (t >= 0) {
sum_left_gradient += data_[t].sum_gradients; sum_left_gradient += GET_GRAD(data_, t);
sum_left_hessian += data_[t].sum_hessians; sum_left_hessian += GET_HESS(data_, t);
left_count += data_[t].cnt; left_count += static_cast<data_size_t>(Common::RoundInt(GET_HESS(data_, t) * cnt_factor));
} }
// if data not enough, or sum hessian too small // if data not enough, or sum hessian too small
if (left_count < meta_->config->min_data_in_leaf if (left_count < meta_->config->min_data_in_leaf
|| sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue; || sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
data_size_t right_count = num_data - left_count; data_size_t right_count = num_data - left_count;
// if data not enough // if data not enough
if (right_count < meta_->config->min_data_in_leaf) break; if (right_count < meta_->config->min_data_in_leaf) break;
...@@ -604,8 +621,8 @@ class FeatureHistogram { ...@@ -604,8 +621,8 @@ class FeatureHistogram {
double sum_right_gradient = sum_gradient - sum_left_gradient; double sum_right_gradient = sum_gradient - sum_left_gradient;
// current split gain // current split gain
double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian, double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
min_constraint, max_constraint, meta_->monotone_type); min_constraint, max_constraint, meta_->monotone_type);
// gain with split is worse than without split // gain with split is worse than without split
if (current_gain <= min_gain_shift) continue; if (current_gain <= min_gain_shift) continue;
...@@ -626,15 +643,15 @@ class FeatureHistogram { ...@@ -626,15 +643,15 @@ class FeatureHistogram {
// update split information // update split information
output->threshold = best_threshold; output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
min_constraint, max_constraint); min_constraint, max_constraint);
output->left_count = best_left_count; output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient; output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon; output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
sum_hessian - best_sum_left_hessian, sum_hessian - best_sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
min_constraint, max_constraint); min_constraint, max_constraint);
output->right_count = num_data - best_left_count; output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
...@@ -645,14 +662,13 @@ class FeatureHistogram { ...@@ -645,14 +662,13 @@ class FeatureHistogram {
const FeatureMetainfo* meta_; const FeatureMetainfo* meta_;
/*! \brief sum of gradient of each bin */ /*! \brief sum of gradient of each bin */
HistogramBinEntry* data_; hist_t* data_;
// std::vector<HistogramBinEntry> data_;
bool is_splittable_ = true; bool is_splittable_ = true;
std::function<void(double, double, data_size_t, double, double, SplitInfo*)> find_best_threshold_fun_; std::function<void(double, double, data_size_t, double, double, SplitInfo*)> find_best_threshold_fun_;
}; };
class HistogramPool { class HistogramPool {
public: public:
/*! /*!
* \brief Constructor * \brief Constructor
*/ */
...@@ -698,7 +714,7 @@ class HistogramPool { ...@@ -698,7 +714,7 @@ class HistogramPool {
} }
} }
void DynamicChangeSize(const Dataset* train_data, const Config* config, int cache_size, int total_size) { void DynamicChangeSize(const Dataset* train_data, bool is_hist_colwise, const Config* config, int cache_size, int total_size) {
if (feature_metas_.empty()) { if (feature_metas_.empty()) {
uint64_t bin_cnt_over_features = 0; uint64_t bin_cnt_over_features = 0;
int num_feature = train_data->num_features(); int num_feature = train_data->num_features();
...@@ -720,7 +736,6 @@ class HistogramPool { ...@@ -720,7 +736,6 @@ class HistogramPool {
} }
Log::Info("Total Bins %d", bin_cnt_over_features); Log::Info("Total Bins %d", bin_cnt_over_features);
} }
uint64_t num_total_bin = train_data->NumTotalBin();
int old_cache_size = static_cast<int>(pool_.size()); int old_cache_size = static_cast<int>(pool_.size());
Reset(cache_size, total_size); Reset(cache_size, total_size);
...@@ -728,24 +743,39 @@ class HistogramPool { ...@@ -728,24 +743,39 @@ class HistogramPool {
pool_.resize(cache_size); pool_.resize(cache_size);
data_.resize(cache_size); data_.resize(cache_size);
} }
int num_total_bin = static_cast<int>(train_data->NumTotalBin());
std::vector<int> offsets;
if (is_hist_colwise) {
int offset = 0;
for (int j = 0; j < train_data->num_features(); ++j) {
offset += train_data->SubFeatureBinOffset(j);
offsets.push_back(offset);
auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
num_bin -= 1;
}
offset += num_bin;
}
} else {
num_total_bin = 1;
for (int j = 0; j < train_data->num_features(); ++j) {
offsets.push_back(num_total_bin);
num_total_bin += train_data->FeatureBinMapper(j)->num_bin();
if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
num_total_bin -= 1;
}
}
}
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int i = old_cache_size; i < cache_size; ++i) { for (int i = old_cache_size; i < cache_size; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
pool_[i].reset(new FeatureHistogram[train_data->num_features()]); pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
data_[i].resize(num_total_bin); data_[i].resize(num_total_bin * 2);
uint64_t offset = 0;
for (int j = 0; j < train_data->num_features(); ++j) { for (int j = 0; j < train_data->num_features(); ++j) {
offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j)); pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]);
pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j]);
auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
num_bin -= 1;
}
offset += static_cast<uint64_t>(num_bin);
} }
CHECK(offset == num_total_bin);
OMP_LOOP_EX_END(); OMP_LOOP_EX_END();
} }
OMP_THROW_EX(); OMP_THROW_EX();
...@@ -816,9 +846,9 @@ class HistogramPool { ...@@ -816,9 +846,9 @@ class HistogramPool {
inverse_mapper_[slot] = dst_idx; inverse_mapper_[slot] = dst_idx;
} }
private: private:
std::vector<std::unique_ptr<FeatureHistogram[]>> pool_; std::vector<std::unique_ptr<FeatureHistogram[]>> pool_;
std::vector<std::vector<HistogramBinEntry>> data_; std::vector<std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>> data_;
std::vector<FeatureMetainfo> feature_metas_; std::vector<FeatureMetainfo> feature_metas_;
int cache_size_; int cache_size_;
int total_size_; int total_size_;
......
...@@ -49,15 +49,15 @@ void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { ...@@ -49,15 +49,15 @@ void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) {
// some functions used for debugging the GPU histogram construction // some functions used for debugging the GPU histogram construction
#if GPU_DEBUG > 0 #if GPU_DEBUG > 0
void PrintHistograms(HistogramBinEntry* h, size_t size) { void PrintHistograms(hist_t* h, size_t size) {
size_t total = 0; double total_hess = 0;
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt); printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i));
total += h[i].cnt; if ((i & 2) == 2)
if ((i & 3) == 3)
printf("\n"); printf("\n");
total_hess += GET_HESS(h, i);
} }
printf("\nTotal examples: %lu\n", total); printf("\nSum hessians: %9.3g\n", total_hess);
} }
union Float_t { union Float_t {
...@@ -69,27 +69,23 @@ union Float_t { ...@@ -69,27 +69,23 @@ union Float_t {
}; };
void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) { void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) {
size_t i; size_t i;
Float_t a, b; Float_t a, b;
for (i = 0; i < size; ++i) { for (i = 0; i < size; ++i) {
a.f = h1[i].sum_gradients; a.f = GET_GRAD(h1, i);
b.f = h2[i].sum_gradients; b.f = GET_GRAD(h2, i);
int32_t ulps = Float_t::ulp_diff(a, b); int32_t ulps = Float_t::ulp_diff(a, b);
if (fabs(h1[i].cnt - h2[i].cnt != 0)) {
printf("%d != %d\n", h1[i].cnt, h2[i].cnt);
goto err;
}
if (ulps > 0) { if (ulps > 0) {
// printf("grad %g != %g (%d ULPs)\n", h1[i].sum_gradients, h2[i].sum_gradients, ulps); // printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps);
// goto err; // goto err;
} }
a.f = h1[i].sum_hessians; a.f = GET_HESS(h1, i);
b.f = h2[i].sum_hessians; b.f = GET_HESS(h2, i);
ulps = Float_t::ulp_diff(a, b); ulps = Float_t::ulp_diff(a, b);
if (ulps > 0) { if (std::fabs(a.f - b.f) >= 1e-20) {
// printf("hessian %g != %g (%d ULPs)\n", h1[i].sum_hessians, h2[i].sum_hessians, ulps); printf("hessian %g != %g (%d ULPs)\n", GET_HESS(h1, i), GET_HESS(h2, i), ulps);
// goto err; goto err;
} }
} }
return; return;
...@@ -191,7 +187,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur ...@@ -191,7 +187,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur
} }
template <typename HistType> template <typename HistType>
void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) {
HistType* hist_outputs = reinterpret_cast<HistType*>(host_histogram_outputs_); HistType* hist_outputs = reinterpret_cast<HistType*>(host_histogram_outputs_);
// when the output is ready, the computation is done // when the output is ready, the computation is done
histograms_wait_obj_.wait(); histograms_wait_obj_.wait();
...@@ -201,29 +197,25 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { ...@@ -201,29 +197,25 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
continue; continue;
} }
int dense_group_index = dense_feature_group_map_[i]; int dense_group_index = dense_feature_group_map_[i];
auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index); auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2;
int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); int bin_size = train_data_->FeatureGroupNumBin(dense_group_index);
if (device_bin_mults_[i] == 1) { if (device_bin_mults_[i] == 1) {
for (int j = 0; j < bin_size; ++j) { for (int j = 0; j < bin_size; ++j) {
old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients; GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ j);
old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians; GET_HESS(old_histogram_array, j) = GET_HESS(hist_outputs, i * device_bin_size_+ j);
old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt;
} }
} else { } else {
// values of this feature has been redistributed to multiple bins; need a reduction here // values of this feature has been redistributed to multiple bins; need a reduction here
int ind = 0; int ind = 0;
for (int j = 0; j < bin_size; ++j) { for (int j = 0; j < bin_size; ++j) {
double sum_g = 0.0, sum_h = 0.0; double sum_g = 0.0, sum_h = 0.0;
size_t cnt = 0;
for (int k = 0; k < device_bin_mults_[i]; ++k) { for (int k = 0; k < device_bin_mults_[i]; ++k) {
sum_g += hist_outputs[i * device_bin_size_+ ind].sum_gradients; sum_g += GET_GRAD(hist_outputs, i * device_bin_size_+ ind);
sum_h += hist_outputs[i * device_bin_size_+ ind].sum_hessians; sum_h += GET_HESS(hist_outputs, i * device_bin_size_+ ind);
cnt += hist_outputs[i * device_bin_size_ + ind].cnt;
ind++; ind++;
} }
old_histogram_array[j].sum_gradients = sum_g; GET_GRAD(old_histogram_array, j) = sum_g;
old_histogram_array[j].sum_hessians = sum_h; GET_HESS(old_histogram_array, j) = sum_h;
old_histogram_array[j].cnt = (data_size_t)cnt;
} }
} }
} }
...@@ -233,7 +225,7 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) { ...@@ -233,7 +225,7 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
void GPUTreeLearner::AllocateGPUMemory() { void GPUTreeLearner::AllocateGPUMemory() {
num_dense_feature_groups_ = 0; num_dense_feature_groups_ = 0;
for (int i = 0; i < num_feature_groups_; ++i) { for (int i = 0; i < num_feature_groups_; ++i) {
if (ordered_bins_[i] == nullptr) { if (!train_data_->IsMultiGroup(i)) {
num_dense_feature_groups_++; num_dense_feature_groups_++;
} }
} }
...@@ -303,7 +295,7 @@ void GPUTreeLearner::AllocateGPUMemory() { ...@@ -303,7 +295,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
device_data_indices_ = std::unique_ptr<boost::compute::vector<data_size_t>>(new boost::compute::vector<data_size_t>(allocated_num_data_, ctx_)); device_data_indices_ = std::unique_ptr<boost::compute::vector<data_size_t>>(new boost::compute::vector<data_size_t>(allocated_num_data_, ctx_));
boost::compute::fill(device_data_indices_->begin(), device_data_indices_->end(), 0, queue_); boost::compute::fill(device_data_indices_->begin(), device_data_indices_->end(), 0, queue_);
// histogram bin entry size depends on the precision (single/double) // histogram bin entry size depends on the precision (single/double)
hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(HistogramBinEntry) : sizeof(GPUHistogramBinEntry); hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(hist_t) * 2 : sizeof(gpu_hist_t) * 2;
Log::Info("Size of histogram bin entry: %d", hist_bin_entry_sz_); Log::Info("Size of histogram bin entry: %d", hist_bin_entry_sz_);
// create output buffer, each feature has a histogram with device_bin_size_ bins, // create output buffer, each feature has a histogram with device_bin_size_ bins,
// each work group generates a sub-histogram of dword_features_ features. // each work group generates a sub-histogram of dword_features_ features.
...@@ -326,7 +318,7 @@ void GPUTreeLearner::AllocateGPUMemory() { ...@@ -326,7 +318,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
std::vector<int> dense_dword_ind(dword_features_); std::vector<int> dense_dword_ind(dword_features_);
for (int i = 0; i < num_feature_groups_; ++i) { for (int i = 0; i < num_feature_groups_; ++i) {
// looking for dword_features_ non-sparse feature-groups // looking for dword_features_ non-sparse feature-groups
if (ordered_bins_[i] == nullptr) { if (!train_data_->IsMultiGroup(i)) {
dense_dword_ind[k] = i; dense_dword_ind[k] = i;
// decide if we need to redistribute the bin // decide if we need to redistribute the bin
double t = device_bin_size_ / static_cast<double>(train_data_->FeatureGroupNumBin(i)); double t = device_bin_size_ / static_cast<double>(train_data_->FeatureGroupNumBin(i));
...@@ -682,6 +674,9 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) { ...@@ -682,6 +674,9 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
printf("bin size: "); printf("bin size: ");
#endif #endif
for (int i = 0; i < num_feature_groups_; ++i) { for (int i = 0; i < num_feature_groups_; ++i) {
if (train_data_->IsMultiGroup(i)) {
continue;
}
#if GPU_DEBUG >= 1 #if GPU_DEBUG >= 1
printf("%d, ", train_data_->FeatureGroupNumBin(i)); printf("%d, ", train_data_->FeatureGroupNumBin(i));
#endif #endif
...@@ -960,35 +955,34 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u ...@@ -960,35 +955,34 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
for (int feature_index = 0; feature_index < num_features_; ++feature_index) { for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used_[feature_index]) continue; if (!is_feature_used_[feature_index]) continue;
if (!is_feature_used[feature_index]) continue; if (!is_feature_used[feature_index]) continue;
if (ordered_bins_[train_data_->Feature2Group(feature_index)]) { if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) {
is_sparse_feature_used[feature_index] = 1; is_sparse_feature_used[feature_index] = 1;
} else { } else {
is_dense_feature_used[feature_index] = 1; is_dense_feature_used[feature_index] = 1;
} }
} }
// construct smaller leaf // construct smaller leaf
HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset;
// ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU
bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
nullptr, smaller_leaf_splits_->num_data_in_leaf(), nullptr, smaller_leaf_splits_->num_data_in_leaf(),
nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr); nullptr, nullptr);
// then construct sparse features on CPU // then construct sparse features on CPU
// We set data_indices to null to avoid rebuilding ordered gradients/hessians
train_data_->ConstructHistograms(is_sparse_feature_used, train_data_->ConstructHistograms(is_sparse_feature_used,
nullptr, smaller_leaf_splits_->num_data_in_leaf(), smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(), gradients_, hessians_,
&ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
multi_val_bin_.get(), is_hist_colwise_,
ptr_smaller_leaf_hist_data); ptr_smaller_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used // wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) { if (is_gpu_used) {
if (config_->gpu_use_dp) { if (config_->gpu_use_dp) {
// use double precision // use double precision
WaitAndGetHistograms<HistogramBinEntry>(ptr_smaller_leaf_hist_data); WaitAndGetHistograms<hist_t>(ptr_smaller_leaf_hist_data);
} else { } else {
// use single precision // use single precision
WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_smaller_leaf_hist_data); WaitAndGetHistograms<gpu_hist_t>(ptr_smaller_leaf_hist_data);
} }
} }
...@@ -1000,48 +994,58 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u ...@@ -1000,48 +994,58 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
continue; continue;
int dense_feature_group_index = dense_feature_group_map_[i]; int dense_feature_group_index = dense_feature_group_map_[i];
size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index);
HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1; hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset;
HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index); hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2;
HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size]; hist_t* gpu_histogram = new hist_t[size * 2];
data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf();
printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size); printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size);
std::copy(current_histogram, current_histogram + size, gpu_histogram); std::copy(current_histogram, current_histogram + size * 2, gpu_histogram);
std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry)); std::memset(current_histogram, 0, size * sizeof(hist_t) * 2);
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( if(train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr){continue;}
num_data != num_data_ ? smaller_leaf_splits_->data_indices() : nullptr, if (num_data != num_data_ ) {
num_data, train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
num_data != num_data_ ? ordered_gradients_.data() : gradients_, smaller_leaf_splits_->data_indices(),
num_data != num_data_ ? ordered_hessians_.data() : hessians_, 0,
current_histogram); num_data,
ordered_gradients_.data(),
ordered_hessians_.data(),
current_histogram);
} else {
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
0,
num_data,
gradients_,
hessians_,
current_histogram);
}
CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index); CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index);
std::copy(gpu_histogram, gpu_histogram + size, current_histogram); std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram);
delete [] gpu_histogram; delete [] gpu_histogram;
} }
#endif #endif
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf // construct larger leaf
HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1; hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - KHistOffset;
is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data()); ordered_gradients_.data(), ordered_hessians_.data());
// then construct sparse features on CPU // then construct sparse features on CPU
// We set data_indices to null to avoid rebuilding ordered gradients/hessians
train_data_->ConstructHistograms(is_sparse_feature_used, train_data_->ConstructHistograms(is_sparse_feature_used,
nullptr, larger_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->LeafIndex(), gradients_, hessians_,
&ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_, ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
multi_val_bin_.get(), is_hist_colwise_,
ptr_larger_leaf_hist_data); ptr_larger_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used // wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) { if (is_gpu_used) {
if (config_->gpu_use_dp) { if (config_->gpu_use_dp) {
// use double precision // use double precision
WaitAndGetHistograms<HistogramBinEntry>(ptr_larger_leaf_hist_data); WaitAndGetHistograms<hist_t>(ptr_larger_leaf_hist_data);
} else { } else {
// use single precision // use single precision
WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_larger_leaf_hist_data); WaitAndGetHistograms<gpu_hist_t>(ptr_larger_leaf_hist_data);
} }
} }
} }
......
...@@ -76,12 +76,7 @@ class GPUTreeLearner: public SerialTreeLearner { ...@@ -76,12 +76,7 @@ class GPUTreeLearner: public SerialTreeLearner {
uint8_t s[4]; uint8_t s[4];
}; };
/*! \brief Single precision histogram entiry for GPU */ typedef float gpu_hist_t;
struct GPUHistogramBinEntry {
score_t sum_gradients;
score_t sum_hessians;
uint32_t cnt;
};
/*! /*!
* \brief Find the best number of workgroups processing one feature for maximizing efficiency * \brief Find the best number of workgroups processing one feature for maximizing efficiency
...@@ -133,7 +128,7 @@ class GPUTreeLearner: public SerialTreeLearner { ...@@ -133,7 +128,7 @@ class GPUTreeLearner: public SerialTreeLearner {
* \param histograms Destination of histogram results from GPU. * \param histograms Destination of histogram results from GPU.
*/ */
template <typename HistType> template <typename HistType>
void WaitAndGetHistograms(HistogramBinEntry* histograms); void WaitAndGetHistograms(hist_t* histograms);
/*! /*!
* \brief Construct GPU histogram asynchronously. * \brief Construct GPU histogram asynchronously.
......
...@@ -163,7 +163,7 @@ R""() ...@@ -163,7 +163,7 @@ R""()
void within_kernel_reduction16x8(uchar8 feature_mask, void within_kernel_reduction16x8(uchar8 feature_mask,
__global const acc_type* restrict feature4_sub_hist, __global const acc_type* restrict feature4_sub_hist,
const uint skip_id, const uint skip_id,
acc_type stat_val, uint cnt_val, acc_type stat_val,
const ushort num_sub_hist, const ushort num_sub_hist,
__global acc_type* restrict output_buf, __global acc_type* restrict output_buf,
__local acc_type * restrict local_hist) { __local acc_type * restrict local_hist) {
...@@ -181,33 +181,21 @@ void within_kernel_reduction16x8(uchar8 feature_mask, ...@@ -181,33 +181,21 @@ void within_kernel_reduction16x8(uchar8 feature_mask,
// 256 threads working on 8 features' 16 bins, gradient and hessian // 256 threads working on 8 features' 16 bins, gradient and hessian
stat_val += *p; stat_val += *p;
p += NUM_BINS * DWORD_FEATURES * 2; p += NUM_BINS * DWORD_FEATURES * 2;
if (ltid < LOCAL_SIZE_0 / 2) {
cnt_val += as_acc_int_type(*p);
}
p += NUM_BINS * DWORD_FEATURES;
} }
// skip the counters we already have // skip the counters we already have
p += 3 * DWORD_FEATURES * NUM_BINS; p += 2 * DWORD_FEATURES * NUM_BINS;
for (i = i + 1; i < num_sub_hist; ++i) { for (i = i + 1; i < num_sub_hist; ++i) {
stat_val += *p; stat_val += *p;
p += NUM_BINS * DWORD_FEATURES * 2; p += NUM_BINS * DWORD_FEATURES * 2;
if (ltid < LOCAL_SIZE_0 / 2) {
cnt_val += as_acc_int_type(*p);
}
p += NUM_BINS * DWORD_FEATURES;
} }
#endif #endif
// printf("thread %d:feature=%d, bin_id=%d, hessian=%d, stat_val=%f, cnt=%d", ltid, feature_id, bin_id, is_hessian_first, stat_val, cnt_val); // printf("thread %d:feature=%d, bin_id=%d, hessian=%d, stat_val=%f, cnt=%d", ltid, feature_id, bin_id, is_hessian_first, stat_val, cnt_val);
// now overwrite the local_hist for final reduction and output // now overwrite the local_hist for final reduction and output
// reverse the f7...f0 order to match the real order // reverse the f7...f0 order to match the real order
feature_id = DWORD_FEATURES_MASK - feature_id; feature_id = DWORD_FEATURES_MASK - feature_id;
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + is_hessian_first] = stat_val; local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + is_hessian_first] = stat_val;
bin_id = ltid >> (LOG2_DWORD_FEATURES); // range 0 - 16, for counter
if (ltid < LOCAL_SIZE_0 / 2) {
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val);
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (i = ltid; i < DWORD_FEATURES * 3 * NUM_BINS; i += lsize) { for (i = ltid; i < DWORD_FEATURES * 2 * NUM_BINS; i += lsize) {
output_buf[i] = local_hist[i]; output_buf[i] = local_hist[i];
} }
} }
...@@ -335,7 +323,9 @@ __kernel void histogram16(__global const uchar4* feature_data_base, ...@@ -335,7 +323,9 @@ __kernel void histogram16(__global const uchar4* feature_data_base,
bk7_c_f0_bin16 bk7_c_f1_bin16 bk7_c_f2_bin16 bk7_c_f3_bin16 bk7_c_f4_bin16 bk7_c_f5_bin16 bk7_c_f6_bin16 bk7_c_f7_bin0 bk7_c_f0_bin16 bk7_c_f1_bin16 bk7_c_f2_bin16 bk7_c_f3_bin16 bk7_c_f4_bin16 bk7_c_f5_bin16 bk7_c_f6_bin16 bk7_c_f7_bin0
----------------------------------------------- -----------------------------------------------
*/ */
#if CONST_HESSIAN == 1
__local uint * cnt_hist = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS); __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS);
#endif
// thread 0, 1, 2, 3, 4, 5, 6, 7 compute histograms for gradients first // thread 0, 1, 2, 3, 4, 5, 6, 7 compute histograms for gradients first
// thread 8, 9, 10, 11, 12, 13, 14, 15 compute histograms for hessians first // thread 8, 9, 10, 11, 12, 13, 14, 15 compute histograms for hessians first
...@@ -547,7 +537,7 @@ R""() ...@@ -547,7 +537,7 @@ R""()
atomic_local_add_f(gh_hist + addr2, stat2); atomic_local_add_f(gh_hist + addr2, stat2);
#endif #endif
} }
#if CONST_HESSIAN == 1
// STAGE 3: accumulate counter // STAGE 3: accumulate counter
// there are 8 counters for 8 features // there are 8 counters for 8 features
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6, 7's counts for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6, 7's counts for example 0, 1, 2, 3, 4, 5, 6, 7
...@@ -614,6 +604,7 @@ R""() ...@@ -614,6 +604,7 @@ R""()
// printf("thread %x add counter %d feature %d (7)\n", ltid, bin, offset); // printf("thread %x add counter %d feature %d (7)\n", ltid, bin, offset);
atom_inc(cnt_hist + addr); atom_inc(cnt_hist + addr);
} }
#endif
stat1 = stat1_next; stat1 = stat1_next;
stat2 = stat2_next; stat2 = stat2_next;
feature4 = feature4_next; feature4 = feature4_next;
...@@ -642,6 +633,7 @@ R""() ...@@ -642,6 +633,7 @@ R""()
ushort bank_id = (i + offset) & BANK_MASK; ushort bank_id = (i + offset) & BANK_MASK;
stat_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + feature_id]; stat_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + feature_id];
} }
#if CONST_HESSIAN == 1
if (ltid < LOCAL_SIZE_0 / 2) { if (ltid < LOCAL_SIZE_0 / 2) {
// first 128 threads accumulate the 8 * 16 = 128 counter values // first 128 threads accumulate the 8 * 16 = 128 counter values
bin_id = ltid >> LOG2_DWORD_FEATURES; // bits 3 - 6 range 0 - 16 is bin ID bin_id = ltid >> LOG2_DWORD_FEATURES; // bits 3 - 6 range 0 - 16 is bin ID
...@@ -651,6 +643,7 @@ R""() ...@@ -651,6 +643,7 @@ R""()
cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * DWORD_FEATURES + feature_id]; cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * DWORD_FEATURES + feature_id];
} }
} }
#endif
// now thread 0 - 7 holds feature 0 - 7's gradient for bin 0 and counter bin 0 // now thread 0 - 7 holds feature 0 - 7's gradient for bin 0 and counter bin 0
// now thread 8 - 15 holds feature 0 - 7's hessian for bin 0 and counter bin 1 // now thread 8 - 15 holds feature 0 - 7's hessian for bin 0 and counter bin 1
...@@ -687,7 +680,7 @@ R""() ...@@ -687,7 +680,7 @@ R""()
// write to output // write to output
// write gradients and hessians histogram for all 4 features // write gradients and hessians histogram for all 4 features
// output data in linear order for further reduction // output data in linear order for further reduction
// output size = 4 (features) * 3 (counters) * 64 (bins) * sizeof(float) // output size = 4 (features) * 2 (counters) * 64 (bins) * sizeof(float)
/* memory layout of output: /* memory layout of output:
g_f0_bin0 g_f1_bin0 g_f2_bin0 g_f3_bin0 g_f4_bin0 g_f5_bin0 g_f6_bin0 g_f7_bin0 g_f0_bin0 g_f1_bin0 g_f2_bin0 g_f3_bin0 g_f4_bin0 g_f5_bin0 g_f6_bin0 g_f7_bin0
h_f0_bin0 h_f1_bin0 h_f2_bin0 h_f3_bin0 h_f4_bin0 h_f5_bin0 h_f6_bin0 h_f7_bin0 h_f0_bin0 h_f1_bin0 h_f2_bin0 h_f3_bin0 h_f4_bin0 h_f5_bin0 h_f6_bin0 h_f7_bin0
...@@ -705,14 +698,10 @@ R""() ...@@ -705,14 +698,10 @@ R""()
// if there is only one workgroup processing this feature4, don't even need to write // if there is only one workgroup processing this feature4, don't even need to write
uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS); uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS);
#if POWER_FEATURE_WORKGROUPS != 0 #if POWER_FEATURE_WORKGROUPS != 0
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 3 * NUM_BINS; __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 2 * NUM_BINS;
// if g_val and h_val are double, they are converted to float here // if g_val and h_val are double, they are converted to float here
// write gradients and hessians for 8 features // write gradients and hessians for 8 features
output[0 * DWORD_FEATURES * NUM_BINS + ltid] = stat_val; output[0 * DWORD_FEATURES * NUM_BINS + ltid] = stat_val;
// write counts for 8 features
if (ltid < LOCAL_SIZE_0 / 2) {
output[2 * DWORD_FEATURES * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val);
}
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
mem_fence(CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE);
// To avoid the cost of an extra reducting kernel, we have to deal with some // To avoid the cost of an extra reducting kernel, we have to deal with some
...@@ -738,7 +727,7 @@ R""() ...@@ -738,7 +727,7 @@ R""()
// The is done by using an global atomic counter. // The is done by using an global atomic counter.
// On AMD GPUs ideally this should be done in GDS, // On AMD GPUs ideally this should be done in GDS,
// but currently there is no easy way to access it via OpenCL. // but currently there is no easy way to access it via OpenCL.
__local uint * counter_val = cnt_hist; __local uint * counter_val = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS);
if (ltid == 0) { if (ltid == 0) {
// all workgroups processing the same feature add this counter // all workgroups processing the same feature add this counter
*counter_val = atom_inc(sync_counters + feature4_id); *counter_val = atom_inc(sync_counters + feature4_id);
...@@ -762,12 +751,12 @@ R""() ...@@ -762,12 +751,12 @@ R""()
// locate our feature4's block in output memory // locate our feature4's block in output memory
uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS); uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS);
__global acc_type const * restrict feature4_subhists = __global acc_type const * restrict feature4_subhists =
(__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 3 * NUM_BINS; (__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 2 * NUM_BINS;
// skip reading the data already in local memory // skip reading the data already in local memory
uint skip_id = group_id ^ output_offset; uint skip_id = group_id ^ output_offset;
// locate output histogram location for this feature4 // locate output histogram location for this feature4
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 3 * NUM_BINS; __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 2 * NUM_BINS;
within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val, cnt_val, within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val,
1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array); 1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array);
} }
} }
...@@ -776,4 +765,3 @@ R""() ...@@ -776,4 +765,3 @@ R""()
// the +9 skips extra characters ")", newline, "#endif" and newline at the beginning // the +9 skips extra characters ")", newline, "#endif" and newline at the beginning
// )"" "\n#endif" + 9 // )"" "\n#endif" + 9
#endif #endif
...@@ -155,15 +155,6 @@ void within_kernel_reduction256x4(uchar4 feature_mask, ...@@ -155,15 +155,6 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
acc_type f1_hess_bin = local_hist[ltid * 8 + 5]; acc_type f1_hess_bin = local_hist[ltid * 8 + 5];
acc_type f2_hess_bin = local_hist[ltid * 8 + 6]; acc_type f2_hess_bin = local_hist[ltid * 8 + 6];
acc_type f3_hess_bin = local_hist[ltid * 8 + 7]; acc_type f3_hess_bin = local_hist[ltid * 8 + 7];
__local uint* restrict local_cnt = (__local uint *)(local_hist + 4 * 2 * NUM_BINS);
#if POWER_FEATURE_WORKGROUPS != 0
uint f0_cont_bin = ltid ? local_cnt[ltid * 4] : old_val_f0_cont_bin0;
#else
uint f0_cont_bin = local_cnt[ltid * 4];
#endif
uint f1_cont_bin = local_cnt[ltid * 4 + 1];
uint f2_cont_bin = local_cnt[ltid * 4 + 2];
uint f3_cont_bin = local_cnt[ltid * 4 + 3];
ushort i; ushort i;
// printf("%d-pre(skip %d): %f %f %f %f %f %f %f %f %d %d %d %d", ltid, skip_id, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin); // printf("%d-pre(skip %d): %f %f %f %f %f %f %f %f %d %d %d %d", ltid, skip_id, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin);
#if POWER_FEATURE_WORKGROUPS != 0 #if POWER_FEATURE_WORKGROUPS != 0
...@@ -173,70 +164,62 @@ void within_kernel_reduction256x4(uchar4 feature_mask, ...@@ -173,70 +164,62 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
if (feature_mask.s3) { if (feature_mask.s3) {
f0_grad_bin += *p; p += NUM_BINS; f0_grad_bin += *p; p += NUM_BINS;
f0_hess_bin += *p; p += NUM_BINS; f0_hess_bin += *p; p += NUM_BINS;
f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
} }
else { else {
p += 3 * NUM_BINS; p += 2 * NUM_BINS;
} }
if (feature_mask.s2) { if (feature_mask.s2) {
f1_grad_bin += *p; p += NUM_BINS; f1_grad_bin += *p; p += NUM_BINS;
f1_hess_bin += *p; p += NUM_BINS; f1_hess_bin += *p; p += NUM_BINS;
f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
} }
else { else {
p += 3 * NUM_BINS; p += 2 * NUM_BINS;
} }
if (feature_mask.s1) { if (feature_mask.s1) {
f2_grad_bin += *p; p += NUM_BINS; f2_grad_bin += *p; p += NUM_BINS;
f2_hess_bin += *p; p += NUM_BINS; f2_hess_bin += *p; p += NUM_BINS;
f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
} }
else { else {
p += 3 * NUM_BINS; p += 2 * NUM_BINS;
} }
if (feature_mask.s0) { if (feature_mask.s0) {
f3_grad_bin += *p; p += NUM_BINS; f3_grad_bin += *p; p += NUM_BINS;
f3_hess_bin += *p; p += NUM_BINS; f3_hess_bin += *p; p += NUM_BINS;
f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
} }
else { else {
p += 3 * NUM_BINS; p += 2 * NUM_BINS;
} }
} }
// skip the counters we already have // skip the counters we already have
p += 3 * 4 * NUM_BINS; p += 2 * 4 * NUM_BINS;
for (i = i + 1; i < num_sub_hist; ++i) { for (i = i + 1; i < num_sub_hist; ++i) {
if (feature_mask.s3) { if (feature_mask.s3) {
f0_grad_bin += *p; p += NUM_BINS; f0_grad_bin += *p; p += NUM_BINS;
f0_hess_bin += *p; p += NUM_BINS; f0_hess_bin += *p; p += NUM_BINS;
f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
} }
else { else {
p += 3 * NUM_BINS; p += 2 * NUM_BINS;
} }
if (feature_mask.s2) { if (feature_mask.s2) {
f1_grad_bin += *p; p += NUM_BINS; f1_grad_bin += *p; p += NUM_BINS;
f1_hess_bin += *p; p += NUM_BINS; f1_hess_bin += *p; p += NUM_BINS;
f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
} }
else { else {
p += 3 * NUM_BINS; p += 2 * NUM_BINS;
} }
if (feature_mask.s1) { if (feature_mask.s1) {
f2_grad_bin += *p; p += NUM_BINS; f2_grad_bin += *p; p += NUM_BINS;
f2_hess_bin += *p; p += NUM_BINS; f2_hess_bin += *p; p += NUM_BINS;
f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
} }
else { else {
p += 3 * NUM_BINS; p += 2 * NUM_BINS;
} }
if (feature_mask.s0) { if (feature_mask.s0) {
f3_grad_bin += *p; p += NUM_BINS; f3_grad_bin += *p; p += NUM_BINS;
f3_hess_bin += *p; p += NUM_BINS; f3_hess_bin += *p; p += NUM_BINS;
f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
} }
else { else {
p += 3 * NUM_BINS; p += 2 * NUM_BINS;
} }
} }
// printf("%d-aft: %f %f %f %f %f %f %f %f %d %d %d %d", ltid, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin); // printf("%d-aft: %f %f %f %f %f %f %f %f %d %d %d %d", ltid, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin);
...@@ -245,18 +228,14 @@ void within_kernel_reduction256x4(uchar4 feature_mask, ...@@ -245,18 +228,14 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#if USE_DP_FLOAT == 0 #if USE_DP_FLOAT == 0
// reverse the f3...f0 order to match the real order // reverse the f3...f0 order to match the real order
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin; local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f3_grad_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin; local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f3_hess_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin); local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f2_grad_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin; local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f2_hess_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin; local_hist[2 * 2 * NUM_BINS + ltid * 2 + 0] = f1_grad_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin); local_hist[2 * 2 * NUM_BINS + ltid * 2 + 1] = f1_hess_bin;
local_hist[2 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin; local_hist[3 * 2 * NUM_BINS + ltid * 2 + 0] = f0_grad_bin;
local_hist[2 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin; local_hist[3 * 2 * NUM_BINS + ltid * 2 + 1] = f0_hess_bin;
local_hist[2 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin);
local_hist[3 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin;
local_hist[3 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin;
local_hist[3 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
/* /*
for (ushort i = ltid; i < 4 * 3 * NUM_BINS; i += lsize) { for (ushort i = ltid; i < 4 * 3 * NUM_BINS; i += lsize) {
...@@ -267,34 +246,28 @@ void within_kernel_reduction256x4(uchar4 feature_mask, ...@@ -267,34 +246,28 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
if (feature_mask.s0) { if (feature_mask.s0) {
output_buf[i] = local_hist[i]; output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
} }
i += 1 * 3 * NUM_BINS; i += 1 * 2 * NUM_BINS;
if (feature_mask.s1) { if (feature_mask.s1) {
output_buf[i] = local_hist[i]; output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
} }
i += 1 * 3 * NUM_BINS; i += 1 * 2 * NUM_BINS;
if (feature_mask.s2) { if (feature_mask.s2) {
output_buf[i] = local_hist[i]; output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
} }
i += 1 * 3 * NUM_BINS; i += 1 * 2 * NUM_BINS;
if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) { if (feature_mask.s3 && i < 4 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i]; output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
} }
#else #else
// when double precision is used, we need to write twice, because local memory size is not enough // when double precision is used, we need to write twice, because local memory size is not enough
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin; local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f3_grad_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin; local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f3_hess_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin); local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f2_grad_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin; local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f2_hess_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
/* /*
for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) { for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) {
...@@ -305,21 +278,17 @@ void within_kernel_reduction256x4(uchar4 feature_mask, ...@@ -305,21 +278,17 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
if (feature_mask.s0) { if (feature_mask.s0) {
output_buf[i] = local_hist[i]; output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
} }
i += 1 * 3 * NUM_BINS; i += 1 * 2 * NUM_BINS;
if (feature_mask.s1) { if (feature_mask.s1) {
output_buf[i] = local_hist[i]; output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS]; output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin; local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f1_grad_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin; local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f1_hess_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin); local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f0_grad_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin; local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f0_hess_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
/* /*
for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) { for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) {
...@@ -328,15 +297,13 @@ void within_kernel_reduction256x4(uchar4 feature_mask, ...@@ -328,15 +297,13 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
*/ */
i = ltid; i = ltid;
if (feature_mask.s2) { if (feature_mask.s2) {
output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i]; output_buf[i + 2 * 2 * NUM_BINS] = local_hist[i];
output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS]; output_buf[i + 2 * 2 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
} }
i += 1 * 3 * NUM_BINS; i += 1 * 2 * NUM_BINS;
if (feature_mask.s3) { if (feature_mask.s3) {
output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i]; output_buf[i + 2 * 2 * NUM_BINS] = local_hist[i];
output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS]; output_buf[i + 2 * 2 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
} }
#endif #endif
} }
...@@ -401,7 +368,9 @@ __kernel void histogram256(__global const uchar4* feature_data_base, ...@@ -401,7 +368,9 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
__local acc_type * gh_hist = (__local acc_type *)shared_array; __local acc_type * gh_hist = (__local acc_type *)shared_array;
// counter histogram // counter histogram
// total size: 4 * 256 * size_of(uint) = 4 KB // total size: 4 * 256 * size_of(uint) = 4 KB
#if CONST_HESSIAN == 1
__local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS); __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS);
#endif
// thread 0, 1, 2, 3 compute histograms for gradients first // thread 0, 1, 2, 3 compute histograms for gradients first
// thread 4, 5, 6, 7 compute histograms for hessians first // thread 4, 5, 6, 7 compute histograms for hessians first
...@@ -602,7 +571,7 @@ R""() ...@@ -602,7 +571,7 @@ R""()
s0_stat1 += stat1; s0_stat1 += stat1;
s0_stat2 += stat2; s0_stat2 += stat2;
} }
#if CONST_HESSIAN == 1
// STAGE 3: accumulate counter // STAGE 3: accumulate counter
// there are 4 counters for 4 features // there are 4 counters for 4 features
// thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3
...@@ -633,6 +602,7 @@ R""() ...@@ -633,6 +602,7 @@ R""()
addr = bin * 4 + offset; addr = bin * 4 + offset;
atom_inc(cnt_hist + addr); atom_inc(cnt_hist + addr);
} }
#endif
stat1 = stat1_next; stat1 = stat1_next;
stat2 = stat2_next; stat2 = stat2_next;
feature4 = feature4_next; feature4 = feature4_next;
...@@ -741,7 +711,7 @@ R""() ...@@ -741,7 +711,7 @@ R""()
uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS); uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS);
// if there is only one workgroup processing this feature4, don't even need to write // if there is only one workgroup processing this feature4, don't even need to write
#if POWER_FEATURE_WORKGROUPS != 0 #if POWER_FEATURE_WORKGROUPS != 0
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS; __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 2 * NUM_BINS;
// write gradients and hessians // write gradients and hessians
__global acc_type * restrict ptr_f = output; __global acc_type * restrict ptr_f = output;
for (ushort j = 0; j < 4; ++j) { for (ushort j = 0; j < 4; ++j) {
...@@ -751,17 +721,7 @@ R""() ...@@ -751,17 +721,7 @@ R""()
acc_type value = gh_hist[i * 4 + j]; acc_type value = gh_hist[i * 4 + j];
ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value; ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value;
} }
ptr_f += 3 * NUM_BINS; ptr_f += 2 * NUM_BINS;
}
// write counts
__global acc_int_type * restrict ptr_i = (__global acc_int_type * restrict)(output + 2 * NUM_BINS);
for (ushort j = 0; j < 4; ++j) {
for (ushort i = ltid; i < NUM_BINS; i += lsize) {
// FIXME: 2-way bank conflict
uint value = cnt_hist[i * 4 + j];
ptr_i[i] = value;
}
ptr_i += 3 * NUM_BINS;
} }
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
mem_fence(CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE);
...@@ -788,7 +748,7 @@ R""() ...@@ -788,7 +748,7 @@ R""()
// The is done by using an global atomic counter. // The is done by using an global atomic counter.
// On AMD GPUs ideally this should be done in GDS, // On AMD GPUs ideally this should be done in GDS,
// but currently there is no easy way to access it via OpenCL. // but currently there is no easy way to access it via OpenCL.
__local uint * counter_val = cnt_hist; __local uint * counter_val = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS);;
// backup the old value // backup the old value
uint old_val = *counter_val; uint old_val = *counter_val;
if (ltid == 0) { if (ltid == 0) {
...@@ -814,11 +774,11 @@ R""() ...@@ -814,11 +774,11 @@ R""()
// locate our feature4's block in output memory // locate our feature4's block in output memory
uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS); uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS);
__global acc_type const * restrict feature4_subhists = __global acc_type const * restrict feature4_subhists =
(__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS; (__global acc_type *)output_buf + output_offset * 4 * 2 * NUM_BINS;
// skip reading the data already in local memory // skip reading the data already in local memory
uint skip_id = group_id ^ output_offset; uint skip_id = group_id ^ output_offset;
// locate output histogram location for this feature4 // locate output histogram location for this feature4
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS; __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 2 * NUM_BINS;
within_kernel_reduction256x4(feature_mask, feature4_subhists, skip_id, old_val, 1 << POWER_FEATURE_WORKGROUPS, within_kernel_reduction256x4(feature_mask, feature4_subhists, skip_id, old_val, 1 << POWER_FEATURE_WORKGROUPS,
hist_buf, (__local acc_type *)shared_array); hist_buf, (__local acc_type *)shared_array);
// if (ltid == 0) // if (ltid == 0)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment