Unverified Commit 509c2e50 authored by Guolin Ke, committed by GitHub

Support both row-wise and col-wise multi-threading (#2699)



* commit

* fix a bug

* fix bug

* reset to track changes

* refine the auto choose logic

* sort the time stats output

* fix include

* change  multi_val_bin_sparse_threshold

* add cmake

* add _mm_malloc and _mm_free for cross platform

* fix cmake bug

* timer for split

* try to fix cmake

* fix tests

* refactor DataPartition::Split

* fix test

* typo

* formating

* Revert "formating"

This reverts commit 5b8de4f7fb9d975ee23701d276a66d40ee6d4222.

* add document

* [R-package] Added tests on use of force_col_wise and force_row_wise in training (#2719)

* naming

* fix gpu code

* Update include/LightGBM/bin.h
Co-Authored-By: James Lamb <jaylamb20@gmail.com>

* Update src/treelearner/ocl/histogram16.cl

* test: swap compilers for CI

* fix omp

* not avx2

* no aligned for feature histogram

* Revert "refactor DataPartition::Split"

This reverts commit 256e6d9641ade966a1f54da1752e998a1149b6f8.

* slightly refactor data partition

* reduce the memory cost
Co-authored-by: James Lamb <jaylamb20@gmail.com>
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent bc7bc4a1
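
The headline change of this commit: histogram construction can now run either column-wise (per feature group) or row-wise (over a shared multi-val bin), and by default LightGBM times both on the first iteration and keeps the faster path. The new `force_col_wise` / `force_row_wise` parameters override that choice. A minimal sketch of pinning the strategy through the C API — the file name and objective are placeholders, and error handling is trimmed:

#include <LightGBM/c_api.h>
#include <cstdio>

int main() {
  DatasetHandle train = nullptr;
  // Pin the histogram strategy up front instead of letting the
  // built-in timing test decide.
  const char* params = "objective=binary force_row_wise=true";
  if (LGBM_DatasetCreateFromFile("train.csv", params, nullptr, &train) != 0) {
    std::printf("failed to load dataset\n");
    return 1;
  }
  BoosterHandle booster = nullptr;
  LGBM_BoosterCreate(train, params, &booster);
  int is_finished = 0;
  for (int iter = 0; iter < 10 && !is_finished; ++iter) {
    LGBM_BoosterUpdateOneIter(booster, &is_finished);
  }
  LGBM_BoosterFree(booster);
  LGBM_DatasetFree(train);
  return 0;
}

Setting both flags to true is rejected (see `TestMultiThreadingMethod` further down); setting neither triggers the automatic benchmark.
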
......@@ -1065,7 +1065,7 @@ int LGBM_DatasetAddFeaturesFrom(DatasetHandle target,
API_BEGIN();
auto target_d = reinterpret_cast<Dataset*>(target);
auto source_d = reinterpret_cast<Dataset*>(source);
target_d->addFeaturesFrom(source_d);
target_d->AddFeaturesFrom(source_d);
API_END();
}
......
......@@ -15,7 +15,8 @@
#include "dense_bin.hpp"
#include "dense_nbits_bin.hpp"
#include "ordered_sparse_bin.hpp"
#include "multi_val_dense_bin.hpp"
#include "multi_val_sparse_bin.hpp"
#include "sparse_bin.hpp"
namespace LightGBM {
......@@ -636,21 +637,10 @@ namespace LightGBM {
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;
template class MultiValDenseBin<uint8_t>;
template class MultiValDenseBin<uint16_t>;
template class MultiValDenseBin<uint32_t>;
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
bool is_enable_sparse, double sparse_threshold, bool* is_sparse) {
// sparse threshold
if (sparse_rate >= sparse_threshold && is_enable_sparse) {
*is_sparse = true;
return CreateSparseBin(num_data, num_bin);
} else {
*is_sparse = false;
return CreateDenseBin(num_data, num_bin);
}
}
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 16) {
......@@ -674,4 +664,25 @@ namespace LightGBM {
}
}
MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate) {
const double multi_val_bin_sparse_threshold = 0.25f;
if (sparse_rate >= multi_val_bin_sparse_threshold) {
if (num_bin <= 256) {
return new MultiValSparseBin<uint8_t>(num_data, num_bin);
} else if (num_bin <= 65536) {
return new MultiValSparseBin<uint16_t>(num_data, num_bin);
} else {
return new MultiValSparseBin<uint32_t>(num_data, num_bin);
}
} else {
if (num_bin <= 256) {
return new MultiValDenseBin<uint8_t>(num_data, num_bin, num_feature);
} else if (num_bin <= 65536) {
return new MultiValDenseBin<uint16_t>(num_data, num_bin, num_feature);
} else {
return new MultiValDenseBin<uint32_t>(num_data, num_bin, num_feature);
}
}
}
} // namespace LightGBM
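
For reference, the factory above picks the narrowest storage type that can hold every bin index — 8-bit up to 256 bins, 16-bit up to 65536, 32-bit beyond — and switches to the sparse layout once the sparse rate reaches 0.25. A toy standalone version of the width dispatch (the `Store` types are illustrative, not LightGBM's):

#include <cstdint>
#include <cstdio>
#include <memory>

struct Store {
  virtual ~Store() = default;
  virtual int width() const = 0;
};
template <typename T>
struct TypedStore : Store {
  int width() const override { return 8 * static_cast<int>(sizeof(T)); }
};

// Same three-way dispatch as CreateMultiValBin above: bin indices are
// 0-based, so 256 bins still fit in a single byte.
std::unique_ptr<Store> MakeStore(int num_bin) {
  if (num_bin <= 256) return std::make_unique<TypedStore<uint8_t>>();
  if (num_bin <= 65536) return std::make_unique<TypedStore<uint16_t>>();
  return std::make_unique<TypedStore<uint32_t>>();
}

int main() {
  std::printf("%d %d %d\n", MakeStore(255)->width(),
              MakeStore(1024)->width(), MakeStore(70000)->width());  // 8 16 32
}
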
......@@ -312,6 +312,11 @@ void Config::CheckParamConflict() {
num_leaves = static_cast<int>(full_num_leaves);
}
}
// force col-wise for gpu
if (device_type == std::string("gpu")) {
force_col_wise = true;
force_row_wise = false;
}
}
std::string Config::ToString() const {
......
......@@ -116,9 +116,6 @@ std::unordered_map<std::string, std::string> Config::alias_table({
{"is_pre_partition", "pre_partition"},
{"is_enable_bundle", "enable_bundle"},
{"bundle", "enable_bundle"},
{"is_sparse", "is_enable_sparse"},
{"enable_sparse", "is_enable_sparse"},
{"sparse", "is_enable_sparse"},
{"two_round_loading", "two_round"},
{"use_two_round_loading", "two_round"},
{"is_save_binary", "save_binary"},
......@@ -181,6 +178,8 @@ std::unordered_set<std::string> Config::parameter_set({
"num_threads",
"device_type",
"seed",
"force_col_wise",
"force_row_wise",
"max_depth",
"min_data_in_leaf",
"min_sum_hessian_in_leaf",
......@@ -236,9 +235,6 @@ std::unordered_set<std::string> Config::parameter_set({
"valid_data_initscores",
"pre_partition",
"enable_bundle",
"max_conflict_rate",
"is_enable_sparse",
"sparse_threshold",
"use_missing",
"zero_as_missing",
"two_round",
......@@ -309,6 +305,10 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
GetInt(params, "num_threads", &num_threads);
GetBool(params, "force_col_wise", &force_col_wise);
GetBool(params, "force_row_wise", &force_row_wise);
GetInt(params, "max_depth", &max_depth);
GetInt(params, "min_data_in_leaf", &min_data_in_leaf);
......@@ -467,16 +467,6 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
GetBool(params, "enable_bundle", &enable_bundle);
GetDouble(params, "max_conflict_rate", &max_conflict_rate);
CHECK(max_conflict_rate >=0.0);
CHECK(max_conflict_rate <1.0);
GetBool(params, "is_enable_sparse", &is_enable_sparse);
GetDouble(params, "sparse_threshold", &sparse_threshold);
CHECK(sparse_threshold >0.0);
CHECK(sparse_threshold <=1.0);
GetBool(params, "use_missing", &use_missing);
GetBool(params, "zero_as_missing", &zero_as_missing);
......@@ -600,6 +590,8 @@ std::string Config::SaveMembersToString() const {
str_buf << "[learning_rate: " << learning_rate << "]\n";
str_buf << "[num_leaves: " << num_leaves << "]\n";
str_buf << "[num_threads: " << num_threads << "]\n";
str_buf << "[force_col_wise: " << force_col_wise << "]\n";
str_buf << "[force_row_wise: " << force_row_wise << "]\n";
str_buf << "[max_depth: " << max_depth << "]\n";
str_buf << "[min_data_in_leaf: " << min_data_in_leaf << "]\n";
str_buf << "[min_sum_hessian_in_leaf: " << min_sum_hessian_in_leaf << "]\n";
......@@ -655,9 +647,6 @@ std::string Config::SaveMembersToString() const {
str_buf << "[valid_data_initscores: " << Common::Join(valid_data_initscores, ",") << "]\n";
str_buf << "[pre_partition: " << pre_partition << "]\n";
str_buf << "[enable_bundle: " << enable_bundle << "]\n";
str_buf << "[max_conflict_rate: " << max_conflict_rate << "]\n";
str_buf << "[is_enable_sparse: " << is_enable_sparse << "]\n";
str_buf << "[sparse_threshold: " << sparse_threshold << "]\n";
str_buf << "[use_missing: " << use_missing << "]\n";
str_buf << "[zero_as_missing: " << zero_as_missing << "]\n";
str_buf << "[two_round: " << two_round << "]\n";
......
......@@ -36,6 +36,7 @@ Dataset::Dataset(data_size_t num_data) {
}
Dataset::~Dataset() {
}
std::vector<std::vector<int>> NoGroup(
......@@ -48,19 +49,20 @@ std::vector<std::vector<int>> NoGroup(
return features_in_group;
}
int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_indices, int max_cnt) {
int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_indices, data_size_t max_cnt) {
int ret = 0;
for (int i = 0; i < num_indices; ++i) {
if (mark[indices[i]]) {
++ret;
if (ret > max_cnt) {
return -1;
}
}
if (ret > max_cnt) {
return -1;
}
}
return ret;
}
void MarkUsed(std::vector<bool>* mark, const int* indices, int num_indices) {
void MarkUsed(std::vector<bool>* mark, const int* indices, data_size_t num_indices) {
auto& ref_mark = *mark;
for (int i = 0; i < num_indices; ++i) {
ref_mark[indices[i]] = true;
......@@ -93,29 +95,31 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa
int** sample_indices,
const int* num_per_col,
int num_sample_col,
size_t total_sample_cnt,
data_size_t max_error_cnt,
data_size_t filter_cnt,
data_size_t total_sample_cnt,
data_size_t num_data,
bool is_use_gpu) {
bool is_use_gpu,
std::vector<int8_t>* multi_val_group) {
const int max_search_group = 100;
const int gpu_max_bin_per_group = 256;
const int max_bin_per_group = 256;
const data_size_t single_val_max_conflict_cnt = static_cast<data_size_t>(total_sample_cnt / 10000);
multi_val_group->clear();
Random rand(num_data);
std::vector<std::vector<int>> features_in_group;
std::vector<std::vector<bool>> conflict_marks;
std::vector<int> group_conflict_cnt;
std::vector<size_t> group_non_zero_cnt;
std::vector<data_size_t> group_used_row_cnt;
std::vector<data_size_t> group_total_data_cnt;
std::vector<int> group_num_bin;
// first round: fill the single val group
for (auto fidx : find_order) {
bool is_filtered_feature = fidx >= num_sample_col;
const size_t cur_non_zero_cnt = is_filtered_feature ? 0: num_per_col[fidx];
bool need_new_group = true;
const data_size_t cur_non_zero_cnt = is_filtered_feature ? 0 : num_per_col[fidx];
std::vector<int> available_groups;
for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
if (group_non_zero_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + max_error_cnt) {
if (!is_use_gpu || group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0)
<= gpu_max_bin_per_group) {
auto cur_num_bin = group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
if (group_total_data_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + single_val_max_conflict_cnt) {
if (!is_use_gpu || cur_num_bin <= max_bin_per_group) {
available_groups.push_back(gid);
}
}
......@@ -124,44 +128,82 @@ std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMa
if (!available_groups.empty()) {
int last = static_cast<int>(available_groups.size()) - 1;
auto indices = rand.Sample(last, std::min(last, max_search_group - 1));
// always push the last group
search_groups.push_back(available_groups.back());
for (auto idx : indices) {
search_groups.push_back(available_groups[idx]);
}
}
int best_gid = -1;
int best_conflict_cnt = -1;
for (auto gid : search_groups) {
const int rest_max_cnt = max_error_cnt - group_conflict_cnt[gid];
const int cnt = is_filtered_feature ? 0 : GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
if (cnt >= 0 && cnt <= rest_max_cnt) {
data_size_t rest_non_zero_data = static_cast<data_size_t>(
static_cast<double>(cur_non_zero_cnt - cnt) * num_data / total_sample_cnt);
if (rest_non_zero_data < filter_cnt) { continue; }
need_new_group = false;
features_in_group[gid].push_back(fidx);
group_conflict_cnt[gid] += cnt;
group_non_zero_cnt[gid] += cur_non_zero_cnt - cnt;
if (!is_filtered_feature) {
MarkUsed(&conflict_marks[gid], sample_indices[fidx], num_per_col[fidx]);
}
if (is_use_gpu) {
group_num_bin[gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
}
const data_size_t rest_max_cnt = single_val_max_conflict_cnt - group_total_data_cnt[gid] + group_used_row_cnt[gid];
const data_size_t cnt = is_filtered_feature ? 0 : GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
if (cnt >= 0 && cnt <= rest_max_cnt && cnt <= cur_non_zero_cnt / 2) {
best_gid = gid;
best_conflict_cnt = cnt;
break;
}
}
if (need_new_group) {
if (best_gid >= 0) {
features_in_group[best_gid].push_back(fidx);
group_total_data_cnt[best_gid] += cur_non_zero_cnt;
group_used_row_cnt[best_gid] += cur_non_zero_cnt - best_conflict_cnt;
if (!is_filtered_feature) {
MarkUsed(&conflict_marks[best_gid], sample_indices[fidx], num_per_col[fidx]);
}
group_num_bin[best_gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
} else {
features_in_group.emplace_back();
features_in_group.back().push_back(fidx);
group_conflict_cnt.push_back(0);
conflict_marks.emplace_back(total_sample_cnt, false);
if (!is_filtered_feature) {
MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]);
}
group_non_zero_cnt.emplace_back(cur_non_zero_cnt);
if (is_use_gpu) {
group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
group_total_data_cnt.emplace_back(cur_non_zero_cnt);
group_used_row_cnt.emplace_back(cur_non_zero_cnt);
group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
}
}
std::vector<int> second_round_features;
std::vector<std::vector<int>> features_in_group2;
std::vector<std::vector<bool>> conflict_marks2;
const double dense_threshold = 0.4;
for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
const double dense_rate = static_cast<double>(group_used_row_cnt[gid]) / total_sample_cnt;
if (dense_rate >= dense_threshold) {
features_in_group2.push_back(std::move(features_in_group[gid]));
conflict_marks2.push_back(std::move(conflict_marks[gid]));
} else {
for (auto fidx : features_in_group[gid]) {
second_round_features.push_back(fidx);
}
}
}
features_in_group = features_in_group2;
conflict_marks = conflict_marks2;
multi_val_group->resize(features_in_group.size(), false);
if (!second_round_features.empty()) {
features_in_group.emplace_back();
conflict_marks.emplace_back(total_sample_cnt, false);
bool is_multi_val = is_use_gpu ? true : false;
int conflict_cnt = 0;
for (auto fidx : second_round_features) {
features_in_group.back().push_back(fidx);
if (!is_multi_val) {
const int rest_max_cnt = single_val_max_conflict_cnt - conflict_cnt;
const auto cnt = GetConfilctCount(conflict_marks.back(), sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
conflict_cnt += cnt;
if (cnt < 0 || conflict_cnt > single_val_max_conflict_cnt) {
is_multi_val = true;
continue;
}
MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]);
}
}
multi_val_group->push_back(is_multi_val);
}
return features_in_group;
}
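
FindGroups is a greedy pass: for each feature it looks for an existing group whose non-zero rows rarely collide with the feature's own, and opens a new group otherwise; a second round then merges the groups that stayed too sparse into one multi-val group. A self-contained toy of the first-round conflict test — the data and conflict budget below are made up:

#include <cstdio>
#include <vector>

// Count rows where the feature is non-zero AND the group already has a
// non-zero value there; bail out with -1 once the budget is exceeded.
int ConflictCount(const std::vector<bool>& mark,
                  const std::vector<int>& rows, int max_cnt) {
  int cnt = 0;
  for (int r : rows) {
    if (mark[r] && ++cnt > max_cnt) return -1;
  }
  return cnt;
}

int main() {
  const int num_rows = 8;
  // Non-zero row indices of three sparse features.
  const std::vector<std::vector<int>> features = {
      {0, 1, 2}, {4, 5, 6}, {1, 2, 3}};
  const int max_conflict = 1;            // tolerated overlaps per group
  std::vector<std::vector<int>> groups;  // feature ids per group
  std::vector<std::vector<bool>> marks;  // per-group non-zero row marks
  for (int fid = 0; fid < static_cast<int>(features.size()); ++fid) {
    int best = -1;
    for (int gid = 0; gid < static_cast<int>(groups.size()); ++gid) {
      if (ConflictCount(marks[gid], features[fid], max_conflict) >= 0) {
        best = gid;
        break;
      }
    }
    if (best < 0) {  // no compatible group: open a new one
      groups.emplace_back();
      marks.emplace_back(num_rows, false);
      best = static_cast<int>(groups.size()) - 1;
    }
    groups[best].push_back(fid);
    for (int r : features[fid]) marks[best][r] = true;
  }
  for (size_t g = 0; g < groups.size(); ++g) {
    std::printf("group %zu:", g);
    for (int fid : groups[g]) std::printf(" f%d", fid);
    std::printf("\n");  // group 0: f0 f1 / group 1: f2
  }
}
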
......@@ -171,17 +213,12 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
double** sample_values,
const int* num_per_col,
int num_sample_col,
size_t total_sample_cnt,
data_size_t total_sample_cnt,
const std::vector<int>& used_features,
double max_conflict_rate,
data_size_t num_data,
data_size_t min_data,
double sparse_threshold,
bool is_enable_sparse,
bool is_use_gpu) {
// filter is based on sampling data, so decrease its range
const data_size_t filter_cnt = static_cast<data_size_t>(static_cast<double>(0.95 * min_data) / num_data * total_sample_cnt);
const data_size_t max_error_cnt = static_cast<data_size_t>(total_sample_cnt * max_conflict_rate);
bool is_use_gpu,
std::vector<int8_t>* multi_val_group) {
Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer);
std::vector<size_t> feature_non_zero_cnt;
feature_non_zero_cnt.reserve(used_features.size());
// put dense feature first
......@@ -209,6 +246,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
for (auto sidx : sorted_idx) {
feature_order_by_cnt.push_back(used_features[sidx]);
}
std::vector<std::vector<int>> tmp_indices;
std::vector<int> tmp_num_per_col(num_sample_col, 0);
for (auto fidx : used_features) {
......@@ -224,42 +262,25 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_
tmp_num_per_col[fidx] = num_per_col[fidx];
}
}
auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data, is_use_gpu);
auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data, is_use_gpu);
std::vector<int8_t> group_is_multi_val, group_is_multi_val2;
auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val);
auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, num_data, is_use_gpu, &group_is_multi_val2);
if (features_in_group.size() > group2.size()) {
features_in_group = group2;
}
std::vector<std::vector<int>> ret;
for (size_t i = 0; i < features_in_group.size(); ++i) {
if (features_in_group[i].size() <= 1 || features_in_group[i].size() >= 5) {
ret.push_back(features_in_group[i]);
} else {
int cnt_non_zero = 0;
for (size_t j = 0; j < features_in_group[i].size(); ++j) {
const int fidx = features_in_group[i][j];
cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers[fidx]->sparse_rate()));
}
double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
// take apart small sparse group, due it will not gain on speed
if (sparse_rate >= sparse_threshold && is_enable_sparse) {
for (size_t j = 0; j < features_in_group[i].size(); ++j) {
const int fidx = features_in_group[i][j];
ret.emplace_back();
ret.back().push_back(fidx);
}
} else {
ret.push_back(features_in_group[i]);
}
}
group_is_multi_val = group_is_multi_val2;
}
// shuffle groups
int num_group = static_cast<int>(ret.size());
Random tmp_rand(12);
int num_group = static_cast<int>(features_in_group.size());
Random tmp_rand(num_data);
for (int i = 0; i < num_group - 1; ++i) {
int j = tmp_rand.NextShort(i + 1, num_group);
std::swap(ret[i], ret[j]);
std::swap(features_in_group[i], features_in_group[j]);
// Note: std::swap on vector<bool> elements gives the wrong result
// (its elements are bit proxies), which is why int8_t is used here.
std::swap(group_is_multi_val[i], group_is_multi_val[j]);
}
return ret;
*multi_val_group = group_is_multi_val;
return features_in_group;
}
void Dataset::Construct(
......@@ -274,7 +295,6 @@ void Dataset::Construct(
const Config& io_config) {
num_total_features_ = num_total_features;
CHECK(num_total_features_ == static_cast<int>(bin_mappers->size()));
sparse_threshold_ = io_config.sparse_threshold;
// get num_features
std::vector<int> used_features;
auto& ref_bin_mappers = *bin_mappers;
......@@ -287,13 +307,11 @@ void Dataset::Construct(
Log::Warning("There are no meaningful features, as all feature values are constant.");
}
auto features_in_group = NoGroup(used_features);
std::vector<int8_t> group_is_multi_val(used_features.size(), 0);
if (io_config.enable_bundle && !used_features.empty()) {
features_in_group = FastFeatureBundling(*bin_mappers,
sample_non_zero_indices, sample_values, num_per_col, num_sample_col, total_sample_cnt,
used_features, io_config.max_conflict_rate,
num_data_, io_config.min_data_in_leaf,
sparse_threshold_, io_config.is_enable_sparse, io_config.device_type == std::string("gpu"));
sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast<data_size_t>(total_sample_cnt),
used_features, num_data_, io_config.device_type == std::string("gpu"), &group_is_multi_val);
}
num_features_ = 0;
......@@ -306,10 +324,14 @@ void Dataset::Construct(
real_feature_idx_.resize(num_features_);
feature2group_.resize(num_features_);
feature2subfeature_.resize(num_features_);
int num_multi_val_group = 0;
feature_need_push_zeros_.clear();
for (int i = 0; i < num_groups_; ++i) {
auto cur_features = features_in_group[i];
int cur_cnt_features = static_cast<int>(cur_features.size());
if (group_is_multi_val[i]) {
++num_multi_val_group;
}
// get bin_mappers
std::vector<std::unique_ptr<BinMapper>> cur_bin_mappers;
for (int j = 0; j < cur_cnt_features; ++j) {
......@@ -325,8 +347,7 @@ void Dataset::Construct(
++cur_fidx;
}
feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(cur_cnt_features, &cur_bin_mappers, num_data_, sparse_threshold_,
io_config.is_enable_sparse)));
new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_)));
}
feature_groups_.shrink_to_fit();
group_bin_boundaries_.clear();
......@@ -414,9 +435,6 @@ void Dataset::ResetConfig(const char* parameters) {
if (param.count("zero_as_missing") && io_config.zero_as_missing != zero_as_missing_) {
Log::Warning("Cannot change zero_as_missing after constructed Dataset handle.");
}
if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
}
if (param.count("forcedbins_filename")) {
Log::Warning("Cannot change forced bins after constructed Dataset handle.");
}
......@@ -452,23 +470,229 @@ void Dataset::ResetConfig(const char* parameters) {
void Dataset::FinishLoad() {
if (is_finish_load_) { return; }
if (num_groups_ > 0) {
OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_groups_; ++i) {
OMP_LOOP_EX_BEGIN();
feature_groups_[i]->bin_data_->FinishLoad();
OMP_LOOP_EX_END();
feature_groups_[i]->FinishLoad();
}
OMP_THROW_EX();
}
is_finish_load_ = true;
}
void PushDataToMultiValBin(int num_threads, data_size_t num_data, const std::vector<uint32_t> most_freq_bins,
const std::vector<uint32_t> offsets, std::vector<std::vector<std::unique_ptr<BinIterator>>>& iters, MultiValBin* ret) {
Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin", global_timer);
const data_size_t min_block_size = 4096;
const int n_block = std::min(num_threads, (num_data + min_block_size - 1) / min_block_size);
const data_size_t block_size = (num_data + n_block - 1) / n_block;
if (ret->IsSparse()) {
#pragma omp parallel for schedule(static)
for (int tid = 0; tid < n_block; ++tid) {
std::vector<uint32_t> cur_data;
data_size_t start = tid * block_size;
data_size_t end = std::min(num_data, start + block_size);
for (size_t j = 0; j < most_freq_bins.size(); ++j) {
iters[tid][j]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
cur_data.clear();
for (size_t j = 0; j < most_freq_bins.size(); ++j) {
auto cur_bin = iters[tid][j]->Get(i);
if (cur_bin == most_freq_bins[j]) {
continue;
}
cur_bin += offsets[j];
if (most_freq_bins[j] == 0) {
cur_bin -= 1;
}
cur_data.push_back(cur_bin);
}
ret->PushOneRow(tid, i, cur_data);
}
}
} else {
#pragma omp parallel for schedule(static)
for (int tid = 0; tid < n_block; ++tid) {
std::vector<uint32_t> cur_data;
data_size_t start = tid * block_size;
data_size_t end = std::min(num_data, start + block_size);
for (size_t j = 0; j < most_freq_bins.size(); ++j) {
iters[tid][j]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
cur_data.clear();
for (size_t j = 0; j < most_freq_bins.size(); ++j) {
auto cur_bin = iters[tid][j]->Get(i);
if (cur_bin == most_freq_bins[j]) {
cur_bin = 0;
} else {
cur_bin += offsets[j];
if (most_freq_bins[j] == 0) {
cur_bin -= 1;
}
}
cur_data.push_back(cur_bin);
}
ret->PushOneRow(tid, i, cur_data);
}
}
}
}
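
The push above relies on each feature owning a contiguous slice of the shared bin range: `offsets[j]` shifts feature `j`'s local bin into the global space, the most-frequent bin is never stored in the sparse case, and when that bin is 0 the slice shrinks by one. A small sketch of the encoding for one row — the two features and their offsets are hand-computed toy values:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Feature 0: 4 bins, most-frequent bin 0. Feature 1: 3 bins,
  // most-frequent bin 2. Offsets follow the scheme used by
  // GetMultiBinFromAllFeatures (global bin 0 is reserved).
  const std::vector<uint32_t> most_freq = {0, 2};
  const std::vector<uint32_t> offsets = {1, 4};
  const std::vector<uint32_t> row = {2, 1};  // raw per-feature bins, one row
  std::vector<uint32_t> encoded;             // sparse multi-val encoding
  for (size_t j = 0; j < row.size(); ++j) {
    uint32_t bin = row[j];
    if (bin == most_freq[j]) continue;  // most-frequent bin stays implicit
    bin += offsets[j];
    if (most_freq[j] == 0) bin -= 1;    // bin 0 was removed from the slice
    encoded.push_back(bin);
  }
  for (uint32_t b : encoded) std::printf("%u ", b);  // prints "2 5"
  std::printf("\n");
}
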
MultiValBin* Dataset::GetMultiBinFromSparseFeatures() const {
Common::FunctionTimer fun_time("Dataset::GetMultiBinFromSparseFeatures", global_timer);
int multi_group_id = -1;
for (int i = 0; i < num_groups_; ++i) {
if (feature_groups_[i]->is_multi_val_) {
if (multi_group_id < 0) {
multi_group_id = i;
} else {
Log::Fatal("Bug. There should be only one multi-val group.");
}
}
}
if (multi_group_id < 0) {
return nullptr;
}
const auto& offsets = feature_groups_[multi_group_id]->bin_offsets_;
const int num_feature = feature_groups_[multi_group_id]->num_feature_;
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
std::vector<uint32_t> most_freq_bins;
double sum_sparse_rate = 0;
for (int i = 0; i < num_feature; ++i) {
for (int tid = 0; tid < num_threads; ++tid) {
iters[tid].emplace_back(feature_groups_[multi_group_id]->SubFeatureIterator(i));
}
most_freq_bins.push_back(feature_groups_[multi_group_id]->bin_mappers_[i]->GetMostFreqBin());
sum_sparse_rate += feature_groups_[multi_group_id]->bin_mappers_[i]->sparse_rate();
}
sum_sparse_rate /= num_feature;
Log::Debug("GetMultiBinFromSparseFeatures:: sparse rate %f", sum_sparse_rate);
std::unique_ptr<MultiValBin> ret;
ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back(), num_feature, sum_sparse_rate));
PushDataToMultiValBin(num_threads, num_data_, most_freq_bins, offsets, iters, ret.get());
ret->FinishLoad();
return ret.release();
}
MultiValBin* Dataset::GetMultiBinFromAllFeatures() const {
Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures", global_timer);
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
double sum_dense_ratio = 0;
std::unique_ptr<MultiValBin> ret;
std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
std::vector<uint32_t> most_freq_bins;
std::vector<uint32_t> offsets;
int num_total_bin = 1;
offsets.push_back(num_total_bin);
for (int gid = 0; gid < num_groups_; ++gid) {
if (feature_groups_[gid]->is_multi_val_) {
for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
sum_dense_ratio += 1.0f - bin_mapper->sparse_rate();
most_freq_bins.push_back(bin_mapper->GetMostFreqBin());
num_total_bin += bin_mapper->num_bin();
if (most_freq_bins.back() == 0) {
num_total_bin -= 1;
}
offsets.push_back(num_total_bin);
for (int tid = 0; tid < num_threads; ++tid) {
iters[tid].emplace_back(feature_groups_[gid]->SubFeatureIterator(fid));
}
}
} else {
most_freq_bins.push_back(0);
num_total_bin += feature_groups_[gid]->bin_offsets_.back() - 1;
for (int tid = 0; tid < num_threads; ++tid) {
iters[tid].emplace_back(feature_groups_[gid]->FeatureGroupIterator());
}
offsets.push_back(num_total_bin);
for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
sum_dense_ratio += 1.0f - bin_mapper->sparse_rate();
}
}
}
sum_dense_ratio /= static_cast<double>(most_freq_bins.size());
Log::Debug("GetMultiBinFromAllFeatures:: sparse rate %f", 1.0 - sum_dense_ratio);
ret.reset(MultiValBin::CreateMultiValBin(num_data_, num_total_bin, static_cast<int>(most_freq_bins.size()), 1.0 - sum_dense_ratio));
PushDataToMultiValBin(num_threads, num_data_, most_freq_bins, offsets, iters, ret.get());
ret->FinishLoad();
return ret.release();
}
MultiValBin* Dataset::TestMultiThreadingMethod(score_t* gradients, score_t* hessians, const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const {
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{ num_threads = omp_get_num_threads(); }
Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod", global_timer);
if (force_colwise && force_rowwise) {
Log::Fatal("cannot set both `force_col_wise` and `force_row_wise` to `true`.");
}
if (num_groups_ <= 0) {
return nullptr;
}
if (force_colwise) {
*is_hist_col_wise = true;
return GetMultiBinFromSparseFeatures();
} else if (force_rowwise) {
*is_hist_col_wise = false;
auto ret = GetMultiBinFromAllFeatures();
const int num_bin_aligned =
(ret->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
hist_buf_.resize(static_cast<size_t>(num_bin_aligned) * 2 * num_threads);
return ret;
} else {
std::unique_ptr<MultiValBin> sparse_bin;
std::unique_ptr<MultiValBin> all_bin;
sparse_bin.reset(GetMultiBinFromSparseFeatures());
all_bin.reset(GetMultiBinFromAllFeatures());
std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>> hist_data(NumTotalBin() * 2);
const int num_bin_aligned =
(all_bin->num_bin() + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
hist_buf_.resize(static_cast<size_t>(num_bin_aligned) * 2 * num_threads);
std::chrono::duration<double, std::milli> col_wise_time, row_wise_time;
auto start_time = std::chrono::steady_clock::now();
ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, hessians, gradients, hessians, is_constant_hessian, sparse_bin.get(), true, hist_data.data());
col_wise_time = std::chrono::steady_clock::now() - start_time;
start_time = std::chrono::steady_clock::now();
ConstructHistogramsMultiVal(all_bin.get(), nullptr, num_data_, gradients, hessians, is_constant_hessian, hist_data.data());
row_wise_time = std::chrono::steady_clock::now() - start_time;
Log::Debug("colwise cost %f seconds, rowwise cost %f seconds", col_wise_time * 1e-3, row_wise_time * 1e-3);
if (col_wise_time < row_wise_time) {
*is_hist_col_wise = true;
hist_buf_.clear();
return sparse_bin.release();
} else {
*is_hist_col_wise = false;
Log::Info("Use row-wise multi-threading, may increase memory usage. If memory is not enough, you can set `force_col_wise=true`.");
if (all_bin->IsSparse()) {
Log::Debug("Use Sparse Multi-Val Bin");
} else {
Log::Debug("Use Dense Multi-Val Bin");
}
return all_bin.release();
}
}
}
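
When neither flag is forced, the method literally benchmarks both code paths on the real gradients and keeps the winner. That pattern, reduced to a standalone sketch (the two workloads here are dummies standing in for the col-wise and row-wise histogram kernels):

#include <chrono>
#include <cstdio>

// Times one call of a callable in milliseconds.
template <typename Fn>
double TimeMs(Fn&& fn) {
  const auto start = std::chrono::steady_clock::now();
  fn();
  const std::chrono::duration<double, std::milli> elapsed =
      std::chrono::steady_clock::now() - start;
  return elapsed.count();
}

int main() {
  volatile double sink = 0;
  auto col_wise = [&] { for (int i = 0; i < (1 << 24); ++i) sink = sink + i; };
  auto row_wise = [&] { for (int i = 0; i < (1 << 22); ++i) sink = sink + i; };
  const double col_ms = TimeMs(col_wise);
  const double row_ms = TimeMs(row_wise);
  std::printf("col-wise %.2f ms, row-wise %.2f ms -> keep %s\n", col_ms,
              row_ms, col_ms < row_ms ? "col-wise" : "row-wise");
}
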
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
feature_groups_.clear();
num_features_ = dataset->num_features_;
num_groups_ = dataset->num_groups_;
sparse_threshold_ = dataset->sparse_threshold_;
// copy feature bin mapper data
for (int i = 0; i < num_groups_; ++i) {
std::vector<std::unique_ptr<BinMapper>> bin_mappers;
......@@ -477,9 +701,9 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
}
feature_groups_.emplace_back(new FeatureGroup(
dataset->feature_groups_[i]->num_feature_,
dataset->feature_groups_[i]->is_multi_val_,
&bin_mappers,
num_data_,
dataset->feature_groups_[i]->is_sparse_));
num_data_));
}
feature_groups_.shrink_to_fit();
used_feature_map_ = dataset->used_feature_map_;
......@@ -502,8 +726,6 @@ void Dataset::CreateValid(const Dataset* dataset) {
feature_groups_.clear();
num_features_ = dataset->num_features_;
num_groups_ = num_features_;
sparse_threshold_ = dataset->sparse_threshold_;
bool is_enable_sparse = true;
feature2group_.clear();
feature2subfeature_.clear();
// copy feature bin mapper data
......@@ -514,12 +736,8 @@ void Dataset::CreateValid(const Dataset* dataset) {
if (bin_mappers.back()->GetDefaultBin() != bin_mappers.back()->GetMostFreqBin()) {
feature_need_push_zeros_.push_back(i);
}
feature_groups_.emplace_back(new FeatureGroup(
1,
&bin_mappers,
num_data_,
sparse_threshold_,
is_enable_sparse));
feature_groups_.emplace_back(new FeatureGroup(&bin_mappers,
num_data_));
feature2group_.push_back(i);
feature2subfeature_.push_back(0);
}
......@@ -721,7 +939,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
writer->Write(binary_file_token, size_of_token);
// get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_) + sizeof(sparse_threshold_)
+ sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_)
+ 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ + sizeof(int8_t) * num_features_
+ sizeof(double) * num_features_ + sizeof(int32_t) * num_total_features_ + sizeof(int) * 3 + sizeof(bool) * 2;
// size of feature names
......@@ -743,7 +961,6 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
writer->Write(&min_data_in_bin_, sizeof(min_data_in_bin_));
writer->Write(&use_missing_, sizeof(use_missing_));
writer->Write(&zero_as_missing_, sizeof(zero_as_missing_));
writer->Write(&sparse_threshold_, sizeof(sparse_threshold_));
writer->Write(used_feature_map_.data(), sizeof(int) * num_total_features_);
writer->Write(&num_groups_, sizeof(num_groups_));
writer->Write(real_feature_idx_.data(), sizeof(int) * num_features_);
......@@ -866,20 +1083,110 @@ void Dataset::DumpTextFile(const char* text_filename) {
fclose(file);
}
void Dataset::ConstructHistogramsMultiVal(const MultiValBin* multi_val_bin, const data_size_t* data_indices, data_size_t num_data,
const score_t* gradients, const score_t* hessians,
bool is_constant_hessian,
hist_t* hist_data) const {
Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal", global_timer);
if (multi_val_bin == nullptr) { return; }
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
global_timer.Start("Dataset::sparse_bin_histogram");
const int num_bin = multi_val_bin->num_bin();
const int num_bin_aligned = (num_bin + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
const int min_data_block_size = 1024;
const int n_data_block = std::min(num_threads, (num_data + min_data_block_size - 1) / min_data_block_size);
const int data_block_size = (num_data + n_data_block - 1) / n_data_block;
const size_t buf_size = static_cast<size_t>(n_data_block - 1)* num_bin_aligned * 2;
if (hist_buf_.size() < buf_size) {
hist_buf_.resize(buf_size);
}
#pragma omp parallel for schedule(static)
for (int tid = 0; tid < n_data_block; ++tid) {
data_size_t start = tid * data_block_size;
data_size_t end = std::min(start + data_block_size, num_data);
auto data_ptr = hist_data;
if (tid > 0) {
data_ptr = hist_buf_.data() + static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
}
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin* KHistEntrySize);
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, hessians, data_ptr);
} else {
multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, data_ptr);
}
} else {
if (!is_constant_hessian) {
multi_val_bin->ConstructHistogram(start, end, gradients, hessians, data_ptr);
} else {
multi_val_bin->ConstructHistogram(start, end, gradients, data_ptr);
}
}
}
global_timer.Stop("Dataset::sparse_bin_histogram");
global_timer.Start("Dataset::sparse_bin_histogram_merge");
const int min_bin_block_size = 512;
const int n_bin_block = std::min(num_threads, (num_bin + min_bin_block_size - 1) / min_bin_block_size);
const int bin_block_size = (num_bin + n_bin_block - 1) / n_bin_block;
if (!is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = hist_buf_.data() + static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i];
}
}
}
} else {
#pragma omp parallel for schedule(static)
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin);
for (int tid = 1; tid < n_data_block; ++tid) {
auto src_ptr = hist_buf_.data() + static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
hist_data[i] += src_ptr[i];
}
}
for (int i = start; i < end; i++) {
GET_HESS(hist_data, i) = GET_HESS(hist_data, i) * hessians[0];
}
}
}
global_timer.Stop("Dataset::sparse_bin_histogram_merge");
}
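
The structure above — per-thread histogram buffers filled over disjoint data blocks, then a merge phase parallelized over bins so no two threads ever write the same entry — is the core of the row-wise path. A minimal self-contained OpenMP sketch of the same two-phase scheme (plain counts instead of gradient/hessian pairs):

#include <omp.h>

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int num_data = 1 << 16, num_bin = 8;
  std::vector<int> bins(num_data);
  for (int i = 0; i < num_data; ++i) bins[i] = i % num_bin;
  int num_threads = 1;
#pragma omp parallel
#pragma omp master
  {
    num_threads = omp_get_num_threads();
  }
  const int block = (num_data + num_threads - 1) / num_threads;
  // Phase 1: one private histogram per data block, no synchronization.
  std::vector<std::vector<int>> buf(num_threads, std::vector<int>(num_bin, 0));
#pragma omp parallel for schedule(static)
  for (int tid = 0; tid < num_threads; ++tid) {
    const int start = tid * block;
    const int end = std::min(num_data, start + block);
    for (int i = start; i < end; ++i) ++buf[tid][bins[i]];
  }
  // Phase 2: merge, parallelized over bins so writes never collide.
  std::vector<int> hist(num_bin, 0);
#pragma omp parallel for schedule(static)
  for (int b = 0; b < num_bin; ++b) {
    for (int tid = 0; tid < num_threads; ++tid) hist[b] += buf[tid][b];
  }
  for (int b = 0; b < num_bin; ++b) std::printf("bin %d: %d\n", b, hist[b]);
}
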
void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>* ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian,
HistogramBinEntry* hist_data) const {
if (leaf_idx < 0 || num_data < 0 || hist_data == nullptr) {
const MultiValBin* multi_val_bin, bool is_colwise,
hist_t* hist_data) const {
Common::FunctionTimer fun_timer("Dataset::ConstructHistograms", global_timer);
if (num_data < 0 || hist_data == nullptr) {
return;
}
std::vector<int> used_group;
used_group.reserve(num_groups_);
if (!is_colwise) {
return ConstructHistogramsMultiVal(multi_val_bin, data_indices, num_data, gradients, hessians, is_constant_hessian, hist_data);
}
global_timer.Start("Dataset::Get used group");
std::vector<int> used_dense_group;
int multi_val_group_id = -1;
used_dense_group.reserve(num_groups_);
for (int group = 0; group < num_groups_; ++group) {
const int f_cnt = group_feature_cnt_[group];
bool is_group_used = false;
......@@ -891,172 +1198,137 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
}
}
if (is_group_used) {
used_group.push_back(group);
}
}
int num_used_group = static_cast<int>(used_group.size());
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
auto& ref_ordered_bins = *ordered_bins;
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
if (feature_groups_[group]->is_multi_val_) {
multi_val_group_id = group;
} else {
used_dense_group.push_back(group);
}
}
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ref_ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
0,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// used ordered bin
ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
}
int num_used_dense_group = static_cast<int>(used_dense_group.size());
global_timer.Stop("Dataset::Get used group");
global_timer.Start("Dataset::dense_bin_histogram");
if (num_used_dense_group > 0) {
auto ptr_ordered_grad = gradients;
auto ptr_ordered_hess = hessians;
if (data_indices != nullptr && num_data < num_data_) {
if (!is_constant_hessian) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
ordered_hessians[i] = hessians[data_indices[i]];
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
ordered_gradients[i] = gradients[data_indices[i]];
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ref_ordered_bins[group] == nullptr) {
// if not use ordered bin
ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians;
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
0,
num_data,
ptr_ordered_grad,
data_ptr);
} else {
// used ordered bin
ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
data_ptr);
}
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
data_ptr);
OMP_LOOP_EX_END();
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
}
} else {
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ref_ordered_bins[group] == nullptr) {
// if not use ordered bin
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
0,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// used ordered bin
ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
}
OMP_LOOP_EX_END();
}
OMP_LOOP_EX_END();
OMP_THROW_EX();
}
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ref_ordered_bins[group] == nullptr) {
// if not use ordered bin
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
0,
num_data,
ptr_ordered_grad,
data_ptr);
} else {
// used ordered bin
ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
data_ptr);
0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
OMP_LOOP_EX_END();
}
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi];
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * KHistEntrySize);
// construct histograms for smaller leaf
feature_groups_[group]->bin_data_->ConstructHistogram(
0, num_data, ptr_ordered_grad, data_ptr);
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
}
OMP_LOOP_EX_END();
}
OMP_LOOP_EX_END();
OMP_THROW_EX();
}
OMP_THROW_EX();
}
}
global_timer.Stop("Dataset::dense_bin_histogram");
if (multi_val_groud_id >= 0) {
ConstructHistogramsMultiVal(multi_val_bin, data_indices, num_data, gradients, hessians, is_constant_hessian,
hist_data + group_bin_boundaries_[multi_val_group_id] * 2);
}
}
void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const {
void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const {
const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get();
const int most_freq_bin = bin_mapper->GetMostFreqBin();
if (most_freq_bin > 0) {
const int num_bin = bin_mapper->num_bin();
data[most_freq_bin].sum_gradients = sum_gradient;
data[most_freq_bin].sum_hessians = sum_hessian;
data[most_freq_bin].cnt = num_data;
GET_GRAD(data, most_freq_bin) = sum_gradient;
GET_HESS(data, most_freq_bin) = sum_hessian;
for (int i = 0; i < num_bin; ++i) {
if (i != most_freq_bin) {
data[most_freq_bin].sum_gradients -= data[i].sum_gradients;
data[most_freq_bin].sum_hessians -= data[i].sum_hessians;
data[most_freq_bin].cnt -= data[i].cnt;
GET_GRAD(data, most_freq_bin) -= GET_GRAD(data, i);
GET_HESS(data, most_freq_bin) -= GET_HESS(data, i);
}
}
}
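
FixHistogram recovers the most-frequent bin's entry by subtraction: rows in that bin are never pushed by the kernels, but the leaf-level totals are known from the parent, so subtracting every other bin from the totals yields the missing entry. In miniature (the numbers are invented):

#include <cstdio>

int main() {
  const int num_bin = 4, most_freq = 0;
  // Per-bin gradient sums as accumulated by the histogram kernels; rows
  // falling in the most-frequent bin were never pushed, so bin 0 is empty.
  double grad[num_bin] = {0.0, 1.5, -0.5, 2.0};
  const double sum_gradient = 4.0;  // known total over the whole leaf
  grad[most_freq] = sum_gradient;
  for (int i = 0; i < num_bin; ++i) {
    if (i != most_freq) grad[most_freq] -= grad[i];
  }
  std::printf("most-freq bin gradient: %g\n", grad[most_freq]);  // 1
}
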
......@@ -1094,7 +1366,7 @@ void PushClearIfEmpty(std::vector<T>* dest, const size_t dest_len, const std::ve
}
}
void Dataset::addFeaturesFrom(Dataset* other) {
void Dataset::AddFeaturesFrom(Dataset* other) {
if (other->num_data_ != num_data_) {
throw std::runtime_error("Cannot add features from other Dataset with a different number of rows");
}
......
......@@ -335,8 +335,6 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
mem_ptr += sizeof(dataset->use_missing_);
dataset->zero_as_missing_ = *(reinterpret_cast<const bool*>(mem_ptr));
mem_ptr += sizeof(dataset->zero_as_missing_);
dataset->sparse_threshold_ = *(reinterpret_cast<const double*>(mem_ptr));
mem_ptr += sizeof(dataset->sparse_threshold_);
const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
dataset->used_feature_map_.clear();
for (int i = 0; i < dataset->num_total_features_; ++i) {
......
......@@ -31,9 +31,9 @@ class DenseBinIterator: public BinIterator {
}
inline uint32_t RawGet(data_size_t idx) override;
inline uint32_t Get(data_size_t idx) override;
inline void Reset(data_size_t) override { }
inline void Reset(data_size_t) override {}
private:
private:
const DenseBin<VAL_T>* bin_data_;
VAL_T min_bin_;
VAL_T max_bin_;
......@@ -46,7 +46,7 @@ class DenseBinIterator: public BinIterator {
*/
template <typename VAL_T>
class DenseBin: public Bin {
public:
public:
friend DenseBinIterator<VAL_T>;
explicit DenseBin(data_size_t num_data)
: num_data_(num_data), data_(num_data_, static_cast<VAL_T>(0)) {
......@@ -68,84 +68,65 @@ class DenseBin: public Bin {
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + data_indices[i + pf_offset]);
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
if (use_prefetch) {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(data_.data() + pf_idx);
const VAL_T bin = data_[idx];
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
}
}
}
for (; i < end; i++) {
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const VAL_T bin = data_[idx];
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, ordered_gradients, ordered_hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + i + pf_offset);
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, ordered_gradients, ordered_hessians, out);
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + data_indices[i + pf_offset]);
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, ordered_gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + i + pf_offset);
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, ordered_gradients, nullptr, out);
}
data_size_t Split(
......@@ -257,9 +238,6 @@ class DenseBin: public Bin {
data_size_t num_data() const override { return num_data_; }
/*! \brief not ordered bin for dense feature */
OrderedBin* CreateOrderedBin() const override { return nullptr; }
void FinishLoad() override {}
void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
......@@ -287,17 +265,18 @@ class DenseBin: public Bin {
}
size_t SizesInByte() const override {
return sizeof(VAL_T) * num_data_;
return sizeof(VAL_T)* num_data_;
}
DenseBin<VAL_T>* Clone() override;
private:
private:
data_size_t num_data_;
std::vector<VAL_T> data_;
std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, kAlignedSize>> data_;
DenseBin<VAL_T>(const DenseBin<VAL_T>& other)
: num_data_(other.num_data_), data_(other.data_){}
: num_data_(other.num_data_), data_(other.data_) {
}
};
template<typename VAL_T>
......
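
`ConstructHistogramInner` replaces four nearly identical hand-written loops with one body whose `use_indices` / `use_prefetch` / `use_hessians` template flags are resolved at compile time, so every instantiation is as tight as the original specialized loop. The same trick on a trivial reduction — everything in this sketch is illustrative:

#include <cstdio>
#include <vector>

// One loop body, four compile-time variants: the bools are template
// parameters, so the compiler folds the branches away in each instance.
template <bool use_indices, bool use_weights>
double SumInner(const int* indices, int start, int end,
                const std::vector<double>& values,
                const std::vector<double>& weights) {
  double acc = 0.0;
  for (int i = start; i < end; ++i) {
    const int idx = use_indices ? indices[i] : i;
    acc += use_weights ? values[idx] * weights[idx] : values[idx];
  }
  return acc;
}

int main() {
  const std::vector<double> v = {1, 2, 3, 4}, w = {0.5, 0.5, 0.5, 0.5};
  const int idx[] = {3, 1};
  std::printf("%g %g\n",
              SumInner<false, false>(nullptr, 0, 4, v, w),  // 10
              SumInner<true, true>(idx, 0, 2, v, w));       // 3
}

The Dense4bitsBin diff below applies the identical refactor to the 4-bit-packed bin.
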
......@@ -16,7 +16,7 @@ namespace LightGBM {
class Dense4bitsBin;
class Dense4bitsBinIterator : public BinIterator {
public:
public:
explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin)
: bin_data_(bin_data), min_bin_(static_cast<uint8_t>(min_bin)),
max_bin_(static_cast<uint8_t>(max_bin)),
......@@ -31,7 +31,7 @@ class Dense4bitsBinIterator : public BinIterator {
inline uint32_t Get(data_size_t idx) override;
inline void Reset(data_size_t) override {}
private:
private:
const Dense4bitsBin* bin_data_;
uint8_t min_bin_;
uint8_t max_bin_;
......@@ -40,12 +40,12 @@ class Dense4bitsBinIterator : public BinIterator {
};
class Dense4bitsBin : public Bin {
public:
public:
friend Dense4bitsBinIterator;
explicit Dense4bitsBin(data_size_t num_data)
: num_data_(num_data) {
int len = (num_data_ + 1) / 2;
data_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
data_.resize(len, static_cast<uint8_t>(0));
buf_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
}
......@@ -73,88 +73,65 @@ class Dense4bitsBin : public Bin {
inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64;
const data_size_t pf_end = end - pf_offset - kCacheLineSize;
#define ACC_GH(hist, i, g, h) \
const auto ti = (i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1));
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
if (use_prefetch) {
const data_size_t pf_offset = 64;
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(data_.data() + (pf_idx >> 1));
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
}
}
}
for (; i < end; i++) {
const data_size_t idx = data_indices[i];
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
if (use_hessians) {
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
} else {
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, ordered_gradients, ordered_hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64;
const data_size_t pf_end = end - pf_offset - kCacheLineSize;
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + ((i + pf_offset) >> 1));
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, ordered_gradients, ordered_hessians, out);
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients,
HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64;
const data_size_t pf_end = end - pf_offset - kCacheLineSize;
data_size_t i = start;
for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1));
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, ordered_gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
                        const score_t* ordered_gradients,
                        hist_t* out) const override {
  ConstructHistogramInner<false, false, false>(nullptr, start, end, ordered_gradients, nullptr, out);
}
data_size_t Split(
......@@ -266,8 +243,6 @@ class Dense4bitsBin : public Bin {
data_size_t num_data() const override { return num_data_; }
/*! \brief no ordered bin for dense features */
OrderedBin* CreateOrderedBin() const override { return nullptr; }
void FinishLoad() override {
if (buf_.empty()) { return; }
......@@ -325,19 +300,20 @@ class Dense4bitsBin : public Bin {
}
size_t SizesInByte() const override {
  return sizeof(uint8_t) * data_.size();
}
Dense4bitsBin* Clone() override {
return new Dense4bitsBin(*this);
}
 protected:
  Dense4bitsBin(const Dense4bitsBin& other)
    : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {
  }
data_size_t num_data_;
std::vector<uint8_t, Common::AlignmentAllocator<uint8_t, kAlignedSize>> data_;
std::vector<uint8_t> buf_;
};
......
/*!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_
#define LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstdint>
#include <cstring>
#include <vector>
namespace LightGBM {
template <typename VAL_T>
class MultiValDenseBin : public MultiValBin {
public:
explicit MultiValDenseBin(data_size_t num_data, int num_bin, int num_feature)
: num_data_(num_data), num_bin_(num_bin), num_feature_(num_feature) {
data_.resize(static_cast<size_t>(num_data_) * num_feature_, static_cast<VAL_T>(0));
}
~MultiValDenseBin() {
}
data_size_t num_data() const override {
return num_data_;
}
int num_bin() const override {
return num_bin_;
}
void PushOneRow(int , data_size_t idx, const std::vector<uint32_t>& values) override {
auto start = RowPtr(idx);
CHECK(num_feature_ == static_cast<int>(values.size()));
for (auto i = 0; i < num_feature_; ++i) {
data_[start + i] = static_cast<VAL_T>(values[i]);
}
}
void FinishLoad() override {
}
bool IsSparse() override {
return false;
}
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
}
}
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
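// hist_t buffers interleave per-bin sums: slot 2*i holds the gradient sum and
// slot 2*i + 1 the hessian sum of bin i, which is why ACC_GH shifts the bin
// index left by one.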
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, hist_t* out) const {
data_size_t i = start;
if (use_prefetch) {
const data_size_t pf_offset = 32 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(gradients + pf_idx);
if (use_hessians) {
PREFETCH_T0(hessians + pf_idx);
}
PREFETCH_T0(data_.data() + RowPtr(pf_idx));
const auto j_start = RowPtr(idx);
for (auto j = j_start; j < j_start + num_feature_; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
}
}
}
}
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto j_start = RowPtr(idx);
for (auto j = j_start; j < j_start + num_feature_; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
}
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, gradients, hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, gradients, hessians, out);
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out);
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const MultiValDenseBin<VAL_T>*>(full_bin);
data_.clear();
for (data_size_t i = 0; i < num_used_indices; ++i) {
for (int64_t j = other_bin->RowPtr(used_indices[i]); j < other_bin->RowPtr(used_indices[i] + 1); ++j) {
data_.push_back(other_bin->data_[j]);
}
}
}
inline int64_t RowPtr(data_size_t idx) const {
return static_cast<int64_t>(idx) * num_feature_;
}
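// Dense layout: every row stores exactly num_feature_ bin values back to
// back, so the row offset is idx * num_feature_ and no per-row index array is
// needed (contrast with the CSR-style row_ptr_ in MultiValSparseBin).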
MultiValDenseBin<VAL_T>* Clone() override;
private:
data_size_t num_data_;
int num_bin_;
int num_feature_;
std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, 32>> data_;
MultiValDenseBin<VAL_T>(const MultiValDenseBin<VAL_T>& other)
: num_data_(other.num_data_), num_bin_(other.num_bin_), num_feature_(other.num_feature_), data_(other.data_) {
}
};
template<typename VAL_T>
MultiValDenseBin<VAL_T>* MultiValDenseBin<VAL_T>::Clone() {
return new MultiValDenseBin<VAL_T>(*this);
}
} // namespace LightGBM
#endif // LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_
/*!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstdint>
#include <cstring>
#include <vector>
namespace LightGBM {
template <typename VAL_T>
class MultiValSparseBin : public MultiValBin {
public:
explicit MultiValSparseBin(data_size_t num_data, int num_bin)
: num_data_(num_data), num_bin_(num_bin) {
row_ptr_.resize(num_data_ + 1, 0);
data_.reserve(num_data_);
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
if (num_threads > 1) {
t_data_.resize(num_threads - 1);
}
}
~MultiValSparseBin() {
}
data_size_t num_data() const override {
return num_data_;
}
int num_bin() const override {
return num_bin_;
}
void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) override {
row_ptr_[idx + 1] = static_cast<data_size_t>(values.size());
if (tid == 0) {
for (auto val : values) {
data_.push_back(static_cast<VAL_T>(val));
}
} else {
for (auto val : values) {
t_data_[tid - 1].push_back(static_cast<VAL_T>(val));
}
}
}
void FinishLoad() override {
for (data_size_t i = 0; i < num_data_; ++i) {
row_ptr_[i + 1] += row_ptr_[i];
}
if (t_data_.size() > 0) {
size_t offset = data_.size();
data_.resize(row_ptr_[num_data_]);
for (size_t tid = 0; tid < t_data_.size(); ++tid) {
std::memcpy(data_.data() + offset, t_data_[tid].data(), t_data_[tid].size() * sizeof(VAL_T));
offset += t_data_[tid].size();
t_data_[tid].clear();
}
}
row_ptr_.shrink_to_fit();
data_.shrink_to_fit();
t_data_.clear();
t_data_.shrink_to_fit();
}
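// FinishLoad (above) turns the per-row counts accumulated in row_ptr_ into a
// CSR-style prefix sum, then appends each thread-local buffer behind the rows
// pushed by thread 0. This assumes the loader hands threads contiguous,
// in-order row blocks, so concatenating t_data_ in tid order preserves row
// order.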
bool IsSparse() override {
return true;
}
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
}
}
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, hist_t* out) const {
data_size_t i = start;
if (use_prefetch) {
const data_size_t pf_offset = 32 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(gradients + pf_idx);
if (use_hessians) {
PREFETCH_T0(hessians + pf_idx);
}
PREFETCH_T0(row_ptr_.data() + pf_idx);
PREFETCH_T0(data_.data() + row_ptr_[pf_idx]);
const auto j_start = RowPtr(idx);
const auto j_end = RowPtr(idx + 1);
for (auto j = j_start; j < j_end; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
}
}
}
}
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto j_start = RowPtr(idx);
const auto j_end = RowPtr(idx + 1);
for (auto j = j_start; j < j_end; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
}
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, gradients, hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, gradients, hessians, out);
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out);
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const MultiValSparseBin<VAL_T>*>(full_bin);
row_ptr_.resize(num_data_ + 1, 0);
data_.clear();
for (data_size_t i = 0; i < num_used_indices; ++i) {
for (data_size_t j = other_bin->row_ptr_[used_indices[i]]; j < other_bin->row_ptr_[used_indices[i] + 1]; ++j) {
data_.push_back(other_bin->data_[j]);
}
row_ptr_[i + 1] = row_ptr_[i] + other_bin->row_ptr_[used_indices[i] + 1] - other_bin->row_ptr_[used_indices[i]];
}
}
inline data_size_t RowPtr(data_size_t idx) const {
return row_ptr_[idx];
}
MultiValSparseBin<VAL_T>* Clone() override;
private:
data_size_t num_data_;
int num_bin_;
std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, 32>> data_;
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, 32>> row_ptr_;
std::vector<std::vector<VAL_T>> t_data_;
MultiValSparseBin<VAL_T>(const MultiValSparseBin<VAL_T>& other)
: num_data_(other.num_data_), num_bin_(other.num_bin_), data_(other.data_), row_ptr_(other.row_ptr_) {
}
};
template<typename VAL_T>
MultiValSparseBin<VAL_T>* MultiValSparseBin<VAL_T>::Clone() {
return new MultiValSparseBin<VAL_T>(*this);
}
} // namespace LightGBM
#endif // LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_
/*!
* Copyright (c) 2016 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <mutex>
#include <utility>
#include <vector>
#include "sparse_bin.hpp"
namespace LightGBM {
/*!
* \brief Interface for ordered bin data, efficient for histogram construction, especially for sparse bins.
* Using an ordered bin has two advantages:
* 1. data are grouped by leaf, which improves cache hits;
* 2. only non-zero bins are stored, which speeds up histogram construction for sparse features.
* However, it brings an additional cost: the bins must be re-ordered after every split, which is expensive for dense features.
* So ordered bins are only used in sparse situations.
*/
template <typename VAL_T>
class OrderedSparseBin: public OrderedBin {
public:
/*! \brief Pair to store one bin entry */
struct SparsePair {
data_size_t ridx; // data(row) index
VAL_T bin; // bin for this data
SparsePair() : ridx(0), bin(0) {}
};
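// Layout sketch: ordered_pair_ holds one SparsePair per non-zero row, kept
// contiguous per leaf, so a histogram pass over a leaf touches only
// leaf_cnt_[leaf] entries instead of scanning all num_data rows.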
explicit OrderedSparseBin(const SparseBin<VAL_T>* bin_data)
:bin_data_(bin_data) {
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
int non_zero_cnt = 0;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
++non_zero_cnt;
}
ordered_pair_.resize(non_zero_cnt);
leaf_cnt_.push_back(non_zero_cnt);
}
~OrderedSparseBin() {
}
void Init(const char* used_indices, int num_leaves) override {
// initialize the leaf information
leaf_start_ = std::vector<data_size_t>(num_leaves, 0);
leaf_cnt_ = std::vector<data_size_t>(num_leaves, 0);
if (used_indices == nullptr) {
// if using all data, copy all non-zero pairs
data_size_t j = 0;
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
ordered_pair_[j].ridx = cur_pos;
ordered_pair_[j].bin = bin_data_->vals_[i_delta];
++j;
}
leaf_cnt_[0] = static_cast<data_size_t>(j);
} else {
// if using part of the data (bagging)
data_size_t j = 0;
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
if (used_indices[cur_pos]) {
ordered_pair_[j].ridx = cur_pos;
ordered_pair_[j].bin = bin_data_->vals_[i_delta];
++j;
}
}
leaf_cnt_[0] = j;
}
}
void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian,
HistogramBinEntry* out) const override {
// get current leaf boundary
const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf];
for (data_size_t i = start; i < end; ++i) {
const VAL_T bin = ordered_pair_[i].bin;
const auto g = gradient[ordered_pair_[i].ridx];
const auto h = hessian[ordered_pair_[i].ridx];
out[bin].sum_gradients += g;
out[bin].sum_hessians += h;
++out[bin].cnt;
}
}
void ConstructHistogram(int leaf, const score_t* gradient,
HistogramBinEntry* out) const override {
// get current leaf boundary
const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf];
for (data_size_t i = start; i < end; ++i) {
const VAL_T bin = ordered_pair_[i].bin;
const auto g = gradient[ordered_pair_[i].ridx];
out[bin].sum_gradients += g;
++out[bin].cnt;
}
}
void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) override {
// get current leaf boundary
const data_size_t l_start = leaf_start_[leaf];
const data_size_t l_end = l_start + leaf_cnt_[leaf];
// new left leaf end after split
data_size_t new_left_end = l_start;
for (data_size_t i = l_start; i < l_end; ++i) {
if (is_in_leaf[ordered_pair_[i].ridx] == mark) {
std::swap(ordered_pair_[new_left_end], ordered_pair_[i]);
++new_left_end;
}
}
leaf_start_[right_leaf] = new_left_end;
leaf_cnt_[leaf] = new_left_end - l_start;
leaf_cnt_[right_leaf] = l_end - new_left_end;
}
data_size_t NonZeroCount(int leaf) const override {
return static_cast<data_size_t>(leaf_cnt_[leaf]);
}
/*! \brief Disable copy */
OrderedSparseBin<VAL_T>& operator=(const OrderedSparseBin<VAL_T>&) = delete;
/*! \brief Disable copy */
OrderedSparseBin<VAL_T>(const OrderedSparseBin<VAL_T>&) = delete;
private:
const SparseBin<VAL_T>* bin_data_;
/*! \brief Stores the non-zero pairs, grouped by leaf */
std::vector<SparsePair> ordered_pair_;
/*! \brief leaf_start_[i] is the start position of the i-th leaf's data in ordered_pair_ */
std::vector<data_size_t> leaf_start_;
/*! \brief leaf_cnt_[i] means number of data in i-th leaf */
std::vector<data_size_t> leaf_cnt_;
};
template <typename VAL_T>
OrderedBin* SparseBin<VAL_T>::CreateOrderedBin() const {
return new OrderedSparseBin<VAL_T>(this);
}
} // namespace LightGBM
#endif  // LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
......@@ -24,7 +24,7 @@ const size_t kNumFastIndex = 64;
template <typename VAL_T>
class SparseBinIterator: public BinIterator {
 public:
SparseBinIterator(const SparseBin<VAL_T>* bin_data,
uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin)
: bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
......@@ -56,7 +56,7 @@ class SparseBinIterator: public BinIterator {
inline void Reset(data_size_t idx) override;
 private:
const SparseBin<VAL_T>* bin_data_;
data_size_t cur_pos_;
data_size_t i_delta_;
......@@ -66,20 +66,16 @@ class SparseBinIterator: public BinIterator {
uint8_t offset_;
};
template <typename VAL_T>
class OrderedSparseBin;
template <typename VAL_T>
class SparseBin: public Bin {
 public:
friend class SparseBinIterator<VAL_T>;
friend class OrderedSparseBin<VAL_T>;
explicit SparseBin(data_size_t num_data)
: num_data_(num_data) {
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
......@@ -102,41 +98,97 @@ class SparseBin: public Bin {
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
data_size_t i_delta, cur_pos;
InitIndex(data_indices[start], &i_delta, &cur_pos);
data_size_t i = start;
for (;;) {
if (cur_pos < data_indices[i]) {
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
} else if (cur_pos > data_indices[i]) {
if (++i >= end) { break; }
} else {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
if (++i >= end) { break; }
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
}
}
}
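// The loop above merge-joins the sorted data_indices window against the
// delta-encoded non-zero positions: cur_pos advances while it trails the
// current index, i advances while it leads, and ACC_GH accumulates only on an
// exact match (a non-zero bin for a row inside the leaf).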
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
data_size_t i_delta, cur_pos;
InitIndex(start, &i_delta, &cur_pos);
while (cur_pos < start && i_delta < num_vals_) {
cur_pos += deltas_[++i_delta];
}
while (cur_pos < end && i_delta < num_vals_) {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[cur_pos], ordered_hessians[cur_pos]);
cur_pos += deltas_[++i_delta];
}
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const override {
data_size_t i_delta, cur_pos;
InitIndex(data_indices[start], &i_delta, &cur_pos);
data_size_t i = start;
for (;;) {
if (cur_pos < data_indices[i]) {
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
} else if (cur_pos > data_indices[i]) {
if (++i >= end) { break; }
} else {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[i], 1.0f);
if (++i >= end) { break; }
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) { break; }
}
}
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const override {
data_size_t i_delta, cur_pos;
InitIndex(start, &i_delta, &cur_pos);
while (cur_pos < start && i_delta < num_vals_) {
cur_pos += deltas_[++i_delta];
}
while (cur_pos < end && i_delta < num_vals_) {
const VAL_T bin = vals_[i_delta];
ACC_GH(out, bin, ordered_gradients[cur_pos], 1.0f);
cur_pos += deltas_[++i_delta];
}
}
#undef ACC_GH
inline void NextNonzeroFast(data_size_t* i_delta,
                            data_size_t* cur_pos) const {
  *cur_pos += deltas_[++(*i_delta)];
  if (*i_delta >= num_vals_) {
    *cur_pos = num_data_;
  }
}
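// When i_delta runs past num_vals_, cur_pos is pinned to num_data_, an index
// no valid row can match, so callers' comparisons terminate naturally.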
inline bool NextNonzero(data_size_t* i_delta,
data_size_t* cur_pos) const {
*cur_pos += deltas_[++(*i_delta)];
if (*i_delta < num_vals_) {
return true;
} else {
......@@ -257,8 +309,6 @@ class SparseBin: public Bin {
data_size_t num_data() const override { return num_data_; }
OrderedBin* CreateOrderedBin() const override;
void FinishLoad() override {
// get total non zero size
size_t pair_cnt = 0;
......@@ -276,8 +326,8 @@ class SparseBin: public Bin {
// sort by data index
std::sort(idx_val_pairs.begin(), idx_val_pairs.end(),
[](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) {
return a.first < b.first;
});
return a.first < b.first;
});
// load delta array
LoadFromPair(idx_val_pairs);
}
......@@ -291,11 +341,12 @@ class SparseBin: public Bin {
const data_size_t cur_idx = idx_val_pairs[i].first;
const VAL_T bin = idx_val_pairs[i].second;
data_size_t cur_delta = cur_idx - last_idx;
// disallow multiple values in one row
if (i > 0 && cur_delta == 0) { continue; }
while (cur_delta >= 256) {
deltas_.push_back(255);
vals_.push_back(0);
cur_delta -= 255;
}
deltas_.push_back(static_cast<uint8_t>(cur_delta));
vals_.push_back(bin);
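// Example: a gap of 600 rows between consecutive non-zeros becomes three
// entries, (delta 255, val 0), (delta 255, val 0), (delta 90, bin), so each
// stored delta fits in one uint8_t.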
......@@ -384,7 +435,7 @@ class SparseBin: public Bin {
while (cur_pos < idx && j < num_vals_) {
NextNonzero(&j, &cur_pos);
}
if (cur_pos == idx && j < num_vals_ && vals_[j] > 0) {
// new row index is i
tmp_pair.emplace_back(i, vals_[j]);
}
......@@ -405,13 +456,13 @@ class SparseBin: public Bin {
// transform to delta array
data_size_t last_idx = 0;
for (data_size_t i = 0; i < num_used_indices; ++i) {
auto bin = iterator.InnerRawGet(used_indices[i]);
if (bin > 0) {
data_size_t cur_delta = i - last_idx;
while (cur_delta >= 256) {
deltas_.push_back(255);
vals_.push_back(0);
cur_delta -= 255;
}
deltas_.push_back(static_cast<uint8_t>(cur_delta));
vals_.push_back(bin);
......@@ -432,15 +483,29 @@ class SparseBin: public Bin {
SparseBin<VAL_T>* Clone() override;
protected:
SparseBin<VAL_T>(const SparseBin<VAL_T>& other)
: num_data_(other.num_data_), deltas_(other.deltas_), vals_(other.vals_),
num_vals_(other.num_vals_), push_buffers_(other.push_buffers_),
fast_index_(other.fast_index_), fast_index_shift_(other.fast_index_shift_) {
}
void InitIndex(data_size_t start_idx, data_size_t* i_delta, data_size_t* cur_pos) const {
auto idx = start_idx >> fast_index_shift_;
if (static_cast<size_t>(idx) < fast_index_.size()) {
const auto fast_pair = fast_index_[start_idx >> fast_index_shift_];
*i_delta = fast_pair.first;
*cur_pos = fast_pair.second;
} else {
*i_delta = -1;
*cur_pos = 0;
}
}
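// fast_index_ holds periodic (i_delta, cur_pos) checkpoints, one per
// (1 << fast_index_shift_) rows, so iteration can resume near start_idx
// instead of replaying the delta stream from the beginning.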
private:
data_size_t num_data_;
std::vector<uint8_t, Common::AlignmentAllocator<uint8_t, kAlignedSize>> deltas_;
std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, kAlignedSize>> vals_;
data_size_t num_vals_;
std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_;
std::vector<std::pair<data_size_t, data_size_t>> fast_index_;
......@@ -460,7 +525,7 @@ inline uint32_t SparseBinIterator<VAL_T>::RawGet(data_size_t idx) {
template <typename VAL_T>
inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(data_size_t idx) {
while (cur_pos_ < idx) {
bin_data_->NextNonzeroFast(&i_delta_, &cur_pos_);
}
if (cur_pos_ == idx) {
return bin_data_->vals_[i_delta_];
......@@ -471,15 +536,7 @@ inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(data_size_t idx) {
template <typename VAL_T>
inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
bin_data_->InitIndex(start_idx, &i_delta_, &cur_pos_);
}
template <typename VAL_T>
......
......@@ -73,9 +73,9 @@ class RankXENDCG: public ObjectiveFunction {
// Skip query if sum of labels is 0.
float sum_labels = 0;
for (data_size_t i = 0; i < cnt; ++i) {
sum_labels += static_cast<float>(phi(label[i], gammas[i]));
}
if (std::fabs(sum_labels) < kEpsilon) {
return;
}
......@@ -111,7 +111,7 @@ class RankXENDCG: public ObjectiveFunction {
}
double phi(const label_t l, double g) const {
return Common::Pow(2, static_cast<int>(l)) - g;
}
const char* GetName() const override {
......
......@@ -27,7 +27,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, boo
rank_ = Network::rank();
num_machines_ = Network::num_machines();
// allocate buffer for communication
size_t buffer_size = this->train_data_->NumTotalBin() * KHistEntrySize;
input_buffer_.resize(buffer_size);
output_buffer_.resize(buffer_size);
......@@ -82,7 +82,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
num_bin -= 1;
}
block_len_[i] += num_bin * sizeof(HistogramBinEntry);
block_len_[i] += num_bin * KHistEntrySize;
}
reduce_scatter_size_ += block_len_[i];
}
......@@ -101,7 +101,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
num_bin -= 1;
}
bin_size += num_bin * sizeof(HistogramBinEntry);
bin_size += num_bin * KHistEntrySize;
}
}
......@@ -113,7 +113,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
num_bin -= 1;
}
bin_size += num_bin * sizeof(HistogramBinEntry);
bin_size += num_bin * KHistEntrySize;
}
// sync global data sumup info
......@@ -158,8 +158,8 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
this->smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
}
// Reduce scatter for histogram
Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(),
    block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramSumReducer);
this->FindBestSplitsFromHistograms(this->is_feature_used_, true);
}
......@@ -186,7 +186,6 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
this->train_data_->FixHistogram(feature_index,
this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()),
this->smaller_leaf_histogram_array_[feature_index].RawData());
SplitInfo smaller_split;
// find best threshold for smaller child
......
......@@ -108,58 +108,70 @@ class DataPartition {
* \param threshold thresholds used for the split
* \param right_leaf index of right leaf
*/
void Split(int leaf, const Dataset* dataset, int feature,
           const uint32_t* threshold, int num_threshold, bool default_left,
           int right_leaf) {
Common::FunctionTimer fun_timer("DataPartition::Split", global_timer);
const data_size_t min_inner_size = 512;
// get leaf boundary
const data_size_t begin = leaf_begin_[leaf];
const data_size_t cnt = leaf_count_[leaf];
const int nblock =
std::min(num_threads_, (cnt + min_inner_size - 1) / min_inner_size);
data_size_t inner_size = SIZE_ALIGNED((cnt + nblock - 1) / nblock);
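// Cap the number of blocks so each has at least min_inner_size rows, then
// round the block length with SIZE_ALIGNED (assumed to align it upward) so
// each block writes to an aligned region of the temp index buffers.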
auto left_start = indices_.data() + begin;
global_timer.Start("DataPartition::Split.MT");
// split data multi-threading
OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < nblock; ++i) {
OMP_LOOP_EX_BEGIN();
data_size_t cur_start = i * inner_size;
data_size_t cur_cnt = std::min(inner_size, cnt - cur_start);
if (cur_cnt <= 0) {
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
continue;
}
// split data inner, reduce the times of function called
data_size_t cur_left_count =
dataset->Split(feature, threshold, num_threshold, default_left,
left_start + cur_start, cur_cnt,
temp_left_indices_.data() + cur_start,
temp_right_indices_.data() + cur_start);
offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count;
right_cnts_buf_[i] = cur_cnt - cur_left_count;
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
global_timer.Stop("DataPartition::Split.MT");
global_timer.Start("DataPartition::Split.Merge");
left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0;
for (int i = 1; i < nblock; ++i) {
left_write_pos_buf_[i] =
left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] =
right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
}
data_size_t left_cnt =
left_write_pos_buf_[nblock - 1] + left_cnts_buf_[nblock - 1];
auto right_start = left_start + left_cnt;
#pragma omp parallel for schedule(static)
for (int i = 0; i < nblock; ++i) {
std::copy_n(temp_left_indices_.data() + offsets_buf_[i],
left_cnts_buf_[i], left_start + left_write_pos_buf_[i]);
std::copy_n(temp_right_indices_.data() + offsets_buf_[i],
right_cnts_buf_[i], right_start + right_write_pos_buf_[i]);
}
// update leaf boundary
leaf_count_[leaf] = left_cnt;
leaf_begin_[right_leaf] = left_cnt + begin;
leaf_count_[right_leaf] = cnt - left_cnt;
global_timer.Stop("DataPartition::Split.Merge");
}
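// The split above runs in two phases: each block partitions its slice into
// temp_left_indices_/temp_right_indices_ and records per-block counts, then
// prefix sums over the counts give every block a disjoint write position, so
// the copy-back loop can run fully in parallel without synchronization.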
/*!
......@@ -201,11 +213,11 @@ class DataPartition {
/*! \brief number of data on one leaf */
std::vector<data_size_t> leaf_count_;
/*! \brief Store all data's indices, order by leaf[data_in_leaf0,..,data_leaf1,..] */
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> indices_;
/*! \brief temp indices buffer for split */
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> temp_left_indices_;
/*! \brief temp indices buffer for split */
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> temp_right_indices_;
/*! \brief used data indices, used for bagging */
const data_size_t* used_data_indices_;
/*! \brief used data count, used for bagging */
......
......@@ -5,6 +5,7 @@
#ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/dataset.h>
#include <LightGBM/utils/array_args.h>
......@@ -20,7 +21,7 @@
namespace LightGBM {
class FeatureMetainfo {
 public:
int num_bin;
MissingType missing_type;
int8_t offset = 0;
......@@ -35,7 +36,7 @@ class FeatureMetainfo {
* \brief FeatureHistogram is used to construct and store a histogram for a feature.
*/
class FeatureHistogram {
 public:
FeatureHistogram() {
data_ = nullptr;
}
......@@ -53,19 +54,19 @@ class FeatureHistogram {
* \param feature the feature data for this histogram
* \param min_num_data_one_leaf minimal number of data in one leaf
*/
void Init(hist_t* data, const FeatureMetainfo* meta) {
meta_ = meta;
data_ = data;
if (meta_->bin_type == BinType::NumericalBin) {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6);
} else {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4, std::placeholders::_5, std::placeholders::_6);
}
}
hist_t* RawData() {
return data_;
}
/*!
......@@ -73,15 +74,13 @@ class FeatureHistogram {
* \param other The histogram that want to subtract
*/
void Subtract(const FeatureHistogram& other) {
for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) {
  data_[i] -= other.data_[i];
}
}
void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
SplitInfo* output) {
output->default_left = true;
output->gain = kMinScore;
find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, min_constraint, max_constraint, output);
......@@ -89,10 +88,10 @@ class FeatureHistogram {
}
void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
SplitInfo* output) {
is_splittable_ = false;
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step);
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
if (meta_->missing_type == MissingType::Zero) {
......@@ -116,8 +115,8 @@ class FeatureHistogram {
}
void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
double min_constraint, double max_constraint,
SplitInfo* output) {
output->default_left = false;
double best_gain = kMinScore;
data_size_t best_left_count = 0;
......@@ -134,25 +133,28 @@ class FeatureHistogram {
bool use_onehot = meta_->num_bin <= meta_->config->max_cat_to_onehot;
int best_threshold = -1;
int best_dir = 1;
const double cnt_factor = num_data / sum_hessian;
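// hist_t entries no longer carry per-bin counts; counts are reconstructed as
// RoundInt(hess * cnt_factor) with cnt_factor = num_data / sum_hessian. This
// is exact when all hessians are equal (e.g. unit hessians) and an estimate
// otherwise.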
if (use_onehot) {
for (int t = 0; t < used_bin; ++t) {
const auto grad = GET_GRAD(data_, t);
const auto hess = GET_HESS(data_, t);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
// if data not enough, or sum hessian too small
if (cnt < meta_->config->min_data_in_leaf
    || hess < meta_->config->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data - cnt;
// if data not enough
if (other_count < meta_->config->min_data_in_leaf) continue;
double sum_other_hessian = sum_hessian - hess - kEpsilon;
// if sum hessian too small
if (sum_other_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
double sum_other_gradient = sum_gradient - grad;
// current split gain
double current_gain = GetSplitGains(sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon,
                                    meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
                                    min_constraint, max_constraint, 0);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;
......@@ -161,15 +163,15 @@ class FeatureHistogram {
// better split point
if (current_gain > best_gain) {
best_threshold = t;
best_sum_left_gradient = grad;
best_sum_left_hessian = hess + kEpsilon;
best_left_count = cnt;
best_gain = current_gain;
}
}
} else {
for (int i = 0; i < used_bin; ++i) {
if (Common::RoundInt(GET_HESS(data_, i) * cnt_factor) >= meta_->config->cat_smooth) {
sorted_idx.push_back(i);
}
}
......@@ -181,9 +183,9 @@ class FeatureHistogram {
return (sum_grad) / (sum_hess + meta_->config->cat_smooth);
};
std::sort(sorted_idx.begin(), sorted_idx.end(),
[this, &ctr_fun](int i, int j) {
  return ctr_fun(GET_GRAD(data_, i), GET_HESS(data_, i)) < ctr_fun(GET_GRAD(data_, j), GET_HESS(data_, j));
});
std::vector<int> find_direction(1, 1);
std::vector<int> start_position(1, 0);
......@@ -203,14 +205,17 @@ class FeatureHistogram {
for (int i = 0; i < used_bin && i < max_num_cat; ++i) {
auto t = sorted_idx[start_pos];
start_pos += dir;
const auto grad = GET_GRAD(data_, t);
const auto hess = GET_HESS(data_, t);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
sum_left_gradient += grad;
sum_left_hessian += hess;
left_count += cnt;
cnt_cur_group += cnt;
if (left_count < meta_->config->min_data_in_leaf
|| sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
data_size_t right_count = num_data - left_count;
if (right_count < meta_->config->min_data_in_leaf || right_count < min_data_per_group) break;
......@@ -223,8 +228,8 @@ class FeatureHistogram {
double sum_right_gradient = sum_gradient - sum_left_gradient;
double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
min_constraint, max_constraint, 0);
if (current_gain <= min_gain_shift) continue;
is_splittable_ = true;
if (current_gain > best_gain) {
......@@ -241,15 +246,15 @@ class FeatureHistogram {
if (is_splittable_) {
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
min_constraint, max_constraint);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
sum_hessian - best_sum_left_hessian,
meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
min_constraint, max_constraint);
output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
......@@ -279,22 +284,22 @@ class FeatureHistogram {
}
void GatherInfoForThreshold(double sum_gradient, double sum_hessian,
uint32_t threshold, data_size_t num_data, SplitInfo* output) {
if (meta_->bin_type == BinType::NumericalBin) {
GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold,
num_data, output);
} else {
GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold,
num_data, output);
}
}
void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian,
uint32_t threshold, data_size_t num_data,
SplitInfo* output) {
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step);
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
// do stuff here
......@@ -315,27 +320,29 @@ class FeatureHistogram {
int t = meta_->num_bin - 1 - offset - use_na_as_missing;
const int t_end = 1 - offset;
const double cnt_factor = num_data / sum_hessian;
// from right to left, and we don't need data in bin0
for (; t >= t_end; --t) {
if (static_cast<uint32_t>(t + offset) < threshold) { break; }
// need to skip default bin
if (skip_default_bin && (t + offset) == static_cast<int>(meta_->default_bin)) { continue; }
const auto grad = GET_GRAD(data_, t);
const auto hess = GET_HESS(data_, t);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
sum_right_gradient += grad;
sum_right_hessian += hess;
right_count += cnt;
}
double sum_left_gradient = sum_gradient - sum_right_gradient;
double sum_left_hessian = sum_hessian - sum_right_hessian;
data_size_t left_count = num_data - right_count;
double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step)
+ GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
                   meta_->config->lambda_l1, meta_->config->lambda_l2,
                   meta_->config->max_delta_step);
// gain with split is worse than without split
if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
......@@ -347,15 +354,15 @@ class FeatureHistogram {
// update split information
output->threshold = threshold;
output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step);
output->left_count = left_count;
output->left_sum_gradient = sum_left_gradient;
output->left_sum_hessian = sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - sum_left_gradient,
sum_hessian - sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step);
output->right_count = num_data - left_count;
output->right_sum_gradient = sum_gradient - sum_left_gradient;
output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon;
......@@ -365,13 +372,13 @@ class FeatureHistogram {
}
void GatherInfoForThresholdCategorical(double sum_gradient, double sum_hessian,
uint32_t threshold, data_size_t num_data, SplitInfo* output) {
// get SplitInfo for a given one-hot categorical split.
output->default_left = false;
double gain_shift = GetLeafSplitGain(
sum_gradient, sum_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step);
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
bool is_full_categorical = meta_->missing_type == MissingType::None;
int used_bin = meta_->num_bin - 1 + is_full_categorical;
......@@ -380,21 +387,25 @@ class FeatureHistogram {
Log::Warning("Invalid categorical threshold split");
return;
}
const double cnt_factor = num_data / sum_hessian;
const auto grad = GET_GRAD(data_, threshold);
const auto hess = GET_HESS(data_, threshold);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
double l2 = meta_->config->lambda_l2;
data_size_t left_count = cnt;
data_size_t right_count = num_data - left_count;
double sum_left_hessian = hess + kEpsilon;
double sum_right_hessian = sum_hessian - sum_left_hessian;
double sum_left_gradient = grad;
double sum_right_gradient = sum_gradient - sum_left_gradient;
// current split gain
double current_gain = GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, l2,
meta_->config->max_delta_step)
+ GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
                   meta_->config->lambda_l1, l2,
                   meta_->config->max_delta_step);
if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
output->gain = kMinScore;
Log::Warning("'Forced Split' will be ignored since the gain is getting worse.");
......@@ -402,14 +413,14 @@ class FeatureHistogram {
}
output->left_output = CalculateSplittedLeafOutput(sum_left_gradient, sum_left_hessian,
meta_->config->lambda_l1, l2,
meta_->config->max_delta_step);
output->left_count = left_count;
output->left_sum_gradient = sum_left_gradient;
output->left_sum_hessian = sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, l2,
meta_->config->max_delta_step);
output->right_count = right_count;
output->right_sum_gradient = sum_gradient - sum_left_gradient;
output->right_sum_hessian = sum_right_hessian - kEpsilon;
......@@ -423,14 +434,14 @@ class FeatureHistogram {
* \brief Binary size of this histogram
*/
int SizeOfHistgram() const {
return (meta_->num_bin - meta_->offset) * KHistEntrySize;
}
/*!
* \brief Restore histogram from memory
*/
void FromMemory(char* memory_data) {
std::memcpy(data_, memory_data, (meta_->num_bin - meta_->offset) * KHistEntrySize);
}
/*!
......@@ -457,11 +468,11 @@ class FeatureHistogram {
}
}
 private:
static double GetSplitGains(double sum_left_gradients, double sum_left_hessians,
double sum_right_gradients, double sum_right_hessians,
double l1, double l2, double max_delta_step,
double min_constraint, double max_constraint, int8_t monotone_constraint) {
double left_output = CalculateSplittedLeafOutput(sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, min_constraint, max_constraint);
double right_output = CalculateSplittedLeafOutput(sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, min_constraint, max_constraint);
if (((monotone_constraint > 0) && (left_output > right_output)) ||
......@@ -479,7 +490,7 @@ class FeatureHistogram {
* \return leaf output
*/
static double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians, double l1, double l2, double max_delta_step,
double min_constraint, double max_constraint) {
double ret = CalculateSplittedLeafOutput(sum_gradients, sum_hessians, l1, l2, max_delta_step);
if (ret < min_constraint) {
ret = min_constraint;
......@@ -506,7 +517,7 @@ class FeatureHistogram {
}
void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_constraint, double max_constraint,
double min_gain_shift, SplitInfo* output, int dir, bool skip_default_bin, bool use_na_as_missing) {
const int8_t offset = meta_->offset;
double best_sum_left_gradient = NAN;
......@@ -514,7 +525,7 @@ class FeatureHistogram {
double best_gain = kMinScore;
data_size_t best_left_count = 0;
uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
const double cnt_factor = num_data / sum_hessian;
if (dir == -1) {
double sum_right_gradient = 0.0f;
double sum_right_hessian = kEpsilon;
......@@ -528,12 +539,15 @@ class FeatureHistogram {
// need to skip default bin
if (skip_default_bin && (t + offset) == static_cast<int>(meta_->default_bin)) { continue; }
const auto grad = GET_GRAD(data_, t);
const auto hess = GET_HESS(data_, t);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
sum_right_gradient += grad;
sum_right_hessian += hess;
right_count += cnt;
// if data not enough, or sum hessian too small
if (right_count < meta_->config->min_data_in_leaf
|| sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
data_size_t left_count = num_data - right_count;
// if data not enough
if (left_count < meta_->config->min_data_in_leaf) break;
......@@ -545,8 +559,8 @@ class FeatureHistogram {
double sum_left_gradient = sum_gradient - sum_right_gradient;
// current split gain
double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
min_constraint, max_constraint, meta_->monotone_type);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;
......@@ -575,9 +589,12 @@ class FeatureHistogram {
sum_left_hessian = sum_hessian - kEpsilon;
left_count = num_data;
for (int i = 0; i < meta_->num_bin - offset; ++i) {
const auto grad = GET_GRAD(data_, i);
const auto hess = GET_HESS(data_, i);
data_size_t cnt = static_cast<data_size_t>(Common::RoundInt(hess * cnt_factor));
sum_left_gradient -= grad;
sum_left_hessian -= hess;
left_count -= cnt;
}
t = -1;
}
......@@ -586,13 +603,13 @@ class FeatureHistogram {
// need to skip default bin
if (skip_default_bin && (t + offset) == static_cast<int>(meta_->default_bin)) { continue; }
if (t >= 0) {
sum_left_gradient += GET_GRAD(data_, t);
sum_left_hessian += GET_HESS(data_, t);
left_count += static_cast<data_size_t>(Common::RoundInt(GET_HESS(data_, t) * cnt_factor));
}
// if data not enough, or sum hessian too small
if (left_count < meta_->config->min_data_in_leaf
|| sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) continue;
data_size_t right_count = num_data - left_count;
// if data not enough
if (right_count < meta_->config->min_data_in_leaf) break;
......@@ -604,8 +621,8 @@ class FeatureHistogram {
double sum_right_gradient = sum_gradient - sum_left_gradient;
// current split gain
double current_gain = GetSplitGains(sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
min_constraint, max_constraint, meta_->monotone_type);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;
......@@ -626,15 +643,15 @@ class FeatureHistogram {
// update split information
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
min_constraint, max_constraint);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
sum_hessian - best_sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step,
min_constraint, max_constraint);
output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
......@@ -645,14 +662,13 @@ class FeatureHistogram {
const FeatureMetainfo* meta_;
/*! \brief interleaved per-bin sums of gradients and hessians */
hist_t* data_;
bool is_splittable_ = true;
std::function<void(double, double, data_size_t, double, double, SplitInfo*)> find_best_threshold_fun_;
};
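data_ is now a flat hist_t buffer with two consecutive slots per bin (gradient, then hessian) instead of an array of HistogramBinEntry structs; this is why sizes and offsets throughout the diff are multiplied by 2. A sketch of accessors consistent with that layout (the real GET_GRAD/GET_HESS macros may be spelled differently):

using hist_t = double;

// layout: [grad(bin 0), hess(bin 0), grad(bin 1), hess(bin 1), ...]
inline hist_t& GetGrad(hist_t* hist, int bin) { return hist[2 * bin]; }
inline hist_t& GetHess(hist_t* hist, int bin) { return hist[2 * bin + 1]; }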
class HistogramPool {
public:
public:
/*!
* \brief Constructor
*/
......@@ -698,7 +714,7 @@ class HistogramPool {
}
}
void DynamicChangeSize(const Dataset* train_data, const Config* config, int cache_size, int total_size) {
void DynamicChangeSize(const Dataset* train_data, bool is_hist_colwise, const Config* config, int cache_size, int total_size) {
if (feature_metas_.empty()) {
uint64_t bin_cnt_over_features = 0;
int num_feature = train_data->num_features();
......@@ -720,7 +736,6 @@ class HistogramPool {
}
Log::Info("Total Bins %d", static_cast<int>(bin_cnt_over_features));
}
uint64_t num_total_bin = train_data->NumTotalBin();
int old_cache_size = static_cast<int>(pool_.size());
Reset(cache_size, total_size);
......@@ -728,24 +743,39 @@ class HistogramPool {
pool_.resize(cache_size);
data_.resize(cache_size);
}
int num_total_bin = static_cast<int>(train_data->NumTotalBin());
std::vector<int> offsets;
if (is_hist_colwise) {
int offset = 0;
for (int j = 0; j < train_data->num_features(); ++j) {
offset += train_data->SubFeatureBinOffset(j);
offsets.push_back(offset);
auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
num_bin -= 1;
}
offset += num_bin;
}
} else {
num_total_bin = 1;
for (int j = 0; j < train_data->num_features(); ++j) {
offsets.push_back(num_total_bin);
num_total_bin += train_data->FeatureBinMapper(j)->num_bin();
if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
num_total_bin -= 1;
}
}
}
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int i = old_cache_size; i < cache_size; ++i) {
OMP_LOOP_EX_BEGIN();
pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
data_[i].resize(num_total_bin);
uint64_t offset = 0;
data_[i].resize(num_total_bin * 2);
for (int j = 0; j < train_data->num_features(); ++j) {
offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j]);
auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetMostFreqBin() == 0) {
num_bin -= 1;
}
offset += static_cast<uint64_t>(num_bin);
pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]);
}
CHECK(offset == num_total_bin);
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
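The two offset branches above differ only in how features are packed into the shared buffer: the col-wise path builds offsets from the dataset's per-feature bin offsets, while the row-wise path repacks all features densely, reserving bin 0 and dropping each feature's most-frequent bin when it sits at index 0. A standalone sketch of the row-wise packing, with a hypothetical FeatureInfo in place of the Dataset accessors:

#include <vector>

struct FeatureInfo {
  int num_bin;
  bool most_freq_bin_is_zero;  // that bin is not materialized
};

// Row-wise packing: slot 0 of the shared buffer is reserved, then each
// feature gets a contiguous range; a feature whose most frequent bin is
// bin 0 contributes one bin less.
std::vector<int> RowWiseOffsets(const std::vector<FeatureInfo>& features,
                                int* num_total_bin_out) {
  std::vector<int> offsets;
  int num_total_bin = 1;
  for (const auto& f : features) {
    offsets.push_back(num_total_bin);
    num_total_bin += f.num_bin - (f.most_freq_bin_is_zero ? 1 : 0);
  }
  *num_total_bin_out = num_total_bin;
  return offsets;
}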
......@@ -816,9 +846,9 @@ class HistogramPool {
inverse_mapper_[slot] = dst_idx;
}
private:
private:
std::vector<std::unique_ptr<FeatureHistogram[]>> pool_;
std::vector<std::vector<HistogramBinEntry>> data_;
std::vector<std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>> data_;
std::vector<FeatureMetainfo> feature_metas_;
int cache_size_;
int total_size_;
......
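The pool's backing storage switches to Common::AlignmentAllocator so each histogram buffer starts on a kAlignedSize boundary, which helps vectorized accumulation. A minimal C++ allocator in the same spirit, assuming a power-of-two alignment on a POSIX system; this is a sketch, not the library's implementation:

#include <cstddef>
#include <cstdlib>
#include <new>
#include <vector>

template <typename T, std::size_t Alignment>
struct AlignedAllocator {
  using value_type = T;
  AlignedAllocator() = default;
  template <typename U>
  AlignedAllocator(const AlignedAllocator<U, Alignment>&) noexcept {}

  T* allocate(std::size_t n) {
    void* p = nullptr;
    // posix_memalign keeps this sketch simple; a cross-platform version
    // would dispatch to the platform's aligned-allocation primitive.
    if (posix_memalign(&p, Alignment, n * sizeof(T)) != 0) throw std::bad_alloc();
    return static_cast<T*>(p);
  }
  void deallocate(T* p, std::size_t) noexcept { std::free(p); }
};

template <typename T, typename U, std::size_t A>
bool operator==(const AlignedAllocator<T, A>&, const AlignedAllocator<U, A>&) { return true; }
template <typename T, typename U, std::size_t A>
bool operator!=(const AlignedAllocator<T, A>&, const AlignedAllocator<U, A>&) { return false; }

// usage, mirroring the pool's member:
// std::vector<double, AlignedAllocator<double, 32>> hist(num_total_bin * 2);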
......@@ -49,15 +49,15 @@ void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) {
// some functions used for debugging the GPU histogram construction
#if GPU_DEBUG > 0
void PrintHistograms(HistogramBinEntry* h, size_t size) {
size_t total = 0;
void PrintHistograms(hist_t* h, size_t size) {
double total_hess = 0;
for (size_t i = 0; i < size; ++i) {
printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt);
total += h[i].cnt;
if ((i & 3) == 3)
printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i));
if ((i & 3) == 3)
printf("\n");
total_hess += GET_HESS(h, i);
}
printf("\nTotal examples: %lu\n", total);
printf("\nSum hessians: %9.3g\n", total_hess);
}
union Float_t {
......@@ -69,27 +69,23 @@ union Float_t {
};
void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) {
void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) {
size_t i;
Float_t a, b;
for (i = 0; i < size; ++i) {
a.f = h1[i].sum_gradients;
b.f = h2[i].sum_gradients;
a.f = GET_GRAD(h1, i);
b.f = GET_GRAD(h2, i);
int32_t ulps = Float_t::ulp_diff(a, b);
if (fabs(h1[i].cnt - h2[i].cnt != 0)) {
printf("%d != %d\n", h1[i].cnt, h2[i].cnt);
goto err;
}
if (ulps > 0) {
// printf("grad %g != %g (%d ULPs)\n", h1[i].sum_gradients, h2[i].sum_gradients, ulps);
// printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps);
// goto err;
}
a.f = h1[i].sum_hessians;
b.f = h2[i].sum_hessians;
a.f = GET_HESS(h1, i);
b.f = GET_HESS(h2, i);
ulps = Float_t::ulp_diff(a, b);
if (ulps > 0) {
// printf("hessian %g != %g (%d ULPs)\n", h1[i].sum_hessians, h2[i].sum_hessians, ulps);
// goto err;
if (std::fabs(a.f - b.f) >= 1e-20) {
printf("hessian %g != %g (%d ULPs)\n", GET_HESS(h1, i), GET_HESS(h2, i), ulps);
goto err;
}
}
return;
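CompareHistograms measures disagreement in units in the last place (ULPs) rather than with a fixed absolute tolerance, which scales naturally with magnitude. A self-contained sketch of the idea for single precision; the real Float_t union may differ, and memcpy would be the strictly conforming alternative to union type-punning:

#include <cstdint>

union FloatBits {
  float f;
  int32_t i;
};

// Number of representable floats between a and b. For same-signed,
// finite IEEE-754 values the difference of the bit patterns counts ULPs.
inline int32_t UlpDiff(FloatBits a, FloatBits b) {
  if ((a.i < 0) != (b.i < 0)) {
    // opposite signs: treat as equal only for +0.0f vs -0.0f
    return (a.f == b.f) ? 0 : INT32_MAX;
  }
  int32_t d = a.i - b.i;
  return d < 0 ? -d : d;
}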
......@@ -191,7 +187,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur
}
template <typename HistType>
void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) {
HistType* hist_outputs = reinterpret_cast<HistType*>(host_histogram_outputs_);
// when the output is ready, the computation is done
histograms_wait_obj_.wait();
......@@ -201,29 +197,25 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
continue;
}
int dense_group_index = dense_feature_group_map_[i];
auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index);
auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2;
int bin_size = train_data_->FeatureGroupNumBin(dense_group_index);
if (device_bin_mults_[i] == 1) {
for (int j = 0; j < bin_size; ++j) {
old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients;
old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians;
old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt;
GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ j);
GET_HESS(old_histogram_array, j) = GET_HESS(hist_outputs, i * device_bin_size_+ j);
}
} else {
// values of this feature have been redistributed to multiple bins; need a reduction here
int ind = 0;
for (int j = 0; j < bin_size; ++j) {
double sum_g = 0.0, sum_h = 0.0;
size_t cnt = 0;
for (int k = 0; k < device_bin_mults_[i]; ++k) {
sum_g += hist_outputs[i * device_bin_size_+ ind].sum_gradients;
sum_h += hist_outputs[i * device_bin_size_+ ind].sum_hessians;
cnt += hist_outputs[i * device_bin_size_ + ind].cnt;
sum_g += GET_GRAD(hist_outputs, i * device_bin_size_+ ind);
sum_h += GET_HESS(hist_outputs, i * device_bin_size_+ ind);
ind++;
}
old_histogram_array[j].sum_gradients = sum_g;
old_histogram_array[j].sum_hessians = sum_h;
old_histogram_array[j].cnt = (data_size_t)cnt;
GET_GRAD(old_histogram_array, j) = sum_g;
GET_HESS(old_histogram_array, j) = sum_h;
}
}
}
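When device_bin_mults_[i] > 1, a feature's values were spread over several device bins to reduce atomic contention, and the else-branch above folds them back. The same fold, as a standalone sketch over the interleaved grad/hess layout:

using hist_t = double;

// Reduce a device histogram where each logical bin was split into
// `mult` device bins, writing grad/hess pairs into `out`.
void ReduceBinMultiples(const hist_t* dev, int bin_size, int mult, hist_t* out) {
  int ind = 0;
  for (int j = 0; j < bin_size; ++j) {
    double sum_g = 0.0, sum_h = 0.0;
    for (int k = 0; k < mult; ++k, ++ind) {
      sum_g += dev[2 * ind];      // gradient slot
      sum_h += dev[2 * ind + 1];  // hessian slot
    }
    out[2 * j] = sum_g;
    out[2 * j + 1] = sum_h;
  }
}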
......@@ -233,7 +225,7 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
void GPUTreeLearner::AllocateGPUMemory() {
num_dense_feature_groups_ = 0;
for (int i = 0; i < num_feature_groups_; ++i) {
if (ordered_bins_[i] == nullptr) {
if (!train_data_->IsMultiGroup(i)) {
num_dense_feature_groups_++;
}
}
......@@ -303,7 +295,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
device_data_indices_ = std::unique_ptr<boost::compute::vector<data_size_t>>(new boost::compute::vector<data_size_t>(allocated_num_data_, ctx_));
boost::compute::fill(device_data_indices_->begin(), device_data_indices_->end(), 0, queue_);
// histogram bin entry size depends on the precision (single/double)
hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(HistogramBinEntry) : sizeof(GPUHistogramBinEntry);
hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(hist_t) * 2 : sizeof(gpu_hist_t) * 2;
Log::Info("Size of histogram bin entry: %d", hist_bin_entry_sz_);
// create output buffer, each feature has a histogram with device_bin_size_ bins,
// each work group generates a sub-histogram of dword_features_ features.
......@@ -326,7 +318,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
std::vector<int> dense_dword_ind(dword_features_);
for (int i = 0; i < num_feature_groups_; ++i) {
// looking for dword_features_ non-sparse feature-groups
if (ordered_bins_[i] == nullptr) {
if (!train_data_->IsMultiGroup(i)) {
dense_dword_ind[k] = i;
// decide if we need to redistribute the bin
double t = device_bin_size_ / static_cast<double>(train_data_->FeatureGroupNumBin(i));
......@@ -682,6 +674,9 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
printf("bin size: ");
#endif
for (int i = 0; i < num_feature_groups_; ++i) {
if (train_data_->IsMultiGroup(i)) {
continue;
}
#if GPU_DEBUG >= 1
printf("%d, ", train_data_->FeatureGroupNumBin(i));
#endif
......@@ -960,35 +955,34 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used_[feature_index]) continue;
if (!is_feature_used[feature_index]) continue;
if (ordered_bins_[train_data_->Feature2Group(feature_index)]) {
if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) {
is_sparse_feature_used[feature_index] = 1;
} else {
is_dense_feature_used[feature_index] = 1;
}
}
// construct smaller leaf
HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset;
// ConstructGPUHistogramsAsync will return true if there are available feature groups dispatched to the GPU
bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
nullptr, smaller_leaf_splits_->num_data_in_leaf(),
nullptr, nullptr,
nullptr, nullptr);
// then construct sparse features on CPU
// We set data_indices to null to avoid rebuilding ordered gradients/hessians
train_data_->ConstructHistograms(is_sparse_feature_used,
nullptr, smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
&ordered_bins_, gradients_, hessians_,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
multi_val_bin_.get(), is_hist_colwise_,
ptr_smaller_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) {
if (config_->gpu_use_dp) {
// use double precision
WaitAndGetHistograms<HistogramBinEntry>(ptr_smaller_leaf_hist_data);
WaitAndGetHistograms<hist_t>(ptr_smaller_leaf_hist_data);
} else {
// use single precision
WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_smaller_leaf_hist_data);
WaitAndGetHistograms<gpu_hist_t>(ptr_smaller_leaf_hist_data);
}
}
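The control flow above is a simple overlap pattern: dense feature groups are dispatched to the GPU asynchronously, sparse groups are built on the CPU while the GPU works, and the wait happens only if GPU work was actually queued. A schematic with hypothetical stand-ins for the learner's calls:

#include <cstdint>
#include <vector>

// Hypothetical stand-ins for the learner's members, for illustration only.
static bool LaunchGpuHistogramsAsync(const std::vector<int8_t>& dense_mask) {
  // returns true iff at least one dense group was dispatched
  for (int8_t used : dense_mask) if (used) return true;
  return false;
}
static void BuildCpuHistograms(const std::vector<int8_t>& /*sparse_mask*/) {
  // CPU-side histogram construction runs here, overlapping the GPU
}
static void WaitForGpuAndMerge() {
  // block on the GPU event and copy results into the leaf histogram
}

void ConstructBothSides(const std::vector<int8_t>& is_dense,
                        const std::vector<int8_t>& is_sparse) {
  bool gpu_used = LaunchGpuHistogramsAsync(is_dense);  // 1) non-blocking
  BuildCpuHistograms(is_sparse);                       // 2) overlap on CPU
  if (gpu_used) WaitForGpuAndMerge();                  // 3) wait only if needed
}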
......@@ -1000,48 +994,58 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
continue;
int dense_feature_group_index = dense_feature_group_map_[i];
size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index);
HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index);
HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size];
hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset;
hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2;
hist_t* gpu_histogram = new hist_t[size * 2];
data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf();
printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size);
std::copy(current_histogram, current_histogram + size, gpu_histogram);
std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry));
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
num_data != num_data_ ? smaller_leaf_splits_->data_indices() : nullptr,
num_data,
num_data != num_data_ ? ordered_gradients_.data() : gradients_,
num_data != num_data_ ? ordered_hessians_.data() : hessians_,
current_histogram);
std::copy(current_histogram, current_histogram + size * 2, gpu_histogram);
std::memset(current_histogram, 0, size * sizeof(hist_t) * 2);
if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) {
  delete [] gpu_histogram;  // avoid leaking the comparison buffer
  continue;
}
if (num_data != num_data_) {
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
smaller_leaf_splits_->data_indices(),
0,
num_data,
ordered_gradients_.data(),
ordered_hessians_.data(),
current_histogram);
} else {
train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
0,
num_data,
gradients_,
hessians_,
current_histogram);
}
CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index);
std::copy(gpu_histogram, gpu_histogram + size, current_histogram);
std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram);
delete [] gpu_histogram;
}
#endif
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf
HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - KHistOffset;
is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data());
// then construct sparse features on CPU
// We set data_indices to null to avoid rebuilding ordered gradients/hessians
train_data_->ConstructHistograms(is_sparse_feature_used,
nullptr, larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->LeafIndex(),
&ordered_bins_, gradients_, hessians_,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
multi_val_bin_.get(), is_hist_colwise_,
ptr_larger_leaf_hist_data);
// wait for GPU to finish, only if GPU is actually used
if (is_gpu_used) {
if (config_->gpu_use_dp) {
// use double precision
WaitAndGetHistograms<HistogramBinEntry>(ptr_larger_leaf_hist_data);
WaitAndGetHistograms<hist_t>(ptr_larger_leaf_hist_data);
} else {
// use single precision
WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_larger_leaf_hist_data);
WaitAndGetHistograms<gpu_hist_t>(ptr_larger_leaf_hist_data);
}
}
}
......
......@@ -76,12 +76,7 @@ class GPUTreeLearner: public SerialTreeLearner {
uint8_t s[4];
};
/*! \brief Single precision histogram entry for GPU */
struct GPUHistogramBinEntry {
score_t sum_gradients;
score_t sum_hessians;
uint32_t cnt;
};
typedef float gpu_hist_t;
/*!
* \brief Find the best number of workgroups processing one feature for maximizing efficiency
......@@ -133,7 +128,7 @@ class GPUTreeLearner: public SerialTreeLearner {
* \param histograms Destination of histogram results from GPU.
*/
template <typename HistType>
void WaitAndGetHistograms(HistogramBinEntry* histograms);
void WaitAndGetHistograms(hist_t* histograms);
/*!
* \brief Construct GPU histogram asynchronously.
......
......@@ -163,7 +163,7 @@ R""()
void within_kernel_reduction16x8(uchar8 feature_mask,
__global const acc_type* restrict feature4_sub_hist,
const uint skip_id,
acc_type stat_val, uint cnt_val,
acc_type stat_val,
const ushort num_sub_hist,
__global acc_type* restrict output_buf,
__local acc_type * restrict local_hist) {
......@@ -181,33 +181,21 @@ void within_kernel_reduction16x8(uchar8 feature_mask,
// 256 threads working on 8 features' 16 bins, gradient and hessian
stat_val += *p;
p += NUM_BINS * DWORD_FEATURES * 2;
if (ltid < LOCAL_SIZE_0 / 2) {
cnt_val += as_acc_int_type(*p);
}
p += NUM_BINS * DWORD_FEATURES;
}
// skip the sub-histogram we already have in local memory
p += 3 * DWORD_FEATURES * NUM_BINS;
p += 2 * DWORD_FEATURES * NUM_BINS;
for (i = i + 1; i < num_sub_hist; ++i) {
stat_val += *p;
p += NUM_BINS * DWORD_FEATURES * 2;
if (ltid < LOCAL_SIZE_0 / 2) {
cnt_val += as_acc_int_type(*p);
}
p += NUM_BINS * DWORD_FEATURES;
}
#endif
// printf("thread %d:feature=%d, bin_id=%d, hessian=%d, stat_val=%f, cnt=%d", ltid, feature_id, bin_id, is_hessian_first, stat_val, cnt_val);
// now overwrite the local_hist for final reduction and output
// reverse the f7...f0 order to match the real order
feature_id = DWORD_FEATURES_MASK - feature_id;
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + is_hessian_first] = stat_val;
bin_id = ltid >> (LOG2_DWORD_FEATURES); // range 0 - 16, for counter
if (ltid < LOCAL_SIZE_0 / 2) {
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val);
}
local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + is_hessian_first] = stat_val;
barrier(CLK_LOCAL_MEM_FENCE);
for (i = ltid; i < DWORD_FEATURES * 3 * NUM_BINS; i += lsize) {
for (i = ltid; i < DWORD_FEATURES * 2 * NUM_BINS; i += lsize) {
output_buf[i] = local_hist[i];
}
}
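With the counter slot gone, every (feature, bin) pair in local and output memory occupies two entries instead of three, which is the source of all the 3-to-2 stride changes in this kernel. The addressing, written out as a tiny helper:

// slot = 0 for gradient, 1 for hessian; num_bins = NUM_BINS per feature
inline int HistIndex(int feature_id, int bin_id, int slot, int num_bins) {
  return feature_id * 2 * num_bins + bin_id * 2 + slot;
}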
......@@ -335,7 +323,9 @@ __kernel void histogram16(__global const uchar4* feature_data_base,
bk7_c_f0_bin16 bk7_c_f1_bin16 bk7_c_f2_bin16 bk7_c_f3_bin16 bk7_c_f4_bin16 bk7_c_f5_bin16 bk7_c_f6_bin16 bk7_c_f7_bin0
-----------------------------------------------
*/
#if CONST_HESSIAN == 1
__local uint * cnt_hist = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS);
#endif
// thread 0, 1, 2, 3, 4, 5, 6, 7 compute histograms for gradients first
// thread 8, 9, 10, 11, 12, 13, 14, 15 compute histograms for hessians first
......@@ -547,7 +537,7 @@ R""()
atomic_local_add_f(gh_hist + addr2, stat2);
#endif
}
#if CONST_HESSIAN == 1
// STAGE 3: accumulate counter
// there are 8 counters for 8 features
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6, 7's counts for example 0, 1, 2, 3, 4, 5, 6, 7
......@@ -614,6 +604,7 @@ R""()
// printf("thread %x add counter %d feature %d (7)\n", ltid, bin, offset);
atom_inc(cnt_hist + addr);
}
#endif
stat1 = stat1_next;
stat2 = stat2_next;
feature4 = feature4_next;
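The counter histogram is now compiled only under CONST_HESSIAN == 1: when every example shares one hessian value, the kernel accumulates integer counts instead of per-example hessians, and the hessian sums can be reconstructed afterwards as count times the constant. A host-side sketch of that reconstruction, assuming the interleaved layout:

#include <cstdint>

using hist_t = double;

// With constant hessian h, the device only tracks integer bin counts;
// the per-bin hessian sums follow as cnt * h.
void FillHessiansFromCounts(const uint32_t* cnt_hist, int num_bins,
                            double const_hessian, hist_t* hist) {
  for (int b = 0; b < num_bins; ++b) {
    hist[2 * b + 1] = cnt_hist[b] * const_hessian;  // hessian slot
  }
}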
......@@ -642,6 +633,7 @@ R""()
ushort bank_id = (i + offset) & BANK_MASK;
stat_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + feature_id];
}
#if CONST_HESSIAN == 1
if (ltid < LOCAL_SIZE_0 / 2) {
// first 128 threads accumulate the 8 * 16 = 128 counter values
bin_id = ltid >> LOG2_DWORD_FEATURES; // bits 3 - 6 range 0 - 16 is bin ID
......@@ -651,6 +643,7 @@ R""()
cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * DWORD_FEATURES + feature_id];
}
}
#endif
// now thread 0 - 7 holds feature 0 - 7's gradient for bin 0 and counter bin 0
// now thread 8 - 15 holds feature 0 - 7's hessian for bin 0 and counter bin 1
......@@ -687,7 +680,7 @@ R""()
// write to output
// write gradients and hessians histogram for all 4 features
// output data in linear order for further reduction
// output size = 4 (features) * 3 (counters) * 64 (bins) * sizeof(float)
// output size = 4 (features) * 2 (grad/hess values) * 64 (bins) * sizeof(float)
/* memory layout of output:
g_f0_bin0 g_f1_bin0 g_f2_bin0 g_f3_bin0 g_f4_bin0 g_f5_bin0 g_f6_bin0 g_f7_bin0
h_f0_bin0 h_f1_bin0 h_f2_bin0 h_f3_bin0 h_f4_bin0 h_f5_bin0 h_f6_bin0 h_f7_bin0
......@@ -705,14 +698,10 @@ R""()
// if there is only one workgroup processing this feature4, don't even need to write
uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS);
#if POWER_FEATURE_WORKGROUPS != 0
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 3 * NUM_BINS;
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 2 * NUM_BINS;
// if g_val and h_val are double, they are converted to float here
// write gradients and hessians for 8 features
output[0 * DWORD_FEATURES * NUM_BINS + ltid] = stat_val;
// write counts for 8 features
if (ltid < LOCAL_SIZE_0 / 2) {
output[2 * DWORD_FEATURES * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val);
}
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
mem_fence(CLK_GLOBAL_MEM_FENCE);
// To avoid the cost of an extra reduction kernel, we have to deal with some
......@@ -738,7 +727,7 @@ R""()
// This is done by using a global atomic counter.
// On AMD GPUs ideally this should be done in GDS,
// but currently there is no easy way to access it via OpenCL.
__local uint * counter_val = cnt_hist;
__local uint * counter_val = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS);
if (ltid == 0) {
// all workgroups processing the same feature add this counter
*counter_val = atom_inc(sync_counters + feature4_id);
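This is the classic last-workgroup-reduces idiom: each workgroup publishes its sub-histogram to global memory, atomically increments a per-feature counter, and the group that receives the final ticket knows all others have finished and performs the reduction. The same idiom sketched in C++ with std::atomic, for illustration (the OpenCL version relies on atom_inc plus the fences above):

#include <atomic>
#include <vector>

// Each worker adds its partial histogram, then the last one to arrive
// reduces all partials into `out`. Returns true for the reducing worker.
bool PublishAndMaybeReduce(std::atomic<int>& sync_counter, int num_workers,
                           const std::vector<std::vector<double>>& partials,
                           std::vector<double>* out) {
  // fetch_add with acq_rel orders the publication of our partial
  // (written before this call) against the reducer's reads below.
  int ticket = sync_counter.fetch_add(1, std::memory_order_acq_rel);
  if (ticket != num_workers - 1) return false;
  for (const auto& p : partials) {
    for (std::size_t i = 0; i < out->size(); ++i) (*out)[i] += p[i];
  }
  return true;
}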
......@@ -762,12 +751,12 @@ R""()
// locate our feature4's block in output memory
uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS);
__global acc_type const * restrict feature4_subhists =
(__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 3 * NUM_BINS;
(__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 2 * NUM_BINS;
// skip reading the data already in local memory
uint skip_id = group_id ^ output_offset;
// locate output histogram location for this feature4
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 3 * NUM_BINS;
within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val, cnt_val,
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 2 * NUM_BINS;
within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val,
1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array);
}
}
......@@ -776,4 +765,3 @@ R""()
// the +9 skips extra characters ")", newline, "#endif" and newline at the beginning
// )"" "\n#endif" + 9
#endif
......@@ -155,15 +155,6 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
acc_type f1_hess_bin = local_hist[ltid * 8 + 5];
acc_type f2_hess_bin = local_hist[ltid * 8 + 6];
acc_type f3_hess_bin = local_hist[ltid * 8 + 7];
__local uint* restrict local_cnt = (__local uint *)(local_hist + 4 * 2 * NUM_BINS);
#if POWER_FEATURE_WORKGROUPS != 0
uint f0_cont_bin = ltid ? local_cnt[ltid * 4] : old_val_f0_cont_bin0;
#else
uint f0_cont_bin = local_cnt[ltid * 4];
#endif
uint f1_cont_bin = local_cnt[ltid * 4 + 1];
uint f2_cont_bin = local_cnt[ltid * 4 + 2];
uint f3_cont_bin = local_cnt[ltid * 4 + 3];
ushort i;
// printf("%d-pre(skip %d): %f %f %f %f %f %f %f %f %d %d %d %d", ltid, skip_id, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin);
#if POWER_FEATURE_WORKGROUPS != 0
......@@ -173,70 +164,62 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
if (feature_mask.s3) {
f0_grad_bin += *p; p += NUM_BINS;
f0_hess_bin += *p; p += NUM_BINS;
f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
if (feature_mask.s2) {
f1_grad_bin += *p; p += NUM_BINS;
f1_hess_bin += *p; p += NUM_BINS;
f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
if (feature_mask.s1) {
f2_grad_bin += *p; p += NUM_BINS;
f2_hess_bin += *p; p += NUM_BINS;
f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
if (feature_mask.s0) {
f3_grad_bin += *p; p += NUM_BINS;
f3_hess_bin += *p; p += NUM_BINS;
f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
}
// skip the sub-histogram we already have in local memory
p += 3 * 4 * NUM_BINS;
p += 2 * 4 * NUM_BINS;
for (i = i + 1; i < num_sub_hist; ++i) {
if (feature_mask.s3) {
f0_grad_bin += *p; p += NUM_BINS;
f0_hess_bin += *p; p += NUM_BINS;
f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
if (feature_mask.s2) {
f1_grad_bin += *p; p += NUM_BINS;
f1_hess_bin += *p; p += NUM_BINS;
f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
if (feature_mask.s1) {
f2_grad_bin += *p; p += NUM_BINS;
f2_hess_bin += *p; p += NUM_BINS;
f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
if (feature_mask.s0) {
f3_grad_bin += *p; p += NUM_BINS;
f3_hess_bin += *p; p += NUM_BINS;
f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
else {
p += 3 * NUM_BINS;
p += 2 * NUM_BINS;
}
}
// printf("%d-aft: %f %f %f %f %f %f %f %f %d %d %d %d", ltid, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin);
......@@ -245,18 +228,14 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
barrier(CLK_LOCAL_MEM_FENCE);
#if USE_DP_FLOAT == 0
// reverse the f3...f0 order to match the real order
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin);
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin);
local_hist[2 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin;
local_hist[2 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin;
local_hist[2 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin);
local_hist[3 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin;
local_hist[3 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin;
local_hist[3 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin);
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f3_grad_bin;
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f3_hess_bin;
local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f2_grad_bin;
local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f2_hess_bin;
local_hist[2 * 2 * NUM_BINS + ltid * 2 + 0] = f1_grad_bin;
local_hist[2 * 2 * NUM_BINS + ltid * 2 + 1] = f1_hess_bin;
local_hist[3 * 2 * NUM_BINS + ltid * 2 + 0] = f0_grad_bin;
local_hist[3 * 2 * NUM_BINS + ltid * 2 + 1] = f0_hess_bin;
barrier(CLK_LOCAL_MEM_FENCE);
/*
for (ushort i = ltid; i < 4 * 3 * NUM_BINS; i += lsize) {
......@@ -267,34 +246,28 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
if (feature_mask.s0) {
output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
}
i += 1 * 3 * NUM_BINS;
i += 1 * 2 * NUM_BINS;
if (feature_mask.s1) {
output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
}
i += 1 * 3 * NUM_BINS;
i += 1 * 2 * NUM_BINS;
if (feature_mask.s2) {
output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
}
i += 1 * 3 * NUM_BINS;
if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) {
i += 1 * 2 * NUM_BINS;
if (feature_mask.s3 && i < 4 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
}
#else
// when double precision is used, we need to write twice, because local memory size is not enough
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin);
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin);
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f3_grad_bin;
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f3_hess_bin;
local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f2_grad_bin;
local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f2_hess_bin;
barrier(CLK_LOCAL_MEM_FENCE);
/*
for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) {
......@@ -305,21 +278,17 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
if (feature_mask.s0) {
output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
}
i += 1 * 3 * NUM_BINS;
i += 1 * 2 * NUM_BINS;
if (feature_mask.s1) {
output_buf[i] = local_hist[i];
output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
}
barrier(CLK_LOCAL_MEM_FENCE);
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin;
local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin);
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin;
local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin);
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 0] = f1_grad_bin;
local_hist[0 * 2 * NUM_BINS + ltid * 2 + 1] = f1_hess_bin;
local_hist[1 * 2 * NUM_BINS + ltid * 2 + 0] = f0_grad_bin;
local_hist[1 * 2 * NUM_BINS + ltid * 2 + 1] = f0_hess_bin;
barrier(CLK_LOCAL_MEM_FENCE);
/*
for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) {
......@@ -328,15 +297,13 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
*/
i = ltid;
if (feature_mask.s2) {
output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i];
output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
output_buf[i + 2 * 2 * NUM_BINS] = local_hist[i];
output_buf[i + 2 * 2 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS];
}
i += 1 * 3 * NUM_BINS;
i += 1 * 2 * NUM_BINS;
if (feature_mask.s3) {
output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i];
output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS];
output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
output_buf[i + 2 * 2 * NUM_BINS] = local_hist[i];
output_buf[i + 2 * 2 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS];
}
#endif
}
......@@ -401,7 +368,9 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
__local acc_type * gh_hist = (__local acc_type *)shared_array;
// counter histogram
// total size: 4 * 256 * sizeof(uint) = 4 KB
#if CONST_HESSIAN == 1
__local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS);
#endif
// thread 0, 1, 2, 3 compute histograms for gradients first
// thread 4, 5, 6, 7 compute histograms for hessians first
......@@ -602,7 +571,7 @@ R""()
s0_stat1 += stat1;
s0_stat2 += stat2;
}
#if CONST_HESSIAN == 1
// STAGE 3: accumulate counter
// there are 4 counters for 4 features
// thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3
......@@ -633,6 +602,7 @@ R""()
addr = bin * 4 + offset;
atom_inc(cnt_hist + addr);
}
#endif
stat1 = stat1_next;
stat2 = stat2_next;
feature4 = feature4_next;
......@@ -741,7 +711,7 @@ R""()
uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS);
// if there is only one workgroup processing this feature4, don't even need to write
#if POWER_FEATURE_WORKGROUPS != 0
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS;
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 2 * NUM_BINS;
// write gradients and hessians
__global acc_type * restrict ptr_f = output;
for (ushort j = 0; j < 4; ++j) {
......@@ -751,17 +721,7 @@ R""()
acc_type value = gh_hist[i * 4 + j];
ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value;
}
ptr_f += 3 * NUM_BINS;
}
// write counts
__global acc_int_type * restrict ptr_i = (__global acc_int_type * restrict)(output + 2 * NUM_BINS);
for (ushort j = 0; j < 4; ++j) {
for (ushort i = ltid; i < NUM_BINS; i += lsize) {
// FIXME: 2-way bank conflict
uint value = cnt_hist[i * 4 + j];
ptr_i[i] = value;
}
ptr_i += 3 * NUM_BINS;
ptr_f += 2 * NUM_BINS;
}
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
mem_fence(CLK_GLOBAL_MEM_FENCE);
......@@ -788,7 +748,7 @@ R""()
// This is done by using a global atomic counter.
// On AMD GPUs ideally this should be done in GDS,
// but currently there is no easy way to access it via OpenCL.
__local uint * counter_val = cnt_hist;
__local uint * counter_val = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS);
// backup the old value
uint old_val = *counter_val;
if (ltid == 0) {
......@@ -814,11 +774,11 @@ R""()
// locate our feature4's block in output memory
uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS);
__global acc_type const * restrict feature4_subhists =
(__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS;
(__global acc_type *)output_buf + output_offset * 4 * 2 * NUM_BINS;
// skip reading the data already in local memory
uint skip_id = group_id ^ output_offset;
// locate output histogram location for this feature4
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS;
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 2 * NUM_BINS;
within_kernel_reduction256x4(feature_mask, feature4_subhists, skip_id, old_val, 1 << POWER_FEATURE_WORKGROUPS,
hist_buf, (__local acc_type *)shared_array);
// if (ltid == 0)
......