Unverified commit 50f11a9f, authored by Nikita Titov, committed by GitHub
Browse files

[ci][c++] fixed `whitespace/indent_namespace` errors from cpplint (#7056)



* dev

* dev

* dev

* dev

* dev

---------
Co-authored-by: James Lamb <jaylamb20@gmail.com>
parent 6f0d7cc2
......@@ -28,7 +28,7 @@ repos:
- id: cpplint
args:
- --recursive
- --filter=-build/include_subdir,-build/header_guard,-whitespace/indent_namespace,-whitespace/line_length
- --filter=-build/include_subdir,-build/header_guard,-whitespace/line_length
- repo: local
hooks:
- id: check-omp-pragmas
......
......@@ -83,7 +83,7 @@ const int kAlignedSize = 32;
// Refer to https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-4-c4127?view=vs-2019
#ifdef _MSC_VER
#pragma warning(disable : 4127)
#pragma warning(disable : 4127)
#endif
} // namespace LightGBM
......
......@@ -22,1055 +22,1055 @@
namespace LightGBM {
BinMapper::BinMapper(): num_bin_(1), is_trivial_(true), bin_type_(BinType::NumericalBin) {
bin_upper_bound_.clear();
bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
BinMapper::BinMapper(): num_bin_(1), is_trivial_(true), bin_type_(BinType::NumericalBin) {
bin_upper_bound_.clear();
bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
}
// deep copy function for BinMapper
BinMapper::BinMapper(const BinMapper& other) {
num_bin_ = other.num_bin_;
missing_type_ = other.missing_type_;
is_trivial_ = other.is_trivial_;
sparse_rate_ = other.sparse_rate_;
bin_type_ = other.bin_type_;
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = other.bin_upper_bound_;
} else {
bin_2_categorical_ = other.bin_2_categorical_;
categorical_2_bin_ = other.categorical_2_bin_;
}
// deep copy function for BinMapper
BinMapper::BinMapper(const BinMapper& other) {
num_bin_ = other.num_bin_;
missing_type_ = other.missing_type_;
is_trivial_ = other.is_trivial_;
sparse_rate_ = other.sparse_rate_;
bin_type_ = other.bin_type_;
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = other.bin_upper_bound_;
} else {
bin_2_categorical_ = other.bin_2_categorical_;
categorical_2_bin_ = other.categorical_2_bin_;
min_val_ = other.min_val_;
max_val_ = other.max_val_;
default_bin_ = other.default_bin_;
most_freq_bin_ = other.most_freq_bin_;
}
// Deserializing constructor: reconstructs a BinMapper from a raw serialized
// buffer (the byte layout read by CopyFrom; presumably the one written by
// CopyTo — verify against the serialization code).
BinMapper::BinMapper(const void* memory) {
  // static_cast is the idiomatic named cast for void* -> object-pointer.
  const char* raw_bytes = static_cast<const char*>(memory);
  CopyFrom(raw_bytes);
}
// Destructor. Intentionally empty: members (std::vector / map containers)
// release their own storage, so no explicit cleanup is required.
BinMapper::~BinMapper() {
}
bool NeedFilter(const std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
if (bin_type == BinType::NumericalBin) {
int sum_left = 0;
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
sum_left += cnt_in_bin[i];
if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
return false;
}
}
min_val_ = other.min_val_;
max_val_ = other.max_val_;
default_bin_ = other.default_bin_;
most_freq_bin_ = other.most_freq_bin_;
}
BinMapper::BinMapper(const void* memory) {
CopyFrom(reinterpret_cast<const char*>(memory));
}
BinMapper::~BinMapper() {
}
bool NeedFilter(const std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
if (bin_type == BinType::NumericalBin) {
int sum_left = 0;
} else {
if (cnt_in_bin.size() <= 2) {
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
sum_left += cnt_in_bin[i];
int sum_left = cnt_in_bin[i];
if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
return false;
}
}
} else {
if (cnt_in_bin.size() <= 2) {
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
int sum_left = cnt_in_bin[i];
if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
return false;
}
}
} else {
return false;
}
return false;
}
return true;
}
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin,
size_t total_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
CHECK_GT(max_bin, 0);
if (num_distinct_values <= max_bin) {
bin_upper_bound.clear();
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
cur_cnt_inbin += counts[i];
if (cur_cnt_inbin >= min_data_in_bin) {
auto val = Common::GetDoubleUpperBound((distinct_values[i] + distinct_values[i + 1]) / 2.0);
if (bin_upper_bound.empty() || !Common::CheckDoubleEqualOrdered(bin_upper_bound.back(), val)) {
bin_upper_bound.push_back(val);
cur_cnt_inbin = 0;
}
}
}
cur_cnt_inbin += counts[num_distinct_values - 1];
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
} else {
if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
}
double mean_bin_size = static_cast<double>(total_cnt) / max_bin;
// mean size for one bin
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(total_cnt);
std::vector<bool> is_big_count_value(num_distinct_values, false);
for (int i = 0; i < num_distinct_values; ++i) {
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
rest_sample_cnt -= counts[i];
}
}
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
}
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) {
break;
}
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
}
}
++bin_cnt;
// update bin upper bound
bin_upper_bound.clear();
for (int i = 0; i < bin_cnt - 1; ++i) {
auto val = Common::GetDoubleUpperBound((upper_bounds[i] + lower_bounds[i + 1]) / 2.0);
return true;
}
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin,
size_t total_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
CHECK_GT(max_bin, 0);
if (num_distinct_values <= max_bin) {
bin_upper_bound.clear();
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
cur_cnt_inbin += counts[i];
if (cur_cnt_inbin >= min_data_in_bin) {
auto val = Common::GetDoubleUpperBound((distinct_values[i] + distinct_values[i + 1]) / 2.0);
if (bin_upper_bound.empty() || !Common::CheckDoubleEqualOrdered(bin_upper_bound.back(), val)) {
bin_upper_bound.push_back(val);
cur_cnt_inbin = 0;
}
}
// last bin upper bound
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}
return bin_upper_bound;
}
std::vector<double> FindBinWithPredefinedBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin,
size_t total_sample_cnt, int min_data_in_bin,
const std::vector<double>& forced_upper_bounds) {
std::vector<double> bin_upper_bound;
// get number of positive and negative distinct values
int left_cnt = -1;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kZeroThreshold) {
left_cnt = i;
break;
}
}
if (left_cnt < 0) {
left_cnt = num_distinct_values;
}
int right_start = -1;
for (int i = left_cnt; i < num_distinct_values; ++i) {
if (distinct_values[i] > kZeroThreshold) {
right_start = i;
break;
}
}
// include zero bounds and infinity bound
if (max_bin == 2) {
if (left_cnt == 0) {
bin_upper_bound.push_back(kZeroThreshold);
} else {
bin_upper_bound.push_back(-kZeroThreshold);
}
} else if (max_bin >= 3) {
if (left_cnt > 0) {
bin_upper_bound.push_back(-kZeroThreshold);
}
if (right_start >= 0) {
bin_upper_bound.push_back(kZeroThreshold);
}
}
cur_cnt_inbin += counts[num_distinct_values - 1];
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
// add forced bounds, excluding zeros since we have already added zero bounds
int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
int num_inserted = 0;
for (size_t i = 0; i < forced_upper_bounds.size(); ++i) {
if (num_inserted >= max_to_insert) {
break;
}
if (std::fabs(forced_upper_bounds[i]) > kZeroThreshold) {
bin_upper_bound.push_back(forced_upper_bounds[i]);
++num_inserted;
}
} else {
if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
}
std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
// find remaining bounds
int free_bins = max_bin - static_cast<int>(bin_upper_bound.size());
std::vector<double> bounds_to_add;
int value_ind = 0;
for (size_t i = 0; i < bin_upper_bound.size(); ++i) {
int cnt_in_bin = 0;
int distinct_cnt_in_bin = 0;
int bin_start = value_ind;
while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) {
cnt_in_bin += counts[value_ind];
++distinct_cnt_in_bin;
++value_ind;
}
int bins_remaining = max_bin - static_cast<int>(bin_upper_bound.size()) - static_cast<int>(bounds_to_add.size());
int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * free_bins / total_sample_cnt)));
num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1;
if (i == bin_upper_bound.size() - 1) {
num_sub_bins = bins_remaining + 1;
}
std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin,
num_sub_bins, cnt_in_bin, min_data_in_bin);
bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity
}
bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
CHECK_LE(bin_upper_bound.size(), static_cast<size_t>(max_bin));
return bin_upper_bound;
}
double mean_bin_size = static_cast<double>(total_cnt) / max_bin;
std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
int left_cnt_data = 0;
int cnt_zero = 0;
int right_cnt_data = 0;
// mean size for one bin
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(total_cnt);
std::vector<bool> is_big_count_value(num_distinct_values, false);
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] <= -kZeroThreshold) {
left_cnt_data += counts[i];
} else if (distinct_values[i] > kZeroThreshold) {
right_cnt_data += counts[i];
} else {
cnt_zero += counts[i];
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
rest_sample_cnt -= counts[i];
}
}
int left_cnt = -1;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kZeroThreshold) {
left_cnt = i;
break;
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
}
}
if (left_cnt < 0) {
left_cnt = num_distinct_values;
}
if ((left_cnt > 0) && (max_bin > 1)) {
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
left_max_bin = std::max(1, left_max_bin);
bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
if (bin_upper_bound.size() > 0) {
bin_upper_bound.back() = -kZeroThreshold;
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) {
break;
}
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
}
}
int right_start = -1;
for (int i = left_cnt; i < num_distinct_values; ++i) {
if (distinct_values[i] > kZeroThreshold) {
right_start = i;
break;
++bin_cnt;
// update bin upper bound
bin_upper_bound.clear();
for (int i = 0; i < bin_cnt - 1; ++i) {
auto val = Common::GetDoubleUpperBound((upper_bounds[i] + lower_bounds[i + 1]) / 2.0);
if (bin_upper_bound.empty() || !Common::CheckDoubleEqualOrdered(bin_upper_bound.back(), val)) {
bin_upper_bound.push_back(val);
}
}
int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
if (right_start >= 0 && right_max_bin > 0) {
auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
bin_upper_bound.push_back(kZeroThreshold);
bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
} else {
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}
CHECK_LE(bin_upper_bound.size(), static_cast<size_t>(max_bin));
return bin_upper_bound;
// last bin upper bound
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}
return bin_upper_bound;
}
std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
int max_bin, size_t total_sample_cnt, int min_data_in_bin,
std::vector<double> FindBinWithPredefinedBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin,
size_t total_sample_cnt, int min_data_in_bin,
const std::vector<double>& forced_upper_bounds) {
if (forced_upper_bounds.empty()) {
return FindBinWithZeroAsOneBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
} else {
return FindBinWithPredefinedBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin,
forced_upper_bounds);
std::vector<double> bin_upper_bound;
// get number of positive and negative distinct values
int left_cnt = -1;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kZeroThreshold) {
left_cnt = i;
break;
}
}
void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type,
bool use_missing, bool zero_as_missing,
const std::vector<double>& forced_upper_bounds) {
int na_cnt = 0;
int non_na_cnt = 0;
for (int i = 0; i < num_sample_values; ++i) {
if (!std::isnan(values[i])) {
values[non_na_cnt++] = values[i];
}
}
if (!use_missing) {
missing_type_ = MissingType::None;
} else if (zero_as_missing) {
missing_type_ = MissingType::Zero;
} else {
if (non_na_cnt == num_sample_values) {
missing_type_ = MissingType::None;
} else {
missing_type_ = MissingType::NaN;
na_cnt = num_sample_values - non_na_cnt;
}
if (left_cnt < 0) {
left_cnt = num_distinct_values;
}
int right_start = -1;
for (int i = left_cnt; i < num_distinct_values; ++i) {
if (distinct_values[i] > kZeroThreshold) {
right_start = i;
break;
}
num_sample_values = non_na_cnt;
bin_type_ = bin_type;
default_bin_ = 0;
int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values - na_cnt);
// find distinct_values first
std::vector<double> distinct_values;
std::vector<int> counts; // count of data points for each distinct feature value.
std::stable_sort(values, values + num_sample_values);
}
// push zero in the front
if (num_sample_values == 0 || (values[0] > 0.0f && zero_cnt > 0)) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
// include zero bounds and infinity bound
if (max_bin == 2) {
if (left_cnt == 0) {
bin_upper_bound.push_back(kZeroThreshold);
} else {
bin_upper_bound.push_back(-kZeroThreshold);
}
if (num_sample_values > 0) {
distinct_values.push_back(values[0]);
counts.push_back(1);
} else if (max_bin >= 3) {
if (left_cnt > 0) {
bin_upper_bound.push_back(-kZeroThreshold);
}
for (int i = 1; i < num_sample_values; ++i) {
if (!Common::CheckDoubleEqualOrdered(values[i - 1], values[i])) {
if (values[i - 1] < 0.0f && values[i] > 0.0f) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
distinct_values.push_back(values[i]);
counts.push_back(1);
} else {
// use the large value
distinct_values.back() = values[i];
++counts.back();
}
if (right_start >= 0) {
bin_upper_bound.push_back(kZeroThreshold);
}
// push zero in the back
if (num_sample_values > 0 && values[num_sample_values - 1] < 0.0f && zero_cnt > 0) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
// add forced bounds, excluding zeros since we have already added zero bounds
int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
int num_inserted = 0;
for (size_t i = 0; i < forced_upper_bounds.size(); ++i) {
if (num_inserted >= max_to_insert) {
break;
}
min_val_ = distinct_values.front();
max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin; // count of data points in each bin.
int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (missing_type_ == MissingType::Zero) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
min_data_in_bin, forced_upper_bounds);
if (bin_upper_bound_.size() == 2) {
missing_type_ = MissingType::None;
}
} else if (missing_type_ == MissingType::None) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
min_data_in_bin, forced_upper_bounds);
} else {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
min_data_in_bin, forced_upper_bounds);
bin_upper_bound_.push_back(NaN);
}
num_bin_ = static_cast<int>(bin_upper_bound_.size());
{
cnt_in_bin.resize(num_bin_, 0);
int i_bin = 0;
for (int i = 0; i < num_distinct_values; ++i) {
while (distinct_values[i] > bin_upper_bound_[i_bin] && i_bin < num_bin_ - 1) {
++i_bin;
}
cnt_in_bin[i_bin] += counts[i];
}
if (missing_type_ == MissingType::NaN) {
cnt_in_bin[num_bin_ - 1] = na_cnt;
}
}
CHECK_LE(num_bin_, max_bin);
} else {
// convert to int type first
std::vector<int> distinct_values_int;
std::vector<int> counts_int;
for (size_t i = 0; i < distinct_values.size(); ++i) {
int val = static_cast<int>(distinct_values[i]);
if (val < 0) {
na_cnt += counts[i];
Log::Warning("Met negative value in categorical features, will convert it to NaN");
} else {
if (distinct_values_int.empty() || val != distinct_values_int.back()) {
distinct_values_int.push_back(val);
counts_int.push_back(counts[i]);
} else {
counts_int.back() += counts[i];
}
}
}
int rest_cnt = static_cast<int>(total_sample_cnt - na_cnt);
if (rest_cnt > 0) {
const int SPARSE_RATIO = 100;
if (distinct_values_int.back() / SPARSE_RATIO > static_cast<int>(distinct_values_int.size())) {
Log::Warning("Met categorical feature which contains sparse values. "
"Consider renumbering to consecutive integers started from zero");
}
// sort by counts in descending order
Common::SortForPair<int, int>(&counts_int, &distinct_values_int, 0, true);
// will ignore the categorical of small counts
int cut_cnt = static_cast<int>(
Common::RoundInt((total_sample_cnt - na_cnt) * 0.99f));
size_t cur_cat_idx = 0; // index of current category.
categorical_2_bin_.clear();
bin_2_categorical_.clear();
int used_cnt = 0;
int distinct_cnt = static_cast<int>(distinct_values_int.size());
if (na_cnt > 0) {
++distinct_cnt;
}
max_bin = std::min(distinct_cnt, max_bin);
cnt_in_bin.clear();
// Push the dummy bin for NaN
bin_2_categorical_.push_back(-1);
categorical_2_bin_[-1] = 0;
cnt_in_bin.push_back(0);
num_bin_ = 1;
while (cur_cat_idx < distinct_values_int.size()
&& (used_cnt < cut_cnt || num_bin_ < max_bin)) {
if (counts_int[cur_cat_idx] < min_data_in_bin && cur_cat_idx > 1) {
break;
}
bin_2_categorical_.push_back(distinct_values_int[cur_cat_idx]);
categorical_2_bin_[distinct_values_int[cur_cat_idx]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[cur_cat_idx];
cnt_in_bin.push_back(counts_int[cur_cat_idx]);
++num_bin_;
++cur_cat_idx;
}
// Use MissingType::None to represent this bin contains all categoricals
if (cur_cat_idx == distinct_values_int.size() && na_cnt == 0) {
missing_type_ = MissingType::None;
} else {
missing_type_ = MissingType::NaN;
}
// fix count of NaN bin
cnt_in_bin[0] = static_cast<int>(total_sample_cnt - used_cnt);
}
if (std::fabs(forced_upper_bounds[i]) > kZeroThreshold) {
bin_upper_bound.push_back(forced_upper_bounds[i]);
++num_inserted;
}
// check trivial(num_bin_ == 1) feature
if (num_bin_ <= 1) {
is_trivial_ = true;
} else {
is_trivial_ = false;
}
std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
// find remaining bounds
int free_bins = max_bin - static_cast<int>(bin_upper_bound.size());
std::vector<double> bounds_to_add;
int value_ind = 0;
for (size_t i = 0; i < bin_upper_bound.size(); ++i) {
int cnt_in_bin = 0;
int distinct_cnt_in_bin = 0;
int bin_start = value_ind;
while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) {
cnt_in_bin += counts[value_ind];
++distinct_cnt_in_bin;
++value_ind;
}
// check useless bin
if (!is_trivial_ && pre_filter && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
is_trivial_ = true;
int bins_remaining = max_bin - static_cast<int>(bin_upper_bound.size()) - static_cast<int>(bounds_to_add.size());
int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * free_bins / total_sample_cnt)));
num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1;
if (i == bin_upper_bound.size() - 1) {
num_sub_bins = bins_remaining + 1;
}
if (!is_trivial_) {
default_bin_ = ValueToBin(0);
most_freq_bin_ =
static_cast<uint32_t>(ArrayArgs<int>::ArgMax(cnt_in_bin));
const double max_sparse_rate =
static_cast<double>(cnt_in_bin[most_freq_bin_]) / total_sample_cnt;
// When most_freq_bin_ != default_bin_, there are some additional data loading costs.
// so use most_freq_bin_ = default_bin_ when there is not so sparse
if (most_freq_bin_ != default_bin_ && max_sparse_rate < kSparseThreshold) {
most_freq_bin_ = default_bin_;
}
sparse_rate_ =
static_cast<double>(cnt_in_bin[most_freq_bin_]) / total_sample_cnt;
std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin,
num_sub_bins, cnt_in_bin, min_data_in_bin);
bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1); // last bound is infinity
}
bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
CHECK_LE(bin_upper_bound.size(), static_cast<size_t>(max_bin));
return bin_upper_bound;
}
std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
int left_cnt_data = 0;
int cnt_zero = 0;
int right_cnt_data = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] <= -kZeroThreshold) {
left_cnt_data += counts[i];
} else if (distinct_values[i] > kZeroThreshold) {
right_cnt_data += counts[i];
} else {
sparse_rate_ = 1.0f;
cnt_zero += counts[i];
}
}
void BinMapper::CopyTo(char * buffer) const {
std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
buffer += VirtualFileWriter::AlignedSize(sizeof(num_bin_));
std::memcpy(buffer, &missing_type_, sizeof(missing_type_));
buffer += VirtualFileWriter::AlignedSize(sizeof(missing_type_));
std::memcpy(buffer, &is_trivial_, sizeof(is_trivial_));
buffer += VirtualFileWriter::AlignedSize(sizeof(is_trivial_));
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += VirtualFileWriter::AlignedSize(sizeof(bin_type_));
std::memcpy(buffer, &min_val_, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(buffer, &max_val_, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(buffer, &default_bin_, sizeof(default_bin_));
buffer += VirtualFileWriter::AlignedSize(sizeof(default_bin_));
std::memcpy(buffer, &most_freq_bin_, sizeof(most_freq_bin_));
buffer += VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else {
std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
int left_cnt = -1;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kZeroThreshold) {
left_cnt = i;
break;
}
}
void BinMapper::CopyFrom(const char * buffer) {
std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
buffer += VirtualFileWriter::AlignedSize(sizeof(num_bin_));
std::memcpy(&missing_type_, buffer, sizeof(missing_type_));
buffer += VirtualFileWriter::AlignedSize(sizeof(missing_type_));
std::memcpy(&is_trivial_, buffer, sizeof(is_trivial_));
buffer += VirtualFileWriter::AlignedSize(sizeof(is_trivial_));
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
buffer += VirtualFileWriter::AlignedSize(sizeof(bin_type_));
std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
buffer += VirtualFileWriter::AlignedSize(sizeof(default_bin_));
std::memcpy(&most_freq_bin_, buffer, sizeof(most_freq_bin_));
buffer += VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
} else {
bin_2_categorical_ = std::vector<int>(num_bin_);
std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
categorical_2_bin_.clear();
for (int i = 0; i < num_bin_; ++i) {
categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
}
if (left_cnt < 0) {
left_cnt = num_distinct_values;
}
if ((left_cnt > 0) && (max_bin > 1)) {
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
left_max_bin = std::max(1, left_max_bin);
bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
if (bin_upper_bound.size() > 0) {
bin_upper_bound.back() = -kZeroThreshold;
}
}
void BinMapper::SaveBinaryToFile(BinaryWriter* writer) const {
writer->AlignedWrite(&num_bin_, sizeof(num_bin_));
writer->AlignedWrite(&missing_type_, sizeof(missing_type_));
writer->AlignedWrite(&is_trivial_, sizeof(is_trivial_));
writer->Write(&sparse_rate_, sizeof(sparse_rate_));
writer->AlignedWrite(&bin_type_, sizeof(bin_type_));
writer->Write(&min_val_, sizeof(min_val_));
writer->Write(&max_val_, sizeof(max_val_));
writer->AlignedWrite(&default_bin_, sizeof(default_bin_));
writer->AlignedWrite(&most_freq_bin_, sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
writer->Write(bin_upper_bound_.data(), sizeof(double) * num_bin_);
} else {
writer->Write(bin_2_categorical_.data(), sizeof(int) * num_bin_);
int right_start = -1;
for (int i = left_cnt; i < num_distinct_values; ++i) {
if (distinct_values[i] > kZeroThreshold) {
right_start = i;
break;
}
}
size_t BinMapper::SizesInByte() const {
size_t ret = VirtualFileWriter::AlignedSize(sizeof(num_bin_)) +
VirtualFileWriter::AlignedSize(sizeof(missing_type_)) +
VirtualFileWriter::AlignedSize(sizeof(is_trivial_)) +
sizeof(sparse_rate_) +
VirtualFileWriter::AlignedSize(sizeof(bin_type_)) +
sizeof(min_val_) + sizeof(max_val_) +
VirtualFileWriter::AlignedSize(sizeof(default_bin_)) +
VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
ret += sizeof(double) * num_bin_;
int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
if (right_start >= 0 && right_max_bin > 0) {
auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
bin_upper_bound.push_back(kZeroThreshold);
bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
} else {
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}
CHECK_LE(bin_upper_bound.size(), static_cast<size_t>(max_bin));
return bin_upper_bound;
}
// Dispatch overload: choose the bin-finding strategy based on whether the
// caller supplied forced (user-predefined) upper bounds.
//
// \param distinct_values    sorted distinct feature values
// \param counts             per-value sample counts (parallel to distinct_values)
// \param num_distinct_values number of entries in the two arrays above
// \param max_bin            maximum number of bins to produce
// \param total_sample_cnt   total number of samples
// \param min_data_in_bin    minimum samples required per bin
// \param forced_upper_bounds user-forced bin upper bounds; empty means "none"
// \return the chosen vector of bin upper bounds
std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
                                            int max_bin, size_t total_sample_cnt, int min_data_in_bin,
                                            const std::vector<double>& forced_upper_bounds) {
  // Guard clause: honor user-forced boundaries when any were given.
  if (!forced_upper_bounds.empty()) {
    return FindBinWithPredefinedBin(distinct_values, counts, num_distinct_values, max_bin,
                                    total_sample_cnt, min_data_in_bin, forced_upper_bounds);
  }
  // Otherwise fall back to the automatic search that keeps zero as its own bin.
  return FindBinWithZeroAsOneBin(distinct_values, counts, num_distinct_values, max_bin,
                                 total_sample_cnt, min_data_in_bin);
}
void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type,
bool use_missing, bool zero_as_missing,
const std::vector<double>& forced_upper_bounds) {
int na_cnt = 0;
int non_na_cnt = 0;
for (int i = 0; i < num_sample_values; ++i) {
if (!std::isnan(values[i])) {
values[non_na_cnt++] = values[i];
}
}
if (!use_missing) {
missing_type_ = MissingType::None;
} else if (zero_as_missing) {
missing_type_ = MissingType::Zero;
} else {
if (non_na_cnt == num_sample_values) {
missing_type_ = MissingType::None;
} else {
ret += sizeof(int) * num_bin_;
missing_type_ = MissingType::NaN;
na_cnt = num_sample_values - non_na_cnt;
}
return ret;
}
num_sample_values = non_na_cnt;
template class DenseBin<uint8_t, true>;
template class DenseBin<uint8_t, false>;
template class DenseBin<uint16_t, false>;
template class DenseBin<uint32_t, false>;
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;
bin_type_ = bin_type;
default_bin_ = 0;
int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values - na_cnt);
// find distinct_values first
std::vector<double> distinct_values;
std::vector<int> counts; // count of data points for each distinct feature value.
template class MultiValDenseBin<uint8_t>;
template class MultiValDenseBin<uint16_t>;
template class MultiValDenseBin<uint32_t>;
std::stable_sort(values, values + num_sample_values);
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 16) {
return new DenseBin<uint8_t, true>(num_data);
} else if (num_bin <= 256) {
return new DenseBin<uint8_t, false>(num_data);
} else if (num_bin <= 65536) {
return new DenseBin<uint16_t, false>(num_data);
} else {
return new DenseBin<uint32_t, false>(num_data);
}
// push zero in the front
if (num_sample_values == 0 || (values[0] > 0.0f && zero_cnt > 0)) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 256) {
return new SparseBin<uint8_t>(num_data);
} else if (num_bin <= 65536) {
return new SparseBin<uint16_t>(num_data);
} else {
return new SparseBin<uint32_t>(num_data);
}
if (num_sample_values > 0) {
distinct_values.push_back(values[0]);
counts.push_back(1);
}
MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature,
double sparse_rate, const std::vector<uint32_t>& offsets) {
if (sparse_rate >= multi_val_bin_sparse_threshold) {
const double average_element_per_row = (1.0 - sparse_rate) * num_feature;
return CreateMultiValSparseBin(num_data, num_bin,
average_element_per_row);
for (int i = 1; i < num_sample_values; ++i) {
if (!Common::CheckDoubleEqualOrdered(values[i - 1], values[i])) {
if (values[i - 1] < 0.0f && values[i] > 0.0f) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
distinct_values.push_back(values[i]);
counts.push_back(1);
} else {
return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets);
// use the large value
distinct_values.back() = values[i];
++counts.back();
}
}
MultiValBin* MultiValBin::CreateMultiValDenseBin(data_size_t num_data,
int num_bin,
int num_feature,
const std::vector<uint32_t>& offsets) {
// calculate max bin of all features to select the int type in MultiValDenseBin
int max_bin = 0;
for (int i = 0; i < static_cast<int>(offsets.size()) - 1; ++i) {
int feature_bin = offsets[i + 1] - offsets[i];
if (feature_bin > max_bin) {
max_bin = feature_bin;
// push zero in the back
if (num_sample_values > 0 && values[num_sample_values - 1] < 0.0f && zero_cnt > 0) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
min_val_ = distinct_values.front();
max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin; // count of data points in each bin.
int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (missing_type_ == MissingType::Zero) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
min_data_in_bin, forced_upper_bounds);
if (bin_upper_bound_.size() == 2) {
missing_type_ = MissingType::None;
}
}
if (max_bin <= 256) {
return new MultiValDenseBin<uint8_t>(num_data, num_bin, num_feature, offsets);
} else if (max_bin <= 65536) {
return new MultiValDenseBin<uint16_t>(num_data, num_bin, num_feature, offsets);
} else if (missing_type_ == MissingType::None) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
min_data_in_bin, forced_upper_bounds);
} else {
return new MultiValDenseBin<uint32_t>(num_data, num_bin, num_feature, offsets);
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
min_data_in_bin, forced_upper_bounds);
bin_upper_bound_.push_back(NaN);
}
}
MultiValBin* MultiValBin::CreateMultiValSparseBin(data_size_t num_data,
int num_bin,
double estimate_element_per_row) {
size_t estimate_total_entries =
static_cast<size_t>(estimate_element_per_row * 1.1 * num_data);
if (estimate_total_entries <= std::numeric_limits<uint16_t>::max()) {
if (num_bin <= 256) {
return new MultiValSparseBin<uint16_t, uint8_t>(
num_data, num_bin, estimate_element_per_row);
} else if (num_bin <= 65536) {
return new MultiValSparseBin<uint16_t, uint16_t>(
num_data, num_bin, estimate_element_per_row);
} else {
return new MultiValSparseBin<uint16_t, uint32_t>(
num_data, num_bin, estimate_element_per_row);
num_bin_ = static_cast<int>(bin_upper_bound_.size());
{
cnt_in_bin.resize(num_bin_, 0);
int i_bin = 0;
for (int i = 0; i < num_distinct_values; ++i) {
while (distinct_values[i] > bin_upper_bound_[i_bin] && i_bin < num_bin_ - 1) {
++i_bin;
}
cnt_in_bin[i_bin] += counts[i];
}
} else if (estimate_total_entries <= std::numeric_limits<uint32_t>::max()) {
if (num_bin <= 256) {
return new MultiValSparseBin<uint32_t, uint8_t>(
num_data, num_bin, estimate_element_per_row);
} else if (num_bin <= 65536) {
return new MultiValSparseBin<uint32_t, uint16_t>(
num_data, num_bin, estimate_element_per_row);
if (missing_type_ == MissingType::NaN) {
cnt_in_bin[num_bin_ - 1] = na_cnt;
}
}
CHECK_LE(num_bin_, max_bin);
} else {
// convert to int type first
std::vector<int> distinct_values_int;
std::vector<int> counts_int;
for (size_t i = 0; i < distinct_values.size(); ++i) {
int val = static_cast<int>(distinct_values[i]);
if (val < 0) {
na_cnt += counts[i];
Log::Warning("Met negative value in categorical features, will convert it to NaN");
} else {
return new MultiValSparseBin<uint32_t, uint32_t>(
num_data, num_bin, estimate_element_per_row);
if (distinct_values_int.empty() || val != distinct_values_int.back()) {
distinct_values_int.push_back(val);
counts_int.push_back(counts[i]);
} else {
counts_int.back() += counts[i];
}
}
} else {
if (num_bin <= 256) {
return new MultiValSparseBin<size_t, uint8_t>(
num_data, num_bin, estimate_element_per_row);
} else if (num_bin <= 65536) {
return new MultiValSparseBin<size_t, uint16_t>(
num_data, num_bin, estimate_element_per_row);
}
int rest_cnt = static_cast<int>(total_sample_cnt - na_cnt);
if (rest_cnt > 0) {
const int SPARSE_RATIO = 100;
if (distinct_values_int.back() / SPARSE_RATIO > static_cast<int>(distinct_values_int.size())) {
Log::Warning("Met categorical feature which contains sparse values. "
"Consider renumbering to consecutive integers started from zero");
}
// sort by counts in descending order
Common::SortForPair<int, int>(&counts_int, &distinct_values_int, 0, true);
// will ignore the categorical of small counts
int cut_cnt = static_cast<int>(
Common::RoundInt((total_sample_cnt - na_cnt) * 0.99f));
size_t cur_cat_idx = 0; // index of current category.
categorical_2_bin_.clear();
bin_2_categorical_.clear();
int used_cnt = 0;
int distinct_cnt = static_cast<int>(distinct_values_int.size());
if (na_cnt > 0) {
++distinct_cnt;
}
max_bin = std::min(distinct_cnt, max_bin);
cnt_in_bin.clear();
// Push the dummy bin for NaN
bin_2_categorical_.push_back(-1);
categorical_2_bin_[-1] = 0;
cnt_in_bin.push_back(0);
num_bin_ = 1;
while (cur_cat_idx < distinct_values_int.size()
&& (used_cnt < cut_cnt || num_bin_ < max_bin)) {
if (counts_int[cur_cat_idx] < min_data_in_bin && cur_cat_idx > 1) {
break;
}
bin_2_categorical_.push_back(distinct_values_int[cur_cat_idx]);
categorical_2_bin_[distinct_values_int[cur_cat_idx]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[cur_cat_idx];
cnt_in_bin.push_back(counts_int[cur_cat_idx]);
++num_bin_;
++cur_cat_idx;
}
// Use MissingType::None to represent this bin contains all categoricals
if (cur_cat_idx == distinct_values_int.size() && na_cnt == 0) {
missing_type_ = MissingType::None;
} else {
return new MultiValSparseBin<size_t, uint32_t>(
num_data, num_bin, estimate_element_per_row);
missing_type_ = MissingType::NaN;
}
// fix count of NaN bin
cnt_in_bin[0] = static_cast<int>(total_sample_cnt - used_cnt);
}
}
// Column-wise access for dense 8-bit bins: bin codes already live in one
// flat contiguous array, so the raw buffer is handed back directly. The
// dense path needs no per-thread iterators, hence the list is emptied.
template <>
const void* DenseBin<uint8_t, false>::GetColWiseData(
  uint8_t* bit_type,
  bool* is_sparse,
  std::vector<BinIterator*>* bin_iterator,
  const int /*num_threads*/) const {
  bin_iterator->clear();
  *bit_type = 8;
  *is_sparse = false;
  const void* raw_bins = reinterpret_cast<const void*>(data_.data());
  return raw_bins;
}
// Column-wise access for dense 16-bit bins: expose the contiguous bin
// buffer directly; no iterators are created for the dense path.
template <>
const void* DenseBin<uint16_t, false>::GetColWiseData(
  uint8_t* bit_type,
  bool* is_sparse,
  std::vector<BinIterator*>* bin_iterator,
  const int /*num_threads*/) const {
  bin_iterator->clear();
  *bit_type = 16;
  *is_sparse = false;
  const void* raw_bins = reinterpret_cast<const void*>(data_.data());
  return raw_bins;
}
template <>
const void* DenseBin<uint32_t, false>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
std::vector<BinIterator*>* bin_iterator,
const int /*num_threads*/) const {
*is_sparse = false;
*bit_type = 32;
bin_iterator->clear();
return reinterpret_cast<const void*>(data_.data());
// check trivial(num_bin_ == 1) feature
if (num_bin_ <= 1) {
is_trivial_ = true;
} else {
is_trivial_ = false;
}
template <>
const void* DenseBin<uint8_t, true>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
std::vector<BinIterator*>* bin_iterator,
const int /*num_threads*/) const {
*is_sparse = false;
*bit_type = 4;
bin_iterator->clear();
return reinterpret_cast<const void*>(data_.data());
// check useless bin
if (!is_trivial_ && pre_filter && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
is_trivial_ = true;
}
// Single-iterator overload for dense 8-bit bins: the contiguous buffer is
// returned as-is, so no iterator is needed and the out pointer is nulled.
template <>
const void* DenseBin<uint8_t, false>::GetColWiseData(
  uint8_t* bit_type,
  bool* is_sparse,
  BinIterator** bin_iterator) const {
  *bin_iterator = nullptr;
  *bit_type = 8;
  *is_sparse = false;
  const void* raw_bins = reinterpret_cast<const void*>(data_.data());
  return raw_bins;
}
// Single-iterator overload for dense 16-bit bins: data is contiguous, so
// only the raw buffer is exposed and no iterator is produced.
template <>
const void* DenseBin<uint16_t, false>::GetColWiseData(
  uint8_t* bit_type,
  bool* is_sparse,
  BinIterator** bin_iterator) const {
  *bin_iterator = nullptr;
  *bit_type = 16;
  *is_sparse = false;
  const void* raw_bins = reinterpret_cast<const void*>(data_.data());
  return raw_bins;
}
// Single-iterator overload for dense 32-bit bins: contiguous storage is
// returned directly; the iterator out-param is left null.
template <>
const void* DenseBin<uint32_t, false>::GetColWiseData(
  uint8_t* bit_type,
  bool* is_sparse,
  BinIterator** bin_iterator) const {
  *bin_iterator = nullptr;
  *bit_type = 32;
  *is_sparse = false;
  const void* raw_bins = reinterpret_cast<const void*>(data_.data());
  return raw_bins;
}
// Single-iterator overload for the packed variant (template flag true):
// bit_type of 4 signals sub-byte bins; the underlying byte buffer from
// data_ is still what gets exposed.
template <>
const void* DenseBin<uint8_t, true>::GetColWiseData(
  uint8_t* bit_type,
  bool* is_sparse,
  BinIterator** bin_iterator) const {
  *bin_iterator = nullptr;
  *bit_type = 4;
  *is_sparse = false;
  const void* raw_bins = reinterpret_cast<const void*>(data_.data());
  return raw_bins;
}
template <>
const void* SparseBin<uint8_t>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
std::vector<BinIterator*>* bin_iterator,
const int num_threads) const {
*is_sparse = true;
*bit_type = 8;
for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
bin_iterator->emplace_back(new SparseBinIterator<uint8_t>(this, 0));
if (!is_trivial_) {
default_bin_ = ValueToBin(0);
most_freq_bin_ =
static_cast<uint32_t>(ArrayArgs<int>::ArgMax(cnt_in_bin));
const double max_sparse_rate =
static_cast<double>(cnt_in_bin[most_freq_bin_]) / total_sample_cnt;
// When most_freq_bin_ != default_bin_, there are some additional data loading costs.
// so use most_freq_bin_ = default_bin_ when there is not so sparse
if (most_freq_bin_ != default_bin_ && max_sparse_rate < kSparseThreshold) {
most_freq_bin_ = default_bin_;
}
return nullptr;
sparse_rate_ =
static_cast<double>(cnt_in_bin[most_freq_bin_]) / total_sample_cnt;
} else {
sparse_rate_ = 1.0f;
}
// Column-wise access for 16-bit sparse bins: there is no contiguous
// buffer to hand out, so one iterator per requesting thread is created
// (each starting at row 0) and the data pointer is null.
// NOTE(review): the caller appears to take ownership of the heap-allocated
// iterators — confirm at call sites.
template <>
const void* SparseBin<uint16_t>::GetColWiseData(
  uint8_t* bit_type,
  bool* is_sparse,
  std::vector<BinIterator*>* bin_iterator,
  const int num_threads) const {
  *bit_type = 16;
  *is_sparse = true;
  for (int i = 0; i < num_threads; ++i) {
    bin_iterator->emplace_back(new SparseBinIterator<uint16_t>(this, 0));
  }
  return nullptr;
}
void BinMapper::CopyTo(char * buffer) const {
std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
buffer += VirtualFileWriter::AlignedSize(sizeof(num_bin_));
std::memcpy(buffer, &missing_type_, sizeof(missing_type_));
buffer += VirtualFileWriter::AlignedSize(sizeof(missing_type_));
std::memcpy(buffer, &is_trivial_, sizeof(is_trivial_));
buffer += VirtualFileWriter::AlignedSize(sizeof(is_trivial_));
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += VirtualFileWriter::AlignedSize(sizeof(bin_type_));
std::memcpy(buffer, &min_val_, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(buffer, &max_val_, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(buffer, &default_bin_, sizeof(default_bin_));
buffer += VirtualFileWriter::AlignedSize(sizeof(default_bin_));
std::memcpy(buffer, &most_freq_bin_, sizeof(most_freq_bin_));
buffer += VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else {
std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
}
template <>
const void* SparseBin<uint32_t>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
std::vector<BinIterator*>* bin_iterator,
const int num_threads) const {
*is_sparse = true;
*bit_type = 32;
for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
bin_iterator->emplace_back(new SparseBinIterator<uint32_t>(this, 0));
}
void BinMapper::CopyFrom(const char * buffer) {
std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
buffer += VirtualFileWriter::AlignedSize(sizeof(num_bin_));
std::memcpy(&missing_type_, buffer, sizeof(missing_type_));
buffer += VirtualFileWriter::AlignedSize(sizeof(missing_type_));
std::memcpy(&is_trivial_, buffer, sizeof(is_trivial_));
buffer += VirtualFileWriter::AlignedSize(sizeof(is_trivial_));
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
buffer += VirtualFileWriter::AlignedSize(sizeof(bin_type_));
std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
buffer += VirtualFileWriter::AlignedSize(sizeof(default_bin_));
std::memcpy(&most_freq_bin_, buffer, sizeof(most_freq_bin_));
buffer += VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
} else {
bin_2_categorical_ = std::vector<int>(num_bin_);
std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
categorical_2_bin_.clear();
for (int i = 0; i < num_bin_; ++i) {
categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
}
return nullptr;
}
// Single-iterator access for 8-bit sparse bins: no contiguous buffer
// exists, so a fresh iterator starting at row 0 is handed out instead.
// NOTE(review): caller appears to own the heap-allocated iterator — verify.
template <>
const void* SparseBin<uint8_t>::GetColWiseData(
  uint8_t* bit_type,
  bool* is_sparse,
  BinIterator** bin_iterator) const {
  *bit_type = 8;
  *is_sparse = true;
  auto* iter = new SparseBinIterator<uint8_t>(this, 0);
  *bin_iterator = iter;
  return nullptr;
}
void BinMapper::SaveBinaryToFile(BinaryWriter* writer) const {
writer->AlignedWrite(&num_bin_, sizeof(num_bin_));
writer->AlignedWrite(&missing_type_, sizeof(missing_type_));
writer->AlignedWrite(&is_trivial_, sizeof(is_trivial_));
writer->Write(&sparse_rate_, sizeof(sparse_rate_));
writer->AlignedWrite(&bin_type_, sizeof(bin_type_));
writer->Write(&min_val_, sizeof(min_val_));
writer->Write(&max_val_, sizeof(max_val_));
writer->AlignedWrite(&default_bin_, sizeof(default_bin_));
writer->AlignedWrite(&most_freq_bin_, sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
writer->Write(bin_upper_bound_.data(), sizeof(double) * num_bin_);
} else {
writer->Write(bin_2_categorical_.data(), sizeof(int) * num_bin_);
}
// Single-iterator access for 16-bit sparse bins: returns null data and a
// freshly allocated iterator positioned at row 0.
// NOTE(review): caller appears to own the heap-allocated iterator — verify.
template <>
const void* SparseBin<uint16_t>::GetColWiseData(
  uint8_t* bit_type,
  bool* is_sparse,
  BinIterator** bin_iterator) const {
  *bit_type = 16;
  *is_sparse = true;
  auto* iter = new SparseBinIterator<uint16_t>(this, 0);
  *bin_iterator = iter;
  return nullptr;
}
size_t BinMapper::SizesInByte() const {
size_t ret = VirtualFileWriter::AlignedSize(sizeof(num_bin_)) +
VirtualFileWriter::AlignedSize(sizeof(missing_type_)) +
VirtualFileWriter::AlignedSize(sizeof(is_trivial_)) +
sizeof(sparse_rate_) +
VirtualFileWriter::AlignedSize(sizeof(bin_type_)) +
sizeof(min_val_) + sizeof(max_val_) +
VirtualFileWriter::AlignedSize(sizeof(default_bin_)) +
VirtualFileWriter::AlignedSize(sizeof(most_freq_bin_));
if (bin_type_ == BinType::NumericalBin) {
ret += sizeof(double) * num_bin_;
} else {
ret += sizeof(int) * num_bin_;
}
template <>
const void* SparseBin<uint32_t>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
BinIterator** bin_iterator) const {
*is_sparse = true;
*bit_type = 32;
*bin_iterator = new SparseBinIterator<uint32_t>(this, 0);
return nullptr;
return ret;
}
template class DenseBin<uint8_t, true>;
template class DenseBin<uint8_t, false>;
template class DenseBin<uint16_t, false>;
template class DenseBin<uint32_t, false>;
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;
template class MultiValDenseBin<uint8_t>;
template class MultiValDenseBin<uint16_t>;
template class MultiValDenseBin<uint32_t>;
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 16) {
return new DenseBin<uint8_t, true>(num_data);
} else if (num_bin <= 256) {
return new DenseBin<uint8_t, false>(num_data);
} else if (num_bin <= 65536) {
return new DenseBin<uint16_t, false>(num_data);
} else {
return new DenseBin<uint32_t, false>(num_data);
}
#ifdef USE_CUDA
// Row-wise access for the dense multi-value 8-bit bin: one bin value per
// (row, feature) pair, stored contiguously. Dense layout, so the auxiliary
// row-pointer outputs are cleared.
template <>
const void* MultiValDenseBin<uint8_t>::GetRowWiseData(uint8_t* bit_type,
                                                      size_t* total_size,
                                                      bool* is_sparse,
                                                      const void** out_data_ptr,
                                                      uint8_t* data_ptr_bit_type) const {
  *bit_type = 8;
  *is_sparse = false;
  *out_data_ptr = nullptr;
  *data_ptr_bit_type = 0;
  const size_t n_values =
      static_cast<size_t>(num_data_) * static_cast<size_t>(num_feature_);
  CHECK_EQ(n_values, data_.size());
  *total_size = n_values;
  return data_.data();
}
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 256) {
return new SparseBin<uint8_t>(num_data);
} else if (num_bin <= 65536) {
return new SparseBin<uint16_t>(num_data);
} else {
return new SparseBin<uint32_t>(num_data);
}
// Row-wise access for the dense multi-value 16-bit bin: the contiguous
// value buffer is reinterpreted as bytes for the generic return type.
// Dense layout, so the row-pointer outputs are cleared.
template <>
const void* MultiValDenseBin<uint16_t>::GetRowWiseData(uint8_t* bit_type,
                                                       size_t* total_size,
                                                       bool* is_sparse,
                                                       const void** out_data_ptr,
                                                       uint8_t* data_ptr_bit_type) const {
  *bit_type = 16;
  *is_sparse = false;
  *out_data_ptr = nullptr;
  *data_ptr_bit_type = 0;
  const size_t n_values =
      static_cast<size_t>(num_data_) * static_cast<size_t>(num_feature_);
  CHECK_EQ(n_values, data_.size());
  *total_size = n_values;
  return reinterpret_cast<const void*>(data_.data());
}
MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature,
double sparse_rate, const std::vector<uint32_t>& offsets) {
if (sparse_rate >= multi_val_bin_sparse_threshold) {
const double average_element_per_row = (1.0 - sparse_rate) * num_feature;
return CreateMultiValSparseBin(num_data, num_bin,
average_element_per_row);
} else {
return CreateMultiValDenseBin(num_data, num_bin, num_feature, offsets);
}
// Row-wise access for the dense multi-value 32-bit bin; same contract as
// the 8/16-bit specializations, only the reported bit width differs.
template <>
const void* MultiValDenseBin<uint32_t>::GetRowWiseData(uint8_t* bit_type,
                                                       size_t* total_size,
                                                       bool* is_sparse,
                                                       const void** out_data_ptr,
                                                       uint8_t* data_ptr_bit_type) const {
  *bit_type = 32;
  *is_sparse = false;
  *out_data_ptr = nullptr;
  *data_ptr_bit_type = 0;
  const size_t n_values =
      static_cast<size_t>(num_data_) * static_cast<size_t>(num_feature_);
  CHECK_EQ(n_values, data_.size());
  *total_size = n_values;
  return reinterpret_cast<const void*>(data_.data());
}
template <>
const void* MultiValSparseBin<uint16_t, uint8_t>::GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = data_.data();
*bit_type = 8;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 16;
return to_return;
}
template <>
const void* MultiValSparseBin<uint16_t, uint16_t>::GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = reinterpret_cast<const uint8_t*>(data_.data());
*bit_type = 16;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 16;
return to_return;
}
template <>
const void* MultiValSparseBin<uint16_t, uint32_t>::GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = reinterpret_cast<const uint8_t*>(data_.data());
*bit_type = 32;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 16;
return to_return;
}
MultiValBin* MultiValBin::CreateMultiValDenseBin(data_size_t num_data,
int num_bin,
int num_feature,
const std::vector<uint32_t>& offsets) {
// calculate max bin of all features to select the int type in MultiValDenseBin
int max_bin = 0;
for (int i = 0; i < static_cast<int>(offsets.size()) - 1; ++i) {
int feature_bin = offsets[i + 1] - offsets[i];
if (feature_bin > max_bin) {
max_bin = feature_bin;
}
}
template <>
const void* MultiValSparseBin<uint32_t, uint8_t>::GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = data_.data();
*bit_type = 8;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 32;
return to_return;
if (max_bin <= 256) {
return new MultiValDenseBin<uint8_t>(num_data, num_bin, num_feature, offsets);
} else if (max_bin <= 65536) {
return new MultiValDenseBin<uint16_t>(num_data, num_bin, num_feature, offsets);
} else {
return new MultiValDenseBin<uint32_t>(num_data, num_bin, num_feature, offsets);
}
template <>
const void* MultiValSparseBin<uint32_t, uint16_t>::GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = reinterpret_cast<const uint8_t*>(data_.data());
*bit_type = 16;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 32;
return to_return;
}
MultiValBin* MultiValBin::CreateMultiValSparseBin(data_size_t num_data,
int num_bin,
double estimate_element_per_row) {
size_t estimate_total_entries =
static_cast<size_t>(estimate_element_per_row * 1.1 * num_data);
if (estimate_total_entries <= std::numeric_limits<uint16_t>::max()) {
if (num_bin <= 256) {
return new MultiValSparseBin<uint16_t, uint8_t>(
num_data, num_bin, estimate_element_per_row);
} else if (num_bin <= 65536) {
return new MultiValSparseBin<uint16_t, uint16_t>(
num_data, num_bin, estimate_element_per_row);
} else {
return new MultiValSparseBin<uint16_t, uint32_t>(
num_data, num_bin, estimate_element_per_row);
}
} else if (estimate_total_entries <= std::numeric_limits<uint32_t>::max()) {
if (num_bin <= 256) {
return new MultiValSparseBin<uint32_t, uint8_t>(
num_data, num_bin, estimate_element_per_row);
} else if (num_bin <= 65536) {
return new MultiValSparseBin<uint32_t, uint16_t>(
num_data, num_bin, estimate_element_per_row);
} else {
return new MultiValSparseBin<uint32_t, uint32_t>(
num_data, num_bin, estimate_element_per_row);
}
} else {
if (num_bin <= 256) {
return new MultiValSparseBin<size_t, uint8_t>(
num_data, num_bin, estimate_element_per_row);
} else if (num_bin <= 65536) {
return new MultiValSparseBin<size_t, uint16_t>(
num_data, num_bin, estimate_element_per_row);
} else {
return new MultiValSparseBin<size_t, uint32_t>(
num_data, num_bin, estimate_element_per_row);
}
}
template <>
const void* MultiValSparseBin<uint32_t, uint32_t>::GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = reinterpret_cast<const uint8_t*>(data_.data());
*bit_type = 32;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 32;
return to_return;
}
template <>
const void* DenseBin<uint8_t, false>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
std::vector<BinIterator*>* bin_iterator,
const int /*num_threads*/) const {
*is_sparse = false;
*bit_type = 8;
bin_iterator->clear();
return reinterpret_cast<const void*>(data_.data());
}
template <>
const void* DenseBin<uint16_t, false>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
std::vector<BinIterator*>* bin_iterator,
const int /*num_threads*/) const {
*is_sparse = false;
*bit_type = 16;
bin_iterator->clear();
return reinterpret_cast<const void*>(data_.data());
}
template <>
const void* DenseBin<uint32_t, false>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
std::vector<BinIterator*>* bin_iterator,
const int /*num_threads*/) const {
*is_sparse = false;
*bit_type = 32;
bin_iterator->clear();
return reinterpret_cast<const void*>(data_.data());
}
template <>
const void* DenseBin<uint8_t, true>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
std::vector<BinIterator*>* bin_iterator,
const int /*num_threads*/) const {
*is_sparse = false;
*bit_type = 4;
bin_iterator->clear();
return reinterpret_cast<const void*>(data_.data());
}
template <>
const void* DenseBin<uint8_t, false>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
BinIterator** bin_iterator) const {
*is_sparse = false;
*bit_type = 8;
*bin_iterator = nullptr;
return reinterpret_cast<const void*>(data_.data());
}
template <>
const void* DenseBin<uint16_t, false>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
BinIterator** bin_iterator) const {
*is_sparse = false;
*bit_type = 16;
*bin_iterator = nullptr;
return reinterpret_cast<const void*>(data_.data());
}
template <>
const void* DenseBin<uint32_t, false>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
BinIterator** bin_iterator) const {
*is_sparse = false;
*bit_type = 32;
*bin_iterator = nullptr;
return reinterpret_cast<const void*>(data_.data());
}
template <>
const void* DenseBin<uint8_t, true>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
BinIterator** bin_iterator) const {
*is_sparse = false;
*bit_type = 4;
*bin_iterator = nullptr;
return reinterpret_cast<const void*>(data_.data());
}
template <>
const void* SparseBin<uint8_t>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
std::vector<BinIterator*>* bin_iterator,
const int num_threads) const {
*is_sparse = true;
*bit_type = 8;
for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
bin_iterator->emplace_back(new SparseBinIterator<uint8_t>(this, 0));
}
template <>
const void* MultiValSparseBin<uint64_t, uint8_t>::GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = data_.data();
*bit_type = 8;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 64;
return to_return;
return nullptr;
}
template <>
const void* SparseBin<uint16_t>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
std::vector<BinIterator*>* bin_iterator,
const int num_threads) const {
*is_sparse = true;
*bit_type = 16;
for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
bin_iterator->emplace_back(new SparseBinIterator<uint16_t>(this, 0));
}
template <>
const void* MultiValSparseBin<uint64_t, uint16_t>::GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = reinterpret_cast<const uint8_t*>(data_.data());
*bit_type = 16;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 64;
return to_return;
return nullptr;
}
template <>
const void* SparseBin<uint32_t>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
std::vector<BinIterator*>* bin_iterator,
const int num_threads) const {
*is_sparse = true;
*bit_type = 32;
for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
bin_iterator->emplace_back(new SparseBinIterator<uint32_t>(this, 0));
}
template <>
const void* MultiValSparseBin<uint64_t, uint32_t>::GetRowWiseData(
uint8_t* bit_type,
return nullptr;
}
template <>
const void* SparseBin<uint8_t>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
BinIterator** bin_iterator) const {
*is_sparse = true;
*bit_type = 8;
*bin_iterator = new SparseBinIterator<uint8_t>(this, 0);
return nullptr;
}
template <>
const void* SparseBin<uint16_t>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
BinIterator** bin_iterator) const {
*is_sparse = true;
*bit_type = 16;
*bin_iterator = new SparseBinIterator<uint16_t>(this, 0);
return nullptr;
}
template <>
const void* SparseBin<uint32_t>::GetColWiseData(
uint8_t* bit_type,
bool* is_sparse,
BinIterator** bin_iterator) const {
*is_sparse = true;
*bit_type = 32;
*bin_iterator = new SparseBinIterator<uint32_t>(this, 0);
return nullptr;
}
#ifdef USE_CUDA
template <>
const void* MultiValDenseBin<uint8_t>::GetRowWiseData(uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = reinterpret_cast<const uint8_t*>(data_.data());
*bit_type = 32;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 64;
return to_return;
}
#endif // USE_CUDA
const uint8_t* to_return = data_.data();
*bit_type = 8;
*total_size = static_cast<size_t>(num_data_) * static_cast<size_t>(num_feature_);
CHECK_EQ(*total_size, data_.size());
*is_sparse = false;
*out_data_ptr = nullptr;
*data_ptr_bit_type = 0;
return to_return;
}
// Row-wise access for the dense multi-value 16-bit bin: contiguous value
// buffer reinterpreted as bytes; row-pointer outputs cleared (dense layout).
template <>
const void* MultiValDenseBin<uint16_t>::GetRowWiseData(uint8_t* bit_type,
                                                       size_t* total_size,
                                                       bool* is_sparse,
                                                       const void** out_data_ptr,
                                                       uint8_t* data_ptr_bit_type) const {
  *bit_type = 16;
  *is_sparse = false;
  *out_data_ptr = nullptr;
  *data_ptr_bit_type = 0;
  const size_t n_values =
      static_cast<size_t>(num_data_) * static_cast<size_t>(num_feature_);
  CHECK_EQ(n_values, data_.size());
  *total_size = n_values;
  return reinterpret_cast<const void*>(data_.data());
}
// Row-wise access for the dense multi-value 32-bit bin; identical contract
// to the narrower specializations apart from the reported bit width.
template <>
const void* MultiValDenseBin<uint32_t>::GetRowWiseData(uint8_t* bit_type,
                                                       size_t* total_size,
                                                       bool* is_sparse,
                                                       const void** out_data_ptr,
                                                       uint8_t* data_ptr_bit_type) const {
  *bit_type = 32;
  *is_sparse = false;
  *out_data_ptr = nullptr;
  *data_ptr_bit_type = 0;
  const size_t n_values =
      static_cast<size_t>(num_data_) * static_cast<size_t>(num_feature_);
  CHECK_EQ(n_values, data_.size());
  *total_size = n_values;
  return reinterpret_cast<const void*>(data_.data());
}
// Row-wise view of the sparse multi-value bin: returns the 8-bit value
// buffer and exposes the row-pointer array (16-bit entries, per the
// uint16_t index type) through out_data_ptr.
template <>
const void* MultiValSparseBin<uint16_t, uint8_t>::GetRowWiseData(
    uint8_t* bit_type,
    size_t* total_size,
    bool* is_sparse,
    const void** out_data_ptr,
    uint8_t* data_ptr_bit_type) const {
  *is_sparse = true;
  *bit_type = 8;
  *data_ptr_bit_type = 16;
  *total_size = data_.size();
  *out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
  return data_.data();
}
template <>
const void* MultiValSparseBin<uint16_t, uint16_t>::GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = reinterpret_cast<const uint8_t*>(data_.data());
*bit_type = 16;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 16;
return to_return;
}
template <>
const void* MultiValSparseBin<uint16_t, uint32_t>::GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = reinterpret_cast<const uint8_t*>(data_.data());
*bit_type = 32;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 16;
return to_return;
}
template <>
const void* MultiValSparseBin<uint32_t, uint8_t>::GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = data_.data();
*bit_type = 8;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 32;
return to_return;
}
template <>
const void* MultiValSparseBin<uint32_t, uint16_t>::GetRowWiseData(
uint8_t* bit_type,
size_t* total_size,
bool* is_sparse,
const void** out_data_ptr,
uint8_t* data_ptr_bit_type) const {
const uint8_t* to_return = reinterpret_cast<const uint8_t*>(data_.data());
*bit_type = 16;
*total_size = data_.size();
*is_sparse = true;
*out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
*data_ptr_bit_type = 32;
return to_return;
}
template <>
const void* MultiValSparseBin<uint32_t, uint32_t>::GetRowWiseData(
  uint8_t* bit_type,
  size_t* total_size,
  bool* is_sparse,
  const void** out_data_ptr,
  uint8_t* data_ptr_bit_type) const {
  // Expose the raw row-wise (CSR-style) buffers of this sparse bin:
  // bin values are 32-bit wide, row offsets (row_ptr_) are 32-bit wide.
  *is_sparse = true;
  *bit_type = 32;             // width of each element of data_
  *data_ptr_bit_type = 32;    // width of each element of row_ptr_
  *total_size = data_.size();
  *out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
  return reinterpret_cast<const uint8_t*>(data_.data());
}
template <>
const void* MultiValSparseBin<uint64_t, uint8_t>::GetRowWiseData(
  uint8_t* bit_type,
  size_t* total_size,
  bool* is_sparse,
  const void** out_data_ptr,
  uint8_t* data_ptr_bit_type) const {
  // Expose the raw row-wise (CSR-style) buffers of this sparse bin:
  // bin values are 8-bit wide, row offsets (row_ptr_) are 64-bit wide.
  *is_sparse = true;
  *bit_type = 8;              // width of each element of data_
  *data_ptr_bit_type = 64;    // width of each element of row_ptr_
  *total_size = data_.size();
  *out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
  return data_.data();  // already uint8_t, no cast needed
}
template <>
const void* MultiValSparseBin<uint64_t, uint16_t>::GetRowWiseData(
  uint8_t* bit_type,
  size_t* total_size,
  bool* is_sparse,
  const void** out_data_ptr,
  uint8_t* data_ptr_bit_type) const {
  // Expose the raw row-wise (CSR-style) buffers of this sparse bin:
  // bin values are 16-bit wide, row offsets (row_ptr_) are 64-bit wide.
  *is_sparse = true;
  *bit_type = 16;             // width of each element of data_
  *data_ptr_bit_type = 64;    // width of each element of row_ptr_
  *total_size = data_.size();
  *out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
  return reinterpret_cast<const uint8_t*>(data_.data());
}
template <>
const void* MultiValSparseBin<uint64_t, uint32_t>::GetRowWiseData(
  uint8_t* bit_type,
  size_t* total_size,
  bool* is_sparse,
  const void** out_data_ptr,
  uint8_t* data_ptr_bit_type) const {
  // Expose the raw row-wise (CSR-style) buffers of this sparse bin:
  // bin values are 32-bit wide, row offsets (row_ptr_) are 64-bit wide.
  *is_sparse = true;
  *bit_type = 32;             // width of each element of data_
  *data_ptr_bit_type = 64;    // width of each element of row_ptr_
  *total_size = data_.size();
  *out_data_ptr = reinterpret_cast<const uint8_t*>(row_ptr_.data());
  return reinterpret_cast<const uint8_t*>(data_.data());
}
#endif // USE_CUDA
} // namespace LightGBM
......@@ -28,10 +28,8 @@ namespace LightGBM {
const int Dataset::kSerializedReferenceVersionLength = 2;
const char* Dataset::serialized_reference_version = "v1";
const char* Dataset::binary_file_token =
"______LightGBM_Binary_File_Token______\n";
const char* Dataset::binary_serialized_reference_token =
"______LightGBM_Binary_Serialized_Token______\n";
const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";
const char* Dataset::binary_serialized_reference_token = "______LightGBM_Binary_Serialized_Token______\n";
Dataset::Dataset() {
data_filename_ = "noname";
......
......@@ -12,8 +12,8 @@
namespace LightGBM {
CUDABinaryLoglossMetric::CUDABinaryLoglossMetric(const Config& config):
CUDABinaryMetricInterface<BinaryLoglossMetric, CUDABinaryLoglossMetric>(config) {}
CUDABinaryLoglossMetric::CUDABinaryLoglossMetric(
const Config& config):CUDABinaryMetricInterface<BinaryLoglossMetric, CUDABinaryLoglossMetric>(config) {}
template <typename HOST_METRIC, typename CUDA_METRIC>
std::vector<double> CUDABinaryMetricInterface<HOST_METRIC, CUDA_METRIC>::Eval(const double* score, const ObjectiveFunction* objective) const {
......
......@@ -30,40 +30,40 @@
namespace LightGBM {
// label should be in interval [0, 1];
// prob should be in interval (0, 1); prob is clipped if needed
inline static double XentLoss(label_t label, double prob) {
const double log_arg_epsilon = 1.0e-12;
double a = label;
if (prob > log_arg_epsilon) {
a *= std::log(prob);
} else {
a *= std::log(log_arg_epsilon);
}
double b = 1.0f - label;
if (1.0f - prob > log_arg_epsilon) {
b *= std::log(1.0f - prob);
} else {
b *= std::log(log_arg_epsilon);
}
return - (a + b);
// label should be in interval [0, 1];
// prob should be in interval (0, 1); prob is clipped if needed
inline static double XentLoss(label_t label, double prob) {
const double log_arg_epsilon = 1.0e-12;
double a = label;
if (prob > log_arg_epsilon) {
a *= std::log(prob);
} else {
a *= std::log(log_arg_epsilon);
}
// hhat >(=) 0 assumed; and weight > 0 required; but not checked here
inline static double XentLambdaLoss(label_t label, label_t weight, double hhat) {
return XentLoss(label, 1.0f - std::exp(-weight * hhat));
}
// Computes the (negative) entropy for label p; p should be in interval [0, 1];
// This is used to presum the KL-divergence offset term (to be _added_ to the cross-entropy loss).
// NOTE: x*log(x) = 0 for x=0,1; so only add when in (0, 1); avoid log(0)*0
inline static double YentLoss(double p) {
double hp = 0.0;
if (p > 0) hp += p * std::log(p);
double q = 1.0f - p;
if (q > 0) hp += q * std::log(q);
return hp;
double b = 1.0f - label;
if (1.0f - prob > log_arg_epsilon) {
b *= std::log(1.0f - prob);
} else {
b *= std::log(log_arg_epsilon);
}
return - (a + b);
}
// hhat >(=) 0 assumed; and weight > 0 required; but not checked here
inline static double XentLambdaLoss(label_t label, label_t weight, double hhat) {
return XentLoss(label, 1.0f - std::exp(-weight * hhat));
}
// Computes the (negative) entropy for label p; p should be in interval [0, 1];
// This is used to presum the KL-divergence offset term (to be _added_ to the cross-entropy loss).
// NOTE: x*log(x) = 0 for x=0,1; so only add when in (0, 1); avoid log(0)*0
inline static double YentLoss(double p) {
double hp = 0.0;
if (p > 0) hp += p * std::log(p);
double q = 1.0f - p;
if (q > 0) hp += q * std::log(q);
return hp;
}
//
// CrossEntropyMetric : "xentropy" : (optional) weights are used linearly
......
......@@ -12,8 +12,7 @@
namespace LightGBM {
template <typename TREELEARNER_T>
DataParallelTreeLearner<TREELEARNER_T>::DataParallelTreeLearner(const Config* config)
:TREELEARNER_T(config) {
DataParallelTreeLearner<TREELEARNER_T>::DataParallelTreeLearner(const Config* config):TREELEARNER_T(config) {
}
template <typename TREELEARNER_T>
......
......@@ -384,8 +384,11 @@ void FeatureHistogram::FindBestThresholdCategoricalInner(double sum_gradient,
}
}
template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING, typename PACKED_HIST_BIN_T, typename PACKED_HIST_ACC_T,
typename HIST_BIN_T, typename HIST_ACC_T, int HIST_BITS_BIN, int HIST_BITS_ACC>
template <
bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING,
typename PACKED_HIST_BIN_T, typename PACKED_HIST_ACC_T, typename HIST_BIN_T, typename HIST_ACC_T,
int HIST_BITS_BIN, int HIST_BITS_ACC
>
void FeatureHistogram::FindBestThresholdCategoricalIntInner(int64_t int_sum_gradient_and_hessian,
const double grad_scale, const double hess_scale,
data_size_t num_data,
......
......@@ -11,8 +11,7 @@ namespace LightGBM {
template <typename TREELEARNER_T>
FeatureParallelTreeLearner<TREELEARNER_T>::FeatureParallelTreeLearner(const Config* config)
:TREELEARNER_T(config) {
FeatureParallelTreeLearner<TREELEARNER_T>::FeatureParallelTreeLearner(const Config* config):TREELEARNER_T(config) {
}
template <typename TREELEARNER_T>
......
......@@ -15,8 +15,7 @@
namespace LightGBM {
template <typename TREELEARNER_T>
VotingParallelTreeLearner<TREELEARNER_T>::VotingParallelTreeLearner(const Config* config)
:TREELEARNER_T(config) {
VotingParallelTreeLearner<TREELEARNER_T>::VotingParallelTreeLearner(const Config* config):TREELEARNER_T(config) {
top_k_ = this->config_->top_k;
}
......
......@@ -18,423 +18,423 @@ using LightGBM::Random;
namespace LightGBM {
/*!
* Creates a Dataset from the internal repository examples.
*/
int TestUtils::LoadDatasetFromExamples(const char* filename, const char* config, DatasetHandle* out) {
std::string fullPath("examples/");
fullPath += filename;
Log::Info("Debug sample data path: %s", fullPath.c_str());
return LGBM_DatasetCreateFromFile(
fullPath.c_str(),
config,
nullptr,
out);
}
/*!
* Creates a Dataset from the internal repository examples.
*/
int TestUtils::LoadDatasetFromExamples(const char* filename, const char* config, DatasetHandle* out) {
std::string fullPath("examples/");
fullPath += filename;
Log::Info("Debug sample data path: %s", fullPath.c_str());
return LGBM_DatasetCreateFromFile(
fullPath.c_str(),
config,
nullptr,
out);
}
/*!
* Creates fake data in the passed vectors.
*/
void TestUtils::CreateRandomDenseData(
int32_t nrows,
int32_t ncols,
int32_t nclasses,
std::vector<double>* features,
std::vector<float>* labels,
std::vector<float>* weights,
std::vector<double>* init_scores,
std::vector<int32_t>* groups) {
Random rand(42);
features->reserve(nrows * ncols);
for (int32_t row = 0; row < nrows; row++) {
for (int32_t col = 0; col < ncols; col++) {
features->push_back(rand.NextFloat());
}
/*!
* Creates fake data in the passed vectors.
*/
void TestUtils::CreateRandomDenseData(
int32_t nrows,
int32_t ncols,
int32_t nclasses,
std::vector<double>* features,
std::vector<float>* labels,
std::vector<float>* weights,
std::vector<double>* init_scores,
std::vector<int32_t>* groups) {
Random rand(42);
features->reserve(nrows * ncols);
for (int32_t row = 0; row < nrows; row++) {
for (int32_t col = 0; col < ncols; col++) {
features->push_back(rand.NextFloat());
}
CreateRandomMetadata(nrows, nclasses, labels, weights, init_scores, groups);
}
/*!
* Creates fake data in the passed vectors.
*/
void TestUtils::CreateRandomSparseData(
int32_t nrows,
int32_t ncols,
int32_t nclasses,
float sparse_percent,
std::vector<int32_t>* indptr,
std::vector<int32_t>* indices,
std::vector<double>* values,
std::vector<float>* labels,
std::vector<float>* weights,
std::vector<double>* init_scores,
std::vector<int32_t>* groups) {
Random rand(42);
indptr->reserve(static_cast<int32_t>(nrows + 1));
indices->reserve(static_cast<int32_t>(sparse_percent * nrows * ncols));
values->reserve(static_cast<int32_t>(sparse_percent * nrows * ncols));
indptr->push_back(0);
for (int32_t row = 0; row < nrows; row++) {
for (int32_t col = 0; col < ncols; col++) {
float rnd = rand.NextFloat();
if (rnd < sparse_percent) {
indices->push_back(col);
values->push_back(rand.NextFloat());
}
CreateRandomMetadata(nrows, nclasses, labels, weights, init_scores, groups);
}
/*!
* Creates fake data in the passed vectors.
*/
void TestUtils::CreateRandomSparseData(
int32_t nrows,
int32_t ncols,
int32_t nclasses,
float sparse_percent,
std::vector<int32_t>* indptr,
std::vector<int32_t>* indices,
std::vector<double>* values,
std::vector<float>* labels,
std::vector<float>* weights,
std::vector<double>* init_scores,
std::vector<int32_t>* groups) {
Random rand(42);
indptr->reserve(static_cast<int32_t>(nrows + 1));
indices->reserve(static_cast<int32_t>(sparse_percent * nrows * ncols));
values->reserve(static_cast<int32_t>(sparse_percent * nrows * ncols));
indptr->push_back(0);
for (int32_t row = 0; row < nrows; row++) {
for (int32_t col = 0; col < ncols; col++) {
float rnd = rand.NextFloat();
if (rnd < sparse_percent) {
indices->push_back(col);
values->push_back(rand.NextFloat());
}
indptr->push_back(static_cast<int32_t>(indices->size() - 1));
}
indptr->push_back(static_cast<int32_t>(indices->size() - 1));
}
CreateRandomMetadata(nrows, nclasses, labels, weights, init_scores, groups);
}
CreateRandomMetadata(nrows, nclasses, labels, weights, init_scores, groups);
/*!
* Creates fake data in the passed vectors.
*/
void TestUtils::CreateRandomMetadata(int32_t nrows,
int32_t nclasses,
std::vector<float>* labels,
std::vector<float>* weights,
std::vector<double>* init_scores,
std::vector<int32_t>* groups) {
Random rand(42);
labels->reserve(nrows);
if (weights) {
weights->reserve(nrows);
}
if (init_scores) {
init_scores->reserve(nrows * nclasses);
}
if (groups) {
groups->reserve(nrows);
}
/*!
* Creates fake data in the passed vectors.
*/
void TestUtils::CreateRandomMetadata(int32_t nrows,
int32_t nclasses,
std::vector<float>* labels,
std::vector<float>* weights,
std::vector<double>* init_scores,
std::vector<int32_t>* groups) {
Random rand(42);
labels->reserve(nrows);
int32_t group = 0;
for (int32_t row = 0; row < nrows; row++) {
labels->push_back(rand.NextFloat());
if (weights) {
weights->reserve(nrows);
weights->push_back(rand.NextFloat());
}
if (init_scores) {
init_scores->reserve(nrows * nclasses);
for (int32_t i = 0; i < nclasses; i++) {
init_scores->push_back(rand.NextFloat());
}
}
if (groups) {
groups->reserve(nrows);
if (rand.NextFloat() > 0.95) {
group++;
}
groups->push_back(group);
}
}
}
void TestUtils::StreamDenseDataset(DatasetHandle dataset_handle,
int32_t nrows,
int32_t ncols,
int32_t nclasses,
int32_t batch_count,
const std::vector<double>* features,
const std::vector<float>* labels,
const std::vector<float>* weights,
const std::vector<double>* init_scores,
const std::vector<int32_t>* groups) {
int result = LGBM_DatasetSetWaitForManualFinish(dataset_handle, 1);
EXPECT_EQ(0, result) << "LGBM_DatasetSetWaitForManualFinish result code: " << result;
Log::Info(" Begin StreamDenseDataset");
if ((nrows % batch_count) != 0) {
Log::Fatal("This utility method only handles nrows that are a multiple of batch_count");
}
int32_t group = 0;
const double* features_ptr = features->data();
const float* labels_ptr = labels->data();
const float* weights_ptr = nullptr;
if (weights) {
weights_ptr = weights->data();
}
for (int32_t row = 0; row < nrows; row++) {
labels->push_back(rand.NextFloat());
if (weights) {
weights->push_back(rand.NextFloat());
}
if (init_scores) {
for (int32_t i = 0; i < nclasses; i++) {
init_scores->push_back(rand.NextFloat());
}
}
if (groups) {
if (rand.NextFloat() > 0.95) {
group++;
}
groups->push_back(group);
}
}
// Since init_scores are in a column format, but need to be pushed as rows, we have to extract each batch
std::vector<double> init_score_batch;
const double* init_scores_ptr = nullptr;
if (init_scores) {
init_score_batch.reserve(nclasses * batch_count);
init_scores_ptr = init_score_batch.data();
}
void TestUtils::StreamDenseDataset(DatasetHandle dataset_handle,
int32_t nrows,
int32_t ncols,
int32_t nclasses,
int32_t batch_count,
const std::vector<double>* features,
const std::vector<float>* labels,
const std::vector<float>* weights,
const std::vector<double>* init_scores,
const std::vector<int32_t>* groups) {
int result = LGBM_DatasetSetWaitForManualFinish(dataset_handle, 1);
EXPECT_EQ(0, result) << "LGBM_DatasetSetWaitForManualFinish result code: " << result;
Log::Info(" Begin StreamDenseDataset");
if ((nrows % batch_count) != 0) {
Log::Fatal("This utility method only handles nrows that are a multiple of batch_count");
}
const int32_t* groups_ptr = nullptr;
if (groups) {
groups_ptr = groups->data();
}
const double* features_ptr = features->data();
const float* labels_ptr = labels->data();
const float* weights_ptr = nullptr;
if (weights) {
weights_ptr = weights->data();
}
auto start_time = std::chrono::steady_clock::now();
// Since init_scores are in a column format, but need to be pushed as rows, we have to extract each batch
std::vector<double> init_score_batch;
const double* init_scores_ptr = nullptr;
for (int32_t i = 0; i < nrows; i += batch_count) {
if (init_scores) {
init_score_batch.reserve(nclasses * batch_count);
init_scores_ptr = init_score_batch.data();
init_scores_ptr = CreateInitScoreBatch(&init_score_batch, i, nrows, nclasses, batch_count, init_scores);
}
const int32_t* groups_ptr = nullptr;
if (groups) {
groups_ptr = groups->data();
result = LGBM_DatasetPushRowsWithMetadata(dataset_handle,
features_ptr,
1,
batch_count,
ncols,
i,
labels_ptr,
weights_ptr,
init_scores_ptr,
groups_ptr,
0);
EXPECT_EQ(0, result) << "LGBM_DatasetPushRowsWithMetadata result code: " << result;
if (result != 0) {
FAIL() << "LGBM_DatasetPushRowsWithMetadata failed"; // This forces an immediate failure, which EXPECT_EQ does not
}
auto start_time = std::chrono::steady_clock::now();
for (int32_t i = 0; i < nrows; i += batch_count) {
if (init_scores) {
init_scores_ptr = CreateInitScoreBatch(&init_score_batch, i, nrows, nclasses, batch_count, init_scores);
}
result = LGBM_DatasetPushRowsWithMetadata(dataset_handle,
features_ptr,
1,
batch_count,
ncols,
i,
labels_ptr,
weights_ptr,
init_scores_ptr,
groups_ptr,
0);
EXPECT_EQ(0, result) << "LGBM_DatasetPushRowsWithMetadata result code: " << result;
if (result != 0) {
FAIL() << "LGBM_DatasetPushRowsWithMetadata failed"; // This forces an immediate failure, which EXPECT_EQ does not
}
features_ptr += batch_count * ncols;
labels_ptr += batch_count;
if (weights_ptr) {
weights_ptr += batch_count;
}
if (groups_ptr) {
groups_ptr += batch_count;
}
features_ptr += batch_count * ncols;
labels_ptr += batch_count;
if (weights_ptr) {
weights_ptr += batch_count;
}
if (groups_ptr) {
groups_ptr += batch_count;
}
}
auto cur_time = std::chrono::steady_clock::now();
Log::Info(" Time: %d", cur_time - start_time);
auto cur_time = std::chrono::steady_clock::now();
Log::Info(" Time: %d", cur_time - start_time);
}
void TestUtils::StreamSparseDataset(DatasetHandle dataset_handle,
int32_t nrows,
int32_t nclasses,
int32_t batch_count,
const std::vector<int32_t>* indptr,
const std::vector<int32_t>* indices,
const std::vector<double>* values,
const std::vector<float>* labels,
const std::vector<float>* weights,
const std::vector<double>* init_scores,
const std::vector<int32_t>* groups) {
int result = LGBM_DatasetSetWaitForManualFinish(dataset_handle, 1);
EXPECT_EQ(0, result) << "LGBM_DatasetSetWaitForManualFinish result code: " << result;
Log::Info(" Begin StreamSparseDataset");
if ((nrows % batch_count) != 0) {
Log::Fatal("This utility method only handles nrows that are a multiple of batch_count");
}
void TestUtils::StreamSparseDataset(DatasetHandle dataset_handle,
int32_t nrows,
int32_t nclasses,
int32_t batch_count,
const std::vector<int32_t>* indptr,
const std::vector<int32_t>* indices,
const std::vector<double>* values,
const std::vector<float>* labels,
const std::vector<float>* weights,
const std::vector<double>* init_scores,
const std::vector<int32_t>* groups) {
int result = LGBM_DatasetSetWaitForManualFinish(dataset_handle, 1);
EXPECT_EQ(0, result) << "LGBM_DatasetSetWaitForManualFinish result code: " << result;
Log::Info(" Begin StreamSparseDataset");
if ((nrows % batch_count) != 0) {
Log::Fatal("This utility method only handles nrows that are a multiple of batch_count");
}
const int32_t* indptr_ptr = indptr->data();
const int32_t* indices_ptr = indices->data();
const double* values_ptr = values->data();
const float* labels_ptr = labels->data();
const float* weights_ptr = nullptr;
if (weights) {
weights_ptr = weights->data();
}
const int32_t* indptr_ptr = indptr->data();
const int32_t* indices_ptr = indices->data();
const double* values_ptr = values->data();
const float* labels_ptr = labels->data();
const float* weights_ptr = nullptr;
if (weights) {
weights_ptr = weights->data();
}
const int32_t* groups_ptr = nullptr;
if (groups) {
groups_ptr = groups->data();
}
const int32_t* groups_ptr = nullptr;
if (groups) {
groups_ptr = groups->data();
}
auto start_time = std::chrono::steady_clock::now();
auto start_time = std::chrono::steady_clock::now();
// Use multiple threads to test concurrency
int thread_count = 2;
if (nrows == batch_count) {
thread_count = 1; // If pushing all rows in 1 batch, we cannot have multiple threads
}
std::vector<std::thread> threads;
threads.reserve(thread_count);
for (int32_t t = 0; t < thread_count; ++t) {
std::thread th(TestUtils::PushSparseBatch,
dataset_handle,
nrows,
nclasses,
batch_count,
indptr,
indptr_ptr,
indices_ptr,
values_ptr,
labels_ptr,
weights_ptr,
init_scores,
groups_ptr,
thread_count,
t);
threads.push_back(std::move(th));
}
// Use multiple threads to test concurrency
int thread_count = 2;
if (nrows == batch_count) {
thread_count = 1; // If pushing all rows in 1 batch, we cannot have multiple threads
}
std::vector<std::thread> threads;
threads.reserve(thread_count);
for (int32_t t = 0; t < thread_count; ++t) {
std::thread th(TestUtils::PushSparseBatch,
dataset_handle,
nrows,
nclasses,
batch_count,
indptr,
indptr_ptr,
indices_ptr,
values_ptr,
labels_ptr,
weights_ptr,
init_scores,
groups_ptr,
thread_count,
t);
threads.push_back(std::move(th));
}
for (auto& t : threads) t.join();
for (auto& t : threads) t.join();
auto cur_time = std::chrono::steady_clock::now();
Log::Info(" Time: %d", cur_time - start_time);
}
auto cur_time = std::chrono::steady_clock::now();
Log::Info(" Time: %d", cur_time - start_time);
/*!
* Pushes data from 1 thread into a Dataset based on thread_id and nrows.
* e.g. with 100 rows, thread 0 will push rows 0-49, and thread 2 will push rows 50-99.
* Note that rows are still pushed in microbatches within their range.
*/
void TestUtils::PushSparseBatch(DatasetHandle dataset_handle,
int32_t nrows,
int32_t nclasses,
int32_t batch_count,
const std::vector<int32_t>* indptr,
const int32_t* indptr_ptr,
const int32_t* indices_ptr,
const double* values_ptr,
const float* labels_ptr,
const float* weights_ptr,
const std::vector<double>* init_scores,
const int32_t* groups_ptr,
int32_t thread_count,
int32_t thread_id) {
int32_t threadChunkSize = nrows / thread_count;
int32_t startIndex = threadChunkSize * thread_id;
int32_t stopIndex = startIndex + threadChunkSize;
indptr_ptr += threadChunkSize * thread_id;
labels_ptr += threadChunkSize * thread_id;
if (weights_ptr) {
weights_ptr += threadChunkSize * thread_id;
}
if (groups_ptr) {
groups_ptr += threadChunkSize * thread_id;
}
/*!
* Pushes data from 1 thread into a Dataset based on thread_id and nrows.
* e.g. with 100 rows, thread 0 will push rows 0-49, and thread 2 will push rows 50-99.
* Note that rows are still pushed in microbatches within their range.
*/
void TestUtils::PushSparseBatch(DatasetHandle dataset_handle,
int32_t nrows,
int32_t nclasses,
int32_t batch_count,
const std::vector<int32_t>* indptr,
const int32_t* indptr_ptr,
const int32_t* indices_ptr,
const double* values_ptr,
const float* labels_ptr,
const float* weights_ptr,
const std::vector<double>* init_scores,
const int32_t* groups_ptr,
int32_t thread_count,
int32_t thread_id) {
int32_t threadChunkSize = nrows / thread_count;
int32_t startIndex = threadChunkSize * thread_id;
int32_t stopIndex = startIndex + threadChunkSize;
indptr_ptr += threadChunkSize * thread_id;
labels_ptr += threadChunkSize * thread_id;
for (int32_t i = startIndex; i < stopIndex; i += batch_count) {
// Since init_scores are in a column format, but need to be pushed as rows, we have to extract each batch
std::vector<double> init_score_batch;
const double* init_scores_ptr = nullptr;
if (init_scores) {
init_score_batch.reserve(nclasses * batch_count);
init_scores_ptr = CreateInitScoreBatch(&init_score_batch, i, nrows, nclasses, batch_count, init_scores);
}
int32_t nelem = indptr->at(i + batch_count - 1) - indptr->at(i);
int result = LGBM_DatasetPushRowsByCSRWithMetadata(dataset_handle,
indptr_ptr,
2,
indices_ptr,
values_ptr,
1,
batch_count + 1,
nelem,
i,
labels_ptr,
weights_ptr,
init_scores_ptr,
groups_ptr,
thread_id);
EXPECT_EQ(0, result) << "LGBM_DatasetPushRowsByCSRWithMetadata result code: " << result;
if (result != 0) {
FAIL() << "LGBM_DatasetPushRowsByCSRWithMetadata failed"; // This forces an immediate failure, which EXPECT_EQ does not
}
indptr_ptr += batch_count;
labels_ptr += batch_count;
if (weights_ptr) {
weights_ptr += threadChunkSize * thread_id;
weights_ptr += batch_count;
}
if (groups_ptr) {
groups_ptr += threadChunkSize * thread_id;
groups_ptr += batch_count;
}
for (int32_t i = startIndex; i < stopIndex; i += batch_count) {
// Since init_scores are in a column format, but need to be pushed as rows, we have to extract each batch
std::vector<double> init_score_batch;
const double* init_scores_ptr = nullptr;
if (init_scores) {
init_score_batch.reserve(nclasses * batch_count);
init_scores_ptr = CreateInitScoreBatch(&init_score_batch, i, nrows, nclasses, batch_count, init_scores);
}
int32_t nelem = indptr->at(i + batch_count - 1) - indptr->at(i);
int result = LGBM_DatasetPushRowsByCSRWithMetadata(dataset_handle,
indptr_ptr,
2,
indices_ptr,
values_ptr,
1,
batch_count + 1,
nelem,
i,
labels_ptr,
weights_ptr,
init_scores_ptr,
groups_ptr,
thread_id);
EXPECT_EQ(0, result) << "LGBM_DatasetPushRowsByCSRWithMetadata result code: " << result;
if (result != 0) {
FAIL() << "LGBM_DatasetPushRowsByCSRWithMetadata failed"; // This forces an immediate failure, which EXPECT_EQ does not
}
indptr_ptr += batch_count;
labels_ptr += batch_count;
if (weights_ptr) {
weights_ptr += batch_count;
}
if (groups_ptr) {
groups_ptr += batch_count;
}
}
}
void TestUtils::AssertMetadata(const Metadata* metadata,
const std::vector<float>* ref_labels,
const std::vector<float>* ref_weights,
const std::vector<double>* ref_init_scores,
const std::vector<int32_t>* ref_groups) {
const float* labels = metadata->label();
auto nTotal = static_cast<int32_t>(ref_labels->size());
for (auto i = 0; i < nTotal; i++) {
EXPECT_EQ(ref_labels->at(i), labels[i]) << "Inserted data: " << ref_labels->at(i) << " at " << i;
if (ref_labels->at(i) != labels[i]) {
FAIL() << "Mismatched labels"; // This forces an immediate failure, which EXPECT_EQ does not
}
}
void TestUtils::AssertMetadata(const Metadata* metadata,
const std::vector<float>* ref_labels,
const std::vector<float>* ref_weights,
const std::vector<double>* ref_init_scores,
const std::vector<int32_t>* ref_groups) {
const float* labels = metadata->label();
auto nTotal = static_cast<int32_t>(ref_labels->size());
const float* weights = metadata->weights();
if (weights) {
if (!ref_weights) {
FAIL() << "Expected null weights";
}
for (auto i = 0; i < nTotal; i++) {
EXPECT_EQ(ref_labels->at(i), labels[i]) << "Inserted data: " << ref_labels->at(i) << " at " << i;
if (ref_labels->at(i) != labels[i]) {
FAIL() << "Mismatched labels"; // This forces an immediate failure, which EXPECT_EQ does not
EXPECT_EQ(ref_weights->at(i), weights[i]) << "Inserted data: " << ref_weights->at(i);
if (ref_weights->at(i) != weights[i]) {
FAIL() << "Mismatched weights"; // This forces an immediate failure, which EXPECT_EQ does not
}
}
} else if (ref_weights) {
FAIL() << "Expected non-null weights";
}
const float* weights = metadata->weights();
if (weights) {
if (!ref_weights) {
FAIL() << "Expected null weights";
}
for (auto i = 0; i < nTotal; i++) {
EXPECT_EQ(ref_weights->at(i), weights[i]) << "Inserted data: " << ref_weights->at(i);
if (ref_weights->at(i) != weights[i]) {
FAIL() << "Mismatched weights"; // This forces an immediate failure, which EXPECT_EQ does not
}
}
} else if (ref_weights) {
FAIL() << "Expected non-null weights";
const double* init_scores = metadata->init_score();
if (init_scores) {
if (!ref_init_scores) {
FAIL() << "Expected null init_scores";
}
const double* init_scores = metadata->init_score();
if (init_scores) {
if (!ref_init_scores) {
FAIL() << "Expected null init_scores";
}
for (size_t i = 0; i < ref_init_scores->size(); i++) {
EXPECT_EQ(ref_init_scores->at(i), init_scores[i]) << "Inserted data: " << ref_init_scores->at(i) << " Index: " << i;
if (ref_init_scores->at(i) != init_scores[i]) {
FAIL() << "Mismatched init_scores"; // This forces an immediate failure, which EXPECT_EQ does not
}
for (size_t i = 0; i < ref_init_scores->size(); i++) {
EXPECT_EQ(ref_init_scores->at(i), init_scores[i]) << "Inserted data: " << ref_init_scores->at(i) << " Index: " << i;
if (ref_init_scores->at(i) != init_scores[i]) {
FAIL() << "Mismatched init_scores"; // This forces an immediate failure, which EXPECT_EQ does not
}
} else if (ref_init_scores) {
FAIL() << "Expected non-null init_scores";
}
} else if (ref_init_scores) {
FAIL() << "Expected non-null init_scores";
}
const int32_t* query_boundaries = metadata->query_boundaries();
if (query_boundaries) {
if (!ref_groups) {
FAIL() << "Expected null query_boundaries";
}
// Calculate expected boundaries
std::vector<int32_t> ref_query_boundaries;
ref_query_boundaries.push_back(0);
int group_val = ref_groups->at(0);
for (auto i = 1; i < nTotal; i++) {
if (ref_groups->at(i) != group_val) {
ref_query_boundaries.push_back(i);
group_val = ref_groups->at(i);
}
const int32_t* query_boundaries = metadata->query_boundaries();
if (query_boundaries) {
if (!ref_groups) {
FAIL() << "Expected null query_boundaries";
}
// Calculate expected boundaries
std::vector<int32_t> ref_query_boundaries;
ref_query_boundaries.push_back(0);
int group_val = ref_groups->at(0);
for (auto i = 1; i < nTotal; i++) {
if (ref_groups->at(i) != group_val) {
ref_query_boundaries.push_back(i);
group_val = ref_groups->at(i);
}
ref_query_boundaries.push_back(nTotal);
}
ref_query_boundaries.push_back(nTotal);
for (size_t i = 0; i < ref_query_boundaries.size(); i++) {
EXPECT_EQ(ref_query_boundaries[i], query_boundaries[i]) << "Inserted data: " << ref_query_boundaries[i];
if (ref_query_boundaries[i] != query_boundaries[i]) {
FAIL() << "Mismatched query_boundaries"; // This forces an immediate failure, which EXPECT_EQ does not
}
for (size_t i = 0; i < ref_query_boundaries.size(); i++) {
EXPECT_EQ(ref_query_boundaries[i], query_boundaries[i]) << "Inserted data: " << ref_query_boundaries[i];
if (ref_query_boundaries[i] != query_boundaries[i]) {
FAIL() << "Mismatched query_boundaries"; // This forces an immediate failure, which EXPECT_EQ does not
}
} else if (ref_groups) {
FAIL() << "Expected non-null query_boundaries";
}
} else if (ref_groups) {
FAIL() << "Expected non-null query_boundaries";
}
const double* TestUtils::CreateInitScoreBatch(std::vector<double>* init_score_batch,
int32_t index,
int32_t nrows,
int32_t nclasses,
int32_t batch_count,
const std::vector<double>* original_init_scores) {
// Extract a set of rows from the column-based format (still maintaining column based format)
init_score_batch->clear();
for (int32_t c = 0; c < nclasses; c++) {
for (int32_t row = index; row < index + batch_count; row++) {
init_score_batch->push_back(original_init_scores->at(row + nrows * c));
}
}
const double* TestUtils::CreateInitScoreBatch(std::vector<double>* init_score_batch,
int32_t index,
int32_t nrows,
int32_t nclasses,
int32_t batch_count,
const std::vector<double>* original_init_scores) {
// Extract a set of rows from the column-based format (still maintaining column based format)
init_score_batch->clear();
for (int32_t c = 0; c < nclasses; c++) {
for (int32_t row = index; row < index + batch_count; row++) {
init_score_batch->push_back(original_init_scores->at(row + nrows * c));
}
return init_score_batch->data();
}
return init_score_batch->data();
}
} // namespace LightGBM
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment