Unverified Commit bcad692e authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

Speed-up "Split" and some code refactorings (#2883)

* commit

* fix msvc

* fix format
parent 1a48fd26
......@@ -218,61 +218,6 @@ class BinMapper {
uint32_t most_freq_bin_;
};
/*!
* \brief Interface for ordered bin data. Efficient for constructing histograms, especially for sparse bins.
* There are 2 advantages to using an ordered bin:
* 1. the data are grouped by leaf, which improves the cache hit rate.
* 2. only the non-zero bins are stored, which speeds up histogram construction for sparse features.
* However it brings an additional cost: the bins need to be re-ordered after every split, which is expensive for dense features.
* So ordered bins are only used in sparse situations.
*/
class OrderedBin {
public:
/*! \brief virtual destructor */
virtual ~OrderedBin() {}
/*!
* \brief Initialization logic.
* \param used_indices If used_indices.size() == 0, all data are used; otherwise used_indices[i] == true means the i-th datum is used
(this logic was built for the bagging logic)
* \param num_leaves Number of leaves on this iteration
*/
virtual void Init(const char* used_indices, data_size_t num_leaves) = 0;
/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index within one leaf for a sparse bin, since zero bins are skipped.
* \param leaf Which leaf's data to use for construction
* \param gradients Gradients, note: NOT ordered by leaf
* \param hessians Hessians, note: NOT ordered by leaf
* \param out Output result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, hist_t* out) const = 0;
/*!
* \brief Construct histogram by using this bin, without hessians
* (implementations presumably use the hessian slots for counts — see concrete Bin classes).
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index within one leaf for a sparse bin, since zero bins are skipped.
* \param leaf Which leaf's data to use for construction
* \param gradients Gradients, note: NOT ordered by leaf
* \param out Output result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients, hist_t* out) const = 0;
/*!
* \brief Split the current bin's data, and re-order them by leaf
* \param leaf Index of the leaf being split
* \param right_leaf The new (right) leaf index created by this split
* \param is_in_leaf is_in_leaf[i] == mark means the i-th datum goes to the left leaf after the split
* \param mark Marker value compared against entries of is_in_leaf
*/
virtual void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) = 0;
/*! \brief Number of non-zero entries stored for the given leaf */
virtual data_size_t NonZeroCount(int leaf) const = 0;
};
/*! \brief Iterator for one bin column */
class BinIterator {
public:
......@@ -382,43 +327,33 @@ class Bin {
virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin for feature value 0
* \param most_freq_bin
* \param missing_type missing type
* \param default_left missing bin will go to left child
* \param threshold The split threshold.
* \param data_indices Used data indices. After called this function. The less than or equal data indices will store on this object.
* \param num_data Number of used data
* \param lte_indices After called this function. The less or equal data indices will store on this object.
* \param gt_indices After called this function. The greater data indices will store on this object.
* \return The number of less than or equal data.
*/
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, uint32_t most_freq_bin, MissingType missing_type, bool default_left, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
uint32_t default_bin, uint32_t most_freq_bin,
MissingType missing_type, bool default_left,
uint32_t threshold, const data_size_t* data_indices,
data_size_t cnt,
data_size_t* lte_indices,
data_size_t* gt_indices) const = 0;
virtual data_size_t SplitCategorical(
uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin,
const uint32_t* threshold, int num_threshold,
const data_size_t* data_indices, data_size_t cnt,
data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
virtual data_size_t Split(uint32_t max_bin, uint32_t default_bin,
uint32_t most_freq_bin, MissingType missing_type,
bool default_left, uint32_t threshold,
const data_size_t* data_indices, data_size_t cnt,
data_size_t* lte_indices,
data_size_t* gt_indices) const = 0;
virtual data_size_t SplitCategorical(
uint32_t max_bin, uint32_t most_freq_bin, const uint32_t* threshold,
int num_threshold, const data_size_t* data_indices, data_size_t cnt,
data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param most_freq_bin
* \param threshold The split threshold.
* \param num_threshold Number of threshold
* \param data_indices Used data indices. After called this function. The less than or equal data indices will store on this object.
* \param num_data Number of used data
* \param lte_indices After called this function. The less or equal data indices will store on this object.
* \param gt_indices After called this function. The greater data indices will store on this object.
* \return The number of less than or equal data.
*/
virtual data_size_t SplitCategorical(uint32_t min_bin, uint32_t max_bin,
uint32_t most_freq_bin, const uint32_t* threshold, int num_threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
/*!
* \brief After pushed all feature data, call this could have better refactor for bin data
......
......@@ -535,13 +535,16 @@ class Dataset {
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
inline data_size_t Split(int feature,
const uint32_t* threshold, int num_threshold, bool default_left,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
inline data_size_t Split(int feature, const uint32_t* threshold,
int num_threshold, bool default_left,
const data_size_t* data_indices,
data_size_t cnt, data_size_t* lte_indices,
data_size_t* gt_indices) const {
const int group = feature2group_[feature];
const int sub_feature = feature2subfeature_[feature];
return feature_groups_[group]->Split(sub_feature, threshold, num_threshold, default_left, data_indices, num_data, lte_indices, gt_indices);
return feature_groups_[group]->Split(
sub_feature, threshold, num_threshold, default_left, data_indices,
cnt, lte_indices, gt_indices);
}
inline int SubFeatureBinOffset(int i) const {
......
......@@ -228,13 +228,11 @@ class FeatureGroup {
return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
}
inline data_size_t Split(
int sub_feature,
const uint32_t* threshold,
int num_threshold,
bool default_left,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
inline data_size_t Split(int sub_feature, const uint32_t* threshold,
int num_threshold, bool default_left,
const data_size_t* data_indices, data_size_t cnt,
data_size_t* lte_indices,
data_size_t* gt_indices) const {
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
if (!is_multi_val_) {
......@@ -242,21 +240,38 @@ class FeatureGroup {
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
*threshold, data_indices, num_data, lte_indices, gt_indices);
if (num_feature_ == 1) {
return bin_data_->Split(max_bin, default_bin, most_freq_bin,
missing_type, default_left, *threshold,
data_indices, cnt, lte_indices, gt_indices);
} else {
return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin,
missing_type, default_left, *threshold,
data_indices, cnt, lte_indices, gt_indices);
}
} else {
return bin_data_->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
if (num_feature_ == 1) {
return bin_data_->SplitCategorical(max_bin, most_freq_bin, threshold,
num_threshold, data_indices, cnt,
lte_indices, gt_indices);
} else {
return bin_data_->SplitCategorical(
min_bin, max_bin, most_freq_bin, threshold, num_threshold,
data_indices, cnt, lte_indices, gt_indices);
}
}
} else {
int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
uint32_t min_bin = 1;
uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return multi_bin_data_[sub_feature]->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
*threshold, data_indices, num_data, lte_indices, gt_indices);
return multi_bin_data_[sub_feature]->Split(
max_bin, default_bin, most_freq_bin, missing_type, default_left,
*threshold, data_indices, cnt, lte_indices, gt_indices);
} else {
return multi_bin_data_[sub_feature]->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
return multi_bin_data_[sub_feature]->SplitCategorical(
max_bin, most_freq_bin, threshold, num_threshold, data_indices, cnt,
lte_indices, gt_indices);
}
}
}
......
......@@ -75,6 +75,9 @@ const int kAlignedSize = 32;
#define SIZE_ALIGNED(t) ((t) + kAlignedSize - 1) / kAlignedSize * kAlignedSize
// Refer to https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-4-c4127?view=vs-2019
#pragma warning(disable : 4127)
} // namespace LightGBM
#endif // LightGBM_META_H_
......@@ -14,7 +14,6 @@
#include <cstring>
#include "dense_bin.hpp"
#include "dense_nbits_bin.hpp"
#include "multi_val_dense_bin.hpp"
#include "multi_val_sparse_bin.hpp"
#include "sparse_bin.hpp"
......@@ -633,9 +632,10 @@ namespace LightGBM {
return ret;
}
template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;
template class DenseBin<uint8_t, true>;
template class DenseBin<uint8_t, false>;
template class DenseBin<uint16_t, false>;
template class DenseBin<uint32_t, false>;
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
......@@ -647,13 +647,13 @@ namespace LightGBM {
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 16) {
return new Dense4bitsBin(num_data);
return new DenseBin<uint8_t, true>(num_data);
} else if (num_bin <= 256) {
return new DenseBin<uint8_t>(num_data);
return new DenseBin<uint8_t, false>(num_data);
} else if (num_bin <= 65536) {
return new DenseBin<uint16_t>(num_data);
return new DenseBin<uint16_t, false>(num_data);
} else {
return new DenseBin<uint32_t>(num_data);
return new DenseBin<uint32_t, false>(num_data);
}
}
......
This diff is collapsed.
/*!
* Copyright (c) 2017 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_
#define LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_
#include <LightGBM/bin.h>
#include <cstdint>
#include <cstring>
#include <vector>
namespace LightGBM {
class Dense4bitsBin;
/*! \brief Iterator over one Dense4bitsBin feature column. */
class Dense4bitsBinIterator : public BinIterator {
 public:
  /*!
  * \brief Create an iterator restricted to the stored-bin range [min_bin, max_bin].
  * \param bin_data Column to iterate over (not owned)
  * \param min_bin Lowest stored bin of the feature
  * \param max_bin Highest stored bin of the feature
  * \param most_freq_bin Bin reported for values outside [min_bin, max_bin]
  */
  explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin)
      : bin_data_(bin_data),
        min_bin_(static_cast<uint8_t>(min_bin)),
        max_bin_(static_cast<uint8_t>(max_bin)),
        most_freq_bin_(static_cast<uint8_t>(most_freq_bin)) {
    // When the most frequent bin is 0, stored bins are shifted down by one,
    // so Get() has to add this offset back.
    offset_ = (most_freq_bin_ == 0) ? 1 : 0;
  }
  inline uint32_t RawGet(data_size_t idx) override;
  inline uint32_t Get(data_size_t idx) override;
  inline void Reset(data_size_t) override {}

 private:
  const Dense4bitsBin* bin_data_;
  uint8_t min_bin_;
  uint8_t max_bin_;
  uint8_t most_freq_bin_;
  uint8_t offset_;
};
/*!
* \brief Dense bin implementation that packs two 4-bit bin values into each
* byte; used for features with at most 16 bins (see Bin::CreateDenseBin).
*/
class Dense4bitsBin : public Bin {
public:
friend Dense4bitsBinIterator;
/*! \brief Allocate packed storage ((num_data + 1) / 2 bytes) plus an equally
sized staging buffer used while loading (see Push / FinishLoad). */
explicit Dense4bitsBin(data_size_t num_data)
: num_data_(num_data) {
int len = (num_data_ + 1) / 2;
data_.resize(len, static_cast<uint8_t>(0));
buf_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
}
~Dense4bitsBin() {
}
/*! \brief Store `value` as the 4-bit nibble for row `idx`.
Even rows write the low nibble directly into data_; odd rows write the
high nibble into the staging buffer buf_. FinishLoad() later ORs buf_
into data_ — presumably so pushes to the two rows sharing one byte never
read-modify-write the same array entry (TODO confirm intent). */
void Push(int, data_size_t idx, uint32_t value) override {
const int i1 = idx >> 1;
const int i2 = (idx & 1) << 2;
const uint8_t val = static_cast<uint8_t>(value) << i2;
if (i2 == 0) {
data_[i1] = val;
} else {
buf_[i1] = val;
}
}
/*! \brief Resize the packed storage for a new row count.
NOTE(review): buf_ is not resized here; this appears to assume ReSize is
only called after FinishLoad() has emptied it — confirm. */
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
const int len = (num_data_ + 1) / 2;
data_.resize(len);
}
}
inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;
/*!
* \brief Shared histogram-accumulation kernel behind all ConstructHistogram overloads.
* Output layout is interleaved: out[2 * bin] accumulates gradients and
* out[2 * bin + 1] accumulates hessians (or, when USE_HESSIAN is false,
* an integer count reinterpreted through hist_cnt_t).
* \tparam USE_INDICES iterate rows through data_indices instead of using i directly
* \tparam USE_PREFETCH software-prefetch the packed byte 64 iterations ahead
* \tparam USE_HESSIAN accumulate hessians; otherwise count entries
*/
template<bool USE_INDICES, bool USE_PREFETCH, bool USE_HESSIAN>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
data_size_t i = start;
hist_t* grad = out;
hist_t* hess = out + 1;
hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(hess);
if (USE_PREFETCH) {
// The prefetching loop stops pf_offset rows early so pf_idx stays in range;
// the tail loop below finishes the remainder.
const data_size_t pf_offset = 64;
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(data_.data() + (pf_idx >> 1));
// Extract the 4-bit bin: byte idx>>1; low nibble for even idx, high for odd.
const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
// ti = 2 * bin: index of the gradient slot in the interleaved output.
const uint8_t ti = static_cast<uint8_t>(bin) << 1;
if (USE_HESSIAN) {
grad[ti] += ordered_gradients[i];
hess[ti] += ordered_hessians[i];
} else {
grad[ti] += ordered_gradients[i];
++cnt[ti];
}
}
}
// Tail loop (and the whole range when prefetching is disabled).
for (; i < end; ++i) {
const auto idx = USE_INDICES ? data_indices[i] : i;
const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
const uint8_t ti = static_cast<uint8_t>(bin) << 1;
if (USE_HESSIAN) {
grad[ti] += ordered_gradients[i];
hess[ti] += ordered_hessians[i];
} else {
grad[ti] += ordered_gradients[i];
++cnt[ti];
}
}
}
/*! \brief Histogram over selected rows, with hessians. */
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, ordered_gradients, ordered_hessians, out);
}
/*! \brief Histogram over a contiguous row range, with hessians. */
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, ordered_gradients, ordered_hessians, out);
}
/*! \brief Histogram over selected rows, counting instead of hessians. */
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, ordered_gradients, nullptr, out);
}
/*! \brief Histogram over a contiguous row range, counting instead of hessians. */
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, ordered_gradients, nullptr, out);
}
/*! \brief Numerical split: route each row in data_indices to lte_indices or
gt_indices by comparing its stored bin against `threshold`.
Stored bins are offset by min_bin relative to feature-local bins, and
when most_freq_bin == 0 every stored bin is additionally shifted down by
one, so the threshold and the special bins are rebased below before the
comparisons.
\return The number of rows placed on the <= (left) side. */
data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t most_freq_bin, MissingType missing_type, bool default_left,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
if (num_data <= 0) { return 0; }
// Rebase the threshold and special bins into stored-bin space.
uint8_t th = static_cast<uint8_t>(threshold + min_bin);
const uint8_t minb = static_cast<uint8_t>(min_bin);
const uint8_t maxb = static_cast<uint8_t>(max_bin);
uint8_t t_zero_bin = static_cast<uint8_t>(min_bin + default_bin);
uint8_t t_most_freq_bin = static_cast<uint8_t>(min_bin + most_freq_bin);
if (most_freq_bin == 0) {
th -= 1;
t_zero_bin -= 1;
t_most_freq_bin -= 1;
}
data_size_t lte_count = 0;
data_size_t gt_count = 0;
// Rows carrying the implicit most-frequent value (stored bin out of range)
// go to the "default" side; missing rows go to the "missing default" side.
// Both start on the right and are redirected left below when appropriate.
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
data_size_t* missing_default_indices = gt_indices;
data_size_t* missing_default_count = &gt_count;
if (most_freq_bin <= threshold) {
default_indices = lte_indices;
default_count = &lte_count;
}
if (missing_type == MissingType::NaN) {
if (default_left) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
// Under MissingType::NaN the top stored bin (maxb) holds the NaN rows.
if (t_most_freq_bin == maxb) {
// NaN bin coincides with the most-frequent bin: both are treated as missing.
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (t_most_freq_bin == bin || bin < minb || bin > maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin == maxb) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin < minb || bin > maxb || t_most_freq_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
} else {
// Missing is either encoded as the zero bin (MissingType::Zero) or absent;
// decide which side the zero/missing rows fall on.
if ((default_left && missing_type == MissingType::Zero)
|| (default_bin <= threshold && missing_type != MissingType::Zero)) {
missing_default_indices = lte_indices;
missing_default_count = &lte_count;
}
if (default_bin == most_freq_bin) {
// Zero bin and most-frequent bin coincide: one combined routing test.
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < minb || bin > maxb || t_most_freq_bin == bin) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
} else {
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin == t_zero_bin) {
missing_default_indices[(*missing_default_count)++] = idx;
} else if (bin < minb || bin > maxb || t_most_freq_bin == bin) {
default_indices[(*default_count)++] = idx;
} else if (bin > th) {
gt_indices[gt_count++] = idx;
} else {
lte_indices[lte_count++] = idx;
}
}
}
}
return lte_count;
}
/*! \brief Categorical split: rows whose (min_bin-relative) bin appears in the
`threshold` bitset go left; others go right. Out-of-range rows (the
implicit most-frequent value) follow the side that most_freq_bin maps to.
\return The number of rows placed on the <= (left) side. */
data_size_t SplitCategorical(
uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin,
const uint32_t* threshold, int num_threahold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
if (num_data <= 0) { return 0; }
data_size_t lte_count = 0;
data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count;
if (Common::FindInBitset(threshold, num_threahold, most_freq_bin)) {
default_indices = lte_indices;
default_count = &lte_count;
}
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
const uint32_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin < min_bin || bin > max_bin) {
default_indices[(*default_count)++] = idx;
} else if (Common::FindInBitset(threshold, num_threahold, bin - min_bin)) {
lte_indices[lte_count++] = idx;
} else {
gt_indices[gt_count++] = idx;
}
}
return lte_count;
}
data_size_t num_data() const override { return num_data_; }
/*! \brief Merge the odd-row nibbles staged in buf_ into data_ and release
the staging buffer. Safe to call more than once (no-op when empty). */
void FinishLoad() override {
if (buf_.empty()) { return; }
int len = (num_data_ + 1) / 2;
for (int i = 0; i < len; ++i) {
data_[i] |= buf_[i];
}
buf_.clear();
}
/*! \brief Load packed bins from a serialized buffer.
When local_used_indices is non-empty, re-pack only the selected rows
(two per byte); otherwise copy the bytes straight through. */
void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
const uint8_t* mem_data = reinterpret_cast<const uint8_t*>(memory);
if (!local_used_indices.empty()) {
// Handle rows pairwise; `rest` is 1 when num_data_ is odd and a final
// half-byte write is needed.
const data_size_t rest = num_data_ & 1;
for (int i = 0; i < num_data_ - rest; i += 2) {
// get old bins
data_size_t idx = local_used_indices[i];
const auto bin1 = static_cast<uint8_t>((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
idx = local_used_indices[i + 1];
const auto bin2 = static_cast<uint8_t>((mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
// add
const int i1 = i >> 1;
data_[i1] = (bin1 | (bin2 << 4));
}
if (rest) {
data_size_t idx = local_used_indices[num_data_ - 1];
data_[num_data_ >> 1] = (mem_data[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
}
} else {
for (size_t i = 0; i < data_.size(); ++i) {
data_[i] = mem_data[i];
}
}
}
/*! \brief Re-pack the rows listed in used_indices from another Dense4bitsBin
into this bin (used for bagging subsets). */
void CopySubrow(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const Dense4bitsBin*>(full_bin);
const data_size_t rest = num_used_indices & 1;
for (int i = 0; i < num_used_indices - rest; i += 2) {
data_size_t idx = used_indices[i];
const auto bin1 = static_cast<uint8_t>((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
idx = used_indices[i + 1];
const auto bin2 = static_cast<uint8_t>((other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
const int i1 = i >> 1;
data_[i1] = (bin1 | (bin2 << 4));
}
if (rest) {
data_size_t idx = used_indices[num_used_indices - 1];
data_[num_used_indices >> 1] = (other_bin->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
}
}
/*! \brief Write the packed bytes to a binary file. */
void SaveBinaryToFile(const VirtualFileWriter* writer) const override {
writer->Write(data_.data(), sizeof(uint8_t) * data_.size());
}
/*! \brief Serialized size in bytes (packed data only, not buf_). */
size_t SizesInByte() const override {
return sizeof(uint8_t) * data_.size();
}
Dense4bitsBin* Clone() override {
return new Dense4bitsBin(*this);
}
protected:
/*! \brief Copy constructor used by Clone(). */
Dense4bitsBin(const Dense4bitsBin& other)
: num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {
}
/*! \brief Number of rows */
data_size_t num_data_;
/*! \brief Packed bins, two 4-bit values per byte, in aligned storage */
std::vector<uint8_t, Common::AlignmentAllocator<uint8_t, kAlignedSize>> data_;
/*! \brief Staging buffer for odd-row nibbles; emptied by FinishLoad() */
std::vector<uint8_t> buf_;
};
// Returns the bin value mapped into this iterator's feature range, or the
// most frequent bin when the raw value falls outside [min_bin_, max_bin_].
uint32_t Dense4bitsBinIterator::Get(data_size_t idx) {
  // Each byte packs two 4-bit bins; select the nibble for this row.
  const uint32_t raw = (bin_data_->data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
  const bool in_range = (raw >= min_bin_) && (raw <= max_bin_);
  return in_range ? (raw - min_bin_ + offset_) : most_freq_bin_;
}
// Returns the raw 4-bit bin stored for row idx, without range mapping.
uint32_t Dense4bitsBinIterator::RawGet(data_size_t idx) {
  const data_size_t byte_pos = idx >> 1;
  const int shift = (idx & 1) << 2;  // low nibble for even rows, high for odd
  return (bin_data_->data_[byte_pos] >> shift) & 0xf;
}
// Factory for a column iterator over this bin; the caller owns the result.
inline BinIterator* Dense4bitsBin::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const {
  auto* iter = new Dense4bitsBinIterator(this, min_bin, max_bin, most_freq_bin);
  return iter;
}
} // namespace LightGBM
#endif // LIGHTGBM_IO_DENSE_NBITS_BIN_HPP_
This diff is collapsed.
......@@ -106,7 +106,7 @@ class DataPartition {
const data_size_t begin = leaf_begin_[leaf];
const data_size_t cnt = leaf_count_[leaf];
auto left_start = indices_.data() + begin;
auto left_cnt = runner_.Run<false>(
const auto left_cnt = runner_.Run<false>(
cnt,
[=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left,
data_size_t* right) {
......
......@@ -144,72 +144,67 @@ class FeatureHistogram {
template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT>
void FuncForNumricalL2() {
#define TEMPLATE_PREFIX USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT
#define LAMBDA_ARGUMENTS \
double sum_gradient, double sum_hessian, data_size_t num_data, \
const ConstraintEntry &constraints, SplitInfo *output
#define BEFORE_ARGUMENTS sum_gradient, sum_hessian, output, &rand_threshold
#define FUNC_ARGUMENTS \
sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, output, \
rand_threshold
if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
if (meta_->missing_type == MissingType::Zero) {
find_best_threshold_fun_ =
[=](double sum_gradient, double sum_hessian, data_size_t num_data,
const ConstraintEntry& constraints, SplitInfo* output) {
int rand_threshold = 0;
double min_gain_shift =
BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
sum_gradient, sum_hessian, output, &rand_threshold);
FindBestThresholdSequentially<USE_RAND, USE_MC, USE_L1,
USE_MAX_OUTPUT, true, true, false>(
sum_gradient, sum_hessian, num_data, constraints,
min_gain_shift, output, rand_threshold);
FindBestThresholdSequentially<USE_RAND, USE_MC, USE_L1,
USE_MAX_OUTPUT, false, true, false>(
sum_gradient, sum_hessian, num_data, constraints,
min_gain_shift, output, rand_threshold);
};
find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
int rand_threshold = 0;
double min_gain_shift =
BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
BEFORE_ARGUMENTS);
FindBestThresholdSequentially<TEMPLATE_PREFIX, true, true, false>(
FUNC_ARGUMENTS);
FindBestThresholdSequentially<TEMPLATE_PREFIX, false, true, false>(
FUNC_ARGUMENTS);
};
} else {
find_best_threshold_fun_ =
[=](double sum_gradient, double sum_hessian, data_size_t num_data,
const ConstraintEntry& constraints, SplitInfo* output) {
int rand_threshold = 0;
double min_gain_shift =
BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
sum_gradient, sum_hessian, output, &rand_threshold);
FindBestThresholdSequentially<USE_RAND, USE_MC, USE_L1,
USE_MAX_OUTPUT, true, false, true>(
sum_gradient, sum_hessian, num_data, constraints,
min_gain_shift, output, rand_threshold);
FindBestThresholdSequentially<USE_RAND, USE_MC, USE_L1,
USE_MAX_OUTPUT, false, false, true>(
sum_gradient, sum_hessian, num_data, constraints,
min_gain_shift, output, rand_threshold);
};
find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
int rand_threshold = 0;
double min_gain_shift =
BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
BEFORE_ARGUMENTS);
FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, true>(
FUNC_ARGUMENTS);
FindBestThresholdSequentially<TEMPLATE_PREFIX, false, false, true>(
FUNC_ARGUMENTS);
};
}
} else {
if (meta_->missing_type != MissingType::NaN) {
find_best_threshold_fun_ =
[=](double sum_gradient, double sum_hessian, data_size_t num_data,
const ConstraintEntry& constraints, SplitInfo* output) {
int rand_threshold = 0;
double min_gain_shift =
BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
sum_gradient, sum_hessian, output, &rand_threshold);
FindBestThresholdSequentially<USE_RAND, USE_MC, USE_L1,
USE_MAX_OUTPUT, true, false, false>(
sum_gradient, sum_hessian, num_data, constraints,
min_gain_shift, output, rand_threshold);
};
find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
int rand_threshold = 0;
double min_gain_shift =
BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
BEFORE_ARGUMENTS);
FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, false>(
FUNC_ARGUMENTS);
};
} else {
find_best_threshold_fun_ =
[=](double sum_gradient, double sum_hessian, data_size_t num_data,
const ConstraintEntry& constraints, SplitInfo* output) {
int rand_threshold = 0;
double min_gain_shift =
BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
sum_gradient, sum_hessian, output, &rand_threshold);
FindBestThresholdSequentially<USE_RAND, USE_MC, USE_L1,
USE_MAX_OUTPUT, true, false, false>(
sum_gradient, sum_hessian, num_data, constraints,
min_gain_shift, output, rand_threshold);
output->default_left = false;
};
find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
int rand_threshold = 0;
double min_gain_shift =
BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>(
BEFORE_ARGUMENTS);
FindBestThresholdSequentially<USE_RAND, USE_MC, USE_L1,
USE_MAX_OUTPUT, true, false, false>(
FUNC_ARGUMENTS);
output->default_left = false;
};
}
}
#undef TEMPLATE_PREFIX
#undef LAMBDA_ARGUMENTS
#undef BEFORE_ARGUMENTS
#undef FUNC_ARGUMENTS
}
void FuncForCategorical() {
......@@ -227,41 +222,38 @@ class FeatureHistogram {
}
}
}
template <bool USE_RAND, bool USE_MC>
void FuncForCategoricalL1() {
#define ARGUMENTS \
std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, \
std::placeholders::_4, std::placeholders::_5
if (meta_->config->lambda_l1 > 0) {
if (meta_->config->max_delta_step > 0) {
find_best_threshold_fun_ =
std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
USE_RAND, USE_MC, true, true>,
this, std::placeholders::_1, std::placeholders::_2,
std::placeholders::_3, std::placeholders::_4,
std::placeholders::_5);
this, ARGUMENTS);
} else {
find_best_threshold_fun_ =
std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
USE_RAND, USE_MC, true, false>,
this, std::placeholders::_1, std::placeholders::_2,
std::placeholders::_3, std::placeholders::_4,
std::placeholders::_5);
this, ARGUMENTS);
}
} else {
if (meta_->config->max_delta_step > 0) {
find_best_threshold_fun_ =
std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
USE_RAND, USE_MC, false, true>,
this, std::placeholders::_1, std::placeholders::_2,
std::placeholders::_3, std::placeholders::_4,
std::placeholders::_5);
this, ARGUMENTS);
} else {
find_best_threshold_fun_ =
std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
USE_RAND, USE_MC, false, false>,
this, std::placeholders::_1, std::placeholders::_2,
std::placeholders::_3, std::placeholders::_4,
std::placeholders::_5);
this, ARGUMENTS);
}
}
#undef ARGUMENTS
}
template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT>
......
......@@ -13,7 +13,6 @@
#include <algorithm>
#include "../io/dense_bin.hpp"
#include "../io/dense_nbits_bin.hpp"
#define GPU_DEBUG 0
......@@ -378,20 +377,20 @@ void GPUTreeLearner::AllocateGPUMemory() {
BinIterator* bin_iters[8];
for (int s_idx = 0; s_idx < 8; ++s_idx) {
bin_iters[s_idx] = train_data_->FeatureGroupIterator(dense_ind[s_idx]);
if (dynamic_cast<Dense4bitsBinIterator*>(bin_iters[s_idx]) == 0) {
if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[s_idx]) == 0) {
Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not", dense_ind[s_idx]);
}
}
// this guarantees that the RawGet() function is inlined, rather than using virtual function dispatching
Dense4bitsBinIterator iters[8] = {
*static_cast<Dense4bitsBinIterator*>(bin_iters[0]),
*static_cast<Dense4bitsBinIterator*>(bin_iters[1]),
*static_cast<Dense4bitsBinIterator*>(bin_iters[2]),
*static_cast<Dense4bitsBinIterator*>(bin_iters[3]),
*static_cast<Dense4bitsBinIterator*>(bin_iters[4]),
*static_cast<Dense4bitsBinIterator*>(bin_iters[5]),
*static_cast<Dense4bitsBinIterator*>(bin_iters[6]),
*static_cast<Dense4bitsBinIterator*>(bin_iters[7])};
DenseBinIterator<uint8_t, true> iters[8] = {
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[0]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[1]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[2]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[3]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[4]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[5]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[6]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[7])};
for (int j = 0; j < num_data_; ++j) {
host4[j].s[0] = (uint8_t)((iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
|((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4));
......@@ -407,15 +406,15 @@ void GPUTreeLearner::AllocateGPUMemory() {
for (int s_idx = 0; s_idx < 4; ++s_idx) {
BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_ind[s_idx]);
// this guarantees that the RawGet() function is inlined, rather than using virtual function dispatching
if (dynamic_cast<DenseBinIterator<uint8_t>*>(bin_iter) != 0) {
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
// Dense bin
DenseBinIterator<uint8_t> iter = *static_cast<DenseBinIterator<uint8_t>*>(bin_iter);
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
for (int j = 0; j < num_data_; ++j) {
host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
}
} else if (dynamic_cast<Dense4bitsBinIterator*>(bin_iter) != 0) {
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
// Dense 4-bit bin
Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
for (int j = 0; j < num_data_; ++j) {
host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
}
......@@ -450,8 +449,8 @@ void GPUTreeLearner::AllocateGPUMemory() {
for (int i = 0; i < k; ++i) {
if (dword_features_ == 8) {
BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
if (dynamic_cast<Dense4bitsBinIterator*>(bin_iter) != 0) {
Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
......@@ -463,15 +462,15 @@ void GPUTreeLearner::AllocateGPUMemory() {
}
} else if (dword_features_ == 4) {
BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
if (dynamic_cast<DenseBinIterator<uint8_t>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t> iter = *static_cast<DenseBinIterator<uint8_t>*>(bin_iter);
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
}
} else if (dynamic_cast<Dense4bitsBinIterator*>(bin_iter) != 0) {
Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
......
......@@ -263,7 +263,6 @@
<ClInclude Include="..\src\boosting\rf.hpp" />
<ClInclude Include="..\src\boosting\score_updater.hpp" />
<ClInclude Include="..\src\io\dense_bin.hpp" />
<ClInclude Include="..\src\io\dense_nbits_bin.hpp" />
<ClInclude Include="..\src\io\multi_val_dense_bin.hpp" />
<ClInclude Include="..\src\io\multi_val_sparse_bin.hpp" />
<ClInclude Include="..\src\io\parser.hpp" />
......
......@@ -174,9 +174,6 @@
<ClInclude Include="..\src\boosting\goss.hpp">
<Filter>src\boosting</Filter>
</ClInclude>
<ClInclude Include="..\src\io\dense_nbits_bin.hpp">
<Filter>src\io</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\openmp_wrapper.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment