Unverified Commit 509c2e50 authored by Guolin Ke, committed by GitHub

Support both row-wise and col-wise multi-threading (#2699)



* commit

* fix a bug

* fix bug

* reset to track changes

* refine the auto choose logic

* sort the time stats output

* fix include

* change multi_val_bin_sparse_threshold

* add cmake

* add _mm_malloc and _mm_free for cross platform

* fix cmake bug

* timer for split

* try to fix cmake

* fix tests

* refactor DataPartition::Split

* fix test

* typo

* formating

* Revert "formating"

This reverts commit 5b8de4f7fb9d975ee23701d276a66d40ee6d4222.

* add document

* [R-package] Added tests on use of force_col_wise and force_row_wise in training (#2719)

* naming

* fix gpu code

* Update include/LightGBM/bin.h
Co-Authored-By: James Lamb <jaylamb20@gmail.com>

* Update src/treelearner/ocl/histogram16.cl

* test: swap compilers for CI

* fix omp

* not avx2

* no aligned for feature histogram

* Revert "refactor DataPartition::Split"

This reverts commit 256e6d9641ade966a1f54da1752e998a1149b6f8.

* slightly refactor data partition

* reduce the memory cost
Co-authored-by: James Lamb <jaylamb20@gmail.com>
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent bc7bc4a1
@@ -1065,7 +1065,7 @@ int LGBM_DatasetAddFeaturesFrom(DatasetHandle target,
  API_BEGIN();
  auto target_d = reinterpret_cast<Dataset*>(target);
  auto source_d = reinterpret_cast<Dataset*>(source);
-  target_d->addFeaturesFrom(source_d);
+  target_d->AddFeaturesFrom(source_d);
  API_END();
}
...
@@ -15,7 +15,8 @@
#include "dense_bin.hpp"
#include "dense_nbits_bin.hpp"
-#include "ordered_sparse_bin.hpp"
+#include "multi_val_dense_bin.hpp"
+#include "multi_val_sparse_bin.hpp"
#include "sparse_bin.hpp"

namespace LightGBM {
@@ -636,21 +637,10 @@ namespace LightGBM {
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;

-template class OrderedSparseBin<uint8_t>;
-template class OrderedSparseBin<uint16_t>;
-template class OrderedSparseBin<uint32_t>;
-
-Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
-                    bool is_enable_sparse, double sparse_threshold, bool* is_sparse) {
-  // sparse threshold
-  if (sparse_rate >= sparse_threshold && is_enable_sparse) {
-    *is_sparse = true;
-    return CreateSparseBin(num_data, num_bin);
-  } else {
-    *is_sparse = false;
-    return CreateDenseBin(num_data, num_bin);
-  }
-}
+template class MultiValDenseBin<uint8_t>;
+template class MultiValDenseBin<uint16_t>;
+template class MultiValDenseBin<uint32_t>;

Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
  if (num_bin <= 16) {
@@ -674,4 +664,25 @@ namespace LightGBM {
  }
}

+MultiValBin* MultiValBin::CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate) {
+  const double multi_val_bin_sparse_threshold = 0.25f;
+  if (sparse_rate >= multi_val_bin_sparse_threshold) {
+    if (num_bin <= 256) {
+      return new MultiValSparseBin<uint8_t>(num_data, num_bin);
+    } else if (num_bin <= 65536) {
+      return new MultiValSparseBin<uint16_t>(num_data, num_bin);
+    } else {
+      return new MultiValSparseBin<uint32_t>(num_data, num_bin);
+    }
+  } else {
+    if (num_bin <= 256) {
+      return new MultiValDenseBin<uint8_t>(num_data, num_bin, num_feature);
+    } else if (num_bin <= 65536) {
+      return new MultiValDenseBin<uint16_t>(num_data, num_bin, num_feature);
+    } else {
+      return new MultiValDenseBin<uint32_t>(num_data, num_bin, num_feature);
+    }
+  }
+}
+
}  // namespace LightGBM
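A minimal usage sketch of the factory above (values illustrative): a sparse_rate of at least 0.25 selects the sparse representation, and num_bin selects the narrowest value type that can hold all bins.

    // Sketch: dispatch behavior of MultiValBin::CreateMultiValBin as shown above.
    #include <memory>
    std::unique_ptr<MultiValBin> bin(MultiValBin::CreateMultiValBin(
        /*num_data=*/100000, /*num_bin=*/200, /*num_feature=*/50, /*sparse_rate=*/0.4));
    // 0.4 >= 0.25 and 200 <= 256  ->  allocates a MultiValSparseBin<uint8_t>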
@@ -312,6 +312,11 @@ void Config::CheckParamConflict() {
      num_leaves = static_cast<int>(full_num_leaves);
    }
  }
+  // force col-wise for gpu
+  if (device_type == std::string("gpu")) {
+    force_col_wise = true;
+    force_row_wise = false;
+  }
}

std::string Config::ToString() const {
...
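A minimal sketch of the new GPU override's observable effect, assuming parameters flow through the public Config::Set() as elsewhere in LightGBM:

    // Sketch: with device_type=gpu, CheckParamConflict() forces col-wise
    // histogram construction even if the user requested row-wise.
    std::unordered_map<std::string, std::string> params = {
        {"device_type", "gpu"}, {"force_row_wise", "true"}};
    Config config;
    config.Set(params);  // runs GetMembersFromString(), then CheckParamConflict()
    // afterwards: config.force_col_wise == true, config.force_row_wise == false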
@@ -116,9 +116,6 @@ std::unordered_map<std::string, std::string> Config::alias_table({
  {"is_pre_partition", "pre_partition"},
  {"is_enable_bundle", "enable_bundle"},
  {"bundle", "enable_bundle"},
-  {"is_sparse", "is_enable_sparse"},
-  {"enable_sparse", "is_enable_sparse"},
-  {"sparse", "is_enable_sparse"},
  {"two_round_loading", "two_round"},
  {"use_two_round_loading", "two_round"},
  {"is_save_binary", "save_binary"},
@@ -181,6 +178,8 @@ std::unordered_set<std::string> Config::parameter_set({
  "num_threads",
  "device_type",
  "seed",
+  "force_col_wise",
+  "force_row_wise",
  "max_depth",
  "min_data_in_leaf",
  "min_sum_hessian_in_leaf",
@@ -236,9 +235,6 @@ std::unordered_set<std::string> Config::parameter_set({
  "valid_data_initscores",
  "pre_partition",
  "enable_bundle",
-  "max_conflict_rate",
-  "is_enable_sparse",
-  "sparse_threshold",
  "use_missing",
  "zero_as_missing",
  "two_round",
@@ -309,6 +305,10 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
  GetInt(params, "num_threads", &num_threads);

+  GetBool(params, "force_col_wise", &force_col_wise);
+
+  GetBool(params, "force_row_wise", &force_row_wise);
+
  GetInt(params, "max_depth", &max_depth);

  GetInt(params, "min_data_in_leaf", &min_data_in_leaf);
@@ -467,16 +467,6 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
  GetBool(params, "enable_bundle", &enable_bundle);

-  GetDouble(params, "max_conflict_rate", &max_conflict_rate);
-  CHECK(max_conflict_rate >=0.0);
-  CHECK(max_conflict_rate <1.0);
-
-  GetBool(params, "is_enable_sparse", &is_enable_sparse);
-
-  GetDouble(params, "sparse_threshold", &sparse_threshold);
-  CHECK(sparse_threshold >0.0);
-  CHECK(sparse_threshold <=1.0);
-
  GetBool(params, "use_missing", &use_missing);

  GetBool(params, "zero_as_missing", &zero_as_missing);
@@ -600,6 +590,8 @@ std::string Config::SaveMembersToString() const {
  str_buf << "[learning_rate: " << learning_rate << "]\n";
  str_buf << "[num_leaves: " << num_leaves << "]\n";
  str_buf << "[num_threads: " << num_threads << "]\n";
+  str_buf << "[force_col_wise: " << force_col_wise << "]\n";
+  str_buf << "[force_row_wise: " << force_row_wise << "]\n";
  str_buf << "[max_depth: " << max_depth << "]\n";
  str_buf << "[min_data_in_leaf: " << min_data_in_leaf << "]\n";
  str_buf << "[min_sum_hessian_in_leaf: " << min_sum_hessian_in_leaf << "]\n";
@@ -655,9 +647,6 @@ std::string Config::SaveMembersToString() const {
  str_buf << "[valid_data_initscores: " << Common::Join(valid_data_initscores, ",") << "]\n";
  str_buf << "[pre_partition: " << pre_partition << "]\n";
  str_buf << "[enable_bundle: " << enable_bundle << "]\n";
-  str_buf << "[max_conflict_rate: " << max_conflict_rate << "]\n";
-  str_buf << "[is_enable_sparse: " << is_enable_sparse << "]\n";
-  str_buf << "[sparse_threshold: " << sparse_threshold << "]\n";
  str_buf << "[use_missing: " << use_missing << "]\n";
  str_buf << "[zero_as_missing: " << zero_as_missing << "]\n";
  str_buf << "[two_round: " << two_round << "]\n";
...
This diff is collapsed.
@@ -335,8 +335,6 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
  mem_ptr += sizeof(dataset->use_missing_);
  dataset->zero_as_missing_ = *(reinterpret_cast<const bool*>(mem_ptr));
  mem_ptr += sizeof(dataset->zero_as_missing_);
-  dataset->sparse_threshold_ = *(reinterpret_cast<const double*>(mem_ptr));
-  mem_ptr += sizeof(dataset->sparse_threshold_);
  const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
  dataset->used_feature_map_.clear();
  for (int i = 0; i < dataset->num_total_features_; ++i) {
...
@@ -31,9 +31,9 @@ class DenseBinIterator: public BinIterator {
  }
  inline uint32_t RawGet(data_size_t idx) override;
  inline uint32_t Get(data_size_t idx) override;
-  inline void Reset(data_size_t) override { }
+  inline void Reset(data_size_t) override {}

 private:
  const DenseBin<VAL_T>* bin_data_;
  VAL_T min_bin_;
  VAL_T max_bin_;
@@ -46,7 +46,7 @@
*/
template <typename VAL_T>
class DenseBin: public Bin {
 public:
  friend DenseBinIterator<VAL_T>;
  explicit DenseBin(data_size_t num_data)
    : num_data_(num_data), data_(num_data_, static_cast<VAL_T>(0)) {
@@ -68,84 +68,65 @@ class DenseBin: public Bin {
  BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;

-  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
-                          const score_t* ordered_gradients, const score_t* ordered_hessians,
-                          HistogramBinEntry* out) const override {
-    const data_size_t pf_offset = 64 / sizeof(VAL_T);
-    const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
-    data_size_t i = start;
-    for (; i < pf_end; i++) {
-      PREFETCH_T0(data_.data() + data_indices[i + pf_offset]);
-      const VAL_T bin = data_[data_indices[i]];
-      out[bin].sum_gradients += ordered_gradients[i];
-      out[bin].sum_hessians += ordered_hessians[i];
-      ++out[bin].cnt;
-    }
-    for (; i < end; i++) {
-      const VAL_T bin = data_[data_indices[i]];
-      out[bin].sum_gradients += ordered_gradients[i];
-      out[bin].sum_hessians += ordered_hessians[i];
-      ++out[bin].cnt;
-    }
-  }
-
-  void ConstructHistogram(data_size_t start, data_size_t end,
-                          const score_t* ordered_gradients, const score_t* ordered_hessians,
-                          HistogramBinEntry* out) const override {
-    const data_size_t pf_offset = 64 / sizeof(VAL_T);
-    const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
-    data_size_t i = start;
-    for (; i < pf_end; i++) {
-      PREFETCH_T0(data_.data() + i + pf_offset);
-      const VAL_T bin = data_[i];
-      out[bin].sum_gradients += ordered_gradients[i];
-      out[bin].sum_hessians += ordered_hessians[i];
-      ++out[bin].cnt;
-    }
-    for (; i < end; i++) {
-      const VAL_T bin = data_[i];
-      out[bin].sum_gradients += ordered_gradients[i];
-      out[bin].sum_hessians += ordered_hessians[i];
-      ++out[bin].cnt;
-    }
-  }
-
-  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
-                          const score_t* ordered_gradients,
-                          HistogramBinEntry* out) const override {
-    const data_size_t pf_offset = 64 / sizeof(VAL_T);
-    const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
-    data_size_t i = start;
-    for (; i < pf_end; i++) {
-      PREFETCH_T0(data_.data() + data_indices[i + pf_offset]);
-      const VAL_T bin = data_[data_indices[i]];
-      out[bin].sum_gradients += ordered_gradients[i];
-      ++out[bin].cnt;
-    }
-    for (; i < end; i++) {
-      const VAL_T bin = data_[data_indices[i]];
-      out[bin].sum_gradients += ordered_gradients[i];
-      ++out[bin].cnt;
-    }
-  }
-
-  void ConstructHistogram(data_size_t start, data_size_t end,
-                          const score_t* ordered_gradients,
-                          HistogramBinEntry* out) const override {
-    const data_size_t pf_offset = 64 / sizeof(VAL_T);
-    const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
-    data_size_t i = start;
-    for (; i < pf_end; i++) {
-      PREFETCH_T0(data_.data() + i + pf_offset);
-      const VAL_T bin = data_[i];
-      out[bin].sum_gradients += ordered_gradients[i];
-      ++out[bin].cnt;
-    }
-    for (; i < end; i++) {
-      const VAL_T bin = data_[i];
-      out[bin].sum_gradients += ordered_gradients[i];
-      ++out[bin].cnt;
-    }
-  }
+#define ACC_GH(hist, i, g, h) \
+  const auto ti = static_cast<int>(i) << 1; \
+  hist[ti] += g; \
+  hist[ti + 1] += h; \
+
+  template<bool use_indices, bool use_prefetch, bool use_hessians>
+  void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                               const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
+    data_size_t i = start;
+    if (use_prefetch) {
+      const data_size_t pf_offset = 64 / sizeof(VAL_T);
+      const data_size_t pf_end = end - pf_offset;
+      for (; i < pf_end; ++i) {
+        const auto idx = use_indices ? data_indices[i] : i;
+        const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
+        PREFETCH_T0(data_.data() + pf_idx);
+        const VAL_T bin = data_[idx];
+        if (use_hessians) {
+          ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
+        } else {
+          ACC_GH(out, bin, ordered_gradients[i], 1.0f);
+        }
+      }
+    }
+    for (; i < end; ++i) {
+      const auto idx = use_indices ? data_indices[i] : i;
+      const VAL_T bin = data_[idx];
+      if (use_hessians) {
+        ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
+      } else {
+        ACC_GH(out, bin, ordered_gradients[i], 1.0f);
+      }
+    }
+  }
+#undef ACC_GH
+
+  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients, const score_t* ordered_hessians,
+                          hist_t* out) const override {
+    ConstructHistogramInner<true, true, true>(data_indices, start, end, ordered_gradients, ordered_hessians, out);
+  }
+
+  void ConstructHistogram(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients, const score_t* ordered_hessians,
+                          hist_t* out) const override {
+    ConstructHistogramInner<false, false, true>(nullptr, start, end, ordered_gradients, ordered_hessians, out);
+  }
+
+  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients,
+                          hist_t* out) const override {
+    ConstructHistogramInner<true, true, false>(data_indices, start, end, ordered_gradients, nullptr, out);
+  }
+
+  void ConstructHistogram(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients,
+                          hist_t* out) const override {
+    ConstructHistogramInner<false, false, false>(nullptr, start, end, ordered_gradients, nullptr, out);
+  }

  data_size_t Split(
@@ -257,9 +238,6 @@ class DenseBin: public Bin {
  data_size_t num_data() const override { return num_data_; }

-  /*! \brief not ordered bin for dense feature */
-  OrderedBin* CreateOrderedBin() const override { return nullptr; }
-
  void FinishLoad() override {}

  void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
@@ -287,17 +265,18 @@ class DenseBin: public Bin {
  }

  size_t SizesInByte() const override {
    return sizeof(VAL_T) * num_data_;
  }

  DenseBin<VAL_T>* Clone() override;

 private:
  data_size_t num_data_;
-  std::vector<VAL_T> data_;
+  std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, kAlignedSize>> data_;

-  DenseBin<VAL_T>(const DenseBin<VAL_T>& other)
-    : num_data_(other.num_data_), data_(other.data_){}
+  DenseBin<VAL_T>(const DenseBin<VAL_T>& other)
+    : num_data_(other.num_data_), data_(other.data_) {
+  }
};

template<typename VAL_T>
...
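This collapse of four ConstructHistogram overloads into one templated ConstructHistogramInner is the core of the PR's histogram refactor: the HistogramBinEntry struct (sum_gradients, sum_hessians, cnt) gives way to a flat hist_t array with two interleaved slots per bin. A minimal sketch of the layout ACC_GH assumes, with the bin count illustrative and hist_t taken to be a floating-point typedef defined outside this hunk:

    // Sketch: gradient at hist[2*bin], hessian at hist[2*bin + 1]; the count
    // field is gone, so the count-only overloads accumulate a constant 1.0f
    // into the hessian slot instead.
    const int kNumBinsExample = 16;                  // illustrative
    std::vector<hist_t> hist(2 * kNumBinsExample, 0);
    // ACC_GH(hist.data(), bin, g, h) expands to roughly:
    //   const auto ti = static_cast<int>(bin) << 1;
    //   hist[ti] += g;      // gradient slot
    //   hist[ti + 1] += h;  // hessian (or count) slot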
@@ -16,7 +16,7 @@ namespace LightGBM {
class Dense4bitsBin;

class Dense4bitsBinIterator : public BinIterator {
 public:
  explicit Dense4bitsBinIterator(const Dense4bitsBin* bin_data, uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin)
    : bin_data_(bin_data), min_bin_(static_cast<uint8_t>(min_bin)),
      max_bin_(static_cast<uint8_t>(max_bin)),
@@ -31,7 +31,7 @@ class Dense4bitsBinIterator : public BinIterator {
  inline uint32_t Get(data_size_t idx) override;
  inline void Reset(data_size_t) override {}

 private:
  const Dense4bitsBin* bin_data_;
  uint8_t min_bin_;
  uint8_t max_bin_;
@@ -40,12 +40,12 @@ class Dense4bitsBinIterator : public BinIterator {
};

class Dense4bitsBin : public Bin {
 public:
  friend Dense4bitsBinIterator;
  explicit Dense4bitsBin(data_size_t num_data)
    : num_data_(num_data) {
    int len = (num_data_ + 1) / 2;
-    data_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
+    data_.resize(len, static_cast<uint8_t>(0));
    buf_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0));
  }
@@ -73,88 +73,65 @@ class Dense4bitsBin : public Bin {
  inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;

-  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
-                          const score_t* ordered_gradients, const score_t* ordered_hessians,
-                          HistogramBinEntry* out) const override {
-    const data_size_t pf_offset = 64;
-    const data_size_t pf_end = end - pf_offset - kCacheLineSize;
-    data_size_t i = start;
-    for (; i < pf_end; i++) {
-      PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1));
-      const data_size_t idx = data_indices[i];
-      const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
-      out[bin].sum_gradients += ordered_gradients[i];
-      out[bin].sum_hessians += ordered_hessians[i];
-      ++out[bin].cnt;
-    }
-    for (; i < end; i++) {
-      const data_size_t idx = data_indices[i];
-      const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
-      out[bin].sum_gradients += ordered_gradients[i];
-      out[bin].sum_hessians += ordered_hessians[i];
-      ++out[bin].cnt;
-    }
-  }
-
-  void ConstructHistogram(data_size_t start, data_size_t end,
-                          const score_t* ordered_gradients, const score_t* ordered_hessians,
-                          HistogramBinEntry* out) const override {
-    const data_size_t pf_offset = 64;
-    const data_size_t pf_end = end - pf_offset - kCacheLineSize;
-    data_size_t i = start;
-    for (; i < pf_end; i++) {
-      PREFETCH_T0(data_.data() + ((i + pf_offset) >> 1));
-      const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
-      out[bin].sum_gradients += ordered_gradients[i];
-      out[bin].sum_hessians += ordered_hessians[i];
-      ++out[bin].cnt;
-    }
-    for (; i < end; i++) {
-      const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
-      out[bin].sum_gradients += ordered_gradients[i];
-      out[bin].sum_hessians += ordered_hessians[i];
-      ++out[bin].cnt;
-    }
-  }
-
-  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
-                          const score_t* ordered_gradients,
-                          HistogramBinEntry* out) const override {
-    const data_size_t pf_offset = 64;
-    const data_size_t pf_end = end - pf_offset - kCacheLineSize;
-    data_size_t i = start;
-    for (; i < pf_end; i++) {
-      PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1));
-      const data_size_t idx = data_indices[i];
-      const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
-      out[bin].sum_gradients += ordered_gradients[i];
-      ++out[bin].cnt;
-    }
-    for (; i < end; i++) {
-      const data_size_t idx = data_indices[i];
-      const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
-      out[bin].sum_gradients += ordered_gradients[i];
-      ++out[bin].cnt;
-    }
-  }
-
-  void ConstructHistogram(data_size_t start, data_size_t end,
-                          const score_t* ordered_gradients,
-                          HistogramBinEntry* out) const override {
-    const data_size_t pf_offset = 64;
-    const data_size_t pf_end = end - pf_offset - kCacheLineSize;
-    data_size_t i = start;
-    for (; i < pf_end; i++) {
-      PREFETCH_T0(data_.data() + ((i + pf_offset) >> 1));
-      const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
-      out[bin].sum_gradients += ordered_gradients[i];
-      ++out[bin].cnt;
-    }
-    for (; i < end; i++) {
-      const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
-      out[bin].sum_gradients += ordered_gradients[i];
-      ++out[bin].cnt;
-    }
-  }
+#define ACC_GH(hist, i, g, h) \
+  const auto ti = (i) << 1; \
+  hist[ti] += g; \
+  hist[ti + 1] += h; \
+
+  template<bool use_indices, bool use_prefetch, bool use_hessians>
+  void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                               const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const {
+    data_size_t i = start;
+    if (use_prefetch) {
+      const data_size_t pf_offset = 64;
+      const data_size_t pf_end = end - pf_offset;
+      for (; i < pf_end; ++i) {
+        const auto idx = use_indices ? data_indices[i] : i;
+        const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
+        PREFETCH_T0(data_.data() + (pf_idx >> 1));
+        const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
+        if (use_hessians) {
+          ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
+        } else {
+          ACC_GH(out, bin, ordered_gradients[i], 1.0f);
+        }
+      }
+    }
+    for (; i < end; ++i) {
+      const auto idx = use_indices ? data_indices[i] : i;
+      const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
+      if (use_hessians) {
+        ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
+      } else {
+        ACC_GH(out, bin, ordered_gradients[i], 1.0f);
+      }
+    }
+  }
+#undef ACC_GH
+
+  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients, const score_t* ordered_hessians,
+                          hist_t* out) const override {
+    ConstructHistogramInner<true, true, true>(data_indices, start, end, ordered_gradients, ordered_hessians, out);
+  }
+
+  void ConstructHistogram(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients, const score_t* ordered_hessians,
+                          hist_t* out) const override {
+    ConstructHistogramInner<false, false, true>(nullptr, start, end, ordered_gradients, ordered_hessians, out);
+  }
+
+  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients,
+                          hist_t* out) const override {
+    ConstructHistogramInner<true, true, false>(data_indices, start, end, ordered_gradients, nullptr, out);
+  }
+
+  void ConstructHistogram(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients,
+                          hist_t* out) const override {
+    ConstructHistogramInner<false, false, false>(nullptr, start, end, ordered_gradients, nullptr, out);
+  }

  data_size_t Split(
@@ -266,8 +243,6 @@ class Dense4bitsBin : public Bin {
  data_size_t num_data() const override { return num_data_; }

-  /*! \brief not ordered bin for dense feature */
-  OrderedBin* CreateOrderedBin() const override { return nullptr; }
-
  void FinishLoad() override {
    if (buf_.empty()) { return; }
@@ -325,19 +300,20 @@ class Dense4bitsBin : public Bin {
  }

  size_t SizesInByte() const override {
    return sizeof(uint8_t) * data_.size();
  }

  Dense4bitsBin* Clone() override {
    return new Dense4bitsBin(*this);
  }

 protected:
-  Dense4bitsBin(const Dense4bitsBin& other)
-    : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {}
+  Dense4bitsBin(const Dense4bitsBin& other)
+    : num_data_(other.num_data_), data_(other.data_), buf_(other.buf_) {
+  }

  data_size_t num_data_;
-  std::vector<uint8_t> data_;
+  std::vector<uint8_t, Common::AlignmentAllocator<uint8_t, kAlignedSize>> data_;
  std::vector<uint8_t> buf_;
};
...
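Dense4bitsBin packs two bin values per byte, and the expression repeated throughout this hunk unpacks one nibble. A worked sketch:

    // Sketch: even indices occupy the low nibble, odd indices the high nibble
    // of data_[idx >> 1].
    uint8_t byte = 0xB7;  // holds bin 0x7 (even idx) and bin 0xB (odd idx)
    uint32_t bin_even = (byte >> ((0 & 1) << 2)) & 0xf;  // shift 0 -> 0x7
    uint32_t bin_odd  = (byte >> ((1 & 1) << 2)) & 0xf;  // shift 4 -> 0xB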
/*!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_
#define LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstdint>
#include <cstring>
#include <vector>
namespace LightGBM {
template <typename VAL_T>
class MultiValDenseBin : public MultiValBin {
public:
explicit MultiValDenseBin(data_size_t num_data, int num_bin, int num_feature)
: num_data_(num_data), num_bin_(num_bin), num_feature_(num_feature) {
data_.resize(static_cast<size_t>(num_data_) * num_feature_, static_cast<VAL_T>(0));
}
~MultiValDenseBin() {
}
data_size_t num_data() const override {
return num_data_;
}
int num_bin() const override {
return num_bin_;
}
void PushOneRow(int, data_size_t idx, const std::vector<uint32_t>& values) override {
auto start = RowPtr(idx);
CHECK(num_feature_ == static_cast<int>(values.size()));
for (auto i = 0; i < num_feature_; ++i) {
data_[start + i] = static_cast<VAL_T>(values[i]);
}
}
void FinishLoad() override {
}
bool IsSparse() override {
return false;
}
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
}
}
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, hist_t* out) const {
data_size_t i = start;
if (use_prefetch) {
const data_size_t pf_offset = 32 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(gradients + pf_idx);
if (use_hessians) {
PREFETCH_T0(hessians + pf_idx);
}
PREFETCH_T0(data_.data() + RowPtr(pf_idx));
const auto j_start = RowPtr(idx);
for (auto j = j_start; j < j_start + num_feature_; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
}
}
}
}
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto j_start = RowPtr(idx);
for (auto j = j_start; j < j_start + num_feature_; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
}
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, gradients, hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, gradients, hessians, out);
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out);
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const MultiValDenseBin<VAL_T>*>(full_bin);
data_.clear();
for (data_size_t i = 0; i < num_used_indices; ++i) {
for (int64_t j = other_bin->RowPtr(used_indices[i]); j < other_bin->RowPtr(used_indices[i] + 1); ++j) {
data_.push_back(other_bin->data_[j]);
}
}
}
inline int64_t RowPtr(data_size_t idx) const {
return static_cast<int64_t>(idx) * num_feature_;
}
MultiValDenseBin<VAL_T>* Clone() override;
private:
data_size_t num_data_;
int num_bin_;
int num_feature_;
std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, 32>> data_;
MultiValDenseBin<VAL_T>(const MultiValDenseBin<VAL_T>& other)
: num_data_(other.num_data_), num_bin_(other.num_bin_), num_feature_(other.num_feature_), data_(other.data_) {
}
};
template<typename VAL_T>
MultiValDenseBin<VAL_T>* MultiValDenseBin<VAL_T>::Clone() {
return new MultiValDenseBin<VAL_T>(*this);
}
} // namespace LightGBM
#endif // LIGHTGBM_IO_MULTI_VAL_DENSE_BIN_HPP_
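MultiValDenseBin lays the bundled features out row-major, one VAL_T per (row, feature), so RowPtr(idx) is simply idx * num_feature_. A minimal usage sketch built only from the members shown above (values illustrative):

    // Sketch: pushing two rows into a 2-feature multi-value dense bin.
    MultiValDenseBin<uint8_t> bin(/*num_data=*/3, /*num_bin=*/16, /*num_feature=*/2);
    bin.PushOneRow(/*tid=*/0, /*idx=*/0, {1, 5});  // stored at data_[0..1]
    bin.PushOneRow(/*tid=*/0, /*idx=*/1, {0, 9});  // stored at data_[2..3]
    // RowPtr(1) == 1 * 2 == 2, so row 1's bins are data_[2] and data_[3].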
/*!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstdint>
#include <cstring>
#include <vector>
namespace LightGBM {
template <typename VAL_T>
class MultiValSparseBin : public MultiValBin {
public:
explicit MultiValSparseBin(data_size_t num_data, int num_bin)
: num_data_(num_data), num_bin_(num_bin) {
row_ptr_.resize(num_data_ + 1, 0);
data_.reserve(num_data_);
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
if (num_threads > 1) {
t_data_.resize(num_threads - 1);
}
}
~MultiValSparseBin() {
}
data_size_t num_data() const override {
return num_data_;
}
int num_bin() const override {
return num_bin_;
}
void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) override {
row_ptr_[idx + 1] = static_cast<data_size_t>(values.size());
if (tid == 0) {
for (auto val : values) {
data_.push_back(static_cast<VAL_T>(val));
}
} else {
for (auto val : values) {
t_data_[tid - 1].push_back(static_cast<VAL_T>(val));
}
}
}
void FinishLoad() override {
for (data_size_t i = 0; i < num_data_; ++i) {
row_ptr_[i + 1] += row_ptr_[i];
}
if (t_data_.size() > 0) {
size_t offset = data_.size();
data_.resize(row_ptr_[num_data_]);
for (size_t tid = 0; tid < t_data_.size(); ++tid) {
std::memcpy(data_.data() + offset, t_data_[tid].data(), t_data_[tid].size() * sizeof(VAL_T));
offset += t_data_[tid].size();
t_data_[tid].clear();
}
}
row_ptr_.shrink_to_fit();
data_.shrink_to_fit();
t_data_.clear();
t_data_.shrink_to_fit();
}
bool IsSparse() override {
return true;
}
void ReSize(data_size_t num_data) override {
if (num_data_ != num_data) {
num_data_ = num_data;
}
}
#define ACC_GH(hist, i, g, h) \
const auto ti = static_cast<int>(i) << 1; \
hist[ti] += g; \
hist[ti + 1] += h; \
template<bool use_indices, bool use_prefetch, bool use_hessians>
void ConstructHistogramInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians, hist_t* out) const {
data_size_t i = start;
if (use_prefetch) {
const data_size_t pf_offset = 32 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
PREFETCH_T0(gradients + pf_idx);
if (use_hessians) {
PREFETCH_T0(hessians + pf_idx);
}
PREFETCH_T0(row_ptr_.data() + pf_idx);
PREFETCH_T0(data_.data() + row_ptr_[pf_idx]);
const auto j_start = RowPtr(idx);
const auto j_end = RowPtr(idx + 1);
for (auto j = j_start; j < j_end; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
}
}
}
}
for (; i < end; ++i) {
const auto idx = use_indices ? data_indices[i] : i;
const auto j_start = RowPtr(idx);
const auto j_end = RowPtr(idx + 1);
for (auto j = j_start; j < j_end; ++j) {
const VAL_T bin = data_[j];
if (use_hessians) {
ACC_GH(out, bin, gradients[idx], hessians[idx]);
} else {
ACC_GH(out, bin, gradients[idx], 1.0f);
}
}
}
}
#undef ACC_GH
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<true, true, true>(data_indices, start, end, gradients, hessians, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const override {
ConstructHistogramInner<false, false, true>(nullptr, start, end, gradients, hessians, out);
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<true, true, false>(data_indices, start, end, gradients, nullptr, out);
}
void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients,
hist_t* out) const override {
ConstructHistogramInner<false, false, false>(nullptr, start, end, gradients, nullptr, out);
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = dynamic_cast<const MultiValSparseBin<VAL_T>*>(full_bin);
row_ptr_.resize(num_data_ + 1, 0);
data_.clear();
for (data_size_t i = 0; i < num_used_indices; ++i) {
for (data_size_t j = other_bin->row_ptr_[used_indices[i]]; j < other_bin->row_ptr_[used_indices[i] + 1]; ++j) {
data_.push_back(other_bin->data_[j]);
}
row_ptr_[i + 1] = row_ptr_[i] + other_bin->row_ptr_[used_indices[i] + 1] - other_bin->row_ptr_[used_indices[i]];
}
}
inline data_size_t RowPtr(data_size_t idx) const {
return row_ptr_[idx];
}
MultiValSparseBin<VAL_T>* Clone() override;
private:
data_size_t num_data_;
int num_bin_;
std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, 32>> data_;
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, 32>> row_ptr_;
std::vector<std::vector<VAL_T>> t_data_;
MultiValSparseBin<VAL_T>(const MultiValSparseBin<VAL_T>& other)
: num_data_(other.num_data_), num_bin_(other.num_bin_), data_(other.data_), row_ptr_(other.row_ptr_) {
}
};
template<typename VAL_T>
MultiValSparseBin<VAL_T>* MultiValSparseBin<VAL_T>::Clone() {
return new MultiValSparseBin<VAL_T>(*this);
}
} // namespace LightGBM
#endif // LIGHTGBM_IO_MULTI_VAL_SPARSE_BIN_HPP_
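MultiValSparseBin is a CSR-style layout built in two phases: PushOneRow records each row's length in row_ptr_ (buffering rows from worker threads in t_data_), and FinishLoad converts the lengths to prefix sums and concatenates the thread buffers. A worked sketch of the prefix-sum step; note the concatenation is only consistent because each thread pushes a contiguous, ordered block of rows, which the data loader (not this class) guarantees:

    // Sketch: three rows with 2, 0, and 3 non-zero bins respectively.
    std::vector<data_size_t> row_ptr = {0, 2, 0, 3};  // lengths from PushOneRow
    for (size_t i = 0; i + 1 < row_ptr.size(); ++i) {
      row_ptr[i + 1] += row_ptr[i];                   // FinishLoad's prefix-sum loop
    }
    // row_ptr == {0, 2, 2, 5}; row i's bins live in data_[row_ptr[i] .. row_ptr[i+1])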
/*!
* Copyright (c) 2016 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
#include <LightGBM/bin.h>
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <mutex>
#include <utility>
#include <vector>
#include "sparse_bin.hpp"
namespace LightGBM {
/*!
* \brief Interface for ordered bin data; efficient for histogram construction, especially for sparse bins.
* Using an ordered bin has two advantages:
* 1. it groups the data by leaf, improving cache hits;
* 2. it stores only the non-zero bins, which speeds up histogram construction for sparse features.
* However, it brings an additional cost: the bins need to be re-ordered after every split, which is expensive for dense features.
* So we only use ordered bins in sparse situations.
*/
template <typename VAL_T>
class OrderedSparseBin: public OrderedBin {
public:
/*! \brief Pair to store one bin entry */
struct SparsePair {
data_size_t ridx; // data(row) index
VAL_T bin; // bin for this data
SparsePair() : ridx(0), bin(0) {}
};
explicit OrderedSparseBin(const SparseBin<VAL_T>* bin_data)
:bin_data_(bin_data) {
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
int non_zero_cnt = 0;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
++non_zero_cnt;
}
ordered_pair_.resize(non_zero_cnt);
leaf_cnt_.push_back(non_zero_cnt);
}
~OrderedSparseBin() {
}
void Init(const char* used_idices, int num_leaves) override {
// initialize the leaf information
leaf_start_ = std::vector<data_size_t>(num_leaves, 0);
leaf_cnt_ = std::vector<data_size_t>(num_leaves, 0);
if (used_idices == nullptr) {
// if using all data, copy all non-zero pair
data_size_t j = 0;
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
ordered_pair_[j].ridx = cur_pos;
ordered_pair_[j].bin = bin_data_->vals_[i_delta];
++j;
}
leaf_cnt_[0] = static_cast<data_size_t>(j);
} else {
// if using part of data(bagging)
data_size_t j = 0;
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
if (used_idices[cur_pos]) {
ordered_pair_[j].ridx = cur_pos;
ordered_pair_[j].bin = bin_data_->vals_[i_delta];
++j;
}
}
leaf_cnt_[0] = j;
}
}
void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian,
HistogramBinEntry* out) const override {
// get current leaf boundary
const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf];
for (data_size_t i = start; i < end; ++i) {
const VAL_T bin = ordered_pair_[i].bin;
const auto g = gradient[ordered_pair_[i].ridx];
const auto h = hessian[ordered_pair_[i].ridx];
out[bin].sum_gradients += g;
out[bin].sum_hessians += h;
++out[bin].cnt;
}
}
void ConstructHistogram(int leaf, const score_t* gradient,
HistogramBinEntry* out) const override {
// get current leaf boundary
const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf];
for (data_size_t i = start; i < end; ++i) {
const VAL_T bin = ordered_pair_[i].bin;
const auto g = gradient[ordered_pair_[i].ridx];
out[bin].sum_gradients += g;
++out[bin].cnt;
}
}
void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) override {
// get current leaf boundary
const data_size_t l_start = leaf_start_[leaf];
const data_size_t l_end = l_start + leaf_cnt_[leaf];
// new left leaf end after split
data_size_t new_left_end = l_start;
for (data_size_t i = l_start; i < l_end; ++i) {
if (is_in_leaf[ordered_pair_[i].ridx] == mark) {
std::swap(ordered_pair_[new_left_end], ordered_pair_[i]);
++new_left_end;
}
}
leaf_start_[right_leaf] = new_left_end;
leaf_cnt_[leaf] = new_left_end - l_start;
leaf_cnt_[right_leaf] = l_end - new_left_end;
}
data_size_t NonZeroCount(int leaf) const override {
return static_cast<data_size_t>(leaf_cnt_[leaf]);
}
/*! \brief Disable copy */
OrderedSparseBin<VAL_T>& operator=(const OrderedSparseBin<VAL_T>&) = delete;
/*! \brief Disable copy */
OrderedSparseBin<VAL_T>(const OrderedSparseBin<VAL_T>&) = delete;
private:
const SparseBin<VAL_T>* bin_data_;
/*! \brief Stores the non-zero pairs, grouped by leaf */
std::vector<SparsePair> ordered_pair_;
/*! \brief leaf_start_[i] is the start position of the data in the i-th leaf */
std::vector<data_size_t> leaf_start_;
/*! \brief leaf_cnt_[i] is the number of data points in the i-th leaf */
std::vector<data_size_t> leaf_cnt_;
};
template <typename VAL_T>
OrderedBin* SparseBin<VAL_T>::CreateOrderedBin() const {
return new OrderedSparseBin<VAL_T>(this);
}
} // namespace LightGBM
#endif  // LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
@@ -24,7 +24,7 @@ const size_t kNumFastIndex = 64;
template <typename VAL_T>
class SparseBinIterator: public BinIterator {
 public:
  SparseBinIterator(const SparseBin<VAL_T>* bin_data,
                    uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin)
    : bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
@@ -56,7 +56,7 @@ class SparseBinIterator: public BinIterator {
  inline void Reset(data_size_t idx) override;

 private:
  const SparseBin<VAL_T>* bin_data_;
  data_size_t cur_pos_;
  data_size_t i_delta_;
@@ -66,20 +66,16 @@ class SparseBinIterator: public BinIterator {
  uint8_t offset_;
};

-template <typename VAL_T>
-class OrderedSparseBin;
-
template <typename VAL_T>
class SparseBin: public Bin {
 public:
  friend class SparseBinIterator<VAL_T>;
-  friend class OrderedSparseBin<VAL_T>;

  explicit SparseBin(data_size_t num_data)
    : num_data_(num_data) {
    int num_threads = 1;
    #pragma omp parallel
    #pragma omp master
    {
      num_threads = omp_get_num_threads();
    }
@@ -102,41 +98,97 @@ class SparseBin: public Bin {
  BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t most_freq_bin) const override;

-  void ConstructHistogram(const data_size_t*, data_size_t, data_size_t, const score_t*,
-                          const score_t*, HistogramBinEntry*) const override {
-    // Will use OrderedSparseBin->ConstructHistogram() instead
-    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
-  }
-
-  void ConstructHistogram(data_size_t, data_size_t, const score_t*,
-                          const score_t*, HistogramBinEntry*) const override {
-    // Will use OrderedSparseBin->ConstructHistogram() instead
-    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
-  }
-
-  void ConstructHistogram(const data_size_t*, data_size_t, data_size_t, const score_t*,
-                          HistogramBinEntry*) const override {
-    // Will use OrderedSparseBin->ConstructHistogram() instead
-    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
-  }
-
-  void ConstructHistogram(data_size_t, data_size_t, const score_t*,
-                          HistogramBinEntry*) const override {
-    // Will use OrderedSparseBin->ConstructHistogram() instead
-    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
-  }
+#define ACC_GH(hist, i, g, h) \
+  const auto ti = static_cast<int>(i) << 1; \
+  hist[ti] += g; \
+  hist[ti + 1] += h; \
+
+  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients, const score_t* ordered_hessians,
+                          hist_t* out) const override {
+    data_size_t i_delta, cur_pos;
+    InitIndex(data_indices[start], &i_delta, &cur_pos);
+    data_size_t i = start;
+    for (;;) {
+      if (cur_pos < data_indices[i]) {
+        cur_pos += deltas_[++i_delta];
+        if (i_delta >= num_vals_) { break; }
+      } else if (cur_pos > data_indices[i]) {
+        if (++i >= end) { break; }
+      } else {
+        const VAL_T bin = vals_[i_delta];
+        ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
+        if (++i >= end) { break; }
+        cur_pos += deltas_[++i_delta];
+        if (i_delta >= num_vals_) { break; }
+      }
+    }
+  }
+
+  void ConstructHistogram(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients, const score_t* ordered_hessians,
+                          hist_t* out) const override {
+    data_size_t i_delta, cur_pos;
+    InitIndex(start, &i_delta, &cur_pos);
+    while (cur_pos < start && i_delta < num_vals_) {
+      cur_pos += deltas_[++i_delta];
+    }
+    while (cur_pos < end && i_delta < num_vals_) {
+      const VAL_T bin = vals_[i_delta];
+      ACC_GH(out, bin, ordered_gradients[cur_pos], ordered_hessians[cur_pos]);
+      cur_pos += deltas_[++i_delta];
+    }
+  }
+
+  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients,
+                          hist_t* out) const override {
+    data_size_t i_delta, cur_pos;
+    InitIndex(data_indices[start], &i_delta, &cur_pos);
+    data_size_t i = start;
+    for (;;) {
+      if (cur_pos < data_indices[i]) {
+        cur_pos += deltas_[++i_delta];
+        if (i_delta >= num_vals_) { break; }
+      } else if (cur_pos > data_indices[i]) {
+        if (++i >= end) { break; }
+      } else {
+        const VAL_T bin = vals_[i_delta];
+        ACC_GH(out, bin, ordered_gradients[i], 1.0f);
+        if (++i >= end) { break; }
+        cur_pos += deltas_[++i_delta];
+        if (i_delta >= num_vals_) { break; }
+      }
+    }
+  }
+
+  void ConstructHistogram(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients,
+                          hist_t* out) const override {
+    data_size_t i_delta, cur_pos;
+    InitIndex(start, &i_delta, &cur_pos);
+    while (cur_pos < start && i_delta < num_vals_) {
+      cur_pos += deltas_[++i_delta];
+    }
+    while (cur_pos < end && i_delta < num_vals_) {
+      const VAL_T bin = vals_[i_delta];
+      ACC_GH(out, bin, ordered_gradients[cur_pos], 1.0f);
+      cur_pos += deltas_[++i_delta];
+    }
+  }
+#undef ACC_GH
+
+  inline void NextNonzeroFast(data_size_t* i_delta,
+                              data_size_t* cur_pos) const {
+    *cur_pos += deltas_[++(*i_delta)];
+    if (*i_delta >= num_vals_) {
+      *cur_pos = num_data_;
+    }
+  }

-  inline bool NextNonzero(data_size_t* i_delta,
-                          data_size_t* cur_pos) const {
-    ++(*i_delta);
-    data_size_t shift = 0;
-    data_size_t delta = deltas_[*i_delta];
-    while (*i_delta < num_vals_ && vals_[*i_delta] == 0) {
-      ++(*i_delta);
-      shift += 8;
-      delta |= static_cast<data_size_t>(deltas_[*i_delta]) << shift;
-    }
-    *cur_pos += delta;
+  inline bool NextNonzero(data_size_t* i_delta,
+                          data_size_t* cur_pos) const {
+    *cur_pos += deltas_[++(*i_delta)];
    if (*i_delta < num_vals_) {
      return true;
    } else {
@@ -257,8 +309,6 @@ class SparseBin: public Bin {
  data_size_t num_data() const override { return num_data_; }

-  OrderedBin* CreateOrderedBin() const override;
-
  void FinishLoad() override {
    // get total non zero size
    size_t pair_cnt = 0;
@@ -276,8 +326,8 @@ class SparseBin: public Bin {
    // sort by data index
    std::sort(idx_val_pairs.begin(), idx_val_pairs.end(),
              [](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) {
                return a.first < b.first;
              });
    // load delta array
    LoadFromPair(idx_val_pairs);
  }
@@ -291,11 +341,12 @@ class SparseBin: public Bin {
      const data_size_t cur_idx = idx_val_pairs[i].first;
      const VAL_T bin = idx_val_pairs[i].second;
      data_size_t cur_delta = cur_idx - last_idx;
+      // disallow the multi-val in one row
      if (i > 0 && cur_delta == 0) { continue; }
      while (cur_delta >= 256) {
-        deltas_.push_back(cur_delta & 0xff);
+        deltas_.push_back(255);
        vals_.push_back(0);
-        cur_delta >>= 8;
+        cur_delta -= 255;
      }
      deltas_.push_back(static_cast<uint8_t>(cur_delta));
      vals_.push_back(bin);
@@ -384,7 +435,7 @@ class SparseBin: public Bin {
      while (cur_pos < idx && j < num_vals_) {
        NextNonzero(&j, &cur_pos);
      }
-      if (cur_pos == idx && j < num_vals_) {
+      if (cur_pos == idx && j < num_vals_ && vals_[j] > 0) {
        // new row index is i
        tmp_pair.emplace_back(i, vals_[j]);
      }
@@ -405,13 +456,13 @@ class SparseBin: public Bin {
    // transform to delta array
    data_size_t last_idx = 0;
    for (data_size_t i = 0; i < num_used_indices; ++i) {
-      VAL_T bin = iterator.InnerRawGet(used_indices[i]);
+      auto bin = iterator.InnerRawGet(used_indices[i]);
      if (bin > 0) {
        data_size_t cur_delta = i - last_idx;
        while (cur_delta >= 256) {
-          deltas_.push_back(cur_delta & 0xff);
+          deltas_.push_back(255);
          vals_.push_back(0);
-          cur_delta >>= 8;
+          cur_delta -= 255;
        }
        deltas_.push_back(static_cast<uint8_t>(cur_delta));
        vals_.push_back(bin);
@@ -432,15 +483,29 @@ class SparseBin: public Bin {
  SparseBin<VAL_T>* Clone() override;

+ protected:
-  SparseBin<VAL_T>(const SparseBin<VAL_T>& other)
-    : num_data_(other.num_data_), deltas_(other.deltas_), vals_(other.vals_),
-      num_vals_(other.num_vals_), push_buffers_(other.push_buffers_),
-      fast_index_(other.fast_index_), fast_index_shift_(other.fast_index_shift_) {}
+  SparseBin<VAL_T>(const SparseBin<VAL_T>& other)
+    : num_data_(other.num_data_), deltas_(other.deltas_), vals_(other.vals_),
+      num_vals_(other.num_vals_), push_buffers_(other.push_buffers_),
+      fast_index_(other.fast_index_), fast_index_shift_(other.fast_index_shift_) {
+  }
+
+  void InitIndex(data_size_t start_idx, data_size_t* i_delta, data_size_t* cur_pos) const {
+    auto idx = start_idx >> fast_index_shift_;
+    if (static_cast<size_t>(idx) < fast_index_.size()) {
+      const auto fast_pair = fast_index_[start_idx >> fast_index_shift_];
+      *i_delta = fast_pair.first;
+      *cur_pos = fast_pair.second;
+    } else {
+      *i_delta = -1;
+      *cur_pos = 0;
+    }
+  }

+ private:
  data_size_t num_data_;
-  std::vector<uint8_t> deltas_;
-  std::vector<VAL_T> vals_;
+  std::vector<uint8_t, Common::AlignmentAllocator<uint8_t, kAlignedSize>> deltas_;
+  std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, kAlignedSize>> vals_;
  data_size_t num_vals_;
  std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_;
  std::vector<std::pair<data_size_t, data_size_t>> fast_index_;
@@ -460,7 +525,7 @@ inline uint32_t SparseBinIterator<VAL_T>::RawGet(data_size_t idx) {
template <typename VAL_T>
inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(data_size_t idx) {
  while (cur_pos_ < idx) {
-    bin_data_->NextNonzero(&i_delta_, &cur_pos_);
+    bin_data_->NextNonzeroFast(&i_delta_, &cur_pos_);
  }
  if (cur_pos_ == idx) {
    return bin_data_->vals_[i_delta_];
@@ -471,15 +536,7 @@ inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(data_size_t idx) {
template <typename VAL_T>
inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
-  auto idx = start_idx >> bin_data_->fast_index_shift_;
-  if (static_cast<size_t>(idx) < bin_data_->fast_index_.size()) {
-    const auto fast_pair = bin_data_->fast_index_[start_idx >> bin_data_->fast_index_shift_];
-    i_delta_ = fast_pair.first;
-    cur_pos_ = fast_pair.second;
-  } else {
-    i_delta_ = -1;
-    cur_pos_ = 0;
-  }
+  bin_data_->InitIndex(start_idx, &i_delta_, &cur_pos_);
}

template <typename VAL_T>
...
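This hunk also swaps SparseBin's gap encoding. The old scheme split a large row gap into little-endian bytes that NextNonzero reassembled with a shift loop; the new scheme stores runs of 255-step padding entries (with val 0), so NextNonzeroFast can consume exactly one (delta, val) pair per step, and the new `vals_[j] > 0` guard skips padding when reading values. A worked sketch, encoding a gap of 600 rows with the new loop from LoadFromPair:

    data_size_t cur_delta = 600;
    std::vector<uint8_t> deltas;
    std::vector<uint8_t> vals;
    while (cur_delta >= 256) {   // run-length padding entries, val 0
      deltas.push_back(255);
      vals.push_back(0);
      cur_delta -= 255;
    }
    deltas.push_back(static_cast<uint8_t>(cur_delta));  // remainder: 90
    vals.push_back(/*bin=*/7);
    // deltas == {255, 255, 90}, vals == {0, 0, 7}; the removed scheme stored
    // {0x58, 0x02} / {0, 7} and reassembled 0x258 == 600 with the shift loop.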
@@ -73,9 +73,9 @@ class RankXENDCG: public ObjectiveFunction {
    // Skip query if sum of labels is 0.
    float sum_labels = 0;
    for (data_size_t i = 0; i < cnt; ++i) {
-      sum_labels += phi(label[i], gammas[i]);
+      sum_labels += static_cast<float>(phi(label[i], gammas[i]));
    }
-    if (sum_labels == 0) {
+    if (std::fabs(sum_labels) < kEpsilon) {
      return;
    }
@@ -111,7 +111,7 @@ class RankXENDCG: public ObjectiveFunction {
  }

  double phi(const label_t l, double g) const {
-    return Common::Pow(2, l) - g;
+    return Common::Pow(2, static_cast<int>(l)) - g;
  }

  const char* GetName() const override {
...
@@ -27,7 +27,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, bool is_constant_hessian) {
   rank_ = Network::rank();
   num_machines_ = Network::num_machines();
   // allocate buffer for communication
-  size_t buffer_size = this->train_data_->NumTotalBin() * sizeof(HistogramBinEntry);
+  size_t buffer_size = this->train_data_->NumTotalBin() * KHistEntrySize;
   input_buffer_.resize(buffer_size);
   output_buffer_.resize(buffer_size);
@@ -82,7 +82,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
       if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
         num_bin -= 1;
       }
-      block_len_[i] += num_bin * sizeof(HistogramBinEntry);
+      block_len_[i] += num_bin * KHistEntrySize;
     }
     reduce_scatter_size_ += block_len_[i];
   }
@@ -101,7 +101,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
       if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
        num_bin -= 1;
       }
-      bin_size += num_bin * sizeof(HistogramBinEntry);
+      bin_size += num_bin * KHistEntrySize;
     }
   }
@@ -113,7 +113,7 @@ void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
     if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
       num_bin -= 1;
     }
-    bin_size += num_bin * sizeof(HistogramBinEntry);
+    bin_size += num_bin * KHistEntrySize;
   }
   // sync global data sumup info
@@ -158,8 +158,8 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
       this->smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
   }
   // Reduce scatter for histogram
-  Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(),
-    block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramBinEntry::SumReducer);
+  Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(),
+    block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramSumReducer);
   this->FindBestSplitsFromHistograms(this->is_feature_used_, true);
 }
@@ -186,7 +186,6 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
     this->train_data_->FixHistogram(feature_index,
       this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(),
-      GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()),
       this->smaller_leaf_histogram_array_[feature_index].RawData());
     SplitInfo smaller_split;
     // find best threshold for smaller child
...
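The communication buffers are now sized in flat hist_t slots (one gradient and one hessian per bin) rather than structs, and reduce-scatter merges them element-wise. A hedged sketch of what such a sum reducer amounts to; this standalone version mirrors the shape of the callback but is not the library's HistogramSumReducer:

#include <cstddef>

// Buffers are raw bytes holding interleaved (gradient, hessian) doubles;
// merging histograms from two machines is plain element-wise addition.
typedef double hist_t;
const size_t kHistEntrySize = 2 * sizeof(hist_t);  // grad + hess per bin

void SumHistBuffers(const char* src, char* dst, size_t len_bytes) {
  const hist_t* s = reinterpret_cast<const hist_t*>(src);
  hist_t* d = reinterpret_cast<hist_t*>(dst);
  const size_t num_bins = len_bytes / kHistEntrySize;
  for (size_t i = 0; i < num_bins * 2; ++i) {
    d[i] += s[i];  // gradient and hessian slots sum independently
  }
}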
@@ -108,58 +108,70 @@ class DataPartition {
   * \param threshold threshold that want to split
   * \param right_leaf index of right leaf
   */
-  void Split(int leaf, const Dataset* dataset, int feature, const uint32_t* threshold, int num_threshold, bool default_left, int right_leaf) {
+  void Split(int leaf, const Dataset* dataset, int feature,
+             const uint32_t* threshold, int num_threshold, bool default_left,
+             int right_leaf) {
+    Common::FunctionTimer fun_timer("DataPartition::Split", global_timer);
     const data_size_t min_inner_size = 512;
     // get leaf boundary
     const data_size_t begin = leaf_begin_[leaf];
     const data_size_t cnt = leaf_count_[leaf];
-    data_size_t inner_size = (cnt + num_threads_ - 1) / num_threads_;
-    if (inner_size < min_inner_size) { inner_size = min_inner_size; }
+    const int nblock =
+        std::min(num_threads_, (cnt + min_inner_size - 1) / min_inner_size);
+    data_size_t inner_size = SIZE_ALIGNED((cnt + nblock - 1) / nblock);
+    auto left_start = indices_.data() + begin;
+    global_timer.Start("DataPartition::Split.MT");
     // split data multi-threading
     OMP_INIT_EX();
 #pragma omp parallel for schedule(static, 1)
-    for (int i = 0; i < num_threads_; ++i) {
+    for (int i = 0; i < nblock; ++i) {
       OMP_LOOP_EX_BEGIN();
-      left_cnts_buf_[i] = 0;
-      right_cnts_buf_[i] = 0;
       data_size_t cur_start = i * inner_size;
-      if (cur_start > cnt) { continue; }
-      data_size_t cur_cnt = inner_size;
-      if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
+      data_size_t cur_cnt = std::min(inner_size, cnt - cur_start);
+      if (cur_cnt <= 0) {
+        left_cnts_buf_[i] = 0;
+        right_cnts_buf_[i] = 0;
+        continue;
+      }
       // split data inner, reduce the times of function called
-      data_size_t cur_left_count = dataset->Split(feature, threshold, num_threshold, default_left, indices_.data() + begin + cur_start, cur_cnt,
-        temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
+      data_size_t cur_left_count =
+          dataset->Split(feature, threshold, num_threshold, default_left,
+                         left_start + cur_start, cur_cnt,
+                         temp_left_indices_.data() + cur_start,
+                         temp_right_indices_.data() + cur_start);
       offsets_buf_[i] = cur_start;
       left_cnts_buf_[i] = cur_left_count;
       right_cnts_buf_[i] = cur_cnt - cur_left_count;
       OMP_LOOP_EX_END();
     }
     OMP_THROW_EX();
-    data_size_t left_cnt = 0;
+    global_timer.Stop("DataPartition::Split.MT");
+    global_timer.Start("DataPartition::Split.Merge");
     left_write_pos_buf_[0] = 0;
     right_write_pos_buf_[0] = 0;
-    for (int i = 1; i < num_threads_; ++i) {
-      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
-      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
+    for (int i = 1; i < nblock; ++i) {
+      left_write_pos_buf_[i] =
+          left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
+      right_write_pos_buf_[i] =
+          right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
     }
-    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
-    // copy back indices of right leaf to indices_
-#pragma omp parallel for schedule(static, 1)
-    for (int i = 0; i < num_threads_; ++i) {
-      if (left_cnts_buf_[i] > 0) {
-        std::memcpy(indices_.data() + begin + left_write_pos_buf_[i],
-          temp_left_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
-      }
-      if (right_cnts_buf_[i] > 0) {
-        std::memcpy(indices_.data() + begin + left_cnt + right_write_pos_buf_[i],
-          temp_right_indices_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
-      }
+    data_size_t left_cnt =
+        left_write_pos_buf_[nblock - 1] + left_cnts_buf_[nblock - 1];
+    auto right_start = left_start + left_cnt;
+#pragma omp parallel for schedule(static)
+    for (int i = 0; i < nblock; ++i) {
+      std::copy_n(temp_left_indices_.data() + offsets_buf_[i],
+                  left_cnts_buf_[i], left_start + left_write_pos_buf_[i]);
+      std::copy_n(temp_right_indices_.data() + offsets_buf_[i],
+                  right_cnts_buf_[i], right_start + right_write_pos_buf_[i]);
     }
     // update leaf boundary
     leaf_count_[leaf] = left_cnt;
     leaf_begin_[right_leaf] = left_cnt + begin;
     leaf_count_[right_leaf] = cnt - left_cnt;
+    global_timer.Stop("DataPartition::Split.Merge");
   }
   /*!
@@ -201,11 +213,11 @@ class DataPartition {
   /*! \brief number of data on one leaf */
   std::vector<data_size_t> leaf_count_;
   /*! \brief Store all data's indices, order by leaf[data_in_leaf0,..,data_leaf1,..] */
-  std::vector<data_size_t> indices_;
+  std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> indices_;
   /*! \brief team indices buffer for split */
-  std::vector<data_size_t> temp_left_indices_;
+  std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> temp_left_indices_;
   /*! \brief team indices buffer for split */
-  std::vector<data_size_t> temp_right_indices_;
+  std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> temp_right_indices_;
   /*! \brief used data indices, used for bagging */
   const data_size_t* used_data_indices_;
   /*! \brief used data count, used for bagging */
...
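The refactored Split is a classic two-phase parallel partition: each block stably partitions its slice into private buffers, exclusive prefix sums over the per-block counts assign disjoint write ranges, and the copy-back then needs no synchronization. A sequential sketch of that scheme, with placeholder data and predicate:

#include <algorithm>
#include <vector>

// Standalone sketch of the merge scheme used by DataPartition::Split above.
// The predicate stands in for the real bin-threshold test.
std::vector<int> ParallelStylePartition(const std::vector<int>& idx,
                                        int nblock, bool (*go_left)(int)) {
  const int n = static_cast<int>(idx.size());
  const int inner = (n + nblock - 1) / nblock;
  std::vector<std::vector<int>> lefts(nblock), rights(nblock);
  for (int b = 0; b < nblock; ++b) {  // "parallel" phase: one slice per block
    const int lo = b * inner, hi = std::min(n, lo + inner);
    for (int i = lo; i < hi; ++i) {
      (go_left(idx[i]) ? lefts[b] : rights[b]).push_back(idx[i]);
    }
  }
  // exclusive prefix sums -> a private write offset per block
  std::vector<int> lpos(nblock, 0), rpos(nblock, 0);
  for (int b = 1; b < nblock; ++b) {
    lpos[b] = lpos[b - 1] + static_cast<int>(lefts[b - 1].size());
    rpos[b] = rpos[b - 1] + static_cast<int>(rights[b - 1].size());
  }
  const int left_cnt = lpos[nblock - 1] + static_cast<int>(lefts[nblock - 1].size());
  std::vector<int> out(n);
  for (int b = 0; b < nblock; ++b) {  // copy-back, trivially parallelizable
    std::copy(lefts[b].begin(), lefts[b].end(), out.begin() + lpos[b]);
    std::copy(rights[b].begin(), rights[b].end(), out.begin() + left_cnt + rpos[b]);
  }
  return out;  // [all left indices..., all right indices...]
}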
@@ -49,15 +49,15 @@ void GPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) {
 // some functions used for debugging the GPU histogram construction
 #if GPU_DEBUG > 0
-void PrintHistograms(HistogramBinEntry* h, size_t size) {
-  size_t total = 0;
+void PrintHistograms(hist_t* h, size_t size) {
+  double total_hess = 0;
   for (size_t i = 0; i < size; ++i) {
-    printf("%03lu=%9.3g,%9.3g,%7d\t", i, h[i].sum_gradients, h[i].sum_hessians, h[i].cnt);
-    total += h[i].cnt;
-    if ((i & 3) == 3)
+    printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i));
+    if ((i & 2) == 2)
       printf("\n");
+    total_hess += GET_HESS(h, i);
   }
-  printf("\nTotal examples: %lu\n", total);
+  printf("\nSum hessians: %9.3g\n", total_hess);
 }
 union Float_t {
@@ -69,27 +69,23 @@ union Float_t {
 };
-void CompareHistograms(HistogramBinEntry* h1, HistogramBinEntry* h2, size_t size, int feature_id) {
+void CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id) {
   size_t i;
   Float_t a, b;
   for (i = 0; i < size; ++i) {
-    a.f = h1[i].sum_gradients;
-    b.f = h2[i].sum_gradients;
+    a.f = GET_GRAD(h1, i);
+    b.f = GET_GRAD(h2, i);
     int32_t ulps = Float_t::ulp_diff(a, b);
-    if (fabs(h1[i].cnt - h2[i].cnt != 0)) {
-      printf("%d != %d\n", h1[i].cnt, h2[i].cnt);
-      goto err;
-    }
     if (ulps > 0) {
-      // printf("grad %g != %g (%d ULPs)\n", h1[i].sum_gradients, h2[i].sum_gradients, ulps);
+      // printf("grad %g != %g (%d ULPs)\n", GET_GRAD(h1, i), GET_GRAD(h2, i), ulps);
       // goto err;
     }
-    a.f = h1[i].sum_hessians;
-    b.f = h2[i].sum_hessians;
+    a.f = GET_HESS(h1, i);
+    b.f = GET_HESS(h2, i);
     ulps = Float_t::ulp_diff(a, b);
-    if (ulps > 0) {
-      // printf("hessian %g != %g (%d ULPs)\n", h1[i].sum_hessians, h2[i].sum_hessians, ulps);
-      // goto err;
+    if (std::fabs(a.f - b.f) >= 1e-20) {
+      printf("hessian %g != %g (%d ULPs)\n", GET_HESS(h1, i), GET_HESS(h2, i), ulps);
+      goto err;
     }
   }
   return;
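CompareHistograms relies on the bit-punning Float_t union to measure closeness in ULPs. Since the union body itself is elided from this diff, here is a standalone sketch of the technique as commonly written; treat it as illustrative rather than the file's exact code:

#include <cstdint>
#include <cstdlib>

// ULP-based float comparison: two finite floats of the same sign are
// "close" when their bit patterns, read as integers, differ by only a few
// units in the last place (ULPs).
union FloatBits {
  float f;
  int32_t i;
};

int32_t UlpDiff(float x, float y) {
  FloatBits a, b;
  a.f = x;
  b.f = y;
  if ((a.i < 0) != (b.i < 0)) {  // different signs: equal only if both zero
    return (x == y) ? 0 : INT32_MAX;
  }
  return std::abs(a.i - b.i);
}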
@@ -191,7 +187,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) {
 }
 template <typename HistType>
-void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
+void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) {
   HistType* hist_outputs = reinterpret_cast<HistType*>(host_histogram_outputs_);
   // when the output is ready, the computation is done
   histograms_wait_obj_.wait();
@@ -201,29 +197,25 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
       continue;
     }
     int dense_group_index = dense_feature_group_map_[i];
-    auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index);
+    auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2;
     int bin_size = train_data_->FeatureGroupNumBin(dense_group_index);
     if (device_bin_mults_[i] == 1) {
       for (int j = 0; j < bin_size; ++j) {
-        old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients;
-        old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians;
-        old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt;
+        GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ j);
+        GET_HESS(old_histogram_array, j) = GET_HESS(hist_outputs, i * device_bin_size_+ j);
       }
     } else {
       // values of this feature has been redistributed to multiple bins; need a reduction here
       int ind = 0;
       for (int j = 0; j < bin_size; ++j) {
         double sum_g = 0.0, sum_h = 0.0;
-        size_t cnt = 0;
         for (int k = 0; k < device_bin_mults_[i]; ++k) {
-          sum_g += hist_outputs[i * device_bin_size_+ ind].sum_gradients;
-          sum_h += hist_outputs[i * device_bin_size_+ ind].sum_hessians;
-          cnt += hist_outputs[i * device_bin_size_ + ind].cnt;
+          sum_g += GET_GRAD(hist_outputs, i * device_bin_size_+ ind);
+          sum_h += GET_HESS(hist_outputs, i * device_bin_size_+ ind);
           ind++;
         }
-        old_histogram_array[j].sum_gradients = sum_g;
-        old_histogram_array[j].sum_hessians = sum_h;
-        old_histogram_array[j].cnt = (data_size_t)cnt;
+        GET_GRAD(old_histogram_array, j) = sum_g;
+        GET_HESS(old_histogram_array, j) = sum_h;
       }
     }
   }
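The GET_GRAD/GET_HESS accessors, and the pervasive "* 2" offsets, reflect the new flat histogram layout: bin j's gradient sits at slot 2j and its hessian at 2j + 1. The macro bodies below are inferred from the usage in this diff, not quoted from the headers:

#include <cstdio>

typedef double hist_t;
// Assumed definitions matching the interleaved layout used above.
#define GET_GRAD(hist, i) ((hist)[(i) << 1])
#define GET_HESS(hist, i) ((hist)[((i) << 1) + 1])

int main() {
  hist_t hist[8] = {0};          // 4 bins * 2 slots per bin
  GET_GRAD(hist, 2) += 0.5;      // accumulate into bin 2's gradient slot
  GET_HESS(hist, 2) += 1.0;      // ...and into its hessian slot
  std::printf("bin2: g=%g h=%g\n", GET_GRAD(hist, 2), GET_HESS(hist, 2));
  return 0;
}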
@@ -233,7 +225,7 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
 void GPUTreeLearner::AllocateGPUMemory() {
   num_dense_feature_groups_ = 0;
   for (int i = 0; i < num_feature_groups_; ++i) {
-    if (ordered_bins_[i] == nullptr) {
+    if (!train_data_->IsMultiGroup(i)) {
       num_dense_feature_groups_++;
     }
   }
@@ -303,7 +295,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
   device_data_indices_ = std::unique_ptr<boost::compute::vector<data_size_t>>(new boost::compute::vector<data_size_t>(allocated_num_data_, ctx_));
   boost::compute::fill(device_data_indices_->begin(), device_data_indices_->end(), 0, queue_);
   // histogram bin entry size depends on the precision (single/double)
-  hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(HistogramBinEntry) : sizeof(GPUHistogramBinEntry);
+  hist_bin_entry_sz_ = config_->gpu_use_dp ? sizeof(hist_t) * 2 : sizeof(gpu_hist_t) * 2;
   Log::Info("Size of histogram bin entry: %d", hist_bin_entry_sz_);
   // create output buffer, each feature has a histogram with device_bin_size_ bins,
   // each work group generates a sub-histogram of dword_features_ features.
@@ -326,7 +318,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
   std::vector<int> dense_dword_ind(dword_features_);
   for (int i = 0; i < num_feature_groups_; ++i) {
     // looking for dword_features_ non-sparse feature-groups
-    if (ordered_bins_[i] == nullptr) {
+    if (!train_data_->IsMultiGroup(i)) {
       dense_dword_ind[k] = i;
       // decide if we need to redistribute the bin
       double t = device_bin_size_ / static_cast<double>(train_data_->FeatureGroupNumBin(i));
@@ -682,6 +674,9 @@ void GPUTreeLearner::InitGPU(int platform_id, int device_id) {
   printf("bin size: ");
 #endif
   for (int i = 0; i < num_feature_groups_; ++i) {
+    if (train_data_->IsMultiGroup(i)) {
+      continue;
+    }
 #if GPU_DEBUG >= 1
     printf("%d, ", train_data_->FeatureGroupNumBin(i));
 #endif
@@ -960,35 +955,34 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
   for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
     if (!is_feature_used_[feature_index]) continue;
     if (!is_feature_used[feature_index]) continue;
-    if (ordered_bins_[train_data_->Feature2Group(feature_index)]) {
+    if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) {
       is_sparse_feature_used[feature_index] = 1;
     } else {
       is_dense_feature_used[feature_index] = 1;
     }
   }
   // construct smaller leaf
-  HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
+  hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset;
   // ConstructGPUHistogramsAsync will return true if there are availabe feature gourps dispatched to GPU
   bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
     nullptr, smaller_leaf_splits_->num_data_in_leaf(),
     nullptr, nullptr,
     nullptr, nullptr);
   // then construct sparse features on CPU
-  // We set data_indices to null to avoid rebuilding ordered gradients/hessians
   train_data_->ConstructHistograms(is_sparse_feature_used,
-    nullptr, smaller_leaf_splits_->num_data_in_leaf(),
-    smaller_leaf_splits_->LeafIndex(),
-    &ordered_bins_, gradients_, hessians_,
+    smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
+    gradients_, hessians_,
     ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
+    multi_val_bin_.get(), is_hist_colwise_,
     ptr_smaller_leaf_hist_data);
   // wait for GPU to finish, only if GPU is actually used
   if (is_gpu_used) {
     if (config_->gpu_use_dp) {
       // use double precision
-      WaitAndGetHistograms<HistogramBinEntry>(ptr_smaller_leaf_hist_data);
+      WaitAndGetHistograms<hist_t>(ptr_smaller_leaf_hist_data);
     } else {
       // use single precision
-      WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_smaller_leaf_hist_data);
+      WaitAndGetHistograms<gpu_hist_t>(ptr_smaller_leaf_hist_data);
     }
   }
@@ -1000,48 +994,58 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
       continue;
     int dense_feature_group_index = dense_feature_group_map_[i];
     size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index);
-    HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
-    HistogramBinEntry* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index);
-    HistogramBinEntry* gpu_histogram = new HistogramBinEntry[size];
+    hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - KHistOffset;
+    hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2;
+    hist_t* gpu_histogram = new hist_t[size * 2];
     data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf();
     printf("Comparing histogram for feature %d size %d, %lu bins\n", dense_feature_group_index, num_data, size);
-    std::copy(current_histogram, current_histogram + size, gpu_histogram);
-    std::memset(current_histogram, 0, train_data_->FeatureGroupNumBin(dense_feature_group_index) * sizeof(HistogramBinEntry));
-    train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
-      num_data != num_data_ ? smaller_leaf_splits_->data_indices() : nullptr,
-      num_data,
-      num_data != num_data_ ? ordered_gradients_.data() : gradients_,
-      num_data != num_data_ ? ordered_hessians_.data() : hessians_,
-      current_histogram);
+    std::copy(current_histogram, current_histogram + size * 2, gpu_histogram);
+    std::memset(current_histogram, 0, size * sizeof(hist_t) * 2);
+    if(train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr){continue;}
+    if (num_data != num_data_ ) {
+      train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
+        smaller_leaf_splits_->data_indices(),
+        0,
+        num_data,
+        ordered_gradients_.data(),
+        ordered_hessians_.data(),
+        current_histogram);
+    } else {
+      train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
+        0,
+        num_data,
+        gradients_,
+        hessians_,
+        current_histogram);
+    }
     CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index);
-    std::copy(gpu_histogram, gpu_histogram + size, current_histogram);
+    std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram);
     delete [] gpu_histogram;
   }
 #endif
   if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
     // construct larger leaf
-    HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
+    hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - KHistOffset;
     is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
       larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
       gradients_, hessians_,
      ordered_gradients_.data(), ordered_hessians_.data());
     // then construct sparse features on CPU
-    // We set data_indices to null to avoid rebuilding ordered gradients/hessians
     train_data_->ConstructHistograms(is_sparse_feature_used,
-      nullptr, larger_leaf_splits_->num_data_in_leaf(),
-      larger_leaf_splits_->LeafIndex(),
-      &ordered_bins_, gradients_, hessians_,
+      larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
+      gradients_, hessians_,
      ordered_gradients_.data(), ordered_hessians_.data(), is_constant_hessian_,
+      multi_val_bin_.get(), is_hist_colwise_,
      ptr_larger_leaf_hist_data);
     // wait for GPU to finish, only if GPU is actually used
     if (is_gpu_used) {
       if (config_->gpu_use_dp) {
         // use double precision
-        WaitAndGetHistograms<HistogramBinEntry>(ptr_larger_leaf_hist_data);
+        WaitAndGetHistograms<hist_t>(ptr_larger_leaf_hist_data);
       } else {
         // use single precision
-        WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_larger_leaf_hist_data);
+        WaitAndGetHistograms<gpu_hist_t>(ptr_larger_leaf_hist_data);
       }
     }
   }
...
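The use_subtract guard refers to LightGBM's histogram-subtraction optimization: the larger child's histogram equals the parent's minus the smaller child's, so only the smaller leaf needs a pass over the data. A sketch over the interleaved layout; this is an illustrative helper, not the library's implementation:

#include <vector>

typedef double hist_t;

// Histogram subtraction: once the smaller child's histogram is built, the
// larger child's follows bin by bin from parent - smaller. With 2 slots
// (grad, hess) per bin, a single flat loop handles both statistics.
void SubtractHistogram(const std::vector<hist_t>& parent,
                       const std::vector<hist_t>& smaller,
                       std::vector<hist_t>* larger) {
  larger->resize(parent.size());
  for (size_t i = 0; i < parent.size(); ++i) {
    (*larger)[i] = parent[i] - smaller[i];  // grad and hess slots alike
  }
}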
@@ -76,12 +76,7 @@ class GPUTreeLearner: public SerialTreeLearner {
   uint8_t s[4];
 };
-/*! \brief Single precision histogram entiry for GPU */
-struct GPUHistogramBinEntry {
-  score_t sum_gradients;
-  score_t sum_hessians;
-  uint32_t cnt;
-};
+typedef float gpu_hist_t;
 /*!
  * \brief Find the best number of workgroups processing one feature for maximizing efficiency
@@ -133,7 +128,7 @@ class GPUTreeLearner: public SerialTreeLearner {
  * \param histograms Destination of histogram results from GPU.
  */
 template <typename HistType>
-void WaitAndGetHistograms(HistogramBinEntry* histograms);
+void WaitAndGetHistograms(hist_t* histograms);
 /*!
  * \brief Construct GPU histogram asynchronously.
...
@@ -163,7 +163,7 @@ R""()
 void within_kernel_reduction16x8(uchar8 feature_mask,
                            __global const acc_type* restrict feature4_sub_hist,
                            const uint skip_id,
-                           acc_type stat_val, uint cnt_val,
+                           acc_type stat_val,
                            const ushort num_sub_hist,
                            __global acc_type* restrict output_buf,
                            __local acc_type * restrict local_hist) {
@@ -181,33 +181,21 @@ void within_kernel_reduction16x8(uchar8 feature_mask,
         // 256 threads working on 8 features' 16 bins, gradient and hessian
         stat_val += *p;
         p += NUM_BINS * DWORD_FEATURES * 2;
-        if (ltid < LOCAL_SIZE_0 / 2) {
-            cnt_val += as_acc_int_type(*p);
-        }
-        p += NUM_BINS * DWORD_FEATURES;
     }
     // skip the counters we already have
-    p += 3 * DWORD_FEATURES * NUM_BINS;
+    p += 2 * DWORD_FEATURES * NUM_BINS;
     for (i = i + 1; i < num_sub_hist; ++i) {
         stat_val += *p;
         p += NUM_BINS * DWORD_FEATURES * 2;
-        if (ltid < LOCAL_SIZE_0 / 2) {
-            cnt_val += as_acc_int_type(*p);
-        }
-        p += NUM_BINS * DWORD_FEATURES;
     }
     #endif
     // printf("thread %d:feature=%d, bin_id=%d, hessian=%d, stat_val=%f, cnt=%d", ltid, feature_id, bin_id, is_hessian_first, stat_val, cnt_val);
     // now overwrite the local_hist for final reduction and output
     // reverse the f7...f0 order to match the real order
     feature_id = DWORD_FEATURES_MASK - feature_id;
-    local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + is_hessian_first] = stat_val;
-    bin_id = ltid >> (LOG2_DWORD_FEATURES); // range 0 - 16, for counter
-    if (ltid < LOCAL_SIZE_0 / 2) {
-        local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val);
-    }
+    local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + is_hessian_first] = stat_val;
     barrier(CLK_LOCAL_MEM_FENCE);
-    for (i = ltid; i < DWORD_FEATURES * 3 * NUM_BINS; i += lsize) {
+    for (i = ltid; i < DWORD_FEATURES * 2 * NUM_BINS; i += lsize) {
         output_buf[i] = local_hist[i];
     }
 }
@@ -335,7 +323,9 @@ __kernel void histogram16(__global const uchar4* feature_data_base,
    bk7_c_f0_bin16 bk7_c_f1_bin16 bk7_c_f2_bin16 bk7_c_f3_bin16 bk7_c_f4_bin16 bk7_c_f5_bin16 bk7_c_f6_bin16 bk7_c_f7_bin0
    -----------------------------------------------
    */
+    #if CONST_HESSIAN == 1
     __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS);
+    #endif
     // thread 0, 1, 2, 3, 4, 5, 6, 7 compute histograms for gradients first
     // thread 8, 9, 10, 11, 12, 13, 14, 15 compute histograms for hessians first
@@ -547,7 +537,7 @@ R""()
         atomic_local_add_f(gh_hist + addr2, stat2);
         #endif
     }
+    #if CONST_HESSIAN == 1
     // STAGE 3: accumulate counter
     // there are 8 counters for 8 features
     // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6, 7's counts for example 0, 1, 2, 3, 4, 5, 6, 7
@@ -614,6 +604,7 @@ R""()
         // printf("thread %x add counter %d feature %d (7)\n", ltid, bin, offset);
         atom_inc(cnt_hist + addr);
     }
+    #endif
     stat1 = stat1_next;
     stat2 = stat2_next;
     feature4 = feature4_next;
@@ -642,6 +633,7 @@ R""()
         ushort bank_id = (i + offset) & BANK_MASK;
         stat_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + feature_id];
     }
+    #if CONST_HESSIAN == 1
     if (ltid < LOCAL_SIZE_0 / 2) {
         // first 128 threads accumulate the 8 * 16 = 128 counter values
         bin_id = ltid >> LOG2_DWORD_FEATURES; // bits 3 - 6 range 0 - 16 is bin ID
@@ -651,6 +643,7 @@ R""()
            cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * DWORD_FEATURES + feature_id];
         }
     }
+    #endif
    // now thread 0 - 7 holds feature 0 - 7's gradient for bin 0 and counter bin 0
    // now thread 8 - 15 holds feature 0 - 7's hessian for bin 0 and counter bin 1
@@ -687,7 +680,7 @@ R""()
    // write to output
    // write gradients and hessians histogram for all 4 features
    // output data in linear order for further reduction
-   // output size = 4 (features) * 3 (counters) * 64 (bins) * sizeof(float)
+   // output size = 4 (features) * 2 (counters) * 64 (bins) * sizeof(float)
    /* memory layout of output:
       g_f0_bin0 g_f1_bin0 g_f2_bin0 g_f3_bin0 g_f4_bin0 g_f5_bin0 g_f6_bin0 g_f7_bin0
      h_f0_bin0 h_f1_bin0 h_f2_bin0 h_f3_bin0 h_f4_bin0 h_f5_bin0 h_f6_bin0 h_f7_bin0
@@ -705,14 +698,10 @@ R""()
    // if there is only one workgroup processing this feature4, don't even need to write
    uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS);
    #if POWER_FEATURE_WORKGROUPS != 0
-   __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 3 * NUM_BINS;
+   __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 2 * NUM_BINS;
    // if g_val and h_val are double, they are converted to float here
    // write gradients and hessians for 8 features
    output[0 * DWORD_FEATURES * NUM_BINS + ltid] = stat_val;
-   // write counts for 8 features
-   if (ltid < LOCAL_SIZE_0 / 2) {
-       output[2 * DWORD_FEATURES * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val);
-   }
    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
    mem_fence(CLK_GLOBAL_MEM_FENCE);
    // To avoid the cost of an extra reducting kernel, we have to deal with some
@@ -738,7 +727,7 @@ R""()
    // The is done by using an global atomic counter.
    // On AMD GPUs ideally this should be done in GDS,
    // but currently there is no easy way to access it via OpenCL.
-   __local uint * counter_val = cnt_hist;
+   __local uint * counter_val = (__local uint *)(gh_hist + 2 * DWORD_FEATURES * NUM_BINS * NUM_BANKS);
    if (ltid == 0) {
        // all workgroups processing the same feature add this counter
        *counter_val = atom_inc(sync_counters + feature4_id);
@@ -762,12 +751,12 @@ R""()
    // locate our feature4's block in output memory
    uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS);
    __global acc_type const * restrict feature4_subhists =
-       (__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 3 * NUM_BINS;
+       (__global acc_type *)output_buf + output_offset * DWORD_FEATURES * 2 * NUM_BINS;
    // skip reading the data already in local memory
    uint skip_id = group_id ^ output_offset;
    // locate output histogram location for this feature4
-   __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 3 * NUM_BINS;
-   within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val, cnt_val,
+   __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * DWORD_FEATURES * 2 * NUM_BINS;
+   within_kernel_reduction16x8(feature_mask, feature4_subhists, skip_id, stat_val,
        1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array);
    }
 }
@@ -776,4 +765,3 @@ R""()
 // the +9 skips extra characters ")", newline, "#endif" and newline at the beginning
 // )"" "\n#endif" + 9
 #endif
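The kernel drops the third per-bin slot because counts are redundant unless hessians are constant; under CONST_HESSIAN == 1 the count and the hessian sum carry the same information. A small sketch of that equivalence (names are illustrative):

#include <cstdint>

// When every data point shares one constant hessian h_const, per-bin counts
// and per-bin hessian sums are interchangeable: sum_hess = cnt * h_const.
// That is why the counter histogram above is kept only under
// CONST_HESSIAN == 1, and the per-bin payload shrinks from 3 slots to 2.
double HessSumFromCount(uint32_t cnt, double h_const) {
  return static_cast<double>(cnt) * h_const;
}

uint32_t CountFromHessSum(double sum_hess, double h_const) {
  // inverse direction, usable when only grad/hess pairs are stored
  return static_cast<uint32_t>(sum_hess / h_const + 0.5);  // round to nearest
}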