Unverified Commit 509c2e50 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

Support both row-wise and col-wise multi-threading (#2699)



* commit

* fix a bug

* fix bug

* reset to track changes

* refine the auto choose logic

* sort the time stats output

* fix include

* change multi_val_bin_sparse_threshold

* add cmake

* add _mm_malloc and _mm_free for cross platform

* fix cmake bug

* timer for split

* try to fix cmake

* fix tests

* refactor DataPartition::Split

* fix test

* typo

* formating

* Revert "formating"

This reverts commit 5b8de4f7fb9d975ee23701d276a66d40ee6d4222.

* add document

* [R-package] Added tests on use of force_col_wise and force_row_wise in training (#2719)

* naming

* fix gpu code

* Update include/LightGBM/bin.h
Co-Authored-By: default avatarJames Lamb <jaylamb20@gmail.com>

* Update src/treelearner/ocl/histogram16.cl

* test: swap compilers for CI

* fix omp

* not avx2

* no aligned for feature histogram

* Revert "refactor DataPartition::Split"

This reverts commit 256e6d9641ade966a1f54da1752e998a1149b6f8.

* slightly refactor data partition

* reduce the memory cost
Co-authored-by: default avatarJames Lamb <jaylamb20@gmail.com>
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>
parent bc7bc4a1
......@@ -157,7 +157,7 @@ R""()
void within_kernel_reduction64x4(uchar4 feature_mask,
__global const acc_type* restrict feature4_sub_hist,
const uint skip_id,
acc_type g_val, acc_type h_val, uint cnt_val,
acc_type g_val, acc_type h_val,
const ushort num_sub_hist,
__global acc_type* restrict output_buf,
__local acc_type * restrict local_hist) {
......@@ -173,38 +173,35 @@ void within_kernel_reduction64x4(uchar4 feature_mask,
for (i = 0; i < skip_id; ++i) {
g_val += *p; p += NUM_BINS * 4; // 256 threads working on 4 features' 64 bins
h_val += *p; p += NUM_BINS * 4;
cnt_val += as_acc_int_type(*p); p += NUM_BINS * 4;
}
// skip the counters we already have
p += 3 * 4 * NUM_BINS;
p += 2 * 4 * NUM_BINS;
for (i = i + 1; i < num_sub_hist; ++i) {
g_val += *p; p += NUM_BINS * 4;
h_val += *p; p += NUM_BINS * 4;
cnt_val += as_acc_int_type(*p); p += NUM_BINS * 4;
}
#endif
// printf("thread %d: g_val=%f, h_val=%f cnt=%d", ltid, g_val, h_val, cnt_val);
// now overwrite the local_hist for final reduction and output
// reverse the f3...f0 order to match the real order
feature_id = 3 - feature_id;
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 0] = g_val;
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 1] = h_val;
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val);
local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + 0] = g_val;
local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + 1] = h_val;
barrier(CLK_LOCAL_MEM_FENCE);
i = ltid;
if (feature_mask.s0 && i < 1 * 3 * NUM_BINS) {
if (feature_mask.s0 && i < 1 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
}
i += 1 * 3 * NUM_BINS;
if (feature_mask.s1 && i < 2 * 3 * NUM_BINS) {
i += 1 * 2 * NUM_BINS;
if (feature_mask.s1 && i < 2 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
}
i += 1 * 3 * NUM_BINS;
if (feature_mask.s2 && i < 3 * 3 * NUM_BINS) {
i += 1 * 2 * NUM_BINS;
if (feature_mask.s2 && i < 3 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
}
i += 1 * 3 * NUM_BINS;
if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) {
i += 1 * 2 * NUM_BINS;
if (feature_mask.s3 && i < 4 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
}
}
......@@ -306,7 +303,9 @@ __kernel void histogram64(__global const uchar4* feature_data_base,
bk3_c_f0_bin64 bk3_c_f1_bin64 bk3_c_f2_bin64 bk3_c_f3_bin64
-----------------------------------------------
*/
#if CONST_HESSIAN == 1
__local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS);
#endif
// thread 0, 1, 2, 3 compute histograms for gradients first
// thread 4, 5, 6, 7 compute histograms for hessians first
......@@ -509,7 +508,7 @@ R""()
s0_stat1 += stat1;
s0_stat2 += stat2;
}
#if CONST_HESSIAN == 1
// STAGE 3: accumulate counter
// there are 4 counters for 4 features
// thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3
......@@ -540,6 +539,7 @@ R""()
addr = bin * CNT_BIN_MULT + bank * 4 + offset;
atom_inc(cnt_hist + addr);
}
#endif
stat1 = stat1_next;
stat2 = stat2_next;
feature4 = feature4_next;
......@@ -639,7 +639,9 @@ R""()
ushort bank_id = (i + offset) & BANK_MASK;
g_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id];
h_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id + 4];
#if CONST_HESSIAN == 1
cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * 4 + feature_id];
#endif
}
// now thread 0 - 3 holds feature 0, 1, 2, 3's gradient, hessian and count bin 0
// now thread 4 - 7 holds feature 0, 1, 2, 3's gradient, hessian and count bin 1
......@@ -670,14 +672,12 @@ R""()
// if there is only one workgroup processing this feature4, don't even need to write
uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS);
#if POWER_FEATURE_WORKGROUPS != 0
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS;
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 2 * NUM_BINS;
// if g_val and h_val are double, they are converted to float here
// write gradients for 4 features
output[0 * 4 * NUM_BINS + ltid] = g_val;
// write hessians for 4 features
output[1 * 4 * NUM_BINS + ltid] = h_val;
// write counts for 4 features
output[2 * 4 * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val);
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
mem_fence(CLK_GLOBAL_MEM_FENCE);
// To avoid the cost of an extra reduction kernel, we have to deal with some
......@@ -703,7 +703,7 @@ R""()
// This is done by using a global atomic counter.
// On AMD GPUs ideally this should be done in GDS,
// but currently there is no easy way to access it via OpenCL.
__local uint * counter_val = cnt_hist;
__local uint * counter_val = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS);;
if (ltid == 0) {
// all workgroups processing the same feature add this counter
*counter_val = atom_inc(sync_counters + feature4_id);
......@@ -727,12 +727,12 @@ R""()
// locate our feature4's block in output memory
uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS);
__global acc_type const * restrict feature4_subhists =
(__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS;
(__global acc_type *)output_buf + output_offset * 4 * 2 * NUM_BINS;
// skip reading the data already in local memory
uint skip_id = group_id ^ output_offset;
// locate output histogram location for this feature4
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS;
within_kernel_reduction64x4(feature_mask, feature4_subhists, skip_id, g_val, h_val, cnt_val,
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 2 * NUM_BINS;
within_kernel_reduction64x4(feature_mask, feature4_subhists, skip_id, g_val, h_val,
1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array);
}
}
......
......@@ -181,8 +181,8 @@ class VotingParallelTreeLearner: public TREELEARNER_T {
/*! \brief Store global histogram for larger leaf */
std::unique_ptr<FeatureHistogram[]> larger_leaf_histogram_array_global_;
std::vector<HistogramBinEntry> smaller_leaf_histogram_data_;
std::vector<HistogramBinEntry> larger_leaf_histogram_data_;
std::vector<hist_t> smaller_leaf_histogram_data_;
std::vector<hist_t> larger_leaf_histogram_data_;
std::vector<FeatureMetainfo> feature_metas_;
};
......
This diff is collapsed.
......@@ -79,7 +79,12 @@ class SerialTreeLearner: public TreeLearner {
void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter,
data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;
bool IsHistColWise() const override { return is_hist_colwise_; }
protected:
void GetMultiValBin(const Dataset* dataset, bool is_first_time);
virtual std::vector<int8_t> GetUsedFeatures(bool is_tree_level);
/*!
* \brief Some initial works before training
......@@ -161,17 +166,13 @@ class SerialTreeLearner: public TreeLearner {
std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_hessians_;
#else
/*! \brief gradients of current iteration, ordered for cache optimized */
std::vector<score_t> ordered_gradients_;
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_gradients_;
/*! \brief hessians of current iteration, ordered for cache optimized */
std::vector<score_t> ordered_hessians_;
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_hessians_;
#endif
/*! \brief Store ordered bin */
std::vector<std::unique_ptr<OrderedBin>> ordered_bins_;
/*! \brief True if has ordered bin */
bool has_ordered_bin_ = false;
/*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */
std::vector<char> is_data_in_leaf_;
std::vector<char, Common::AlignmentAllocator<char, kAlignedSize>> is_data_in_leaf_;
/*! \brief used to cache historical histogram to speed up*/
HistogramPool histogram_pool_;
/*! \brief config of tree learner*/
......@@ -179,6 +180,8 @@ class SerialTreeLearner: public TreeLearner {
int num_threads_;
std::vector<int> ordered_bin_indices_;
bool is_constant_hessian_;
std::unique_ptr<MultiValBin> multi_val_bin_;
bool is_hist_colwise_;
std::unique_ptr<CostEfficientGradientBoosting> cegb_;
};
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment