Unverified Commit 509c2e50 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

Support both row-wise and col-wise multi-threading (#2699)



* commit

* fix a bug

* fix bug

* reset to track changes

* refine the auto choose logic

* sort the time stats output

* fix include

* change multi_val_bin_sparse_threshold

* add cmake

* add _mm_malloc and _mm_free for cross platform

* fix cmake bug

* timer for split

* try to fix cmake

* fix tests

* refactor DataPartition::Split

* fix test

* typo

* formating

* Revert "formating"

This reverts commit 5b8de4f7fb9d975ee23701d276a66d40ee6d4222.

* add document

* [R-package] Added tests on use of force_col_wise and force_row_wise in training (#2719)

* naming

* fix gpu code

* Update include/LightGBM/bin.h
Co-Authored-By: default avatarJames Lamb <jaylamb20@gmail.com>

* Update src/treelearner/ocl/histogram16.cl

* test: swap compilers for CI

* fix omp

* not avx2

* no aligned for feature histogram

* Revert "refactor DataPartition::Split"

This reverts commit 256e6d9641ade966a1f54da1752e998a1149b6f8.

* slightly refactor data partition

* reduce the memory cost
Co-authored-by: default avatarJames Lamb <jaylamb20@gmail.com>
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>
parent bc7bc4a1
......@@ -157,7 +157,7 @@ R""()
void within_kernel_reduction64x4(uchar4 feature_mask,
__global const acc_type* restrict feature4_sub_hist,
const uint skip_id,
acc_type g_val, acc_type h_val, uint cnt_val,
acc_type g_val, acc_type h_val,
const ushort num_sub_hist,
__global acc_type* restrict output_buf,
__local acc_type * restrict local_hist) {
......@@ -173,38 +173,35 @@ void within_kernel_reduction64x4(uchar4 feature_mask,
for (i = 0; i < skip_id; ++i) {
g_val += *p; p += NUM_BINS * 4; // 256 threads working on 4 features' 64 bins
h_val += *p; p += NUM_BINS * 4;
cnt_val += as_acc_int_type(*p); p += NUM_BINS * 4;
}
// skip the counters we already have
p += 3 * 4 * NUM_BINS;
p += 2 * 4 * NUM_BINS;
for (i = i + 1; i < num_sub_hist; ++i) {
g_val += *p; p += NUM_BINS * 4;
h_val += *p; p += NUM_BINS * 4;
cnt_val += as_acc_int_type(*p); p += NUM_BINS * 4;
}
#endif
// printf("thread %d: g_val=%f, h_val=%f cnt=%d", ltid, g_val, h_val, cnt_val);
// now overwrite the local_hist for final reduction and output
// reverse the f3...f0 order to match the real order
feature_id = 3 - feature_id;
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 0] = g_val;
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 1] = h_val;
local_hist[feature_id * 3 * NUM_BINS + bin_id * 3 + 2] = as_acc_type((acc_int_type)cnt_val);
local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + 0] = g_val;
local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + 1] = h_val;
barrier(CLK_LOCAL_MEM_FENCE);
i = ltid;
if (feature_mask.s0 && i < 1 * 3 * NUM_BINS) {
if (feature_mask.s0 && i < 1 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
}
i += 1 * 3 * NUM_BINS;
if (feature_mask.s1 && i < 2 * 3 * NUM_BINS) {
i += 1 * 2 * NUM_BINS;
if (feature_mask.s1 && i < 2 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
}
i += 1 * 3 * NUM_BINS;
if (feature_mask.s2 && i < 3 * 3 * NUM_BINS) {
i += 1 * 2 * NUM_BINS;
if (feature_mask.s2 && i < 3 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
}
i += 1 * 3 * NUM_BINS;
if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) {
i += 1 * 2 * NUM_BINS;
if (feature_mask.s3 && i < 4 * 2 * NUM_BINS) {
output_buf[i] = local_hist[i];
}
}
......@@ -306,7 +303,9 @@ __kernel void histogram64(__global const uchar4* feature_data_base,
bk3_c_f0_bin64 bk3_c_f1_bin64 bk3_c_f2_bin64 bk3_c_f3_bin64
-----------------------------------------------
*/
#if CONST_HESSIAN == 1
__local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS);
#endif
// thread 0, 1, 2, 3 compute histograms for gradients first
// thread 4, 5, 6, 7 compute histograms for hessians first
......@@ -509,7 +508,7 @@ R""()
s0_stat1 += stat1;
s0_stat2 += stat2;
}
#if CONST_HESSIAN == 1
// STAGE 3: accumulate counter
// there are 4 counters for 4 features
// thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3
......@@ -540,6 +539,7 @@ R""()
addr = bin * CNT_BIN_MULT + bank * 4 + offset;
atom_inc(cnt_hist + addr);
}
#endif
stat1 = stat1_next;
stat2 = stat2_next;
feature4 = feature4_next;
......@@ -639,7 +639,9 @@ R""()
ushort bank_id = (i + offset) & BANK_MASK;
g_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id];
h_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id + 4];
#if CONST_HESSIAN == 1
cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * 4 + feature_id];
#endif
}
// now thread 0 - 3 holds feature 0, 1, 2, 3's gradient, hessian and count bin 0
// now thread 4 - 7 holds feature 0, 1, 2, 3's gradient, hessian and count bin 1
......@@ -670,14 +672,12 @@ R""()
// if there is only one workgroup processing this feature4, don't even need to write
uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS);
#if POWER_FEATURE_WORKGROUPS != 0
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS;
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 2 * NUM_BINS;
// if g_val and h_val are double, they are converted to float here
// write gradients for 4 features
output[0 * 4 * NUM_BINS + ltid] = g_val;
// write hessians for 4 features
output[1 * 4 * NUM_BINS + ltid] = h_val;
// write counts for 4 features
output[2 * 4 * NUM_BINS + ltid] = as_acc_type((acc_int_type)cnt_val);
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
mem_fence(CLK_GLOBAL_MEM_FENCE);
// To avoid the cost of an extra reduction kernel, we have to deal with some
......@@ -703,7 +703,7 @@ R""()
// This is done by using a global atomic counter.
// On AMD GPUs ideally this should be done in GDS,
// but currently there is no easy way to access it via OpenCL.
__local uint * counter_val = cnt_hist;
__local uint * counter_val = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS);;
if (ltid == 0) {
// all workgroups processing the same feature add this counter
*counter_val = atom_inc(sync_counters + feature4_id);
......@@ -727,12 +727,12 @@ R""()
// locate our feature4's block in output memory
uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS);
__global acc_type const * restrict feature4_subhists =
(__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS;
(__global acc_type *)output_buf + output_offset * 4 * 2 * NUM_BINS;
// skip reading the data already in local memory
uint skip_id = group_id ^ output_offset;
// locate output histogram location for this feature4
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS;
within_kernel_reduction64x4(feature_mask, feature4_subhists, skip_id, g_val, h_val, cnt_val,
__global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 2 * NUM_BINS;
within_kernel_reduction64x4(feature_mask, feature4_subhists, skip_id, g_val, h_val,
1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array);
}
}
......
......@@ -181,8 +181,8 @@ class VotingParallelTreeLearner: public TREELEARNER_T {
/*! \brief Store global histogram for larger leaf */
std::unique_ptr<FeatureHistogram[]> larger_leaf_histogram_array_global_;
std::vector<HistogramBinEntry> smaller_leaf_histogram_data_;
std::vector<HistogramBinEntry> larger_leaf_histogram_data_;
std::vector<hist_t> smaller_leaf_histogram_data_;
std::vector<hist_t> larger_leaf_histogram_data_;
std::vector<FeatureMetainfo> feature_metas_;
};
......
This diff is collapsed.
......@@ -79,7 +79,12 @@ class SerialTreeLearner: public TreeLearner {
void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter,
data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;
bool IsHistColWise() const override { return is_hist_colwise_; }
protected:
void GetMultiValBin(const Dataset* dataset, bool is_first_time);
virtual std::vector<int8_t> GetUsedFeatures(bool is_tree_level);
/*!
* \brief Some initial works before training
......@@ -161,17 +166,13 @@ class SerialTreeLearner: public TreeLearner {
std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_hessians_;
#else
/*! \brief gradients of current iteration, ordered for cache optimized */
std::vector<score_t> ordered_gradients_;
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_gradients_;
/*! \brief hessians of current iteration, ordered for cache optimized */
std::vector<score_t> ordered_hessians_;
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> ordered_hessians_;
#endif
/*! \brief Store ordered bin */
std::vector<std::unique_ptr<OrderedBin>> ordered_bins_;
/*! \brief True if has ordered bin */
bool has_ordered_bin_ = false;
/*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */
std::vector<char> is_data_in_leaf_;
std::vector<char, Common::AlignmentAllocator<char, kAlignedSize>> is_data_in_leaf_;
/*! \brief used to cache historical histogram to speed up*/
HistogramPool histogram_pool_;
/*! \brief config of tree learner*/
......@@ -179,6 +180,8 @@ class SerialTreeLearner: public TreeLearner {
int num_threads_;
std::vector<int> ordered_bin_indices_;
bool is_constant_hessian_;
std::unique_ptr<MultiValBin> multi_val_bin_;
bool is_hist_colwise_;
std::unique_ptr<CostEfficientGradientBoosting> cegb_;
};
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment