Unverified Commit e79716e0 authored by Andrew Ziem's avatar Andrew Ziem Committed by GitHub
Browse files

Correct spelling (#4250)



* Correct spelling

Most changes were in comments, and there were a few changes to literals for log output.

There were no changes to variable names, function names, IDs, or functionality.

* Clarify a phrase in a comment
Co-authored-by: default avatarJames Lamb <jaylamb20@gmail.com>

* Clarify a phrase in a comment
Co-authored-by: default avatarJames Lamb <jaylamb20@gmail.com>

* Clarify a phrase in a comment
Co-authored-by: default avatarJames Lamb <jaylamb20@gmail.com>

* Correct spelling

Most are code comments, but one case is a literal in a logging message.

There are a few grammar fixes too.
Co-authored-by: default avatarJames Lamb <jaylamb20@gmail.com>
parent bb88d92e
...@@ -133,13 +133,13 @@ class LeafSplits { ...@@ -133,13 +133,13 @@ class LeafSplits {
/*! \brief Get current leaf index */ /*! \brief Get current leaf index */
int leaf_index() const { return leaf_index_; } int leaf_index() const { return leaf_index_; }
/*! \brief Get numer of data in current leaf */ /*! \brief Get number of data in current leaf */
data_size_t num_data_in_leaf() const { return num_data_in_leaf_; } data_size_t num_data_in_leaf() const { return num_data_in_leaf_; }
/*! \brief Get sum of gradients of current leaf */ /*! \brief Get sum of gradients of current leaf */
double sum_gradients() const { return sum_gradients_; } double sum_gradients() const { return sum_gradients_; }
/*! \brief Get sum of hessians of current leaf */ /*! \brief Get sum of Hessians of current leaf */
double sum_hessians() const { return sum_hessians_; } double sum_hessians() const { return sum_hessians_; }
/*! \brief Get indices of data of current leaf */ /*! \brief Get indices of data of current leaf */
...@@ -160,7 +160,7 @@ class LeafSplits { ...@@ -160,7 +160,7 @@ class LeafSplits {
data_size_t num_data_; data_size_t num_data_;
/*! \brief sum of gradients of current leaf */ /*! \brief sum of gradients of current leaf */
double sum_gradients_; double sum_gradients_;
/*! \brief sum of hessians of current leaf */ /*! \brief sum of Hessians of current leaf */
double sum_hessians_; double sum_hessians_;
/*! \brief indices of data of current leaf */ /*! \brief indices of data of current leaf */
const data_size_t* data_indices_; const data_size_t* data_indices_;
......
...@@ -178,7 +178,7 @@ void within_kernel_reduction16x8(uchar8 feature_mask, ...@@ -178,7 +178,7 @@ void within_kernel_reduction16x8(uchar8 feature_mask,
// add all sub-histograms for 4 features // add all sub-histograms for 4 features
__global const acc_type* restrict p = feature4_sub_hist + ltid; __global const acc_type* restrict p = feature4_sub_hist + ltid;
for (i = 0; i < skip_id; ++i) { for (i = 0; i < skip_id; ++i) {
// 256 threads working on 8 features' 16 bins, gradient and hessian // 256 threads working on 8 features' 16 bins, gradient and Hessian
stat_val += *p; stat_val += *p;
p += NUM_BINS * DWORD_FEATURES * 2; p += NUM_BINS * DWORD_FEATURES * 2;
} }
...@@ -328,7 +328,7 @@ __kernel void histogram16(__global const uchar4* feature_data_base, ...@@ -328,7 +328,7 @@ __kernel void histogram16(__global const uchar4* feature_data_base,
#endif #endif
// thread 0, 1, 2, 3, 4, 5, 6, 7 compute histograms for gradients first // thread 0, 1, 2, 3, 4, 5, 6, 7 compute histograms for gradients first
// thread 8, 9, 10, 11, 12, 13, 14, 15 compute histograms for hessians first // thread 8, 9, 10, 11, 12, 13, 14, 15 compute histograms for Hessians first
// etc. // etc.
uchar is_hessian_first = (ltid >> LOG2_DWORD_FEATURES) & 1; uchar is_hessian_first = (ltid >> LOG2_DWORD_FEATURES) & 1;
// thread 0-15 write result to bank0, 16-31 to bank1, 32-47 to bank2, 48-63 to bank3, etc // thread 0-15 write result to bank0, 16-31 to bank1, 32-47 to bank2, 48-63 to bank3, etc
...@@ -340,11 +340,11 @@ __kernel void histogram16(__global const uchar4* feature_data_base, ...@@ -340,11 +340,11 @@ __kernel void histogram16(__global const uchar4* feature_data_base,
__global const uchar4* feature_data = feature_data_base + group_feature * feature_size; __global const uchar4* feature_data = feature_data_base + group_feature * feature_size;
// size of threads that process this feature4 // size of threads that process this feature4
const uint subglobal_size = lsize * (1 << POWER_FEATURE_WORKGROUPS); const uint subglobal_size = lsize * (1 << POWER_FEATURE_WORKGROUPS);
// equavalent thread ID in this subgroup for this feature4 // equivalent thread ID in this subgroup for this feature4
const uint subglobal_tid = gtid - group_feature * subglobal_size; const uint subglobal_tid = gtid - group_feature * subglobal_size;
// extract feature mask, when a byte is set to 0, that feature is disabled // extract feature mask, when a byte is set to 0, that feature is disabled
#if ENABLE_ALL_FEATURES == 1 #if ENABLE_ALL_FEATURES == 1
// hopefully the compiler will propogate the constants and eliminate all branches // hopefully the compiler will propagate the constants and eliminate all branches
uchar8 feature_mask = (uchar8)(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff); uchar8 feature_mask = (uchar8)(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff);
#else #else
uchar8 feature_mask = feature_masks[group_feature]; uchar8 feature_mask = feature_masks[group_feature];
...@@ -390,7 +390,7 @@ R""() ...@@ -390,7 +390,7 @@ R""()
// there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
// prefetch the next iteration variables // prefetch the next iteration variables
// we don't need bondary check because we have made the buffer larger // we don't need boundary check because we have made the buffer larger
stat1_next = ordered_gradients[i + subglobal_size]; stat1_next = ordered_gradients[i + subglobal_size];
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
stat2_next = ordered_hessians[i + subglobal_size]; stat2_next = ordered_hessians[i + subglobal_size];
...@@ -421,9 +421,9 @@ R""() ...@@ -421,9 +421,9 @@ R""()
addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset;
addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first;
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6 ,7's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6 ,7's gradients for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 0, 1, 2, 3, 4, 5, 6, 7's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 0, 1, 2, 3, 4, 5, 6, 7's Hessians for example 8, 9, 10, 11, 12, 13, 14, 15
atomic_local_add_f(gh_hist + addr, stat1); atomic_local_add_f(gh_hist + addr, stat1);
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6, 7's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 0, 1, 2, 3, 4, 5, 6, 7's Hessians for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 0, 1, 2, 3, 4, 5, 6, 7's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 0, 1, 2, 3, 4, 5, 6, 7's gradients for example 8, 9, 10, 11, 12, 13, 14, 15
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, stat2); atomic_local_add_f(gh_hist + addr2, stat2);
...@@ -435,9 +435,9 @@ R""() ...@@ -435,9 +435,9 @@ R""()
addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset;
addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first;
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 1, 2, 3, 4, 5, 6 ,7, 0's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 1, 2, 3, 4, 5, 6 ,7, 0's gradients for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 1, 2, 3, 4, 5, 6, 7, 0's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 1, 2, 3, 4, 5, 6, 7, 0's Hessians for example 8, 9, 10, 11, 12, 13, 14, 15
atomic_local_add_f(gh_hist + addr, stat1); atomic_local_add_f(gh_hist + addr, stat1);
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 1, 2, 3, 4, 5, 6, 7, 0's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 1, 2, 3, 4, 5, 6, 7, 0's Hessians for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 1, 2, 3, 4, 5, 6, 7, 0's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 1, 2, 3, 4, 5, 6, 7, 0's gradients for example 8, 9, 10, 11, 12, 13, 14, 15
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, stat2); atomic_local_add_f(gh_hist + addr2, stat2);
...@@ -450,9 +450,9 @@ R""() ...@@ -450,9 +450,9 @@ R""()
addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset;
addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first;
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 2, 3, 4, 5, 6, 7, 0, 1's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 2, 3, 4, 5, 6, 7, 0, 1's gradients for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 2, 3, 4, 5, 6, 7, 0, 1's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 2, 3, 4, 5, 6, 7, 0, 1's Hessians for example 8, 9, 10, 11, 12, 13, 14, 15
atomic_local_add_f(gh_hist + addr, stat1); atomic_local_add_f(gh_hist + addr, stat1);
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 2, 3, 4, 5, 6, 7, 0, 1's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 2, 3, 4, 5, 6, 7, 0, 1's Hessians for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 2, 3, 4, 5, 6, 7, 0, 1's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 2, 3, 4, 5, 6, 7, 0, 1's gradients for example 8, 9, 10, 11, 12, 13, 14, 15
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, stat2); atomic_local_add_f(gh_hist + addr2, stat2);
...@@ -464,9 +464,9 @@ R""() ...@@ -464,9 +464,9 @@ R""()
addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset;
addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first;
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 3, 4, 5, 6, 7, 0, 1, 2's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 3, 4, 5, 6, 7, 0, 1, 2's gradients for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 3, 4, 5, 6, 7, 0, 1, 2's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 3, 4, 5, 6, 7, 0, 1, 2's Hessians for example 8, 9, 10, 11, 12, 13, 14, 15
atomic_local_add_f(gh_hist + addr, stat1); atomic_local_add_f(gh_hist + addr, stat1);
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 3, 4, 5, 6, 7, 0, 1, 2's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 3, 4, 5, 6, 7, 0, 1, 2's Hessians for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 3, 4, 5, 6, 7, 0, 1, 2's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 3, 4, 5, 6, 7, 0, 1, 2's gradients for example 8, 9, 10, 11, 12, 13, 14, 15
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, stat2); atomic_local_add_f(gh_hist + addr2, stat2);
...@@ -500,9 +500,9 @@ R""() ...@@ -500,9 +500,9 @@ R""()
addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset;
addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first;
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 5, 6, 7, 0, 1, 2, 3, 4's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 5, 6, 7, 0, 1, 2, 3, 4's gradients for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 5, 6, 7, 0, 1, 2, 3, 4's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 5, 6, 7, 0, 1, 2, 3, 4's Hessians for example 8, 9, 10, 11, 12, 13, 14, 15
atomic_local_add_f(gh_hist + addr, stat1); atomic_local_add_f(gh_hist + addr, stat1);
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 5, 6, 7, 0, 1, 2, 3, 4's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 5, 6, 7, 0, 1, 2, 3, 4's Hessians for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 5, 6, 7, 0, 1, 2, 3, 4's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 5, 6, 7, 0, 1, 2, 3, 4's gradients for example 8, 9, 10, 11, 12, 13, 14, 15
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, stat2); atomic_local_add_f(gh_hist + addr2, stat2);
...@@ -515,9 +515,9 @@ R""() ...@@ -515,9 +515,9 @@ R""()
addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset;
addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first;
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 6, 7, 0, 1, 2, 3, 4, 5's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 6, 7, 0, 1, 2, 3, 4, 5's gradients for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 6, 7, 0, 1, 2, 3, 4, 5's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 6, 7, 0, 1, 2, 3, 4, 5's Hessians for example 8, 9, 10, 11, 12, 13, 14, 15
atomic_local_add_f(gh_hist + addr, stat1); atomic_local_add_f(gh_hist + addr, stat1);
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 6, 7, 0, 1, 2, 3, 4, 5's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 6, 7, 0, 1, 2, 3, 4, 5's Hessians for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 6, 7, 0, 1, 2, 3, 4, 5's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 6, 7, 0, 1, 2, 3, 4, 5's gradients for example 8, 9, 10, 11, 12, 13, 14, 15
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, stat2); atomic_local_add_f(gh_hist + addr2, stat2);
...@@ -529,9 +529,9 @@ R""() ...@@ -529,9 +529,9 @@ R""()
addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset; addr = bin * HG_BIN_MULT + bank * 2 * DWORD_FEATURES + is_hessian_first * DWORD_FEATURES + offset;
addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first; addr2 = addr + DWORD_FEATURES - 2 * DWORD_FEATURES * is_hessian_first;
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 7, 0, 1, 2, 3, 4, 5, 6's gradients for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 7, 0, 1, 2, 3, 4, 5, 6's gradients for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 7, 0, 1, 2, 3, 4, 5, 6's hessians for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 7, 0, 1, 2, 3, 4, 5, 6's Hessians for example 8, 9, 10, 11, 12, 13, 14, 15
atomic_local_add_f(gh_hist + addr, stat1); atomic_local_add_f(gh_hist + addr, stat1);
// thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 7, 0, 1, 2, 3, 4, 5, 6's hessians for example 0, 1, 2, 3, 4, 5, 6, 7 // thread 0, 1, 2, 3, 4, 5, 6, 7 now process feature 7, 0, 1, 2, 3, 4, 5, 6's Hessians for example 0, 1, 2, 3, 4, 5, 6, 7
// thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 7, 0, 1, 2, 3, 4, 5, 6's gradients for example 8, 9, 10, 11, 12, 13, 14, 15 // thread 8, 9, 10, 11, 12, 13, 14, 15 now process feature 7, 0, 1, 2, 3, 4, 5, 6's gradients for example 8, 9, 10, 11, 12, 13, 14, 15
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, stat2); atomic_local_add_f(gh_hist + addr2, stat2);
...@@ -652,7 +652,7 @@ R""() ...@@ -652,7 +652,7 @@ R""()
// etc, // etc,
#if CONST_HESSIAN == 1 #if CONST_HESSIAN == 1
// Combine the two banks into one, and fill the hessians with counter value * hessian constant // Combine the two banks into one, and fill the Hessians with counter value * hessian constant
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
gh_hist[ltid] = stat_val; gh_hist[ltid] = stat_val;
if (ltid < LOCAL_SIZE_0 / 2) { if (ltid < LOCAL_SIZE_0 / 2) {
...@@ -660,7 +660,7 @@ R""() ...@@ -660,7 +660,7 @@ R""()
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (is_hessian_first) { if (is_hessian_first) {
// this is the hessians // these are the Hessians
// thread 8 - 15 read counters stored by thread 0 - 7 // thread 8 - 15 read counters stored by thread 0 - 7
// thread 24- 31 read counters stored by thread 8 - 15 // thread 24- 31 read counters stored by thread 8 - 15
// thread 40- 47 read counters stored by thread 16- 23, etc // thread 40- 47 read counters stored by thread 16- 23, etc
...@@ -668,7 +668,7 @@ R""() ...@@ -668,7 +668,7 @@ R""()
cnt_hist[((ltid - DWORD_FEATURES) >> (LOG2_DWORD_FEATURES + 1)) * DWORD_FEATURES + (ltid & DWORD_FEATURES_MASK)]; cnt_hist[((ltid - DWORD_FEATURES) >> (LOG2_DWORD_FEATURES + 1)) * DWORD_FEATURES + (ltid & DWORD_FEATURES_MASK)];
} }
else { else {
// this is the gradients // these are the gradients
// thread 0 - 7 read gradients stored by thread 8 - 15 // thread 0 - 7 read gradients stored by thread 8 - 15
// thread 16- 23 read gradients stored by thread 24- 31 // thread 16- 23 read gradients stored by thread 24- 31
// thread 32- 39 read gradients stored by thread 40- 47, etc // thread 32- 39 read gradients stored by thread 40- 47, etc
...@@ -678,7 +678,7 @@ R""() ...@@ -678,7 +678,7 @@ R""()
#endif #endif
// write to output // write to output
// write gradients and hessians histogram for all 4 features // write gradients and Hessians histogram for all 4 features
// output data in linear order for further reduction // output data in linear order for further reduction
// output size = 4 (features) * 2 (counters) * 64 (bins) * sizeof(float) // output size = 4 (features) * 2 (counters) * 64 (bins) * sizeof(float)
/* memory layout of output: /* memory layout of output:
...@@ -700,17 +700,17 @@ R""() ...@@ -700,17 +700,17 @@ R""()
#if POWER_FEATURE_WORKGROUPS != 0 #if POWER_FEATURE_WORKGROUPS != 0
__global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 2 * NUM_BINS; __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * DWORD_FEATURES * 2 * NUM_BINS;
// if g_val and h_val are double, they are converted to float here // if g_val and h_val are double, they are converted to float here
// write gradients and hessians for 8 features // write gradients and Hessians for 8 features
output[0 * DWORD_FEATURES * NUM_BINS + ltid] = stat_val; output[0 * DWORD_FEATURES * NUM_BINS + ltid] = stat_val;
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
mem_fence(CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE);
// To avoid the cost of an extra reducting kernel, we have to deal with some // To avoid the cost of an extra reducing kernel, we have to deal with some
// gray area in OpenCL. We want the last work group that process this feature to // gray area in OpenCL. We want the last work group that process this feature to
// make the final reduction, and other threads will just quit. // make the final reduction, and other threads will just quit.
// This requires that the results written by other workgroups available to the // This requires that the results written by other workgroups available to the
// last workgroup (memory consistency) // last workgroup (memory consistency)
#if NVIDIA == 1 #if NVIDIA == 1
// this is equavalent to CUDA __threadfence(); // this is equivalent to CUDA __threadfence();
// ensure the writes above goes to main memory and other workgroups can see it // ensure the writes above goes to main memory and other workgroups can see it
asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory");
#else #else
...@@ -734,7 +734,7 @@ R""() ...@@ -734,7 +734,7 @@ R""()
} }
// make sure everyone in this workgroup is here // make sure everyone in this workgroup is here
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// everyone in this wrokgroup: if we are the last workgroup, then do reduction! // everyone in this workgroup: if we are the last workgroup, then do reduction!
if (*counter_val == (1 << POWER_FEATURE_WORKGROUPS) - 1) { if (*counter_val == (1 << POWER_FEATURE_WORKGROUPS) - 1) {
if (ltid == 0) { if (ltid == 0) {
// printf("workgroup %d start reduction!\n", group_id); // printf("workgroup %d start reduction!\n", group_id);
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#ifndef __OPENCL_VERSION__ #ifndef __OPENCL_VERSION__
// If we are including this file in C++, // If we are including this file in C++,
// the entire source file following (except the last #endif) will become // the entire source file following (except the last #endif) will become
// a raw string literal. The extra ")" is just for mathcing parentheses // a raw string literal. The extra ")" is just for matching parentheses
// to make the editor happy. The extra ")" and extra endif will be skipped. // to make the editor happy. The extra ")" and extra endif will be skipped.
// DO NOT add anything between here and the next #ifdef, otherwise you need // DO NOT add anything between here and the next #ifdef, otherwise you need
// to modify the skip count at the end of this file. // to modify the skip count at the end of this file.
...@@ -364,7 +364,7 @@ __kernel void histogram256(__global const uchar4* feature_data_base, ...@@ -364,7 +364,7 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
// assume this starts at 32 * 4 = 128-byte boundary // assume this starts at 32 * 4 = 128-byte boundary
// total size: 2 * 4 * 256 * size_of(float) = 8 KB // total size: 2 * 4 * 256 * size_of(float) = 8 KB
// organization: each feature/grad/hessian is at a different bank, // organization: each feature/grad/hessian is at a different bank,
// as indepedent of the feature value as possible // as independent of the feature value as possible
__local acc_type * gh_hist = (__local acc_type *)shared_array; __local acc_type * gh_hist = (__local acc_type *)shared_array;
// counter histogram // counter histogram
// total size: 4 * 256 * size_of(uint) = 4 KB // total size: 4 * 256 * size_of(uint) = 4 KB
...@@ -373,7 +373,7 @@ __kernel void histogram256(__global const uchar4* feature_data_base, ...@@ -373,7 +373,7 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
#endif #endif
// thread 0, 1, 2, 3 compute histograms for gradients first // thread 0, 1, 2, 3 compute histograms for gradients first
// thread 4, 5, 6, 7 compute histograms for hessians first // thread 4, 5, 6, 7 compute histograms for Hessians first
// etc. // etc.
uchar is_hessian_first = (ltid >> 2) & 1; uchar is_hessian_first = (ltid >> 2) & 1;
...@@ -383,7 +383,7 @@ __kernel void histogram256(__global const uchar4* feature_data_base, ...@@ -383,7 +383,7 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
__global const uchar4* feature_data = feature_data_base + group_feature * feature_size; __global const uchar4* feature_data = feature_data_base + group_feature * feature_size;
// size of threads that process this feature4 // size of threads that process this feature4
const uint subglobal_size = lsize * (1 << POWER_FEATURE_WORKGROUPS); const uint subglobal_size = lsize * (1 << POWER_FEATURE_WORKGROUPS);
// equavalent thread ID in this subgroup for this feature4 // equivalent thread ID in this subgroup for this feature4
const uint subglobal_tid = gtid - group_feature * subglobal_size; const uint subglobal_tid = gtid - group_feature * subglobal_size;
// extract feature mask, when a byte is set to 0, that feature is disabled // extract feature mask, when a byte is set to 0, that feature is disabled
#if ENABLE_ALL_FEATURES == 1 #if ENABLE_ALL_FEATURES == 1
...@@ -441,7 +441,7 @@ R""() ...@@ -441,7 +441,7 @@ R""()
// there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
// prefetch the next iteration variables // prefetch the next iteration variables
// we don't need bondary check because we have made the buffer larger // we don't need boundary check because we have made the buffer larger
stat1_next = ordered_gradients[i + subglobal_size]; stat1_next = ordered_gradients[i + subglobal_size];
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
stat2_next = ordered_hessians[i + subglobal_size]; stat2_next = ordered_hessians[i + subglobal_size];
...@@ -475,11 +475,11 @@ R""() ...@@ -475,11 +475,11 @@ R""()
addr2 = addr + 4 - 8 * is_hessian_first; addr2 = addr + 4 - 8 * is_hessian_first;
atomic_local_add_f(gh_hist + addr, s3_stat1); atomic_local_add_f(gh_hist + addr, s3_stat1);
// thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's gradients for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's gradients for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's hessians for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's Hessians for example 4, 5, 6, 7
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, s3_stat2); atomic_local_add_f(gh_hist + addr2, s3_stat2);
#endif #endif
// thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's hessians for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's Hessians for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's gradients for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's gradients for example 4, 5, 6, 7
s3_stat1 = stat1; s3_stat1 = stat1;
s3_stat2 = stat2; s3_stat2 = stat2;
...@@ -500,11 +500,11 @@ R""() ...@@ -500,11 +500,11 @@ R""()
addr2 = addr + 4 - 8 * is_hessian_first; addr2 = addr + 4 - 8 * is_hessian_first;
atomic_local_add_f(gh_hist + addr, s2_stat1); atomic_local_add_f(gh_hist + addr, s2_stat1);
// thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's gradients for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's gradients for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's hessians for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's Hessians for example 4, 5, 6, 7
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, s2_stat2); atomic_local_add_f(gh_hist + addr2, s2_stat2);
#endif #endif
// thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's hessians for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's Hessians for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's gradients for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's gradients for example 4, 5, 6, 7
s2_stat1 = stat1; s2_stat1 = stat1;
s2_stat2 = stat2; s2_stat2 = stat2;
...@@ -517,7 +517,7 @@ R""() ...@@ -517,7 +517,7 @@ R""()
// prefetch the next iteration variables // prefetch the next iteration variables
// we don't need bondary check because if it is out of boundary, ind_next = 0 // we don't need boundary check because if it is out of boundary, ind_next = 0
#ifndef IGNORE_INDICES #ifndef IGNORE_INDICES
feature4_next = feature_data[ind_next]; feature4_next = feature_data[ind_next];
#endif #endif
...@@ -532,11 +532,11 @@ R""() ...@@ -532,11 +532,11 @@ R""()
addr2 = addr + 4 - 8 * is_hessian_first; addr2 = addr + 4 - 8 * is_hessian_first;
atomic_local_add_f(gh_hist + addr, s1_stat1); atomic_local_add_f(gh_hist + addr, s1_stat1);
// thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's gradients for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's gradients for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's hessians for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's Hessians for example 4, 5, 6, 7
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, s1_stat2); atomic_local_add_f(gh_hist + addr2, s1_stat2);
#endif #endif
// thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's hessians for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's Hessians for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's gradients for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's gradients for example 4, 5, 6, 7
s1_stat1 = stat1; s1_stat1 = stat1;
s1_stat2 = stat2; s1_stat2 = stat2;
...@@ -557,11 +557,11 @@ R""() ...@@ -557,11 +557,11 @@ R""()
addr2 = addr + 4 - 8 * is_hessian_first; addr2 = addr + 4 - 8 * is_hessian_first;
atomic_local_add_f(gh_hist + addr, s0_stat1); atomic_local_add_f(gh_hist + addr, s0_stat1);
// thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's gradients for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's gradients for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's hessians for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's Hessians for example 4, 5, 6, 7
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, s0_stat2); atomic_local_add_f(gh_hist + addr2, s0_stat2);
#endif #endif
// thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's hessians for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's Hessians for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's gradients for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's gradients for example 4, 5, 6, 7
s0_stat1 = stat1; s0_stat1 = stat1;
s0_stat2 = stat2; s0_stat2 = stat2;
...@@ -725,13 +725,13 @@ R""() ...@@ -725,13 +725,13 @@ R""()
} }
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
mem_fence(CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE);
// To avoid the cost of an extra reducting kernel, we have to deal with some // To avoid the cost of an extra reducing kernel, we have to deal with some
// gray area in OpenCL. We want the last work group that process this feature to // gray area in OpenCL. We want the last work group that process this feature to
// make the final reduction, and other threads will just quit. // make the final reduction, and other threads will just quit.
// This requires that the results written by other workgroups available to the // This requires that the results written by other workgroups available to the
// last workgroup (memory consistency) // last workgroup (memory consistency)
#if NVIDIA == 1 #if NVIDIA == 1
// this is equavalent to CUDA __threadfence(); // this is equivalent to CUDA __threadfence();
// ensure the writes above goes to main memory and other workgroups can see it // ensure the writes above goes to main memory and other workgroups can see it
asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory");
#else #else
...@@ -757,7 +757,7 @@ R""() ...@@ -757,7 +757,7 @@ R""()
} }
// make sure everyone in this workgroup is here // make sure everyone in this workgroup is here
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// everyone in this wrokgroup: if we are the last workgroup, then do reduction! // everyone in this workgroup: if we are the last workgroup, then do reduction!
if (*counter_val == (1 << POWER_FEATURE_WORKGROUPS) - 1) { if (*counter_val == (1 << POWER_FEATURE_WORKGROUPS) - 1) {
if (ltid == 0) { if (ltid == 0) {
// printf("workgroup %d: %g %g %g %g %g %g %g %g\n", group_id, gh_hist[0], gh_hist[1], gh_hist[2], gh_hist[3], gh_hist[4], gh_hist[5], gh_hist[6], gh_hist[7]); // printf("workgroup %d: %g %g %g %g %g %g %g %g\n", group_id, gh_hist[0], gh_hist[1], gh_hist[2], gh_hist[3], gh_hist[4], gh_hist[5], gh_hist[6], gh_hist[7]);
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#ifndef __OPENCL_VERSION__ #ifndef __OPENCL_VERSION__
// If we are including this file in C++, // If we are including this file in C++,
// the entire source file following (except the last #endif) will become // the entire source file following (except the last #endif) will become
// a raw string literal. The extra ")" is just for mathcing parentheses // a raw string literal. The extra ")" is just for matching parentheses
// to make the editor happy. The extra ")" and extra endif will be skipped. // to make the editor happy. The extra ")" and extra endif will be skipped.
// DO NOT add anything between here and the next #ifdef, otherwise you need // DO NOT add anything between here and the next #ifdef, otherwise you need
// to modify the skip count at the end of this file. // to modify the skip count at the end of this file.
...@@ -308,7 +308,7 @@ __kernel void histogram64(__global const uchar4* feature_data_base, ...@@ -308,7 +308,7 @@ __kernel void histogram64(__global const uchar4* feature_data_base,
#endif #endif
// thread 0, 1, 2, 3 compute histograms for gradients first // thread 0, 1, 2, 3 compute histograms for gradients first
// thread 4, 5, 6, 7 compute histograms for hessians first // thread 4, 5, 6, 7 compute histograms for Hessians first
// etc. // etc.
uchar is_hessian_first = (ltid >> 2) & 1; uchar is_hessian_first = (ltid >> 2) & 1;
// thread 0-7 write result to bank0, 8-15 to bank1, 16-23 to bank2, 24-31 to bank3 // thread 0-7 write result to bank0, 8-15 to bank1, 16-23 to bank2, 24-31 to bank3
...@@ -320,11 +320,11 @@ __kernel void histogram64(__global const uchar4* feature_data_base, ...@@ -320,11 +320,11 @@ __kernel void histogram64(__global const uchar4* feature_data_base,
__global const uchar4* feature_data = feature_data_base + group_feature * feature_size; __global const uchar4* feature_data = feature_data_base + group_feature * feature_size;
// size of threads that process this feature4 // size of threads that process this feature4
const uint subglobal_size = lsize * (1 << POWER_FEATURE_WORKGROUPS); const uint subglobal_size = lsize * (1 << POWER_FEATURE_WORKGROUPS);
// equavalent thread ID in this subgroup for this feature4 // equivalent thread ID in this subgroup for this feature4
const uint subglobal_tid = gtid - group_feature * subglobal_size; const uint subglobal_tid = gtid - group_feature * subglobal_size;
// extract feature mask, when a byte is set to 0, that feature is disabled // extract feature mask, when a byte is set to 0, that feature is disabled
#if ENABLE_ALL_FEATURES == 1 #if ENABLE_ALL_FEATURES == 1
// hopefully the compiler will propogate the constants and eliminate all branches // hopefully the compiler will propagate the constants and eliminate all branches
uchar4 feature_mask = (uchar4)(0xff, 0xff, 0xff, 0xff); uchar4 feature_mask = (uchar4)(0xff, 0xff, 0xff, 0xff);
#else #else
uchar4 feature_mask = feature_masks[group_feature]; uchar4 feature_mask = feature_masks[group_feature];
...@@ -378,7 +378,7 @@ R""() ...@@ -378,7 +378,7 @@ R""()
// there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4 // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
for (uint i = subglobal_tid; i < num_data; i += subglobal_size) { for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
// prefetch the next iteration variables // prefetch the next iteration variables
// we don't need bondary check because we have made the buffer larger // we don't need boundary check because we have made the buffer larger
stat1_next = ordered_gradients[i + subglobal_size]; stat1_next = ordered_gradients[i + subglobal_size];
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
stat2_next = ordered_hessians[i + subglobal_size]; stat2_next = ordered_hessians[i + subglobal_size];
...@@ -411,9 +411,9 @@ R""() ...@@ -411,9 +411,9 @@ R""()
addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset; addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset;
addr2 = addr + 4 - 8 * is_hessian_first; addr2 = addr + 4 - 8 * is_hessian_first;
// thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's gradients for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's gradients for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's hessians for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's Hessians for example 4, 5, 6, 7
atomic_local_add_f(gh_hist + addr, s3_stat1); atomic_local_add_f(gh_hist + addr, s3_stat1);
// thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's hessians for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's Hessians for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's gradients for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's gradients for example 4, 5, 6, 7
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, s3_stat2); atomic_local_add_f(gh_hist + addr2, s3_stat2);
...@@ -436,9 +436,9 @@ R""() ...@@ -436,9 +436,9 @@ R""()
addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset; addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset;
addr2 = addr + 4 - 8 * is_hessian_first; addr2 = addr + 4 - 8 * is_hessian_first;
// thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's gradients for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's gradients for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's hessians for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's Hessians for example 4, 5, 6, 7
atomic_local_add_f(gh_hist + addr, s2_stat1); atomic_local_add_f(gh_hist + addr, s2_stat1);
// thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's hessians for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's Hessians for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's gradients for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's gradients for example 4, 5, 6, 7
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, s2_stat2); atomic_local_add_f(gh_hist + addr2, s2_stat2);
...@@ -468,9 +468,9 @@ R""() ...@@ -468,9 +468,9 @@ R""()
addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset; addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset;
addr2 = addr + 4 - 8 * is_hessian_first; addr2 = addr + 4 - 8 * is_hessian_first;
// thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's gradients for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's gradients for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's hessians for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's Hessians for example 4, 5, 6, 7
atomic_local_add_f(gh_hist + addr, s1_stat1); atomic_local_add_f(gh_hist + addr, s1_stat1);
// thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's hessians for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's Hessians for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's gradients for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's gradients for example 4, 5, 6, 7
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, s1_stat2); atomic_local_add_f(gh_hist + addr2, s1_stat2);
...@@ -493,9 +493,9 @@ R""() ...@@ -493,9 +493,9 @@ R""()
addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset; addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset;
addr2 = addr + 4 - 8 * is_hessian_first; addr2 = addr + 4 - 8 * is_hessian_first;
// thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's gradients for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's gradients for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's hessians for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's Hessians for example 4, 5, 6, 7
atomic_local_add_f(gh_hist + addr, s0_stat1); atomic_local_add_f(gh_hist + addr, s0_stat1);
// thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's hessians for example 0, 1, 2, 3 // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's Hessians for example 0, 1, 2, 3
// thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's gradients for example 4, 5, 6, 7 // thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's gradients for example 4, 5, 6, 7
#if CONST_HESSIAN == 0 #if CONST_HESSIAN == 0
atomic_local_add_f(gh_hist + addr2, s0_stat2); atomic_local_add_f(gh_hist + addr2, s0_stat2);
...@@ -652,7 +652,7 @@ R""() ...@@ -652,7 +652,7 @@ R""()
h_val = cnt_val * const_hessian; h_val = cnt_val * const_hessian;
#endif #endif
// write to output // write to output
// write gradients and hessians histogram for all 4 features // write gradients and Hessians histogram for all 4 features
// output data in linear order for further reduction // output data in linear order for further reduction
// output size = 4 (features) * 3 (counters) * 64 (bins) * sizeof(float) // output size = 4 (features) * 3 (counters) * 64 (bins) * sizeof(float)
/* memory layout of output: /* memory layout of output:
...@@ -676,17 +676,17 @@ R""() ...@@ -676,17 +676,17 @@ R""()
// if g_val and h_val are double, they are converted to float here // if g_val and h_val are double, they are converted to float here
// write gradients for 4 features // write gradients for 4 features
output[0 * 4 * NUM_BINS + ltid] = g_val; output[0 * 4 * NUM_BINS + ltid] = g_val;
// write hessians for 4 features // write Hessians for 4 features
output[1 * 4 * NUM_BINS + ltid] = h_val; output[1 * 4 * NUM_BINS + ltid] = h_val;
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
mem_fence(CLK_GLOBAL_MEM_FENCE); mem_fence(CLK_GLOBAL_MEM_FENCE);
// To avoid the cost of an extra reducting kernel, we have to deal with some // To avoid the cost of an extra reducing kernel, we have to deal with some
// gray area in OpenCL. We want the last work group that process this feature to // gray area in OpenCL. We want the last work group that process this feature to
// make the final reduction, and other threads will just quit. // make the final reduction, and other threads will just quit.
// This requires that the results written by other workgroups available to the // This requires that the results written by other workgroups available to the
// last workgroup (memory consistency) // last workgroup (memory consistency)
#if NVIDIA == 1 #if NVIDIA == 1
// this is equavalent to CUDA __threadfence(); // this is equivalent to CUDA __threadfence();
// ensure the writes above goes to main memory and other workgroups can see it // ensure the writes above goes to main memory and other workgroups can see it
asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory"); asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory");
#else #else
...@@ -710,7 +710,7 @@ R""() ...@@ -710,7 +710,7 @@ R""()
} }
// make sure everyone in this workgroup is here // make sure everyone in this workgroup is here
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// everyone in this wrokgroup: if we are the last workgroup, then do reduction! // everyone in this workgroup: if we are the last workgroup, then do reduction!
if (*counter_val == (1 << POWER_FEATURE_WORKGROUPS) - 1) { if (*counter_val == (1 << POWER_FEATURE_WORKGROUPS) - 1) {
if (ltid == 0) { if (ltid == 0) {
// printf("workgroup %d start reduction!\n", group_id); // printf("workgroup %d start reduction!\n", group_id);
......
...@@ -21,7 +21,7 @@ namespace LightGBM { ...@@ -21,7 +21,7 @@ namespace LightGBM {
/*! /*!
* \brief Feature parallel learning algorithm. * \brief Feature parallel learning algorithm.
* Different machine will find best split on different features, then sync global best split * Different machine will find best split on different features, then sync global best split
* It is recommonded used when #data is small or #feature is large * It is recommended used when #data is small or #feature is large
*/ */
template <typename TREELEARNER_T> template <typename TREELEARNER_T>
class FeatureParallelTreeLearner: public TREELEARNER_T { class FeatureParallelTreeLearner: public TREELEARNER_T {
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#include "split_info.hpp" #include "split_info.hpp"
#ifdef USE_GPU #ifdef USE_GPU
// Use 4KBytes aligned allocator for ordered gradients and ordered hessians when GPU is enabled. // Use 4KBytes aligned allocator for ordered gradients and ordered Hessians when GPU is enabled.
// This is necessary to pin the two arrays in memory and make transferring faster. // This is necessary to pin the two arrays in memory and make transferring faster.
#include <boost/align/aligned_allocator.hpp> #include <boost/align/aligned_allocator.hpp>
#endif #endif
......
...@@ -747,7 +747,7 @@ def test_cv(): ...@@ -747,7 +747,7 @@ def test_cv():
cv_res_obj = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=tss, cv_res_obj = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=tss,
verbose_eval=False) verbose_eval=False)
np.testing.assert_allclose(cv_res_gen['l2-mean'], cv_res_obj['l2-mean']) np.testing.assert_allclose(cv_res_gen['l2-mean'], cv_res_obj['l2-mean'])
# lambdarank # LambdaRank
X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.train')) '../../examples/lambdarank/rank.train'))
q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment