Unverified commit d6c41fa1, authored by Guolin Ke and committed by GitHub
Browse files

check _mm_prefetch in cmake (#2694)

* try to use _mm_prefetch anywhere

* refine

* fix bug

* remove the unneeded prefetch
parent ade9bd51
...@@ -115,6 +115,21 @@ if(USE_HDFS) ...@@ -115,6 +115,21 @@ if(USE_HDFS)
SET(HDFS_CXX_LIBRARIES ${HDFS_LIB} ${JAVA_JVM_LIBRARY}) SET(HDFS_CXX_LIBRARIES ${HDFS_LIB} ${JAVA_JVM_LIBRARY})
endif(USE_HDFS) endif(USE_HDFS)
# Probe whether the toolchain can compile a call to the _mm_prefetch intrinsic
# from <xmmintrin.h>. The result is stored in the MM_PREFETCH cache variable
# and, on success, exported to the sources as the MM_PREFETCH preprocessor
# definition.
#
# The probe uses the same call form as the PREFETCH_T0 macro that the
# definition enables (pointer cast to const char*, _MM_HINT_T0 hint), so a
# successful check means that exact expression compiles.
include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
#include <xmmintrin.h>
int main() {
  int a = 0;
  _mm_prefetch(reinterpret_cast<const char*>(&a), _MM_HINT_T0);
  return 0;
}
" MM_PREFETCH)
# Test the variable by name: if(${MM_PREFETCH}) would expand the value first
# and hand if() an empty argument list when the check fails.
if(MM_PREFETCH)
  message(STATUS "Use _mm_prefetch")
  ADD_DEFINITIONS(-DMM_PREFETCH)
endif()
if(UNIX OR MINGW OR CYGWIN) if(UNIX OR MINGW OR CYGWIN)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -O3 -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -O3 -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type")
if(USE_SWIG) if(USE_SWIG)
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) || defined(__INTEL_COMPILER) #if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) || defined(__INTEL_COMPILER) || MM_PREFETCH
#include <xmmintrin.h> #include <xmmintrin.h>
#define PREFETCH_T0(addr) _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0) #define PREFETCH_T0(addr) _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0)
#elif defined(__GNUC__) #elif defined(__GNUC__)
...@@ -71,6 +71,9 @@ typedef void(*AllgatherFunction)(char* input, comm_size_t input_size, const comm ...@@ -71,6 +71,9 @@ typedef void(*AllgatherFunction)(char* input, comm_size_t input_size, const comm
#define NO_SPECIFIC (-1) #define NO_SPECIFIC (-1)
// Prefetch size is usually 64 bytes
const int kCacheLineSize = 64;
} // namespace LightGBM } // namespace LightGBM
#endif // LightGBM_META_H_ #endif // LightGBM_META_H_
...@@ -71,12 +71,17 @@ class DenseBin: public Bin { ...@@ -71,12 +71,17 @@ class DenseBin: public Bin {
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t prefetch_size = 32 / sizeof(VAL_T); const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
for (data_size_t i = start; i < end; i++) { data_size_t i = start;
if (i + prefetch_size < end) { for (; i < pf_end; i++) {
PREFETCH_T0(data_.data() + data_indices[i + prefetch_size]); PREFETCH_T0(data_.data() + data_indices[i + pf_offset]);
} const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[data_indices[i]]; const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i]; out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i]; out[bin].sum_hessians += ordered_hessians[i];
...@@ -87,11 +92,17 @@ class DenseBin: public Bin { ...@@ -87,11 +92,17 @@ class DenseBin: public Bin {
void ConstructHistogram(data_size_t start, data_size_t end, void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
const data_size_t prefetch_size = 32 / sizeof(VAL_T); const data_size_t pf_offset = 64 / sizeof(VAL_T);
for (data_size_t i = start; i < end; ++i) { const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
if (i + prefetch_size < end) { data_size_t i = start;
PREFETCH_T0(data_.data() + i + prefetch_size); for (; i < pf_end; i++) {
} PREFETCH_T0(data_.data() + i + pf_offset);
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[i]; const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i]; out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i]; out[bin].sum_hessians += ordered_hessians[i];
...@@ -102,11 +113,16 @@ class DenseBin: public Bin { ...@@ -102,11 +113,16 @@ class DenseBin: public Bin {
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_gradients,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
const data_size_t prefetch_size = 32 / sizeof(VAL_T); const data_size_t pf_offset = 64 / sizeof(VAL_T);
for (data_size_t i = start; i < end; ++i) { const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
if (i + prefetch_size < end) { data_size_t i = start;
PREFETCH_T0(data_.data() + data_indices[i + prefetch_size]); for (; i < pf_end; i++) {
} PREFETCH_T0(data_.data() + data_indices[i + pf_offset]);
const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[data_indices[i]]; const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i]; out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt; ++out[bin].cnt;
...@@ -116,11 +132,16 @@ class DenseBin: public Bin { ...@@ -116,11 +132,16 @@ class DenseBin: public Bin {
void ConstructHistogram(data_size_t start, data_size_t end, void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_gradients,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
const data_size_t prefetch_size = 32 / sizeof(VAL_T); const data_size_t pf_offset = 64 / sizeof(VAL_T);
for (data_size_t i = start; i < end; ++i) { const data_size_t pf_end = end - pf_offset - kCacheLineSize / sizeof(VAL_T);
if (i + prefetch_size < end) { data_size_t i = start;
PREFETCH_T0(data_.data() + i + prefetch_size); for (; i < pf_end; i++) {
} PREFETCH_T0(data_.data() + i + pf_offset);
const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const VAL_T bin = data_[i]; const VAL_T bin = data_[i];
out[bin].sum_gradients += ordered_gradients[i]; out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt; ++out[bin].cnt;
......
...@@ -76,11 +76,18 @@ class Dense4bitsBin : public Bin { ...@@ -76,11 +76,18 @@ class Dense4bitsBin : public Bin {
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
const data_size_t prefetch_size = 32; const data_size_t pf_offset = 64;
for (data_size_t i = start; i < end; ++i) { const data_size_t pf_end = end - pf_offset - kCacheLineSize;
if (i + prefetch_size < end) { data_size_t i = start;
PREFETCH_T0(data_.data() + (data_indices[i + prefetch_size] >> 1)); for (; i < pf_end; i++) {
} PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1));
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const data_size_t idx = data_indices[i]; const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i]; out[bin].sum_gradients += ordered_gradients[i];
...@@ -92,11 +99,17 @@ class Dense4bitsBin : public Bin { ...@@ -92,11 +99,17 @@ class Dense4bitsBin : public Bin {
void ConstructHistogram(data_size_t start, data_size_t end, void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
const data_size_t prefetch_size = 32; const data_size_t pf_offset = 64;
for (data_size_t i = start; i < end; ++i) { const data_size_t pf_end = end - pf_offset - kCacheLineSize;
if (i + prefetch_size < end) { data_size_t i = start;
PREFETCH_T0(data_.data() + ((i + prefetch_size) >> 1)); for (; i < pf_end; i++) {
} PREFETCH_T0(data_.data() + ((i + pf_offset) >> 1));
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i]; out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i]; out[bin].sum_hessians += ordered_hessians[i];
...@@ -107,11 +120,17 @@ class Dense4bitsBin : public Bin { ...@@ -107,11 +120,17 @@ class Dense4bitsBin : public Bin {
void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end, void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_gradients,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
const data_size_t prefetch_size = 32; const data_size_t pf_offset = 64;
for (data_size_t i = start; i < end; ++i) { const data_size_t pf_end = end - pf_offset - kCacheLineSize;
if (i + prefetch_size < end) { data_size_t i = start;
PREFETCH_T0(data_.data() + (data_indices[i + prefetch_size] >> 1)); for (; i < pf_end; i++) {
} PREFETCH_T0(data_.data() + (data_indices[i + pf_offset] >> 1));
const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const data_size_t idx = data_indices[i]; const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i]; out[bin].sum_gradients += ordered_gradients[i];
...@@ -122,11 +141,16 @@ class Dense4bitsBin : public Bin { ...@@ -122,11 +141,16 @@ class Dense4bitsBin : public Bin {
void ConstructHistogram(data_size_t start, data_size_t end, void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_gradients,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
const data_size_t prefetch_size = 32; const data_size_t pf_offset = 64;
for (data_size_t i = start; i < end; ++i) { const data_size_t pf_end = end - pf_offset - kCacheLineSize;
if (i + prefetch_size < end) { data_size_t i = start;
PREFETCH_T0(data_.data() + ((i + prefetch_size) >> 1)); for (; i < pf_end; i++) {
} PREFETCH_T0(data_.data() + ((i + pf_offset) >> 1));
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
}
for (; i < end; i++) {
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i]; out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt; ++out[bin].cnt;
......
...@@ -84,41 +84,30 @@ class OrderedSparseBin: public OrderedBin { ...@@ -84,41 +84,30 @@ class OrderedSparseBin: public OrderedBin {
void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian, void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
const data_size_t prefetch_size = 4;
// get current leaf boundary // get current leaf boundary
const data_size_t start = leaf_start_[leaf]; const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf]; const data_size_t end = start + leaf_cnt_[leaf];
for (data_size_t i = start; i < end; ++i) { for (data_size_t i = start; i < end; ++i) {
if (i + prefetch_size < end) { const VAL_T bin = ordered_pair_[i].bin;
PREFETCH_T0(ordered_pair_.data() + i + prefetch_size); const auto g = gradient[ordered_pair_[i].ridx];
PREFETCH_T0(gradient + ordered_pair_[i + prefetch_size].ridx); const auto h = hessian[ordered_pair_[i].ridx];
PREFETCH_T0(hessian + ordered_pair_[i + prefetch_size].ridx);
}
const VAL_T bin0 = ordered_pair_[i].bin;
const auto g0 = gradient[ordered_pair_[i].ridx];
const auto h0 = hessian[ordered_pair_[i].ridx];
out[bin0].sum_gradients += g0; out[bin].sum_gradients += g;
out[bin0].sum_hessians += h0; out[bin].sum_hessians += h;
++out[bin0].cnt; ++out[bin].cnt;
} }
} }
void ConstructHistogram(int leaf, const score_t* gradient, void ConstructHistogram(int leaf, const score_t* gradient,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
const data_size_t prefetch_size = 4;
// get current leaf boundary // get current leaf boundary
const data_size_t start = leaf_start_[leaf]; const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf]; const data_size_t end = start + leaf_cnt_[leaf];
for (data_size_t i = start; i < end; ++i) { for (data_size_t i = start; i < end; ++i) {
if (i + prefetch_size < end) { const VAL_T bin = ordered_pair_[i].bin;
PREFETCH_T0(ordered_pair_.data() + i + prefetch_size); const auto g = gradient[ordered_pair_[i].ridx];
PREFETCH_T0(gradient + ordered_pair_[i + prefetch_size].ridx); out[bin].sum_gradients += g;
} ++out[bin].cnt;
const VAL_T bin0 = ordered_pair_[i].bin;
const auto g0 = gradient[ordered_pair_[i].ridx];
out[bin0].sum_gradients += g0;
++out[bin0].cnt;
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment