Commit 66b7f032 authored by Guolin Ke

reduce branching in histogram sum-up.

parent 062bfa79
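
Editorial note: the diff below removes a per-call `data_indices != nullptr` branch from the histogram construction kernels. Instead of one `ConstructHistogram` that decides at run time whether to walk a row-index subset or the full data, each `Bin` implementation now exposes two overloads, and `Dataset::ConstructHistograms` picks the right one up front. A minimal sketch of the pattern (simplified types and names, not LightGBM's actual code):

```cpp
#include <cstdint>

struct HistogramBinEntry {
  double sum_gradients = 0.0;
  double sum_hessians = 0.0;
  int32_t cnt = 0;
};

// Before: a single kernel branches on data_indices. The branch itself is
// cheap, but merging both loops into one function makes it harder for the
// compiler to specialize and unroll each case.
void ConstructHistogramBranchy(const int32_t* data_indices, int32_t num_data,
                               const uint8_t* bins, const float* gradients,
                               HistogramBinEntry* out) {
  if (data_indices != nullptr) {  // use part of data
    for (int32_t i = 0; i < num_data; ++i) {
      const uint8_t bin = bins[data_indices[i]];
      out[bin].sum_gradients += gradients[i];
      ++out[bin].cnt;
    }
  } else {  // use full data
    for (int32_t i = 0; i < num_data; ++i) {
      out[bins[i]].sum_gradients += gradients[i];
      ++out[bins[i]].cnt;
    }
  }
}

// After: two overloads; each caller already knows which case it is in,
// so each loop body is compiled and optimized on its own.
void ConstructHistogram(const int32_t* data_indices, int32_t num_data,
                        const uint8_t* bins, const float* gradients,
                        HistogramBinEntry* out) {
  for (int32_t i = 0; i < num_data; ++i) {
    const uint8_t bin = bins[data_indices[i]];
    out[bin].sum_gradients += gradients[i];
    ++out[bin].cnt;
  }
}

void ConstructHistogram(int32_t num_data, const uint8_t* bins,
                        const float* gradients, HistogramBinEntry* out) {
  for (int32_t i = 0; i < num_data; ++i) {
    out[bins[i]].sum_gradients += gradients[i];
    ++out[bins[i]].cnt;
  }
}
```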
@@ -36,19 +36,19 @@ install:
 script:
 - cd $TRAVIS_BUILD_DIR
-- mkdir build && cd build && cmake .. && make -j
+- mkdir build && cd build && cmake .. && make
 - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
 - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
 - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
 - cd $TRAVIS_BUILD_DIR && pep8 --ignore=E501 --exclude=./compute .
-- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON .. && make -j
+- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON .. && make
 - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
 - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
 - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
 - cd $TRAVIS_BUILD_DIR
 - rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=ON -DBOOST_ROOT="$HOME/miniconda/" -DOpenCL_INCLUDE_DIR=$AMDAPPSDK/include/ ..
 - sed -i 's/std::string device_type = "cpu";/std::string device_type = "gpu";/' ../include/LightGBM/config.h
-- make -j$(nproc)
+- make
 - sed -i 's/std::string device_type = "gpu";/std::string device_type = "cpu";/' ../include/LightGBM/config.h
 - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
 - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
......
@@ -47,7 +47,7 @@ if(USE_GPU)
 endif(USE_GPU)
 if(UNIX OR MINGW OR CYGWIN)
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O3 -Wall -std=c++11 -Wno-ignored-attributes")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O3 -Wall -std=c++11 -Wno-ignored-attributes -march=core2 -mtune=native")
 endif()
 if(MSVC)
......
@@ -333,6 +333,10 @@ public:
     const score_t* ordered_gradients, const score_t* ordered_hessians,
     HistogramBinEntry* out) const = 0;
+  virtual void ConstructHistogram(data_size_t num_data,
+    const score_t* ordered_gradients, const score_t* ordered_hessians,
+    HistogramBinEntry* out) const = 0;
+
   /*!
   * \brief Construct histogram of this feature,
   * Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
@@ -348,6 +352,9 @@ public:
   virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
     const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
+  virtual void ConstructHistogram(data_size_t num_data,
+    const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
+
   /*!
   * \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
   * \param min_bin min_bin of current used feature
......
@@ -384,7 +384,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
   }
   // get sub gradients
   for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
-    auto bias = cur_tree_id * num_data_;
+    size_t bias = static_cast<size_t>(cur_tree_id) * num_data_;
     // cannot multi-threading here.
     for (int i = 0; i < bag_data_cnt_; ++i) {
       gradients_[bias + i] = gradient[bias + bag_data_indices_[i]];
@@ -404,8 +404,9 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
 #endif
     std::unique_ptr<Tree> new_tree(new Tree(2));
     if (class_need_train_[cur_tree_id]) {
+      size_t bias = static_cast<size_t>(cur_tree_id) * num_data_;
       new_tree.reset(
-        tree_learner_->Train(gradient + cur_tree_id * num_data_, hessian + cur_tree_id * num_data_, is_constant_hessian_));
+        tree_learner_->Train(gradient + bias, hessian + bias, is_constant_hessian_));
     }
 #ifdef TIMETAG
     tree_time += std::chrono::steady_clock::now() - start_time;
......
@@ -80,7 +80,7 @@ public:
     std::vector<score_t> tmp_gradients(cnt, 0.0f);
     for (data_size_t i = 0; i < cnt; ++i) {
       for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
-        int idx = cur_tree_id * num_data_ + start + i;
+        size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
         tmp_gradients[i] += std::fabs(gradients_[idx] * hessians_[idx]);
       }
     }
@@ -97,7 +97,7 @@ public:
     for (data_size_t i = 0; i < cnt; ++i) {
       score_t grad = 0.0f;
       for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
-        int idx = cur_tree_id * num_data_ + start + i;
+        size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
         grad += std::fabs(gradients_[idx] * hessians_[idx]);
       }
       if (grad >= threshold) {
@@ -111,7 +111,7 @@ public:
       if (cur_rand.NextFloat() < prob) {
         buffer[cur_left_cnt++] = start + i;
         for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
-          int idx = cur_tree_id * num_data_ + start + i;
+          size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
           gradients_[idx] *= multiply;
           hessians_[idx] *= multiply;
         }
......
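
Editorial note on the `size_t` changes above (gbdt.cpp and the GOSS sampler): `cur_tree_id * num_data_` multiplies two `int` values, so the product can overflow before it is ever used as a buffer offset; casting one operand to `size_t` first keeps the whole multiply in 64 bits. A standalone illustration with hypothetical sizes:

```cpp
#include <cstdio>

int main() {
  const int cur_tree_id = 3;       // hypothetical multiclass tree index
  const int num_data = 900000000;  // hypothetical 0.9e9-row dataset
  // int * int is evaluated in int: it overflows (undefined behavior)
  // before the cast ever widens the result.
  const long long bad = static_cast<long long>(cur_tree_id * num_data);
  // Widening one operand first keeps the multiplication in 64 bits.
  const long long good = static_cast<long long>(cur_tree_id) * num_data;
  printf("bad = %lld, good = %lld\n", bad, good);  // only `good` is 2.7e9
  return 0;
}
```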
@@ -435,7 +435,6 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
       }
       ptr_ordered_grad = ordered_gradients;
       ptr_ordered_hess = ordered_hessians;
-    }
     if (!is_constant_hessian) {
       OMP_INIT_EX();
 #pragma omp parallel for schedule(static)
@@ -515,6 +514,85 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
       }
       OMP_THROW_EX();
     }
+  } else {
+    if (!is_constant_hessian) {
+      OMP_INIT_EX();
+#pragma omp parallel for schedule(static)
+      for (int group = 0; group < num_groups_; ++group) {
+        OMP_LOOP_EX_BEGIN();
+        bool is_group_used = false;
+        const int f_cnt = group_feature_cnt_[group];
+        for (int j = 0; j < f_cnt; ++j) {
+          const int fidx = group_feature_start_[group] + j;
+          if (is_feature_used[fidx]) {
+            is_group_used = true;
+            break;
+          }
+        }
+        // skip this group if none of its features is used
+        if (!is_group_used) { continue; }
+        auto data_ptr = hist_data + group_bin_boundaries_[group];
+        const int num_bin = feature_groups_[group]->num_total_bin_;
+        std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
+        // construct histograms for smaller leaf
+        if (ordered_bins[group] == nullptr) {
+          // ordered bin is not used
+          feature_groups_[group]->bin_data_->ConstructHistogram(
+            num_data,
+            ptr_ordered_grad,
+            ptr_ordered_hess,
+            data_ptr);
+        } else {
+          // use ordered bin
+          ordered_bins[group]->ConstructHistogram(leaf_idx,
+            gradients,
+            hessians,
+            data_ptr);
+        }
+        OMP_LOOP_EX_END();
+      }
+      OMP_THROW_EX();
+    } else {
+      OMP_INIT_EX();
+#pragma omp parallel for schedule(static)
+      for (int group = 0; group < num_groups_; ++group) {
+        OMP_LOOP_EX_BEGIN();
+        bool is_group_used = false;
+        const int f_cnt = group_feature_cnt_[group];
+        for (int j = 0; j < f_cnt; ++j) {
+          const int fidx = group_feature_start_[group] + j;
+          if (is_feature_used[fidx]) {
+            is_group_used = true;
+            break;
+          }
+        }
+        // skip this group if none of its features is used
+        if (!is_group_used) { continue; }
+        auto data_ptr = hist_data + group_bin_boundaries_[group];
+        const int num_bin = feature_groups_[group]->num_total_bin_;
+        std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
+        // construct histograms for smaller leaf
+        if (ordered_bins[group] == nullptr) {
+          // ordered bin is not used
+          feature_groups_[group]->bin_data_->ConstructHistogram(
+            num_data,
+            ptr_ordered_grad,
+            data_ptr);
+        } else {
+          // use ordered bin
+          ordered_bins[group]->ConstructHistogram(leaf_idx,
+            gradients,
+            data_ptr);
+        }
+        // hessian is constant: recover per-bin sums from the counts
+        for (int i = 0; i < num_bin; ++i) {
+          data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
+        }
+        OMP_LOOP_EX_END();
+      }
+      OMP_THROW_EX();
+    }
+  }
 }
 void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
......
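
Editorial note on the new `else` branch above: when `is_constant_hessian` holds, the full-data path calls the gradient-only overload and reconstructs `sum_hessians` afterwards, since every row contributes the same hessian `hessians[0]`. A minimal sketch of that trick (simplified types, not the actual code):

```cpp
#include <vector>

struct HistogramBinEntry {
  double sum_gradients = 0.0;
  double sum_hessians = 0.0;
  int cnt = 0;
};

// With a constant hessian h, accumulating h once per row is redundant:
// the per-bin sum is exactly cnt * h, recovered in one cheap pass.
void FixConstantHessian(std::vector<HistogramBinEntry>* hist, double h) {
  for (auto& entry : *hist) {
    entry.sum_hessians = entry.cnt * h;
  }
}
```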
@@ -66,8 +66,6 @@ public:
   void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
     const score_t* ordered_gradients, const score_t* ordered_hessians,
     HistogramBinEntry* out) const override {
-    // use 4-way unrolling, will be faster
-    if (data_indices != nullptr) { // if use part of data
     const data_size_t rest = num_data & 0x3;
     data_size_t i = 0;
     for (; i < num_data - rest; i += 4) {
@@ -97,7 +95,11 @@ public:
       out[bin].sum_hessians += ordered_hessians[i];
       ++out[bin].cnt;
     }
-    } else { // use full data
+  }
+
+  void ConstructHistogram(data_size_t num_data,
+    const score_t* ordered_gradients, const score_t* ordered_hessians,
+    HistogramBinEntry* out) const override {
     const data_size_t rest = num_data & 0x3;
     data_size_t i = 0;
     for (; i < num_data - rest; i += 4) {
@@ -128,13 +130,10 @@ public:
       ++out[bin].cnt;
     }
   }
-  }
   void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
     const score_t* ordered_gradients,
     HistogramBinEntry* out) const override {
-    // use 4-way unrolling, will be faster
-    if (data_indices != nullptr) { // if use part of data
     const data_size_t rest = num_data & 0x3;
     data_size_t i = 0;
     for (; i < num_data - rest; i += 4) {
@@ -158,7 +157,11 @@ public:
       out[bin].sum_gradients += ordered_gradients[i];
       ++out[bin].cnt;
     }
-    } else { // use full data
+  }
+
+  void ConstructHistogram(data_size_t num_data,
+    const score_t* ordered_gradients,
+    HistogramBinEntry* out) const override {
     const data_size_t rest = num_data & 0x3;
     data_size_t i = 0;
     for (; i < num_data - rest; i += 4) {
@@ -183,7 +186,6 @@ public:
       ++out[bin].cnt;
     }
   }
-  }
   virtual data_size_t Split(
     uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
......
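
Editorial note on the dense-bin kernels above: each new overload keeps the existing 4-way manual unrolling, where `num_data & 0x3` is the remainder (`num_data % 4`) handled by a scalar tail loop. The structure in isolation (illustrative only):

```cpp
#include <cstdio>

int main() {
  const int n = 10;
  const double x[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  double sum = 0.0;
  const int rest = n & 0x3;  // n % 4: elements left over after the blocks
  int i = 0;
  // main loop consumes four elements per iteration
  for (; i < n - rest; i += 4) {
    sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];
  }
  // scalar tail handles the remaining rest (< 4) elements
  for (; i < n; ++i) {
    sum += x[i];
  }
  printf("sum = %f\n", sum);  // 55
  return 0;
}
```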
@@ -49,7 +49,7 @@ public:
   void Push(int, data_size_t idx, uint32_t value) override {
     if (buf_.empty()) {
 #pragma omp critical
       {
         if (buf_.empty()) {
           int len = (num_data_ + 1) / 2;
@@ -80,7 +80,6 @@ public:
   void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
     const score_t* ordered_gradients, const score_t* ordered_hessians,
     HistogramBinEntry* out) const override {
-    if (data_indices != nullptr) { // if use part of data
     const data_size_t rest = num_data & 0x3;
     data_size_t i = 0;
@@ -98,7 +97,6 @@ public:
       idx = data_indices[i + 3];
       const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
       out[bin0].sum_gradients += ordered_gradients[i];
       out[bin1].sum_gradients += ordered_gradients[i + 1];
       out[bin2].sum_gradients += ordered_gradients[i + 2];
@@ -123,8 +121,11 @@ public:
       out[bin].sum_hessians += ordered_hessians[i];
       ++out[bin].cnt;
     }
-    } else { // use full data
+  }
+
+  void ConstructHistogram(data_size_t num_data,
+    const score_t* ordered_gradients, const score_t* ordered_hessians,
+    HistogramBinEntry* out) const override {
     const data_size_t rest = num_data & 0x3;
     data_size_t i = 0;
     for (; i < num_data - rest; i += 4) {
@@ -157,17 +158,13 @@ public:
       ++out[bin].cnt;
     }
   }
-  }
   void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
     const score_t* ordered_gradients,
     HistogramBinEntry* out) const override {
-    if (data_indices != nullptr) { // if use part of data
     const data_size_t rest = num_data & 0x3;
     data_size_t i = 0;
     for (; i < num_data - rest; i += 4) {
       data_size_t idx = data_indices[i];
       const auto bin0 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
@@ -180,7 +177,6 @@ public:
       idx = data_indices[i + 3];
       const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
       out[bin0].sum_gradients += ordered_gradients[i];
       out[bin1].sum_gradients += ordered_gradients[i + 1];
       out[bin2].sum_gradients += ordered_gradients[i + 2];
@@ -190,7 +186,6 @@ public:
       ++out[bin1].cnt;
       ++out[bin2].cnt;
       ++out[bin3].cnt;
     }
     for (; i < num_data; ++i) {
@@ -199,8 +194,11 @@ public:
       out[bin].sum_gradients += ordered_gradients[i];
       ++out[bin].cnt;
     }
-    } else { // use full data
+  }
+
+  void ConstructHistogram(data_size_t num_data,
+    const score_t* ordered_gradients,
+    HistogramBinEntry* out) const override {
     const data_size_t rest = num_data & 0x3;
     data_size_t i = 0;
     for (; i < num_data - rest; i += 4) {
@@ -227,7 +225,6 @@ public:
       ++out[bin].cnt;
     }
   }
-  }
   virtual data_size_t Split(
     uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
......
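
Editorial note on the 4-bit bin class above: two bin values are packed per byte, and `(data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf` extracts the low nibble for even `idx` and the high nibble for odd `idx`. A standalone demonstration with made-up data:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // element 0 lives in the low nibble of byte 0, element 1 in its high
  // nibble, element 2 in the low nibble of byte 1, and so on.
  const uint8_t data[] = {0x52, 0x3A};  // packed bins: 2, 5, 10, 3
  for (int idx = 0; idx < 4; ++idx) {
    // idx >> 1 selects the byte; (idx & 1) << 2 shifts by 0 or 4 bits
    const int bin = (data[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
    printf("bin[%d] = %d\n", idx, bin);
  }
  return 0;
}
```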
@@ -104,12 +104,24 @@ public:
     Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
   }
+  void ConstructHistogram(data_size_t, const score_t*,
+    const score_t*, HistogramBinEntry*) const override {
+    // Will use OrderedSparseBin->ConstructHistogram() instead
+    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
+  }
   void ConstructHistogram(const data_size_t*, data_size_t, const score_t*,
     HistogramBinEntry*) const override {
     // Will use OrderedSparseBin->ConstructHistogram() instead
     Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
   }
+  void ConstructHistogram(data_size_t, const score_t*,
+    HistogramBinEntry*) const override {
+    // Will use OrderedSparseBin->ConstructHistogram() instead
+    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
+  }
   inline bool NextNonzero(data_size_t* i_delta,
     data_size_t* cur_pos) const {
     ++(*i_delta);
......