Commit 66b7f032 authored by Guolin Ke
Browse files

reduce branching in histogram sum-up.

parent 062bfa79
...@@ -36,19 +36,19 @@ install: ...@@ -36,19 +36,19 @@ install:
script: script:
- cd $TRAVIS_BUILD_DIR - cd $TRAVIS_BUILD_DIR
- mkdir build && cd build && cmake .. && make -j - mkdir build && cd build && cmake .. && make
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
- cd $TRAVIS_BUILD_DIR && pep8 --ignore=E501 --exclude=./compute . - cd $TRAVIS_BUILD_DIR && pep8 --ignore=E501 --exclude=./compute .
- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j - rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
- cd $TRAVIS_BUILD_DIR - cd $TRAVIS_BUILD_DIR
- rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=ON -DBOOST_ROOT="$HOME/miniconda/" -DOpenCL_INCLUDE_DIR=$AMDAPPSDK/include/ .. - rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=ON -DBOOST_ROOT="$HOME/miniconda/" -DOpenCL_INCLUDE_DIR=$AMDAPPSDK/include/ ..
- sed -i 's/std::string device_type = "cpu";/std::string device_type = "gpu";/' ../include/LightGBM/config.h - sed -i 's/std::string device_type = "cpu";/std::string device_type = "gpu";/' ../include/LightGBM/config.h
- make -j$(nproc) - make
- sed -i 's/std::string device_type = "gpu";/std::string device_type = "cpu";/' ../include/LightGBM/config.h - sed -i 's/std::string device_type = "gpu";/std::string device_type = "cpu";/' ../include/LightGBM/config.h
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
......
...@@ -47,7 +47,7 @@ if(USE_GPU) ...@@ -47,7 +47,7 @@ if(USE_GPU)
endif(USE_GPU) endif(USE_GPU)
if(UNIX OR MINGW OR CYGWIN) if(UNIX OR MINGW OR CYGWIN)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O3 -Wall -std=c++11 -Wno-ignored-attributes") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O3 -Wall -std=c++11 -Wno-ignored-attributes -march=core2 -mtune=native")
endif() endif()
if(MSVC) if(MSVC)
......
...@@ -333,6 +333,10 @@ public: ...@@ -333,6 +333,10 @@ public:
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0; HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram of this feature, overload that takes no data_indices array
* Note: the dense implementations shown in this change read the bin of entry i
*       directly at position i, for i in [0, num_data)
* \param num_data Number of entries to accumulate
* \param ordered_gradients Gradients, read by position i
* \param ordered_hessians Hessians, read by position i
* \param out Output histogram entries (sum_gradients, sum_hessians and cnt are accumulated)
*/
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
/*! /*!
* \brief Construct histogram of this feature, * \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance * Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
...@@ -348,6 +352,9 @@ public: ...@@ -348,6 +352,9 @@ public:
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0; const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram of this feature, gradient-only overload that takes no data_indices array
* Note: the dense implementations shown in this change read the bin of entry i
*       directly at position i, for i in [0, num_data), and do not touch sum_hessians
* \param num_data Number of entries to accumulate
* \param ordered_gradients Gradients, read by position i
* \param out Output histogram entries (sum_gradients and cnt are accumulated)
*/
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
/*! /*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices) * \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
* \param min_bin min_bin of current used feature * \param min_bin min_bin of current used feature
......
...@@ -384,7 +384,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is ...@@ -384,7 +384,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
} }
// get sub gradients // get sub gradients
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
auto bias = cur_tree_id * num_data_; size_t bias = static_cast<size_t>(cur_tree_id)* num_data_;
// cannot multi-threading here. // cannot multi-threading here.
for (int i = 0; i < bag_data_cnt_; ++i) { for (int i = 0; i < bag_data_cnt_; ++i) {
gradients_[bias + i] = gradient[bias + bag_data_indices_[i]]; gradients_[bias + i] = gradient[bias + bag_data_indices_[i]];
...@@ -404,8 +404,9 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is ...@@ -404,8 +404,9 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
#endif #endif
std::unique_ptr<Tree> new_tree(new Tree(2)); std::unique_ptr<Tree> new_tree(new Tree(2));
if (class_need_train_[cur_tree_id]) { if (class_need_train_[cur_tree_id]) {
size_t bias = static_cast<size_t>(cur_tree_id)* num_data_;
new_tree.reset( new_tree.reset(
tree_learner_->Train(gradient + cur_tree_id * num_data_, hessian + cur_tree_id * num_data_, is_constant_hessian_)); tree_learner_->Train(gradient + bias, hessian + bias, is_constant_hessian_));
} }
#ifdef TIMETAG #ifdef TIMETAG
tree_time += std::chrono::steady_clock::now() - start_time; tree_time += std::chrono::steady_clock::now() - start_time;
......
...@@ -80,7 +80,7 @@ public: ...@@ -80,7 +80,7 @@ public:
std::vector<score_t> tmp_gradients(cnt, 0.0f); std::vector<score_t> tmp_gradients(cnt, 0.0f);
for (data_size_t i = 0; i < cnt; ++i) { for (data_size_t i = 0; i < cnt; ++i) {
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
int idx = cur_tree_id * num_data_ + start + i; size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
tmp_gradients[i] += std::fabs(gradients_[idx] * hessians_[idx]); tmp_gradients[i] += std::fabs(gradients_[idx] * hessians_[idx]);
} }
} }
...@@ -97,7 +97,7 @@ public: ...@@ -97,7 +97,7 @@ public:
for (data_size_t i = 0; i < cnt; ++i) { for (data_size_t i = 0; i < cnt; ++i) {
score_t grad = 0.0f; score_t grad = 0.0f;
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
int idx = cur_tree_id * num_data_ + start + i; size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
grad += std::fabs(gradients_[idx] * hessians_[idx]); grad += std::fabs(gradients_[idx] * hessians_[idx]);
} }
if (grad >= threshold) { if (grad >= threshold) {
...@@ -111,7 +111,7 @@ public: ...@@ -111,7 +111,7 @@ public:
if (cur_rand.NextFloat() < prob) { if (cur_rand.NextFloat() < prob) {
buffer[cur_left_cnt++] = start + i; buffer[cur_left_cnt++] = start + i;
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
int idx = cur_tree_id * num_data_ + start + i; size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
gradients_[idx] *= multiply; gradients_[idx] *= multiply;
hessians_[idx] *= multiply; hessians_[idx] *= multiply;
} }
......
This diff is collapsed.
...@@ -435,85 +435,163 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used, ...@@ -435,85 +435,163 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
} }
ptr_ordered_grad = ordered_gradients; ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians; ptr_ordered_hess = ordered_hessians;
} if (!is_constant_hessian) {
if (!is_constant_hessian) { OMP_INIT_EX();
OMP_INIT_EX(); #pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static) for (int group = 0; group < num_groups_; ++group) {
for (int group = 0; group < num_groups_; ++group) { OMP_LOOP_EX_BEGIN();
OMP_LOOP_EX_BEGIN(); bool is_groud_used = false;
bool is_groud_used = false; const int f_cnt = group_feature_cnt_[group];
const int f_cnt = group_feature_cnt_[group]; for (int j = 0; j < f_cnt; ++j) {
for (int j = 0; j < f_cnt; ++j) { const int fidx = group_feature_start_[group] + j;
const int fidx = group_feature_start_[group] + j; if (is_feature_used[fidx]) {
if (is_feature_used[fidx]) { is_groud_used = true;
is_groud_used = true; break;
break; }
}
if (!is_groud_used) { continue; }
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
} }
OMP_LOOP_EX_END();
} }
if (!is_groud_used) { continue; } OMP_THROW_EX();
// feature is not used } else {
auto data_ptr = hist_data + group_bin_boundaries_[group]; OMP_INIT_EX();
const int num_bin = feature_groups_[group]->num_total_bin_; #pragma omp parallel for schedule(static)
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry)); for (int group = 0; group < num_groups_; ++group) {
// construct histograms for smaller leaf OMP_LOOP_EX_BEGIN();
if (ordered_bins[group] == nullptr) { bool is_groud_used = false;
// if not use ordered bin const int f_cnt = group_feature_cnt_[group];
feature_groups_[group]->bin_data_->ConstructHistogram( for (int j = 0; j < f_cnt; ++j) {
data_indices, const int fidx = group_feature_start_[group] + j;
num_data, if (is_feature_used[fidx]) {
ptr_ordered_grad, is_groud_used = true;
ptr_ordered_hess, break;
data_ptr); }
} else { }
// used ordered bin if (!is_groud_used) { continue; }
ordered_bins[group]->ConstructHistogram(leaf_idx, // feature is not used
gradients, auto data_ptr = hist_data + group_bin_boundaries_[group];
hessians, const int num_bin = feature_groups_[group]->num_total_bin_;
data_ptr); std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices,
num_data,
ptr_ordered_grad,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
data_ptr);
}
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
}
OMP_LOOP_EX_END();
} }
OMP_LOOP_EX_END(); OMP_THROW_EX();
} }
OMP_THROW_EX();
} else { } else {
OMP_INIT_EX(); if (!is_constant_hessian) {
#pragma omp parallel for schedule(static) OMP_INIT_EX();
for (int group = 0; group < num_groups_; ++group) { #pragma omp parallel for schedule(static)
OMP_LOOP_EX_BEGIN(); for (int group = 0; group < num_groups_; ++group) {
bool is_groud_used = false; OMP_LOOP_EX_BEGIN();
const int f_cnt = group_feature_cnt_[group]; bool is_groud_used = false;
for (int j = 0; j < f_cnt; ++j) { const int f_cnt = group_feature_cnt_[group];
const int fidx = group_feature_start_[group] + j; for (int j = 0; j < f_cnt; ++j) {
if (is_feature_used[fidx]) { const int fidx = group_feature_start_[group] + j;
is_groud_used = true; if (is_feature_used[fidx]) {
break; is_groud_used = true;
break;
}
} }
if (!is_groud_used) { continue; }
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
data_ptr);
}
OMP_LOOP_EX_END();
} }
if (!is_groud_used) { continue; } OMP_THROW_EX();
// feature is not used } else {
auto data_ptr = hist_data + group_bin_boundaries_[group]; OMP_INIT_EX();
const int num_bin = feature_groups_[group]->num_total_bin_; #pragma omp parallel for schedule(static)
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry)); for (int group = 0; group < num_groups_; ++group) {
// construct histograms for smaller leaf OMP_LOOP_EX_BEGIN();
if (ordered_bins[group] == nullptr) { bool is_groud_used = false;
// if not use ordered bin const int f_cnt = group_feature_cnt_[group];
feature_groups_[group]->bin_data_->ConstructHistogram( for (int j = 0; j < f_cnt; ++j) {
data_indices, const int fidx = group_feature_start_[group] + j;
num_data, if (is_feature_used[fidx]) {
ptr_ordered_grad, is_groud_used = true;
data_ptr); break;
} else { }
// used ordered bin }
ordered_bins[group]->ConstructHistogram(leaf_idx, if (!is_groud_used) { continue; }
gradients, // feature is not used
data_ptr); auto data_ptr = hist_data + group_bin_boundaries_[group];
} const int num_bin = feature_groups_[group]->num_total_bin_;
// fixed hessian. std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
for (int i = 0; i < num_bin; ++i) { // construct histograms for smaller leaf
data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0]; if (ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
num_data,
ptr_ordered_grad,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
data_ptr);
}
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
}
OMP_LOOP_EX_END();
} }
OMP_LOOP_EX_END(); OMP_THROW_EX();
} }
OMP_THROW_EX();
} }
} }
......
...@@ -66,122 +66,124 @@ public: ...@@ -66,122 +66,124 @@ public:
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster const data_size_t rest = num_data & 0x3;
if (data_indices != nullptr) { // if use part of data data_size_t i = 0;
const data_size_t rest = num_data & 0x3; for (; i < num_data - rest; i += 4) {
data_size_t i = 0; const VAL_T bin0 = data_[data_indices[i]];
for (; i < num_data - rest; i += 4) { const VAL_T bin1 = data_[data_indices[i + 1]];
const VAL_T bin0 = data_[data_indices[i]]; const VAL_T bin2 = data_[data_indices[i + 2]];
const VAL_T bin1 = data_[data_indices[i + 1]]; const VAL_T bin3 = data_[data_indices[i + 3]];
const VAL_T bin2 = data_[data_indices[i + 2]];
const VAL_T bin3 = data_[data_indices[i + 3]]; out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin0].sum_gradients += ordered_gradients[i]; out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin1].sum_gradients += ordered_gradients[i + 1]; out[bin3].sum_gradients += ordered_gradients[i + 3];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3]; out[bin0].sum_hessians += ordered_hessians[i];
out[bin1].sum_hessians += ordered_hessians[i + 1];
out[bin0].sum_hessians += ordered_hessians[i]; out[bin2].sum_hessians += ordered_hessians[i + 2];
out[bin1].sum_hessians += ordered_hessians[i + 1]; out[bin3].sum_hessians += ordered_hessians[i + 3];
out[bin2].sum_hessians += ordered_hessians[i + 2];
out[bin3].sum_hessians += ordered_hessians[i + 3]; ++out[bin0].cnt;
++out[bin1].cnt;
++out[bin0].cnt; ++out[bin2].cnt;
++out[bin1].cnt; ++out[bin3].cnt;
++out[bin2].cnt; }
++out[bin3].cnt; for (; i < num_data; ++i) {
} const VAL_T bin = data_[data_indices[i]];
for (; i < num_data; ++i) { out[bin].sum_gradients += ordered_gradients[i];
const VAL_T bin = data_[data_indices[i]]; out[bin].sum_hessians += ordered_hessians[i];
out[bin].sum_gradients += ordered_gradients[i]; ++out[bin].cnt;
out[bin].sum_hessians += ordered_hessians[i]; }
++out[bin].cnt; }
}
} else { // use full data void ConstructHistogram(data_size_t num_data,
const data_size_t rest = num_data & 0x3; const score_t* ordered_gradients, const score_t* ordered_hessians,
data_size_t i = 0; HistogramBinEntry* out) const override {
for (; i < num_data - rest; i += 4) { const data_size_t rest = num_data & 0x3;
const VAL_T bin0 = data_[i]; data_size_t i = 0;
const VAL_T bin1 = data_[i + 1]; for (; i < num_data - rest; i += 4) {
const VAL_T bin2 = data_[i + 2]; const VAL_T bin0 = data_[i];
const VAL_T bin3 = data_[i + 3]; const VAL_T bin1 = data_[i + 1];
const VAL_T bin2 = data_[i + 2];
out[bin0].sum_gradients += ordered_gradients[i]; const VAL_T bin3 = data_[i + 3];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2]; out[bin0].sum_gradients += ordered_gradients[i];
out[bin3].sum_gradients += ordered_gradients[i + 3]; out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin0].sum_hessians += ordered_hessians[i]; out[bin3].sum_gradients += ordered_gradients[i + 3];
out[bin1].sum_hessians += ordered_hessians[i + 1];
out[bin2].sum_hessians += ordered_hessians[i + 2]; out[bin0].sum_hessians += ordered_hessians[i];
out[bin3].sum_hessians += ordered_hessians[i + 3]; out[bin1].sum_hessians += ordered_hessians[i + 1];
out[bin2].sum_hessians += ordered_hessians[i + 2];
++out[bin0].cnt; out[bin3].sum_hessians += ordered_hessians[i + 3];
++out[bin1].cnt;
++out[bin2].cnt; ++out[bin0].cnt;
++out[bin3].cnt; ++out[bin1].cnt;
} ++out[bin2].cnt;
for (; i < num_data; ++i) { ++out[bin3].cnt;
const VAL_T bin = data_[i]; }
out[bin].sum_gradients += ordered_gradients[i]; for (; i < num_data; ++i) {
out[bin].sum_hessians += ordered_hessians[i]; const VAL_T bin = data_[i];
++out[bin].cnt; out[bin].sum_gradients += ordered_gradients[i];
} out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
} }
} }
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_gradients,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster const data_size_t rest = num_data & 0x3;
if (data_indices != nullptr) { // if use part of data data_size_t i = 0;
const data_size_t rest = num_data & 0x3; for (; i < num_data - rest; i += 4) {
data_size_t i = 0; const VAL_T bin0 = data_[data_indices[i]];
for (; i < num_data - rest; i += 4) { const VAL_T bin1 = data_[data_indices[i + 1]];
const VAL_T bin0 = data_[data_indices[i]]; const VAL_T bin2 = data_[data_indices[i + 2]];
const VAL_T bin1 = data_[data_indices[i + 1]]; const VAL_T bin3 = data_[data_indices[i + 3]];
const VAL_T bin2 = data_[data_indices[i + 2]];
const VAL_T bin3 = data_[data_indices[i + 3]]; out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin0].sum_gradients += ordered_gradients[i]; out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin1].sum_gradients += ordered_gradients[i + 1]; out[bin3].sum_gradients += ordered_gradients[i + 3];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3]; ++out[bin0].cnt;
++out[bin1].cnt;
++out[bin0].cnt; ++out[bin2].cnt;
++out[bin1].cnt; ++out[bin3].cnt;
++out[bin2].cnt; }
++out[bin3].cnt; for (; i < num_data; ++i) {
} const VAL_T bin = data_[data_indices[i]];
for (; i < num_data; ++i) { out[bin].sum_gradients += ordered_gradients[i];
const VAL_T bin = data_[data_indices[i]]; ++out[bin].cnt;
out[bin].sum_gradients += ordered_gradients[i]; }
++out[bin].cnt; }
}
} else { // use full data void ConstructHistogram(data_size_t num_data,
const data_size_t rest = num_data & 0x3; const score_t* ordered_gradients,
data_size_t i = 0; HistogramBinEntry* out) const override {
for (; i < num_data - rest; i += 4) { const data_size_t rest = num_data & 0x3;
const VAL_T bin0 = data_[i]; data_size_t i = 0;
const VAL_T bin1 = data_[i + 1]; for (; i < num_data - rest; i += 4) {
const VAL_T bin2 = data_[i + 2]; const VAL_T bin0 = data_[i];
const VAL_T bin3 = data_[i + 3]; const VAL_T bin1 = data_[i + 1];
const VAL_T bin2 = data_[i + 2];
out[bin0].sum_gradients += ordered_gradients[i]; const VAL_T bin3 = data_[i + 3];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2]; out[bin0].sum_gradients += ordered_gradients[i];
out[bin3].sum_gradients += ordered_gradients[i + 3]; out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
++out[bin0].cnt; out[bin3].sum_gradients += ordered_gradients[i + 3];
++out[bin1].cnt;
++out[bin2].cnt; ++out[bin0].cnt;
++out[bin3].cnt; ++out[bin1].cnt;
} ++out[bin2].cnt;
for (; i < num_data; ++i) { ++out[bin3].cnt;
const VAL_T bin = data_[i]; }
out[bin].sum_gradients += ordered_gradients[i]; for (; i < num_data; ++i) {
++out[bin].cnt; const VAL_T bin = data_[i];
} out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
} }
} }
......
...@@ -49,7 +49,7 @@ public: ...@@ -49,7 +49,7 @@ public:
void Push(int, data_size_t idx, uint32_t value) override { void Push(int, data_size_t idx, uint32_t value) override {
if (buf_.empty()) { if (buf_.empty()) {
#pragma omp critical #pragma omp critical
{ {
if (buf_.empty()) { if (buf_.empty()) {
int len = (num_data_ + 1) / 2; int len = (num_data_ + 1) / 2;
...@@ -80,152 +80,149 @@ public: ...@@ -80,152 +80,149 @@ public:
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
if (data_indices != nullptr) { // if use part of data
const data_size_t rest = num_data & 0x3; const data_size_t rest = num_data & 0x3;
data_size_t i = 0; data_size_t i = 0;
for (; i < num_data - rest; i += 4) { for (; i < num_data - rest; i += 4) {
data_size_t idx = data_indices[i]; data_size_t idx = data_indices[i];
const auto bin0 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin0 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 1]; idx = data_indices[i + 1];
const auto bin1 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin1 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 2]; idx = data_indices[i + 2];
const auto bin2 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin2 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 3]; idx = data_indices[i + 3];
const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3];
out[bin0].sum_gradients += ordered_gradients[i]; out[bin0].sum_hessians += ordered_hessians[i];
out[bin1].sum_gradients += ordered_gradients[i + 1]; out[bin1].sum_hessians += ordered_hessians[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2]; out[bin2].sum_hessians += ordered_hessians[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3]; out[bin3].sum_hessians += ordered_hessians[i + 3];
out[bin0].sum_hessians += ordered_hessians[i]; ++out[bin0].cnt;
out[bin1].sum_hessians += ordered_hessians[i + 1]; ++out[bin1].cnt;
out[bin2].sum_hessians += ordered_hessians[i + 2]; ++out[bin2].cnt;
out[bin3].sum_hessians += ordered_hessians[i + 3]; ++out[bin3].cnt;
++out[bin0].cnt; }
++out[bin1].cnt;
++out[bin2].cnt;
++out[bin3].cnt;
}
for (; i < num_data; ++i) { for (; i < num_data; ++i) {
const data_size_t idx = data_indices[i]; const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i]; out[bin].sum_gradients += ordered_gradients[i];
out[bin].sum_hessians += ordered_hessians[i]; out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt; ++out[bin].cnt;
} }
}
} else { // use full data void ConstructHistogram(data_size_t num_data,
const data_size_t rest = num_data & 0x3; const score_t* ordered_gradients, const score_t* ordered_hessians,
data_size_t i = 0; HistogramBinEntry* out) const override {
for (; i < num_data - rest; i += 4) { const data_size_t rest = num_data & 0x3;
int j = i >> 1; data_size_t i = 0;
const auto bin0 = (data_[j]) & 0xf; for (; i < num_data - rest; i += 4) {
const auto bin1 = (data_[j] >> 4) & 0xf; int j = i >> 1;
++j; const auto bin0 = (data_[j]) & 0xf;
const auto bin2 = (data_[j]) & 0xf; const auto bin1 = (data_[j] >> 4) & 0xf;
const auto bin3 = (data_[j] >> 4) & 0xf; ++j;
const auto bin2 = (data_[j]) & 0xf;
out[bin0].sum_gradients += ordered_gradients[i]; const auto bin3 = (data_[j] >> 4) & 0xf;
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2]; out[bin0].sum_gradients += ordered_gradients[i];
out[bin3].sum_gradients += ordered_gradients[i + 3]; out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin0].sum_hessians += ordered_hessians[i]; out[bin3].sum_gradients += ordered_gradients[i + 3];
out[bin1].sum_hessians += ordered_hessians[i + 1];
out[bin2].sum_hessians += ordered_hessians[i + 2]; out[bin0].sum_hessians += ordered_hessians[i];
out[bin3].sum_hessians += ordered_hessians[i + 3]; out[bin1].sum_hessians += ordered_hessians[i + 1];
out[bin2].sum_hessians += ordered_hessians[i + 2];
++out[bin0].cnt; out[bin3].sum_hessians += ordered_hessians[i + 3];
++out[bin1].cnt;
++out[bin2].cnt; ++out[bin0].cnt;
++out[bin3].cnt; ++out[bin1].cnt;
} ++out[bin2].cnt;
for (; i < num_data; ++i) { ++out[bin3].cnt;
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; }
out[bin].sum_gradients += ordered_gradients[i]; for (; i < num_data; ++i) {
out[bin].sum_hessians += ordered_hessians[i]; const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
++out[bin].cnt; out[bin].sum_gradients += ordered_gradients[i];
} out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
} }
} }
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_gradients,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
if (data_indices != nullptr) { // if use part of data const data_size_t rest = num_data & 0x3;
data_size_t i = 0;
const data_size_t rest = num_data & 0x3; for (; i < num_data - rest; i += 4) {
data_size_t i = 0; data_size_t idx = data_indices[i];
for (; i < num_data - rest; i += 4) { const auto bin0 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
data_size_t idx = data_indices[i]; idx = data_indices[i + 1];
const auto bin0 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin1 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 1]; idx = data_indices[i + 2];
const auto bin1 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin2 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 2]; idx = data_indices[i + 3];
const auto bin2 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 3]; out[bin0].sum_gradients += ordered_gradients[i];
const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3];
out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1]; ++out[bin0].cnt;
out[bin2].sum_gradients += ordered_gradients[i + 2]; ++out[bin1].cnt;
out[bin3].sum_gradients += ordered_gradients[i + 3]; ++out[bin2].cnt;
++out[bin3].cnt;
++out[bin0].cnt; }
++out[bin1].cnt;
++out[bin2].cnt;
++out[bin3].cnt;
}
for (; i < num_data; ++i) { for (; i < num_data; ++i) {
const data_size_t idx = data_indices[i]; const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin].sum_gradients += ordered_gradients[i]; out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt; ++out[bin].cnt;
} }
}
} else { // use full data void ConstructHistogram(data_size_t num_data,
const data_size_t rest = num_data & 0x3; const score_t* ordered_gradients,
data_size_t i = 0; HistogramBinEntry* out) const override {
for (; i < num_data - rest; i += 4) { const data_size_t rest = num_data & 0x3;
int j = i >> 1; data_size_t i = 0;
const auto bin0 = (data_[j]) & 0xf; for (; i < num_data - rest; i += 4) {
const auto bin1 = (data_[j] >> 4) & 0xf; int j = i >> 1;
++j; const auto bin0 = (data_[j]) & 0xf;
const auto bin2 = (data_[j]) & 0xf; const auto bin1 = (data_[j] >> 4) & 0xf;
const auto bin3 = (data_[j] >> 4) & 0xf; ++j;
const auto bin2 = (data_[j]) & 0xf;
out[bin0].sum_gradients += ordered_gradients[i]; const auto bin3 = (data_[j] >> 4) & 0xf;
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2]; out[bin0].sum_gradients += ordered_gradients[i];
out[bin3].sum_gradients += ordered_gradients[i + 3]; out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
++out[bin0].cnt; out[bin3].sum_gradients += ordered_gradients[i + 3];
++out[bin1].cnt;
++out[bin2].cnt; ++out[bin0].cnt;
++out[bin3].cnt; ++out[bin1].cnt;
} ++out[bin2].cnt;
for (; i < num_data; ++i) { ++out[bin3].cnt;
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; }
out[bin].sum_gradients += ordered_gradients[i]; for (; i < num_data; ++i) {
++out[bin].cnt; const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
} out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt;
} }
} }
......
...@@ -104,12 +104,24 @@ public: ...@@ -104,12 +104,24 @@ public:
Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
} }
// Overload without a data_indices array, taking gradients and hessians.
// All parameters are unnamed because none of them is used: the body only
// calls Log::Fatal with a message pointing callers at OrderedSparseBin.
void ConstructHistogram(data_size_t, const score_t*,
const score_t*, HistogramBinEntry*) const override {
// Will use OrderedSparseBin->ConstructHistogram() instead
Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
}
void ConstructHistogram(const data_size_t*, data_size_t, const score_t*, void ConstructHistogram(const data_size_t*, data_size_t, const score_t*,
HistogramBinEntry*) const override { HistogramBinEntry*) const override {
// Will use OrderedSparseBin->ConstructHistogram() instead // Will use OrderedSparseBin->ConstructHistogram() instead
Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
} }
// Gradient-only overload without a data_indices array.
// All parameters are unnamed because none of them is used: the body only
// calls Log::Fatal with a message pointing callers at OrderedSparseBin.
void ConstructHistogram(data_size_t, const score_t*,
HistogramBinEntry*) const override {
// Will use OrderedSparseBin->ConstructHistogram() instead
Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
}
inline bool NextNonzero(data_size_t* i_delta, inline bool NextNonzero(data_size_t* i_delta,
data_size_t* cur_pos) const { data_size_t* cur_pos) const {
++(*i_delta); ++(*i_delta);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment