Commit 98c7c2a3 authored by Guolin Ke, committed by GitHub

faster histogram sum up (#418)

* some refactor.

* two stage sum up to reduce sum up error.

* add more two-stage sumup.

* some refactor.

* add alignment.

* change name to aligned_allocator.

* remove some useless sumup.

* fix a warning.

* add -march=native .

* remove the padding of gradients.

* no alignment.

* fix test.

* change KNumSumupGroup to 32768.

* change gcc flags.
parent b0017e5b
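
Note on the "two stage sum up" above: with the histogram fields stored as float, adding millions of per-row gradients into a single float accumulator loses precision once the running sum dwarfs each addend. Summing fixed-size groups first and folding the group totals afterwards keeps the rounding error bounded. A minimal standalone sketch of the effect (not LightGBM code; the group size of 32768 mirrors the KNumSumupGroup constant added in this commit):

#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t n = std::size_t(1) << 25;   // 33,554,432 addends of 0.1f
  const float v = 0.1f;
  float naive = 0.0f;
  for (std::size_t i = 0; i < n; ++i) naive += v;       // one-stage: stalls near 2^21
  float grouped = 0.0f;
  for (std::size_t i = 0; i < n; i += 32768) {          // two-stage: group partial sums
    float part = 0.0f;
    for (std::size_t j = 0; j < 32768; ++j) part += v;
    grouped += part;
  }
  std::printf("exact=%.1f naive=%.1f grouped=%.1f\n",
              static_cast<double>(n) * v, naive, grouped);
  return 0;
}
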
@@ -36,19 +36,19 @@ install:
 script:
   - cd $TRAVIS_BUILD_DIR
-  - mkdir build && cd build && cmake .. && make -j
+  - mkdir build && cd build && cmake .. && make
   - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
   - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
   - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
   - cd $TRAVIS_BUILD_DIR && pep8 --ignore=E501 --exclude=./compute .
-  - rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j
+  - rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make
   - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
   - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
   - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_engine.py && python test_sklearn.py && python test_plotting.py
   - cd $TRAVIS_BUILD_DIR
   - rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=ON -DBOOST_ROOT="$HOME/miniconda/" -DOpenCL_INCLUDE_DIR=$AMDAPPSDK/include/ ..
   - sed -i 's/std::string device_type = "cpu";/std::string device_type = "gpu";/' ../include/LightGBM/config.h
-  - make -j$(nproc)
+  - make
   - sed -i 's/std::string device_type = "gpu";/std::string device_type = "cpu";/' ../include/LightGBM/config.h
   - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
   - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
......
@@ -47,7 +47,7 @@ if(USE_GPU)
 endif(USE_GPU)
 if(UNIX OR MINGW OR CYGWIN)
-  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O3 -Wall -std=c++11 -Wno-ignored-attributes")
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O3 -Wall -std=c++11 -Wno-ignored-attributes -march=core2 -mtune=native")
 endif()
 if(MSVC)
......
@@ -22,9 +22,9 @@ enum BinType {
 struct HistogramBinEntry {
 public:
   /*! \brief Sum of gradients on this bin */
-  double sum_gradients = 0.0f;
+  float sum_gradients = 0.0f;
   /*! \brief Sum of hessians on this bin */
-  double sum_hessians = 0.0f;
+  float sum_hessians = 0.0f;
   /*! \brief Number of data on this bin */
   data_size_t cnt = 0;
   /*!
@@ -221,10 +221,11 @@ public:
   * \param leaf Using which leaf's data to construct
   * \param gradients Gradients, Note:non-oredered by leaf
   * \param hessians Hessians, Note:non-oredered by leaf
+  * \param num_bin The number of bins
   * \param out Output Result
   */
-  virtual void ConstructHistogram(int leaf, const score_t* gradients,
-    const score_t* hessians, HistogramBinEntry* out) const = 0;
+  virtual void ConstructHistogram(int leaf, const float* gradients,
+    const float* hessians, int num_bin, HistogramBinEntry* out) const = 0;
   /*!
   * \brief Construct histogram by using this bin
@@ -232,9 +233,10 @@ public:
   * Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
   * \param leaf Using which leaf's data to construct
   * \param gradients Gradients, Note:non-oredered by leaf
+  * \param num_bin The number of bins
   * \param out Output Result
   */
-  virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
+  virtual void ConstructHistogram(int leaf, const float* gradients, int num_bin, HistogramBinEntry* out) const = 0;
   /*!
   * \brief Split current bin, and perform re-order by leaf
@@ -326,11 +328,16 @@ public:
   * \param num_data Number of used data
   * \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
   * \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
+  * \param num_bin The number of bins
   * \param out Output Result
   */
   virtual void ConstructHistogram(
     const data_size_t* data_indices, data_size_t num_data,
-    const score_t* ordered_gradients, const score_t* ordered_hessians,
+    const float* ordered_gradients, const float* ordered_hessians, int num_bin,
+    HistogramBinEntry* out) const = 0;
+  virtual void ConstructHistogram(data_size_t num_data,
+    const float* ordered_gradients, const float* ordered_hessians, int num_bin,
     HistogramBinEntry* out) const = 0;
   /*!
@@ -343,10 +350,14 @@ public:
   * \param data_indices Used data indices in current leaf
   * \param num_data Number of used data
   * \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
+  * \param num_bin The number of bins
   * \param out Output Result
   */
   virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
-    const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
+    const float* ordered_gradients, int num_bin, HistogramBinEntry* out) const = 0;
+  virtual void ConstructHistogram(data_size_t num_data,
+    const float* ordered_gradients, int num_bin, HistogramBinEntry* out) const = 0;
   /*!
   * \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
@@ -432,6 +443,75 @@ inline uint32_t BinMapper::ValueToBin(double value) const {
   }
 }
#define AddGradientPtrToHistogram(data, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7, \
gptr) { \
data[bin0].sum_gradients += *(gptr + 0);\
data[bin1].sum_gradients += *(gptr + 1);\
data[bin2].sum_gradients += *(gptr + 2);\
data[bin3].sum_gradients += *(gptr + 3);\
data[bin4].sum_gradients += *(gptr + 4);\
data[bin5].sum_gradients += *(gptr + 5);\
data[bin6].sum_gradients += *(gptr + 6);\
data[bin7].sum_gradients += *(gptr + 7);\
}
#define AddGradientToHistogram(data, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7, \
g0, g1, g2, g3, g4, g5, g6, g7) { \
data[bin0].sum_gradients += (g0);\
data[bin1].sum_gradients += (g1);\
data[bin2].sum_gradients += (g2);\
data[bin3].sum_gradients += (g3);\
data[bin4].sum_gradients += (g4);\
data[bin5].sum_gradients += (g5);\
data[bin6].sum_gradients += (g6);\
data[bin7].sum_gradients += (g7);\
}
#define AddHessianPtrToHistogram(data, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7, \
hptr) { \
data[bin0].sum_hessians += *(hptr + 0);\
data[bin1].sum_hessians += *(hptr + 1);\
data[bin2].sum_hessians += *(hptr + 2);\
data[bin3].sum_hessians += *(hptr + 3);\
data[bin4].sum_hessians += *(hptr + 4);\
data[bin5].sum_hessians += *(hptr + 5);\
data[bin6].sum_hessians += *(hptr + 6);\
data[bin7].sum_hessians += *(hptr + 7);\
}
#define AddHessianToHistogram(data, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7, \
h0, h1, h2, h3, h4, h5, h6, h7) { \
data[bin0].sum_hessians += (h0);\
data[bin1].sum_hessians += (h1);\
data[bin2].sum_hessians += (h2);\
data[bin3].sum_hessians += (h3);\
data[bin4].sum_hessians += (h4);\
data[bin5].sum_hessians += (h5);\
data[bin6].sum_hessians += (h6);\
data[bin7].sum_hessians += (h7);\
}
#define AddCountToHistogram(data, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7) { \
++data[bin0].cnt;\
++data[bin1].cnt;\
++data[bin2].cnt;\
++data[bin3].cnt;\
++data[bin4].cnt;\
++data[bin5].cnt;\
++data[bin6].cnt;\
++data[bin7].cnt;\
}
struct TmpGradCntPair {
public:
float sum_gradients = 0.0f;
data_size_t cnt = 0;
};
#define KNumSumupGroup (32768)
#define KNumSumupGroupMask (32767)
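
The constants above drive the grouped accumulation in the bin classes further down: because KNumSumupGroup is a power of two, num_data & KNumSumupGroupMask is the remainder of num_data modulo 32768, and num_data & 0x7 is the tail of the 8-way unroll. A hedged sketch of how the macros and constants fit together (the loop skeleton mirrors the diff below, simplified to gradients and counts only; GroupedSumup is an illustrative name and the code assumes the surrounding bin.h definitions):

inline void GroupedSumup(const uint8_t* bins, const float* grads,
                         data_size_t num_data, int num_bin,
                         HistogramBinEntry* out) {
  const data_size_t group_rest = num_data & KNumSumupGroupMask;
  const data_size_t rest = num_data & 0x7;
  data_size_t i = 0;
  for (; i < num_data - group_rest;) {
    // stage 1: accumulate one group of 32768 rows into a short-lived buffer
    std::vector<HistogramBinEntry> tmp(num_bin);
    for (data_size_t j = 0; j < KNumSumupGroup; j += 8, i += 8) {
      AddGradientPtrToHistogram(tmp.data(), bins[i], bins[i + 1], bins[i + 2], bins[i + 3],
                                bins[i + 4], bins[i + 5], bins[i + 6], bins[i + 7], grads + i);
      AddCountToHistogram(tmp.data(), bins[i], bins[i + 1], bins[i + 2], bins[i + 3],
                          bins[i + 4], bins[i + 5], bins[i + 6], bins[i + 7]);
    }
    // stage 2: fold the group totals into the output histogram
    for (int b = 0; b < num_bin; ++b) {
      out[b].sum_gradients += tmp[b].sum_gradients;
      out[b].cnt += tmp[b].cnt;
    }
  }
  // rows that do not fill a whole group: add straight into out, 8 at a time
  for (; i < num_data - rest; i += 8) {
    AddGradientPtrToHistogram(out, bins[i], bins[i + 1], bins[i + 2], bins[i + 3],
                              bins[i + 4], bins[i + 5], bins[i + 6], bins[i + 7], grads + i);
    AddCountToHistogram(out, bins[i], bins[i + 1], bins[i + 2], bins[i + 3],
                        bins[i + 4], bins[i + 5], bins[i + 6], bins[i + 7]);
  }
  for (; i < num_data; ++i) {   // scalar tail
    out[bins[i]].sum_gradients += grads[i];
    ++out[bins[i]].cnt;
  }
}
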
} // namespace LightGBM
#endif // LightGBM_BIN_H_
@@ -66,7 +66,7 @@ public:
   * \param is_eval true if need evaluation or early stop
   * \return True if meet early stopping or cannot boosting
   */
-  virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) = 0;
+  virtual bool TrainOneIter(const float* gradient, const float* hessian, bool is_eval) = 0;
   /*!
   * \brief Rollback one iteration
......
@@ -393,8 +393,8 @@ public:
     const data_size_t* data_indices, data_size_t num_data,
     int leaf_idx,
     std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
-    const score_t* gradients, const score_t* hessians,
-    score_t* ordered_gradients, score_t* ordered_hessians,
+    const float* gradients, const float* hessians,
+    float* ordered_gradients, float* ordered_hessians,
     bool is_constant_hessian,
     HistogramBinEntry* histogram_data) const;
......
@@ -7,17 +7,28 @@
 #include <vector>
 #include <functional>
 #include <memory>
+#include <cstdlib>
+#if defined(_WIN32)
+#include <malloc.h>
+#else
+#include <mm_malloc.h>
+#endif // (_WIN32)
 namespace LightGBM {
 /*! \brief Type of data size, it is better to use signed type*/
 typedef int32_t data_size_t;
-/*! \brief Type of score, and gradients */
-typedef float score_t;
-const score_t kMinScore = -std::numeric_limits<score_t>::infinity();
-const score_t kEpsilon = 1e-15f;
+const float kMinScore = -std::numeric_limits<float>::infinity();
+const float kEpsilon = 1e-15f;
 using ReduceFunction = std::function<void(const char*, char*, int)>;
......
@@ -29,7 +29,7 @@ public:
   * \hessians Output hessians
   */
   virtual void GetGradients(const double* score,
-    score_t* gradients, score_t* hessians) const = 0;
+    float* gradients, float* hessians) const = 0;
   virtual const char* GetName() const = 0;
......
@@ -43,12 +43,12 @@ public:
   * \param is_constant_hessian True if all hessians share the same value
   * \return A trained tree
   */
-  virtual Tree* Train(const score_t* gradients, const score_t* hessians, bool is_constant_hessian) = 0;
+  virtual Tree* Train(const float* gradients, const float* hessians, bool is_constant_hessian) = 0;
   /*!
   * \brief use a existing tree to fit the new gradients and hessians.
   */
-  virtual Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const = 0;
+  virtual Tree* FitByExistingTree(const Tree* old_tree, const float* gradients, const float* hessians) const = 0;
   /*!
   * \brief Set bagging data
......
@@ -46,7 +46,7 @@ public:
   /*!
   * \brief one training iteration
   */
-  bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override {
+  bool TrainOneIter(const float* gradient, const float* hessian, bool is_eval) override {
     is_update_score_cur_iter_ = false;
     GBDT::TrainOneIter(gradient, hessian, false);
     // normalize
......
@@ -115,11 +115,9 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
   }
   num_data_ = train_data->num_data();
   // create buffer for gradients and hessians
-  if (objective_function_ != nullptr) {
-    size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
-    gradients_.resize(total_size);
-    hessians_.resize(total_size);
-  }
+  size_t total_gradient_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
+  gradients_.resize(total_gradient_size);
+  hessians_.resize(total_gradient_size);
   // get max feature index
   max_feature_idx_ = train_data->num_total_features() - 1;
   // get label index
@@ -329,7 +327,7 @@ void GBDT::UpdateScoreOutOfBag(const Tree* tree, const int cur_tree_id) {
   #endif
 }
-bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) {
+bool GBDT::TrainOneIter(const float* gradient, const float* hessian, bool is_eval) {
   // boosting from average prediction. It doesn't work well for classification, remove it for now.
   if (models_.empty()
     && gbdt_config_->boost_from_average
@@ -359,11 +357,18 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
     auto start_time = std::chrono::steady_clock::now();
     #endif
     Boosting();
-    gradient = gradients_.data();
-    hessian = hessians_.data();
     #ifdef TIMETAG
     boosting_time += std::chrono::steady_clock::now() - start_time;
     #endif
+  } else {
+    for (int k = 0; k < num_tree_per_iteration_; ++k) {
+      const size_t bias = static_cast<size_t>(k) * num_data_;
+      #pragma omp parallel for schedule(static)
+      for (int i = 0; i < num_data_; ++i) {
+        gradients_[bias + i] = gradient[bias + i];
+        hessians_[bias + i] = hessian[bias + i];
+      }
+    }
   }
   #ifdef TIMETAG
   auto start_time = std::chrono::steady_clock::now();
@@ -377,22 +382,15 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
   #ifdef TIMETAG
   start_time = std::chrono::steady_clock::now();
   #endif
-  if (gradients_.empty()) {
-    size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
-    gradients_.resize(total_size);
-    hessians_.resize(total_size);
-  }
   // get sub gradients
   for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
-    auto bias = cur_tree_id * num_data_;
+    size_t bias = static_cast<size_t>(cur_tree_id)* num_data_;
     // cannot multi-threading here.
     for (int i = 0; i < bag_data_cnt_; ++i) {
-      gradients_[bias + i] = gradient[bias + bag_data_indices_[i]];
-      hessians_[bias + i] = hessian[bias + bag_data_indices_[i]];
+      gradients_[bias + i] = gradients_[bias + bag_data_indices_[i]];
+      hessians_[bias + i] = hessians_[bias + bag_data_indices_[i]];
     }
   }
-  gradient = gradients_.data();
-  hessian = hessians_.data();
   #ifdef TIMETAG
   sub_gradient_time += std::chrono::steady_clock::now() - start_time;
   #endif
@@ -403,9 +401,11 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
   start_time = std::chrono::steady_clock::now();
   #endif
   std::unique_ptr<Tree> new_tree(new Tree(2));
+  size_t gbias = static_cast<size_t>(cur_tree_id) * num_data_;
   if (class_need_train_[cur_tree_id]) {
     new_tree.reset(
-      tree_learner_->Train(gradient + cur_tree_id * num_data_, hessian + cur_tree_id * num_data_, is_constant_hessian_));
+      tree_learner_->Train(gradients_.data() + gbias,
+        hessians_.data() + gbias, is_constant_hessian_));
   }
   #ifdef TIMETAG
   tree_time += std::chrono::steady_clock::now() - start_time;
......
@@ -82,7 +82,7 @@ public:
   * \param is_eval true if need evaluation or early stop
   * \return True if meet early stopping or cannot boosting
   */
-  virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override;
+  virtual bool TrainOneIter(const float* gradient, const float* hessian, bool is_eval) override;
   /*!
   * \brief Rollback one iteration
@@ -302,9 +302,9 @@ protected:
   /*! \brief Max feature index of training data*/
   int max_feature_idx_;
   /*! \brief First order derivative of training data */
-  std::vector<score_t> gradients_;
+  std::vector<float> gradients_;
   /*! \brief Secend order derivative of training data */
-  std::vector<score_t> hessians_;
+  std::vector<float> hessians_;
   /*! \brief Store the indices of in-bag data */
   std::vector<data_size_t> bag_data_indices_;
   /*! \brief Number of in-bag data */
......
@@ -77,27 +77,27 @@ public:
   }
   data_size_t BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer, data_size_t* buffer_right) {
-    std::vector<score_t> tmp_gradients(cnt, 0.0f);
+    std::vector<float> tmp_gradients(cnt, 0.0f);
     for (data_size_t i = 0; i < cnt; ++i) {
       for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
-        int idx = cur_tree_id * num_data_ + start + i;
+        size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
         tmp_gradients[i] += std::fabs(gradients_[idx] * hessians_[idx]);
       }
     }
     data_size_t top_k = static_cast<data_size_t>(cnt * gbdt_config_->top_rate);
     data_size_t other_k = static_cast<data_size_t>(cnt * gbdt_config_->other_rate);
     top_k = std::max(1, top_k);
-    ArrayArgs<score_t>::ArgMaxAtK(&tmp_gradients, 0, static_cast<int>(tmp_gradients.size()), top_k);
-    score_t threshold = tmp_gradients[top_k - 1];
-    score_t multiply = static_cast<score_t>(cnt - top_k) / other_k;
+    ArrayArgs<float>::ArgMaxAtK(&tmp_gradients, 0, static_cast<int>(tmp_gradients.size()), top_k);
+    float threshold = tmp_gradients[top_k - 1];
+    float multiply = static_cast<float>(cnt - top_k) / other_k;
     data_size_t cur_left_cnt = 0;
     data_size_t cur_right_cnt = 0;
     data_size_t big_weight_cnt = 0;
     for (data_size_t i = 0; i < cnt; ++i) {
-      score_t grad = 0.0f;
+      float grad = 0.0f;
       for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
-        int idx = cur_tree_id * num_data_ + start + i;
+        size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
         grad += std::fabs(gradients_[idx] * hessians_[idx]);
       }
       if (grad >= threshold) {
@@ -111,7 +111,7 @@ public:
       if (cur_rand.NextFloat() < prob) {
         buffer[cur_left_cnt++] = start + i;
         for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
-          int idx = cur_tree_id * num_data_ + start + i;
+          size_t idx = static_cast<size_t>(cur_tree_id) * num_data_ + start + i;
           gradients_[idx] *= multiply;
           hessians_[idx] *= multiply;
         }
......
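
Aside on the index-type change in BaggingHelper above: with hundreds of millions of rows and several trees per iteration, cur_tree_id * num_data_ no longer fits in a 32-bit int, which is why idx is now computed as size_t. A small illustrative sketch (the row counts are made up for demonstration):

#include <cstdint>
#include <cstdio>

int main() {
  const int32_t num_data = 500000000;     // hypothetical 500M rows
  const int32_t cur_tree_id = 9;          // e.g. a 10-class model
  const size_t wide = static_cast<size_t>(cur_tree_id) * num_data;   // 4,500,000,000: exact
  const uint32_t wrapped =
      static_cast<uint32_t>(cur_tree_id) * static_cast<uint32_t>(num_data);  // wraps past 2^32
  std::printf("wide=%zu wrapped-to-32-bit=%u\n", wide, wrapped);
  return 0;
}
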
@@ -410,8 +410,8 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data, const data_size_t* data_indices, data_size_t num_data,
int leaf_idx, int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins, std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians, const float* gradients, const float* hessians,
score_t* ordered_gradients, score_t* ordered_hessians, float* ordered_gradients, float* ordered_hessians,
bool is_constant_hessian, bool is_constant_hessian,
HistogramBinEntry* hist_data) const { HistogramBinEntry* hist_data) const {
@@ -436,6 +436,7 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
ptr_ordered_grad = ordered_gradients; ptr_ordered_grad = ordered_gradients;
ptr_ordered_hess = ordered_hessians; ptr_ordered_hess = ordered_hessians;
} }
if (data_indices != nullptr) {
if (!is_constant_hessian) { if (!is_constant_hessian) {
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
@@ -463,12 +464,14 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
num_data, num_data,
ptr_ordered_grad, ptr_ordered_grad,
ptr_ordered_hess, ptr_ordered_hess,
num_bin,
data_ptr); data_ptr);
} else { } else {
// used ordered bin // used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx, ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients, gradients,
hessians, hessians,
num_bin,
data_ptr); data_ptr);
} }
OMP_LOOP_EX_END(); OMP_LOOP_EX_END();
@@ -500,11 +503,13 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
data_indices, data_indices,
num_data, num_data,
ptr_ordered_grad, ptr_ordered_grad,
num_bin,
data_ptr); data_ptr);
} else { } else {
// used ordered bin // used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx, ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients, gradients,
num_bin,
data_ptr); data_ptr);
} }
// fixed hessian. // fixed hessian.
@@ -515,6 +520,89 @@ void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
} }
OMP_THROW_EX(); OMP_THROW_EX();
} }
} else {
if (!is_constant_hessian) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN();
bool is_groud_used = false;
const int f_cnt = group_feature_cnt_[group];
for (int j = 0; j < f_cnt; ++j) {
const int fidx = group_feature_start_[group] + j;
if (is_feature_used[fidx]) {
is_groud_used = true;
break;
}
}
if (!is_groud_used) { continue; }
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
num_data,
ptr_ordered_grad,
ptr_ordered_hess,
num_bin,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
hessians,
num_bin,
data_ptr);
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN();
bool is_groud_used = false;
const int f_cnt = group_feature_cnt_[group];
for (int j = 0; j < f_cnt; ++j) {
const int fidx = group_feature_start_[group] + j;
if (is_feature_used[fidx]) {
is_groud_used = true;
break;
}
}
if (!is_groud_used) { continue; }
// feature is not used
auto data_ptr = hist_data + group_bin_boundaries_[group];
const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(data_ptr + 1, 0, (num_bin - 1) * sizeof(HistogramBinEntry));
// construct histograms for smaller leaf
if (ordered_bins[group] == nullptr) {
// if not use ordered bin
feature_groups_[group]->bin_data_->ConstructHistogram(
num_data,
ptr_ordered_grad,
num_bin,
data_ptr);
} else {
// used ordered bin
ordered_bins[group]->ConstructHistogram(leaf_idx,
gradients,
num_bin,
data_ptr);
}
// fixed hessian.
for (int i = 0; i < num_bin; ++i) {
data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
}
}
} }
 void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
@@ -525,16 +613,19 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hess
   const int default_bin = bin_mapper->GetDefaultBin();
   if (default_bin > 0) {
     const int num_bin = bin_mapper->num_bin();
-    data[default_bin].sum_gradients = sum_gradient;
-    data[default_bin].sum_hessians = sum_hessian;
-    data[default_bin].cnt = num_data;
+    double sg = sum_gradient;
+    double sh = sum_hessian;
+    data_size_t cnt = num_data;
     for (int i = 0; i < num_bin; ++i) {
       if (i != default_bin) {
-        data[default_bin].sum_gradients -= data[i].sum_gradients;
-        data[default_bin].sum_hessians -= data[i].sum_hessians;
-        data[default_bin].cnt -= data[i].cnt;
+        sg -= data[i].sum_gradients;
+        sh -= data[i].sum_hessians;
+        cnt -= data[i].cnt;
       }
     }
+    data[default_bin].sum_gradients = static_cast<float>(sg);
+    data[default_bin].sum_hessians = static_cast<float>(sh);
+    data[default_bin].cnt = cnt;
   }
 }
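
Because the histogram entries are now float, the rewritten FixHistogram above keeps the running subtraction in double locals and narrows only once at the end. A self-contained sketch of that pattern (types simplified; not the repository's exact code):

struct Entry { float sum_gradients = 0.0f; float sum_hessians = 0.0f; int cnt = 0; };

void FillDefaultBin(Entry* data, int num_bin, int default_bin,
                    double sum_gradient, double sum_hessian, int num_data) {
  double sg = sum_gradient;
  double sh = sum_hessian;
  int cnt = num_data;
  for (int i = 0; i < num_bin; ++i) {
    if (i == default_bin) continue;
    sg -= data[i].sum_gradients;   // subtract in double, not float
    sh -= data[i].sum_hessians;
    cnt -= data[i].cnt;
  }
  data[default_bin].sum_gradients = static_cast<float>(sg);  // single narrowing cast
  data[default_bin].sum_hessians = static_cast<float>(sh);
  data[default_bin].cnt = cnt;
}
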
......
@@ -64,32 +64,51 @@ public:
BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override; BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians, const float* ordered_gradients, const float* ordered_hessians, int num_bin,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster const data_size_t group_rest = num_data & KNumSumupGroupMask;
if (data_indices != nullptr) { // if use part of data const data_size_t rest = num_data & 0x7;
const data_size_t rest = num_data & 0x3;
data_size_t i = 0; data_size_t i = 0;
for (; i < num_data - rest; i += 4) { for (; i < num_data - group_rest;) {
std::vector<HistogramBinEntry> tmp_sumup_buf(num_bin);
for (data_size_t j = 0; j < KNumSumupGroup; j += 8, i += 8) {
const VAL_T bin0 = data_[data_indices[i]]; const VAL_T bin0 = data_[data_indices[i]];
const VAL_T bin1 = data_[data_indices[i + 1]]; const VAL_T bin1 = data_[data_indices[i + 1]];
const VAL_T bin2 = data_[data_indices[i + 2]]; const VAL_T bin2 = data_[data_indices[i + 2]];
const VAL_T bin3 = data_[data_indices[i + 3]]; const VAL_T bin3 = data_[data_indices[i + 3]];
const VAL_T bin4 = data_[data_indices[i + 4]];
const VAL_T bin5 = data_[data_indices[i + 5]];
const VAL_T bin6 = data_[data_indices[i + 6]];
const VAL_T bin7 = data_[data_indices[i + 7]];
out[bin0].sum_gradients += ordered_gradients[i]; AddGradientPtrToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
out[bin1].sum_gradients += ordered_gradients[i + 1]; ordered_gradients + i);
out[bin2].sum_gradients += ordered_gradients[i + 2]; AddHessianPtrToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
out[bin3].sum_gradients += ordered_gradients[i + 3]; ordered_hessians + i);
AddCountToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
}
for (int j = 0; j < num_bin; ++j) {
out[j].sum_gradients += tmp_sumup_buf[j].sum_gradients;
out[j].sum_hessians += tmp_sumup_buf[j].sum_hessians;
out[j].cnt += tmp_sumup_buf[j].cnt;
}
}
out[bin0].sum_hessians += ordered_hessians[i]; for (; i < num_data - rest; i += 8) {
out[bin1].sum_hessians += ordered_hessians[i + 1]; const VAL_T bin0 = data_[data_indices[i]];
out[bin2].sum_hessians += ordered_hessians[i + 2]; const VAL_T bin1 = data_[data_indices[i + 1]];
out[bin3].sum_hessians += ordered_hessians[i + 3]; const VAL_T bin2 = data_[data_indices[i + 2]];
const VAL_T bin3 = data_[data_indices[i + 3]];
const VAL_T bin4 = data_[data_indices[i + 4]];
const VAL_T bin5 = data_[data_indices[i + 5]];
const VAL_T bin6 = data_[data_indices[i + 6]];
const VAL_T bin7 = data_[data_indices[i + 7]];
++out[bin0].cnt; AddGradientPtrToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
++out[bin1].cnt; ordered_gradients + i);
++out[bin2].cnt; AddHessianPtrToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
++out[bin3].cnt; ordered_hessians + i);
AddCountToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
} }
for (; i < num_data; ++i) { for (; i < num_data; ++i) {
const VAL_T bin = data_[data_indices[i]]; const VAL_T bin = data_[data_indices[i]];
@@ -97,29 +116,54 @@ public:
out[bin].sum_hessians += ordered_hessians[i]; out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt; ++out[bin].cnt;
} }
} else { // use full data }
const data_size_t rest = num_data & 0x3;
void ConstructHistogram(data_size_t num_data,
const float* ordered_gradients, const float* ordered_hessians, int num_bin,
HistogramBinEntry* out) const override {
const data_size_t group_rest = num_data & KNumSumupGroupMask;
const data_size_t rest = num_data & 0x7;
data_size_t i = 0; data_size_t i = 0;
for (; i < num_data - rest; i += 4) { for (; i < num_data - group_rest;) {
std::vector<HistogramBinEntry> tmp_sumup_buf(num_bin);
for (data_size_t j = 0; j < KNumSumupGroup; j += 8, i += 8) {
const VAL_T bin0 = data_[i]; const VAL_T bin0 = data_[i];
const VAL_T bin1 = data_[i + 1]; const VAL_T bin1 = data_[i + 1];
const VAL_T bin2 = data_[i + 2]; const VAL_T bin2 = data_[i + 2];
const VAL_T bin3 = data_[i + 3]; const VAL_T bin3 = data_[i + 3];
const VAL_T bin4 = data_[i + 4];
const VAL_T bin5 = data_[i + 5];
const VAL_T bin6 = data_[i + 6];
const VAL_T bin7 = data_[i + 7];
out[bin0].sum_gradients += ordered_gradients[i]; AddGradientPtrToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
out[bin1].sum_gradients += ordered_gradients[i + 1]; ordered_gradients + i);
out[bin2].sum_gradients += ordered_gradients[i + 2]; AddHessianPtrToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
out[bin3].sum_gradients += ordered_gradients[i + 3]; ordered_hessians + i);
AddCountToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
}
for (int j = 0; j < num_bin; ++j) {
out[j].sum_gradients += tmp_sumup_buf[j].sum_gradients;
out[j].sum_hessians += tmp_sumup_buf[j].sum_hessians;
out[j].cnt += tmp_sumup_buf[j].cnt;
}
}
out[bin0].sum_hessians += ordered_hessians[i]; for (; i < num_data - rest; i += 8) {
out[bin1].sum_hessians += ordered_hessians[i + 1]; const VAL_T bin0 = data_[i];
out[bin2].sum_hessians += ordered_hessians[i + 2]; const VAL_T bin1 = data_[i + 1];
out[bin3].sum_hessians += ordered_hessians[i + 3]; const VAL_T bin2 = data_[i + 2];
const VAL_T bin3 = data_[i + 3];
const VAL_T bin4 = data_[i + 4];
const VAL_T bin5 = data_[i + 5];
const VAL_T bin6 = data_[i + 6];
const VAL_T bin7 = data_[i + 7];
++out[bin0].cnt; AddGradientPtrToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
++out[bin1].cnt; ordered_gradients + i);
++out[bin2].cnt; AddHessianPtrToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
++out[bin3].cnt; ordered_hessians + i);
AddCountToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
} }
for (; i < num_data; ++i) { for (; i < num_data; ++i) {
const VAL_T bin = data_[i]; const VAL_T bin = data_[i];
@@ -128,54 +172,97 @@ public:
++out[bin].cnt; ++out[bin].cnt;
} }
} }
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const float* ordered_gradients, int num_bin,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster const data_size_t group_rest = num_data & KNumSumupGroupMask;
if (data_indices != nullptr) { // if use part of data const data_size_t rest = num_data & 0x7;
const data_size_t rest = num_data & 0x3;
data_size_t i = 0; data_size_t i = 0;
for (; i < num_data - rest; i += 4) { for (; i < num_data - group_rest;) {
std::vector<TmpGradCntPair> tmp_sumup_buf(num_bin);
for (data_size_t j = 0; j < KNumSumupGroup; j += 8, i += 8) {
const VAL_T bin0 = data_[data_indices[i]];
const VAL_T bin1 = data_[data_indices[i + 1]];
const VAL_T bin2 = data_[data_indices[i + 2]];
const VAL_T bin3 = data_[data_indices[i + 3]];
const VAL_T bin4 = data_[data_indices[i + 4]];
const VAL_T bin5 = data_[data_indices[i + 5]];
const VAL_T bin6 = data_[data_indices[i + 6]];
const VAL_T bin7 = data_[data_indices[i + 7]];
AddGradientPtrToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
ordered_gradients + i);
AddCountToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
}
for (int j = 0; j < num_bin; ++j) {
out[j].sum_gradients += tmp_sumup_buf[j].sum_gradients;
out[j].cnt += tmp_sumup_buf[j].cnt;
}
}
for (; i < num_data - rest; i += 8) {
const VAL_T bin0 = data_[data_indices[i]]; const VAL_T bin0 = data_[data_indices[i]];
const VAL_T bin1 = data_[data_indices[i + 1]]; const VAL_T bin1 = data_[data_indices[i + 1]];
const VAL_T bin2 = data_[data_indices[i + 2]]; const VAL_T bin2 = data_[data_indices[i + 2]];
const VAL_T bin3 = data_[data_indices[i + 3]]; const VAL_T bin3 = data_[data_indices[i + 3]];
const VAL_T bin4 = data_[data_indices[i + 4]];
const VAL_T bin5 = data_[data_indices[i + 5]];
const VAL_T bin6 = data_[data_indices[i + 6]];
const VAL_T bin7 = data_[data_indices[i + 7]];
out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3];
++out[bin0].cnt; AddGradientPtrToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
++out[bin1].cnt; ordered_gradients + i);
++out[bin2].cnt; AddCountToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
++out[bin3].cnt;
} }
for (; i < num_data; ++i) { for (; i < num_data; ++i) {
const VAL_T bin = data_[data_indices[i]]; const VAL_T bin = data_[data_indices[i]];
out[bin].sum_gradients += ordered_gradients[i]; out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt; ++out[bin].cnt;
} }
} else { // use full data }
const data_size_t rest = num_data & 0x3;
void ConstructHistogram(data_size_t num_data,
const float* ordered_gradients, int num_bin,
HistogramBinEntry* out) const override {
const data_size_t group_rest = num_data & KNumSumupGroupMask;
const data_size_t rest = num_data & 0x7;
data_size_t i = 0; data_size_t i = 0;
for (; i < num_data - rest; i += 4) { for (; i < num_data - group_rest;) {
std::vector<TmpGradCntPair> tmp_sumup_buf(num_bin);
for (data_size_t j = 0; j < KNumSumupGroup; j += 8, i += 8) {
const VAL_T bin0 = data_[i]; const VAL_T bin0 = data_[i];
const VAL_T bin1 = data_[i + 1]; const VAL_T bin1 = data_[i + 1];
const VAL_T bin2 = data_[i + 2]; const VAL_T bin2 = data_[i + 2];
const VAL_T bin3 = data_[i + 3]; const VAL_T bin3 = data_[i + 3];
const VAL_T bin4 = data_[i + 4];
const VAL_T bin5 = data_[i + 5];
const VAL_T bin6 = data_[i + 6];
const VAL_T bin7 = data_[i + 7];
out[bin0].sum_gradients += ordered_gradients[i]; AddGradientPtrToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
out[bin1].sum_gradients += ordered_gradients[i + 1]; ordered_gradients + i);
out[bin2].sum_gradients += ordered_gradients[i + 2]; AddCountToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
out[bin3].sum_gradients += ordered_gradients[i + 3]; }
for (int j = 0; j < num_bin; ++j) {
out[j].sum_gradients += tmp_sumup_buf[j].sum_gradients;
out[j].cnt += tmp_sumup_buf[j].cnt;
}
}
for (; i < num_data - rest; i += 8) {
const VAL_T bin0 = data_[i];
const VAL_T bin1 = data_[i + 1];
const VAL_T bin2 = data_[i + 2];
const VAL_T bin3 = data_[i + 3];
const VAL_T bin4 = data_[i + 4];
const VAL_T bin5 = data_[i + 5];
const VAL_T bin6 = data_[i + 6];
const VAL_T bin7 = data_[i + 7];
AddGradientPtrToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
ordered_gradients + i);
AddCountToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
++out[bin0].cnt;
++out[bin1].cnt;
++out[bin2].cnt;
++out[bin3].cnt;
} }
for (; i < num_data; ++i) { for (; i < num_data; ++i) {
const VAL_T bin = data_[i]; const VAL_T bin = data_[i];
...@@ -183,7 +270,6 @@ public: ...@@ -183,7 +270,6 @@ public:
++out[bin].cnt; ++out[bin].cnt;
} }
} }
}
virtual data_size_t Split( virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
......
@@ -40,7 +40,7 @@ public:
Dense4bitsBin(data_size_t num_data) Dense4bitsBin(data_size_t num_data)
: num_data_(num_data) { : num_data_(num_data) {
int len = (num_data_ + 1) / 2; int len = (num_data_ + 1) / 2;
data_ = std::vector<uint8_t>(len, static_cast<uint8_t>(0)); data_.resize(len, 0);
} }
~Dense4bitsBin() { ~Dense4bitsBin() {
@@ -49,7 +49,7 @@ public:
void Push(int, data_size_t idx, uint32_t value) override { void Push(int, data_size_t idx, uint32_t value) override {
if (buf_.empty()) { if (buf_.empty()) {
#pragma omp critical #pragma omp critical
{ {
if (buf_.empty()) { if (buf_.empty()) {
int len = (num_data_ + 1) / 2; int len = (num_data_ + 1) / 2;
@@ -78,13 +78,52 @@ public:
inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override; inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians, const float* ordered_gradients, const float* ordered_hessians, int num_bin,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
if (data_indices != nullptr) { // if use part of data const data_size_t group_rest = num_data & KNumSumupGroupMask;
const data_size_t rest = num_data & 0x7;
const data_size_t rest = num_data & 0x3;
data_size_t i = 0; data_size_t i = 0;
for (; i < num_data - rest; i += 4) { for (; i < num_data - group_rest;) {
std::vector<HistogramBinEntry> tmp_sumup_buf(num_bin);
for (data_size_t k = 0; k < KNumSumupGroup; k += 8, i += 8) {
data_size_t idx = data_indices[i];
const auto bin0 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 1];
const auto bin1 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 2];
const auto bin2 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 3];
const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 4];
const auto bin4 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 5];
const auto bin5 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 6];
const auto bin6 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 7];
const auto bin7 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
AddGradientPtrToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
ordered_gradients + i);
AddHessianPtrToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
ordered_hessians + i);
AddCountToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
}
for (int j = 0; j < num_bin; ++j) {
out[j].sum_gradients += tmp_sumup_buf[j].sum_gradients;
out[j].sum_hessians += tmp_sumup_buf[j].sum_hessians;
out[j].cnt += tmp_sumup_buf[j].cnt;
}
}
for (; i < num_data - rest; i += 8) {
data_size_t idx = data_indices[i]; data_size_t idx = data_indices[i];
const auto bin0 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin0 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
@@ -98,21 +137,24 @@ public:
idx = data_indices[i + 3]; idx = data_indices[i + 3];
const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 4];
const auto bin4 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 5];
const auto bin5 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 6];
const auto bin6 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin0].sum_gradients += ordered_gradients[i]; idx = data_indices[i + 7];
out[bin1].sum_gradients += ordered_gradients[i + 1]; const auto bin7 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3];
out[bin0].sum_hessians += ordered_hessians[i];
out[bin1].sum_hessians += ordered_hessians[i + 1];
out[bin2].sum_hessians += ordered_hessians[i + 2];
out[bin3].sum_hessians += ordered_hessians[i + 3];
++out[bin0].cnt; AddGradientPtrToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
++out[bin1].cnt; ordered_gradients + i);
++out[bin2].cnt; AddHessianPtrToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
++out[bin3].cnt; ordered_hessians + i);
AddCountToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
} }
@@ -123,32 +165,62 @@ public:
out[bin].sum_hessians += ordered_hessians[i]; out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt; ++out[bin].cnt;
} }
}
} else { // use full data void ConstructHistogram(data_size_t num_data,
const data_size_t rest = num_data & 0x3; const float* ordered_gradients, const float* ordered_hessians, int num_bin,
HistogramBinEntry* out) const override {
const data_size_t group_rest = num_data & KNumSumupGroupMask;
const data_size_t rest = num_data & 0x7;
data_size_t i = 0; data_size_t i = 0;
for (; i < num_data - rest; i += 4) { for (; i < num_data - group_rest;) {
std::vector<HistogramBinEntry> tmp_sumup_buf(num_bin);
for (data_size_t k = 0; k < KNumSumupGroup; k += 8, i += 8) {
int j = i >> 1; int j = i >> 1;
const auto bin0 = (data_[j]) & 0xf; const auto bin0 = (data_[j]) & 0xf;
const auto bin1 = (data_[j] >> 4) & 0xf; const auto bin1 = (data_[j] >> 4) & 0xf;
++j; ++j;
const auto bin2 = (data_[j]) & 0xf; const auto bin2 = (data_[j]) & 0xf;
const auto bin3 = (data_[j] >> 4) & 0xf; const auto bin3 = (data_[j] >> 4) & 0xf;
++j;
const auto bin4 = (data_[j]) & 0xf;
const auto bin5 = (data_[j] >> 4) & 0xf;
++j;
const auto bin6 = (data_[j]) & 0xf;
const auto bin7 = (data_[j] >> 4) & 0xf;
out[bin0].sum_gradients += ordered_gradients[i];
out[bin1].sum_gradients += ordered_gradients[i + 1];
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3];
out[bin0].sum_hessians += ordered_hessians[i]; AddGradientPtrToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
out[bin1].sum_hessians += ordered_hessians[i + 1]; ordered_gradients + i);
out[bin2].sum_hessians += ordered_hessians[i + 2]; AddHessianPtrToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
out[bin3].sum_hessians += ordered_hessians[i + 3]; ordered_hessians + i);
AddCountToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
}
for (int j = 0; j < num_bin; ++j) {
out[j].sum_gradients += tmp_sumup_buf[j].sum_gradients;
out[j].sum_hessians += tmp_sumup_buf[j].sum_hessians;
out[j].cnt += tmp_sumup_buf[j].cnt;
}
}
for (; i < num_data - rest; i += 8) {
int j = i >> 1;
const auto bin0 = (data_[j]) & 0xf;
const auto bin1 = (data_[j] >> 4) & 0xf;
++j;
const auto bin2 = (data_[j]) & 0xf;
const auto bin3 = (data_[j] >> 4) & 0xf;
++j;
const auto bin4 = (data_[j]) & 0xf;
const auto bin5 = (data_[j] >> 4) & 0xf;
++j;
const auto bin6 = (data_[j]) & 0xf;
const auto bin7 = (data_[j] >> 4) & 0xf;
++out[bin0].cnt; AddGradientPtrToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
++out[bin1].cnt; ordered_gradients + i);
++out[bin2].cnt; AddHessianPtrToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
++out[bin3].cnt; ordered_hessians + i);
AddCountToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
} }
for (; i < num_data; ++i) { for (; i < num_data; ++i) {
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
...@@ -157,16 +229,51 @@ public: ...@@ -157,16 +229,51 @@ public:
++out[bin].cnt; ++out[bin].cnt;
} }
} }
}
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const float* ordered_gradients, int num_bin,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
if (data_indices != nullptr) { // if use part of data const data_size_t group_rest = num_data & KNumSumupGroupMask;
const data_size_t rest = num_data & 0x7;
const data_size_t rest = num_data & 0x3;
data_size_t i = 0; data_size_t i = 0;
for (; i < num_data - rest; i += 4) { for (; i < num_data - group_rest;) {
std::vector<TmpGradCntPair> tmp_sumup_buf(num_bin);
for (data_size_t k = 0; k < KNumSumupGroup; k += 8, i += 8) {
data_size_t idx = data_indices[i];
const auto bin0 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 1];
const auto bin1 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 2];
const auto bin2 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 3];
const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 4];
const auto bin4 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 5];
const auto bin5 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 6];
const auto bin6 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 7];
const auto bin7 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
AddGradientPtrToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
ordered_gradients + i);
AddCountToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
}
for (int j = 0; j < num_bin; ++j) {
out[j].sum_gradients += tmp_sumup_buf[j].sum_gradients;
out[j].cnt += tmp_sumup_buf[j].cnt;
}
}
for (; i < num_data - rest; i += 8) {
data_size_t idx = data_indices[i]; data_size_t idx = data_indices[i];
const auto bin0 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin0 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
@@ -180,16 +287,22 @@ public:
idx = data_indices[i + 3]; idx = data_indices[i + 3];
const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin3 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
idx = data_indices[i + 4];
const auto bin4 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin0].sum_gradients += ordered_gradients[i]; idx = data_indices[i + 5];
out[bin1].sum_gradients += ordered_gradients[i + 1]; const auto bin5 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
out[bin2].sum_gradients += ordered_gradients[i + 2];
out[bin3].sum_gradients += ordered_gradients[i + 3];
++out[bin0].cnt; idx = data_indices[i + 6];
++out[bin1].cnt; const auto bin6 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
++out[bin2].cnt;
++out[bin3].cnt; idx = data_indices[i + 7];
const auto bin7 = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
AddGradientPtrToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
ordered_gradients + i);
AddCountToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
} }
@@ -199,27 +312,57 @@ public:
out[bin].sum_gradients += ordered_gradients[i]; out[bin].sum_gradients += ordered_gradients[i];
++out[bin].cnt; ++out[bin].cnt;
} }
}
} else { // use full data void ConstructHistogram(data_size_t num_data,
const data_size_t rest = num_data & 0x3; const float* ordered_gradients, int num_bin,
HistogramBinEntry* out) const override {
const data_size_t group_rest = num_data & KNumSumupGroupMask;
const data_size_t rest = num_data & 0x7;
data_size_t i = 0; data_size_t i = 0;
for (; i < num_data - rest; i += 4) { for (; i < num_data - group_rest;) {
std::vector<TmpGradCntPair> tmp_sumup_buf(num_bin);
for (data_size_t k = 0; k < KNumSumupGroup; k += 8, i += 8) {
int j = i >> 1; int j = i >> 1;
const auto bin0 = (data_[j]) & 0xf; const auto bin0 = (data_[j]) & 0xf;
const auto bin1 = (data_[j] >> 4) & 0xf; const auto bin1 = (data_[j] >> 4) & 0xf;
++j; ++j;
const auto bin2 = (data_[j]) & 0xf; const auto bin2 = (data_[j]) & 0xf;
const auto bin3 = (data_[j] >> 4) & 0xf; const auto bin3 = (data_[j] >> 4) & 0xf;
++j;
const auto bin4 = (data_[j]) & 0xf;
const auto bin5 = (data_[j] >> 4) & 0xf;
++j;
const auto bin6 = (data_[j]) & 0xf;
const auto bin7 = (data_[j] >> 4) & 0xf;
out[bin0].sum_gradients += ordered_gradients[i]; AddGradientPtrToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
out[bin1].sum_gradients += ordered_gradients[i + 1]; ordered_gradients + i);
out[bin2].sum_gradients += ordered_gradients[i + 2]; AddCountToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
out[bin3].sum_gradients += ordered_gradients[i + 3]; }
for (int j = 0; j < num_bin; ++j) {
out[j].sum_gradients += tmp_sumup_buf[j].sum_gradients;
out[j].cnt += tmp_sumup_buf[j].cnt;
}
}
for (; i < num_data - rest; i += 8) {
int j = i >> 1;
const auto bin0 = (data_[j]) & 0xf;
const auto bin1 = (data_[j] >> 4) & 0xf;
++j;
const auto bin2 = (data_[j]) & 0xf;
const auto bin3 = (data_[j] >> 4) & 0xf;
++j;
const auto bin4 = (data_[j]) & 0xf;
const auto bin5 = (data_[j] >> 4) & 0xf;
++j;
const auto bin6 = (data_[j]) & 0xf;
const auto bin7 = (data_[j] >> 4) & 0xf;
++out[bin0].cnt; AddGradientPtrToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
++out[bin1].cnt; ordered_gradients + i);
++out[bin2].cnt; AddCountToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
++out[bin3].cnt;
} }
for (; i < num_data; ++i) { for (; i < num_data; ++i) {
const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf; const auto bin = (data_[i >> 1] >> ((i & 1) << 2)) & 0xf;
@@ -227,7 +370,6 @@ public:
++out[bin].cnt; ++out[bin].cnt;
} }
} }
}
virtual data_size_t Split( virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
......
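
The Dense4bitsBin loops above pack two bin values per byte; (data_[i >> 1] >> ((i & 1) << 2)) & 0xf reads the low nibble for even i and the high nibble for odd i. A small standalone check of that indexing (the put helper is a simplified stand-in for Dense4bitsBin::Push, not the actual implementation):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const int n = 8;
  std::vector<uint8_t> data((n + 1) / 2, 0);
  auto put = [&](int i, uint8_t v) {              // two 4-bit values per byte
    data[i >> 1] |= static_cast<uint8_t>(v << ((i & 1) << 2));
  };
  auto get = [&](int i) -> uint8_t {              // same expression as the histogram loops
    return (data[i >> 1] >> ((i & 1) << 2)) & 0xf;
  };
  for (int i = 0; i < n; ++i) put(i, static_cast<uint8_t>(i + 1));
  for (int i = 0; i < n; ++i) assert(get(i) == i + 1);
  return 0;
}
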
@@ -78,20 +78,25 @@ public:
} }
} }
void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian, void ConstructHistogram(int leaf, const float* gradient, const float* hessian, int num_bin,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
// get current leaf boundary // get current leaf boundary
const data_size_t start = leaf_start_[leaf]; const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf]; const data_size_t end = start + leaf_cnt_[leaf];
const int rest = (end - start) % 4; const data_size_t group_rest = (end - start) & KNumSumupGroupMask;
const data_size_t rest = (end - start) & 0x7;
data_size_t i = start; data_size_t i = start;
// use data on current leaf to construct histogram for (; i < end - group_rest;) {
for (; i < end - rest; i += 4) { std::vector<HistogramBinEntry> tmp_sumup_buf(num_bin);
for (data_size_t j = 0; j < KNumSumupGroup; j += 8, i += 8) {
const VAL_T bin0 = ordered_pair_[i].bin; const VAL_T bin0 = ordered_pair_[i].bin;
const VAL_T bin1 = ordered_pair_[i + 1].bin; const VAL_T bin1 = ordered_pair_[i + 1].bin;
const VAL_T bin2 = ordered_pair_[i + 2].bin; const VAL_T bin2 = ordered_pair_[i + 2].bin;
const VAL_T bin3 = ordered_pair_[i + 3].bin; const VAL_T bin3 = ordered_pair_[i + 3].bin;
const VAL_T bin4 = ordered_pair_[i + 4].bin;
const VAL_T bin5 = ordered_pair_[i + 5].bin;
const VAL_T bin6 = ordered_pair_[i + 6].bin;
const VAL_T bin7 = ordered_pair_[i + 7].bin;
const auto g0 = gradient[ordered_pair_[i].ridx]; const auto g0 = gradient[ordered_pair_[i].ridx];
const auto h0 = hessian[ordered_pair_[i].ridx]; const auto h0 = hessian[ordered_pair_[i].ridx];
...@@ -101,21 +106,61 @@ public: ...@@ -101,21 +106,61 @@ public:
const auto h2 = hessian[ordered_pair_[i + 2].ridx]; const auto h2 = hessian[ordered_pair_[i + 2].ridx];
const auto g3 = gradient[ordered_pair_[i + 3].ridx]; const auto g3 = gradient[ordered_pair_[i + 3].ridx];
const auto h3 = hessian[ordered_pair_[i + 3].ridx]; const auto h3 = hessian[ordered_pair_[i + 3].ridx];
const auto g4 = gradient[ordered_pair_[i + 4].ridx];
const auto h4 = hessian[ordered_pair_[i + 4].ridx];
const auto g5 = gradient[ordered_pair_[i + 5].ridx];
const auto h5 = hessian[ordered_pair_[i + 5].ridx];
const auto g6 = gradient[ordered_pair_[i + 6].ridx];
const auto h6 = hessian[ordered_pair_[i + 6].ridx];
const auto g7 = gradient[ordered_pair_[i + 7].ridx];
const auto h7 = hessian[ordered_pair_[i + 7].ridx];
out[bin0].sum_gradients += g0; AddGradientToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
out[bin1].sum_gradients += g1; g0, g1, g2, g3, g4, g5, g6, g7);
out[bin2].sum_gradients += g2; AddHessianToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
out[bin3].sum_gradients += g3; h0, h1, h2, h3, h4, h5, h6, h7);
AddCountToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
}
for (int j = 0; j < num_bin; ++j) {
out[j].sum_gradients += tmp_sumup_buf[j].sum_gradients;
out[j].sum_hessians += tmp_sumup_buf[j].sum_hessians;
out[j].cnt += tmp_sumup_buf[j].cnt;
}
}
// use data on current leaf to construct histogram
for (; i < end - rest; i += 8) {
out[bin0].sum_hessians += h0; const VAL_T bin0 = ordered_pair_[i].bin;
out[bin1].sum_hessians += h1; const VAL_T bin1 = ordered_pair_[i + 1].bin;
out[bin2].sum_hessians += h2; const VAL_T bin2 = ordered_pair_[i + 2].bin;
out[bin3].sum_hessians += h3; const VAL_T bin3 = ordered_pair_[i + 3].bin;
const VAL_T bin4 = ordered_pair_[i + 4].bin;
const VAL_T bin5 = ordered_pair_[i + 5].bin;
const VAL_T bin6 = ordered_pair_[i + 6].bin;
const VAL_T bin7 = ordered_pair_[i + 7].bin;
++out[bin0].cnt; const auto g0 = gradient[ordered_pair_[i].ridx];
++out[bin1].cnt; const auto h0 = hessian[ordered_pair_[i].ridx];
++out[bin2].cnt; const auto g1 = gradient[ordered_pair_[i + 1].ridx];
++out[bin3].cnt; const auto h1 = hessian[ordered_pair_[i + 1].ridx];
const auto g2 = gradient[ordered_pair_[i + 2].ridx];
const auto h2 = hessian[ordered_pair_[i + 2].ridx];
const auto g3 = gradient[ordered_pair_[i + 3].ridx];
const auto h3 = hessian[ordered_pair_[i + 3].ridx];
const auto g4 = gradient[ordered_pair_[i + 4].ridx];
const auto h4 = hessian[ordered_pair_[i + 4].ridx];
const auto g5 = gradient[ordered_pair_[i + 5].ridx];
const auto h5 = hessian[ordered_pair_[i + 5].ridx];
const auto g6 = gradient[ordered_pair_[i + 6].ridx];
const auto h6 = hessian[ordered_pair_[i + 6].ridx];
const auto g7 = gradient[ordered_pair_[i + 7].ridx];
const auto h7 = hessian[ordered_pair_[i + 7].ridx];
AddGradientToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
g0, g1, g2, g3, g4, g5, g6, g7);
AddHessianToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
h0, h1, h2, h3, h4, h5, h6, h7);
AddCountToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
} }
for (; i < end; ++i) { for (; i < end; ++i) {
...@@ -129,42 +174,78 @@ public: ...@@ -129,42 +174,78 @@ public:
out[bin0].sum_hessians += h0; out[bin0].sum_hessians += h0;
++out[bin0].cnt; ++out[bin0].cnt;
} }
} }
void ConstructHistogram(int leaf, const score_t* gradient, void ConstructHistogram(int leaf, const float* gradient, int num_bin,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
// get current leaf boundary // get current leaf boundary
const data_size_t start = leaf_start_[leaf]; const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf]; const data_size_t end = start + leaf_cnt_[leaf];
const int rest = (end - start) % 4; const data_size_t group_rest = (end - start) & KNumSumupGroupMask;
const data_size_t rest = (end - start) & 0x7;
data_size_t i = start; data_size_t i = start;
for (; i < end - group_rest;) {
std::vector<TmpGradCntPair> tmp_sumup_buf(num_bin);
for (data_size_t j = 0; j < KNumSumupGroup; j += 8, i += 8) {
const VAL_T bin0 = ordered_pair_[i].bin;
const VAL_T bin1 = ordered_pair_[i + 1].bin;
const VAL_T bin2 = ordered_pair_[i + 2].bin;
const VAL_T bin3 = ordered_pair_[i + 3].bin;
const VAL_T bin4 = ordered_pair_[i + 4].bin;
const VAL_T bin5 = ordered_pair_[i + 5].bin;
const VAL_T bin6 = ordered_pair_[i + 6].bin;
const VAL_T bin7 = ordered_pair_[i + 7].bin;
const auto g0 = gradient[ordered_pair_[i].ridx];
const auto g1 = gradient[ordered_pair_[i + 1].ridx];
const auto g2 = gradient[ordered_pair_[i + 2].ridx];
const auto g3 = gradient[ordered_pair_[i + 3].ridx];
const auto g4 = gradient[ordered_pair_[i + 4].ridx];
const auto g5 = gradient[ordered_pair_[i + 5].ridx];
const auto g6 = gradient[ordered_pair_[i + 6].ridx];
const auto g7 = gradient[ordered_pair_[i + 7].ridx];
AddGradientToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
g0, g1, g2, g3, g4, g5, g6, g7);
AddCountToHistogram(tmp_sumup_buf.data(), bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
}
for (int j = 0; j < num_bin; ++j) {
out[j].sum_gradients += tmp_sumup_buf[j].sum_gradients;
out[j].cnt += tmp_sumup_buf[j].cnt;
}
}
// use data on current leaf to construct histogram // use data on current leaf to construct histogram
for (; i < end - rest; i += 4) { for (; i < end - rest; i += 8) {
const VAL_T bin0 = ordered_pair_[i].bin; const VAL_T bin0 = ordered_pair_[i].bin;
const VAL_T bin1 = ordered_pair_[i + 1].bin; const VAL_T bin1 = ordered_pair_[i + 1].bin;
const VAL_T bin2 = ordered_pair_[i + 2].bin; const VAL_T bin2 = ordered_pair_[i + 2].bin;
const VAL_T bin3 = ordered_pair_[i + 3].bin; const VAL_T bin3 = ordered_pair_[i + 3].bin;
const VAL_T bin4 = ordered_pair_[i + 4].bin;
const VAL_T bin5 = ordered_pair_[i + 5].bin;
const VAL_T bin6 = ordered_pair_[i + 6].bin;
const VAL_T bin7 = ordered_pair_[i + 7].bin;
const auto g0 = gradient[ordered_pair_[i].ridx]; const auto g0 = gradient[ordered_pair_[i].ridx];
const auto g1 = gradient[ordered_pair_[i + 1].ridx]; const auto g1 = gradient[ordered_pair_[i + 1].ridx];
const auto g2 = gradient[ordered_pair_[i + 2].ridx]; const auto g2 = gradient[ordered_pair_[i + 2].ridx];
const auto g3 = gradient[ordered_pair_[i + 3].ridx]; const auto g3 = gradient[ordered_pair_[i + 3].ridx];
const auto g4 = gradient[ordered_pair_[i + 4].ridx];
const auto g5 = gradient[ordered_pair_[i + 5].ridx];
const auto g6 = gradient[ordered_pair_[i + 6].ridx];
const auto g7 = gradient[ordered_pair_[i + 7].ridx];
out[bin0].sum_gradients += g0; AddGradientToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7,
out[bin1].sum_gradients += g1; g0, g1, g2, g3, g4, g5, g6, g7);
out[bin2].sum_gradients += g2; AddCountToHistogram(out, bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7);
out[bin3].sum_gradients += g3;
++out[bin0].cnt;
++out[bin1].cnt;
++out[bin2].cnt;
++out[bin3].cnt;
} }
for (; i < end; ++i) { for (; i < end; ++i) {
const VAL_T bin0 = ordered_pair_[i].bin; const VAL_T bin0 = ordered_pair_[i].bin;
const auto g0 = gradient[ordered_pair_[i].ridx]; const auto g0 = gradient[ordered_pair_[i].ridx];
out[bin0].sum_gradients += g0; out[bin0].sum_gradients += g0;
++out[bin0].cnt; ++out[bin0].cnt;
} }
......
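The unrolled loops above route every update through `AddGradientToHistogram`, `AddHessianToHistogram` and `AddCountToHistogram` rather than writing each `out[binX].sum_* += ...` inline. The helpers' bodies are not shown in this diff; the sketch below is only a guess at their shape, inferred from the call sites (eight bins plus eight values per call), and the actual implementation in the repository may differ.

```cpp
// Hedged sketch of the helper shape implied by the call sites above.
// Entry is any struct with sum_gradients and cnt members
// (e.g. HistogramBinEntry or the temporary TmpGradCntPair buffer).
template <typename Entry>
inline void AddGradientToHistogram(Entry* out,
                                   int b0, int b1, int b2, int b3,
                                   int b4, int b5, int b6, int b7,
                                   float g0, float g1, float g2, float g3,
                                   float g4, float g5, float g6, float g7) {
  out[b0].sum_gradients += g0; out[b1].sum_gradients += g1;
  out[b2].sum_gradients += g2; out[b3].sum_gradients += g3;
  out[b4].sum_gradients += g4; out[b5].sum_gradients += g5;
  out[b6].sum_gradients += g6; out[b7].sum_gradients += g7;
}

// The hessian version would mirror AddGradientToHistogram with sum_hessians.
template <typename Entry>
inline void AddCountToHistogram(Entry* out,
                                int b0, int b1, int b2, int b3,
                                int b4, int b5, int b6, int b7) {
  ++out[b0].cnt; ++out[b1].cnt; ++out[b2].cnt; ++out[b3].cnt;
  ++out[b4].cnt; ++out[b5].cnt; ++out[b6].cnt; ++out[b7].cnt;
}
```

Grouping the same-typed updates per call keeps the 8-wide unrolling readable while leaving the per-bin accumulation pattern unchanged.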
...@@ -98,13 +98,25 @@ public:
  BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;
  void ConstructHistogram(const data_size_t*, data_size_t, const float*,
                          const float*, int, HistogramBinEntry*) const override {
    // Will use OrderedSparseBin->ConstructHistogram() instead
    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
  }
  void ConstructHistogram(data_size_t, const float*,
                          const float*, int, HistogramBinEntry*) const override {
    // Will use OrderedSparseBin->ConstructHistogram() instead
    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
  }
  void ConstructHistogram(const data_size_t*, data_size_t, const float*, int,
                          HistogramBinEntry*) const override {
    // Will use OrderedSparseBin->ConstructHistogram() instead
    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
  }
  void ConstructHistogram(data_size_t, const float*, int,
                          HistogramBinEntry*) const override {
    // Will use OrderedSparseBin->ConstructHistogram() instead
    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
......
...@@ -82,7 +82,7 @@ public:
    label_weights_[1] *= scale_pos_weight_;
  }
  void GetGradients(const double* score, float* gradients, float* hessians) const override {
    if (weights_ == nullptr) {
      #pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data_; ++i) {
...@@ -93,8 +93,8 @@ public:
        // calculate gradients and hessians
        const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i]));
        const double abs_response = fabs(response);
        gradients[i] = static_cast<float>(response * label_weight);
        hessians[i] = static_cast<float>(abs_response * (sigmoid_ - abs_response) * label_weight);
      }
    } else {
      #pragma omp parallel for schedule(static)
...@@ -106,8 +106,8 @@ public:
        // calculate gradients and hessians
        const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i]));
        const double abs_response = fabs(response);
        gradients[i] = static_cast<float>(response * label_weight * weights_[i]);
        hessians[i] = static_cast<float>(abs_response * (sigmoid_ - abs_response) * label_weight * weights_[i]);
      }
    }
  }
......
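For reference, the expressions in this hunk are the usual scaled logistic-loss derivatives: with label y in {-1, +1}, sigmoid factor s and raw score f, response = -y*s / (1 + exp(y*s*f)), gradient = response * label_weight, hessian = |response| * (s - |response|) * label_weight. A tiny standalone check with illustrative values (not library code):

```cpp
// Sketch only: evaluates the gradient/hessian formulas from the hunk above
// for a single positive example with sigmoid factor 1 and unit label weight.
#include <cmath>
#include <cstdio>

int main() {
  const double sigmoid = 1.0;
  const double label = 1.0;   // positive example, label in {-1, +1}
  const double score = 0.3;   // current raw prediction
  const double response = -label * sigmoid / (1.0 + std::exp(label * sigmoid * score));
  const double abs_response = std::fabs(response);
  const float gradient = static_cast<float>(response);                              // label_weight == 1
  const float hessian = static_cast<float>(abs_response * (sigmoid - abs_response));
  // expect roughly gradient = -0.426, hessian = 0.244 for these inputs
  std::printf("gradient = %f, hessian = %f\n", gradient, hessian);
  return 0;
}
```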
...@@ -62,10 +62,10 @@ public:
      }
    }
    if (non_empty_class < 2) { non_empty_class = 2; }
    hessian_nor_ = static_cast<float>(non_empty_class) / (non_empty_class - 1);
  }
  void GetGradients(const double* score, float* gradients, float* hessians) const override {
    if (weights_ == nullptr) {
      std::vector<double> rec;
      #pragma omp parallel for schedule(static) private(rec)
...@@ -81,11 +81,11 @@ public:
          auto p = rec[k];
          size_t idx = static_cast<size_t>(num_data_) * k + i;
          if (label_int_[i] == k) {
            gradients[idx] = static_cast<float>(p - 1.0f + softmax_weight_decay_ * score[idx]);
          } else {
            gradients[idx] = static_cast<float>(p + softmax_weight_decay_ * score[idx]);
          }
          hessians[idx] = static_cast<float>(hessian_nor_ * p * (1.0f - p) + softmax_weight_decay_);
        }
      }
    } else {
...@@ -103,11 +103,11 @@ public:
          auto p = rec[k];
          size_t idx = static_cast<size_t>(num_data_) * k + i;
          if (label_int_[i] == k) {
            gradients[idx] = static_cast<float>((p - 1.0f + softmax_weight_decay_ * score[idx]) * weights_[i]);
          } else {
            gradients[idx] = static_cast<float>((p + softmax_weight_decay_ * score[idx]) * weights_[i]);
          }
          hessians[idx] = static_cast<float>((hessian_nor_ * p * (1.0f - p) + softmax_weight_decay_) * weights_[i]);
        }
      }
    }
...@@ -147,7 +147,7 @@ private:
  const float* weights_;
  std::vector<bool> is_empty_class_;
  double softmax_weight_decay_;
  float hessian_nor_;
};
/*!
...@@ -196,9 +196,9 @@ public:
    }
  }
  void GetGradients(const double* score, float* gradients, float* hessians) const override {
    for (int i = 0; i < num_class_; ++i) {
      size_t bias = static_cast<size_t>(num_data_) * i;
      binary_loss_[i]->GetGradients(score + bias, gradients + bias, hessians + bias);
    }
  }
......
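The multiclass objectives above index their flattened score/gradient buffers class-major (`idx = num_data_ * k + i`), and the OVA variant now computes its per-class `bias` in `size_t` to match that indexing. A small sketch of the layout, for orientation only; `ScoreIndex` is an illustrative name, not a library function.

```cpp
// Sketch only: class-major layout assumed by the indexing above.
// All values for class k occupy the contiguous block
// [k * num_data, (k + 1) * num_data); row i of class k sits at num_data * k + i.
#include <cstddef>

inline std::size_t ScoreIndex(std::size_t num_data, std::size_t k /*class*/, std::size_t i /*row*/) {
  return num_data * k + i;  // same arithmetic as the size_t idx/bias in the hunks above
}

// Example: with 1'000'000 rows, the block for class 2 starts at ScoreIndex(1'000'000, 2, 0) == 2'000'000.
```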