Unverified Commit 509c2e50 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

Support both row-wise and col-wise multi-threading (#2699)



* commit

* fix a bug

* fix bug

* reset to track changes

* refine the auto choose logic

* sort the time stats output

* fix include

* change  multi_val_bin_sparse_threshold

* add cmake

* add _mm_malloc and _mm_free for cross platform

* fix cmake bug

* timer for split

* try to fix cmake

* fix tests

* refactor DataPartition::Split

* fix test

* typo

* formating

* Revert "formating"

This reverts commit 5b8de4f7fb9d975ee23701d276a66d40ee6d4222.

* add document

* [R-package] Added tests on use of force_col_wise and force_row_wise in training (#2719)

* naming

* fix gpu code

* Update include/LightGBM/bin.h
Co-Authored-By: default avatarJames Lamb <jaylamb20@gmail.com>

* Update src/treelearner/ocl/histogram16.cl

* test: swap compilers for CI

* fix omp

* not avx2

* no aligned for feature histogram

* Revert "refactor DataPartition::Split"

This reverts commit 256e6d9641ade966a1f54da1752e998a1149b6f8.

* slightly refactor data partition

* reduce the memory cost
Co-authored-by: default avatarJames Lamb <jaylamb20@gmail.com>
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>
parent bc7bc4a1
......@@ -44,11 +44,11 @@ before_install:
- export BUILD_DIRECTORY="$TRAVIS_BUILD_DIR"
- if [[ $TRAVIS_OS_NAME == "osx" ]]; then
export OS_NAME="macos";
export COMPILER="gcc";
export COMPILER="clang";
export R_MAC_VERSION=3.6.1;
else
export OS_NAME="linux";
export COMPILER="clang";
export COMPILER="gcc";
export R_TRAVIS_LINUX_VERSION=3.6.1-3bionic;
fi
- export CONDA="$HOME/miniconda"
......
......@@ -17,7 +17,7 @@ jobs:
- job: Linux
###########################################
variables:
COMPILER: gcc
COMPILER: clang
pool:
vmImage: 'ubuntu-16.04'
container: ubuntu1404
......@@ -72,7 +72,7 @@ jobs:
- job: MacOS
###########################################
variables:
COMPILER: clang
COMPILER: gcc
pool:
vmImage: 'macOS-10.13'
strategy:
......
......@@ -68,6 +68,10 @@ if(USE_R35)
ADD_DEFINITIONS(-DR_VER_ABOVE_35)
endif(USE_R35)
if(USE_TIMETAG)
ADD_DEFINITIONS(-DTIMETAG)
endif(USE_TIMETAG)
if(USE_MPI)
find_package(MPI REQUIRED)
ADD_DEFINITIONS(-DUSE_MPI)
......@@ -130,6 +134,21 @@ if(${MM_PREFETCH})
ADD_DEFINITIONS(-DMM_PREFETCH)
endif()
include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
#include <mm_malloc.h>
int main() {
char *a = (char*)_mm_malloc(8, 16);
_mm_free(a);
return 0;
}
" MM_MALLOC)
if(${MM_MALLOC})
message(STATUS "Use _mm_malloc")
ADD_DEFINITIONS(-DMM_MALLOC)
endif()
if(UNIX OR MINGW OR CYGWIN)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -O3 -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type")
if(USE_SWIG)
......
......@@ -252,3 +252,46 @@ test_that("lgb.train() throws an informative error if 'valids' contains lgb.Data
)
}, regexp = "each element of valids must have a name")
})
test_that("lgb.train() works with force_col_wise and force_row_wise", {
  set.seed(1234L)
  nrounds <- 10L
  dtrain <- lgb.Dataset(
    train$data
    , label = train$label
  )
  # train once with col-wise histogram construction forced
  params <- list(
    objective = "binary"
    , metric = "binary_error"
    , force_col_wise = TRUE
  )
  bst_col_wise <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = nrounds
  )
  # ...and once with row-wise histogram construction forced
  params <- list(
    objective = "binary"
    , metric = "binary_error"
    , force_row_wise = TRUE
  )
  bst_row_wise <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = nrounds
  )
  # both strategies should produce the same model, so training error must match
  expected_error <- 0.003070782
  expect_equal(bst_col_wise$eval_train()[[1L]][["value"]], expected_error)
  expect_equal(bst_row_wise$eval_train()[[1L]][["value"]], expected_error)

  # check some basic details of the boosters just to be sure force_col_wise
  # and force_row_wise are not causing any weird side effects
  for (bst in list(bst_row_wise, bst_col_wise)) {
    expect_equal(bst$current_iter(), nrounds)
    parsed_model <- jsonlite::fromJSON(bst$dump_model())
    expect_equal(parsed_model$objective, "binary sigmoid:1")
    expect_false(parsed_model$average_output)
  }
})
......@@ -47,8 +47,8 @@ test_that("learning-to-rank with lgb.train() works as expected", {
}
expect_identical(sapply(eval_results, function(x) {x$name}), eval_names)
expect_equal(eval_results[[1L]][["value"]], 0.825)
expect_true(abs(eval_results[[2L]][["value"]] - 0.795986) < TOLERANCE)
expect_true(abs(eval_results[[3L]][["value"]] - 0.7734639) < TOLERANCE)
expect_true(abs(eval_results[[2L]][["value"]] - 0.7766434) < TOLERANCE)
expect_true(abs(eval_results[[3L]][["value"]] - 0.7527939) < TOLERANCE)
})
test_that("learning-to-rank with lgb.cv() works as expected", {
......
......@@ -190,6 +190,38 @@ Core Parameters
Learning Control Parameters
---------------------------
- ``force_col_wise`` :raw-html:`<a id="force_col_wise" title="Permalink to this parameter" href="#force_col_wise">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
- set ``force_col_wise=true`` will force LightGBM to use col-wise histogram build
- Recommend ``force_col_wise=true`` when:
- the number of columns is large, or the total number of bin is large
- when ``num_threads`` is large, e.g. ``>20``
- want to use small ``feature_fraction``, e.g. ``0.5``, to speed-up
- want to reduce memory cost
- when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them, and use the faster one
- ``force_row_wise`` :raw-html:`<a id="force_row_wise" title="Permalink to this parameter" href="#force_row_wise">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
- set ``force_row_wise=true`` will force LightGBM to use row-wise histogram build
- Recommend ``force_row_wise=true`` when:
- the number of data is large, and the number of total bin is relatively small
- want to use small ``bagging``, or ``goss``, to speed-up
- when ``num_threads`` is relatively small, e.g. ``<=16``
- set ``force_row_wise=true`` will double the memory cost for Dataset object, if your memory is not enough, you can try ``force_col_wise=true``
- when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them, and use the faster one.
- ``max_depth`` :raw-html:`<a id="max_depth" title="Permalink to this parameter" href="#max_depth">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int
- limit the max depth for tree model. This is used to deal with over-fitting when ``#data`` is small. Tree still grows leaf-wise
......@@ -559,22 +591,6 @@ IO Parameters
- **Note**: disabling this may cause the slow training speed for sparse datasets
- ``max_conflict_rate`` :raw-html:`<a id="max_conflict_rate" title="Permalink to this parameter" href="#max_conflict_rate">&#x1F517;&#xFE0E;</a>`, default = ``0.0``, type = double, constraints: ``0.0 <= max_conflict_rate < 1.0``
- max conflict rate for bundles in EFB
- set this to ``0.0`` to disallow the conflict and provide more accurate results
- set this to a larger value to achieve faster speed
- ``is_enable_sparse`` :raw-html:`<a id="is_enable_sparse" title="Permalink to this parameter" href="#is_enable_sparse">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool, aliases: ``is_sparse``, ``enable_sparse``, ``sparse``
- used to enable/disable sparse optimization
- ``sparse_threshold`` :raw-html:`<a id="sparse_threshold" title="Permalink to this parameter" href="#sparse_threshold">&#x1F517;&#xFE0E;</a>`, default = ``0.8``, type = double, constraints: ``0.0 < sparse_threshold <= 1.0``
- the threshold of zero elements percentage for treating a feature as a sparse one
- ``use_missing`` :raw-html:`<a id="use_missing" title="Permalink to this parameter" href="#use_missing">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool
- set this to ``false`` to disable the special handle of missing value
......
......@@ -29,36 +29,29 @@ enum MissingType {
NaN
};
/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
public:
/*! \brief Sum of gradients on this bin */
double sum_gradients = 0.0f;
/*! \brief Sum of hessians on this bin */
double sum_hessians = 0.0f;
/*! \brief Number of data on this bin */
data_size_t cnt = 0;
/*!
* \brief Sum up (reducers) functions for histogram bin
*/
inline static void SumReducer(const char *src, char *dst, int type_size, comm_size_t len) {
comm_size_t used_size = 0;
const HistogramBinEntry* p1;
HistogramBinEntry* p2;
while (used_size < len) {
// convert
p1 = reinterpret_cast<const HistogramBinEntry*>(src);
p2 = reinterpret_cast<HistogramBinEntry*>(dst);
// add
p2->cnt += p1->cnt;
p2->sum_gradients += p1->sum_gradients;
p2->sum_hessians += p1->sum_hessians;
src += type_size;
dst += type_size;
used_size += type_size;
}
typedef double hist_t;
const size_t KHistEntrySize = 2 * sizeof(hist_t);
const int KHistOffset = 2;
const double kSparseThreshold = 0.7;
#define GET_GRAD(hist, i) hist[(i) << 1]
#define GET_HESS(hist, i) hist[((i) << 1) + 1]
// Element-wise sum reducer for histogram buffers (used when aggregating
// histograms, e.g. across machines): adds each hist_t element of `src`
// into the corresponding element of `dst`.
// `type_size` is the byte stride of one element — the loop advances both
// pointers by `type_size` and reads exactly one hist_t per step, so it is
// expected that type_size == sizeof(hist_t); `len` is the total byte length.
inline static void HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
  comm_size_t used_size = 0;
  const hist_t* p1;
  hist_t* p2;
  while (used_size < len) {
    // convert raw byte pointers to typed pointers for the current element
    p1 = reinterpret_cast<const hist_t*>(src);
    p2 = reinterpret_cast<hist_t*>(dst);
    // accumulate src into dst
    *p2 += *p1;
    src += type_size;
    dst += type_size;
    used_size += type_size;
  }
};
}
/*! \brief This class used to convert feature values into bin,
* and store some meta information for bin*/
......@@ -252,7 +245,7 @@ class OrderedBin {
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, HistogramBinEntry* out) const = 0;
const score_t* hessians, hist_t* out) const = 0;
/*!
* \brief Construct histogram by using this bin
......@@ -262,7 +255,7 @@ class OrderedBin {
* \param gradients Gradients, Note:non-ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
virtual void ConstructHistogram(int leaf, const score_t* gradients, hist_t* out) const = 0;
/*!
* \brief Split current bin, and perform re-order by leaf
......@@ -360,11 +353,11 @@ class Bin {
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
hist_t* out) const = 0;
virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
hist_t* out) const = 0;
/*!
* \brief Construct histogram of this feature,
......@@ -380,10 +373,10 @@ class Bin {
* \param out Output Result
*/
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
const score_t* ordered_gradients, hist_t* out) const = 0;
/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
......@@ -423,30 +416,11 @@ class Bin {
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual OrderedBin* CreateOrderedBin() const = 0;
/*!
* \brief After pushed all feature data, call this could have better refactor for bin data
*/
virtual void FinishLoad() = 0;
/*!
* \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
* \param num_data Total number of data
* \param num_bin Number of bin
* \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
* \param is_sparse Will set to true if this bin is sparse
* \return The bin data object
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse);
/*!
* \brief Create object for bin data of one feature, used for dense feature
* \param num_data Total number of data
......@@ -469,6 +443,46 @@ class Bin {
virtual Bin* Clone() = 0;
};
class MultiValBin {
public:
virtual ~MultiValBin() {}
virtual data_size_t num_data() const = 0;
virtual int32_t num_bin() const = 0;
virtual void ReSize(data_size_t num_data) = 0;
virtual void PushOneRow(int tid, data_size_t idx, const std::vector<uint32_t>& values) = 0;
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void FinishLoad() = 0;
virtual bool IsSparse() = 0;
static MultiValBin* CreateMultiValBin(data_size_t num_data, int num_bin, int num_feature, double sparse_rate);
virtual MultiValBin* Clone() = 0;
};
inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) {
......
......@@ -214,6 +214,24 @@ struct Config {
#pragma region Learning Control Parameters
// desc = set ``force_col_wise=true`` will force LightGBM to use col-wise histogram build
// desc = Recommend ``force_col_wise=true`` when:
// descl2 = the number of columns is large, or the total number of bin is large
// descl2 = when ``num_threads`` is large, e.g. ``>20``
// descl2 = want to use small ``feature_fraction``, e.g. ``0.5``, to speed-up
// descl2 = want to reduce memory cost
// desc = when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them, and use the faster one
bool force_col_wise = false;
// desc = set ``force_row_wise=true`` will force LightGBM to use row-wise histogram build
// desc = Recommend ``force_row_wise=true`` when:
// descl2 = the number of data is large, and the number of total bin is relatively small
// descl2 = want to use small ``bagging``, or ``goss``, to speed-up
// descl2 = when ``num_threads`` is relatively small, e.g. ``<=16``
// desc = set ``force_row_wise=true`` will double the memory cost for Dataset object, if your memory is not enough, you can try ``force_col_wise=true``
// desc = when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will first try both of them, and use the faster one.
bool force_row_wise = false;
// desc = limit the max depth for tree model. This is used to deal with over-fitting when ``#data`` is small. Tree still grows leaf-wise
// desc = ``<= 0`` means no limit
int max_depth = -1;
......@@ -534,22 +552,6 @@ struct Config {
// desc = **Note**: disabling this may cause the slow training speed for sparse datasets
bool enable_bundle = true;
// check = >=0.0
// check = <1.0
// desc = max conflict rate for bundles in EFB
// desc = set this to ``0.0`` to disallow the conflict and provide more accurate results
// desc = set this to a larger value to achieve faster speed
double max_conflict_rate = 0.0;
// alias = is_sparse, enable_sparse, sparse
// desc = used to enable/disable sparse optimization
bool is_enable_sparse = true;
// check = >0.0
// check = <=1.0
// desc = the threshold of zero elements percentage for treating a feature as a sparse one
double sparse_threshold = 0.8;
// desc = set this to ``false`` to disable the special handle of missing value
bool use_missing = true;
......
......@@ -8,6 +8,7 @@
#include <LightGBM/config.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/meta.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/random.h>
......@@ -381,6 +382,7 @@ class Dataset {
inline uint64_t NumTotalBin() const {
return group_bin_boundaries_.back();
}
inline std::vector<int> ValidFeatureIndices() const {
std::vector<int> ret;
for (int i = 0; i < num_total_features_; ++i) {
......@@ -394,6 +396,13 @@ class Dataset {
void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
MultiValBin* GetMultiBinFromSparseFeatures() const;
MultiValBin* GetMultiBinFromAllFeatures() const;
MultiValBin* TestMultiThreadingMethod(score_t* gradients, score_t* hessians, const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_colwise, bool force_rowwise, bool* is_hist_col_wise) const;
LIGHTGBM_EXPORT void FinishLoad();
LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
......@@ -423,15 +432,18 @@ class Dataset {
void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>* ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
bool is_constant_hessian,
HistogramBinEntry* histogram_data) const;
const MultiValBin* multi_val_bin, bool is_colwise,
hist_t* histogram_data) const;
void ConstructHistogramsMultiVal(const MultiValBin* multi_val_bin, const data_size_t* data_indices, data_size_t num_data,
const score_t* gradients, const score_t* hessians,
bool is_constant_hessian,
hist_t* histogram_data) const;
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const;
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
inline data_size_t Split(int feature,
const uint32_t* threshold, int num_threshold, bool default_left,
......@@ -496,19 +508,10 @@ class Dataset {
return feature_groups_[group]->bin_mappers_[sub_feature].get();
}
inline const Bin* FeatureBin(int i) const {
const int group = feature2group_[i];
return feature_groups_[group]->bin_data_.get();
}
inline const Bin* FeatureGroupBin(int group) const {
return feature_groups_[group]->bin_data_.get();
}
inline bool FeatureGroupIsSparse(int group) const {
return feature_groups_[group]->is_sparse_;
}
inline BinIterator* FeatureIterator(int i) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
......@@ -519,6 +522,10 @@ class Dataset {
return feature_groups_[group]->FeatureGroupIterator();
}
inline bool IsMultiGroup(int i) const {
return feature_groups_[i]->is_multi_val_;
}
inline double RealThreshold(int i, uint32_t threshold) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
......@@ -532,18 +539,6 @@ class Dataset {
return feature_groups_[group]->bin_mappers_[sub_feature]->ValueToBin(threshold_double);
}
inline void CreateOrderedBins(std::vector<std::unique_ptr<OrderedBin>>* ordered_bins) const {
ordered_bins->resize(num_groups_);
OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_groups_; ++i) {
OMP_LOOP_EX_BEGIN();
ordered_bins->at(i).reset(feature_groups_[i]->bin_data_->CreateOrderedBin());
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
}
/*!
* \brief Get meta data pointer
* \return Pointer of meta data
......@@ -620,7 +615,7 @@ class Dataset {
/*! \brief Disable copy */
Dataset(const Dataset&) = delete;
void addFeaturesFrom(Dataset* other);
void AddFeaturesFrom(Dataset* other);
private:
std::string data_filename_;
......@@ -638,8 +633,6 @@ class Dataset {
Metadata metadata_;
/*! \brief index of label column */
int label_idx_ = 0;
/*! \brief Threshold for treating a feature as a sparse feature */
double sparse_threshold_;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
/*! \brief store feature names */
......@@ -662,6 +655,8 @@ class Dataset {
bool use_missing_;
bool zero_as_missing_;
std::vector<int> feature_need_push_zeros_;
mutable std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>> hist_buf_;
};
} // namespace LightGBM
......
......@@ -30,14 +30,13 @@ class FeatureGroup {
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
*/
FeatureGroup(int num_feature,
FeatureGroup(int num_feature, bool is_multi_val,
std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
data_size_t num_data, double sparse_threshold, bool is_enable_sparse) : num_feature_(num_feature) {
data_size_t num_data) : num_feature_(num_feature), is_multi_val_(is_multi_val), is_sparse_(false) {
CHECK(static_cast<int>(bin_mappers->size()) == num_feature);
// use bin at zero to store most_freq_bin
num_total_bin_ = 1;
bin_offsets_.emplace_back(num_total_bin_);
int cnt_non_zero = 0;
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(bin_mappers->at(i).release());
auto num_bin = bin_mappers_[i]->num_bin();
......@@ -46,18 +45,26 @@ class FeatureGroup {
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers_[i]->sparse_rate()));
}
double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
bin_data_.reset(Bin::CreateBin(num_data, num_total_bin_,
sparse_rate, is_enable_sparse, sparse_threshold, &is_sparse_));
if (is_multi_val_) {
multi_bin_data_.clear();
for (int i = 0; i < num_feature_; ++i) {
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
} else {
multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
}
}
} else {
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
}
}
FeatureGroup(int num_feature,
std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
data_size_t num_data, bool is_sparse) : num_feature_(num_feature) {
CHECK(static_cast<int>(bin_mappers->size()) == num_feature);
// use bin at zero to store most_freq_bin
FeatureGroup(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
data_size_t num_data) : num_feature_(1), is_multi_val_(false) {
CHECK(static_cast<int>(bin_mappers->size()) == 1);
// use bin at zero to store default_bin
num_total_bin_ = 1;
bin_offsets_.emplace_back(num_total_bin_);
for (int i = 0; i < num_feature_; ++i) {
......@@ -69,13 +76,15 @@ class FeatureGroup {
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
}
is_sparse_ = is_sparse;
if (is_sparse_) {
if (bin_mappers_[0]->sparse_rate() >= kSparseThreshold) {
is_sparse_ = true;
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else {
is_sparse_ = false;
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
}
}
/*!
* \brief Constructor from memory
* \param memory Pointer of memory
......@@ -86,6 +95,8 @@ class FeatureGroup {
const std::vector<data_size_t>& local_used_indices) {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get is_sparse
is_multi_val_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_multi_val_);
is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_sparse_);
num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
......@@ -110,13 +121,26 @@ class FeatureGroup {
if (!local_used_indices.empty()) {
num_data = static_cast<data_size_t>(local_used_indices.size());
}
if (is_sparse_) {
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
if (is_multi_val_) {
for (int i = 0; i < num_feature_; ++i) {
int addi = bin_mappers_[i]->GetMostFreqBin() == 0 ? 0 : 1;
if (bin_mappers_[i]->sparse_rate() >= kSparseThreshold) {
multi_bin_data_.emplace_back(Bin::CreateSparseBin(num_data, bin_mappers_[i]->num_bin() + addi));
} else {
multi_bin_data_.emplace_back(Bin::CreateDenseBin(num_data, bin_mappers_[i]->num_bin() + addi));
}
multi_bin_data_.back()->LoadFromMemory(memory_ptr, local_used_indices);
memory_ptr += multi_bin_data_.back()->SizesInByte();
}
} else {
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
if (is_sparse_) {
bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else {
bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
}
// get bin data
bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
}
// get bin data
bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
}
/*! \brief Destructor */
~FeatureGroup() {
......@@ -131,22 +155,54 @@ class FeatureGroup {
inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
if (bin == bin_mappers_[sub_feature_idx]->GetMostFreqBin()) { return; }
bin += bin_offsets_[sub_feature_idx];
if (bin_mappers_[sub_feature_idx]->GetMostFreqBin() == 0) {
bin -= 1;
}
bin_data_->Push(tid, line_idx, bin);
if (is_multi_val_) {
multi_bin_data_[sub_feature_idx]->Push(tid, line_idx, bin + 1);
} else {
bin += bin_offsets_[sub_feature_idx];
bin_data_->Push(tid, line_idx, bin);
}
}
inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
if (!is_multi_val_) {
bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
} else {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->CopySubset(full_feature->multi_bin_data_[i].get(), used_indices, num_used_indices);
}
}
}
inline BinIterator* SubFeatureIterator(int sub_feature) {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
if (!is_multi_val_) {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
return bin_data_->GetIterator(min_bin, max_bin, most_freq_bin);
} else {
int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
uint32_t min_bin = 1;
uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
return multi_bin_data_[sub_feature]->GetIterator(min_bin, max_bin, most_freq_bin);
}
}
inline void FinishLoad() {
if (is_multi_val_) {
OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_feature_; ++i) {
OMP_LOOP_EX_BEGIN();
multi_bin_data_[i]->FinishLoad();
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else {
bin_data_->FinishLoad();
}
}
/*!
......@@ -155,6 +211,9 @@ class FeatureGroup {
* \return A pointer to the BinIterator object
*/
inline BinIterator* FeatureGroupIterator() {
if (is_multi_val_) {
return nullptr;
}
uint32_t min_bin = bin_offsets_[0];
uint32_t max_bin = bin_offsets_.back() - 1;
uint32_t most_freq_bin = 0;
......@@ -168,17 +227,29 @@ class FeatureGroup {
bool default_left,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
uint32_t most_freq_bin = bin_mappers_[sub_feature]->GetMostFreqBin();
if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
*threshold, data_indices, num_data, lte_indices, gt_indices);
if (!is_multi_val_) {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return bin_data_->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
*threshold, data_indices, num_data, lte_indices, gt_indices);
} else {
return bin_data_->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
}
} else {
return bin_data_->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
int addi = bin_mappers_[sub_feature]->GetMostFreqBin() == 0 ? 0 : 1;
uint32_t min_bin = 1;
uint32_t max_bin = bin_mappers_[sub_feature]->num_bin() - 1 + addi;
if (bin_mappers_[sub_feature]->bin_type() == BinType::NumericalBin) {
auto missing_type = bin_mappers_[sub_feature]->missing_type();
return multi_bin_data_[sub_feature]->Split(min_bin, max_bin, default_bin, most_freq_bin, missing_type, default_left,
*threshold, data_indices, num_data, lte_indices, gt_indices);
} else {
return multi_bin_data_[sub_feature]->SplitCategorical(min_bin, max_bin, most_freq_bin, threshold, num_threshold, data_indices, num_data, lte_indices, gt_indices);
}
}
}
/*!
......@@ -195,22 +266,35 @@ class FeatureGroup {
* \param file File want to write
*/
void SaveBinaryToFile(const VirtualFileWriter* writer) const {
writer->Write(&is_multi_val_, sizeof(is_multi_val_));
writer->Write(&is_sparse_, sizeof(is_sparse_));
writer->Write(&num_feature_, sizeof(num_feature_));
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_[i]->SaveBinaryToFile(writer);
}
bin_data_->SaveBinaryToFile(writer);
if (is_multi_val_) {
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_[i]->SaveBinaryToFile(writer);
}
} else {
bin_data_->SaveBinaryToFile(writer);
}
}
/*!
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const {
size_t ret = sizeof(is_sparse_) + sizeof(num_feature_);
size_t ret = sizeof(is_multi_val_) + sizeof(is_sparse_) + sizeof(num_feature_);
for (int i = 0; i < num_feature_; ++i) {
ret += bin_mappers_[i]->SizesInByte();
}
ret += bin_data_->SizesInByte();
if (!is_multi_val_) {
ret += bin_data_->SizesInByte();
} else {
for (int i = 0; i < num_feature_; ++i) {
ret += multi_bin_data_[i]->SizesInByte();
}
}
return ret;
}
/*! \brief Disable copy */
......@@ -218,6 +302,7 @@ class FeatureGroup {
/*! \brief Deep copy */
FeatureGroup(const FeatureGroup& other) {
num_feature_ = other.num_feature_;
is_multi_val_ = other.is_multi_val_;
is_sparse_ = other.is_sparse_;
num_total_bin_ = other.num_total_bin_;
bin_offsets_ = other.bin_offsets_;
......@@ -226,8 +311,14 @@ class FeatureGroup {
for (auto& bin_mapper : other.bin_mappers_) {
bin_mappers_.emplace_back(new BinMapper(*bin_mapper));
}
bin_data_.reset(other.bin_data_->Clone());
if (!is_multi_val_) {
bin_data_.reset(other.bin_data_->Clone());
} else {
multi_bin_data_.clear();
for (int i = 0; i < num_feature_; ++i) {
multi_bin_data_.emplace_back(other.multi_bin_data_[i]->Clone());
}
}
}
private:
......@@ -239,7 +330,9 @@ class FeatureGroup {
std::vector<uint32_t> bin_offsets_;
/*! \brief Bin data of this feature */
std::unique_ptr<Bin> bin_data_;
std::vector<std::unique_ptr<Bin>> multi_bin_data_;
/*! \brief True if this feature is sparse */
bool is_multi_val_;
bool is_sparse_;
int num_total_bin_;
};
......
......@@ -71,8 +71,9 @@ typedef void(*AllgatherFunction)(char* input, comm_size_t input_size, const comm
#define NO_SPECIFIC (-1)
// Prefetch size is usually 64 bytes
const int kCacheLineSize = 64;
const int kAlignedSize = 32;
#define SIZE_ALIGNED(t) ((t) + kAlignedSize - 1) / kAlignedSize * kAlignedSize
} // namespace LightGBM
......
......@@ -213,6 +213,7 @@ class Tree {
void RecomputeMaxDepth();
int NextLeafId() const { return num_leaves_; }
private:
std::string NumericalDecisionIfElse(int node) const;
......
......@@ -71,6 +71,8 @@ class TreeLearner {
virtual void SetBaggingData(const data_size_t* used_indices,
data_size_t num_data) = 0;
virtual bool IsHistColWise() const = 0;
/*!
* \brief Using last trained tree to predict score then adding to out_score;
* \param out_score output score
......
......@@ -11,22 +11,36 @@
#include <limits>
#include <string>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <iomanip>
#include <iterator>
#include <map>
#include <memory>
#include <sstream>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
// Cross-platform aligned allocation pair (_mm_malloc/_mm_free).
// (The stale `#ifdef _MSC_VER` / `#include "intrin.h"` lines left over from the
// old version created an unterminated conditional and are removed.)
#if defined(_MSC_VER)
#include <malloc.h>
#elif MM_MALLOC
#include <mm_malloc.h>
#elif defined(__GNUC__)
// glibc's memalign has the same contract with swapped argument order.
#include <malloc.h>
#define _mm_malloc(a, b) memalign(b, a)
#define _mm_free(a) free(a)
#else
// Fallback: plain malloc; alignment beyond malloc's default is NOT guaranteed here.
#include <stdlib.h>
#define _mm_malloc(a, b) malloc(a)
#define _mm_free(a) free(a)
#endif
namespace LightGBM {
namespace Common {
......@@ -946,8 +960,133 @@ inline bool CheckAllowedJSON(const std::string& s) {
return true;
}
/*!
 * \brief Round a double to the nearest int (halves round away from zero).
 * \param x value to round
 * \return nearest integer as int
 */
inline int RoundInt(double x) {
  // std::lround handles negative inputs correctly; the previous
  // static_cast<int>(x + 0.5f) truncated toward zero, so e.g. -2.6
  // rounded to -2 instead of -3.
  return static_cast<int>(std::lround(x));
}
/*!
 * \brief STL-compatible allocator returning N-byte aligned storage via
 *        _mm_malloc/_mm_free. Stateless: every instance compares equal, so
 *        storage from one instance may be freed by any other.
 */
template <typename T, std::size_t N = 32>
class AlignmentAllocator {
 public:
  typedef T value_type;
  typedef std::size_t size_type;
  typedef std::ptrdiff_t difference_type;
  typedef T* pointer;
  typedef const T* const_pointer;
  typedef T& reference;
  typedef const T& const_reference;

 public:
  inline AlignmentAllocator() throw () {}

  template <typename T2>
  inline AlignmentAllocator(const AlignmentAllocator<T2, N>&) throw () {}

  inline ~AlignmentAllocator() throw () {}

  // Correct spelling expected by the (pre-C++20) Allocator interface.
  inline pointer address(reference r) {
    return &r;
  }

  inline const_pointer address(const_reference r) const {
    return &r;
  }

  // Misspelled variants kept for backward compatibility with existing callers.
  inline pointer adress(reference r) {
    return &r;
  }

  inline const_pointer adress(const_reference r) const {
    return &r;
  }

  /*! \brief Allocate raw storage for n objects, aligned to N bytes. */
  inline pointer allocate(size_type n) {
    return (pointer)_mm_malloc(n * sizeof(value_type), N);
  }

  /*! \brief Release storage previously obtained from allocate(). */
  inline void deallocate(pointer p, size_type) {
    _mm_free(p);
  }

  /*! \brief Copy-construct an object in already-allocated storage. */
  inline void construct(pointer p, const value_type& wert) {
    new (p) value_type(wert);
  }

  /*! \brief Destroy an object without releasing its storage. */
  inline void destroy(pointer p) {
    p->~value_type();
  }

  inline size_type max_size() const throw () {
    return size_type(-1) / sizeof(value_type);
  }

  template <typename T2>
  struct rebind {
    typedef AlignmentAllocator<T2, N> other;
  };

  bool operator!=(const AlignmentAllocator<T, N>& other) const {
    return !(*this == other);
  }

  // Returns true if and only if storage allocated from *this
  // can be deallocated from other, and vice versa.
  // Always returns true for stateless allocators.
  bool operator==(const AlignmentAllocator<T, N>&) const {
    return true;
  }
};
// Accumulates wall-clock time per named tag; prints all stats on destruction.
// Compiles to no-ops unless TIMETAG is defined.
// Note: this class is not thread-safe, don't use it inside omp blocks
class Timer {
 public:
  Timer() {}
  ~Timer() {
    Print();
  }

#ifdef TIMETAG
  /*! \brief Record the starting time point for the given tag. */
  void Start(const std::string& name) {
    auto cur_time = std::chrono::steady_clock::now();
    start_time_[name] = cur_time;
  }

  /*! \brief Accumulate elapsed time since the matching Start call. */
  void Stop(const std::string& name) {
    if (stats_.find(name) == stats_.end()) {
      stats_[name] = std::chrono::duration<double, std::milli>(0);
    }
    stats_[name] += std::chrono::steady_clock::now() - start_time_[name];
  }
#else
  void Start(const std::string&) { }
  void Stop(const std::string&) { }
#endif  // TIMETAG

  /*! \brief Print accumulated timings in seconds, sorted by tag name. */
  void Print() const {
#ifdef TIMETAG
    // Copy into an ordered map so output is sorted by tag.
    std::map<std::string, std::chrono::duration<double, std::milli>> ordered(stats_.begin(), stats_.end());
    for (auto it = ordered.begin(); it != ordered.end(); ++it) {
      // .count() is required: passing a chrono::duration object through C
      // varargs for %f is undefined behavior; count() yields ms, * 1e-3 -> s.
      Log::Info("%s costs:\t %f ", it->first.c_str(), it->second.count() * 1e-3);
    }
#endif
  }

  std::unordered_map<std::string, std::chrono::steady_clock::time_point> start_time_;
  std::unordered_map<std::string, std::chrono::duration<double, std::milli>> stats_;
};
// RAII scope guard: starts the given timer tag on construction and stops it
// when the scope exits.
// Note: this class is not thread-safe, don't use it inside omp blocks
class FunctionTimer {
 public:
  FunctionTimer(const std::string& name, Timer& timer) : timer_(timer) {
    timer.Start(name);
#ifdef TIMETAG
    // The tag is only stored when timing is enabled; otherwise Stop("") in
    // the destructor is a harmless no-op.
    tag_ = name;
#endif  // TIMETAG
  }

  ~FunctionTimer() { timer_.Stop(tag_); }

 private:
  std::string tag_;
  Timer& timer_;
};
} // namespace Common
extern Common::Timer global_timer;
} // namespace LightGBM
#endif // LightGBM_UTILS_COMMON_FUN_H_
......@@ -27,6 +27,8 @@
namespace LightGBM {
Common::Timer global_timer;
Application::Application(int argc, char** argv) {
LoadParameters(argc, argv);
// set number of threads for openmp
......
......@@ -62,7 +62,7 @@ class Predictor {
boosting_ = boosting;
num_pred_one_row_ = boosting_->NumPredictOneRow(num_iteration, predict_leaf_index, predict_contrib);
num_feature_ = boosting_->MaxFeatureIdx() + 1;
predict_buf_ = std::vector<std::vector<double>>(num_threads_, std::vector<double>(num_feature_, 0.0f));
predict_buf_.resize(num_threads_, std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>(num_feature_, 0.0f));
const int kFeatureThreshold = 100000;
const size_t KSparseThreshold = static_cast<size_t>(0.01 * num_feature_);
if (predict_leaf_index) {
......@@ -263,7 +263,7 @@ class Predictor {
int num_feature_;
int num_pred_one_row_;
int num_threads_;
std::vector<std::vector<double>> predict_buf_;
std::vector<std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>> predict_buf_;
};
} // namespace LightGBM
......
......@@ -17,7 +17,6 @@
namespace LightGBM {
GBDT::GBDT() : iter_(0),
train_data_(nullptr),
objective_function_(nullptr),
......@@ -41,6 +40,7 @@ balanced_bagging_(false) {
}
GBDT::~GBDT() {
}
void GBDT::Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
......@@ -148,6 +148,7 @@ void GBDT::AddValidDataset(const Dataset* valid_data,
}
void GBDT::Boosting() {
Common::FunctionTimer fun_timer("GBDT::Boosting", global_timer);
if (objective_function_ == nullptr) {
Log::Fatal("No object function provided");
}
......@@ -208,23 +209,26 @@ data_size_t GBDT::BalancedBaggingHelper(Random* cur_rand, data_size_t start, dat
}
void GBDT::Bagging(int iter) {
Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer);
// if need bagging
if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0)
|| need_re_bagging_) {
need_re_bagging_ = false;
const data_size_t min_inner_size = 1000;
data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
if (inner_size < min_inner_size) { inner_size = min_inner_size; }
const data_size_t min_inner_size = 1024;
const int n_block = std::min(
num_threads_, (num_data_ + min_inner_size - 1) / min_inner_size);
data_size_t inner_size = SIZE_ALIGNED((num_data_ + n_block - 1) / n_block);
OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
for (int i = 0; i < n_block; ++i) {
OMP_LOOP_EX_BEGIN();
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
data_size_t cur_start = i * inner_size;
if (cur_start > num_data_) { continue; }
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
data_size_t cur_cnt = std::min(inner_size, num_data_ - cur_start);
if (cur_cnt <= 0) {
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
continue;
}
Random cur_rand(config_->bagging_seed + iter * num_threads_ + i);
data_size_t cur_left_count = 0;
if (balanced_bagging_) {
......@@ -241,15 +245,14 @@ void GBDT::Bagging(int iter) {
data_size_t left_cnt = 0;
left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0;
for (int i = 1; i < num_threads_; ++i) {
for (int i = 1; i < n_block; ++i) {
left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
}
left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
left_cnt = left_write_pos_buf_[n_block - 1] + left_cnts_buf_[n_block - 1];
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
OMP_LOOP_EX_BEGIN();
for (int i = 0; i < n_block; ++i) {
if (left_cnts_buf_[i] > 0) {
std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
......@@ -258,9 +261,7 @@ void GBDT::Bagging(int iter) {
std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
tmp_indices_.data() + offsets_buf_[i] + left_cnts_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
bag_data_cnt_ = left_cnt;
Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
// set bagging data to tree learner
......@@ -276,6 +277,7 @@ void GBDT::Bagging(int iter) {
}
void GBDT::Train(int snapshot_freq, const std::string& model_output_path) {
Common::FunctionTimer fun_timer("GBDT::Train", global_timer);
bool is_finished = false;
auto start_time = std::chrono::steady_clock::now();
for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) {
......@@ -342,6 +344,7 @@ double ObtainAutomaticInitialScore(const ObjectiveFunction* fobj, int class_id)
}
double GBDT::BoostFromAverage(int class_id, bool update_scorer) {
Common::FunctionTimer fun_timer("GBDT::BoostFromAverage", global_timer);
// boosting from average label; or customized "average" if implemented for the current objective
if (models_.empty() && !train_score_updater_->has_init_score() && objective_function_ != nullptr) {
if (config_->boost_from_average || (train_data_ != nullptr && train_data_->num_features() == 0)) {
......@@ -366,6 +369,7 @@ double GBDT::BoostFromAverage(int class_id, bool update_scorer) {
}
bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
Common::FunctionTimer fun_timer("GBDT::TrainOneIter", global_timer);
std::vector<double> init_scores(num_tree_per_iteration_, 0.0);
// boosting first
if (gradients == nullptr || hessians == nullptr) {
......@@ -486,6 +490,7 @@ bool GBDT::EvalAndCheckEarlyStopping() {
}
void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer);
// update training score
if (!is_use_subset_) {
train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id);
......@@ -755,17 +760,10 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
right_write_pos_buf_.resize(num_threads_);
double average_bag_rate = (bag_data_cnt_ / num_data_) / config->bagging_freq;
int sparse_group = 0;
for (int i = 0; i < train_data_->num_feature_groups(); ++i) {
if (train_data_->FeatureGroupIsSparse(i)) {
++sparse_group;
}
}
is_use_subset_ = false;
const int group_threshold_usesubset = 100;
const int sparse_group_threshold_usesubset = train_data_->num_feature_groups() / 4;
if (average_bag_rate <= 0.5
&& (train_data_->num_feature_groups() < group_threshold_usesubset || sparse_group < sparse_group_threshold_usesubset)) {
if (tree_learner_->IsHistColWise() && average_bag_rate <= 0.5
&& (train_data_->num_feature_groups() < group_threshold_usesubset)) {
if (tmp_subset_ == nullptr || is_change_dataset) {
tmp_subset_.reset(new Dataset(bag_data_cnt_));
tmp_subset_->CopyFeatureMapperFrom(train_data_);
......
......@@ -457,11 +457,11 @@ class GBDT : public GBDTBase {
/*! \brief Max feature index of training data*/
int max_feature_idx_;
/*! \brief First order derivative of training data */
std::vector<score_t> gradients_;
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> gradients_;
/*! \brief Secend order derivative of training data */
std::vector<score_t> hessians_;
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> hessians_;
/*! \brief Store the indices of in-bag data */
std::vector<data_size_t> bag_data_indices_;
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> bag_data_indices_;
/*! \brief Number of in-bag data */
data_size_t bag_data_cnt_;
/*! \brief Store the indices of in-bag data */
......
......@@ -22,10 +22,6 @@
namespace LightGBM {
#ifdef TIMETAG
std::chrono::duration<double, std::milli> subset_time;
std::chrono::duration<double, std::milli> re_init_tree_time;
#endif
class GOSS: public GBDT {
public:
......@@ -36,10 +32,7 @@ class GOSS: public GBDT {
}
~GOSS() {
#ifdef TIMETAG
Log::Info("GOSS::subset costs %f", subset_time * 1e-3);
Log::Info("GOSS::re_init_tree costs %f", re_init_tree_time * 1e-3);
#endif
}
void Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function,
......@@ -143,19 +136,21 @@ class GOSS: public GBDT {
// not subsample for first iterations
if (iter < static_cast<int>(1.0f / config_->learning_rate)) { return; }
const data_size_t min_inner_size = 100;
data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
if (inner_size < min_inner_size) { inner_size = min_inner_size; }
const data_size_t min_inner_size = 128;
const int n_block = std::min(
num_threads_, (num_data_ + min_inner_size - 1) / min_inner_size);
data_size_t inner_size = SIZE_ALIGNED((num_data_ + n_block - 1) / n_block);
OMP_INIT_EX();
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
for (int i = 0; i < n_block; ++i) {
OMP_LOOP_EX_BEGIN();
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
data_size_t cur_start = i * inner_size;
if (cur_start > num_data_) { continue; }
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
data_size_t cur_cnt = std::min(inner_size, num_data_ - cur_start);
if (cur_cnt <= 0) {
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
continue;
}
Random cur_rand(config_->bagging_seed + iter * num_threads_ + i);
data_size_t cur_left_count = BaggingHelper(&cur_rand, cur_start, cur_cnt,
tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start);
......@@ -168,14 +163,14 @@ class GOSS: public GBDT {
data_size_t left_cnt = 0;
left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0;
for (int i = 1; i < num_threads_; ++i) {
for (int i = 1; i < n_block; ++i) {
left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
}
left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
left_cnt = left_write_pos_buf_[n_block - 1] + left_cnts_buf_[n_block - 1];
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
for (int i = 0; i < n_block; ++i) {
OMP_LOOP_EX_BEGIN();
if (left_cnts_buf_[i] > 0) {
std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
......@@ -193,22 +188,10 @@ class GOSS: public GBDT {
if (!is_use_subset_) {
tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
} else {
// get subset
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
#ifdef TIMETAG
subset_time += std::chrono::steady_clock::now() - start_time;
#endif
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
tree_learner_->ResetTrainingData(tmp_subset_.get());
#ifdef TIMETAG
re_init_tree_time += std::chrono::steady_clock::now() - start_time;
#endif
}
}
......
......@@ -55,6 +55,7 @@ class ScoreUpdater {
inline bool has_init_score() const { return has_init_score_; }
inline void AddScore(double val, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_data_; ++i) {
......@@ -76,6 +77,7 @@ class ScoreUpdater {
* \param cur_tree_id Current tree for multiclass training
*/
inline void AddScore(const Tree* tree, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree->AddPredictionToScore(data_, num_data_, score_.data() + offset);
}
......@@ -87,6 +89,7 @@ class ScoreUpdater {
* \param cur_tree_id Current tree for multiclass training
*/
inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree_learner->AddPredictionToScore(tree, score_.data() + offset);
}
......@@ -100,6 +103,7 @@ class ScoreUpdater {
*/
inline void AddScore(const Tree* tree, const data_size_t* data_indices,
data_size_t data_cnt, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + offset);
}
......@@ -119,7 +123,7 @@ class ScoreUpdater {
/*! \brief Pointer of data set */
const Dataset* data_;
/*! \brief Scores for data set */
std::vector<double> score_;
std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>> score_;
bool has_init_score_;
};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment