"...git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "d43a6a3c37bca1fc1bb98b5f8da1bcbc065c1e2b"
Commit e984b0d6 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

Handle for missing values (#516)

parent e8cc6ab9
...@@ -360,6 +360,7 @@ public: ...@@ -360,6 +360,7 @@ public:
* \param min_bin min_bin of current used feature * \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature * \param max_bin max_bin of current used feature
* \param default_bin defualt bin if bin not in [min_bin, max_bin] * \param default_bin defualt bin if bin not in [min_bin, max_bin]
* \param default_bin_for_zero default bin for the zero (missing) bin
* \param threshold The split threshold. * \param threshold The split threshold.
* \param data_indices Used data indices. After called this function. The less than or equal data indices will store on this object. * \param data_indices Used data indices. After called this function. The less than or equal data indices will store on this object.
* \param num_data Number of used data * \param num_data Number of used data
...@@ -369,7 +370,7 @@ public: ...@@ -369,7 +370,7 @@ public:
* \return The number of less than or equal data. * \return The number of less than or equal data.
*/ */
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, uint32_t threshold, uint32_t default_bin, uint32_t default_bin_for_zero, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0; data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;
......
...@@ -402,12 +402,12 @@ public: ...@@ -402,12 +402,12 @@ public:
HistogramBinEntry* data) const; HistogramBinEntry* data) const;
inline data_size_t Split(int feature, inline data_size_t Split(int feature,
uint32_t threshold, uint32_t threshold, uint32_t default_bin_for_zero,
data_size_t* data_indices, data_size_t num_data, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const { data_size_t* lte_indices, data_size_t* gt_indices) const {
const int group = feature2group_[feature]; const int group = feature2group_[feature];
const int sub_feature = feature2subfeature_[feature]; const int sub_feature = feature2subfeature_[feature];
return feature_groups_[group]->Split(sub_feature, threshold, data_indices, num_data, lte_indices, gt_indices); return feature_groups_[group]->Split(sub_feature, threshold, default_bin_for_zero, data_indices, num_data, lte_indices, gt_indices);
} }
inline int SubFeatureBinOffset(int i) const { inline int SubFeatureBinOffset(int i) const {
......
...@@ -161,13 +161,14 @@ public: ...@@ -161,13 +161,14 @@ public:
inline data_size_t Split( inline data_size_t Split(
int sub_feature, int sub_feature,
uint32_t threshold, uint32_t threshold,
uint32_t default_bin_for_zero,
data_size_t* data_indices, data_size_t num_data, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const { data_size_t* lte_indices, data_size_t* gt_indices) const {
uint32_t min_bin = bin_offsets_[sub_feature]; uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1; uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin(); uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->Split(min_bin, max_bin, default_bin, return bin_data_->Split(min_bin, max_bin, default_bin, default_bin_for_zero,
threshold, data_indices, num_data, lte_indices, gt_indices, bin_mappers_[sub_feature]->bin_type()); threshold, data_indices, num_data, lte_indices, gt_indices, bin_mappers_[sub_feature]->bin_type());
} }
/*! /*!
......
...@@ -19,6 +19,8 @@ const score_t kMinScore = -std::numeric_limits<score_t>::infinity(); ...@@ -19,6 +19,8 @@ const score_t kMinScore = -std::numeric_limits<score_t>::infinity();
const score_t kEpsilon = 1e-15f; const score_t kEpsilon = 1e-15f;
const double kMissingValueRange = 1e-20f;
using ReduceFunction = std::function<void(const char*, char*, int)>; using ReduceFunction = std::function<void(const char*, char*, int)>;
using PredictFunction = using PredictFunction =
......
...@@ -44,11 +44,15 @@ public: ...@@ -44,11 +44,15 @@ public:
* \param left_cnt Count of left child * \param left_cnt Count of left child
* \param right_cnt Count of right child * \param right_cnt Count of right child
* \param gain Split gain * \param gain Split gain
* \param zero_bin bin value for value==0 (missing value)
* \param default_bin_for_zero default conversion for the missing value, in bin
* \param default_value default conversion for the missing value, in float value
* \return The index of new leaf. * \return The index of new leaf.
*/ */
int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature, int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
double threshold_double, double left_value, double threshold_double, double left_value, double right_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain); data_size_t left_cnt, data_size_t right_cnt, double gain,
uint32_t zero_bin, uint32_t default_bin_for_zero, double default_value);
/*! \brief Get the output of one leaf */ /*! \brief Get the output of one leaf */
inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; } inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
...@@ -140,6 +144,23 @@ public: ...@@ -140,6 +144,23 @@ public:
} }
} }
/*!
* \brief Substitutes the learned default for a feature value that falls in
*        the zero (missing) range (-zero, zero]; other values pass through.
* \param fval Raw feature value
* \param zero Half-width of the range treated as zero/missing
* \param out Value to substitute when fval is inside the range
* \return out when fval is in (-zero, zero], otherwise fval unchanged
*/
static double DefaultValueForZero(double fval, double zero, double out) {
const bool treat_as_zero = (fval > -zero) && (fval <= zero);
return treat_as_zero ? out : fval;
}
/*!
* \brief Bin-space counterpart: substitutes the default bin when the
*        feature's bin equals the zero (missing) bin; other bins pass through.
* \param fval Bin value of the feature
* \param zero Bin representing the zero/missing value
* \param out Bin to substitute when fval equals the zero bin
* \return out when fval == zero, otherwise fval unchanged
*/
static uint32_t DefaultValueForZero(uint32_t fval, uint32_t zero, uint32_t out) {
if (fval != zero) {
return fval;
}
return out;
}
static const char* GetDecisionTypeName(int8_t type) { static const char* GetDecisionTypeName(int8_t type) {
if (type == 0) { if (type == 0) {
return "no_greater"; return "no_greater";
...@@ -176,7 +197,7 @@ private: ...@@ -176,7 +197,7 @@ private:
/*! \brief A non-leaf node's right child */ /*! \brief A non-leaf node's right child */
std::vector<int> right_child_; std::vector<int> right_child_;
/*! \brief A non-leaf node's split feature */ /*! \brief A non-leaf node's split feature */
std::vector<int> split_feature_inner; std::vector<int> split_feature_inner_;
/*! \brief A non-leaf node's split feature, the original index */ /*! \brief A non-leaf node's split feature, the original index */
std::vector<int> split_feature_; std::vector<int> split_feature_;
/*! \brief A non-leaf node's split threshold in bin */ /*! \brief A non-leaf node's split threshold in bin */
...@@ -185,6 +206,10 @@ private: ...@@ -185,6 +206,10 @@ private:
std::vector<double> threshold_; std::vector<double> threshold_;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */ /*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
std::vector<int8_t> decision_type_; std::vector<int8_t> decision_type_;
/*! \brief Default values for the na/0 feature values */
std::vector<double> default_value_;
std::vector<uint32_t> zero_bin_;
std::vector<uint32_t> default_bin_for_zero_;
/*! \brief A non-leaf node's split gain */ /*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_; std::vector<double> split_gain_;
// used for leaf node // used for leaf node
...@@ -226,8 +251,9 @@ inline int Tree::GetLeaf(const double* feature_values) const { ...@@ -226,8 +251,9 @@ inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0; int node = 0;
if (has_categorical_) { if (has_categorical_) {
while (node >= 0) { while (node >= 0) {
double fval = DefaultValueForZero(feature_values[split_feature_[node]], kMissingValueRange, default_value_[node]);
if (decision_funs[decision_type_[node]]( if (decision_funs[decision_type_[node]](
feature_values[split_feature_[node]], fval,
threshold_[node])) { threshold_[node])) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
...@@ -236,8 +262,9 @@ inline int Tree::GetLeaf(const double* feature_values) const { ...@@ -236,8 +262,9 @@ inline int Tree::GetLeaf(const double* feature_values) const {
} }
} else { } else {
while (node >= 0) { while (node >= 0) {
double fval = DefaultValueForZero(feature_values[split_feature_[node]], kMissingValueRange, default_value_[node]);
if (NumericalDecision<double>( if (NumericalDecision<double>(
feature_values[split_feature_[node]], fval,
threshold_[node])) { threshold_[node])) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
......
...@@ -462,6 +462,16 @@ inline static std::vector<int> VectorSize(const std::vector<std::vector<T>>& dat ...@@ -462,6 +462,16 @@ inline static std::vector<int> VectorSize(const std::vector<std::vector<T>>& dat
return ret; return ret;
} }
/*!
* \brief Clamps a double into the finite range so infinities never propagate.
* \param x Input value (may be +/- infinity; NaN is returned unchanged,
*          since neither comparison holds for NaN)
* \return x clamped to [lowest finite double, largest finite double]
*/
inline static double AvoidInf(double x) {
if (x >= std::numeric_limits<double>::max()) {
return std::numeric_limits<double>::max();
} else if (x <= std::numeric_limits<double>::lowest()) {
// lowest() (most negative finite double), NOT min(): min() is the
// smallest *positive* normal double, so comparing against it would
// clamp every negative input to a tiny positive value.
return std::numeric_limits<double>::lowest();
} else {
return x;
}
}
} // namespace Common } // namespace Common
} // namespace LightGBM } // namespace LightGBM
......
...@@ -353,7 +353,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is ...@@ -353,7 +353,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
} }
init_score /= num_data_; init_score /= num_data_;
std::unique_ptr<Tree> new_tree(new Tree(2)); std::unique_ptr<Tree> new_tree(new Tree(2));
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, num_data_, -1); new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, num_data_, -1, 0, 0, 0);
train_score_updater_->AddScore(init_score, 0); train_score_updater_->AddScore(init_score, 0);
for (auto& score_updater : valid_score_updater_) { for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(init_score, 0); score_updater->AddScore(init_score, 0);
...@@ -432,7 +432,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is ...@@ -432,7 +432,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) { if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
auto output = class_default_output_[cur_tree_id]; auto output = class_default_output_[cur_tree_id];
new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
output, output, 0, num_data_, -1); output, output, 0, num_data_, -1, 0, 0, 0);
train_score_updater_->AddScore(output, cur_tree_id); train_score_updater_->AddScore(output, cur_tree_id);
for (auto& score_updater : valid_score_updater_) { for (auto& score_updater : valid_score_updater_) {
score_updater->AddScore(output, cur_tree_id); score_updater->AddScore(output, cur_tree_id);
......
...@@ -63,6 +63,76 @@ bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, Bin ...@@ -63,6 +63,76 @@ bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, Bin
} }
return true; return true;
} }
// Computes histogram bin upper bounds for one numerical feature.
// distinct_values: sorted distinct sample values; counts: per-value sample
// counts (parallel array); total_cnt: sum of counts; min_data_in_bin: minimum
// samples per bin (merges neighbouring values to satisfy it).
// Returns the bin upper bounds; the last bound is always +infinity.
// NOTE(review): assumes num_distinct_values >= 1 — with 0 distinct values the
// first branch reads counts[-1]; confirm callers guarantee non-empty input.
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, int total_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
if (num_distinct_values <= max_bin) {
// Few distinct values: give (nearly) each value its own bin, closing a bin
// only once it has accumulated min_data_in_bin samples. Bounds are the
// midpoints between adjacent distinct values.
bin_upper_bound.clear();
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
cur_cnt_inbin += counts[i];
if (cur_cnt_inbin >= min_data_in_bin) {
bin_upper_bound.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
cur_cnt_inbin = 0;
}
}
// The last value always falls into the final, unbounded bin.
cur_cnt_inbin += counts[num_distinct_values - 1];
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
} else {
// More distinct values than bins: greedily build bins of roughly equal
// sample count.
if (min_data_in_bin > 0) {
// Cap the number of bins so each can hold at least min_data_in_bin samples.
max_bin = std::min(max_bin, static_cast<int>(total_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
}
double mean_bin_size = static_cast<double>(total_cnt) / max_bin;
// mean size for one bin
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(total_cnt);
// A "big" value has at least a mean bin's worth of samples; it gets a bin
// to itself and is excluded when re-estimating the mean for the rest.
std::vector<bool> is_big_count_value(num_distinct_values, false);
for (int i = 0; i < num_distinct_values; ++i) {
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
rest_sample_cnt -= counts[i];
}
}
// NOTE(review): if every value is "big", rest_bin_cnt is 0 and this divide
// yields inf/NaN — presumably unreachable given the branch condition; verify.
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
}
cur_cnt_inbin += counts[i];
// need a new bin: current value is big, the bin is full, or the next value
// is big and this bin is already at least half full.
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
// Reserve the final slot for the catch-all last bin.
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
// Re-estimate the target size from the remaining samples and bins.
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
}
}
++bin_cnt;
// update bin upper bound: midpoint between a bin's last value and the next
// bin's first value.
bin_upper_bound.resize(bin_cnt);
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
}
// last bin upper bound
bin_upper_bound[bin_cnt - 1] = std::numeric_limits<double>::infinity();
}
return bin_upper_bound;
}
void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt, void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type) { int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type) {
...@@ -109,81 +179,62 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp ...@@ -109,81 +179,62 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp
std::vector<int> cnt_in_bin; std::vector<int> cnt_in_bin;
int num_distinct_values = static_cast<int>(distinct_values.size()); int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) { if (bin_type_ == BinType::NumericalBin) {
if (num_distinct_values <= max_bin) { bin_upper_bound_.clear();
// use distinct value is enough int left_cnt_data = 0;
bin_upper_bound_.clear(); int missing_cnt_data = 0;
int cur_cnt_inbin = 0; int right_cnt_data = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) { for (int i = 0; i < num_distinct_values; ++i) {
cur_cnt_inbin += counts[i]; if (distinct_values[i] <= -kMissingValueRange) {
if (cur_cnt_inbin >= min_data_in_bin) { left_cnt_data += counts[i];
bin_upper_bound_.push_back((distinct_values[i] + distinct_values[i + 1]) / 2); } else if (distinct_values[i] > kMissingValueRange) {
cnt_in_bin.push_back(cur_cnt_inbin); right_cnt_data += counts[i];
cur_cnt_inbin = 0; } else {
} missing_cnt_data += counts[i];
}
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
num_bin_ = static_cast<int>(bin_upper_bound_.size());
} else {
if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_sample_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
} }
double mean_bin_size = static_cast<double>(total_sample_cnt) / max_bin; }
if (zero_cnt > mean_bin_size) {
int non_zero_cnt = num_sample_values; int left_cnt = 0;
max_bin = std::min(max_bin, 1 + static_cast<int>(non_zero_cnt / min_data_in_bin)); for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kMissingValueRange) {
left_cnt = i;
break;
}
}
if (left_cnt > 0) {
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - missing_cnt_data) * (max_bin - 1));
bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
bin_upper_bound_.back() = -kMissingValueRange;
}
int right_start = -1;
for (int i = left_cnt; i < num_distinct_values; ++i) {
if (distinct_values[i] > kMissingValueRange) {
right_start = i;
break;
} }
// mean size for one bin }
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(total_sample_cnt); if (right_start >= 0) {
std::vector<bool> is_big_count_value(num_distinct_values, false); int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound_.size());
auto right_bounds = GreedyFindBin(distinct_values.data() + right_start, counts.data() + right_start,
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
bin_upper_bound_.push_back(kMissingValueRange);
bin_upper_bound_.insert(bin_upper_bound_.end(), right_bounds.begin(), right_bounds.end());
} else {
bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
}
num_bin_ = static_cast<int>(bin_upper_bound_.size());
{
cnt_in_bin.resize(num_bin_, 0);
int i_bin = 0;
for (int i = 0; i < num_distinct_values; ++i) { for (int i = 0; i < num_distinct_values; ++i) {
if (counts[i] >= mean_bin_size) { if (distinct_values[i] > bin_upper_bound_[i_bin]) {
is_big_count_value[i] = true; ++i_bin;
--rest_bin_cnt; }
rest_sample_cnt -= counts[i]; cnt_in_bin[i_bin] += counts[i];
}
}
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
}
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
}
}
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
// update bin upper bound
bin_upper_bound_ = std::vector<double>(bin_cnt);
num_bin_ = bin_cnt;
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
} }
// last bin upper bound
bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
} }
CHECK(num_bin_ <= max_bin); CHECK(num_bin_ <= max_bin);
} else { } else {
......
...@@ -15,6 +15,10 @@ ...@@ -15,6 +15,10 @@
namespace LightGBM { namespace LightGBM {
#ifdef USE_GPU
const int kMaxBinPerGroup = 256;
#endif // USE_GPU
const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n"; const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";
Dataset::Dataset() { Dataset::Dataset() {
...@@ -43,12 +47,180 @@ std::vector<std::vector<int>> NoGroup( ...@@ -43,12 +47,180 @@ std::vector<std::vector<int>> NoGroup(
return features_in_group; return features_in_group;
} }
// Counts how many of the given sample indices are already marked as used.
// Returns the conflict count, or -1 as soon as it would exceed max_cnt
// (early exit so callers can cheaply reject over-conflicting candidates).
int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_indices, int max_cnt) {
int conflicts = 0;
for (int pos = 0; pos < num_indices; ++pos) {
if (!mark[indices[pos]]) { continue; }
++conflicts;
if (conflicts > max_cnt) { return -1; }
}
return conflicts;
}
// Flags every given sample index as occupied in the conflict mark.
void MarkUsed(std::vector<bool>& mark, const int* indices, int num_indices) {
const int* stop = indices + num_indices;
for (const int* it = indices; it != stop; ++it) {
mark[*it] = true;
}
}
// Greedily bundles features into groups such that features in a group rarely
// have non-zero values on the same samples (exclusive feature bundling).
// find_order: feature indices in the order they are considered;
// sample_indices/num_per_col: per-feature non-zero sample indices and counts;
// max_error_cnt: max tolerated conflicting samples per group;
// filter_cnt: features whose effective non-zero count drops below this after
// conflicts are not merged into an existing group.
// Returns the list of feature-index groups.
std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const std::vector<int>& find_order,
int** sample_indices,
const int* num_per_col,
size_t total_sample_cnt,
data_size_t max_error_cnt,
data_size_t filter_cnt,
data_size_t num_data) {
// Only probe a bounded number of candidate groups per feature to keep the
// search cheap when many groups exist.
const int max_search_group = 100;
Random rand(num_data);
std::vector<std::vector<int>> features_in_group;
// Per-group bitmap over samples: true where some grouped feature is non-zero.
std::vector<std::vector<bool>> conflict_marks;
std::vector<int> group_conflict_cnt;
std::vector<size_t> group_non_zero_cnt;
#ifdef USE_GPU
// GPU kernels bound the bins per group, so track each group's bin total.
std::vector<int> group_num_bin;
#endif // USE_GPU
for (auto fidx : find_order) {
const size_t cur_non_zero_cnt = num_per_col[fidx];
bool need_new_group = true;
// Groups that could still absorb this feature within the conflict budget.
std::vector<int> available_groups;
for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
if (group_non_zero_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + max_error_cnt
#ifdef USE_GPU
&& group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0)
<= kMaxBinPerGroup
#endif // USE_GPU
) {
available_groups.push_back(gid);
}
}
// Always try the most recent group, plus a random sample of the others.
std::vector<int> search_groups;
if (!available_groups.empty()) {
int last = static_cast<int>(available_groups.size()) - 1;
auto indices = rand.Sample(last, std::min(last, max_search_group - 1));
search_groups.push_back(available_groups.back());
for (auto idx : indices) {
search_groups.push_back(available_groups[idx]);
}
}
for (auto gid : search_groups) {
const int rest_max_cnt = max_error_cnt - group_conflict_cnt[gid];
// -1 means the conflict count already exceeded the remaining budget.
int cnt = GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
if (cnt >= 0 && cnt <= rest_max_cnt) {
// Scale the sampled non-conflicting count up to the full dataset; skip
// the merge if too few usable values would remain for this feature.
data_size_t rest_non_zero_data = static_cast<data_size_t>(
static_cast<double>(cur_non_zero_cnt - cnt) * num_data / total_sample_cnt);
if (rest_non_zero_data < filter_cnt) { continue; }
need_new_group = false;
features_in_group[gid].push_back(fidx);
group_conflict_cnt[gid] += cnt;
group_non_zero_cnt[gid] += cur_non_zero_cnt - cnt;
MarkUsed(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx]);
#ifdef USE_GPU
group_num_bin[gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
#endif // USE_GPU
break;
}
}
if (need_new_group) {
// No existing group fits; start a fresh one containing just this feature.
features_in_group.emplace_back();
features_in_group.back().push_back(fidx);
group_conflict_cnt.push_back(0);
conflict_marks.emplace_back(total_sample_cnt, false);
MarkUsed(conflict_marks.back(), sample_indices[fidx], num_per_col[fidx]);
group_non_zero_cnt.emplace_back(cur_non_zero_cnt);
#ifdef USE_GPU
group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
#endif // USE_GPU
}
}
return features_in_group;
}
// Produces feature groups for exclusive feature bundling: runs FindGroups in
// two feature orders (original and by descending non-zero count), keeps the
// grouping with fewer groups, breaks up small sparse groups that would not
// speed anything up, then shuffles the final group order.
// Returns the chosen list of feature-index groups.
std::vector<std::vector<int>> FastFeatureBundling(std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
int** sample_indices,
const int* num_per_col,
size_t total_sample_cnt,
const std::vector<int>& used_features,
double max_conflict_rate,
data_size_t num_data,
data_size_t min_data,
double sparse_threshold,
bool is_enable_sparse) {
// filter is based on sampling data, so decrease its range
const data_size_t filter_cnt = static_cast<data_size_t>(static_cast<double>(0.95 * min_data) / num_data * total_sample_cnt);
// Conflict budget per group, as a fraction of the sampled rows.
const data_size_t max_error_cnt = static_cast<data_size_t>(total_sample_cnt * max_conflict_rate);
int cur_used_feature_cnt = 0;
std::vector<size_t> feature_non_zero_cnt;
// put dense feature first
for (auto fidx : used_features) {
feature_non_zero_cnt.emplace_back(num_per_col[fidx]);
++cur_used_feature_cnt;
}
// sort by non zero cnt
std::vector<int> sorted_idx;
for (int i = 0; i < cur_used_feature_cnt; ++i) {
sorted_idx.emplace_back(i);
}
// sort by non zero cnt, bigger first
std::sort(sorted_idx.begin(), sorted_idx.end(),
[&feature_non_zero_cnt](int a, int b) {
return feature_non_zero_cnt[a] > feature_non_zero_cnt[b];
});
std::vector<int> feature_order_by_cnt;
for (auto sidx : sorted_idx) {
feature_order_by_cnt.push_back(used_features[sidx]);
}
// Try both insertion orders and keep whichever yields fewer groups.
auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, num_per_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data);
auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, num_per_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data);
if (features_in_group.size() > group2.size()) {
features_in_group = group2;
}
std::vector<std::vector<int>> ret;
for (size_t i = 0; i < features_in_group.size(); ++i) {
// Singletons and large groups are kept as-is; only small multi-feature
// groups (2-4 features) are candidates for being split back apart.
if (features_in_group[i].size() <= 1 || features_in_group[i].size() >= 5) {
ret.push_back(features_in_group[i]);
} else {
// Estimate the group's combined density from each feature's sparse rate.
int cnt_non_zero = 0;
for (size_t j = 0; j < features_in_group[i].size(); ++j) {
const int fidx = features_in_group[i][j];
cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers[fidx]->sparse_rate()));
}
double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
// take apart small sparse group, due it will not gain on speed
if (sparse_rate >= sparse_threshold && is_enable_sparse) {
for (size_t j = 0; j < features_in_group[i].size(); ++j) {
const int fidx = features_in_group[i][j];
ret.emplace_back();
ret.back().push_back(fidx);
}
} else {
ret.push_back(features_in_group[i]);
}
}
}
// shuffle groups (fixed seed, so the layout is deterministic across runs)
int num_group = static_cast<int>(ret.size());
Random tmp_rand(12);
for (int i = 0; i < num_group - 1; ++i) {
int j = tmp_rand.NextShort(i + 1, num_group);
std::swap(ret[i], ret[j]);
}
return ret;
}
void Dataset::Construct( void Dataset::Construct(
std::vector<std::unique_ptr<BinMapper>>& bin_mappers, std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
int**, int** sample_non_zero_indices,
const int*, const int* num_per_col,
size_t, size_t total_sample_cnt,
const IOConfig& io_config) { const IOConfig& io_config) {
num_total_features_ = static_cast<int>(bin_mappers.size()); num_total_features_ = static_cast<int>(bin_mappers.size());
sparse_threshold_ = io_config.sparse_threshold; sparse_threshold_ = io_config.sparse_threshold;
// get num_features // get num_features
...@@ -61,6 +233,15 @@ void Dataset::Construct( ...@@ -61,6 +233,15 @@ void Dataset::Construct(
auto features_in_group = NoGroup(used_features); auto features_in_group = NoGroup(used_features);
if (io_config.enable_bundle) {
std::chrono::duration<double, std::milli> bundling_time_;
features_in_group = FastFeatureBundling(bin_mappers,
sample_non_zero_indices, num_per_col, total_sample_cnt,
used_features, io_config.max_conflict_rate,
num_data_, io_config.min_data_in_leaf,
sparse_threshold_, io_config.is_enable_sparse);
}
num_features_ = 0; num_features_ = 0;
for (const auto& fs : features_in_group) { for (const auto& fs : features_in_group) {
num_features_ += static_cast<int>(fs.size()); num_features_ += static_cast<int>(fs.size());
...@@ -86,7 +267,8 @@ void Dataset::Construct( ...@@ -86,7 +267,8 @@ void Dataset::Construct(
++cur_fidx; ++cur_fidx;
} }
feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>( feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
new FeatureGroup(cur_cnt_features, cur_bin_mappers, num_data_, sparse_threshold_, io_config.is_enable_sparse))); new FeatureGroup(cur_cnt_features, cur_bin_mappers, num_data_, sparse_threshold_,
io_config.is_enable_sparse)));
} }
feature_groups_.shrink_to_fit(); feature_groups_.shrink_to_fit();
group_bin_boundaries_.clear(); group_bin_boundaries_.clear();
...@@ -116,7 +298,7 @@ void Dataset::Construct( ...@@ -116,7 +298,7 @@ void Dataset::Construct(
void Dataset::FinishLoad() { void Dataset::FinishLoad() {
if (is_finish_load_) { return; } if (is_finish_load_) { return; }
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < num_groups_; ++i) { for (int i = 0; i < num_groups_; ++i) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
feature_groups_[i]->bin_data_->FinishLoad(); feature_groups_[i]->bin_data_->FinishLoad();
...@@ -212,7 +394,7 @@ void Dataset::ReSize(data_size_t num_data) { ...@@ -212,7 +394,7 @@ void Dataset::ReSize(data_size_t num_data) {
if (num_data_ != num_data) { if (num_data_ != num_data) {
num_data_ = num_data; num_data_ = num_data;
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int group = 0; group < num_groups_; ++group) { for (int group = 0; group < num_groups_; ++group) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
feature_groups_[group]->bin_data_->ReSize(num_data_); feature_groups_[group]->bin_data_->ReSize(num_data_);
...@@ -314,7 +496,7 @@ bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const in ...@@ -314,7 +496,7 @@ bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const in
void Dataset::SaveBinaryFile(const char* bin_filename) { void Dataset::SaveBinaryFile(const char* bin_filename) {
if (bin_filename != nullptr if (bin_filename != nullptr
&& std::string(bin_filename) == std::string(data_filename_)) { && std::string(bin_filename) == std::string(data_filename_)) {
Log::Warning("Bianry file %s already existed", bin_filename); Log::Warning("Bianry file %s already existed", bin_filename);
return; return;
} }
...@@ -326,11 +508,11 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { ...@@ -326,11 +508,11 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
} }
bool is_file_existed = false; bool is_file_existed = false;
FILE* file; FILE* file;
#ifdef _MSC_VER #ifdef _MSC_VER
fopen_s(&file, bin_filename, "rb"); fopen_s(&file, bin_filename, "rb");
#else #else
file = fopen(bin_filename, "rb"); file = fopen(bin_filename, "rb");
#endif #endif
if (file != NULL) { if (file != NULL) {
is_file_existed = true; is_file_existed = true;
...@@ -339,11 +521,11 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { ...@@ -339,11 +521,11 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
} }
if (!is_file_existed) { if (!is_file_existed) {
#ifdef _MSC_VER #ifdef _MSC_VER
fopen_s(&file, bin_filename, "wb"); fopen_s(&file, bin_filename, "wb");
#else #else
file = fopen(bin_filename, "wb"); file = fopen(bin_filename, "wb");
#endif #endif
if (file == NULL) { if (file == NULL) {
Log::Fatal("Cannot write binary data to %s ", bin_filename); Log::Fatal("Cannot write binary data to %s ", bin_filename);
} }
......
...@@ -188,29 +188,31 @@ public: ...@@ -188,29 +188,31 @@ public:
} }
virtual data_size_t Split( virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data, uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override { data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
if (num_data <= 0) { return 0; } if (num_data <= 0) { return 0; }
VAL_T th = static_cast<VAL_T>(threshold + min_bin); VAL_T th = static_cast<VAL_T>(threshold + min_bin);
VAL_T minb = static_cast<VAL_T>(min_bin); VAL_T minb = static_cast<VAL_T>(min_bin);
VAL_T maxb = static_cast<VAL_T>(max_bin); VAL_T maxb = static_cast<VAL_T>(max_bin);
VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
if (default_bin == 0) { if (default_bin == 0) {
th -= 1; th -= 1;
t_default_bin -= 1;
} }
data_size_t lte_count = 0; data_size_t lte_count = 0;
data_size_t gt_count = 0; data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices; data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count; data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) { if (bin_type == BinType::NumericalBin) {
if (default_bin <= threshold) { if (default_bin_for_zero <= threshold) {
default_indices = lte_indices; default_indices = lte_indices;
default_count = &lte_count; default_count = &lte_count;
} }
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i]; const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx]; VAL_T bin = data_[idx];
if (bin > maxb || bin < minb) { if ( bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx; default_indices[(*default_count)++] = idx;
} else if (bin > th) { } else if (bin > th) {
gt_indices[gt_count++] = idx; gt_indices[gt_count++] = idx;
...@@ -219,14 +221,14 @@ public: ...@@ -219,14 +221,14 @@ public:
} }
} }
} else { } else {
if (default_bin == threshold) { if (default_bin_for_zero == threshold) {
default_indices = lte_indices; default_indices = lte_indices;
default_count = &lte_count; default_count = &lte_count;
} }
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i]; const data_size_t idx = data_indices[i];
VAL_T bin = data_[idx]; VAL_T bin = data_[idx];
if (bin > maxb || bin < minb) { if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx; default_indices[(*default_count)++] = idx;
} else if (bin != th) { } else if (bin != th) {
gt_indices[gt_count++] = idx; gt_indices[gt_count++] = idx;
......
...@@ -227,29 +227,31 @@ public: ...@@ -227,29 +227,31 @@ public:
} }
virtual data_size_t Split( virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data, uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override { data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
if (num_data <= 0) { return 0; } if (num_data <= 0) { return 0; }
uint8_t th = static_cast<uint8_t>(threshold + min_bin); uint8_t th = static_cast<uint8_t>(threshold + min_bin);
uint8_t minb = static_cast<uint8_t>(min_bin); uint8_t minb = static_cast<uint8_t>(min_bin);
uint8_t maxb = static_cast<uint8_t>(max_bin); uint8_t maxb = static_cast<uint8_t>(max_bin);
uint8_t t_default_bin = static_cast<uint8_t>(min_bin + default_bin);
if (default_bin == 0) { if (default_bin == 0) {
th -= 1; th -= 1;
t_default_bin -= 1;
} }
data_size_t lte_count = 0; data_size_t lte_count = 0;
data_size_t gt_count = 0; data_size_t gt_count = 0;
data_size_t* default_indices = gt_indices; data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count; data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) { if (bin_type == BinType::NumericalBin) {
if (default_bin <= threshold) { if (default_bin_for_zero <= threshold) {
default_indices = lte_indices; default_indices = lte_indices;
default_count = &lte_count; default_count = &lte_count;
} }
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i]; const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin > maxb || bin < minb) { if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx; default_indices[(*default_count)++] = idx;
} else if (bin > th) { } else if (bin > th) {
gt_indices[gt_count++] = idx; gt_indices[gt_count++] = idx;
...@@ -258,14 +260,14 @@ public: ...@@ -258,14 +260,14 @@ public:
} }
} }
} else { } else {
if (default_bin == threshold) { if (default_bin_for_zero == threshold) {
default_indices = lte_indices; default_indices = lte_indices;
default_count = &lte_count; default_count = &lte_count;
} }
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i]; const data_size_t idx = data_indices[i];
const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf; const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
if (bin > maxb || bin < minb) { if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx; default_indices[(*default_count)++] = idx;
} else if (bin != th) { } else if (bin != th) {
gt_indices[gt_count++] = idx; gt_indices[gt_count++] = idx;
......
...@@ -142,7 +142,7 @@ public: ...@@ -142,7 +142,7 @@ public:
} }
virtual data_size_t Split( virtual data_size_t Split(
uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
uint32_t threshold, data_size_t* data_indices, data_size_t num_data, uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override { data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
// not need to split // not need to split
...@@ -150,8 +150,10 @@ public: ...@@ -150,8 +150,10 @@ public:
VAL_T th = static_cast<VAL_T>(threshold + min_bin); VAL_T th = static_cast<VAL_T>(threshold + min_bin);
VAL_T minb = static_cast<VAL_T>(min_bin); VAL_T minb = static_cast<VAL_T>(min_bin);
VAL_T maxb = static_cast<VAL_T>(max_bin); VAL_T maxb = static_cast<VAL_T>(max_bin);
VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
if (default_bin == 0) { if (default_bin == 0) {
th -= 1; th -= 1;
t_default_bin -= 1;
} }
SparseBinIterator<VAL_T> iterator(this, data_indices[0]); SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
data_size_t lte_count = 0; data_size_t lte_count = 0;
...@@ -159,14 +161,14 @@ public: ...@@ -159,14 +161,14 @@ public:
data_size_t* default_indices = gt_indices; data_size_t* default_indices = gt_indices;
data_size_t* default_count = &gt_count; data_size_t* default_count = &gt_count;
if (bin_type == BinType::NumericalBin) { if (bin_type == BinType::NumericalBin) {
if (default_bin <= threshold) { if (default_bin_for_zero <= threshold) {
default_indices = lte_indices; default_indices = lte_indices;
default_count = &lte_count; default_count = &lte_count;
} }
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i]; const data_size_t idx = data_indices[i];
VAL_T bin = iterator.InnerRawGet(idx); VAL_T bin = iterator.InnerRawGet(idx);
if (bin > maxb || bin < minb) { if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx; default_indices[(*default_count)++] = idx;
} else if (bin > th) { } else if (bin > th) {
gt_indices[gt_count++] = idx; gt_indices[gt_count++] = idx;
...@@ -175,14 +177,14 @@ public: ...@@ -175,14 +177,14 @@ public:
} }
} }
} else { } else {
if (default_bin == threshold) { if (default_bin_for_zero == threshold) {
default_indices = lte_indices; default_indices = lte_indices;
default_count = &lte_count; default_count = &lte_count;
} }
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i]; const data_size_t idx = data_indices[i];
VAL_T bin = iterator.InnerRawGet(idx); VAL_T bin = iterator.InnerRawGet(idx);
if (bin > maxb || bin < minb) { if (bin < minb || bin > maxb || t_default_bin == bin) {
default_indices[(*default_count)++] = idx; default_indices[(*default_count)++] = idx;
} else if (bin != th) { } else if (bin != th) {
gt_indices[gt_count++] = idx; gt_indices[gt_count++] = idx;
......
...@@ -24,20 +24,23 @@ Tree::Tree(int max_leaves) ...@@ -24,20 +24,23 @@ Tree::Tree(int max_leaves)
:max_leaves_(max_leaves) { :max_leaves_(max_leaves) {
num_leaves_ = 0; num_leaves_ = 0;
left_child_ = std::vector<int>(max_leaves_ - 1); left_child_.resize(max_leaves_ - 1);
right_child_ = std::vector<int>(max_leaves_ - 1); right_child_.resize(max_leaves_ - 1);
split_feature_inner = std::vector<int>(max_leaves_ - 1); split_feature_inner_.resize(max_leaves_ - 1);
split_feature_ = std::vector<int>(max_leaves_ - 1); split_feature_.resize(max_leaves_ - 1);
threshold_in_bin_ = std::vector<uint32_t>(max_leaves_ - 1); threshold_in_bin_.resize(max_leaves_ - 1);
threshold_ = std::vector<double>(max_leaves_ - 1); threshold_.resize(max_leaves_ - 1);
decision_type_ = std::vector<int8_t>(max_leaves_ - 1); decision_type_.resize(max_leaves_ - 1);
split_gain_ = std::vector<double>(max_leaves_ - 1); default_value_.resize(max_leaves_ - 1);
leaf_parent_ = std::vector<int>(max_leaves_); zero_bin_.resize(max_leaves_ - 1);
leaf_value_ = std::vector<double>(max_leaves_); default_bin_for_zero_.resize(max_leaves_ - 1);
leaf_count_ = std::vector<data_size_t>(max_leaves_); split_gain_.resize(max_leaves_ - 1);
internal_value_ = std::vector<double>(max_leaves_ - 1); leaf_parent_.resize(max_leaves_);
internal_count_ = std::vector<data_size_t>(max_leaves_ - 1); leaf_value_.resize(max_leaves_);
leaf_depth_ = std::vector<int>(max_leaves_); leaf_count_.resize(max_leaves_);
internal_value_.resize(max_leaves_ - 1);
internal_count_.resize(max_leaves_ - 1);
leaf_depth_.resize(max_leaves_);
// root is in the depth 0 // root is in the depth 0
leaf_depth_[0] = 0; leaf_depth_[0] = 0;
num_leaves_ = 1; num_leaves_ = 1;
...@@ -49,9 +52,9 @@ Tree::~Tree() { ...@@ -49,9 +52,9 @@ Tree::~Tree() {
} }
int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature, int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature, double threshold_double,
double threshold_double, double left_value, double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain) { uint32_t zero_bin, uint32_t default_bin_for_zero, double default_value) {
int new_node_idx = num_leaves_ - 1; int new_node_idx = num_leaves_ - 1;
// update parent info // update parent info
int parent = leaf_parent_[leaf]; int parent = leaf_parent_[leaf];
...@@ -64,17 +67,23 @@ int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, ...@@ -64,17 +67,23 @@ int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin,
} }
} }
// add new node // add new node
split_feature_inner[new_node_idx] = feature; split_feature_inner_[new_node_idx] = feature;
split_feature_[new_node_idx] = real_feature; split_feature_[new_node_idx] = real_feature;
zero_bin_[new_node_idx] = zero_bin;
default_bin_for_zero_[new_node_idx] = default_bin_for_zero;
default_value_[new_node_idx] = Common::AvoidInf(default_value);
if (bin_type == BinType::NumericalBin) { if (bin_type == BinType::NumericalBin) {
decision_type_[new_node_idx] = 0; decision_type_[new_node_idx] = 0;
} else { } else {
has_categorical_ = true; has_categorical_ = true;
decision_type_[new_node_idx] = 1; decision_type_[new_node_idx] = 1;
} }
threshold_in_bin_[new_node_idx] = threshold_bin; threshold_in_bin_[new_node_idx] = threshold_bin;
threshold_[new_node_idx] = threshold_double; threshold_[new_node_idx] = threshold_double;
split_gain_[new_node_idx] = gain == std::numeric_limits<double>::infinity() ? std::numeric_limits<double>::max() : gain; split_gain_[new_node_idx] = Common::AvoidInf(gain);
// add two new leaves // add two new leaves
left_child_[new_node_idx] = ~leaf; left_child_[new_node_idx] = ~leaf;
right_child_[new_node_idx] = ~num_leaves_; right_child_[new_node_idx] = ~num_leaves_;
...@@ -104,15 +113,16 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl ...@@ -104,15 +113,16 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
[this, &data, score](int, data_size_t start, data_size_t end) { [this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1); std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) { for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i]; const int fidx = split_feature_inner_[i];
iter[i].reset(data->FeatureIterator(fidx)); iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(start); iter[i]->Reset(start);
} }
for (data_size_t i = start; i < end; ++i) { for (data_size_t i = start; i < end; ++i) {
int node = 0; int node = 0;
while (node >= 0) { while (node >= 0) {
uint32_t fval = DefaultValueForZero(iter[node]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
if (inner_decision_funs[decision_type_[node]]( if (inner_decision_funs[decision_type_[node]](
iter[node]->Get(i), fval,
threshold_in_bin_[node])) { threshold_in_bin_[node])) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
...@@ -133,8 +143,9 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl ...@@ -133,8 +143,9 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
for (data_size_t i = start; i < end; ++i) { for (data_size_t i = start; i < end; ++i) {
int node = 0; int node = 0;
while (node >= 0) { while (node >= 0) {
uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
if (inner_decision_funs[decision_type_[node]]( if (inner_decision_funs[decision_type_[node]](
iter[split_feature_inner[node]]->Get(i), fval,
threshold_in_bin_[node])) { threshold_in_bin_[node])) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
...@@ -151,14 +162,15 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl ...@@ -151,14 +162,15 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
[this, &data, score](int, data_size_t start, data_size_t end) { [this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1); std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) { for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i]; const int fidx = split_feature_inner_[i];
iter[i].reset(data->FeatureIterator(fidx)); iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(start); iter[i]->Reset(start);
} }
for (data_size_t i = start; i < end; ++i) { for (data_size_t i = start; i < end; ++i) {
int node = 0; int node = 0;
while (node >= 0) { while (node >= 0) {
if (iter[node]->Get(i) <= threshold_in_bin_[node]) { uint32_t fval = DefaultValueForZero(iter[node]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
if (fval <= threshold_in_bin_[node]) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
node = right_child_[node]; node = right_child_[node];
...@@ -178,7 +190,8 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl ...@@ -178,7 +190,8 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
for (data_size_t i = start; i < end; ++i) { for (data_size_t i = start; i < end; ++i) {
int node = 0; int node = 0;
while (node >= 0) { while (node >= 0) {
if (iter[split_feature_inner[node]]->Get(i) <= threshold_in_bin_[node]) { uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
if (fval <= threshold_in_bin_[node]) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
node = right_child_[node]; node = right_child_[node];
...@@ -201,7 +214,7 @@ void Tree::AddPredictionToScore(const Dataset* data, ...@@ -201,7 +214,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) { [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1); std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) { for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i]; const int fidx = split_feature_inner_[i];
iter[i].reset(data->FeatureIterator(fidx)); iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(used_data_indices[start]); iter[i]->Reset(used_data_indices[start]);
} }
...@@ -209,8 +222,9 @@ void Tree::AddPredictionToScore(const Dataset* data, ...@@ -209,8 +222,9 @@ void Tree::AddPredictionToScore(const Dataset* data,
int node = 0; int node = 0;
const data_size_t idx = used_data_indices[i]; const data_size_t idx = used_data_indices[i];
while (node >= 0) { while (node >= 0) {
uint32_t fval = DefaultValueForZero(iter[node]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
if (inner_decision_funs[decision_type_[node]]( if (inner_decision_funs[decision_type_[node]](
iter[node]->Get(idx), fval,
threshold_in_bin_[node])) { threshold_in_bin_[node])) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
...@@ -232,8 +246,9 @@ void Tree::AddPredictionToScore(const Dataset* data, ...@@ -232,8 +246,9 @@ void Tree::AddPredictionToScore(const Dataset* data,
const data_size_t idx = used_data_indices[i]; const data_size_t idx = used_data_indices[i];
int node = 0; int node = 0;
while (node >= 0) { while (node >= 0) {
uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
if (inner_decision_funs[decision_type_[node]]( if (inner_decision_funs[decision_type_[node]](
iter[split_feature_inner[node]]->Get(idx), fval,
threshold_in_bin_[node])) { threshold_in_bin_[node])) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
...@@ -250,7 +265,7 @@ void Tree::AddPredictionToScore(const Dataset* data, ...@@ -250,7 +265,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) { [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1); std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) { for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i]; const int fidx = split_feature_inner_[i];
iter[i].reset(data->FeatureIterator(fidx)); iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(used_data_indices[start]); iter[i]->Reset(used_data_indices[start]);
} }
...@@ -258,7 +273,8 @@ void Tree::AddPredictionToScore(const Dataset* data, ...@@ -258,7 +273,8 @@ void Tree::AddPredictionToScore(const Dataset* data,
int node = 0; int node = 0;
const data_size_t idx = used_data_indices[i]; const data_size_t idx = used_data_indices[i];
while (node >= 0) { while (node >= 0) {
if (iter[node]->Get(idx) <= threshold_in_bin_[node]) { uint32_t fval = DefaultValueForZero(iter[node]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
if (fval <= threshold_in_bin_[node]) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
node = right_child_[node]; node = right_child_[node];
...@@ -279,7 +295,8 @@ void Tree::AddPredictionToScore(const Dataset* data, ...@@ -279,7 +295,8 @@ void Tree::AddPredictionToScore(const Dataset* data,
const data_size_t idx = used_data_indices[i]; const data_size_t idx = used_data_indices[i];
int node = 0; int node = 0;
while (node >= 0) { while (node >= 0) {
if (iter[split_feature_inner[node]]->Get(idx) <= threshold_in_bin_[node]) { uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
if (fval <= threshold_in_bin_[node]) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
node = right_child_[node]; node = right_child_[node];
...@@ -303,6 +320,8 @@ std::string Tree::ToString() { ...@@ -303,6 +320,8 @@ std::string Tree::ToString() {
<< Common::ArrayToString<double>(threshold_, num_leaves_ - 1, ' ') << std::endl; << Common::ArrayToString<double>(threshold_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "decision_type=" str_buf << "decision_type="
<< Common::ArrayToString<int>(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1, ' ') << std::endl; << Common::ArrayToString<int>(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1, ' ') << std::endl;
str_buf << "default_value="
<< Common::ArrayToString<double>(default_value_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "left_child=" str_buf << "left_child="
<< Common::ArrayToString<int>(left_child_, num_leaves_ - 1, ' ') << std::endl; << Common::ArrayToString<int>(left_child_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "right_child=" str_buf << "right_child="
...@@ -349,6 +368,7 @@ std::string Tree::NodeToJSON(int index) { ...@@ -349,6 +368,7 @@ std::string Tree::NodeToJSON(int index) {
str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl; str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl; str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl;
str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl; str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl;
str_buf << "\"default_value\":" << default_value_[index] << "," << std::endl;
str_buf << "\"internal_value\":" << internal_value_[index] << "," << std::endl; str_buf << "\"internal_value\":" << internal_value_[index] << "," << std::endl;
str_buf << "\"internal_count\":" << internal_count_[index] << "," << std::endl; str_buf << "\"internal_count\":" << internal_count_[index] << "," << std::endl;
str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl; str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl;
...@@ -389,7 +409,11 @@ std::string Tree::NodeToIfElse(int index, bool is_predict_leaf_index) { ...@@ -389,7 +409,11 @@ std::string Tree::NodeToIfElse(int index, bool is_predict_leaf_index) {
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2); str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
if (index >= 0) { if (index >= 0) {
// non-leaf // non-leaf
str_buf << "if ( arr[" << split_feature_[index] << "] "; std::stringstream tmp_str_buf;
tmp_str_buf << "arr[" << split_feature_[index] << "]";
std::string str_fval = tmp_str_buf.str();
str_buf << "if( ( " << str_fval <<" <= " << kMissingValueRange << " && "<< str_fval << " > -" << kMissingValueRange <<" ? "
<< default_value_[index] << " : " << str_fval << " ) ";
if (decision_type_[index] == 0) { if (decision_type_[index] == 0) {
str_buf << "<"; str_buf << "<";
} else { } else {
...@@ -461,6 +485,12 @@ Tree::Tree(const std::string& str) { ...@@ -461,6 +485,12 @@ Tree::Tree(const std::string& str) {
Log::Fatal("Tree model string format error, should contain threshold field"); Log::Fatal("Tree model string format error, should contain threshold field");
} }
if (key_vals.count("default_value")) {
default_value_ = Common::StringToArray<double>(key_vals["default_value"], ' ', num_leaves_ - 1);
} else {
Log::Fatal("Tree model string format error, should contain default_value field");
}
if (key_vals.count("leaf_value")) { if (key_vals.count("leaf_value")) {
leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_); leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);
} else { } else {
......
...@@ -91,7 +91,7 @@ public: ...@@ -91,7 +91,7 @@ public:
* \param threshold threshold that want to split * \param threshold threshold that want to split
* \param right_leaf index of right leaf * \param right_leaf index of right leaf
*/ */
void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, int right_leaf) { void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, uint32_t default_bin_for_zero, int right_leaf, int expected_left_cnt) {
const data_size_t min_inner_size = 512; const data_size_t min_inner_size = 512;
// get leaf boundary // get leaf boundary
const data_size_t begin = leaf_begin_[leaf]; const data_size_t begin = leaf_begin_[leaf];
...@@ -111,7 +111,7 @@ public: ...@@ -111,7 +111,7 @@ public:
data_size_t cur_cnt = inner_size; data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; } if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
// split data inner, reduce the times of function called // split data inner, reduce the times of function called
data_size_t cur_left_count = dataset->Split(feature, threshold, indices_.data() + begin + cur_start, cur_cnt, data_size_t cur_left_count = dataset->Split(feature, threshold, default_bin_for_zero, indices_.data() + begin + cur_start, cur_cnt,
temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start); temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
offsets_buf_[i] = cur_start; offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count; left_cnts_buf_[i] = cur_left_count;
...@@ -141,6 +141,7 @@ public: ...@@ -141,6 +141,7 @@ public:
} }
// update leaf boundary // update leaf boundary
leaf_count_[leaf] = left_cnt; leaf_count_[leaf] = left_cnt;
CHECK(left_cnt == expected_left_cnt);
leaf_begin_[right_leaf] = left_cnt + begin; leaf_begin_[right_leaf] = left_cnt + begin;
leaf_count_[right_leaf] = cnt - left_cnt; leaf_count_[right_leaf] = cnt - left_cnt;
} }
......
...@@ -15,6 +15,7 @@ class FeatureMetainfo { ...@@ -15,6 +15,7 @@ class FeatureMetainfo {
public: public:
int num_bin; int num_bin;
int bias = 0; int bias = 0;
uint32_t default_bin;
/*! \brief pointer of tree config */ /*! \brief pointer of tree config */
const TreeConfig* tree_config; const TreeConfig* tree_config;
}; };
...@@ -69,81 +70,28 @@ public: ...@@ -69,81 +70,28 @@ public:
void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) { SplitInfo* output) {
output->default_bin_for_zero = meta_->default_bin;
output->gain = kMinScore;
find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output); find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
} }
void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) { SplitInfo* output) {
double best_sum_left_gradient = NAN;
double best_sum_left_hessian = NAN; is_splittable_ = false;
double best_gain = kMinScore;
data_size_t best_left_count = 0;
uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
double sum_right_gradient = 0.0f;
double sum_right_hessian = kEpsilon;
data_size_t right_count = 0;
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian,
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2); meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split; double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
is_splittable_ = false;
const int bias = meta_->bias;
int t = meta_->num_bin - 1 - bias;
const int t_end = 1 - bias;
// from right to left, and we don't need data in bin0
for (; t >= t_end; --t) {
sum_right_gradient += data_[t].sum_gradients;
sum_right_hessian += data_[t].sum_hessians;
right_count += data_[t].cnt;
// if data not enough, or sum hessian too small
if (right_count < meta_->tree_config->min_data_in_leaf
|| sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t left_count = num_data - right_count;
// if data not enough
if (left_count < meta_->tree_config->min_data_in_leaf) break;
double sum_left_hessian = sum_hessian - sum_right_hessian;
// if sum hessian too small
if (sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
double sum_left_gradient = sum_gradient - sum_right_gradient;
// current split gain
double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+ GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;
// mark to is splittable FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, 0);
is_splittable_ = true; // Zero is not in leftmost or rightmost
// better split point if (static_cast<int>(meta_->default_bin) > 0 && static_cast<int>(meta_->default_bin) < meta_->num_bin - 1) {
if (current_gain > best_gain) { FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, meta_->default_bin);
best_left_count = left_count;
best_sum_left_gradient = sum_left_gradient;
best_sum_left_hessian = sum_left_hessian;
// left is <= threshold, right is > threshold. so this is t-1
best_threshold = static_cast<uint32_t>(t - 1 + bias);
best_gain = current_gain;
}
} }
if (is_splittable_) { if (meta_->num_bin > 2) {
// update split information FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, meta_->num_bin - 1);
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
sum_hessian - best_sum_left_hessian,
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
output->gain = best_gain - gain_shift;
} else {
output->gain = kMinScore;
} }
output->gain -= min_gain_shift;
} }
void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data, void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
...@@ -242,10 +190,8 @@ public: ...@@ -242,10 +190,8 @@ public:
output->right_count = num_data - best_left_count; output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
output->gain = best_gain - gain_shift; output->gain = best_gain - min_gain_shift;
} else { }
output->gain = kMinScore;
}
} }
/*! /*!
...@@ -301,6 +247,142 @@ public: ...@@ -301,6 +247,142 @@ public:
private: private:
  /*!
  * \brief Scan the feature histogram in one direction and update *output with the
  *        best feasible numerical split found.
  *        The scan direction is derived from default_bin_for_zero: when the
  *        zero(missing) bin is the last bin the scan runs left-to-right,
  *        otherwise right-to-left, so the zero bin ends up accumulated with the
  *        remainder side rather than being crossed by the threshold.
  * \param sum_gradient Sum of gradients over all data in the current leaf
  * \param sum_hessian Sum of hessians over all data in the current leaf
  * \param num_data Number of data in the current leaf
  * \param min_gain_shift Minimal gain (no-split gain plus min_gain_to_split) a
  *        candidate split must strictly exceed to be considered
  * \param output Best split found so far; overwritten in place only when a
  *        strictly better split is found here
  * \param default_bin_for_zero Bin the zero(missing) values map to for this
  *        scan; recorded into output->default_bin_for_zero on success
  */
  void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data, double min_gain_shift,
    SplitInfo* output, uint32_t default_bin_for_zero) {
    // dir == -1: scan right-to-left; dir == 1: scan left-to-right (only when the
    // zero(missing) bin is the last bin).
    int dir = -1;
    if (static_cast<int>(default_bin_for_zero) == meta_->num_bin - 1) { dir = 1; };
    // Skip the feature's default bin during accumulation only when the
    // zero(missing) bin sits at either edge of the bin range.
    // NOTE(review): the bin skipped below is meta_->default_bin while this
    // condition tests default_bin_for_zero -- confirm the two are meant to
    // coincide in the edge-bin cases.
    bool skip_default_bin = true;
    if (static_cast<int>(default_bin_for_zero) > 0 && static_cast<int>(default_bin_for_zero) < meta_->num_bin - 1) {
      skip_default_bin = false;
    }
    // bias is 1 when the feature's default bin is bin 0 (set at histogram
    // construction); data_[t] then holds bin t + bias.
    const int bias = meta_->bias;
    double best_sum_left_gradient = NAN;
    double best_sum_left_hessian = NAN;
    double best_gain = kMinScore;
    data_size_t best_left_count = 0;
    // num_bin is out of range for thresholds, i.e. "no split found yet".
    uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
    if (dir == -1) {
      // Right-to-left pass: grow the right partition one bin at a time; the
      // hessian accumulator is seeded with kEpsilon to avoid division by zero.
      double sum_right_gradient = 0.0f;
      double sum_right_hessian = kEpsilon;
      data_size_t right_count = 0;
      int t = meta_->num_bin - 1 - bias;
      const int t_end = 1 - bias;
      // from right to left, and we don't need data in bin0
      for (; t >= t_end; --t) {
        // need to skip default bin
        if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; }
        sum_right_gradient += data_[t].sum_gradients;
        sum_right_hessian += data_[t].sum_hessians;
        right_count += data_[t].cnt;
        // if data not enough, or sum hessian too small
        if (right_count < meta_->tree_config->min_data_in_leaf
            || sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
        data_size_t left_count = num_data - right_count;
        // if data not enough -- the left side only shrinks from here, so stop
        if (left_count < meta_->tree_config->min_data_in_leaf) break;

        double sum_left_hessian = sum_hessian - sum_right_hessian;
        // if sum hessian too small -- monotone in scan direction, so stop
        if (sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;

        double sum_left_gradient = sum_gradient - sum_right_gradient;
        // current split gain
        double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
                                               meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
          + GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
                             meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
        // gain with split is worse than without split
        if (current_gain <= min_gain_shift) continue;

        // mark to is splittable
        is_splittable_ = true;
        // better split point
        if (current_gain > best_gain) {
          best_left_count = left_count;
          best_sum_left_gradient = sum_left_gradient;
          best_sum_left_hessian = sum_left_hessian;
          // left is <= threshold, right is > threshold.  so this is t-1
          best_threshold = static_cast<uint32_t>(t - 1 + bias);
          best_gain = current_gain;
        }
      }
    } else {
      // Left-to-right pass: grow the left partition one bin at a time; hessian
      // accumulator seeded with kEpsilon as above.
      double sum_left_gradient = 0.0f;
      double sum_left_hessian = kEpsilon;
      data_size_t left_count = 0;
      int t = 0;
      const int t_end = meta_->num_bin - 2 - bias;
      // from left to right; the last bin never needs to be a left endpoint
      for (; t <= t_end; ++t) {
        // need to skip default bin
        if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; }
        sum_left_gradient += data_[t].sum_gradients;
        sum_left_hessian += data_[t].sum_hessians;
        left_count += data_[t].cnt;
        // if data not enough, or sum hessian too small
        if (left_count < meta_->tree_config->min_data_in_leaf
            || sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
        data_size_t right_count = num_data - left_count;
        // if data not enough -- the right side only shrinks from here, so stop
        if (right_count < meta_->tree_config->min_data_in_leaf) break;

        double sum_right_hessian = sum_hessian - sum_left_hessian;
        // if sum hessian too small -- monotone in scan direction, so stop
        if (sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;

        double sum_right_gradient = sum_gradient - sum_left_gradient;
        // current split gain
        double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian,
                                               meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
          + GetLeafSplitGain(sum_right_gradient, sum_right_hessian,
                             meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
        // gain with split is worse than without split
        if (current_gain <= min_gain_shift) continue;

        // mark to is splittable
        is_splittable_ = true;
        // better split point
        if (current_gain > best_gain) {
          best_left_count = left_count;
          best_sum_left_gradient = sum_left_gradient;
          best_sum_left_hessian = sum_left_hessian;
          best_threshold = static_cast<uint32_t>(t + bias);
          best_gain = current_gain;
        }
      }
    }
    // Commit only when this pass beat whatever was already stored in output.
    if (is_splittable_ && best_gain > output->gain) {
      // update split information
      output->threshold = best_threshold;
      output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
                                                        meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
      output->left_count = best_left_count;
      output->left_sum_gradient = best_sum_left_gradient;
      // Subtract the kEpsilon that was seeded into the running hessian sum.
      // NOTE(review): in the right-to-left pass the epsilon was seeded on the
      // right-hand accumulator -- verify the subtraction side is intended.
      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
      output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
                                                         sum_hessian - best_sum_left_hessian,
                                                         meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
      output->right_count = num_data - best_left_count;
      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
      output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
      output->gain = best_gain;
      output->default_bin_for_zero = default_bin_for_zero;
    }
  }
const FeatureMetainfo* meta_; const FeatureMetainfo* meta_;
/*! \brief sum of gradient of each bin */ /*! \brief sum of gradient of each bin */
HistogramBinEntry* data_; HistogramBinEntry* data_;
...@@ -364,6 +446,7 @@ public: ...@@ -364,6 +446,7 @@ public:
#pragma omp parallel for schedule(static, 512) if(num_feature >= 1024) #pragma omp parallel for schedule(static, 512) if(num_feature >= 1024)
for (int i = 0; i < num_feature; ++i) { for (int i = 0; i < num_feature; ++i) {
feature_metas_[i].num_bin = train_data->FeatureNumBin(i); feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) { if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
feature_metas_[i].bias = 1; feature_metas_[i].bias = 1;
} else { } else {
......
...@@ -543,6 +543,10 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri ...@@ -543,6 +543,10 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature); const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
// left = parent // left = parent
*left_leaf = best_Leaf; *left_leaf = best_Leaf;
double default_value = 0.0f;
if (train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin() != best_split_info.default_bin_for_zero) {
default_value = train_data_->RealThreshold(inner_feature_index, best_split_info.default_bin_for_zero);
}
// split tree, will return right leaf // split tree, will return right leaf
*right_leaf = tree->Split(best_Leaf, *right_leaf = tree->Split(best_Leaf,
inner_feature_index, inner_feature_index,
...@@ -554,10 +558,13 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri ...@@ -554,10 +558,13 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
static_cast<double>(best_split_info.right_output), static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count), static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count), static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.gain)); static_cast<double>(best_split_info.gain),
train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin(),
best_split_info.default_bin_for_zero,
default_value);
// split data partition // split data partition
data_partition_->Split(best_Leaf, train_data_, inner_feature_index, data_partition_->Split(best_Leaf, train_data_, inner_feature_index,
best_split_info.threshold, *right_leaf); best_split_info.threshold, best_split_info.default_bin_for_zero, *right_leaf, best_split_info.left_count);
// init the leaves that used on next iteration // init the leaves that used on next iteration
if (best_split_info.left_count < best_split_info.right_count) { if (best_split_info.left_count < best_split_info.right_count) {
......
...@@ -19,7 +19,9 @@ public: ...@@ -19,7 +19,9 @@ public:
/*! \brief Feature index */ /*! \brief Feature index */
int feature; int feature;
/*! \brief Split threshold */ /*! \brief Split threshold */
unsigned int threshold; uint32_t threshold;
uint32_t default_bin_for_zero;
/*! \brief Left output after split */ /*! \brief Left output after split */
double left_output; double left_output;
/*! \brief Right output after split */ /*! \brief Right output after split */
......
...@@ -67,6 +67,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, b ...@@ -67,6 +67,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, b
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int i = 0; i < train_data->num_features(); ++i) { for (int i = 0; i < train_data->num_features(); ++i) {
feature_metas_[i].num_bin = train_data->FeatureNumBin(i); feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) { if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
feature_metas_[i].bias = 1; feature_metas_[i].bias = 1;
} else { } else {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment