Commit eade219e authored by Qiwei Ye

merge conflict

parents f23e6083 060bd316
......@@ -4,7 +4,6 @@
#include <LightGBM/utils/common.h>
#include <LightGBM/dataset.h>
#include <LightGBM/feature.h>
#include <sstream>
#include <unordered_map>
......@@ -16,11 +15,10 @@
namespace LightGBM {
std::vector<std::function<bool(unsigned int, unsigned int)>> Tree::inner_decision_funs =
{Tree::NumericalDecision<unsigned int>, Tree::CategoricalDecision<unsigned int> };
std::vector<std::function<bool(double, double)>> Tree::decision_funs =
{ Tree::NumericalDecision<double>, Tree::CategoricalDecision<double> };
std::vector<bool(*)(uint32_t, uint32_t)> Tree::inner_decision_funs =
{ Tree::NumericalDecision<uint32_t>, Tree::CategoricalDecision<uint32_t> };
std::vector<bool(*)(double, double)> Tree::decision_funs =
{ Tree::NumericalDecision<double>, Tree::CategoricalDecision<double> };
Tree::Tree(int max_leaves)
:max_leaves_(max_leaves) {
......@@ -28,9 +26,9 @@ Tree::Tree(int max_leaves)
num_leaves_ = 0;
left_child_ = std::vector<int>(max_leaves_ - 1);
right_child_ = std::vector<int>(max_leaves_ - 1);
split_feature_inner = std::vector<int>(max_leaves_ - 1);
split_feature_ = std::vector<int>(max_leaves_ - 1);
split_feature_real_ = std::vector<int>(max_leaves_ - 1);
threshold_in_bin_ = std::vector<unsigned int>(max_leaves_ - 1);
threshold_in_bin_ = std::vector<uint32_t>(max_leaves_ - 1);
threshold_ = std::vector<double>(max_leaves_ - 1);
decision_type_ = std::vector<int8_t>(max_leaves_ - 1);
split_gain_ = std::vector<double>(max_leaves_ - 1);
......@@ -44,12 +42,14 @@ Tree::Tree(int max_leaves)
leaf_depth_[0] = 0;
num_leaves_ = 1;
leaf_parent_[0] = -1;
shrinkage_ = 1.0f;
has_categorical_ = false;
}
Tree::~Tree() {
}
int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_bin, int real_feature,
int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature,
double threshold_double, double left_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain) {
int new_node_idx = num_leaves_ - 1;
......@@ -64,15 +64,16 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_
}
}
// add new node
split_feature_[new_node_idx] = feature;
split_feature_real_[new_node_idx] = real_feature;
threshold_in_bin_[new_node_idx] = threshold_bin;
threshold_[new_node_idx] = threshold_double;
split_feature_inner[new_node_idx] = feature;
split_feature_[new_node_idx] = real_feature;
if (bin_type == BinType::NumericalBin) {
decision_type_[new_node_idx] = 0;
} else {
has_categorical_ = true;
decision_type_[new_node_idx] = 1;
}
threshold_in_bin_[new_node_idx] = threshold_bin;
threshold_[new_node_idx] = threshold_double;
split_gain_[new_node_idx] = gain;
// add two new leaves
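// children store leaf indices as bitwise complements: child >= 0 means an internal node,
// child < 0 means a leaf, and ~child recovers the leaf index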
left_child_[new_node_idx] = ~leaf;
......@@ -96,36 +97,206 @@ int Tree::Split(int leaf, int feature, BinType bin_type, unsigned int threshold_
}
void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const {
Threading::For<data_size_t>(0, num_data, [this, data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iterators(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iterators[i].reset(data->FeatureAt(i)->bin_data()->GetIterator(start));
if (num_leaves_ <= 1) { return; }
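// four specializations below: with/without categorical splits, and one BinIterator per
// internal node (cheaper when the tree has fewer nodes than the dataset has features)
// versus one BinIterator per feature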
if (has_categorical_) {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[node]->Get(i),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[split_feature_inner[node]]->Get(i),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
}
for (data_size_t i = start; i < end; ++i) {
score[i] += static_cast<double>(leaf_value_[GetLeaf(iterators, i)]);
} else {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (iter[node]->Get(i) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, &data, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(start);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
while (node >= 0) {
if (iter[split_feature_inner[node]]->Get(i) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[i] += static_cast<double>(leaf_value_[~node]);
}
});
}
});
}
}
void Tree::AddPredictionToScore(const Dataset* data, const data_size_t* used_data_indices,
data_size_t num_data, double* score) const {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iterators(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iterators[i].reset(data->FeatureAt(i)->bin_data()->GetIterator(used_data_indices[start]));
void Tree::AddPredictionToScore(const Dataset* data,
const data_size_t* used_data_indices,
data_size_t num_data, double* score) const {
if (num_leaves_ <= 1) { return; }
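// same four-way specialization as the full-data overload above, but only the rows
// listed in used_data_indices are scored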
if (has_categorical_) {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
const data_size_t idx = used_data_indices[i];
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[node]->Get(idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
const data_size_t idx = used_data_indices[i];
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iter[split_feature_inner[node]]->Get(idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
}
for (data_size_t i = start; i < end; ++i) {
score[used_data_indices[i]] += static_cast<double>(leaf_value_[GetLeaf(iterators, used_data_indices[i])]);
} else {
if (data->num_features() > num_leaves_ - 1) {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
for (int i = 0; i < num_leaves_ - 1; ++i) {
const int fidx = split_feature_inner[i];
iter[i].reset(data->FeatureIterator(fidx));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
int node = 0;
const data_size_t idx = used_data_indices[i];
while (node >= 0) {
if (iter[node]->Get(idx) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
} else {
Threading::For<data_size_t>(0, num_data,
[this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
std::vector<std::unique_ptr<BinIterator>> iter(data->num_features());
for (int i = 0; i < data->num_features(); ++i) {
iter[i].reset(data->FeatureIterator(i));
iter[i]->Reset(used_data_indices[start]);
}
for (data_size_t i = start; i < end; ++i) {
const data_size_t idx = used_data_indices[i];
int node = 0;
while (node >= 0) {
if (iter[split_feature_inner[node]]->Get(idx) <= threshold_in_bin_[node]) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
score[idx] += static_cast<double>(leaf_value_[~node]);
}
});
}
});
}
}
std::string Tree::ToString() {
std::stringstream str_buf;
str_buf << "num_leaves=" << num_leaves_ << std::endl;
str_buf << "split_feature="
<< Common::ArrayToString<int>(split_feature_real_, num_leaves_ - 1, ' ') << std::endl;
<< Common::ArrayToString<int>(split_feature_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "split_gain="
<< Common::ArrayToString<double>(split_gain_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "threshold="
......@@ -146,6 +317,7 @@ std::string Tree::ToString() {
<< Common::ArrayToString<double>(internal_value_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "internal_count="
<< Common::ArrayToString<data_size_t>(internal_count_, num_leaves_ - 1, ' ') << std::endl;
str_buf << "shrinkage=" << shrinkage_ << std::endl;
str_buf << std::endl;
return str_buf.str();
}
......@@ -154,7 +326,7 @@ std::string Tree::ToJSON() {
std::stringstream str_buf;
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
str_buf << "\"num_leaves\":" << num_leaves_ << "," << std::endl;
str_buf << "\"shrinkage\":" << shrinkage_ << "," << std::endl;
str_buf << "\"tree_structure\":" << NodeToJSON(0) << std::endl;
return str_buf.str();
......@@ -167,7 +339,7 @@ std::string Tree::NodeToJSON(int index) {
// non-leaf
str_buf << "{" << std::endl;
str_buf << "\"split_index\":" << index << "," << std::endl;
str_buf << "\"split_feature\":" << split_feature_real_[index] << "," << std::endl;
str_buf << "\"split_feature\":" << split_feature_[index] << "," << std::endl;
str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl;
str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl;
......@@ -208,7 +380,8 @@ Tree::Tree(const std::string& str) {
|| key_vals.count("left_child") <= 0 || key_vals.count("right_child") <= 0
|| key_vals.count("leaf_parent") <= 0 || key_vals.count("leaf_value") <= 0
|| key_vals.count("internal_value") <= 0 || key_vals.count("internal_count") <= 0
|| key_vals.count("leaf_count") <= 0 || key_vals.count("decision_type") <= 0
|| key_vals.count("leaf_count") <= 0 || key_vals.count("shrinkage") <= 0
|| key_vals.count("decision_type") <= 0
) {
Log::Fatal("Tree model string format error");
}
......@@ -217,17 +390,17 @@ Tree::Tree(const std::string& str) {
left_child_ = Common::StringToArray<int>(key_vals["left_child"], ' ', num_leaves_ - 1);
right_child_ = Common::StringToArray<int>(key_vals["right_child"], ' ', num_leaves_ - 1);
split_feature_real_ = Common::StringToArray<int>(key_vals["split_feature"], ' ', num_leaves_ - 1);
split_feature_ = Common::StringToArray<int>(key_vals["split_feature"], ' ', num_leaves_ - 1);
threshold_ = Common::StringToArray<double>(key_vals["threshold"], ' ', num_leaves_ - 1);
decision_type_ = Common::StringToArray<int8_t>(key_vals["decision_type"], ' ', num_leaves_ - 1);
split_gain_ = Common::StringToArray<double>(key_vals["split_gain"], ' ', num_leaves_ - 1);
internal_count_ = Common::StringToArray<data_size_t>(key_vals["internal_count"], ' ', num_leaves_ - 1);
internal_value_ = Common::StringToArray<double>(key_vals["internal_value"], ' ', num_leaves_ - 1);
decision_type_ = Common::StringToArray<int8_t>(key_vals["decision_type"], ' ', num_leaves_ - 1);
leaf_count_ = Common::StringToArray<data_size_t>(key_vals["leaf_count"], ' ', num_leaves_);
leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_);
leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);
Common::Atof(key_vals["shrinkage"].c_str(), &shrinkage_);
}
} // namespace LightGBM
......@@ -2,22 +2,6 @@
#include <LightGBM/application.h>
int main(int argc, char** argv) {
try {
LightGBM::Application app(argc, argv);
app.Run();
}
catch (const std::exception& ex) {
std::cerr << "Met Exceptions:" << std::endl;
std::cerr << ex.what() << std::endl;
exit(-1);
}
catch (const std::string& ex) {
std::cerr << "Met Exceptions:" << std::endl;
std::cerr << ex << std::endl;
exit(-1);
}
catch (...) {
std::cerr << "Unknown Exceptions" << std::endl;
exit(-1);
}
LightGBM::Application app(argc, argv);
app.Run();
}
......@@ -63,7 +63,7 @@ public:
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// sigmoid transform
double prob = 1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * score[i]));
double prob = 1.0f / (1.0f + std::exp(-sigmoid_ * score[i]));
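// the former 2x factor inside the exponent is now folded into sigmoid_ itself,
// matching the updated binary objective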
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob);
}
......@@ -71,7 +71,7 @@ public:
#pragma omp parallel for schedule(static) reduction(+:sum_loss)
for (data_size_t i = 0; i < num_data_; ++i) {
// sigmoid transform
double prob = 1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * score[i]));
double prob = 1.0f / (1.0f + std::exp(-sigmoid_ * score[i]));
// add loss
sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob) * weights_[i];
}
......@@ -103,7 +103,7 @@ public:
explicit BinaryLoglossMetric(const MetricConfig& config) :BinaryMetric<BinaryLoglossMetric>(config) {}
inline static double LossOnPoint(float label, double prob) {
if (label == 0) {
if (label <= 0) {
if (1.0f - prob > kEpsilon) {
return -std::log(1.0f - prob);
}
......@@ -128,9 +128,9 @@ public:
inline static double LossOnPoint(float label, double prob) {
if (prob <= 0.5f) {
return label;
return label > 0;
} else {
return 1.0f - label;
return label <= 0;
}
}
......@@ -207,8 +207,8 @@ public:
// reset
cur_neg = cur_pos = 0.0f;
}
cur_neg += 1.0f - cur_label;
cur_pos += cur_label;
cur_neg += (cur_label <= 0);
cur_pos += (cur_label > 0);
}
} else { // has weights
for (data_size_t i = 0; i < num_data_; ++i) {
......@@ -224,8 +224,8 @@ public:
// reset
cur_neg = cur_pos = 0.0f;
}
cur_neg += (1.0f - cur_label)*cur_weight;
cur_pos += cur_label*cur_weight;
cur_neg += (cur_label <= 0)*cur_weight;
cur_pos += (cur_label > 0)*cur_weight;
}
}
accum += cur_neg*(cur_pos * 0.5f + sum_pos);
......
#ifndef LIGHTGBM_METRIC_MAP_METRIC_HPP_
#define LIGHTGBM_METRIC_MAP_METRIC_HPP_
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/metric.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <sstream>
#include <vector>
namespace LightGBM {
class MapMetric:public Metric {
public:
explicit MapMetric(const MetricConfig& config) {
// get eval position
for (auto k : config.eval_at) {
eval_at_.push_back(static_cast<data_size_t>(k));
}
// get number of threads
#pragma omp parallel
#pragma omp master
{
num_threads_ = omp_get_num_threads();
}
}
~MapMetric() {
}
void Init(const Metadata& metadata, data_size_t num_data) override {
std::stringstream str_buf;
for (auto k : eval_at_) {
name_.emplace_back(std::string("map@") + std::to_string(k));
}
num_data_ = num_data;
// get label
label_ = metadata.label();
// get query boundaries
query_boundaries_ = metadata.query_boundaries();
if (query_boundaries_ == nullptr) {
Log::Fatal("For MAP metric, there should be query information");
}
num_queries_ = metadata.num_queries();
Log::Info("total groups: %d , total data: %d", num_queries_, num_data_);
// get query weights
query_weights_ = metadata.query_weights();
if (query_weights_ == nullptr) {
sum_query_weights_ = static_cast<double>(num_queries_);
} else {
sum_query_weights_ = 0.0f;
for (data_size_t i = 0; i < num_queries_; ++i) {
sum_query_weights_ += query_weights_[i];
}
}
}
const std::vector<std::string>& GetName() const override {
return name_;
}
double factor_to_bigger_better() const override {
return 1.0f;
}
void CalMapAtK(std::vector<int> ks, const float* label,
const double* score, data_size_t num_data, std::vector<double>* out) const {
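// assumes ks is ascending: the score-sorted list is walked once, each cut-off k
// extending the scan from the previous one while precision is accumulated at every hit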
// get sorted indices by score
std::vector<data_size_t> sorted_idx;
for (data_size_t i = 0; i < num_data; ++i) {
sorted_idx.emplace_back(i);
}
std::sort(sorted_idx.begin(), sorted_idx.end(),
[score](data_size_t a, data_size_t b) {return score[a] > score[b]; });
int num_hit = 0;
double sum_ap = 0.0f;
data_size_t cur_left = 0;
for (size_t i = 0; i < ks.size(); ++i) {
data_size_t cur_k = ks[i];
if (cur_k > num_data) { cur_k = num_data; }
for (data_size_t j = cur_left; j < cur_k; ++j) {
data_size_t idx = sorted_idx[j];
if (label[idx] > 0.5f) {
++num_hit;
sum_ap += static_cast<double>(num_hit) / (j + 1.0f);
}
}
(*out)[i] = sum_ap / cur_k;
cur_left = cur_k;
}
}
std::vector<double> Eval(const double* score) const override {
// per-thread buffers for multi-threaded accumulation
std::vector<std::vector<double>> result_buffer_;
for (int i = 0; i < num_threads_; ++i) {
result_buffer_.emplace_back(eval_at_.size(), 0.0f);
}
std::vector<double> tmp_map(eval_at_.size(), 0.0f);
if (query_weights_ == nullptr) {
#pragma omp parallel for schedule(guided) firstprivate(tmp_map)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
CalMapAtK(eval_at_, label_ + query_boundaries_[i],
score + query_boundaries_[i], query_boundaries_[i + 1] - query_boundaries_[i], &tmp_map);
for (size_t j = 0; j < eval_at_.size(); ++j) {
result_buffer_[tid][j] += tmp_map[j];
}
}
} else {
#pragma omp parallel for schedule(guided) firstprivate(tmp_map)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
CalMapAtK(eval_at_, label_ + query_boundaries_[i],
score + query_boundaries_[i], query_boundaries_[i + 1] - query_boundaries_[i], &tmp_map);
for (size_t j = 0; j < eval_at_.size(); ++j) {
result_buffer_[tid][j] += tmp_map[j] * query_weights_[i];
}
}
}
// Get final average MAP
std::vector<double> result(eval_at_.size(), 0.0f);
for (size_t j = 0; j < result.size(); ++j) {
for (int i = 0; i < num_threads_; ++i) {
result[j] += result_buffer_[i][j];
}
result[j] /= sum_query_weights_;
}
return result;
}
private:
/*! \brief Number of data */
data_size_t num_data_;
/*! \brief Pointer of label */
const float* label_;
/*! \brief Query boundaries information */
const data_size_t* query_boundaries_;
/*! \brief Number of queries */
data_size_t num_queries_;
/*! \brief Weights of queries */
const float* query_weights_;
/*! \brief Sum weights of queries */
double sum_query_weights_;
/*! \brief Evaluation positions for MAP */
std::vector<data_size_t> eval_at_;
/*! \brief Number of threads */
int num_threads_;
std::vector<std::string> name_;
};
} // namespace LightGBM
#endif // LIGHTGBM_METRIC_MAP_METRIC_HPP_
......@@ -2,6 +2,7 @@
#include "regression_metric.hpp"
#include "binary_metric.hpp"
#include "rank_metric.hpp"
#include "map_metric.hpp"
#include "multiclass_metric.hpp"
namespace LightGBM {
......@@ -15,6 +16,8 @@ Metric* Metric::CreateMetric(const std::string& type, const MetricConfig& config
return new HuberLossMetric(config);
} else if (type == std::string("fair")) {
return new FairLossMetric(config);
} else if (type == std::string("poisson")) {
return new PoissonMetric(config);
} else if (type == std::string("binary_logloss")) {
return new BinaryLoglossMetric(config);
} else if (type == std::string("binary_error")) {
......@@ -23,6 +26,8 @@ Metric* Metric::CreateMetric(const std::string& type, const MetricConfig& config
return new AUCMetric(config);
} else if (type == std::string("ndcg")) {
return new NDCGMetric(config);
} else if (type == std::string("map")) {
return new MapMetric(config);
} else if (type == std::string("multi_logloss")) {
return new MultiLoglossMetric(config);
} else if (type == std::string("multi_error")) {
......
......@@ -6,7 +6,7 @@
#include <LightGBM/metric.h>
#include <omp.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <sstream>
#include <vector>
......@@ -90,7 +90,7 @@ public:
}
std::vector<double> tmp_dcg(eval_at_.size(), 0.0f);
if (query_weights_ == nullptr) {
#pragma omp parallel for schedule(guided) firstprivate(tmp_dcg)
#pragma omp parallel for schedule(static) firstprivate(tmp_dcg)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
// if all documents in this query are negative, set its NDCG to 1
......@@ -110,7 +110,7 @@ public:
}
}
} else {
#pragma omp parallel for schedule(guided) firstprivate(tmp_dcg)
#pragma omp parallel for schedule(static) firstprivate(tmp_dcg)
for (data_size_t i = 0; i < num_queries_; ++i) {
const int tid = omp_get_thread_num();
// if all documents in this query are negative, set its NDCG to 1
......
......@@ -162,5 +162,23 @@ public:
}
};
/*! \brief Poisson regression loss for regression task */
class PoissonMetric: public RegressionMetric<PoissonMetric> {
public:
explicit PoissonMetric(const MetricConfig& config) :RegressionMetric<PoissonMetric>(config) {
}
inline static double LossOnPoint(float label, double score, double, double) {
const double eps = 1e-10f;
if (score < eps) {
score = eps;
}
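// negative Poisson log-likelihood with the predicted mean clamped to eps;
// the score-independent log(label!) term is constant and omitted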
return score - label * std::log(score);
}
inline static const char* Name() {
return "poisson";
}
};
} // namespace LightGBM
#endif   // LIGHTGBM_METRIC_REGRESSION_METRIC_HPP_
......@@ -25,7 +25,7 @@ Linkers::Linkers(NetworkConfig config) {
local_listen_port_ = config.local_listen_port;
socket_timeout_ = config.time_out;
rank_ = -1;
// parser clients from file
// parse clients from file
ParseMachineList(config.machine_list_filename.c_str());
if (rank_ == -1) {
......
......@@ -28,14 +28,15 @@ public:
data_size_t cnt_positive = 0;
data_size_t cnt_negative = 0;
// count for positive and negative samples
#pragma omp parallel for schedule(static) reduction(+:cnt_positive, cnt_negative)
for (data_size_t i = 0; i < num_data_; ++i) {
if (label_[i] == 1) {
if (label_[i] > 0) {
++cnt_positive;
} else {
++cnt_negative;
}
}
Log::Info("Number of postive: %d, number of negative: %d", cnt_positive, cnt_negative);
Log::Info("Number of positive: %d, number of negative: %d", cnt_positive, cnt_negative);
// cannot continue if all samples are of the same class
if (cnt_positive == 0 || cnt_negative == 0) {
Log::Fatal("Training data only contains one class");
......@@ -64,25 +65,27 @@ public:
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
// get label and label weights
const int label = label_val_[static_cast<int>(label_[i])];
const double label_weight = label_weights_[static_cast<int>(label_[i])];
const int is_pos = label_[i] > 0;
const int label = label_val_[is_pos];
const double label_weight = label_weights_[is_pos];
// calculate gradients and hessians
const double response = -2.0f * label * sigmoid_ / (1.0f + std::exp(2.0f * label * sigmoid_ * score[i]));
const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i]));
const double abs_response = fabs(response);
gradients[i] = static_cast<score_t>(response * label_weight);
hessians[i] = static_cast<score_t>(abs_response * (2.0f * sigmoid_ - abs_response) * label_weight);
hessians[i] = static_cast<score_t>(abs_response * (sigmoid_ - abs_response) * label_weight);
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
// get label and label weights
const int label = label_val_[static_cast<int>(label_[i])];
const double label_weight = label_weights_[static_cast<int>(label_[i])];
const int is_pos = label_[i] > 0;
const int label = label_val_[is_pos];
const double label_weight = label_weights_[is_pos];
// calculate gradients and hessians
const double response = -2.0f * label * sigmoid_ / (1.0f + std::exp(2.0f * label * sigmoid_ * score[i]));
const double response = -label * sigmoid_ / (1.0f + std::exp(label * sigmoid_ * score[i]));
const double abs_response = fabs(response);
gradients[i] = static_cast<score_t>(response * label_weight * weights_[i]);
hessians[i] = static_cast<score_t>(abs_response * (2.0f * sigmoid_ - abs_response) * label_weight * weights_[i]);
hessians[i] = static_cast<score_t>(abs_response * (sigmoid_ - abs_response) * label_weight * weights_[i]);
}
}
}
......
......@@ -14,6 +14,7 @@ class MulticlassLogloss: public ObjectiveFunction {
public:
explicit MulticlassLogloss(const ObjectiveConfig& config) {
num_class_ = config.num_class;
is_unbalance_ = config.is_unbalance;
}
~MulticlassLogloss() {
......@@ -24,12 +25,25 @@ public:
label_ = metadata.label();
weights_ = metadata.weights();
label_int_.resize(num_data_);
for (int i = 0; i < num_data_; ++i){
label_int_[i] = static_cast<int>(label_[i]);
if (label_int_[i] < 0 || label_int_[i] >= num_class_) {
Log::Fatal("Label must be in [0, %d), but found %d in label", num_class_, label_int_[i]);
}
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_data_; ++i) {
label_int_[i] = static_cast<int>(label_[i]);
if (label_int_[i] < 0 || label_int_[i] >= num_class_) {
Log::Fatal("Label must be in [0, %d), but found %d in label", num_class_, label_int_[i]);
}
}
label_pos_weights_ = std::vector<float>(num_class_, 1);
if (is_unbalance_) {
std::vector<int> cnts(num_class_, 0);
for (int i = 0; i < num_data_; ++i) {
++cnts[label_int_[i]];
}
for (int i = 0; i < num_class_; ++i) {
int cnt_cur = cnts[i];
int cnt_other = (num_data_ - cnts[i]);
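// inverse-frequency weighting: classes with fewer samples receive proportionally larger weights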
label_pos_weights_[i] = static_cast<float>(cnt_other) / cnt_cur;
}
}
}
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override {
......@@ -46,11 +60,12 @@ public:
auto p = rec[k];
size_t idx = static_cast<size_t>(num_data_) * k + i;
if (label_int_[i] == k) {
gradients[idx] = static_cast<score_t>(p - 1.0f);
gradients[idx] = static_cast<score_t>(p - 1.0f) * label_pos_weights_[k];
hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p))* label_pos_weights_[k];
} else {
gradients[idx] = static_cast<score_t>(p);
hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p));
}
hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p));
}
}
} else {
......@@ -66,11 +81,13 @@ public:
auto p = rec[k];
size_t idx = static_cast<size_t>(num_data_) * k + i;
if (label_int_[i] == k) {
gradients[idx] = static_cast<score_t>((p - 1.0f) * weights_[i]);
gradients[idx] = static_cast<score_t>((p - 1.0f) * weights_[i]) * label_pos_weights_[k];
hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p) * weights_[i]) * label_pos_weights_[k];
} else {
gradients[idx] = static_cast<score_t>(p * weights_[i]);
hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p) * weights_[i]);
}
hessians[idx] = static_cast<score_t>(2.0f * p * (1.0f - p) * weights_[i]);
}
}
}
......@@ -91,6 +108,9 @@ private:
std::vector<int> label_int_;
/*! \brief Weights for data */
const float* weights_;
/*! \brief Weights for label */
std::vector<float> label_pos_weights_;
bool is_unbalance_;
};
} // namespace LightGBM
......
......@@ -16,6 +16,8 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
return new RegressionHuberLoss(config);
} else if (type == std::string("fair")) {
return new RegressionFairLoss(config);
} else if (type == std::string("poisson")) {
return new RegressionPoissonLoss(config);
} else if (type == std::string("binary")) {
return new BinaryLogloss(config);
} else if (type == std::string("lambdarank")) {
......
......@@ -52,6 +52,7 @@ public:
num_queries_ = metadata.num_queries();
// cache the inverse max DCG to avoid repeated computation
inverse_max_dcgs_.resize(num_queries_);
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_queries_; ++i) {
inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK(optimize_pos_at_,
label_ + query_boundaries_[i],
......
......@@ -236,5 +236,55 @@ private:
double c_;
};
/*!
* \brief Objective function for Poisson regression
*/
class RegressionPoissonLoss: public ObjectiveFunction {
public:
explicit RegressionPoissonLoss(const ObjectiveConfig& config) {
max_delta_step_ = static_cast<double>(config.poisson_max_delta_step);
}
~RegressionPoissonLoss() {}
void Init(const Metadata& metadata, data_size_t num_data) override {
num_data_ = num_data;
label_ = metadata.label();
weights_ = metadata.weights();
}
void GetGradients(const double* score, score_t* gradients,
score_t* hessians) const override {
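// max_delta_step_ pads the hessian so the leaf output (-G / H) stays bounded,
// safeguarding the optimization when the curvature is tiny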
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = static_cast<score_t>(score[i] - label_[i]);
hessians[i] = static_cast<score_t>(score[i] + max_delta_step_);
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = static_cast<score_t>((score[i] - label_[i]) * weights_[i]);
hessians[i] = static_cast<score_t>((score[i] + max_delta_step_) * weights_[i]);
}
}
}
const char* GetName() const override {
return "poisson";
}
private:
/*! \brief Number of data */
data_size_t num_data_;
/*! \brief Pointer of label */
const float* label_;
/*! \brief Pointer of weights */
const float* weights_;
/*! \brief used to safeguard optimization */
double max_delta_step_;
};
} // namespace LightGBM
#endif  // LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_
......@@ -22,10 +22,7 @@ void DataParallelTreeLearner::Init(const Dataset* train_data) {
rank_ = Network::rank();
num_machines_ = Network::num_machines();
// allocate buffer for communication
size_t buffer_size = 0;
for (int i = 0; i < num_features_; ++i) {
buffer_size += train_data_->FeatureAt(i)->num_bin() * sizeof(HistogramBinEntry);
}
size_t buffer_size = train_data_->NumTotalBin() * sizeof(HistogramBinEntry);
input_buffer_.resize(buffer_size);
output_buffer_.resize(buffer_size);
......@@ -50,13 +47,19 @@ void DataParallelTreeLearner::BeforeTrain() {
// generate feature partition for current tree
std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
std::vector<int> num_bins_distributed(num_machines_, 0);
for (int i = 0; i < train_data_->num_features(); ++i) {
if (is_feature_used_[i]) {
for (int i = 0; i < train_data_->num_total_features(); ++i) {
int inner_feature_index = train_data_->InnerFeatureIndex(i);
if (inner_feature_index == -1) { continue; }
if (is_feature_used_[inner_feature_index]) {
int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
feature_distribution[cur_min_machine].push_back(i);
num_bins_distributed[cur_min_machine] += train_data_->FeatureAt(i)->num_bin();
feature_distribution[cur_min_machine].push_back(inner_feature_index);
auto num_bin = train_data_->FeatureNumBin(inner_feature_index);
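// a default bin of 0 is not materialized in the histogram, so it is excluded
// when budgeting bins per machine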
if (train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin() == 0) {
num_bin -= 1;
}
num_bins_distributed[cur_min_machine] += num_bin;
}
is_feature_aggregated_[i] = false;
is_feature_aggregated_[inner_feature_index] = false;
}
// get local used feature
for (auto fid : feature_distribution[rank_]) {
......@@ -68,7 +71,11 @@ void DataParallelTreeLearner::BeforeTrain() {
for (int i = 0; i < num_machines_; ++i) {
block_len_[i] = 0;
for (auto fid : feature_distribution[i]) {
block_len_[i] += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
auto num_bin = train_data_->FeatureNumBin(fid);
if (train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) {
num_bin -= 1;
}
block_len_[i] += num_bin * sizeof(HistogramBinEntry);
}
reduce_scatter_size_ += block_len_[i];
}
......@@ -83,7 +90,11 @@ void DataParallelTreeLearner::BeforeTrain() {
for (int i = 0; i < num_machines_; ++i) {
for (auto fid : feature_distribution[i]) {
buffer_write_start_pos_[fid] = bin_size;
bin_size += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
auto num_bin = train_data_->FeatureNumBin(fid);
if (train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) {
num_bin -= 1;
}
bin_size += num_bin * sizeof(HistogramBinEntry);
}
}
......@@ -91,12 +102,16 @@ void DataParallelTreeLearner::BeforeTrain() {
bin_size = 0;
for (auto fid : feature_distribution[rank_]) {
buffer_read_start_pos_[fid] = bin_size;
bin_size += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
auto num_bin = train_data_->FeatureNumBin(fid);
if (train_data_->FeatureBinMapper(fid)->GetDefaultBin() == 0) {
num_bin -= 1;
}
bin_size += num_bin * sizeof(HistogramBinEntry);
}
// sync global data sumup info
std::tuple<data_size_t, double, double> data(smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians());
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians());
int size = sizeof(data);
std::memcpy(input_buffer_.data(), &data, size);
// global sumup reduce
......@@ -125,88 +140,88 @@ void DataParallelTreeLearner::BeforeTrain() {
}
void DataParallelTreeLearner::FindBestThresholds() {
train_data_->ConstructHistograms(is_feature_used_,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
smaller_leaf_histogram_array_[0].RawData() - 1);
// construct local histograms
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if ((!is_feature_used_.empty() && is_feature_used_[feature_index] == false)) continue;
// construct histograms for smaller leaf
if (ordered_bins_[feature_index] == nullptr) {
smaller_leaf_histogram_array_[feature_index].Construct(smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
ptr_to_ordered_gradients_smaller_leaf_,
ptr_to_ordered_hessians_smaller_leaf_);
} else {
smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index].get(),
smaller_leaf_splits_->LeafIndex(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
gradients_,
hessians_);
}
// copy to buffer
std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index],
smaller_leaf_histogram_array_[feature_index].HistogramData(),
smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
smaller_leaf_histogram_array_[feature_index].RawData(),
smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
}
// Reduce scatter for histogram
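// each machine owns a disjoint block of features: ReduceScatter sums the per-bin
// entries across machines and leaves every machine with the global histograms for
// its own feature block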
Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, block_start_.data(),
block_len_.data(), output_buffer_.data(), &HistogramBinEntry::SumReducer);
#pragma omp parallel for schedule(guided)
block_len_.data(), output_buffer_.data(), &HistogramBinEntry::SumReducer);
std::vector<SplitInfo> smaller_best(num_threads_, SplitInfo());
std::vector<SplitInfo> larger_best(num_threads_, SplitInfo());
#pragma omp parallel for schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_aggregated_[feature_index]) continue;
// copy global sumup info
smaller_leaf_histogram_array_[feature_index].SetSumup(
GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians());
const int tid = omp_get_thread_num();
// restore global histograms from buffer
smaller_leaf_histogram_array_[feature_index].FromMemory(
output_buffer_.data() + buffer_read_start_pos_[feature_index]);
output_buffer_.data() + buffer_read_start_pos_[feature_index]);
train_data_->FixHistogram(feature_index,
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
smaller_leaf_histogram_array_[feature_index].RawData());
SplitInfo smaller_split;
// find best threshold for smaller child
smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
&smaller_leaf_splits_->BestSplitPerFeature()[feature_index]);
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
&smaller_split);
if (smaller_split.gain > smaller_best[tid].gain) {
smaller_best[tid] = smaller_split;
smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
// only root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
// construct histograms for the larger leaf; since the larger leaf was initialized as the parent, we can simply subtract the smaller leaf's histograms
larger_leaf_histogram_array_[feature_index].Subtract(
smaller_leaf_histogram_array_[feature_index]);
// set sumup info for histogram
larger_leaf_histogram_array_[feature_index].SetSumup(
GetGlobalDataCountInLeaf(larger_leaf_splits_->LeafIndex()),
larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians());
smaller_leaf_histogram_array_[feature_index]);
SplitInfo larger_split;
// find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold(
&larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(),
GetGlobalDataCountInLeaf(larger_leaf_splits_->LeafIndex()),
&larger_split);
if (larger_split.gain > larger_best[tid].gain) {
larger_best[tid] = larger_split;
larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
}
auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
int leaf = smaller_leaf_splits_->LeafIndex();
best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { return; }
leaf = larger_leaf_splits_->LeafIndex();
auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
}
void DataParallelTreeLearner::FindBestSplitsForLeaves() {
int smaller_best_feature = -1, larger_best_feature = -1;
SplitInfo smaller_best, larger_best;
std::vector<double> gains;
// find local best split for smaller leaf
for (size_t i = 0; i < smaller_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(smaller_leaf_splits_->BestSplitPerFeature()[i].gain);
}
smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
smaller_best = smaller_leaf_splits_->BestSplitPerFeature()[smaller_best_feature];
smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()];
// find local best split for larger leaf
if (larger_leaf_splits_->LeafIndex() >= 0) {
gains.clear();
for (size_t i = 0; i < larger_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(larger_leaf_splits_->BestSplitPerFeature()[i].gain);
}
larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
larger_best = larger_leaf_splits_->BestSplitPerFeature()[larger_best_feature];
larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()];
}
// sync global best info
......@@ -214,7 +229,7 @@ void DataParallelTreeLearner::FindBestSplitsForLeaves() {
std::memcpy(input_buffer_.data() + sizeof(SplitInfo), &larger_best, sizeof(SplitInfo));
Network::Allreduce(input_buffer_.data(), sizeof(SplitInfo) * 2, sizeof(SplitInfo),
output_buffer_.data(), &SplitInfo::MaxReducer);
output_buffer_.data(), &SplitInfo::MaxReducer);
std::memcpy(&smaller_best, output_buffer_.data(), sizeof(SplitInfo));
std::memcpy(&larger_best, output_buffer_.data() + sizeof(SplitInfo), sizeof(SplitInfo));
......
......@@ -2,9 +2,9 @@
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#include <LightGBM/meta.h>
#include <LightGBM/feature.h>
#include <LightGBM/dataset.h>
#include <omp.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <cstring>
......@@ -41,7 +41,12 @@ public:
leaf_begin_.resize(num_leaves_);
leaf_count_.resize(num_leaves_);
}
void ResetNumData(int num_data) {
num_data_ = num_data;
indices_.resize(num_data_);
temp_left_indices_.resize(num_data_);
temp_right_indices_.resize(num_data_);
}
~DataPartition() {
}
......@@ -88,7 +93,7 @@ public:
* \param threshold threshold that want to split
* \param right_leaf index of right leaf
*/
void Split(int leaf, const Bin* feature_bins, unsigned int threshold, int right_leaf) {
void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, int right_leaf) {
const data_size_t min_inner_size = 1000;
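// the leaf's index range is processed in chunks of at least min_inner_size rows;
// each thread partitions its chunk into the temp left/right buffers, and the
// recorded per-chunk offsets and counts let the pieces be stitched back in order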
// get leaf boundary
const data_size_t begin = leaf_begin_[leaf];
......@@ -106,7 +111,7 @@ public:
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
// split the data chunk by chunk to reduce the number of function calls
data_size_t cur_left_count = feature_bins->Split(threshold, indices_.data() + begin + cur_start, cur_cnt,
data_size_t cur_left_count = dataset->Split(feature, threshold, indices_.data() + begin + cur_start, cur_cnt,
temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count;
......
......@@ -2,19 +2,31 @@
#define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#include "split_info.hpp"
#include <LightGBM/feature.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/dataset.h>
#include <cstring>
namespace LightGBM {
class FeatureMetainfo {
public:
int num_bin;
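// bias is 1 when the feature's default bin is 0: that bin is not stored, so
// histogram indices shift by one and only (num_bin - bias) entries are kept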
int bias = 0;
/*! \brief pointer of tree config */
const TreeConfig* tree_config;
};
/*!
* \brief FeatureHistogram is used to construct and store a histogram for a feature.
*/
class FeatureHistogram {
public:
FeatureHistogram() {
data_ = nullptr;
}
~FeatureHistogram() {
}
......@@ -28,125 +40,76 @@ public:
* \param data external histogram storage for this feature
* \param meta shared per-feature meta information
* \param bin_type numerical or categorical bin type
*/
void Init(const Feature* feature, int feature_idx, const TreeConfig* tree_config) {
feature_idx_ = feature_idx;
tree_config_ = tree_config;
bin_data_ = feature->bin_data();
num_bins_ = feature->num_bin();
data_.resize(num_bins_);
if (feature->bin_type() == BinType::NumericalBin) {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdForNumerical, this, std::placeholders::_1);
void Init(HistogramBinEntry* data, const FeatureMetainfo* meta, BinType bin_type) {
meta_ = meta;
data_ = data;
if (bin_type == BinType::NumericalBin) {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdNumerical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
} else {
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdForCategorical, this, std::placeholders::_1);
find_best_threshold_fun_ = std::bind(&FeatureHistogram::FindBestThresholdCategorical, this, std::placeholders::_1
, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
}
}
/*!
* \brief Construct a histogram
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hessians of current leaf
* \param ordered_gradients Ordered gradients
* \param ordered_hessians Ordered hessians
* \param data_indices data indices of current leaf
*/
void Construct(const data_size_t* data_indices, data_size_t num_data, double sum_gradients,
double sum_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians) {
std::memset(data_.data(), 0, sizeof(HistogramBinEntry)* num_bins_);
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
bin_data_->ConstructHistogram(data_indices, num_data, ordered_gradients, ordered_hessians, data_.data());
}
/*!
* \brief Construct a histogram by ordered bin
* \param leaf current leaf
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hessians of current leaf
* \param gradients
* \param hessians
*/
void Construct(const OrderedBin* ordered_bin, int leaf, data_size_t num_data, double sum_gradients,
double sum_hessians, const score_t* gradients, const score_t* hessians) {
std::memset(data_.data(), 0, sizeof(HistogramBinEntry)* num_bins_);
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
ordered_bin->ConstructHistogram(leaf, gradients, hessians, data_.data());
HistogramBinEntry* RawData() {
return data_;
}
/*!
* \brief Set sumup information for current histogram
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hessians of current leaf
*/
void SetSumup(data_size_t num_data, double sum_gradients, double sum_hessians) {
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
}
/*!
* \brief Subtract another histogram from this one
* \param other The histogram to subtract
*/
void Subtract(const FeatureHistogram& other) {
num_data_ -= other.num_data_;
sum_gradients_ -= other.sum_gradients_;
sum_hessians_ -= other.sum_hessians_;
for (unsigned int i = 0; i < num_bins_; ++i) {
for (int i = 0; i < meta_->num_bin - meta_->bias; ++i) {
data_[i].cnt -= other.data_[i].cnt;
data_[i].sum_gradients -= other.data_[i].sum_gradients;
data_[i].sum_hessians -= other.data_[i].sum_hessians;
}
}
/*!
* \brief Find best threshold for this histogram
* \param sum_gradient sum of gradients in current leaf
* \param sum_hessian sum of hessians in current leaf
* \param num_data number of data in current leaf
* \param output The best split result
*/
void FindBestThreshold(SplitInfo* output) {
find_best_threshold_fun_(output);
void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
}
void FindBestThresholdForNumerical(SplitInfo* output) {
void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
double best_sum_left_gradient = NAN;
double best_sum_left_hessian = NAN;
double best_gain = kMinScore;
data_size_t best_left_count = 0;
unsigned int best_threshold = static_cast<unsigned int>(num_bins_);
uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
double sum_right_gradient = 0.0f;
double sum_right_hessian = kEpsilon;
data_size_t right_count = 0;
double gain_shift = GetLeafSplitGain(sum_gradients_, sum_hessians_);
double min_gain_shift = gain_shift + tree_config_->min_gain_to_split;
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
is_splittable_ = false;
const int bias = meta_->bias;
int t = meta_->num_bin - 1 - bias;
const int t_end = 1 - bias;
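// accumulate the right-side sums while scanning thresholds from the largest bin
// down; the left side follows by subtracting from the leaf totals, so one pass
// evaluates every candidate split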
// from right to left, and we don't need data in bin0
for (unsigned int t = num_bins_ - 1; t > 0; --t) {
for (; t >= t_end; --t) {
sum_right_gradient += data_[t].sum_gradients;
sum_right_hessian += data_[t].sum_hessians;
right_count += data_[t].cnt;
// if data not enough, or sum hessian too small
if (right_count < tree_config_->min_data_in_leaf
|| sum_right_hessian < tree_config_->min_sum_hessian_in_leaf) continue;
data_size_t left_count = num_data_ - right_count;
if (right_count < meta_->tree_config->min_data_in_leaf
|| sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t left_count = num_data - right_count;
// if data not enough
if (left_count < tree_config_->min_data_in_leaf) break;
if (left_count < meta_->tree_config->min_data_in_leaf) break;
double sum_left_hessian = sum_hessians_ - sum_right_hessian;
double sum_left_hessian = sum_hessian - sum_right_hessian;
// if sum hessian too small
if (sum_left_hessian < tree_config_->min_sum_hessian_in_leaf) break;
if (sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
double sum_left_gradient = sum_gradients_ - sum_right_gradient;
double sum_left_gradient = sum_gradient - sum_right_gradient;
// current split gain
double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian)
+ GetLeafSplitGain(sum_right_gradient, sum_right_hessian);
// gain with split is worse than without split
if (current_gain < min_gain_shift) continue;
if (current_gain <= min_gain_shift) continue;
// mark to is splittable
is_splittable_ = true;
......@@ -156,91 +119,119 @@ public:
best_sum_left_gradient = sum_left_gradient;
best_sum_left_hessian = sum_left_hessian;
// left is <= threshold, right is > threshold. so this is t-1
best_threshold = t - 1;
best_threshold = static_cast<uint32_t>(t - 1 + bias);
best_gain = current_gain;
}
}
if (is_splittable_) {
// update split information
output->feature = feature_idx_;
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian;
output->right_output = CalculateSplittedLeafOutput(sum_gradients_ - best_sum_left_gradient,
sum_hessians_ - best_sum_left_hessian);
output->right_count = num_data_ - best_left_count;
output->right_sum_gradient = sum_gradients_ - best_sum_left_gradient;
output->right_sum_hessian = sum_hessians_ - best_sum_left_hessian;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
sum_hessian - best_sum_left_hessian);
output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
output->gain = best_gain - gain_shift;
} else {
output->feature = feature_idx_;
output->gain = kMinScore;
}
}
/*!
* \brief Find best threshold for this histogram (categorical feature)
* \param sum_gradient sum of gradients in current leaf
* \param sum_hessian sum of hessians in current leaf
* \param num_data number of data in current leaf
* \param output The best split result
*/
void FindBestThresholdForCategorical(SplitInfo* output) {
void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
SplitInfo* output) {
double best_gain = kMinScore;
unsigned int best_threshold = static_cast<unsigned int>(num_bins_);
double gain_shift = GetLeafSplitGain(sum_gradients_, sum_hessians_);
double min_gain_shift = gain_shift + tree_config_->min_gain_to_split;
uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
data_size_t best_left_count = 0;
double best_sum_left_gradient = 0.0f;
double best_sum_left_hessian = 0.0f;
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
is_splittable_ = false;
for (int t = num_bins_ - 1; t >= 0; --t) {
double sum_current_gradient = data_[t].sum_gradients;
double sum_current_hessian = data_[t].sum_hessians;
data_size_t current_count = data_[t].cnt;
const int bias = meta_->bias;
int t = meta_->num_bin - 1 - bias;
const int t_end = 0;
// from right to left, and we don't need data in bin0
for (; t >= t_end; --t) {
// if data not enough, or sum hessian too small
if (current_count < tree_config_->min_data_in_leaf
|| sum_current_hessian < tree_config_->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data_ - current_count;
if (data_[t].cnt < meta_->tree_config->min_data_in_leaf
|| data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data - data_[t].cnt;
// if data not enough
if (other_count < tree_config_->min_data_in_leaf) continue;
if (other_count < meta_->tree_config->min_data_in_leaf) continue;
double sum_other_hessian = sum_hessians_ - sum_current_hessian;
double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon;
// if sum hessian too small
if (sum_other_hessian < tree_config_->min_sum_hessian_in_leaf) continue;
if (sum_other_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
double sum_other_gradient = sum_gradients_ - sum_current_gradient;
double sum_other_gradient = sum_gradient - data_[t].sum_gradients;
// current split gain
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+ GetLeafSplitGain(sum_current_gradient, sum_current_hessian);
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+ GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon);
// gain with split is worse than without split
if (current_gain < min_gain_shift) continue;
if (current_gain <= min_gain_shift) continue;
// mark to is splittable
is_splittable_ = true;
// better split point
if (current_gain > best_gain) {
best_threshold = static_cast<unsigned int>(t);
best_threshold = static_cast<uint32_t>(t + bias);
best_sum_left_gradient = data_[t].sum_gradients;
best_sum_left_hessian = data_[t].sum_hessians + kEpsilon;
best_left_count = data_[t].cnt;
best_gain = current_gain;
}
}
// update split information
// need to restore the zero bin
if (bias == 1) {
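// the default bin 0 is not stored, so its statistics are recovered by subtracting
// every stored bin from the leaf totals, then evaluated as its own one-vs-rest split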
t = meta_->num_bin - 1 - bias;
double sum_bin0_gradient = sum_gradient;
double sum_bin0_hessian = sum_hessian - 2 * kEpsilon;
data_size_t cnt_bin0 = num_data;
for (; t >= 0; --t) {
sum_bin0_gradient -= data_[t].sum_gradients;
sum_bin0_hessian -= data_[t].sum_hessians;
cnt_bin0 -= data_[t].cnt;
}
data_size_t other_count = num_data - cnt_bin0;
double sum_other_hessian = sum_hessian - sum_bin0_hessian - kEpsilon;
if (cnt_bin0 >= meta_->tree_config->min_data_in_leaf
&& sum_bin0_hessian >= meta_->tree_config->min_sum_hessian_in_leaf
&& other_count >= meta_->tree_config->min_data_in_leaf
&& sum_other_hessian >= meta_->tree_config->min_sum_hessian_in_leaf) {
double sum_other_gradient = sum_gradient - sum_bin0_gradient;
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
+ GetLeafSplitGain(sum_bin0_gradient, sum_bin0_hessian + kEpsilon);
if (current_gain > min_gain_shift) {
is_splittable_ = true;
// better split point
if (current_gain > best_gain) {
best_threshold = static_cast<uint32_t>(0);
best_sum_left_gradient = sum_bin0_gradient;
best_sum_left_hessian = sum_bin0_hessian + kEpsilon;
best_left_count = cnt_bin0;
best_gain = current_gain;
}
}
}
}
if (is_splittable_) {
output->feature = feature_idx_;
// update split information
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(data_[best_threshold].sum_gradients,
data_[best_threshold].sum_hessians);
output->left_count = data_[best_threshold].cnt;
output->left_sum_gradient = data_[best_threshold].sum_gradients;
output->left_sum_hessian = data_[best_threshold].sum_hessians;
output->right_output = CalculateSplittedLeafOutput(sum_gradients_ - data_[best_threshold].sum_gradients,
sum_hessians_ - data_[best_threshold].sum_hessians);
output->right_count = num_data_ - data_[best_threshold].cnt;
output->right_sum_gradient = sum_gradients_ - data_[best_threshold].sum_gradients;
output->right_sum_hessian = sum_hessians_ - data_[best_threshold].sum_hessians;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient,
sum_hessian - best_sum_left_hessian);
output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
output->gain = best_gain - gain_shift;
} else {
output->feature = feature_idx_;
output->gain = kMinScore;
}
}
......@@ -249,21 +240,14 @@ public:
* \brief Binary size of this histogram
*/
int SizeOfHistgram() const {
return num_bins_ * sizeof(HistogramBinEntry);
}
/*!
* \brief Memory pointer to histogram data
*/
const HistogramBinEntry* HistogramData() const {
return data_.data();
return (meta_->num_bin - meta_->bias) * sizeof(HistogramBinEntry);
}
/*!
* \brief Restore histogram from memory
*/
void FromMemory(char* memory_data) {
std::memcpy(data_, memory_data, (meta_->num_bin - meta_->bias) * sizeof(HistogramBinEntry));
}
/*!
......@@ -276,10 +260,6 @@ public:
*/
void set_is_splittable(bool val) { is_splittable_ = val; }
private:
/*!
* \brief Calculate the split gain based on regularized sum_gradients and sum_hessians
......@@ -289,12 +269,10 @@ private:
*/
double GetLeafSplitGain(double sum_gradients, double sum_hessians) const {
double abs_sum_gradients = std::fabs(sum_gradients);
double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - meta_->tree_config->lambda_l1);
return (reg_abs_sum_gradients * reg_abs_sum_gradients)
/ (sum_hessians + meta_->tree_config->lambda_l2);
}
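// The gain above is the L1/L2-regularized second-order score:
//   gain(G, H) = max(0, |G| - lambda_l1)^2 / (H + lambda_l2).
// Worked example (hypothetical numbers): G = -4.0, H = 3.0, lambda_l1 = 1.0,
// lambda_l2 = 2.0 gives max(0, 4.0 - 1.0)^2 / (3.0 + 2.0) = 9.0 / 5.0 = 1.8.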
/*!
......@@ -305,35 +283,19 @@ private:
*/
double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians) const {
double abs_sum_gradients = std::fabs(sum_gradients);
double reg_abs_sum_gradients = std::max(0.0, abs_sum_gradients - meta_->tree_config->lambda_l1);
return -std::copysign(reg_abs_sum_gradients, sum_gradients)
/ (sum_hessians + meta_->tree_config->lambda_l2);
}
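// The leaf output is the matching regularized Newton step:
//   output(G, H) = -sign(G) * max(0, |G| - lambda_l1) / (H + lambda_l2).
// Continuing the example above, G = -4.0 and H = 3.0 give
// output = -(-3.0) / 5.0 = 0.6, i.e. the leaf moves against the gradient.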
int feature_idx_;
const FeatureMetainfo* meta_;
/*! \brief sum of gradient of each bin */
HistogramBinEntry* data_;
//std::vector<HistogramBinEntry> data_;
/*! \brief False if this histogram cannot split */
bool is_splittable_ = true;
/*! \brief function used to find the best threshold */
std::function<void(double, double, data_size_t, SplitInfo*)> find_best_threshold_fun_;
};
class HistogramPool {
public:
/*!
......@@ -343,7 +305,6 @@ public:
cache_size_ = 0;
total_size_ = 0;
}
/*!
* \brief Destructor
*/
......@@ -370,7 +331,6 @@ public:
ResetMap();
}
}
/*!
* \brief Reset mapper
*/
......@@ -383,34 +343,48 @@ public:
}
}
void DynamicChangeSize(const Dataset* train_data, const TreeConfig* tree_config, int cache_size, int total_size) {
if (feature_metas_.empty()) {
feature_metas_.resize(train_data->num_features());
#pragma omp parallel for schedule(static)
for (int i = 0; i < train_data->num_features(); ++i) {
feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
feature_metas_[i].bias = 1;
} else {
feature_metas_[i].bias = 0;
}
feature_metas_[i].tree_config = tree_config;
}
}
uint64_t num_total_bin = train_data->NumTotalBin();
Log::Info("Total Bins %d", static_cast<int>(num_total_bin));
int old_cache_size = cache_size_;
Reset(cache_size, total_size);
pool_.resize(cache_size);
data_.resize(cache_size);
#pragma omp parallel for schedule(static)
for (int i = old_cache_size; i < cache_size_; ++i) {
pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
data_[i].resize(num_total_bin);
uint64_t offset = 0;
for (int j = 0; j < train_data->num_features(); ++j) {
offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
pool_[i][j].Init(data_[i].data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
auto num_bin = train_data->FeatureNumBin(j);
if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) {
num_bin -= 1;
}
offset += static_cast<uint64_t>(num_bin);
}
CHECK(offset == num_total_bin);
}
}
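// Layout sketch for the loop above (assuming SubFeatureBinOffset(j) returns
// the per-feature bias): each cache slot owns one contiguous buffer of
// num_total_bin entries, and histogram j views a slice of it. With three
// hypothetical features of 10, 256 and 32 bins, all with default bin 0, the
// slices hold 9, 255 and 31 entries at offsets 1, 11 and 267, and the final
// offset is 10 + 256 + 32 = 298 == NumTotalBin(), as the CHECK asserts.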
void ResetConfig(const TreeConfig* tree_config) {
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(feature_metas_.size()); ++i) {
feature_metas_[i].tree_config = tree_config;
}
}
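// Because every pooled FeatureHistogram keeps a pointer into feature_metas_,
// swapping the tree_config pointer here reconfigures all cached histograms
// at once; no per-slot traversal is needed.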
/*!
......@@ -468,9 +442,9 @@ public:
inverse_mapper_[slot] = dst_idx;
}
private:
std::vector<std::unique_ptr<FeatureHistogram[]>> pool_;
std::vector<std::vector<HistogramBinEntry>> data_;
std::vector<FeatureMetainfo> feature_metas_;
int cache_size_;
int total_size_;
bool is_enough_ = false;
......@@ -480,7 +454,5 @@ private:
int cur_time_ = 0;
};
} // namespace LightGBM
#endif // LightGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
......@@ -28,12 +28,14 @@ void FeatureParallelTreeLearner::BeforeTrain() {
// get feature partition
std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
std::vector<int> num_bins_distributed(num_machines_, 0);
for (int i = 0; i < train_data_->num_total_features(); ++i) {
int inner_feature_index = train_data_->InnerFeatureIndex(i);
if (inner_feature_index == -1) { continue; }
if (is_feature_used_[inner_feature_index]) {
int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
feature_distribution[cur_min_machine].push_back(inner_feature_index);
num_bins_distributed[cur_min_machine] += train_data_->FeatureNumBin(inner_feature_index);
is_feature_used_[inner_feature_index] = false;
}
}
// get local used features
......@@ -43,23 +45,12 @@ void FeatureParallelTreeLearner::BeforeTrain() {
}
void FeatureParallelTreeLearner::FindBestSplitsForLeaves() {
SplitInfo smaller_best, larger_best;
smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()];
// find local best split for larger leaf
if (larger_leaf_splits_->LeafIndex() >= 0) {
larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()];
}
// sync global best info
std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo));
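// SplitInfo is copied as a plain byte blob into input_buffer_ so the local
// best split can be synchronized with the other machines in one fixed-size
// network exchange.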
......
......@@ -3,7 +3,6 @@
#include <LightGBM/meta.h>
#include "data_partition.hpp"
#include "split_info.hpp"
#include <vector>
......@@ -14,13 +13,13 @@ namespace LightGBM {
*/
class LeafSplits {
public:
LeafSplits(data_size_t num_data)
:num_data_in_leaf_(num_data), num_data_(num_data),
data_indices_(nullptr) {
}
void ResetNumData(data_size_t num_data) {
num_data_ = num_data;
num_data_in_leaf_ = num_data;
}
~LeafSplits() {
}
......@@ -38,9 +37,6 @@ public:
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians;
}
/*!
......@@ -61,9 +57,6 @@ public:
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
}
/*!
......@@ -86,9 +79,6 @@ public:
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
}
......@@ -101,9 +91,6 @@ public:
leaf_index_ = 0;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians;
}
/*!
......@@ -111,13 +98,10 @@ public:
*/
void Init() {
leaf_index_ = -1;
data_indices_ = nullptr;
num_data_in_leaf_ = 0;
}
/*! \brief Get current leaf index */
int LeafIndex() const { return leaf_index_; }
......@@ -136,16 +120,12 @@ public:
private:
/*! \brief current leaf index */
int leaf_index_;
/*! \brief number of data on current leaf */
data_size_t num_data_in_leaf_;
/*! \brief number of all training data */
data_size_t num_data_;
/*! \brief sum of gradients of current leaf */
double sum_gradients_;
/*! \brief sum of hessians of current leaf */
......
......@@ -22,7 +22,7 @@ class FeatureParallelTreeLearner: public SerialTreeLearner {
public:
explicit FeatureParallelTreeLearner(const TreeConfig* tree_config);
~FeatureParallelTreeLearner();
void Init(const Dataset* train_data) override;
protected:
void BeforeTrain() override;
......@@ -170,6 +170,10 @@ private:
std::unique_ptr<FeatureHistogram[]> smaller_leaf_histogram_array_global_;
/*! \brief Store global histogram for larger leaf */
std::unique_ptr<FeatureHistogram[]> larger_leaf_histogram_array_global_;
std::vector<HistogramBinEntry> smaller_leaf_histogram_data_;
std::vector<HistogramBinEntry> larger_leaf_histogram_data_;
std::vector<FeatureMetainfo> feature_metas_;
};
} // namespace LightGBM
......
......@@ -7,13 +7,34 @@
namespace LightGBM {
#ifdef TIMETAG
std::chrono::duration<double, std::milli> init_train_time;
std::chrono::duration<double, std::milli> init_split_time;
std::chrono::duration<double, std::milli> hist_time;
std::chrono::duration<double, std::milli> find_split_time;
std::chrono::duration<double, std::milli> split_time;
std::chrono::duration<double, std::milli> ordered_bin_time;
#endif // TIMETAG
SerialTreeLearner::SerialTreeLearner(const TreeConfig* tree_config)
:tree_config_(tree_config) {
random_ = Random(tree_config_->feature_fraction_seed);
#pragma omp parallel
#pragma omp master
{
num_threads_ = omp_get_num_threads();
}
}
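// The parallel/master pair above is the standard OpenMP idiom for caching the
// team size once: omp_get_num_threads() returns 1 outside a parallel region,
// so it has to be queried from inside one. Minimal equivalent sketch:
//   int n = 1;
//   #pragma omp parallel
//   #pragma omp master
//   { n = omp_get_num_threads(); }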
SerialTreeLearner::~SerialTreeLearner() {
#ifdef TIMETAG
Log::Info("SerialTreeLearner::init_train costs %f", init_train_time * 1e-3);
Log::Info("SerialTreeLearner::init_split costs %f", init_split_time * 1e-3);
Log::Info("SerialTreeLearner::hist_build costs %f", hist_time * 1e-3);
Log::Info("SerialTreeLearner::find_split costs %f", find_split_time * 1e-3);
Log::Info("SerialTreeLearner::split costs %f", split_time * 1e-3);
Log::Info("SerialTreeLearner::ordered_bin costs %f", ordered_bin_time * 1e-3);
#endif
}
void SerialTreeLearner::Init(const Dataset* train_data) {
......@@ -27,49 +48,74 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
} else {
size_t total_histogram_size = 0;
for (int i = 0; i < train_data_->num_features(); ++i) {
total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i);
}
max_cache_size = static_cast<int>(tree_config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
}
// at least need 2 leaves
max_cache_size = std::max(2, max_cache_size);
max_cache_size = std::min(max_cache_size, tree_config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, tree_config_, max_cache_size, tree_config_->num_leaves);
// push split information for all leaves
best_split_per_leaf_.resize(tree_config_->num_leaves);
// get ordered bin
train_data_->CreateOrderedBins(&ordered_bins_);
// check whether any ordered bin exists
for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
if (ordered_bins_[i] != nullptr) {
has_ordered_bin_ = true;
break;
}
}
// initialize splits for leaf
smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
larger_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
// initialize data partition
data_partition_.reset(new DataPartition(num_data_, tree_config_->num_leaves));
is_feature_used_.resize(num_features_);
// initialize ordered gradients and hessians
ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_);
// if there are ordered bins, allocate a buffer for fast splitting
if (has_ordered_bin_) {
is_data_in_leaf_.resize(num_data_);
std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), 0);
ordered_bin_indices_.clear();
for (int i = 0; i < static_cast<int>(ordered_bins_.size()); i++) {
if (ordered_bins_[i] != nullptr) {
ordered_bin_indices_.push_back(i);
}
}
}
Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_);
}
void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
train_data_ = train_data;
num_data_ = train_data_->num_data();
num_features_ = train_data_->num_features();
// get ordered bin
train_data_->CreateOrderedBins(&ordered_bins_);
has_ordered_bin_ = false;
// check whether any ordered bin exists
for (int i = 0; i < static_cast<int>(ordered_bins_.size()); ++i) {
if (ordered_bins_[i] != nullptr) {
has_ordered_bin_ = true;
break;
}
}
// initialize splits for leaf
smaller_leaf_splits_->ResetNumData(num_data_);
larger_leaf_splits_->ResetNumData(num_data_);
// initialize data partition
data_partition_->ResetNumData(num_data_);
is_feature_used_.resize(num_features_);
......@@ -79,11 +125,16 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
// if there are ordered bins, allocate a buffer for fast splitting
if (has_ordered_bin_) {
is_data_in_leaf_.resize(num_data_);
std::fill(is_data_in_leaf_.begin(), is_data_in_leaf_.end(), 0);
ordered_bin_indices_.clear();
for (int i = 0; i < static_cast<int>(ordered_bins_.size()); i++) {
if (ordered_bins_[i] != nullptr) {
ordered_bin_indices_.push_back(i);
}
}
}
Log::Info("Number of data: %d, number of features: %d", num_data_, num_features_);
}
void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) {
if (tree_config_->num_leaves != tree_config->num_leaves) {
tree_config_ = tree_config;
......@@ -94,14 +145,14 @@ void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) {
} else {
size_t total_histogram_size = 0;
for (int i = 0; i < train_data_->num_features(); ++i) {
total_histogram_size += sizeof(HistogramBinEntry) * train_data_->FeatureNumBin(i);
}
max_cache_size = static_cast<int>(tree_config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
}
// at least need 2 leaves
max_cache_size = std::max(2, max_cache_size);
max_cache_size = std::min(max_cache_size, tree_config_->num_leaves);
histogram_pool_.DynamicChangeSize(train_data_, tree_config_, max_cache_size, tree_config_->num_leaves);
// push split information for all leaves
best_split_per_leaf_.resize(tree_config_->num_leaves);
......@@ -110,24 +161,40 @@ void SerialTreeLearner::ResetConfig(const TreeConfig* tree_config) {
tree_config_ = tree_config;
}
histogram_pool_.ResetConfig(tree_config_);
}
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) {
gradients_ = gradients;
hessians_ = hessians;
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// some initial works before training
BeforeTrain();
#ifdef TIMETAG
init_train_time += std::chrono::steady_clock::now() - start_time;
#endif
auto tree = std::unique_ptr<Tree>(new Tree(tree_config_->num_leaves));
// save pointer to last trained tree
last_trained_tree_ = tree.get();
// root leaf
int left_leaf = 0;
int cur_depth = 1;
// only the root leaf can be split the first time
int right_leaf = -1;
for (int split = 0; split < tree_config_->num_leaves - 1; ++split) {
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// some initial works before finding best split
if (BeforeFindBestSplit(left_leaf, right_leaf)) {
#ifdef TIMETAG
init_split_time += std::chrono::steady_clock::now() - start_time;
#endif
// find best threshold for every feature
FindBestThresholds();
// find best split from all features
......@@ -139,13 +206,20 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
const SplitInfo& best_leaf_SplitInfo = best_split_per_leaf_[best_leaf];
// cannot split, quit
if (best_leaf_SplitInfo.gain <= 0.0) {
Log::Info("No further splits with positive gain, best gain: %f, leaves: %d",
best_leaf_SplitInfo.gain, split + 1);
Log::Info("No further splits with positive gain, best gain: %f", best_leaf_SplitInfo.gain);
break;
}
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
// split tree with best leaf
Split(tree.get(), best_leaf, &left_leaf, &right_leaf);
#ifdef TIMETAG
split_time += std::chrono::steady_clock::now() - start_time;
#endif
cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf));
}
Log::Info("Trained a tree with leaves=%d and max_depth=%d", tree->num_leaves(), cur_depth);
return tree.release();
}
......@@ -153,15 +227,24 @@ void SerialTreeLearner::BeforeTrain() {
// reset histogram pool
histogram_pool_.ResetMap();
if (tree_config_->feature_fraction < 1) {
int used_feature_cnt = static_cast<int>(train_data_->num_total_features()*tree_config_->feature_fraction);
// initialize used features
std::memset(is_feature_used_.data(), 0, sizeof(int8_t) * num_features_);
// Get used feature at current tree
auto used_feature_indices = random_.Sample(train_data_->num_total_features(), used_feature_cnt);
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(used_feature_indices.size()); ++i) {
int inner_feature_index = train_data_->InnerFeatureIndex(used_feature_indices[i]);
if (inner_feature_index < 0) { continue; }
is_feature_used_[inner_feature_index] = 1;
}
} else {
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_features_; ++i) {
is_feature_used_[i] = 1;
}
}
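// Example of the sampling above (hypothetical numbers): with
// num_total_features() == 100 and feature_fraction == 0.8, Sample() draws 80
// distinct raw indices; raw features that were pruned from the Dataset map to
// InnerFeatureIndex() == -1 and are skipped, so slightly fewer than 80 inner
// features may end up marked as used.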
// initialize data partition
......@@ -176,60 +259,49 @@ void SerialTreeLearner::BeforeTrain() {
if (data_partition_->leaf_count(0) == num_data_) {
// use all data
smaller_leaf_splits_->Init(gradients_, hessians_);
} else {
// use bagging, only use part of data
smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_);
}
larger_leaf_splits_->Init();
// if there are ordered bins, initialize them
if (has_ordered_bin_) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
if (data_partition_->leaf_count(0) == num_data_) {
// use all data, pass nullptr
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
ordered_bins_[ordered_bin_indices_[i]]->Init(nullptr, tree_config_->num_leaves);
}
} else {
// bagging, only use part of data
// mark used data
const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(0);
data_size_t end = begin + data_partition_->leaf_count(0);
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
// initialize ordered bin
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
ordered_bins_[ordered_bin_indices_[i]]->Init(is_data_in_leaf_.data(), tree_config_->num_leaves);
}
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 0;
}
}
#ifdef TIMETAG
ordered_bin_time += std::chrono::steady_clock::now() - start_time;
#endif
}
}
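// Note the mark/unmark pattern above: is_data_in_leaf_ is a persistent
// num_data_-sized byte buffer that is zeroed once at (re)initialization; each
// use sets only the touched indices to 1 and clears exactly those indices
// afterwards, keeping the buffer all-zero without a full memset per call.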
......@@ -249,7 +321,7 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
// not enough data to continue
if (num_data_in_right_child < static_cast<data_size_t>(tree_config_->min_data_in_leaf * 2)
&& num_data_in_left_child < static_cast<data_size_t>(tree_config_->min_data_in_leaf * 2)) {
best_split_per_leaf_[left_leaf].gain = kMinScore;
if (right_leaf >= 0) {
best_split_per_leaf_[right_leaf].gain = kMinScore;
......@@ -257,172 +329,184 @@ bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
return false;
}
parent_leaf_histogram_array_ = nullptr;
// only have root
if (right_leaf < 0) {
histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
larger_leaf_histogram_array_ = nullptr;
} else if (num_data_in_left_child < num_data_in_right_child) {
// put parent(left) leaf's histograms into larger leaf's histograms
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Move(left_leaf, right_leaf);
histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
} else {
// put parent(left) leaf's histograms to larger leaf's histograms
if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_);
}
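// Histogram caching scheme above: if the parent's histograms are still in the
// pool they are re-used for the larger child (parent_leaf_histogram_array_),
// so only the smaller child's histograms must be built from data; the larger
// child's are then derived as parent minus smaller via Subtract(). Move()
// transfers the cached slot because the larger child keeps the parent's data.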
// split for the ordered bin
if (has_ordered_bin_ && right_leaf >= 0) {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
// mark the data that fall in the smaller child
const data_size_t* indices = data_partition_->indices();
const auto left_cnt = data_partition_->leaf_count(left_leaf);
const auto right_cnt = data_partition_->leaf_count(right_leaf);
char mark = 1;
data_size_t begin = data_partition_->leaf_begin(left_leaf);
data_size_t end = begin + left_cnt;
if (left_cnt > right_cnt) {
begin = data_partition_->leaf_begin(right_leaf);
end = begin + right_cnt;
mark = 0;
}
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
// split the ordered bin
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(ordered_bin_indices_.size()); ++i) {
ordered_bins_[ordered_bin_indices_[i]]->Split(left_leaf, right_leaf, is_data_in_leaf_.data(), mark);
}
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 0;
}
#ifdef TIMETAG
ordered_bin_time += std::chrono::steady_clock::now() - start_time;
#endif
}
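// The mark flag above is a small optimization: only the smaller of the two
// children is marked (mark == 1 when that child is the left one, 0 when it is
// the right one), so the marking pass always walks the cheaper side and
// OrderedBin::Split is told which polarity the marks carry.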
return true;
}
void SerialTreeLearner::FindBestThresholds() {
#ifdef TIMETAG
auto start_time = std::chrono::steady_clock::now();
#endif
std::vector<int8_t> is_feature_used(num_features_, 0);
#pragma omp parallel for schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used_[feature_index]) continue;
if (parent_leaf_histogram_array_ != nullptr
&& !parent_leaf_histogram_array_[feature_index].is_splittable()) {
smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
continue;
}
is_feature_used[feature_index] = 1;
}
bool use_subtract = true;
if (parent_leaf_histogram_array_ == nullptr) {
use_subtract = false;
}
// construct smaller leaf
HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
train_data_->ConstructHistograms(is_feature_used,
smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
ptr_smaller_leaf_hist_data);
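// The "- 1" on RawData() above appears to undo the one-entry bias offset
// applied when the histograms were laid out in DynamicChangeSize, recovering
// the base pointer that ConstructHistograms indexes with absolute bin ids;
// this reading is inferred from the layout code, not spelled out here.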
if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
// construct larger leaf
HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
train_data_->ConstructHistograms(is_feature_used,
larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_splits_->LeafIndex(),
ordered_bins_, gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(),
ptr_larger_leaf_hist_data);
}
#ifdef TIMETAG
hist_time += std::chrono::steady_clock::now() - start_time;
#endif
#ifdef TIMETAG
start_time = std::chrono::steady_clock::now();
#endif
std::vector<SplitInfo> smaller_best(num_threads_);
std::vector<SplitInfo> larger_best(num_threads_);
// find splits
#pragma omp parallel for schedule(static)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_used[feature_index]) { continue; }
const int tid = omp_get_thread_num();
SplitInfo smaller_split;
train_data_->FixHistogram(feature_index,
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_histogram_array_[feature_index].RawData());
smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
smaller_leaf_splits_->num_data_in_leaf(),
&smaller_split);
if (smaller_split.gain > smaller_best[tid].gain) {
smaller_best[tid] = smaller_split;
smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
// only has root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
if (use_subtract) {
larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
} else {
train_data_->FixHistogram(feature_index, larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(),
larger_leaf_splits_->num_data_in_leaf(),
larger_leaf_histogram_array_[feature_index].RawData());
}
SplitInfo larger_split;
// find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold(
larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(),
larger_leaf_splits_->num_data_in_leaf(),
&larger_split);
if (larger_split.gain > larger_best[tid].gain) {
larger_best[tid] = larger_split;
larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
}
}
auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
int leaf = smaller_leaf_splits_->LeafIndex();
best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];
if (larger_leaf_splits_ != nullptr && larger_leaf_splits_->LeafIndex() >= 0) {
leaf = larger_leaf_splits_->LeafIndex();
auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
}
#ifdef TIMETAG
find_split_time += std::chrono::steady_clock::now() - start_time;
#endif
}
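// Reduction pattern above: each OpenMP thread keeps its running best in
// smaller_best[tid] / larger_best[tid], so the parallel loop needs no locks;
// a single ArgMax pass over the num_threads_ candidates then selects the
// global best split for each leaf.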
void SerialTreeLearner::FindBestSplitsForLeaves() {
}
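// Intentionally empty in the serial learner: this is the hook the parallel
// learners override (e.g. FeatureParallelTreeLearner above syncs the local
// bests across machines).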
void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
// left = parent
*left_leaf = best_Leaf;
// split tree, will return right leaf
*right_leaf = tree->Split(best_Leaf,
inner_feature_index,
train_data_->FeatureBinMapper(inner_feature_index)->bin_type(),
best_split_info.threshold,
best_split_info.feature,
train_data_->RealThreshold(inner_feature_index, best_split_info.threshold),
static_cast<double>(best_split_info.left_output),
static_cast<double>(best_split_info.right_output),
static_cast<data_size_t>(best_split_info.left_count),
static_cast<data_size_t>(best_split_info.right_count),
static_cast<double>(best_split_info.gain));
// split data partition
data_partition_->Split(best_Leaf, train_data_, inner_feature_index,
best_split_info.threshold, *right_leaf);
// init the leaves that used on next iteration
......@@ -431,8 +515,8 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian);
larger_leaf_splits_->Init(*right_leaf, data_partition_.get(),
best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian);
} else {
smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
......