Commit 1c774687 authored by Guolin Ke

first commit
#ifndef LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_
#define LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_
#include <LightGBM/objective_function.h>
namespace LightGBM {
/*!
* \brief Objective function for regression
*/
class RegressionL2loss: public ObjectiveFunction {
public:
explicit RegressionL2loss(const ObjectiveConfig&) {
}
~RegressionL2loss() {
}
void Init(const Metadata& metadata, data_size_t num_data) override {
num_data_ = num_data;
label_ = metadata.label();
weights_ = metadata.weights();
}
void GetGradients(const score_t* score, score_t* gradients,
score_t* hessians) const override {
if (weights_ == nullptr) {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = (score[i] - label_[i]);
hessians[i] = 1.0;
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
gradients[i] = (score[i] - label_[i]) * weights_[i];
hessians[i] = weights_[i];
}
}
}
double GetSigmoid() const override {
// no sigmoid transform, return -1.0
return -1.0;
}
private:
/*! \brief Number of data */
data_size_t num_data_;
/*! \brief Pointer of label */
const float* label_;
/*! \brief Pointer of weights */
const float* weights_;
};
} // namespace LightGBM
#endif  // LIGHTGBM_OBJECTIVE_REGRESSION_OBJECTIVE_HPP_
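// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): a minimal standalone sketch
// of the math RegressionL2loss::GetGradients implements. For squared loss
// 0.5 * (score - label)^2, the first derivative w.r.t. the score is
// (score - label) and the second derivative is 1, which is exactly what the
// gradient/hessian arrays above are filled with (scaled by the weight when
// weights are present).
#include <cstdio>

int main() {
const float labels[] = {1.0f, 2.0f, 3.0f};
const float scores[] = {0.5f, 2.5f, 3.0f};
for (int i = 0; i < 3; ++i) {
const float gradient = scores[i] - labels[i];  // d/ds 0.5*(s-y)^2
const float hessian = 1.0f;                    // d^2/ds^2 0.5*(s-y)^2
std::printf("i=%d gradient=%.2f hessian=%.2f\n", i, gradient, hessian);
}
return 0;
}
// ---------------------------------------------------------------------------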
#include "parallel_tree_learner.h"
#include <cstring>
#include <tuple>
#include <vector>
namespace LightGBM {
DataParallelTreeLearner::DataParallelTreeLearner(const TreeConfig& tree_config)
:SerialTreeLearner(tree_config), input_buffer_(nullptr),
output_buffer_(nullptr), is_feature_aggregated_(nullptr),
block_start_(nullptr), block_len_(nullptr),
buffer_write_start_pos_(nullptr), buffer_read_start_pos_(nullptr),
global_data_count_in_leaf_(nullptr) {
}
DataParallelTreeLearner::~DataParallelTreeLearner() {
if (input_buffer_ != nullptr) { delete[] input_buffer_; }
if (output_buffer_ != nullptr) { delete[] output_buffer_; }
if (is_feature_aggregated_ != nullptr) { delete[] is_feature_aggregated_; }
if (block_start_ != nullptr) { delete[] block_start_; }
if (block_len_ != nullptr) { delete[] block_len_; }
if (buffer_write_start_pos_ != nullptr) { delete[] buffer_write_start_pos_; }
if (buffer_read_start_pos_ != nullptr) { delete[] buffer_read_start_pos_; }
if (global_data_count_in_leaf_ != nullptr) { delete[] global_data_count_in_leaf_; }
}
void DataParallelTreeLearner::Init(const Dataset* train_data) {
// initialize SerialTreeLearner
SerialTreeLearner::Init(train_data);
// Get local rank and global machine size
rank_ = Network::rank();
num_machines_ = Network::num_machines();
// allocate buffer for communication
size_t buffer_size = 0;
for (int i = 0; i < num_features_; ++i) {
buffer_size += train_data_->FeatureAt(i)->num_bin() * sizeof(HistogramBinEntry);
}
input_buffer_ = new char[buffer_size];
output_buffer_ = new char[buffer_size];
is_feature_aggregated_ = new bool[num_features_];
block_start_ = new int[num_machines_];
block_len_ = new int[num_machines_];
buffer_write_start_pos_ = new int[num_features_];
buffer_read_start_pos_ = new int[num_features_];
global_data_count_in_leaf_ = new data_size_t[num_leaves_];
}
void DataParallelTreeLearner::BeforeTrain() {
SerialTreeLearner::BeforeTrain();
// generate feature partition for current tree
std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
std::vector<int> num_bins_distributed(num_machines_, 0);
for (int i = 0; i < train_data_->num_features(); ++i) {
if (is_feature_used_[i]) {
int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
feature_distribution[cur_min_machine].push_back(i);
num_bins_distributed[cur_min_machine] += train_data_->FeatureAt(i)->num_bin();
}
is_feature_aggregated_[i] = false;
}
// get local used feature
for (auto fid : feature_distribution[rank_]) {
is_feature_aggregated_[fid] = true;
}
// get block start and block len for reduce scatter
reduce_scatter_size_ = 0;
for (int i = 0; i < num_machines_; ++i) {
block_len_[i] = 0;
for (auto fid : feature_distribution[i]) {
block_len_[i] += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
}
reduce_scatter_size_ += block_len_[i];
}
block_start_[0] = 0;
for (int i = 1; i < num_machines_; ++i) {
block_start_[i] = block_start_[i - 1] + block_len_[i - 1];
}
// get buffer_write_start_pos_
int bin_size = 0;
for (int i = 0; i < num_machines_; ++i) {
for (auto fid : feature_distribution[i]) {
buffer_write_start_pos_[fid] = bin_size;
bin_size += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
}
}
// get buffer_read_start_pos_
bin_size = 0;
for (auto fid : feature_distribution[rank_]) {
buffer_read_start_pos_[fid] = bin_size;
bin_size += train_data_->FeatureAt(fid)->num_bin() * sizeof(HistogramBinEntry);
}
// sync global data sumup info
std::tuple<data_size_t, score_t, score_t> data(smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians());
int size = sizeof(data);
std::memcpy(input_buffer_, &data, size);
// global sumup reduce
Network::Allreduce(input_buffer_, size, size, output_buffer_, [](const char *src, char *dst, int len) {
int used_size = 0;
int type_size = sizeof(std::tuple<data_size_t, score_t, score_t>);
const std::tuple<data_size_t, score_t, score_t> *p1;
std::tuple<data_size_t, score_t, score_t> *p2;
while (used_size < len) {
p1 = reinterpret_cast<const std::tuple<data_size_t, score_t, score_t> *>(src);
p2 = reinterpret_cast<std::tuple<data_size_t, score_t, score_t> *>(dst);
std::get<0>(*p2) = std::get<0>(*p2) + std::get<0>(*p1);
std::get<1>(*p2) = std::get<1>(*p2) + std::get<1>(*p1);
std::get<2>(*p2) = std::get<2>(*p2) + std::get<2>(*p1);
src += type_size;
dst += type_size;
used_size += type_size;
}
});
// copy back
std::memcpy(&data, output_buffer_, size);
// set global sumup info
smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data));
// init global data count in leaf
global_data_count_in_leaf_[0] = std::get<0>(data);
}
void DataParallelTreeLearner::FindBestThresholds() {
// construct local histograms
#pragma omp parallel for schedule(guided)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if ((is_feature_used_ != nullptr && is_feature_used_[feature_index] == false)) continue;
// construct histograms for smaller leaf
if (ordered_bins_[feature_index] == nullptr) {
smaller_leaf_histogram_array_[feature_index].Construct(smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
ptr_to_ordered_gradients_,
ptr_to_ordered_hessians_);
} else {
smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
smaller_leaf_splits_->LeafIndex(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
gradients_,
hessians_);
}
// copy to buffer
std::memcpy(input_buffer_ + buffer_write_start_pos_[feature_index],
smaller_leaf_histogram_array_[feature_index].HistogramData(),
smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
}
// Reduce scatter for histogram
Network::ReduceScatter(input_buffer_, reduce_scatter_size_, block_start_,
block_len_, output_buffer_, &HistogramBinEntry::SumReducer);
#pragma omp parallel for schedule(guided)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
if (!is_feature_aggregated_[feature_index]) continue;
// copy global sumup info
smaller_leaf_histogram_array_[feature_index].SetSumup(
GetGlobalDataCountInLeaf(smaller_leaf_splits_->LeafIndex()),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians());
// restore global histograms from buffer
smaller_leaf_histogram_array_[feature_index].FromMemory(
output_buffer_ + buffer_read_start_pos_[feature_index]);
// find best threshold for smaller child
smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
&smaller_leaf_splits_->BestSplitPerFeature()[feature_index]);
// skip if there is only the root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
// construct histograms for the larger leaf: we initialized the larger leaf as the parent, so we can just subtract the smaller leaf's histograms
larger_leaf_histogram_array_[feature_index].Subtract(
smaller_leaf_histogram_array_[feature_index]);
// set sumup info for histogram
larger_leaf_histogram_array_[feature_index].SetSumup(
GetGlobalDataCountInLeaf(larger_leaf_splits_->LeafIndex()),
larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians());
// find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold(
&larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
}
}
void DataParallelTreeLearner::FindBestSplitsForLeaves() {
int smaller_best_feature = -1, larger_best_feature = -1;
SplitInfo smaller_best, larger_best;
std::vector<double> gains;
// find local best split for smaller leaf
for (size_t i = 0; i < smaller_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(smaller_leaf_splits_->BestSplitPerFeature()[i].gain);
}
smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
smaller_best = smaller_leaf_splits_->BestSplitPerFeature()[smaller_best_feature];
// find local best split for larger leaf
if (larger_leaf_splits_->LeafIndex() >= 0) {
gains.clear();
for (size_t i = 0; i < larger_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(larger_leaf_splits_->BestSplitPerFeature()[i].gain);
}
larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
larger_best = larger_leaf_splits_->BestSplitPerFeature()[larger_best_feature];
}
// sync global best info
std::memcpy(input_buffer_, &smaller_best, sizeof(SplitInfo));
std::memcpy(input_buffer_ + sizeof(SplitInfo), &larger_best, sizeof(SplitInfo));
Network::Allreduce(input_buffer_, sizeof(SplitInfo) * 2, sizeof(SplitInfo),
output_buffer_, &SplitInfo::MaxReducer);
std::memcpy(&smaller_best, output_buffer_, sizeof(SplitInfo));
std::memcpy(&larger_best, output_buffer_ + sizeof(SplitInfo), sizeof(SplitInfo));
// set best split
best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()] = smaller_best;
if (larger_leaf_splits_->LeafIndex() >= 0) {
best_split_per_leaf_[larger_leaf_splits_->LeafIndex()] = larger_best;
}
}
void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf);
const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
// need to update the global number of data in the leaf
global_data_count_in_leaf_[*left_leaf] = best_split_info.left_count;
global_data_count_in_leaf_[*right_leaf] = best_split_info.right_count;
}
} // namespace LightGBM
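// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): a minimal sketch of how
// block_start_/block_len_ are derived for the histogram ReduceScatter above.
// Assumed toy setup: 2 machines, features {0, 2} assigned to machine 0 and
// feature {1} to machine 1, bin counts {4, 3, 5}, and an 8-byte entry size
// standing in for sizeof(HistogramBinEntry).
#include <cstdio>
#include <vector>

int main() {
const int num_machines = 2;
const std::vector<std::vector<int>> feature_distribution = {{0, 2}, {1}};
const int num_bins[] = {4, 3, 5};
const int entry_size = 8;  // stand-in for sizeof(HistogramBinEntry)
std::vector<int> block_len(num_machines, 0);
std::vector<int> block_start(num_machines, 0);
// each machine's block holds the histogram bytes of its assigned features
for (int i = 0; i < num_machines; ++i) {
for (int fid : feature_distribution[i]) {
block_len[i] += num_bins[fid] * entry_size;
}
}
// block starts are the running sum of the block lengths
for (int i = 1; i < num_machines; ++i) {
block_start[i] = block_start[i - 1] + block_len[i - 1];
}
for (int i = 0; i < num_machines; ++i) {
std::printf("machine %d: start=%d len=%d\n", i, block_start[i], block_len[i]);
}
return 0;  // prints: machine 0: start=0 len=72, machine 1: start=72 len=24
}
// ---------------------------------------------------------------------------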
#ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#include <LightGBM/meta.h>
#include <LightGBM/feature.h>
#include <omp.h>
#include <cstring>
#include <vector>
namespace LightGBM {
/*!
* \brief DataPartition is used to store the partition of data on the tree.
*/
class DataPartition {
public:
DataPartition(data_size_t num_data, int num_leafs)
:num_data_(num_data), num_leaves_(num_leafs) {
leaf_begin_ = new data_size_t[num_leaves_];
leaf_count_ = new data_size_t[num_leaves_];
indices_ = new data_size_t[num_data_];
temp_left_indices_ = new data_size_t[num_data_];
temp_right_indices_ = new data_size_t[num_data_];
used_data_indices_ = nullptr;
#pragma omp parallel
#pragma omp master
{
num_threads_ = omp_get_num_threads();
}
offsets_buf_ = new data_size_t[num_threads_];
left_cnts_buf_ = new data_size_t[num_threads_];
right_cnts_buf_ = new data_size_t[num_threads_];
left_write_pos_buf_ = new data_size_t[num_threads_];
right_write_pos_buf_ = new data_size_t[num_threads_];
}
~DataPartition() {
delete[] leaf_begin_;
delete[] leaf_count_;
delete[] indices_;
delete[] temp_left_indices_;
delete[] temp_right_indices_;
delete[] offsets_buf_;
delete[] left_cnts_buf_;
delete[] right_cnts_buf_;
delete[] left_write_pos_buf_;
delete[] right_write_pos_buf_;
}
/*!
* \brief Init, will put all data on the root (leaf_idx = 0)
*/
void Init() {
for (int i = 0; i < num_leaves_; ++i) {
leaf_count_[i] = 0;
}
leaf_begin_[0] = 0;
if (used_data_indices_ == nullptr) {
// if using all data
leaf_count_[0] = num_data_;
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data_; ++i) {
indices_[i] = i;
}
} else {
// if bagging
leaf_count_[0] = used_data_count_;
std::memcpy(indices_, used_data_indices_, used_data_count_ * sizeof(data_size_t));
}
}
/*!
* \brief Get the data indices of one leaf
* \param leaf index of leaf
* \param indices output data indices
* \return number of data on this leaf
*/
data_size_t GetIndexOnLeaf(int leaf, data_size_t** indices) const {
// copy reference, maybe unsafe, but faster
data_size_t begin = leaf_begin_[leaf];
(*indices) = static_cast<data_size_t*>(indices_ + begin);
return leaf_count_[leaf];
}
/*!
* \brief Split the data
* \param leaf index of leaf
* \param feature_bins feature bin data
* \param threshold threshold to split at
* \param right_leaf index of right leaf
*/
void Split(int leaf, const Bin* feature_bins, unsigned int threshold, int right_leaf) {
const data_size_t min_inner_size = 1000;
// get leaf boundary
const data_size_t begin = leaf_begin_[leaf];
const data_size_t cnt = leaf_count_[leaf];
data_size_t inner_size = (cnt + num_threads_ - 1) / num_threads_;
if (inner_size < min_inner_size) { inner_size = min_inner_size; }
// split data with multi-threading
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
data_size_t cur_start = i * inner_size;
if (cur_start > cnt) { continue; }
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
// split data within this chunk, to reduce the number of function calls
data_size_t cur_left_count = feature_bins->Split(threshold, indices_ + begin + cur_start, cur_cnt,
temp_left_indices_ + cur_start, temp_right_indices_ + cur_start);
offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count;
right_cnts_buf_[i] = cur_cnt - cur_left_count;
}
data_size_t left_cnt = 0;
left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0;
for (int i = 1; i < num_threads_; ++i) {
left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
}
left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
// copy back indices of left and right leaves to indices_
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
if (left_cnts_buf_[i] > 0) {
std::memcpy(indices_ + begin + left_write_pos_buf_[i], temp_left_indices_ + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
}
if (right_cnts_buf_[i] > 0) {
std::memcpy(indices_ + begin + left_cnt + right_write_pos_buf_[i], temp_right_indices_ + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
}
}
// update leaf boundary
leaf_count_[leaf] = left_cnt;
leaf_begin_[right_leaf] = left_cnt + begin;
leaf_count_[right_leaf] = cnt - left_cnt;
}
/*!
* \brief Set used data indices before training; used for bagging
* \param used_data_indices indices of used data
* \param num_used_data number of used data
*/
void SetUsedDataIndices(const data_size_t * used_data_indices, data_size_t num_used_data) {
used_data_indices_ = used_data_indices;
used_data_count_ = num_used_data;
}
/*!
* \brief Get number of data on one leaf
* \param leaf index of leaf
* \return number of data of this leaf
*/
data_size_t leaf_count(int leaf) const { return leaf_count_[leaf]; }
/*!
* \brief Get leaf begin
* \param leaf index of leaf
* \return begin index of this leaf
*/
data_size_t leaf_begin(int leaf) const { return leaf_begin_[leaf]; }
const data_size_t* indices() const { return indices_; }
/*! \brief Get number of leaves */
int num_leaves() const { return num_leaves_; }
private:
/*! \brief Number of all data */
data_size_t num_data_;
/*! \brief Number of all leaves */
int num_leaves_;
/*! \brief start index of data on one leaf */
data_size_t* leaf_begin_;
/*! \brief number of data on one leaf */
data_size_t* leaf_count_;
/*! \brief Stores all data indices, ordered by leaf: [data_in_leaf0, ..., data_in_leaf1, ...] */
data_size_t* indices_;
/*! \brief temp indices buffer for split */
data_size_t* temp_left_indices_;
/*! \brief temp indices buffer for split */
data_size_t* temp_right_indices_;
/*! \brief used data indices, used for bagging */
const data_size_t* used_data_indices_;
/*! \brief used data count, used for bagging */
data_size_t used_data_count_;
/*! \brief number of threads */
int num_threads_;
/*! \brief Buffer for multi-threading data partition, used to store offset for different threads */
data_size_t* offsets_buf_;
/*! \brief Buffer for multi-threading data partition, used to store left count after split for different threads */
data_size_t* left_cnts_buf_;
/*! \brief Buffer for multi-threading data partition, used to store right count after split for different threads */
data_size_t* right_cnts_buf_;
/*! \brief Buffer for multi-threading data partition, used to store write position of left leaf for different threads */
data_size_t* left_write_pos_buf_;
/*! \brief Buffer for multi-threading data partition, used to store write position of right leaf for different threads */
data_size_t* right_write_pos_buf_;
};
} // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
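// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): a minimal single-threaded
// sketch of the two-pass partition used by DataPartition::Split above. Each
// "thread" first partitions its own chunk into temp buffers and records the
// counts; a prefix sum over the counts then gives every chunk a disjoint
// write position, so the copy-back is race free and the relative order inside
// the left and right parts is preserved.
#include <cstdio>
#include <vector>

int main() {
const std::vector<int> values = {5, 1, 7, 2, 9, 3, 8, 4};
const int threshold = 4;    // "left" means value <= threshold
const int num_chunks = 2;   // stands in for the number of threads
const int chunk = static_cast<int>(values.size()) / num_chunks;
std::vector<int> tmp_left(values.size()), tmp_right(values.size());
std::vector<int> left_cnt(num_chunks), right_cnt(num_chunks);
// pass 1: per-chunk partition into temp buffers (parallel in the real code)
for (int t = 0; t < num_chunks; ++t) {
const int begin = t * chunk;
int lc = 0, rc = 0;
for (int i = begin; i < begin + chunk; ++i) {
if (values[i] <= threshold) tmp_left[begin + lc++] = values[i];
else tmp_right[begin + rc++] = values[i];
}
left_cnt[t] = lc;
right_cnt[t] = rc;
}
// pass 2: prefix sums give disjoint write positions, then copy back
const int left_total = left_cnt[0] + left_cnt[1];
const int left_pos[2] = {0, left_cnt[0]};
const int right_pos[2] = {left_total, left_total + right_cnt[0]};
std::vector<int> out(values.size());
for (int t = 0; t < num_chunks; ++t) {
const int begin = t * chunk;
for (int i = 0; i < left_cnt[t]; ++i) out[left_pos[t] + i] = tmp_left[begin + i];
for (int i = 0; i < right_cnt[t]; ++i) out[right_pos[t] + i] = tmp_right[begin + i];
}
for (int v : out) std::printf("%d ", v);  // 1 2 3 4 5 7 9 8
std::printf("\n");
return 0;
}
// ---------------------------------------------------------------------------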
#ifndef LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#define LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
#include "split_info.hpp"
#include <LightGBM/feature.h>
#include <cstring>
namespace LightGBM {
/*!
* \brief FeatureHistogram is used to construct and store a histogram for a feature.
*/
class FeatureHistogram {
public:
FeatureHistogram()
:data_(nullptr) {
}
~FeatureHistogram() {
if (data_ != nullptr) { delete[] data_; }
}
/*!
* \brief Init the feature histogram
* \param feature the feature data for this histogram
* \param feature_idx index of this feature
* \param min_num_data_one_leaf minimal number of data in one leaf
* \param min_sum_hessian_one_leaf minimal sum of hessians in one leaf
*/
void Init(const Feature* feature, int feature_idx, data_size_t min_num_data_one_leaf,
score_t min_sum_hessian_one_leaf) {
feature_idx_ = feature_idx;
min_num_data_one_leaf_ = min_num_data_one_leaf;
min_sum_hessian_one_leaf_ = min_sum_hessian_one_leaf;
bin_data_ = feature->bin_data();
num_bins_ = feature->num_bin();
data_ = new HistogramBinEntry[num_bins_];
}
/*!
* \brief Construct a histogram
* \param data_indices data indices of current leaf
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hessians of current leaf
* \param ordered_gradients ordered gradients
* \param ordered_hessians ordered hessians
*/
void Construct(data_size_t* data_indices, data_size_t num_data, score_t sum_gradients,
score_t sum_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians) {
std::memset(data_, 0, sizeof(HistogramBinEntry)* num_bins_);
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
bin_data_->ConstructHistogram(data_indices, num_data, ordered_gradients, ordered_hessians, data_);
}
/*!
* \brief Construct a histogram by ordered bin
* \param ordered_bin the ordered bin data
* \param leaf current leaf
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hessians of current leaf
* \param gradients gradients of all data
* \param hessians hessians of all data
*/
void Construct(const OrderedBin* ordered_bin, int leaf, data_size_t num_data, score_t sum_gradients,
score_t sum_hessians, const score_t* gradients, const score_t* hessians) {
std::memset(data_, 0, sizeof(HistogramBinEntry)* num_bins_);
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
ordered_bin->ConstructHistogram(leaf, gradients, hessians, data_);
}
/*!
* \brief Set sumup information for current histogram
* \param num_data number of data in current leaf
* \param sum_gradients sum of gradients of current leaf
* \param sum_hessians sum of hessians of current leaf
*/
void SetSumup(data_size_t num_data, score_t sum_gradients, score_t sum_hessians) {
num_data_ = num_data;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians + 2 * kEpsilon;
}
/*!
* \brief Subtract another histogram from the current one
* \param other The histogram to subtract
*/
void Subtract(const FeatureHistogram& other) {
num_data_ -= other.num_data_;
sum_gradients_ -= other.sum_gradients_;
sum_hessians_ -= other.sum_hessians_;
for (unsigned int i = 0; i < num_bins_; ++i) {
data_[i].cnt -= other.data_[i].cnt;
data_[i].sum_gradients -= other.data_[i].sum_gradients;
data_[i].sum_hessians -= other.data_[i].sum_hessians;
}
}
/*!
* \brief Find best threshold for this histogram
* \param output The best split result
*/
void FindBestThreshold(SplitInfo* output) {
score_t best_sum_left_gradient = NAN;
score_t best_sum_left_hessian = NAN;
score_t best_gain = kMinScore;
data_size_t best_left_count = 0;
unsigned int best_threshold = static_cast<unsigned int>(num_bins_);
score_t sum_right_gradient = 0.0f;
score_t sum_right_hessian = kEpsilon;
data_size_t right_count = 0;
score_t gain_shift = GetLeafSplitGain(sum_gradients_, sum_hessians_);
is_splittable_ = false;
// from right to left, and we don't need data in bin0
for (unsigned int t = num_bins_ - 1; t > 0; --t) {
sum_right_gradient += data_[t].sum_gradients;
sum_right_hessian += data_[t].sum_hessians;
right_count += data_[t].cnt;
// if data not enough, or sum hessian too small
if (right_count < min_num_data_one_leaf_ || sum_right_hessian < min_sum_hessian_one_leaf_) continue;
data_size_t left_count = num_data_ - right_count;
// if data not enough
if (left_count < min_num_data_one_leaf_) break;
score_t sum_left_hessian = sum_hessians_ - sum_right_hessian;
// if sum hessian too small
if (sum_left_hessian < min_sum_hessian_one_leaf_) {
break;
}
score_t sum_left_gradient = sum_gradients_ - sum_right_gradient;
// current split gain
score_t current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian) + GetLeafSplitGain(sum_right_gradient, sum_right_hessian);
// gain is worse than not splitting
if (current_gain < gain_shift) {
continue;
}
// mark as splittable
is_splittable_ = true;
// better split point
if (current_gain > best_gain) {
best_left_count = left_count;
best_sum_left_gradient = sum_left_gradient;
best_sum_left_hessian = sum_left_hessian;
// left is <= threshold, right is > threshold. so this is t-1
best_threshold = t - 1;
best_gain = current_gain;
}
}
// update split information
output->feature = feature_idx_;
output->threshold = best_threshold;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian);
output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian;
output->right_output = CalculateSplittedLeafOutput(sum_gradients_ - best_sum_left_gradient,
sum_hessians_ - best_sum_left_hessian);
output->right_count = num_data_ - best_left_count;
output->right_sum_gradient = sum_gradients_ - best_sum_left_gradient;
output->right_sum_hessian = sum_hessians_ - best_sum_left_hessian;
output->gain = best_gain - gain_shift;
}
/*!
* \brief Binary size of this histogram
*/
int SizeOfHistgram() const {
return num_bins_ * sizeof(HistogramBinEntry);
}
/*!
* \brief Memory pointer to histogram data
*/
const HistogramBinEntry* HistogramData() const {
return data_;
}
/*!
* \brief Restore histogram from memory
*/
void FromMemory(char* memory_data) {
std::memcpy(data_, memory_data, num_bins_ * sizeof(HistogramBinEntry));
}
/*!
* \brief Set min number data in one leaf
*/
void SetMinNumDataOneLeaf(data_size_t new_val) {
min_num_data_one_leaf_ = new_val;
}
/*!
* \brief Set min sum hessian in one leaf
*/
void SetMinSumHessianOneLeaf(score_t new_val) {
min_sum_hessian_one_leaf_ = new_val;
}
/*!
* \brief True if this histogram can be split
*/
bool is_splittable() { return is_splittable_; }
/*!
* \brief Set splittable to this histogram
*/
void set_is_splittable(bool val) { is_splittable_ = val; }
private:
/*!
* \brief Calculate the split gain based on sum_gradients and sum_hessians
* \param sum_gradients
* \param sum_hessians
* \return split gain
*/
score_t GetLeafSplitGain(score_t sum_gradients, score_t sum_hessians) const {
return (sum_gradients * sum_gradients) / (sum_hessians);
}
/*!
* \brief Calculate the output of a leaf based on sum_gradients and sum_hessians
* \param sum_gradients
* \param sum_hessians
* \return leaf output
*/
score_t CalculateSplittedLeafOutput(score_t sum_gradients, score_t sum_hessians) const {
return -(sum_gradients) / (sum_hessians);
}
int feature_idx_;
/*! \brief minimal number of data in one leaf */
data_size_t min_num_data_one_leaf_;
/*! \brief minimal sum hessian of data in one leaf */
score_t min_sum_hessian_one_leaf_;
/*! \brief the bin data of current feature */
const Bin* bin_data_;
/*! \brief number of bin of histogram */
unsigned int num_bins_;
/*! \brief sum of gradient of each bin */
HistogramBinEntry* data_;
/*! \brief number of all data */
data_size_t num_data_;
/*! \brief sum of gradient of current leaf */
score_t sum_gradients_;
/*! \brief sum of hessians of current leaf */
score_t sum_hessians_;
/*! \brief False if this histogram cannot split */
bool is_splittable_ = true;
};
} // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_FEATURE_HISTOGRAM_HPP_
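// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): a minimal sketch of the two
// ideas FeatureHistogram relies on. First, the gain of a leaf is
// sum_gradients^2 / sum_hessians (see GetLeafSplitGain), so a split is kept
// only when left gain + right gain beats the parent's gain. Second, a child's
// histogram equals the parent's histogram minus the sibling's, which is why
// Subtract() reconstructs the larger leaf almost for free.
#include <cstdio>

struct BinEntry { double sum_gradients; double sum_hessians; int cnt; };

static double LeafGain(double g, double h) { return g * g / h; }

int main() {
BinEntry parent[3] = {{-4.0, 2.0, 2}, {1.0, 3.0, 3}, {3.0, 2.0, 2}};
BinEntry smaller[3] = {{-4.0, 2.0, 2}, {0.5, 1.0, 1}, {0.0, 0.0, 0}};
BinEntry larger[3];
for (int i = 0; i < 3; ++i) {  // larger child = parent - smaller child
larger[i].sum_gradients = parent[i].sum_gradients - smaller[i].sum_gradients;
larger[i].sum_hessians = parent[i].sum_hessians - smaller[i].sum_hessians;
larger[i].cnt = parent[i].cnt - smaller[i].cnt;
std::printf("bin %d: larger g=%.1f h=%.1f cnt=%d\n", i,
larger[i].sum_gradients, larger[i].sum_hessians, larger[i].cnt);
}
// gain of splitting the parent at "bin 0 vs the rest":
const double gl = -4.0, hl = 2.0;  // left = bin 0
const double gr = 4.0, hr = 5.0;   // right = bins 1 and 2
const double gain = LeafGain(gl, hl) + LeafGain(gr, hr) - LeafGain(gl + gr, hl + hr);
std::printf("split gain = %.3f\n", gain);  // positive => the split is worth it
return 0;
}
// ---------------------------------------------------------------------------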
#include "parallel_tree_learner.h"
#include <cstring>
#include <vector>
namespace LightGBM {
FeatureParallelTreeLearner::FeatureParallelTreeLearner(const TreeConfig& tree_config)
:SerialTreeLearner(tree_config), input_buffer_(nullptr), output_buffer_(nullptr) {
}
FeatureParallelTreeLearner::~FeatureParallelTreeLearner() {
if (input_buffer_ != nullptr) { delete[] input_buffer_; }
if (output_buffer_ != nullptr) { delete[] output_buffer_; }
}
void FeatureParallelTreeLearner::Init(const Dataset* train_data) {
SerialTreeLearner::Init(train_data);
rank_ = Network::rank();
num_machines_ = Network::num_machines();
input_buffer_ = new char[sizeof(SplitInfo) * 2];
output_buffer_ = new char[sizeof(SplitInfo) * 2];
}
void FeatureParallelTreeLearner::BeforeTrain() {
SerialTreeLearner::BeforeTrain();
// get feature partition
std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
std::vector<int> num_bins_distributed(num_machines_, 0);
for (int i = 0; i < train_data_->num_features(); ++i) {
if (is_feature_used_[i]) {
int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
feature_distribution[cur_min_machine].push_back(i);
num_bins_distributed[cur_min_machine] += train_data_->FeatureAt(i)->num_bin();
is_feature_used_[i] = false;
}
}
// get local used features
for (auto fid : feature_distribution[rank_]) {
is_feature_used_[fid] = true;
}
}
void FeatureParallelTreeLearner::FindBestSplitsForLeaves() {
int smaller_best_feature = -1, larger_best_feature = -1;
SplitInfo smaller_best, larger_best;
// get best split at smaller leaf
std::vector<double> gains;
for (size_t i = 0; i < smaller_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(smaller_leaf_splits_->BestSplitPerFeature()[i].gain);
}
smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
smaller_best = smaller_leaf_splits_->BestSplitPerFeature()[smaller_best_feature];
// get best split at larger leaf
if (larger_leaf_splits_->LeafIndex() >= 0) {
gains.clear();
for (size_t i = 0; i < larger_leaf_splits_->BestSplitPerFeature().size(); ++i) {
gains.push_back(larger_leaf_splits_->BestSplitPerFeature()[i].gain);
}
larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
larger_best = larger_leaf_splits_->BestSplitPerFeature()[larger_best_feature];
}
// sync global best info
std::memcpy(input_buffer_, &smaller_best, sizeof(SplitInfo));
std::memcpy(input_buffer_ + sizeof(SplitInfo), &larger_best, sizeof(SplitInfo));
Network::Allreduce(input_buffer_, sizeof(SplitInfo) * 2, sizeof(SplitInfo),
output_buffer_, &SplitInfo::MaxReducer);
// copy back
std::memcpy(&smaller_best, output_buffer_, sizeof(SplitInfo));
std::memcpy(&larger_best, output_buffer_ + sizeof(SplitInfo), sizeof(SplitInfo));
// update best split
best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()] = smaller_best;
if (larger_leaf_splits_->LeafIndex() >= 0) {
best_split_per_leaf_[larger_leaf_splits_->LeafIndex()] = larger_best;
}
}
} // namespace LightGBM
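// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): a minimal sketch of the
// greedy feature assignment used in BeforeTrain above. Each feature goes to
// the machine that currently holds the fewest total bins (the ArgMin in the
// real code), which roughly balances the histogram work across machines.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
const int num_machines = 2;
const std::vector<int> feature_bins = {255, 16, 64, 255, 8};  // toy bin counts
std::vector<std::vector<int>> assignment(num_machines);
std::vector<int> load(num_machines, 0);  // total bins per machine
for (size_t fid = 0; fid < feature_bins.size(); ++fid) {
const int target = static_cast<int>(
std::min_element(load.begin(), load.end()) - load.begin());
assignment[target].push_back(static_cast<int>(fid));
load[target] += feature_bins[fid];
}
for (int m = 0; m < num_machines; ++m) {
std::printf("machine %d (bins=%d): features", m, load[m]);
for (int fid : assignment[m]) std::printf(" %d", fid);
std::printf("\n");
}
return 0;
}
// ---------------------------------------------------------------------------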
#ifndef LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
#define LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
#include <LightGBM/meta.h>
#include "data_partition.hpp"
#include "split_info.hpp"
#include <vector>
namespace LightGBM {
/*!
* \brief Used to find split candidates for a leaf
*/
class LeafSplits {
public:
LeafSplits(int num_feature, data_size_t num_data)
:num_data_in_leaf_(num_data), num_data_(num_data), num_features_(num_feature),
data_indices_(nullptr) {
for (int i = 0; i < num_features_; ++i) {
best_split_per_feature_.push_back(SplitInfo());
best_split_per_feature_[i].feature = i;
}
}
~LeafSplits() {
}
/*!
* \brief Init splits on current leaf; no need to traverse all data
* \param leaf Index of current leaf
* \param data_partition current data partition
* \param sum_gradients
* \param sum_hessians
*/
void Init(int leaf, const DataPartition* data_partition, score_t sum_gradients, score_t sum_hessians) {
leaf_index_ = leaf;
num_data_in_leaf_ = data_partition->GetIndexOnLeaf(leaf, &data_indices_);
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
}
/*!
* \brief Init splits on current leaf; need to traverse all data to sum up
* \param gradients
* \param hessians
*/
void Init(const score_t* gradients, const score_t *hessians) {
num_data_in_leaf_ = num_data_;
leaf_index_ = 0;
data_indices_ = nullptr;
score_t tmp_sum_gradients = 0.0;
score_t tmp_sum_hessians = 0.0;
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
tmp_sum_gradients += gradients[i];
tmp_sum_hessians += hessians[i];
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
}
/*!
* \brief Init splits on current leaf; need to traverse all data to sum up
* \param leaf Index of current leaf
* \param data_partition current data partition
* \param gradients
* \param hessians
*/
void Init(int leaf, const DataPartition* data_partition, const score_t* gradients, const score_t *hessians) {
leaf_index_ = leaf;
num_data_in_leaf_ = data_partition->GetIndexOnLeaf(leaf, &data_indices_);
score_t tmp_sum_gradients = 0.0;
score_t tmp_sum_hessians = 0.0;
#pragma omp parallel for schedule(static) reduction(+:tmp_sum_gradients, tmp_sum_hessians)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
data_size_t idx = data_indices_[i];
tmp_sum_gradients += gradients[idx];
tmp_sum_hessians += hessians[idx];
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
}
/*!
* \brief Init splits on current leaf, only update sum_gradients and sum_hessians
* \param sum_gradients
* \param sum_hessians
*/
void Init(score_t sum_gradients, score_t sum_hessians) {
leaf_index_ = 0;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
}
/*!
* \brief Init splits on current leaf
*/
void Init() {
leaf_index_ = -1;
for (SplitInfo& split_info : best_split_per_feature_) {
split_info.Reset();
}
}
/*! \brief Get best splits on all features */
std::vector<SplitInfo>& BestSplitPerFeature() { return best_split_per_feature_;}
/*! \brief Get current leaf index */
int LeafIndex() const { return leaf_index_; }
/*! \brief Get number of data in current leaf */
data_size_t num_data_in_leaf() const { return num_data_in_leaf_; }
/*! \brief Get sum of gradients of current leaf */
score_t sum_gradients() const { return sum_gradients_; }
/*! \brief Get sum of hessians of current leaf */
score_t sum_hessians() const { return sum_hessians_; }
/*! \brief Get indices of data of current leaf */
data_size_t * data_indices() const { return data_indices_; }
private:
/*! \brief stores the best split of each feature on the current leaf */
std::vector<SplitInfo> best_split_per_feature_;
/*! \brief current leaf index */
int leaf_index_;
/*! \brief number of data on current leaf */
data_size_t num_data_in_leaf_;
/*! \brief number of all training data */
data_size_t num_data_;
/*! \brief number of features */
int num_features_;
/*! \brief sum of gradients of current leaf */
score_t sum_gradients_;
/*! \brief sum of hessians of current leaf */
score_t sum_hessians_;
/*! \brief indices of data of current leaf */
data_size_t* data_indices_;
};
} // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_LEAF_SPLITS_HPP_
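// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): a minimal sketch of the
// OpenMP reduction pattern LeafSplits::Init uses to sum gradients and
// hessians. Each thread accumulates private partial sums, and the partial
// sums are combined when the parallel region ends.
#include <cstdio>
#include <vector>

int main() {
const int n = 1000;
std::vector<double> gradients(n, 0.5), hessians(n, 1.0);
double sum_g = 0.0, sum_h = 0.0;
#pragma omp parallel for schedule(static) reduction(+:sum_g, sum_h)
for (int i = 0; i < n; ++i) {
sum_g += gradients[i];
sum_h += hessians[i];
}
std::printf("sum_g=%.1f sum_h=%.1f\n", sum_g, sum_h);  // 500.0 and 1000.0
return 0;
}
// ---------------------------------------------------------------------------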
#ifndef LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_
#define LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_
#include <LightGBM/utils/array_args.h>
#include <LightGBM/network.h>
#include "serial_tree_learner.h"
#include <cstring>
#include <vector>
namespace LightGBM {
/*!
* \brief Feature parallel learning algorithm.
* Different machines find the best split on different features, then sync the global best split.
* When #data is small or #feature is large, this can give a better speed-up.
*/
class FeatureParallelTreeLearner: public SerialTreeLearner {
public:
explicit FeatureParallelTreeLearner(const TreeConfig& tree_config);
~FeatureParallelTreeLearner();
virtual void Init(const Dataset* train_data);
protected:
void BeforeTrain() override;
void FindBestSplitsForLeaves() override;
private:
/*! \brief rank of local machine */
int rank_;
/*! \brief Number of machines of this parallel task */
int num_machines_;
/*! \brief Buffer for network send */
char* input_buffer_;
/*! \brief Buffer for network receive */
char* output_buffer_;
};
/*!
* \brief Data parallel learning algorithm.
* Workers use local data to construct histograms locally, then sync up global histograms.
* When #data is large or #feature is small, this can give a better speed-up.
*/
class DataParallelTreeLearner: public SerialTreeLearner {
public:
explicit DataParallelTreeLearner(const TreeConfig& tree_config);
~DataParallelTreeLearner();
void Init(const Dataset* train_data) override;
protected:
void BeforeTrain() override;
void FindBestThresholds() override;
void FindBestSplitsForLeaves() override;
void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
inline data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const override {
if (leaf_idx >= 0) {
return global_data_count_in_leaf_[leaf_idx];
} else {
return 0;
}
}
private:
/*! \brief Rank of local machine */
int rank_;
/*! \brief Number of machines of this parallel task */
int num_machines_;
/*! \brief Buffer for network send */
char* input_buffer_;
/*! \brief Buffer for network receive */
char* output_buffer_;
/*! \brief Different machines aggregate histograms for different features;
used to mark the features aggregated locally */
bool* is_feature_aggregated_;
/*! \brief Block start index for reduce scatter */
int* block_start_;
/*! \brief Block size for reduce scatter */
int* block_len_;
/*! \brief Write positions for feature histograms */
int* buffer_write_start_pos_;
/*! \brief Read positions for local feature histograms */
int* buffer_read_start_pos_;
/*! \brief Size for reduce scatter */
int reduce_scatter_size_;
/*! \brief Store global number of data in leaves */
data_size_t* global_data_count_in_leaf_;
};
} // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_
#include "serial_tree_learner.h"
#include <LightGBM/utils/array_args.h>
#include <algorithm>
#include <vector>
namespace LightGBM {
SerialTreeLearner::SerialTreeLearner(const TreeConfig& tree_config)
:data_partition_(nullptr), is_feature_used_(nullptr),
historical_histogram_array_(nullptr), smaller_leaf_histogram_array_(nullptr),
larger_leaf_histogram_array_(nullptr),
smaller_leaf_splits_(nullptr), larger_leaf_splits_(nullptr),
ordered_gradients_(nullptr), ordered_hessians_(nullptr), is_data_in_leaf_(nullptr) {
// initialize with nullptr
num_leaves_ = tree_config.num_leaves;
min_num_data_one_leaf_ = static_cast<data_size_t>(tree_config.min_data_in_leaf);
min_sum_hessian_one_leaf_ = static_cast<float>(tree_config.min_sum_hessian_in_leaf);
feature_fraction_ = tree_config.feature_fraction;
random_ = Random(tree_config.feature_fraction_seed);
}
SerialTreeLearner::~SerialTreeLearner() {
if (data_partition_ != nullptr) { delete data_partition_; }
if (smaller_leaf_splits_ != nullptr) { delete smaller_leaf_splits_; }
if (larger_leaf_splits_ != nullptr) { delete larger_leaf_splits_; }
for (int i = 0; i < num_leaves_; ++i) {
if (historical_histogram_array_[i] != nullptr) {
delete[] historical_histogram_array_[i];
}
}
if (historical_histogram_array_ != nullptr) { delete[] historical_histogram_array_; }
if (is_feature_used_ != nullptr) { delete[] is_feature_used_; }
if (ordered_gradients_ != nullptr) { delete[] ordered_gradients_; }
if (ordered_hessians_ != nullptr) { delete[] ordered_hessians_; }
for (auto& bin : ordered_bins_) {
delete bin;
}
if (is_data_in_leaf_ != nullptr) {
delete[] is_data_in_leaf_;
}
}
void SerialTreeLearner::Init(const Dataset* train_data) {
train_data_ = train_data;
num_data_ = train_data_->num_data();
num_features_ = train_data_->num_features();
// allocate the space for historical_histogram_array_
historical_histogram_array_ = new FeatureHistogram*[num_leaves_];
for (int i = 0; i < num_leaves_; ++i) {
historical_histogram_array_[i] = new FeatureHistogram[train_data_->num_features()];
for (int j = 0; j < train_data_->num_features(); ++j) {
historical_histogram_array_[i][j].Init(train_data_->FeatureAt(j),
j, min_num_data_one_leaf_,
min_sum_hessian_one_leaf_);
}
}
// push split information for all leaves
for (int i = 0; i < num_leaves_; ++i) {
best_split_per_leaf_.push_back(SplitInfo());
}
// initialize ordered_bins_ with nullptr
for (int i = 0; i < num_features_; ++i) {
ordered_bins_.push_back(nullptr);
}
// get ordered bin
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_features_; ++i) {
ordered_bins_[i] = train_data_->FeatureAt(i)->bin_data()->CreateOrderedBin();
}
// check whether any ordered bin exists
for (int i = 0; i < num_features_; ++i) {
if (ordered_bins_[i] != nullptr) {
has_ordered_bin_ = true;
break;
}
}
// initialize splits for leaf
smaller_leaf_splits_ = new LeafSplits(train_data_->num_features(), train_data_->num_data());
larger_leaf_splits_ = new LeafSplits(train_data_->num_features(), train_data_->num_data());
// initialize data partition
data_partition_ = new DataPartition(num_data_, num_leaves_);
is_feature_used_ = new bool[num_features_];
// initialize ordered gradients and hessians
ordered_gradients_ = new score_t[num_data_];
ordered_hessians_ = new score_t[num_data_];
// if there are ordered bins, allocate a buffer for fast splitting
if (has_ordered_bin_) {
is_data_in_leaf_ = new char[num_data_];
}
Log::Stdout("#data:%d #feature:%d\n", num_data_, num_features_);
}
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) {
gradients_ = gradients;
hessians_ = hessians;
// some initial work before training
BeforeTrain();
Tree *tree = new Tree(num_leaves_);
// root leaf
int left_leaf = 0;
// only the root leaf can be split the first time
int right_leaf = -1;
for (int split = 0; split < num_leaves_ - 1; split++) {
// some initial work before finding the best split
if (BeforeFindBestSplit(left_leaf, right_leaf)) {
// find best threshold for every feature
FindBestThresholds();
// find best split from all features
FindBestSplitsForLeaves();
}
// Get a leaf with max split gain
int best_leaf = static_cast<int>(ArrayArgs<SplitInfo>::ArgMax(best_split_per_leaf_));
// Get split information for best leaf
const SplitInfo& best_leaf_SplitInfo = best_split_per_leaf_[best_leaf];
// cannot split, quit
if (best_leaf_SplitInfo.gain <= 0.0) {
Log::Stdout("cannot find more split with gain = %f , current #leaves=%d\n",
best_leaf_SplitInfo.gain, split + 1);
break;
}
// split tree with best leaf
Split(tree, best_leaf, &left_leaf, &right_leaf);
}
// save pointer to last trained tree
last_trained_tree_ = tree;
return tree;
}
void SerialTreeLearner::BeforeTrain() {
// initialize used features
for (int i = 0; i < num_features_; ++i) {
is_feature_used_[i] = false;
}
// get the used features for the current tree
size_t used_feature_cnt = static_cast<size_t>(num_features_*feature_fraction_);
std::vector<size_t> used_feature_indices = random_.Sample(num_features_, used_feature_cnt);
for (auto idx : used_feature_indices) {
is_feature_used_[idx] = true;
}
// set all histogram to splittable
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_leaves_; ++i) {
for (int j = 0; j < train_data_->num_features(); ++j) {
historical_histogram_array_[i][j].set_is_splittable(true);
}
}
// initialize data partition
data_partition_->Init();
// reset the splits for leaves
for (int i = 0; i < num_leaves_; ++i) {
best_split_per_leaf_[i].Reset();
}
// Sumup for root
if (data_partition_->leaf_count(0) == num_data_) {
// use all data
smaller_leaf_splits_->Init(gradients_, hessians_);
// point to gradients, avoid copy
ptr_to_ordered_gradients_ = gradients_;
ptr_to_ordered_hessians_ = hessians_;
} else {
// use bagging, only use part of data
smaller_leaf_splits_->Init(0, data_partition_, gradients_, hessians_);
// copy used gradients and hessians to ordered buffer
const data_size_t* indices = data_partition_->indices();
data_size_t cnt = data_partition_->leaf_count(0);
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < cnt; ++i) {
ordered_gradients_[i] = gradients_[indices[i]];
ordered_hessians_[i] = hessians_[indices[i]];
}
// point to ordered_gradients_ and ordered_hessians_
ptr_to_ordered_gradients_ = ordered_gradients_;
ptr_to_ordered_hessians_ = ordered_hessians_;
}
larger_leaf_splits_->Init();
// if there are ordered bins, initialize them
if (has_ordered_bin_) {
if (data_partition_->leaf_count(0) == num_data_) {
// use all data, pass nullptr
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_features_; ++i) {
if (ordered_bins_[i] != nullptr) {
ordered_bins_[i]->Init(nullptr, num_leaves_);
}
}
} else {
// bagging, only use part of data
// mark used data
std::memset(is_data_in_leaf_, 0, sizeof(char)*num_data_);
const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(0);
data_size_t end = begin + data_partition_->leaf_count(0);
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
// initialize ordered bin
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_features_; ++i) {
if (ordered_bins_[i] != nullptr) {
ordered_bins_[i]->Init(is_data_in_leaf_, num_leaves_);
}
}
}
}
}
bool SerialTreeLearner::BeforeFindBestSplit(int left_leaf, int right_leaf) {
data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
// not enough data to continue
if (num_data_in_right_child < static_cast<data_size_t>(min_num_data_one_leaf_ * 2)
&& num_data_in_left_child < static_cast<data_size_t>(min_num_data_one_leaf_ * 2)) {
best_split_per_leaf_[left_leaf].gain = kMinScore;
if (right_leaf >= 0) {
best_split_per_leaf_[right_leaf].gain = kMinScore;
}
return false;
}
// -1 if there is only one leaf; otherwise the index of the smaller leaf
int smaller_leaf = -1;
// only have root
if (right_leaf < 0) {
smaller_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
larger_leaf_histogram_array_ = nullptr;
} else if (num_data_in_left_child < num_data_in_right_child) {
smaller_leaf = left_leaf;
// put the parent (left) leaf's histograms into the larger leaf's histograms
larger_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
smaller_leaf_histogram_array_ = historical_histogram_array_[right_leaf];
// We will construct histograms for the smaller leaf, and smaller_leaf = left_leaf = parent.
// If we don't swap the cache, we will overwrite the parent's histogram cache.
std::swap(historical_histogram_array_[left_leaf], historical_histogram_array_[right_leaf]);
} else {
smaller_leaf = right_leaf;
// put the parent (left) leaf's histograms into the larger leaf's histograms
larger_leaf_histogram_array_ = historical_histogram_array_[left_leaf];
smaller_leaf_histogram_array_ = historical_histogram_array_[right_leaf];
}
// init the ordered gradients; only needed when there are 2 leaves
if (smaller_leaf >= 0) {
// only need to initialize for smaller leaf
// Get leaf boundary
const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(smaller_leaf);
data_size_t end = begin + data_partition_->leaf_count(smaller_leaf);
// copy
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
ordered_gradients_[i - begin] = gradients_[indices[i]];
ordered_hessians_[i - begin] = hessians_[indices[i]];
}
// assign pointer
ptr_to_ordered_gradients_ = ordered_gradients_;
ptr_to_ordered_hessians_ = ordered_hessians_;
}
// split for the ordered bin
if (has_ordered_bin_ && right_leaf >= 0) {
// mark the data in the left leaf
std::memset(is_data_in_leaf_, 0, sizeof(char)*num_data_);
const data_size_t* indices = data_partition_->indices();
data_size_t begin = data_partition_->leaf_begin(left_leaf);
data_size_t end = begin + data_partition_->leaf_count(left_leaf);
#pragma omp parallel for schedule(static)
for (data_size_t i = begin; i < end; ++i) {
is_data_in_leaf_[indices[i]] = 1;
}
// split the ordered bin
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_features_; ++i) {
if (ordered_bins_[i] != nullptr) {
ordered_bins_[i]->Split(left_leaf, right_leaf, is_data_in_leaf_);
}
}
}
return true;
}
void SerialTreeLearner::FindBestThresholds() {
#pragma omp parallel for schedule(guided)
for (int feature_index = 0; feature_index < num_features_; feature_index++) {
// feature is not used
if ((is_feature_used_ != nullptr && is_feature_used_[feature_index] == false)) continue;
// if the parent (larger) leaf cannot split on the current feature
if (larger_leaf_histogram_array_ != nullptr && !larger_leaf_histogram_array_[feature_index].is_splittable()) {
smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
continue;
}
// construct histograms for smaller leaf
if (ordered_bins_[feature_index] == nullptr) {
// if not using an ordered bin
smaller_leaf_histogram_array_[feature_index].Construct(smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
ptr_to_ordered_gradients_,
ptr_to_ordered_hessians_);
} else {
// using an ordered bin
smaller_leaf_histogram_array_[feature_index].Construct(ordered_bins_[feature_index],
smaller_leaf_splits_->LeafIndex(),
smaller_leaf_splits_->num_data_in_leaf(),
smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
gradients_,
hessians_);
}
// find best threshold for smaller child
smaller_leaf_histogram_array_[feature_index].FindBestThreshold(&smaller_leaf_splits_->BestSplitPerFeature()[feature_index]);
// skip if there is only the root leaf
if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) continue;
// construct histograms for the larger leaf: we initialized the larger leaf as the parent,
// so we can just subtract the smaller leaf's histograms
larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
// find best threshold for larger child
larger_leaf_histogram_array_[feature_index].FindBestThreshold(&larger_leaf_splits_->BestSplitPerFeature()[feature_index]);
}
}
void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
// left = parent
*left_leaf = best_Leaf;
// split tree, will return right leaf
*right_leaf = tree->Split(best_Leaf, best_split_info.feature, best_split_info.threshold,
train_data_->FeatureAt(best_split_info.feature)->feature_index(),
train_data_->FeatureAt(best_split_info.feature)->BinToValue(best_split_info.threshold),
best_split_info.left_output, best_split_info.right_output, best_split_info.gain);
// split data partition
data_partition_->Split(best_Leaf, train_data_->FeatureAt(best_split_info.feature)->bin_data(),
best_split_info.threshold, *right_leaf);
// init the leaves used in the next iteration
if (best_split_info.left_count < best_split_info.right_count) {
smaller_leaf_splits_->Init(*left_leaf, data_partition_,
best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian);
larger_leaf_splits_->Init(*right_leaf, data_partition_,
best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian);
} else {
smaller_leaf_splits_->Init(*right_leaf, data_partition_, best_split_info.right_sum_gradient, best_split_info.right_sum_hessian);
larger_leaf_splits_->Init(*left_leaf, data_partition_, best_split_info.left_sum_gradient, best_split_info.left_sum_hessian);
}
}
} // namespace LightGBM
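// ---------------------------------------------------------------------------
// Illustration (not part of the original commit): the leaf-wise (best-first)
// growth loop in Train() above, stripped to its control flow. Each round
// picks the leaf whose best split has the largest gain and splits it; growth
// stops at num_leaves leaves or when no leaf has a positive-gain split left.
// The gains below are made-up numbers; the real learner recomputes them from
// histograms after every split.
#include <cstdio>
#include <vector>

int main() {
const int num_leaves = 4;
std::vector<double> best_gain = {5.0};  // gain of the best split per leaf
for (int split = 0; split < num_leaves - 1; ++split) {
// pick the leaf with the maximal split gain (ArgMax in the real code)
int best_leaf = 0;
for (size_t i = 1; i < best_gain.size(); ++i) {
if (best_gain[i] > best_gain[best_leaf]) best_leaf = static_cast<int>(i);
}
if (best_gain[best_leaf] <= 0.0) break;  // cannot split, quit
std::printf("split %d: leaf %d (gain=%.2f)\n", split, best_leaf,
best_gain[best_leaf]);
// pretend the split produced two children with smaller best gains
best_gain[best_leaf] = best_gain[best_leaf] / 2.0 - 1.0;
best_gain.push_back(best_gain[best_leaf] / 2.0);
}
return 0;
}
// ---------------------------------------------------------------------------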
#ifndef LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
#define LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/tree_learner.h>
#include <LightGBM/dataset.h>
#include <LightGBM/tree.h>
#include <LightGBM/feature.h>
#include "feature_histogram.hpp"
#include "data_partition.hpp"
#include "split_info.hpp"
#include "leaf_splits.hpp"
#include <cstdio>
#include <vector>
#include <random>
#include <cmath>
namespace LightGBM {
/*!
* \brief Used for learning a tree by single machine
*/
class SerialTreeLearner: public TreeLearner {
public:
explicit SerialTreeLearner(const TreeConfig& tree_config);
~SerialTreeLearner();
void Init(const Dataset* train_data) override;
Tree* Train(const score_t* gradients, const score_t *hessians) override;
void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
data_partition_->SetUsedDataIndices(used_indices, num_data);
}
void AddPredictionToScore(score_t *out_score) const override {
#pragma omp parallel for schedule(guided)
for (int i = 0; i < data_partition_->num_leaves(); ++i) {
double output = last_trained_tree_->LeafOutput(i);
data_size_t* tmp_idx = nullptr;
data_size_t cnt_leaf_data = data_partition_->GetIndexOnLeaf(i, &tmp_idx);
for (data_size_t j = 0; j < cnt_leaf_data; ++j) {
out_score[tmp_idx[j]] += static_cast<score_t>(output);
}
}
}
protected:
/*!
* \brief Some initial work before training
*/
virtual void BeforeTrain();
/*!
* \brief Some initial work before FindBestSplit
*/
virtual bool BeforeFindBestSplit(int left_leaf, int right_leaf);
/*!
* \brief Find best thresholds for all features, using multi-threading.
* The result will be stored in smaller_leaf_splits_ and larger_leaf_splits_.
* This function will be called in FindBestSplit.
*/
virtual void FindBestThresholds();
/*!
* \brief Find the best split for each leaf from smaller_leaf_splits_ and larger_leaf_splits_.
* This function will be called after FindBestThresholds.
*/
inline virtual void FindBestSplitsForLeaves();
/*!
* \brief Partition tree and data according to the best split.
* \param tree Current tree, will be split in this function.
* \param best_leaf The index of the leaf that will be split.
* \param left_leaf The index of the left leaf after the split.
* \param right_leaf The index of the right leaf after the split.
*/
virtual void Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf);
/*!
* \brief Get the number of data in a leaf
* \param leaf_idx The index of leaf
* \return The number of data in the leaf_idx leaf
*/
inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;
/*!
* \brief Find the best split for a leaf from leaf_splits
* \param leaf_splits
*/
inline void FindBestSplitForLeaf(LeafSplits* leaf_splits);
/*! \brief Last trained decision tree */
const Tree* last_trained_tree_;
/*! \brief number of data */
data_size_t num_data_;
/*! \brief number of features */
int num_features_;
/*! \brief training data */
const Dataset* train_data_;
/*! \brief gradients of current iteration */
const score_t* gradients_;
/*! \brief hessians of current iteration */
const score_t* hessians_;
/*! \brief number of total leaves */
int num_leaves_;
/*! \brief minimal number of data in one leaf */
data_size_t min_num_data_one_leaf_;
/*! \brief minimal sum of hessians in one leaf */
score_t min_sum_hessian_one_leaf_;
/*! \brief sub-feature fraction rate */
double feature_fraction_;
/*! \brief training data partition on leaves */
DataPartition* data_partition_;
/*! \brief used to sample the used features */
Random random_;
/*! \brief used for sub-feature training; is_feature_used_[i] = false means feature i is not used */
bool* is_feature_used_;
/*! \brief cache of historical histograms to speed up training */
FeatureHistogram** historical_histogram_array_;
/*! \brief pointer to histograms array of smaller leaf */
FeatureHistogram* smaller_leaf_histogram_array_;
/*! \brief pointer to histograms array of larger leaf */
FeatureHistogram* larger_leaf_histogram_array_;
/*! \brief store best split points for all leaves */
std::vector<SplitInfo> best_split_per_leaf_;
/*! \brief stores the best threshold of each feature for the smaller leaf */
LeafSplits* smaller_leaf_splits_;
/*! \brief stores the best threshold of each feature for the larger leaf */
LeafSplits* larger_leaf_splits_;
/*! \brief gradients of current iteration, reordered for cache optimization */
score_t* ordered_gradients_;
/*! \brief hessians of current iteration, reordered for cache optimization */
score_t* ordered_hessians_;
/*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
const score_t* ptr_to_ordered_gradients_;
/*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain */
const score_t* ptr_to_ordered_hessians_;
/*! \brief Store ordered bin */
std::vector<OrderedBin*> ordered_bins_;
/*! \brief True if has ordered bin */
bool has_ordered_bin_ = false;
/*! \brief is_data_in_leaf_[i] != 0 means i-th data is marked */
char* is_data_in_leaf_;
};
inline void SerialTreeLearner::FindBestSplitsForLeaves() {
FindBestSplitForLeaf(smaller_leaf_splits_);
FindBestSplitForLeaf(larger_leaf_splits_);
}
inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) const {
if (leafIdx >= 0) {
return data_partition_->leaf_count(leafIdx);
} else {
return 0;
}
}
inline void SerialTreeLearner::FindBestSplitForLeaf(LeafSplits* leaf_splits) {
if (leaf_splits == nullptr || leaf_splits->LeafIndex() < 0) {
return;
}
std::vector<double> gains;
for (size_t i = 0; i < leaf_splits->BestSplitPerFeature().size(); ++i) {
gains.push_back(leaf_splits->BestSplitPerFeature()[i].gain);
}
int best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
int leaf = leaf_splits->LeafIndex();
best_split_per_leaf_[leaf] = leaf_splits->BestSplitPerFeature()[best_feature];
best_split_per_leaf_[leaf].feature = best_feature;
}
} // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
#ifndef LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_
#define LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_
#include <LightGBM/meta.h>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <functional>
namespace LightGBM {
/*!
* \brief Used to store some information for gain split point
*/
struct SplitInfo {
public:
/*! \brief Feature index */
int feature;
/*! \brief Split threshold */
unsigned int threshold;
/*! \brief Left output after split */
score_t left_output;
/*! \brief Right output after split */
score_t right_output;
/*! \brief Split gain */
score_t gain;
/*! \brief Left number of data after split */
data_size_t left_count;
/*! \brief Right number of data after split */
data_size_t right_count;
/*! \brief Left sum gradient after split */
score_t left_sum_gradient;
/*! \brief Left sum hessian after split */
score_t left_sum_hessian;
/*! \brief Right sum gradient after split */
score_t right_sum_gradient;
/*! \brief Right sum hessian after split */
score_t right_sum_hessian;
SplitInfo() {
// initialize with -1 and -inf gain
feature = -1;
gain = kMinScore;
}
inline void Reset() {
// initialize with -1 and -inf gain
feature = -1;
gain = kMinScore;
}
inline bool operator > (const SplitInfo &si) const;
inline static void MaxReducer(const char* src, char* dst, int len) {
const int type_size = sizeof(SplitInfo);
int used_size = 0;
const SplitInfo* p1;
SplitInfo* p2;
while (used_size < len) {
p1 = reinterpret_cast<const SplitInfo*>(src);
p2 = reinterpret_cast<SplitInfo*>(dst);
if (*p1 > *p2) {
// copy
std::memcpy(dst, src, type_size);
}
src += type_size;
dst += type_size;
used_size += type_size;
}
}
};
inline bool SplitInfo::operator > (const SplitInfo& si) const {
score_t local_gain = this->gain;
score_t other_gain = si.gain;
// replace NaN with -inf ("gain == NAN" is always false for NaN, so use std::isnan)
if (std::isnan(local_gain)) {
local_gain = kMinScore;
}
// replace NaN with -inf
if (std::isnan(other_gain)) {
other_gain = kMinScore;
}
int local_feature = this->feature;
int other_feature = si.feature;
// replace -1 with max int
if (local_feature == -1) {
local_feature = INT32_MAX;
}
// replace -1 with max int
if (other_feature == -1) {
other_feature = INT32_MAX;
}
if (local_gain != other_gain) {
return local_gain > other_gain;
} else {
// if same gain, use smaller feature
return local_feature < other_feature;
}
}
} // namespace LightGBM
#endif  // LIGHTGBM_TREELEARNER_SPLIT_INFO_HPP_
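To see how MaxReducer combines per-machine candidates, here is a minimal sketch, assuming the split_info.hpp above is reachable on the include path; each slot of the destination buffer ends up holding the higher-gain SplitInfo:
#include <vector>
#include "split_info.hpp"  // assumed include path for the header above

int main() {
  using LightGBM::SplitInfo;
  // Two machines propose best splits for the same two slots.
  std::vector<SplitInfo> local(2), remote(2);
  local[0].feature = 0;  local[0].gain = 0.5f;
  local[1].feature = 1;  local[1].gain = 2.0f;
  remote[0].feature = 0; remote[0].gain = 1.5f;
  remote[1].feature = 1; remote[1].gain = 0.1f;
  // Reduce remote into local: slot 0 takes remote's split (1.5 > 0.5),
  // slot 1 keeps the local one (2.0 > 0.1).
  SplitInfo::MaxReducer(reinterpret_cast<const char*>(remote.data()),
                        reinterpret_cast<char*>(local.data()),
                        static_cast<int>(2 * sizeof(SplitInfo)));
  return 0;
}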
#include <LightGBM/tree_learner.h>
#include "serial_tree_learner.h"
#include "parallel_tree_learner.h"
namespace LightGBM {
TreeLearner* TreeLearner::CreateTreeLearner(TreeLearnerType type, const TreeConfig& tree_config) {
if (type == TreeLearnerType::kSerialTreeLearner) {
return new SerialTreeLearner(tree_config);
} else if (type == TreeLearnerType::kFeatureParallelTreelearner) {
return new FeatureParallelTreeLearner(tree_config);
} else if (type == TreeLearnerType::kDataParallelTreeLearner) {
return new DataParallelTreeLearner(tree_config);
}
return nullptr;
}
} // namespace LightGBM
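A hedged sketch of a typical call site for the factory above. The header and Init(const Dataset*) match declarations seen elsewhere in this commit; the default-constructed TreeConfig, the TrainSketch wrapper, and the elided training loop are assumptions for illustration:
#include <LightGBM/tree_learner.h>

void TrainSketch(const LightGBM::Dataset* train_data) {
  LightGBM::TreeConfig tree_config;  // assumed default-constructible
  LightGBM::TreeLearner* learner = LightGBM::TreeLearner::CreateTreeLearner(
      LightGBM::TreeLearnerType::kSerialTreeLearner, tree_config);
  if (learner == nullptr) { return; }  // unrecognized learner type
  learner->Init(train_data);  // interface shown in the learners above
  // ... per-iteration Train/gradient calls would follow in the GBDT loop ...
  delete learner;
}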

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25123.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LightGBM", "LightGBM.vcxproj", "{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug_mpi|x64 = Debug_mpi|x64
Debug|x64 = Debug|x64
Release_mpi|x64 = Release_mpi|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.ActiveCfg = Debug_mpi|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug_mpi|x64.Build.0 = Debug_mpi|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.ActiveCfg = Debug|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Debug|x64.Build.0 = Debug|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.ActiveCfg = Release_mpi|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release_mpi|x64.Build.0 = Release_mpi|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.ActiveCfg = Release|x64
{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal
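The solution above defines four x64 configurations: Debug/Release (socket-based parallel learning, USE_SOCKET) and Debug_mpi/Release_mpi (MS-MPI, USE_MPI), per the preprocessor definitions in the project file below. An illustrative build invocation from a Visual Studio developer command prompt (the command line is a sketch, not part of the commit) might be:

msbuild LightGBM.sln /p:Configuration=Release /p:Platform=x64

The _mpi configurations additionally expect the MS-MPI SDK to be installed, since the project reads $(MSMPI_INC) and $(MSMPI_LIB64).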
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="Projectconfigurations">
<ProjectConfiguration Include="Debug_mpi|x64">
<Configuration>Debug_mpi</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release_mpi|x64">
<Configuration>Release_mpi</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{F31C0B5D-715E-4953-AA1B-8D2AEEE4344C}</ProjectGuid>
<RootNamespace>LightGBM</RootNamespace>
<SccProjectName>SAK</SccProjectName>
<SccAuxPath>SAK</SccAuxPath>
<SccLocalPath>SAK</SccLocalPath>
<SccProvider>SAK</SccProvider>
<ProjectName>LightGBM</ProjectName>
<WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Debug_mpi|x64'">
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Release_mpi|x64'">
<PlatformToolset>v120</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Condition="'$(configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(configuration)|$(Platform)'=='Debug|x64'">
<IncludePath>..\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
<LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(configuration)|$(Platform)'=='Debug_mpi|x64'">
<IncludePath>$(MSMPI_INC);..\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
<LibraryPath>$(MSMPI_LIB64);$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(configuration)|$(Platform)'=='Release|x64'">
<IncludePath>..\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
<LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);</LibraryPath>
</PropertyGroup>
<PropertyGroup Condition="'$(configuration)|$(Platform)'=='Release_mpi|x64'">
<IncludePath>$(MSMPI_INC);..\include;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
<LibraryPath>$(MSMPI_LIB64);$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);</LibraryPath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug_mpi|x64'">
<ClCompile>
<PreprocessorDefinitions>USE_MPI</PreprocessorDefinitions>
<WarningLevel>Level4</WarningLevel>
<OpenMPSupport>true</OpenMPSupport>
<FavorSizeOrSpeed>Neither</FavorSizeOrSpeed>
<InlineFunctionExpansion>Default</InlineFunctionExpansion>
<IntrinsicFunctions>false</IntrinsicFunctions>
<EnableFiberSafeOptimizations>false</EnableFiberSafeOptimizations>
<WholeProgramOptimization>false</WholeProgramOptimization>
<Optimization>Disabled</Optimization>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>
</AdditionalLibraryDirectories>
</Link>
<Link>
<AdditionalDependencies>msmpi.lib</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PreprocessorDefinitions>USE_SOCKET</PreprocessorDefinitions>
<WarningLevel>Level4</WarningLevel>
<OpenMPSupport>true</OpenMPSupport>
<FavorSizeOrSpeed>Neither</FavorSizeOrSpeed>
<InlineFunctionExpansion>Default</InlineFunctionExpansion>
<IntrinsicFunctions>false</IntrinsicFunctions>
<EnableFiberSafeOptimizations>false</EnableFiberSafeOptimizations>
<WholeProgramOptimization>false</WholeProgramOptimization>
<Optimization>Disabled</Optimization>
</ClCompile>
<Link>
<AdditionalDependencies>
</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(configuration)|$(Platform)'=='Release_mpi|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<SDLCheck>true</SDLCheck>
<PreprocessorDefinitions>USE_MPI;_MBCS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<OpenMPSupport>true</OpenMPSupport>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<EnableFiberSafeOptimizations>false</EnableFiberSafeOptimizations>
<WholeProgramOptimization>true</WholeProgramOptimization>
<OmitFramePointers>true</OmitFramePointers>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>
</AdditionalLibraryDirectories>
</Link>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>msmpi.lib</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<PreprocessorDefinitions>USE_SOCKET;_MBCS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<WarningLevel>Level4</WarningLevel>
<OpenMPSupport>true</OpenMPSupport>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
<IntrinsicFunctions>true</IntrinsicFunctions>
<EnableFiberSafeOptimizations>false</EnableFiberSafeOptimizations>
<WholeProgramOptimization>true</WholeProgramOptimization>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<OmitFramePointers>true</OmitFramePointers>
<FunctionLevelLinking>true</FunctionLevelLinking>
</ClCompile>
<Link>
<AdditionalDependencies />
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\include\LightGBM\application.h" />
<ClInclude Include="..\include\LightGBM\bin.h" />
<ClInclude Include="..\include\LightGBM\boosting.h" />
<ClInclude Include="..\include\LightGBM\config.h" />
<ClInclude Include="..\include\LightGBM\dataset.h" />
<ClInclude Include="..\include\LightGBM\feature.h" />
<ClInclude Include="..\include\LightGBM\meta.h" />
<ClInclude Include="..\include\LightGBM\metric.h" />
<ClInclude Include="..\include\LightGBM\network.h" />
<ClInclude Include="..\include\LightGBM\objective_function.h" />
<ClInclude Include="..\include\LightGBM\tree.h" />
<ClInclude Include="..\include\LightGBM\tree_learner.h" />
<ClInclude Include="..\include\LightGBM\utils\array_args.h" />
<ClInclude Include="..\include\LightGBM\utils\common.h" />
<ClInclude Include="..\include\LightGBM\utils\log.h" />
<ClInclude Include="..\include\LightGBM\utils\pipeline_reader.h" />
<ClInclude Include="..\include\LightGBM\utils\random.h" />
<ClInclude Include="..\include\LightGBM\utils\text_reader.h" />
<ClInclude Include="..\include\LightGBM\utils\threading.h" />
<ClInclude Include="..\src\application\predictor.hpp" />
<ClInclude Include="..\src\boosting\gbdt.h" />
<ClInclude Include="..\src\boosting\score_updater.hpp" />
<ClInclude Include="..\src\io\dense_bin.hpp" />
<ClInclude Include="..\src\io\ordered_sparse_bin.hpp" />
<ClInclude Include="..\src\io\parser.hpp" />
<ClInclude Include="..\src\io\sparse_bin.hpp" />
<ClInclude Include="..\src\metric\binary_metric.hpp" />
<ClInclude Include="..\src\metric\rank_metric.hpp" />
<ClInclude Include="..\src\metric\regression_metric.hpp" />
<ClInclude Include="..\src\network\linkers.h" />
<ClInclude Include="..\src\network\socket_wrapper.hpp" />
<ClInclude Include="..\src\objective\binary_objective.hpp" />
<ClInclude Include="..\src\objective\rank_objective.hpp" />
<ClInclude Include="..\src\objective\regression_objective.hpp" />
<ClInclude Include="..\src\treelearner\data_partition.hpp" />
<ClInclude Include="..\src\treelearner\feature_histogram.hpp" />
<ClInclude Include="..\src\treelearner\leaf_splits.hpp" />
<ClInclude Include="..\src\treelearner\parallel_tree_learner.h" />
<ClInclude Include="..\src\treelearner\serial_tree_learner.h" />
<ClInclude Include="..\src\treelearner\split_info.hpp" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp" />
<ClCompile Include="..\src\boosting\boosting.cpp" />
<ClCompile Include="..\src\boosting\gbdt.cpp" />
<ClCompile Include="..\src\io\bin.cpp" />
<ClCompile Include="..\src\io\config.cpp" />
<ClCompile Include="..\src\io\dataset.cpp" />
<ClCompile Include="..\src\io\metadata.cpp" />
<ClCompile Include="..\src\io\parser.cpp" />
<ClCompile Include="..\src\io\tree.cpp" />
<ClCompile Include="..\src\metric\dcg_calculator.cpp" />
<ClCompile Include="..\src\metric\metric.cpp" />
<ClCompile Include="..\src\network\network.cpp" />
<ClCompile Include="..\src\network\linkers_mpi.cpp" />
<ClCompile Include="..\src\network\linkers_socket.cpp" />
<ClCompile Include="..\src\network\linker_topo.cpp" />
<ClCompile Include="..\src\objective\objective_function.cpp" />
<ClCompile Include="..\src\main.cpp" />
<ClCompile Include="..\src\treelearner\data_parallel_tree_learner.cpp" />
<ClCompile Include="..\src\treelearner\feature_parallel_tree_learner.cpp" />
<ClCompile Include="..\src\treelearner\serial_tree_learner.cpp" />
<ClCompile Include="..\src\treelearner\tree_learner.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="src">
<UniqueIdentifier>{6e213f6b-b843-4469-bc8c-56c1ffe7f195}</UniqueIdentifier>
</Filter>
<Filter Include="include">
<UniqueIdentifier>{29082261-e6cd-40b2-b30c-c4cb70f23339}</UniqueIdentifier>
</Filter>
<Filter Include="src\application">
<UniqueIdentifier>{3a703e42-6f06-4ab1-8e46-0dfb07407d9e}</UniqueIdentifier>
</Filter>
<Filter Include="src\boosting">
<UniqueIdentifier>{43be32f9-227b-4a15-9c0e-38dbf9747aeb}</UniqueIdentifier>
</Filter>
<Filter Include="src\io">
<UniqueIdentifier>{6fcdaf19-880a-45b0-80db-344be9498017}</UniqueIdentifier>
</Filter>
<Filter Include="src\metric">
<UniqueIdentifier>{8bacb16c-7f31-494f-94df-8ccc6c3e3894}</UniqueIdentifier>
</Filter>
<Filter Include="src\network">
<UniqueIdentifier>{93db474b-4ab8-406b-99ec-eb8e40f97593}</UniqueIdentifier>
</Filter>
<Filter Include="src\objective">
<UniqueIdentifier>{34d576af-dec6-4cad-90bd-f8d0e95ec614}</UniqueIdentifier>
</Filter>
<Filter Include="src\treelearner">
<UniqueIdentifier>{16638c37-41bd-4124-8b80-befbca2f969f}</UniqueIdentifier>
</Filter>
<Filter Include="include\LightGBM">
<UniqueIdentifier>{37b41659-26e2-4b2f-ac0c-7b52d8bd53da}</UniqueIdentifier>
</Filter>
<Filter Include="include\LightGBM\utils">
<UniqueIdentifier>{bf66b9f7-015e-404d-8098-4353abc46956}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\boosting\gbdt.h">
<Filter>src\boosting</Filter>
</ClInclude>
<ClInclude Include="..\src\network\linkers.h">
<Filter>src\network</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\parallel_tree_learner.h">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\serial_tree_learner.h">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\application\predictor.hpp">
<Filter>src\application</Filter>
</ClInclude>
<ClInclude Include="..\src\boosting\score_updater.hpp">
<Filter>src\boosting</Filter>
</ClInclude>
<ClInclude Include="..\src\io\dense_bin.hpp">
<Filter>src\io</Filter>
</ClInclude>
<ClInclude Include="..\src\io\ordered_sparse_bin.hpp">
<Filter>src\io</Filter>
</ClInclude>
<ClInclude Include="..\src\io\parser.hpp">
<Filter>src\io</Filter>
</ClInclude>
<ClInclude Include="..\src\io\sparse_bin.hpp">
<Filter>src\io</Filter>
</ClInclude>
<ClInclude Include="..\src\metric\binary_metric.hpp">
<Filter>src\metric</Filter>
</ClInclude>
<ClInclude Include="..\src\metric\rank_metric.hpp">
<Filter>src\metric</Filter>
</ClInclude>
<ClInclude Include="..\src\metric\regression_metric.hpp">
<Filter>src\metric</Filter>
</ClInclude>
<ClInclude Include="..\src\network\socket_wrapper.hpp">
<Filter>src\network</Filter>
</ClInclude>
<ClInclude Include="..\src\objective\binary_objective.hpp">
<Filter>src\objective</Filter>
</ClInclude>
<ClInclude Include="..\src\objective\rank_objective.hpp">
<Filter>src\objective</Filter>
</ClInclude>
<ClInclude Include="..\src\objective\regression_objective.hpp">
<Filter>src\objective</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\data_partition.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\feature_histogram.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\leaf_splits.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\split_info.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\application.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\bin.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\boosting.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\config.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\dataset.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\feature.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\meta.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\metric.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\network.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\objective_function.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\tree.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\tree_learner.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\array_args.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\common.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\log.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\pipeline_reader.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\random.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\text_reader.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
<ClInclude Include="..\include\LightGBM\utils\threading.h">
<Filter>include\LightGBM\utils</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp">
<Filter>src\application</Filter>
</ClCompile>
<ClCompile Include="..\src\network\linkers_socket.cpp">
<Filter>src\network</Filter>
</ClCompile>
<ClCompile Include="..\src\network\linkers_mpi.cpp">
<Filter>src\network</Filter>
</ClCompile>
<ClCompile Include="..\src\treelearner\serial_tree_learner.cpp">
<Filter>src\treelearner</Filter>
</ClCompile>
<ClCompile Include="..\src\treelearner\tree_learner.cpp">
<Filter>src\treelearner</Filter>
</ClCompile>
<ClCompile Include="..\src\Boosting\gbdt.cpp">
<Filter>src\boosting</Filter>
</ClCompile>
<ClCompile Include="..\src\io\dataset.cpp">
<Filter>src\io</Filter>
</ClCompile>
<ClCompile Include="..\src\io\bin.cpp">
<Filter>src\io</Filter>
</ClCompile>
<ClCompile Include="..\src\io\tree.cpp">
<Filter>src\io</Filter>
</ClCompile>
<ClCompile Include="..\src\objective\objective_function.cpp">
<Filter>src\objective</Filter>
</ClCompile>
<ClCompile Include="..\src\Boosting\boosting.cpp">
<Filter>src\boosting</Filter>
</ClCompile>
<ClCompile Include="..\src\io\parser.cpp">
<Filter>src\io</Filter>
</ClCompile>
<ClCompile Include="..\src\metric\metric.cpp">
<Filter>src\metric</Filter>
</ClCompile>
<ClCompile Include="..\src\treelearner\data_parallel_tree_learner.cpp">
<Filter>src\treelearner</Filter>
</ClCompile>
<ClCompile Include="..\src\treelearner\feature_parallel_tree_learner.cpp">
<Filter>src\treelearner</Filter>
</ClCompile>
<ClCompile Include="..\src\network\linker_topo.cpp">
<Filter>src\network</Filter>
</ClCompile>
<ClCompile Include="..\src\network\network.cpp">
<Filter>src\network</Filter>
</ClCompile>
<ClCompile Include="..\src\io\config.cpp">
<Filter>src\io</Filter>
</ClCompile>
<ClCompile Include="..\src\metric\dcg_calculator.cpp">
<Filter>src\metric</Filter>
</ClCompile>
<ClCompile Include="..\src\io\metadata.cpp">
<Filter>src\io</Filter>
</ClCompile>
<ClCompile Include="..\src\main.cpp">
<Filter>src</Filter>
</ClCompile>
</ItemGroup>
</Project>