Commit eade219e authored by Qiwei Ye

merge conflict

parents f23e6083 060bd316
src/treelearner/serial_tree_learner.h

@@ -7,10 +7,10 @@
 #include <LightGBM/tree_learner.h>
 #include <LightGBM/dataset.h>
 #include <LightGBM/tree.h>
-#include <LightGBM/feature.h>
 #include "feature_histogram.hpp"
-#include "data_partition.hpp"
 #include "split_info.hpp"
+#include "data_partition.hpp"
 #include "leaf_splits.hpp"
 #include <cstdio>
@@ -32,6 +32,8 @@ public:
   void Init(const Dataset* train_data) override;

+  void ResetTrainingData(const Dataset* train_data) override;
+
   void ResetConfig(const TreeConfig* tree_config) override;

   Tree* Train(const score_t* gradients, const score_t *hessians) override;
@@ -41,7 +43,8 @@ public:
   }

   void AddPredictionToScore(double* out_score) const override {
-    #pragma omp parallel for schedule(guided)
+    if (last_trained_tree_->num_leaves() <= 1) { return; }
+    #pragma omp parallel for schedule(static)
     for (int i = 0; i < data_partition_->num_leaves(); ++i) {
       double output = static_cast<double>(last_trained_tree_->LeafOutput(i));
       data_size_t cnt_leaf_data = 0;
@@ -75,7 +78,7 @@ protected:
   * \brief Find best features for leaves from smaller_leaf_splits_ and larger_leaf_splits_.
   * This function will be called after FindBestThresholds.
   */
-  inline virtual void FindBestSplitsForLeaves();
+  virtual void FindBestSplitsForLeaves();
   /*!
   * \brief Partition tree and data according best split.
@@ -93,12 +96,6 @@ protected:
   */
   inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;
-  /*!
-  * \brief Find best features for leaf from leaf_splits
-  * \param leaf_splits
-  */
-  inline void FindBestSplitForLeaf(LeafSplits* leaf_splits);
   /*! \brief Last trained decision tree */
   const Tree* last_trained_tree_;
   /*! \brief number of data */
@@ -116,7 +113,7 @@ protected:
   /*! \brief used for generate used features */
   Random random_;
   /*! \brief used for sub feature training, is_feature_used_[i] = false means don't used feature i */
-  std::vector<bool> is_feature_used_;
+  std::vector<int8_t> is_feature_used_;
   /*! \brief pointer to histograms array of parent of current leaves */
   FeatureHistogram* parent_leaf_histogram_array_;
   /*! \brief pointer to histograms array of smaller leaf */
@@ -137,15 +134,6 @@ protected:
   /*! \brief hessians of current iteration, ordered for cache optimized */
   std::vector<score_t> ordered_hessians_;
-  /*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
-  const score_t* ptr_to_ordered_gradients_smaller_leaf_;
-  /*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain */
-  const score_t* ptr_to_ordered_hessians_smaller_leaf_;
-  /*! \brief Pointer to ordered_gradients_, use this to avoid copy at BeforeTrain */
-  const score_t* ptr_to_ordered_gradients_larger_leaf_;
-  /*! \brief Pointer to ordered_hessians_, use this to avoid copy at BeforeTrain */
-  const score_t* ptr_to_ordered_hessians_larger_leaf_;
   /*! \brief Store ordered bin */
   std::vector<std::unique_ptr<OrderedBin>> ordered_bins_;
   /*! \brief True if has ordered bin */
@@ -156,15 +144,10 @@ protected:
   HistogramPool histogram_pool_;
   /*! \brief config of tree learner */
   const TreeConfig* tree_config_;
+  int num_threads_;
+  std::vector<int> ordered_bin_indices_;
 };

-inline void SerialTreeLearner::FindBestSplitsForLeaves() {
-  FindBestSplitForLeaf(smaller_leaf_splits_.get());
-  FindBestSplitForLeaf(larger_leaf_splits_.get());
-}
-
 inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) const {
   if (leafIdx >= 0) {
     return data_partition_->leaf_count(leafIdx);
@@ -173,19 +156,5 @@ inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leafIdx) const
   }
 }

-inline void SerialTreeLearner::FindBestSplitForLeaf(LeafSplits* leaf_splits) {
-  if (leaf_splits == nullptr || leaf_splits->LeafIndex() < 0) {
-    return;
-  }
-  std::vector<double> gains;
-  for (size_t i = 0; i < leaf_splits->BestSplitPerFeature().size(); ++i) {
-    gains.push_back(leaf_splits->BestSplitPerFeature()[i].gain);
-  }
-  int best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
-  int leaf = leaf_splits->LeafIndex();
-  best_split_per_leaf_[leaf] = leaf_splits->BestSplitPerFeature()[best_feature];
-  best_split_per_leaf_[leaf].feature = best_feature;
-}
-
 }  // namespace LightGBM
 #endif  // LightGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
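
A side note on one change above: is_feature_used_ moves from std::vector<bool> to std::vector<int8_t>. std::vector<bool> is a packed bitset whose operator[] returns proxy objects, so it exposes no contiguous data() pointer for C-style APIs, and two threads writing neighboring flags can race on the same underlying byte. A minimal standalone sketch of the idiom (not LightGBM code; the loop body is illustrative only):

#include <cstdint>
#include <vector>

int main() {
  std::vector<int8_t> used(100, 0);
  // Safe: each iteration writes its own byte, and used.data() yields a
  // contiguous int8_t* that can be handed to lower-level code.
#pragma omp parallel for schedule(static)
  for (int i = 0; i < static_cast<int>(used.size()); ++i) {
    used[i] = (i % 2 == 0) ? 1 : 0;
  }
  // A std::vector<bool> packs ~8 flags per byte: concurrent writes to
  // neighboring elements would touch the same byte, and there is no
  // data() pointer to contiguous bools.
  return 0;
}

With one byte per flag, every iteration of the parallel loops introduced in this commit owns a distinct memory location.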
src/treelearner/split_info.hpp

@@ -53,6 +53,8 @@ public:
   inline bool operator > (const SplitInfo &si) const;

+  inline bool operator == (const SplitInfo &si) const;
+
   inline static void MaxReducer(const char* src, char* dst, int len) {
     const int type_size = sizeof(SplitInfo);
     int used_size = 0;
@@ -103,5 +105,34 @@ inline bool SplitInfo::operator > (const SplitInfo& si) const {
   }
 }

+inline bool SplitInfo::operator == (const SplitInfo& si) const {
+  double local_gain = this->gain;
+  double other_gain = si.gain;
+  // replace nan with -inf (note: a gain == NAN test would always be false,
+  // since NaN compares unequal to everything; std::isnan is required here)
+  if (std::isnan(local_gain)) {
+    local_gain = kMinScore;
+  }
+  // replace nan with -inf
+  if (std::isnan(other_gain)) {
+    other_gain = kMinScore;
+  }
+  int local_feature = this->feature;
+  int other_feature = si.feature;
+  // replace -1 with max int
+  if (local_feature == -1) {
+    local_feature = INT32_MAX;
+  }
+  // replace -1 with max int
+  if (other_feature == -1) {
+    other_feature = INT32_MAX;
+  }
+  if (local_gain != other_gain) {
+    return false;
+  } else {
+    // same gain: equal only if they also split on the same feature
+    return local_feature == other_feature;
+  }
+}
+
 }  // namespace LightGBM
 #endif  // LightGBM_TREELEARNER_SPLIT_INFO_HPP_
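
One detail in operator == above deserves a comment: a test written as gain == NAN can never fire, because NaN compares unequal to everything, itself included; the reconstruction therefore uses std::isnan for the "replace nan with -inf" step. A small self-contained illustration:

#include <cmath>
#include <cstdio>

int main() {
  double g = std::nan("");
  // NaN is unordered: every ordinary comparison with it is false.
  std::printf("g == NAN -> %d\n", (g == NAN) ? 1 : 0);     // prints 0
  std::printf("g == g   -> %d\n", (g == g) ? 1 : 0);       // prints 0
  // std::isnan is the reliable test.
  std::printf("isnan(g) -> %d\n", std::isnan(g) ? 1 : 0);  // prints 1
  return 0;
}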
src/treelearner/voting_parallel_tree_learner.cpp

@@ -26,8 +26,8 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
   // get max bin
   int max_bin = 0;
   for (int i = 0; i < num_features_; ++i) {
-    if (max_bin < train_data_->FeatureAt(i)->num_bin()) {
-      max_bin = train_data_->FeatureAt(i)->num_bin();
+    if (max_bin < train_data_->FeatureNumBin(i)) {
+      max_bin = train_data_->FeatureNumBin(i);
     }
   }
   // calculate buffer size
@@ -46,21 +46,42 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
   larger_buffer_read_start_pos_.resize(num_features_);
   global_data_count_in_leaf_.resize(tree_config_->num_leaves);
-  smaller_leaf_splits_global_.reset(new LeafSplits(train_data_->num_features(), train_data_->num_data()));
-  larger_leaf_splits_global_.reset(new LeafSplits(train_data_->num_features(), train_data_->num_data()));
+  smaller_leaf_splits_global_.reset(new LeafSplits(train_data_->num_data()));
+  larger_leaf_splits_global_.reset(new LeafSplits(train_data_->num_data()));
   local_tree_config_ = *tree_config_;
   local_tree_config_.min_data_in_leaf /= num_machines_;
   local_tree_config_.min_sum_hessian_in_leaf /= num_machines_;
-  histogram_pool_.ResetConfig(&local_tree_config_, train_data_->num_features());
+  histogram_pool_.ResetConfig(&local_tree_config_);
   // initialize histograms for global
   smaller_leaf_histogram_array_global_.reset(new FeatureHistogram[num_features_]);
   larger_leaf_histogram_array_global_.reset(new FeatureHistogram[num_features_]);
-  for (int j = 0; j < num_features_; ++j) {
-    smaller_leaf_histogram_array_global_[j].Init(train_data_->FeatureAt(j), j, tree_config_);
-    larger_leaf_histogram_array_global_[j].Init(train_data_->FeatureAt(j), j, tree_config_);
+  auto num_total_bin = train_data_->NumTotalBin();
+  smaller_leaf_histogram_data_.resize(num_total_bin);
+  larger_leaf_histogram_data_.resize(num_total_bin);
+  feature_metas_.resize(train_data->num_features());
+#pragma omp parallel for schedule(static)
+  for (int i = 0; i < train_data->num_features(); ++i) {
+    feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
+    if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
+      feature_metas_[i].bias = 1;
+    } else {
+      feature_metas_[i].bias = 0;
+    }
+    feature_metas_[i].tree_config = tree_config_;
+  }
+  uint64_t offset = 0;
+  for (int j = 0; j < train_data->num_features(); ++j) {
+    offset += static_cast<uint64_t>(train_data->SubFeatureBinOffset(j));
+    smaller_leaf_histogram_array_global_[j].Init(smaller_leaf_histogram_data_.data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
+    larger_leaf_histogram_array_global_[j].Init(larger_leaf_histogram_data_.data() + offset, &feature_metas_[j], train_data->FeatureBinMapper(j)->bin_type());
+    auto num_bin = train_data->FeatureNumBin(j);
+    if (train_data->FeatureBinMapper(j)->GetDefaultBin() == 0) {
+      num_bin -= 1;
+    }
+    offset += static_cast<uint64_t>(num_bin);
   }
 }
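
A note on the offset arithmetic in Init above: all per-feature histograms live in one shared buffer of NumTotalBin() entries, and when a feature's default bin is bin 0 its slot is elided (the bias flag), on the understanding that FixHistogram can rebuild it from the leaf totals. A hedged sketch of just the slicing, with hypothetical num_bin/default_bin arrays standing in for the Dataset accessors:

#include <cstdint>
#include <vector>

int main() {
  // Hypothetical per-feature metadata (stand-ins for FeatureNumBin() and
  // BinMapper::GetDefaultBin()).
  std::vector<int> num_bin     = {16, 8, 32};
  std::vector<int> default_bin = {0, 3, 0};

  // Each feature gets a contiguous slice of one shared buffer; a feature
  // whose default bin is bin 0 needs one slot fewer.
  std::vector<uint64_t> offsets;
  uint64_t offset = 0;
  for (size_t j = 0; j < num_bin.size(); ++j) {
    offsets.push_back(offset);
    int bins = num_bin[j];
    if (default_bin[j] == 0) {
      bins -= 1;  // elided slot, reconstructed later from leaf totals
    }
    offset += static_cast<uint64_t>(bins);
  }
  // offsets == {0, 15, 23}; the shared buffer needs 54 entries in total.
  return 0;
}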
@@ -71,12 +92,11 @@ void VotingParallelTreeLearner::ResetConfig(const TreeConfig* tree_config) {
   local_tree_config_.min_data_in_leaf /= num_machines_;
   local_tree_config_.min_sum_hessian_in_leaf /= num_machines_;
-  histogram_pool_.ResetConfig(&local_tree_config_, train_data_->num_features());
+  histogram_pool_.ResetConfig(&local_tree_config_);
   global_data_count_in_leaf_.resize(tree_config_->num_leaves);

-  for (int j = 0; j < num_features_; ++j) {
-    smaller_leaf_histogram_array_global_[j].ResetConfig(tree_config_);
-    larger_leaf_histogram_array_global_[j].ResetConfig(tree_config_);
+  for (size_t i = 0; i < feature_metas_.size(); ++i) {
+    feature_metas_[i].tree_config = tree_config_;
   }
 }
@@ -183,17 +203,17 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small
   while (cur_used_features < cur_total_feature) {
     // copy smaller leaf histograms first
     if (smaller_idx < smaller_top_features.size()) {
-      int fid = smaller_top_features[smaller_idx];
+      int inner_feature_index = train_data_->InnerFeatureIndex(smaller_top_features[smaller_idx]);
       ++cur_used_features;
       // mark local aggregated feature
       if (i == rank_) {
-        smaller_is_feature_aggregated_[fid] = true;
-        smaller_buffer_read_start_pos_[fid] = static_cast<int>(cur_size);
+        smaller_is_feature_aggregated_[inner_feature_index] = true;
+        smaller_buffer_read_start_pos_[inner_feature_index] = static_cast<int>(cur_size);
       }
       // copy
-      std::memcpy(input_buffer_.data() + reduce_scatter_size_, smaller_leaf_histogram_array_[fid].HistogramData(), smaller_leaf_histogram_array_[fid].SizeOfHistgram());
-      cur_size += smaller_leaf_histogram_array_[fid].SizeOfHistgram();
-      reduce_scatter_size_ += smaller_leaf_histogram_array_[fid].SizeOfHistgram();
+      std::memcpy(input_buffer_.data() + reduce_scatter_size_, smaller_leaf_histogram_array_[inner_feature_index].RawData(), smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram());
+      cur_size += smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram();
+      reduce_scatter_size_ += smaller_leaf_histogram_array_[inner_feature_index].SizeOfHistgram();
       ++smaller_idx;
     }
     if (cur_used_features >= cur_total_feature) {
@@ -201,17 +221,17 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small
     }
     // then copy larger leaf histograms
     if (larger_idx < larger_top_features.size()) {
-      int fid = larger_top_features[larger_idx];
+      int inner_feature_index = train_data_->InnerFeatureIndex(larger_top_features[larger_idx]);
      ++cur_used_features;
       // mark local aggregated feature
       if (i == rank_) {
-        larger_is_feature_aggregated_[fid] = true;
-        larger_buffer_read_start_pos_[fid] = static_cast<int>(cur_size);
+        larger_is_feature_aggregated_[inner_feature_index] = true;
+        larger_buffer_read_start_pos_[inner_feature_index] = static_cast<int>(cur_size);
       }
       // copy
-      std::memcpy(input_buffer_.data() + reduce_scatter_size_, larger_leaf_histogram_array_[fid].HistogramData(), larger_leaf_histogram_array_[fid].SizeOfHistgram());
-      cur_size += larger_leaf_histogram_array_[fid].SizeOfHistgram();
-      reduce_scatter_size_ += larger_leaf_histogram_array_[fid].SizeOfHistgram();
+      std::memcpy(input_buffer_.data() + reduce_scatter_size_, larger_leaf_histogram_array_[inner_feature_index].RawData(), larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram());
+      cur_size += larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram();
+      reduce_scatter_size_ += larger_leaf_histogram_array_[inner_feature_index].SizeOfHistgram();
       ++larger_idx;
     }
   }
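
CopyLocalHistogram packs the raw bytes of every voted feature's histogram back-to-back into input_buffer_, so that one ReduceScatter call can sum the buffers element-wise across machines and leave each rank holding the globally aggregated histograms for the features it owns. A toy sketch of such a sum-reducer over a packed byte buffer, with a plain Entry struct standing in for HistogramBinEntry:

#include <cstring>
#include <vector>

struct Entry {  // stand-in for HistogramBinEntry
  double sum_gradients;
  double sum_hessians;
  int cnt;
};

// Element-wise reduction over two packed buffers, mirroring the shape of
// the SumReducer callback handed to Network::ReduceScatter.
void SumReducer(const char* src, char* dst, int len) {
  const int n = len / static_cast<int>(sizeof(Entry));
  for (int i = 0; i < n; ++i) {
    Entry s, d;
    std::memcpy(&s, src + i * sizeof(Entry), sizeof(Entry));
    std::memcpy(&d, dst + i * sizeof(Entry), sizeof(Entry));
    d.sum_gradients += s.sum_gradients;
    d.sum_hessians += s.sum_hessians;
    d.cnt += s.cnt;
    std::memcpy(dst + i * sizeof(Entry), &d, sizeof(Entry));
  }
}

int main() {
  std::vector<Entry> a = {{1.0, 2.0, 3}, {0.5, 0.5, 1}};
  std::vector<Entry> b = {{2.0, 1.0, 1}, {0.5, 1.5, 2}};
  SumReducer(reinterpret_cast<const char*>(a.data()),
             reinterpret_cast<char*>(b.data()),
             static_cast<int>(a.size() * sizeof(Entry)));
  // b now holds {3.0, 3.0, 4} and {1.0, 2.0, 3}.
  return 0;
}

Copying entries in and out with memcpy keeps the transport buffer a plain char array, which is what a network layer wants to ship.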
@@ -225,11 +245,83 @@ void VotingParallelTreeLearner::CopyLocalHistogram(const std::vector<int>& small
 void VotingParallelTreeLearner::FindBestThresholds() {
   // use local data to find local best splits
-  SerialTreeLearner::FindBestThresholds();
+  std::vector<int8_t> is_feature_used(num_features_, 0);
+#pragma omp parallel for schedule(static)
+  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
+    if (!is_feature_used_[feature_index]) continue;
+    if (parent_leaf_histogram_array_ != nullptr
+        && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
+      smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
+      continue;
+    }
+    is_feature_used[feature_index] = 1;
+  }
+  bool use_subtract = true;
+  if (parent_leaf_histogram_array_ == nullptr) {
+    use_subtract = false;
+  }
+  // construct smaller leaf
+  HistogramBinEntry* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - 1;
+  train_data_->ConstructHistograms(is_feature_used,
+                                   smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
+                                   smaller_leaf_splits_->LeafIndex(),
+                                   ordered_bins_, gradients_, hessians_,
+                                   ordered_gradients_.data(), ordered_hessians_.data(),
+                                   ptr_smaller_leaf_hist_data);
+  if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
+    // construct larger leaf
+    HistogramBinEntry* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - 1;
+    train_data_->ConstructHistograms(is_feature_used,
+                                     larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
+                                     larger_leaf_splits_->LeafIndex(),
+                                     ordered_bins_, gradients_, hessians_,
+                                     ordered_gradients_.data(), ordered_hessians_.data(),
+                                     ptr_larger_leaf_hist_data);
+  }
+  std::vector<SplitInfo> smaller_bestsplit_per_features(num_features_);
+  std::vector<SplitInfo> larger_bestsplit_per_features(num_features_);
+  // find splits
+#pragma omp parallel for schedule(static)
+  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
+    if (!is_feature_used[feature_index]) { continue; }
+    const int real_feature_index = train_data_->RealFeatureIndex(feature_index);
+    train_data_->FixHistogram(feature_index,
+                              smaller_leaf_splits_->sum_gradients(), smaller_leaf_splits_->sum_hessians(),
+                              smaller_leaf_splits_->num_data_in_leaf(),
+                              smaller_leaf_histogram_array_[feature_index].RawData());
+    smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
+      smaller_leaf_splits_->sum_gradients(),
+      smaller_leaf_splits_->sum_hessians(),
+      smaller_leaf_splits_->num_data_in_leaf(),
+      &smaller_bestsplit_per_features[feature_index]);
+    smaller_bestsplit_per_features[feature_index].feature = real_feature_index;
+    // only has root leaf
+    if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->LeafIndex() < 0) { continue; }
+    if (use_subtract) {
+      larger_leaf_histogram_array_[feature_index].Subtract(smaller_leaf_histogram_array_[feature_index]);
+    } else {
+      train_data_->FixHistogram(feature_index, larger_leaf_splits_->sum_gradients(), larger_leaf_splits_->sum_hessians(),
+                                larger_leaf_splits_->num_data_in_leaf(),
+                                larger_leaf_histogram_array_[feature_index].RawData());
+    }
+    // find best threshold for larger child
+    larger_leaf_histogram_array_[feature_index].FindBestThreshold(
+      larger_leaf_splits_->sum_gradients(),
+      larger_leaf_splits_->sum_hessians(),
+      larger_leaf_splits_->num_data_in_leaf(),
+      &larger_bestsplit_per_features[feature_index]);
+    larger_bestsplit_per_features[feature_index].feature = real_feature_index;
+  }
   std::vector<SplitInfo> smaller_top_k_splits, larger_top_k_splits;
   // local voting
-  ArrayArgs<SplitInfo>::MaxK(smaller_leaf_splits_->BestSplitPerFeature(), top_k_, &smaller_top_k_splits);
-  ArrayArgs<SplitInfo>::MaxK(larger_leaf_splits_->BestSplitPerFeature(), top_k_, &larger_top_k_splits);
+  ArrayArgs<SplitInfo>::MaxK(smaller_bestsplit_per_features, top_k_, &smaller_top_k_splits);
+  ArrayArgs<SplitInfo>::MaxK(larger_bestsplit_per_features, top_k_, &larger_top_k_splits);
   // gather
   int offset = 0;
   for (int i = 0; i < top_k_; ++i) {
@@ -262,54 +354,78 @@ void VotingParallelTreeLearner::FindBestThresholds() {
   // Reduce scatter for histogram
   Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, block_start_.data(), block_len_.data(),
                          output_buffer_.data(), &HistogramBinEntry::SumReducer);
+  std::vector<SplitInfo> smaller_best(num_threads_);
+  std::vector<SplitInfo> larger_best(num_threads_);
   // find best split from local aggregated histograms
-#pragma omp parallel for schedule(guided)
+#pragma omp parallel for schedule(static)
   for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
+    const int tid = omp_get_thread_num();
     if (smaller_is_feature_aggregated_[feature_index]) {
-      smaller_leaf_histogram_array_global_[feature_index].SetSumup(
-        GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()),
-        smaller_leaf_splits_global_->sum_gradients(),
-        smaller_leaf_splits_global_->sum_hessians());
+      SplitInfo smaller_split;
       // restore from buffer
       smaller_leaf_histogram_array_global_[feature_index].FromMemory(
        output_buffer_.data() + smaller_buffer_read_start_pos_[feature_index]);
+      train_data_->FixHistogram(feature_index,
+                                smaller_leaf_splits_global_->sum_gradients(), smaller_leaf_splits_global_->sum_hessians(),
+                                GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()),
+                                smaller_leaf_histogram_array_global_[feature_index].RawData());
       // find best threshold
       smaller_leaf_histogram_array_global_[feature_index].FindBestThreshold(
-        &smaller_leaf_splits_global_->BestSplitPerFeature()[feature_index]);
+        smaller_leaf_splits_global_->sum_gradients(),
+        smaller_leaf_splits_global_->sum_hessians(),
+        GetGlobalDataCountInLeaf(smaller_leaf_splits_global_->LeafIndex()),
+        &smaller_split);
+      if (smaller_split.gain > smaller_best[tid].gain) {
+        smaller_best[tid] = smaller_split;
+        smaller_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
+      }
     }
     if (larger_is_feature_aggregated_[feature_index]) {
-      larger_leaf_histogram_array_global_[feature_index].SetSumup(GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()),
-                                                                  larger_leaf_splits_global_->sum_gradients(), larger_leaf_splits_global_->sum_hessians());
+      SplitInfo larger_split;
       // restore from buffer
       larger_leaf_histogram_array_global_[feature_index].FromMemory(output_buffer_.data() + larger_buffer_read_start_pos_[feature_index]);
+      train_data_->FixHistogram(feature_index,
+                                larger_leaf_splits_global_->sum_gradients(), larger_leaf_splits_global_->sum_hessians(),
+                                GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()),
+                                larger_leaf_histogram_array_global_[feature_index].RawData());
       // find best threshold
-      larger_leaf_histogram_array_global_[feature_index].FindBestThreshold(&larger_leaf_splits_global_->BestSplitPerFeature()[feature_index]);
+      larger_leaf_histogram_array_global_[feature_index].FindBestThreshold(
+        larger_leaf_splits_global_->sum_gradients(),
+        larger_leaf_splits_global_->sum_hessians(),
+        GetGlobalDataCountInLeaf(larger_leaf_splits_global_->LeafIndex()),
+        &larger_split);
+      if (larger_split.gain > larger_best[tid].gain) {
+        larger_best[tid] = larger_split;
+        larger_best[tid].feature = train_data_->RealFeatureIndex(feature_index);
+      }
     }
   }
+  auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
+  int leaf = smaller_leaf_splits_->LeafIndex();
+  best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];
+  if (larger_leaf_splits_ != nullptr && larger_leaf_splits_->LeafIndex() >= 0) {
+    leaf = larger_leaf_splits_->LeafIndex();
+    auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
+    best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
+  }
 }

 void VotingParallelTreeLearner::FindBestSplitsForLeaves() {
-  int smaller_best_feature = -1, larger_best_feature = -1;
   // find local best
   SplitInfo smaller_best, larger_best;
-  std::vector<double> gains;
-  for (size_t i = 0; i < smaller_leaf_splits_global_->BestSplitPerFeature().size(); ++i) {
-    gains.push_back(smaller_leaf_splits_global_->BestSplitPerFeature()[i].gain);
-  }
-  smaller_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
-  smaller_best = smaller_leaf_splits_global_->BestSplitPerFeature()[smaller_best_feature];
-  if (larger_leaf_splits_global_->LeafIndex() >= 0) {
-    gains.clear();
-    for (size_t i = 0; i < larger_leaf_splits_global_->BestSplitPerFeature().size(); ++i) {
-      gains.push_back(larger_leaf_splits_global_->BestSplitPerFeature()[i].gain);
-    }
-    larger_best_feature = static_cast<int>(ArrayArgs<double>::ArgMax(gains));
-    larger_best = larger_leaf_splits_global_->BestSplitPerFeature()[larger_best_feature];
+  smaller_best = best_split_per_leaf_[smaller_leaf_splits_->LeafIndex()];
+  // find local best split for larger leaf
+  if (larger_leaf_splits_->LeafIndex() >= 0) {
+    larger_best = best_split_per_leaf_[larger_leaf_splits_->LeafIndex()];
   }
   // sync global best info
   std::memcpy(input_buffer_.data(), &smaller_best, sizeof(SplitInfo));
@@ -336,18 +452,18 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
   // init the global sumup info
   if (best_split_info.left_count < best_split_info.right_count) {
     smaller_leaf_splits_global_->Init(*left_leaf, data_partition_.get(),
                                       best_split_info.left_sum_gradient,
                                       best_split_info.left_sum_hessian);
     larger_leaf_splits_global_->Init(*right_leaf, data_partition_.get(),
                                      best_split_info.right_sum_gradient,
                                      best_split_info.right_sum_hessian);
   } else {
     smaller_leaf_splits_global_->Init(*right_leaf, data_partition_.get(),
                                       best_split_info.right_sum_gradient,
                                       best_split_info.right_sum_hessian);
     larger_leaf_splits_global_->Init(*left_leaf, data_partition_.get(),
                                      best_split_info.left_sum_gradient,
                                      best_split_info.left_sum_hessian);
   }
 }
...
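
The rewritten FindBestThresholds also changes how the best split per leaf is reduced: instead of filling a per-leaf BestSplitPerFeature vector and arg-maxing over gains afterwards, each OpenMP thread keeps its own running best in smaller_best[tid]/larger_best[tid], and a final ArgMax over the few thread-local winners picks the leaf's split. A reduced sketch of that pattern, with plain double gains instead of SplitInfo:

#include <omp.h>
#include <algorithm>
#include <vector>

int main() {
  std::vector<double> gain_per_feature(1000);
  for (size_t i = 0; i < gain_per_feature.size(); ++i) {
    gain_per_feature[i] = static_cast<double>((i * 37) % 101);  // dummy gains
  }
  const int num_threads = omp_get_max_threads();
  // One running best per thread: each slot is written by exactly one
  // thread, so no locks or atomics are needed inside the hot loop.
  std::vector<double> best(num_threads, -1.0);
#pragma omp parallel for schedule(static)
  for (int f = 0; f < static_cast<int>(gain_per_feature.size()); ++f) {
    const int tid = omp_get_thread_num();
    if (gain_per_feature[f] > best[tid]) {
      best[tid] = gain_per_feature[f];
    }
  }
  // Cheap serial reduction over the handful of thread-local winners.
  double global_best = *std::max_element(best.begin(), best.end());
  (void)global_best;  // 100.0 for these dummy gains
  return 0;
}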
tests/python_package_test/test_basic.py

@@ -6,21 +6,21 @@ import unittest
 import lightgbm as lgb
 import numpy as np
-from sklearn.datasets import load_breast_cancer
+from sklearn.datasets import load_breast_cancer, dump_svmlight_file
 from sklearn.model_selection import train_test_split

 class TestBasic(unittest.TestCase):

     def test(self):
-        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=1)
+        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=2)
         train_data = lgb.Dataset(X_train, max_bin=255, label=y_train)
         valid_data = train_data.create_valid(X_test, label=y_test)

         params = {
             "objective": "binary",
             "metric": "auc",
-            "min_data": 1,
+            "min_data": 10,
             "num_leaves": 15,
             "verbose": -1
         }
@@ -36,7 +36,7 @@ class TestBasic(unittest.TestCase):
         with tempfile.NamedTemporaryFile() as f:
             tname = f.name
         with open(tname, "w+b") as f:
-            np.savetxt(f, X_test, delimiter=',')
+            dump_svmlight_file(X_test, y_test, f)
         pred_from_file = bst.predict(tname)
         os.remove(tname)
         self.assertEqual(len(pred_from_matr), len(pred_from_file))
...
tests/python_package_test/test_engine.py

@@ -10,11 +10,17 @@ import numpy as np
 from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
                               load_iris)
 from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import train_test_split, TimeSeriesSplit
+
+try:
+    import pandas as pd
+    IS_PANDAS_INSTALLED = True
+except ImportError:
+    IS_PANDAS_INSTALLED = False

 try:
     import cPickle as pickle
-except:
+except ImportError:
     import pickle
@@ -22,31 +28,33 @@ def multi_logloss(y_true, y_pred):
     return np.mean([-math.log(y_pred[i][y]) for i, y in enumerate(y_true)])

-def test_template(params={'objective': 'regression', 'metric': 'l2'},
-                  X_y=load_boston(True), feval=mean_squared_error,
-                  num_round=100, init_model=None, custom_eval=None,
-                  early_stopping_rounds=10,
-                  return_data=False, return_model=False):
-    params['verbose'], params['seed'] = -1, 42
-    X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
-    lgb_train = lgb.Dataset(X_train, y_train, params=params)
-    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
-    if return_data:
-        return lgb_train, lgb_eval
-    evals_result = {}
-    gbm = lgb.train(params, lgb_train,
-                    num_boost_round=num_round,
-                    valid_sets=lgb_eval,
-                    valid_names='eval',
-                    verbose_eval=False,
-                    feval=custom_eval,
-                    evals_result=evals_result,
-                    early_stopping_rounds=early_stopping_rounds,
-                    init_model=init_model)
-    if return_model:
-        return gbm
-    else:
-        return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))
+class template(object):
+    @staticmethod
+    def test_template(params={'objective': 'regression', 'metric': 'l2'},
+                      X_y=load_boston(True), feval=mean_squared_error,
+                      num_round=150, init_model=None, custom_eval=None,
+                      early_stopping_rounds=10,
+                      return_data=False, return_model=False):
+        params['verbose'], params['seed'] = -1, 42
+        X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
+        lgb_train = lgb.Dataset(X_train, y_train, params=params)
+        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
+        if return_data:
+            return lgb_train, lgb_eval
+        evals_result = {}
+        gbm = lgb.train(params, lgb_train,
+                        num_boost_round=num_round,
+                        valid_sets=lgb_eval,
+                        valid_names='eval',
+                        verbose_eval=False,
+                        feval=custom_eval,
+                        evals_result=evals_result,
+                        early_stopping_rounds=early_stopping_rounds,
+                        init_model=init_model)
+        if return_model:
+            return gbm
+        else:
+            return evals_result, feval(y_test, gbm.predict(X_test, gbm.best_iteration))

 class TestEngine(unittest.TestCase):
@@ -57,12 +65,12 @@ class TestEngine(unittest.TestCase):
             'objective': 'binary',
             'metric': 'binary_logloss'
         }
-        evals_result, ret = test_template(params, X_y, log_loss)
+        evals_result, ret = template.test_template(params, X_y, log_loss)
         self.assertLess(ret, 0.15)
         self.assertAlmostEqual(min(evals_result['eval']['binary_logloss']), ret, places=5)

     def test_regreesion(self):
-        evals_result, ret = test_template()
+        evals_result, ret = template.test_template()
         ret **= 0.5
         self.assertLess(ret, 4)
         self.assertAlmostEqual(min(evals_result['eval']['l2']), ret, places=5)
@@ -74,7 +82,7 @@ class TestEngine(unittest.TestCase):
             'metric': 'multi_logloss',
             'num_class': 10
         }
-        evals_result, ret = test_template(params, X_y, multi_logloss)
+        evals_result, ret = template.test_template(params, X_y, multi_logloss)
         self.assertLess(ret, 0.2)
         self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)
@@ -84,11 +92,11 @@ class TestEngine(unittest.TestCase):
             'metric': 'l1'
         }
         model_name = 'model.txt'
-        gbm = test_template(params, num_round=20, return_model=True, early_stopping_rounds=-1)
+        gbm = template.test_template(params, num_round=20, return_model=True, early_stopping_rounds=-1)
         gbm.save_model(model_name)
-        evals_result, ret = test_template(params, feval=mean_absolute_error,
-                                          num_round=80, init_model=model_name,
-                                          custom_eval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)))
+        evals_result, ret = template.test_template(params, feval=mean_absolute_error,
+                                                   num_round=80, init_model=model_name,
+                                                   custom_eval=(lambda p, d: ('mae', mean_absolute_error(p, d.get_label()), False)))
         self.assertLess(ret, 3)
         self.assertAlmostEqual(min(evals_result['eval']['l1']), ret, places=5)
         for l1, mae in zip(evals_result['eval']['l1'], evals_result['eval']['mae']):
@@ -104,38 +112,90 @@ class TestEngine(unittest.TestCase):
             'metric': 'multi_logloss',
             'num_class': 3
         }
-        gbm = test_template(params, X_y, num_round=20, return_model=True, early_stopping_rounds=-1)
-        evals_result, ret = test_template(params, X_y, feval=multi_logloss,
-                                          num_round=80, init_model=gbm)
+        gbm = template.test_template(params, X_y, num_round=20, return_model=True, early_stopping_rounds=-1)
+        evals_result, ret = template.test_template(params, X_y, feval=multi_logloss,
+                                                   num_round=80, init_model=gbm)
         self.assertLess(ret, 1.5)
         self.assertAlmostEqual(min(evals_result['eval']['multi_logloss']), ret, places=5)

     def test_cv(self):
-        lgb_train, _ = test_template(return_data=True)
-        lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5,
+        lgb_train, _ = template.test_template(return_data=True)
+        lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, nfold=5, shuffle=False,
                metrics='l1', verbose_eval=False,
                callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
+        tss = TimeSeriesSplit(3)
+        # test if wrong nfold is ignored
+        lgb.cv({'verbose': -1}, lgb_train, num_boost_round=20, data_splitter=tss, nfold=5,
+               metrics='l2', verbose_eval=False)
+
+    def test_feature_name(self):
+        lgb_train, _ = template.test_template(return_data=True)
+        feature_names = ['f' + str(i) for i in range(13)]
+        gbm = lgb.train({'verbose': -1}, lgb_train, num_boost_round=10, feature_name=feature_names)
+        self.assertListEqual(feature_names, gbm.feature_name())

     def test_save_load_copy_pickle(self):
-        gbm = test_template(num_round=20, return_model=True)
-        _, ret_origin = test_template(init_model=gbm)
+        gbm = template.test_template(num_round=20, return_model=True)
+        _, ret_origin = template.test_template(init_model=gbm)
         other_ret = []
         gbm.save_model('lgb.model')
-        other_ret.append(test_template(init_model='lgb.model')[1])
+        other_ret.append(template.test_template(init_model='lgb.model')[1])
         gbm_load = lgb.Booster(model_file='lgb.model')
-        other_ret.append(test_template(init_model=gbm_load)[1])
-        other_ret.append(test_template(init_model=copy.copy(gbm))[1])
-        other_ret.append(test_template(init_model=copy.deepcopy(gbm))[1])
+        other_ret.append(template.test_template(init_model=gbm_load)[1])
+        other_ret.append(template.test_template(init_model=copy.copy(gbm))[1])
+        other_ret.append(template.test_template(init_model=copy.deepcopy(gbm))[1])
         with open('lgb.pkl', 'wb') as f:
             pickle.dump(gbm, f)
         with open('lgb.pkl', 'rb') as f:
             gbm_pickle = pickle.load(f)
-        other_ret.append(test_template(init_model=gbm_pickle)[1])
+        other_ret.append(template.test_template(init_model=gbm_pickle)[1])
         gbm_pickles = pickle.loads(pickle.dumps(gbm))
-        other_ret.append(test_template(init_model=gbm_pickles)[1])
+        other_ret.append(template.test_template(init_model=gbm_pickles)[1])
         for ret in other_ret:
             self.assertAlmostEqual(ret_origin, ret, places=5)

+    @unittest.skipIf(not IS_PANDAS_INSTALLED, 'pandas not installed')
+    def test_pandas_categorical(self):
+        X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75),  # str
+                          "B": np.random.permutation([1, 2, 3] * 100),  # int
+                          "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60),  # float
+                          "D": np.random.permutation([True, False] * 150)})  # bool
+        y = np.random.permutation([0, 1] * 150)
+        X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
+                               "B": np.random.permutation([1, 3] * 30),
+                               "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
+                               "D": np.random.permutation([True, False] * 30)})
+        for col in ["A", "B", "C", "D"]:
+            X[col] = X[col].astype('category')
+            X_test[col] = X_test[col].astype('category')
+        params = {
+            'objective': 'binary',
+            'metric': 'binary_logloss',
+            'verbose': -1
+        }
+        lgb_train = lgb.Dataset(X, y)
+        gbm0 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False)
+        pred0 = list(gbm0.predict(X_test))
+        lgb_train = lgb.Dataset(X, y)
+        gbm1 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
+                         categorical_feature=[0])
+        pred1 = list(gbm1.predict(X_test))
+        lgb_train = lgb.Dataset(X, y)
+        gbm2 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
+                         categorical_feature=['A'])
+        pred2 = list(gbm2.predict(X_test))
+        lgb_train = lgb.Dataset(X, y)
+        gbm3 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
+                         categorical_feature=['A', 'B', 'C', 'D'])
+        pred3 = list(gbm3.predict(X_test))
+        lgb_train = lgb.Dataset(X, y)
+        gbm3.save_model('categorical.model')
+        gbm4 = lgb.Booster(model_file='categorical.model')
+        pred4 = list(gbm4.predict(X_test))
+        self.assertListEqual(pred0, pred1)
+        self.assertListEqual(pred0, pred2)
+        self.assertListEqual(pred0, pred3)
+        self.assertListEqual(pred0, pred4)

 print("----------------------------------------------------------------------")
 print("running test_engine.py")
...
tests/python_package_test/test_plotting.py (new file)

+# coding: utf-8
+# pylint: skip-file
+import unittest
+
+import lightgbm as lgb
+from sklearn.datasets import load_breast_cancer
+from sklearn.model_selection import train_test_split
+
+try:
+    import matplotlib
+    matplotlib.use('Agg')
+    matplotlib_installed = True
+except ImportError:
+    matplotlib_installed = False
+
+
+class TestBasic(unittest.TestCase):
+
+    @unittest.skipIf(not matplotlib_installed, 'matplotlib not installed')
+    def test_plot_importance(self):
+        X_train, _, y_train, _ = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=1)
+        train_data = lgb.Dataset(X_train, y_train)
+
+        params = {
+            "objective": "binary",
+            "verbose": -1,
+            "num_leaves": 3
+        }
+        gbm0 = lgb.train(params, train_data, num_boost_round=10)
+        ax0 = lgb.plot_importance(gbm0)
+        self.assertIsInstance(ax0, matplotlib.axes.Axes)
+        self.assertEqual(ax0.get_title(), 'Feature importance')
+        self.assertEqual(ax0.get_xlabel(), 'Feature importance')
+        self.assertEqual(ax0.get_ylabel(), 'Features')
+        self.assertLessEqual(len(ax0.patches), 30)
+
+        gbm1 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
+        gbm1.fit(X_train, y_train)
+
+        ax1 = lgb.plot_importance(gbm1, color='r', title='t', xlabel='x', ylabel='y')
+        self.assertIsInstance(ax1, matplotlib.axes.Axes)
+        self.assertEqual(ax1.get_title(), 't')
+        self.assertEqual(ax1.get_xlabel(), 'x')
+        self.assertEqual(ax1.get_ylabel(), 'y')
+        self.assertLessEqual(len(ax1.patches), 30)
+        for patch in ax1.patches:
+            self.assertTupleEqual(patch.get_facecolor(), (1., 0, 0, 1.))  # red
+
+        ax2 = lgb.plot_importance(gbm0, color=['r', 'y', 'g', 'b'],
+                                  title=None, xlabel=None, ylabel=None)
+        self.assertIsInstance(ax2, matplotlib.axes.Axes)
+        self.assertEqual(ax2.get_title(), '')
+        self.assertEqual(ax2.get_xlabel(), '')
+        self.assertEqual(ax2.get_ylabel(), '')
+        self.assertLessEqual(len(ax2.patches), 30)
+        self.assertTupleEqual(ax2.patches[0].get_facecolor(), (1., 0, 0, 1.))  # r
+        self.assertTupleEqual(ax2.patches[1].get_facecolor(), (.75, .75, 0, 1.))  # y
+        self.assertTupleEqual(ax2.patches[2].get_facecolor(), (0, .5, 0, 1.))  # g
+        self.assertTupleEqual(ax2.patches[3].get_facecolor(), (0, 0, 1., 1.))  # b
+
+    @unittest.skip('Graphviz are not executables on Travis')
+    def test_plot_tree(self):
+        pass
+
+    @unittest.skipIf(not matplotlib_installed, 'matplotlib not installed')
+    def test_plot_metrics(self):
+        X_train, X_test, y_train, y_test = train_test_split(*load_breast_cancer(True), test_size=0.1, random_state=1)
+        train_data = lgb.Dataset(X_train, y_train)
+        test_data = lgb.Dataset(X_test, y_test, reference=train_data)
+
+        params = {
+            "objective": "binary",
+            "metric": {"binary_logloss", "binary_error"},
+            "verbose": -1,
+            "num_leaves": 3
+        }
+
+        evals_result0 = {}
+        gbm0 = lgb.train(params, train_data,
+                         valid_sets=[train_data, test_data],
+                         valid_names=['v1', 'v2'],
+                         num_boost_round=10,
+                         evals_result=evals_result0,
+                         verbose_eval=False)
+        ax0 = lgb.plot_metric(evals_result0)
+        self.assertIsInstance(ax0, matplotlib.axes.Axes)
+        self.assertEqual(ax0.get_title(), 'Metric during training')
+        self.assertEqual(ax0.get_xlabel(), 'Iterations')
+        self.assertIn(ax0.get_ylabel(), {'binary_logloss', 'binary_error'})
+        ax0 = lgb.plot_metric(evals_result0, metric='binary_error')
+        ax0 = lgb.plot_metric(evals_result0, metric='binary_logloss', dataset_names=['v2'])
+
+        evals_result1 = {}
+        gbm1 = lgb.train(params, train_data,
+                         num_boost_round=10,
+                         evals_result=evals_result1,
+                         verbose_eval=False)
+        self.assertRaises(ValueError, lgb.plot_metric, evals_result1)
+
+        gbm2 = lgb.LGBMClassifier(n_estimators=10, num_leaves=3, silent=True)
+        gbm2.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
+        ax2 = lgb.plot_metric(gbm2, title=None, xlabel=None, ylabel=None)
+        self.assertIsInstance(ax2, matplotlib.axes.Axes)
+        self.assertEqual(ax2.get_title(), '')
+        self.assertEqual(ax2.get_xlabel(), '')
+        self.assertEqual(ax2.get_ylabel(), '')
+
+print("----------------------------------------------------------------------")
+print("running test_plotting.py")
+unittest.main()
tests/python_package_test/test_sklearn.py

@@ -12,42 +12,44 @@ from sklearn.metrics import log_loss, mean_squared_error
 from sklearn.model_selection import GridSearchCV, train_test_split

-def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
-                  feval=mean_squared_error, num_round=100,
-                  custom_obj=None, predict_proba=False,
-                  return_data=False, return_model=False):
-    X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
-    if return_data:
-        return X_train, X_test, y_train, y_test
-    arguments = {'n_estimators': num_round, 'silent': True}
-    if custom_obj:
-        arguments['objective'] = custom_obj
-    gbm = model(**arguments)
-    gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
-    if return_model:
-        return gbm
-    elif predict_proba:
-        return feval(y_test, gbm.predict_proba(X_test))
-    else:
-        return feval(y_test, gbm.predict(X_test))
+class template(object):
+    @staticmethod
+    def test_template(X_y=load_boston(True), model=lgb.LGBMRegressor,
+                      feval=mean_squared_error, num_round=100,
+                      custom_obj=None, predict_proba=False,
+                      return_data=False, return_model=False):
+        X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
+        if return_data:
+            return X_train, X_test, y_train, y_test
+        arguments = {'n_estimators': num_round, 'silent': True}
+        if custom_obj:
+            arguments['objective'] = custom_obj
+        gbm = model(**arguments)
+        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10, verbose=False)
+        if return_model:
+            return gbm
+        elif predict_proba:
+            return feval(y_test, gbm.predict_proba(X_test))
+        else:
+            return feval(y_test, gbm.predict(X_test))

 class TestSklearn(unittest.TestCase):

     def test_binary(self):
         X_y = load_breast_cancer(True)
-        ret = test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True)
+        ret = template.test_template(X_y, lgb.LGBMClassifier, log_loss, predict_proba=True)
         self.assertLess(ret, 0.15)

     def test_regreesion(self):
-        self.assertLess(test_template() ** 0.5, 4)
+        self.assertLess(template.test_template() ** 0.5, 4)

     def test_multiclass(self):
         X_y = load_digits(10, True)

         def multi_error(y_true, y_pred):
             return np.mean(y_true != y_pred)
-        ret = test_template(X_y, lgb.LGBMClassifier, multi_error)
+        ret = template.test_template(X_y, lgb.LGBMClassifier, multi_error)
         self.assertLess(ret, 0.2)

     def test_lambdarank(self):
@@ -68,7 +70,7 @@ class TestSklearn(unittest.TestCase):
             grad = (y_pred - y_true)
             hess = np.ones(len(y_true))
             return grad, hess
-        ret = test_template(custom_obj=objective_ls)
+        ret = template.test_template(custom_obj=objective_ls)
         self.assertLess(ret, 100)

     def test_binary_classification_with_custom_objective(self):
@@ -81,17 +83,17 @@ class TestSklearn(unittest.TestCase):
         def binary_error(y_test, y_pred):
             return np.mean([int(p > 0.5) != y for y, p in zip(y_test, y_pred)])
-        ret = test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj)
+        ret = template.test_template(X_y, lgb.LGBMClassifier, feval=binary_error, custom_obj=logregobj)
         self.assertLess(ret, 0.1)

     def test_dart(self):
-        X_train, X_test, y_train, y_test = test_template(return_data=True)
+        X_train, X_test, y_train, y_test = template.test_template(return_data=True)
         gbm = lgb.LGBMRegressor(boosting_type='dart')
         gbm.fit(X_train, y_train)
         self.assertLessEqual(gbm.score(X_train, y_train), 1.)

     def test_grid_search(self):
-        X_train, X_test, y_train, y_test = test_template(return_data=True)
+        X_train, X_test, y_train, y_test = template.test_template(return_data=True)
         params = {'boosting_type': ['dart', 'gbdt'],
                   'n_estimators': [15, 20],
                   'drop_rate': [0.1, 0.2]}
@@ -100,27 +102,29 @@ class TestSklearn(unittest.TestCase):
         self.assertIn(gbm.best_params_['n_estimators'], [15, 20])

     def test_clone_and_property(self):
-        gbm = test_template(return_model=True)
+        gbm = template.test_template(return_model=True)
         gbm_clone = clone(gbm)
         self.assertIsInstance(gbm.booster_, lgb.Booster)
-        self.assertIsInstance(gbm.feature_importance_, np.ndarray)
+        self.assertIsInstance(gbm.feature_importances_, np.ndarray)

-        clf = test_template(load_digits(2, True), model=lgb.LGBMClassifier, return_model=True)
+        clf = template.test_template(load_digits(2, True), model=lgb.LGBMClassifier, return_model=True)
         self.assertListEqual(sorted(clf.classes_), [0, 1])
         self.assertEqual(clf.n_classes_, 2)
         self.assertIsInstance(clf.booster_, lgb.Booster)
-        self.assertIsInstance(clf.feature_importance_, np.ndarray)
+        self.assertIsInstance(clf.feature_importances_, np.ndarray)

     def test_joblib(self):
-        gbm = test_template(num_round=10, return_model=True)
+        gbm = template.test_template(num_round=10, return_model=True)
         joblib.dump(gbm, 'lgb.pkl')
         gbm_pickle = joblib.load('lgb.pkl')
         self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
         self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
-        self.assertListEqual(list(gbm.feature_importance_), list(gbm_pickle.feature_importance_))
+        self.assertListEqual(list(gbm.feature_importances_), list(gbm_pickle.feature_importances_))

-        X_train, X_test, y_train, y_test = test_template(return_data=True)
+        X_train, X_test, y_train, y_test = template.test_template(return_data=True)
         gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
         gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
-        self.assertDictEqual(gbm.evals_result_, gbm_pickle.evals_result_)
+        for key in gbm.evals_result_:
+            for evals in zip(gbm.evals_result_[key], gbm_pickle.evals_result_[key]):
+                self.assertAlmostEqual(*evals, places=5)
         pred_origin = gbm.predict(X_test)
         pred_pickle = gbm_pickle.predict(X_test)
         self.assertEqual(len(pred_origin), len(pred_pickle))
...
LightGBM.vcxproj

@@ -196,7 +196,7 @@
     <ClInclude Include="..\include\LightGBM\c_api.h" />
     <ClInclude Include="..\include\LightGBM\dataset.h" />
     <ClInclude Include="..\include\LightGBM\dataset_loader.h" />
-    <ClInclude Include="..\include\LightGBM\feature.h" />
+    <ClInclude Include="..\include\LightGBM\feature_group.h" />
     <ClInclude Include="..\include\LightGBM\meta.h" />
     <ClInclude Include="..\include\LightGBM\metric.h" />
     <ClInclude Include="..\include\LightGBM\network.h" />
@@ -206,6 +206,7 @@
     <ClInclude Include="..\include\LightGBM\utils\array_args.h" />
     <ClInclude Include="..\include\LightGBM\utils\common.h" />
     <ClInclude Include="..\include\LightGBM\utils\log.h" />
+    <ClInclude Include="..\include\LightGBM\utils\openmp_wrapper.h" />
     <ClInclude Include="..\include\LightGBM\utils\pipeline_reader.h" />
     <ClInclude Include="..\include\LightGBM\utils\random.h" />
     <ClInclude Include="..\include\LightGBM\utils\text_reader.h" />
@@ -213,8 +214,10 @@
     <ClInclude Include="..\src\application\predictor.hpp" />
     <ClInclude Include="..\src\boosting\gbdt.h" />
     <ClInclude Include="..\src\boosting\dart.hpp" />
+    <ClInclude Include="..\src\boosting\goss.hpp" />
     <ClInclude Include="..\src\boosting\score_updater.hpp" />
     <ClInclude Include="..\src\io\dense_bin.hpp" />
+    <ClInclude Include="..\src\io\dense_nbits_bin.hpp" />
     <ClInclude Include="..\src\io\ordered_sparse_bin.hpp" />
     <ClInclude Include="..\src\io\parser.hpp" />
     <ClInclude Include="..\src\io\sparse_bin.hpp" />
...

LightGBM.vcxproj.filters

@@ -96,15 +96,9 @@
     <ClInclude Include="..\src\treelearner\data_partition.hpp">
       <Filter>src\treelearner</Filter>
     </ClInclude>
-    <ClInclude Include="..\src\treelearner\feature_histogram.hpp">
-      <Filter>src\treelearner</Filter>
-    </ClInclude>
     <ClInclude Include="..\src\treelearner\leaf_splits.hpp">
       <Filter>src\treelearner</Filter>
     </ClInclude>
-    <ClInclude Include="..\src\treelearner\split_info.hpp">
-      <Filter>src\treelearner</Filter>
-    </ClInclude>
     <ClInclude Include="..\include\LightGBM\application.h">
       <Filter>include\LightGBM</Filter>
     </ClInclude>
@@ -120,9 +114,6 @@
     <ClInclude Include="..\include\LightGBM\dataset.h">
       <Filter>include\LightGBM</Filter>
     </ClInclude>
-    <ClInclude Include="..\include\LightGBM\feature.h">
-      <Filter>include\LightGBM</Filter>
-    </ClInclude>
     <ClInclude Include="..\include\LightGBM\meta.h">
       <Filter>include\LightGBM</Filter>
     </ClInclude>
@@ -171,6 +162,24 @@
     <ClInclude Include="..\src\boosting\dart.hpp">
       <Filter>src\boosting</Filter>
     </ClInclude>
+    <ClInclude Include="..\include\LightGBM\feature_group.h">
+      <Filter>include\LightGBM</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\treelearner\feature_histogram.hpp">
+      <Filter>src\treelearner</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\treelearner\split_info.hpp">
+      <Filter>src\treelearner</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\boosting\goss.hpp">
+      <Filter>src\boosting</Filter>
+    </ClInclude>
+    <ClInclude Include="..\src\io\dense_nbits_bin.hpp">
+      <Filter>src\io</Filter>
+    </ClInclude>
+    <ClInclude Include="..\include\LightGBM\utils\openmp_wrapper.h">
+      <Filter>include\LightGBM\utils</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\src\application\application.cpp">
...