Commit 952099d6 authored by Guolin Ke's avatar Guolin Ke
Browse files

fix bugs in bin finder

parent 9f04f276
...@@ -16,18 +16,6 @@ namespace LightGBM { ...@@ -16,18 +16,6 @@ namespace LightGBM {
namespace Common { namespace Common {
// Returns a copy of the larger of the two arguments.
// When `a > b` is false (including when the values compare equal),
// the second argument `b` is the one returned.
template<typename T>
inline static T Max(const T& a, const T& b) {
  if (a > b) {
    return a;
  }
  return b;
}
// Returns a copy of the smaller of the two arguments.
// When `a < b` is false (including when the values compare equal),
// the second argument `b` is the one returned.
template<typename T>
inline static T Min(const T& a, const T& b) {
  if (a < b) {
    return a;
  }
  return b;
}
inline static std::string& Trim(std::string& str) { inline static std::string& Trim(std::string& str) {
if (str.size() <= 0) { if (str.size() <= 0) {
return str; return str;
...@@ -329,8 +317,8 @@ inline static std::string Join(const std::vector<T>& strs, size_t start, size_t ...@@ -329,8 +317,8 @@ inline static std::string Join(const std::vector<T>& strs, size_t start, size_t
if (end - start <= 0) { if (end - start <= 0) {
return std::string(""); return std::string("");
} }
start = Min<size_t>(start, static_cast<size_t>(strs.size()) - 1); start = std::min(start, static_cast<size_t>(strs.size()) - 1);
end = Min<size_t>(end, static_cast<size_t>(strs.size())); end = std::min(end, static_cast<size_t>(strs.size()));
std::stringstream ss; std::stringstream ss;
ss << strs[start]; ss << strs[start];
for (size_t i = start + 1; i < end; ++i) { for (size_t i = start + 1; i < end; ++i) {
......
...@@ -110,9 +110,9 @@ void DART::DroppingTrees() { ...@@ -110,9 +110,9 @@ void DART::DroppingTrees() {
drop_index_ = random_for_drop_.Sample(iter_, 1); drop_index_ = random_for_drop_.Sample(iter_, 1);
} }
// drop trees // drop trees
for (int i: drop_index_) { for (auto i: drop_index_) {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) { for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
int curr_tree = i * num_class_ + curr_class; auto curr_tree = i * num_class_ + curr_class;
models_[curr_tree]->Shrinkage(-1.0); models_[curr_tree]->Shrinkage(-1.0);
train_score_updater_->AddScore(models_[curr_tree], curr_class); train_score_updater_->AddScore(models_[curr_tree], curr_class);
} }
...@@ -122,9 +122,9 @@ void DART::DroppingTrees() { ...@@ -122,9 +122,9 @@ void DART::DroppingTrees() {
void DART::Normalize() { void DART::Normalize() {
double k = static_cast<double>(drop_index_.size()); double k = static_cast<double>(drop_index_.size());
for (int i: drop_index_) { for (auto i: drop_index_) {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) { for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
int curr_tree = i * num_class_ + curr_class; auto curr_tree = i * num_class_ + curr_class;
// update validation score // update validation score
models_[curr_tree]->Shrinkage(shrinkage_rate_); models_[curr_tree]->Shrinkage(shrinkage_rate_);
for (auto& score_updater : valid_score_updater_) { for (auto& score_updater : valid_score_updater_) {
......
...@@ -385,7 +385,7 @@ void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filen ...@@ -385,7 +385,7 @@ void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filen
model_output_file_ << models_[i]->ToString() << std::endl; model_output_file_ << models_[i]->ToString() << std::endl;
} }
saved_model_size_ = Common::Max(saved_model_size_, rest); saved_model_size_ = std::max(saved_model_size_, rest);
model_output_file_.flush(); model_output_file_.flush();
// training finished, can close file // training finished, can close file
......
...@@ -42,29 +42,45 @@ BinMapper::~BinMapper() { ...@@ -42,29 +42,45 @@ BinMapper::~BinMapper() {
void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin) { void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin) {
std::vector<double>& ref_values = (*values); std::vector<double>& ref_values = (*values);
size_t sample_size = total_sample_cnt; size_t sample_size = total_sample_cnt;
size_t zero_cnt = total_sample_cnt - ref_values.size(); int zero_cnt = static_cast<int>(total_sample_cnt - ref_values.size());
// find distinct_values first // find distinct_values first
std::vector<double> distinct_values; std::vector<double> distinct_values;
std::vector<int> counts; std::vector<int> counts;
std::sort(ref_values.begin(), ref_values.end()); std::sort(ref_values.begin(), ref_values.end());
// push 0 first
if (zero_cnt > 0) { // push zero in the front
distinct_values.push_back(0.0f); if (ref_values.size() == 0 || (ref_values[0] > 0.0f && zero_cnt > 0)) {
counts.push_back(static_cast<int>(zero_cnt)); distinct_values.push_back(0);
counts.push_back(zero_cnt);
} }
if (ref_values.size() > 0) { if (ref_values.size() > 0) {
distinct_values.push_back(ref_values[0]); distinct_values.push_back(ref_values[0]);
counts.push_back(1); counts.push_back(1);
} }
for (size_t i = 1; i < ref_values.size(); ++i) { for (size_t i = 1; i < ref_values.size(); ++i) {
if (ref_values[i] != ref_values[i - 1]) { if (ref_values[i] != ref_values[i - 1]) {
if (ref_values[i - 1] == 0.0f) {
counts.back() += zero_cnt;
} else if (ref_values[i - 1] < 0.0f && ref_values[i] > 0.0f) {
distinct_values.push_back(0);
counts.push_back(zero_cnt);
}
distinct_values.push_back(ref_values[i]); distinct_values.push_back(ref_values[i]);
counts.push_back(1); counts.push_back(1);
} else { } else {
++counts.back(); ++counts.back();
} }
} }
// push zero in the back
if (ref_values.size() > 0 && ref_values.back() < 0.0f && zero_cnt > 0) {
distinct_values.push_back(0);
counts.push_back(zero_cnt);
}
int num_values = static_cast<int>(distinct_values.size()); int num_values = static_cast<int>(distinct_values.size());
int cnt_in_bin0 = 0; int cnt_in_bin0 = 0;
if (num_values <= max_bin) { if (num_values <= max_bin) {
...@@ -78,53 +94,38 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in ...@@ -78,53 +94,38 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
cnt_in_bin0 = counts[0]; cnt_in_bin0 = counts[0];
bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity(); bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
} else { } else {
double min_lower_bound = std::numeric_limits<double>::infinity();
// mean size for one bin // mean size for one bin
double mean_bin_size = sample_size / static_cast<double>(max_bin); double mean_bin_size = sample_size / static_cast<double>(max_bin);
int rest_sample_cnt = static_cast<int>(sample_size); std::vector<bool> is_big_count_value(num_values, false);
int bin_cnt = 0; for (int i = 0; i < num_values; ++i) {
if (counts[i] >= mean_bin_size) { is_big_count_value[i] = true; }
}
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity()); std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity()); std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
// sort by count, descent
Common::SortForPair(counts, distinct_values, 0, true); int rest_sample_cnt = static_cast<int>(sample_size);
// fetch big slot as unique bin int bin_cnt = 0;
while (counts[bin_cnt] > mean_bin_size) { lower_bounds[bin_cnt] = distinct_values[0];
upper_bounds[bin_cnt] = distinct_values[bin_cnt]; int cur_cnt_inbin = 0;
lower_bounds[bin_cnt] = distinct_values[bin_cnt]; for (int i = 0; i < num_values - 1; ++i) {
if (lower_bounds[bin_cnt] < min_lower_bound) { rest_sample_cnt -= counts[i];
min_lower_bound = lower_bounds[bin_cnt]; cur_cnt_inbin += counts[i];
cnt_in_bin0 = counts[bin_cnt]; // need a new bin
} if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
rest_sample_cnt -= counts[bin_cnt]; (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
++bin_cnt; upper_bounds[bin_cnt] = distinct_values[i];
} if (bin_cnt == 0) {
// process reminder bins cnt_in_bin0 = cur_cnt_inbin;
if (bin_cnt < max_bin) {
// sort rest by values
Common::SortForPair<double, int>(distinct_values, counts, bin_cnt, false);
mean_bin_size = rest_sample_cnt / static_cast<double>(max_bin - bin_cnt);
lower_bounds[bin_cnt] = distinct_values[bin_cnt];
int cur_cnt_inbin = 0;
for (int i = bin_cnt; i < num_values - 1; ++i) {
rest_sample_cnt -= counts[i];
cur_cnt_inbin += counts[i];
// need a new bin
if (cur_cnt_inbin >= mean_bin_size) {
upper_bounds[bin_cnt] = distinct_values[i];
if (lower_bounds[bin_cnt] < min_lower_bound) {
min_lower_bound = lower_bounds[bin_cnt];
cnt_in_bin0 = cur_cnt_inbin;
}
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) break;
cur_cnt_inbin = 0;
mean_bin_size = rest_sample_cnt / static_cast<double>(max_bin - bin_cnt);
} }
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
mean_bin_size = rest_sample_cnt / static_cast<double>(max_bin - bin_cnt);
} }
cur_cnt_inbin += counts[num_values - 1];
} }
Common::SortForPair<double, double>(lower_bounds, upper_bounds, 0, false); //
++bin_cnt;
// update bin upper bound // update bin upper bound
bin_upper_bound_ = new double[bin_cnt]; bin_upper_bound_ = new double[bin_cnt];
num_bin_ = bin_cnt; num_bin_ = bin_cnt;
......
...@@ -657,7 +657,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, ...@@ -657,7 +657,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
start[0] = 0; start[0] = 0;
for (int i = 0; i < num_machines - 1; ++i) { for (int i = 0; i < num_machines - 1; ++i) {
len[i] = Common::Min<int>(step, total_num_feature - start[i]); len[i] = std::min(step, total_num_feature - start[i]);
start[i + 1] = start[i] + len[i]; start[i + 1] = start[i] + len[i];
} }
len[num_machines - 1] = total_num_feature - start[num_machines - 1]; len[num_machines - 1] = total_num_feature - start[num_machines - 1];
......
#ifndef LIGHTGBM_NETWORK_LINKERS_H_ #ifndef LIGHTGBM_NETWORK_LINKERS_H_
#define LIGHTGBM_NETWORK_LINKERS_H_ #define LIGHTGBM_NETWORK_LINKERS_H_
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
#include <LightGBM/config.h> #include <LightGBM/config.h>
#include <LightGBM/network.h> #include <LightGBM/network.h>
#include <algorithm>
#include <chrono> #include <chrono>
#include <ctime> #include <ctime>
#ifdef USE_SOCKET #ifdef USE_SOCKET
#include "socket_wrapper.hpp" #include "socket_wrapper.hpp"
#include <LightGBM/utils/common.h> #include <LightGBM/utils/common.h>
...@@ -171,9 +172,9 @@ inline const RecursiveHalvingMap& Linkers::recursive_halving_map() { ...@@ -171,9 +172,9 @@ inline const RecursiveHalvingMap& Linkers::recursive_halving_map() {
inline void Linkers::Recv(int rank, char* data, int len) const { inline void Linkers::Recv(int rank, char* data, int len) const {
int recv_cnt = 0; int recv_cnt = 0;
while (recv_cnt < len) { while (recv_cnt < len) {
recv_cnt += linkers_[rank]->Recv(data + recv_cnt , recv_cnt += linkers_[rank]->Recv(data + recv_cnt,
//len - recv_cnt //len - recv_cnt
Common::Min<int>(len - recv_cnt, SocketConfig::kMaxReceiveSize) std::min(len - recv_cnt, SocketConfig::kMaxReceiveSize)
); );
} }
} }
......
...@@ -54,7 +54,7 @@ void Network::Allreduce(char* input, int input_size, int type_size, char* output ...@@ -54,7 +54,7 @@ void Network::Allreduce(char* input, int input_size, int type_size, char* output
} }
block_start_[0] = 0; block_start_[0] = 0;
for (int i = 0; i < num_machines_ - 1; ++i) { for (int i = 0; i < num_machines_ - 1; ++i) {
block_len_[i] = Common::Min<int>(step * type_size, input_size - block_start_[i]); block_len_[i] = std::min(step * type_size, input_size - block_start_[i]);
block_start_[i + 1] = block_start_[i] + block_len_[i]; block_start_[i + 1] = block_start_[i] + block_len_[i];
} }
block_len_[num_machines_ - 1] = input_size - block_start_[num_machines_ - 1]; block_len_[num_machines_ - 1] = input_size - block_start_[num_machines_ - 1];
...@@ -108,7 +108,7 @@ void Network::Allgather(char* input, int all_size, int* block_start, int* block_ ...@@ -108,7 +108,7 @@ void Network::Allgather(char* input, int all_size, int* block_start, int* block_
int accumulated_block = 1; int accumulated_block = 1;
for (int i = 0; i < bruck_map_.k; ++i) { for (int i = 0; i < bruck_map_.k; ++i) {
// get current local block size // get current local block size
int cur_block_size = Common::Min<int>(1 << i, num_machines_ - accumulated_block); int cur_block_size = std::min(1 << i, num_machines_ - accumulated_block);
// get out rank // get out rank
int out_rank = bruck_map_.out_ranks[i]; int out_rank = bruck_map_.out_ranks[i];
// get send information // get send information
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#ifdef USE_SOCKET #ifdef USE_SOCKET
#if defined(_WIN32) #if defined(_WIN32)
#define NOMINMAX
#include <winsock2.h> #include <winsock2.h>
#include <ws2tcpip.h> #include <ws2tcpip.h>
#include <iphlpapi.h> #include <iphlpapi.h>
......
...@@ -62,8 +62,8 @@ void SerialTreeLearner::Init(const Dataset* train_data) { ...@@ -62,8 +62,8 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
max_cache_size = static_cast<int>(histogram_pool_size_ * 1024 * 1024 / total_histogram_size); max_cache_size = static_cast<int>(histogram_pool_size_ * 1024 * 1024 / total_histogram_size);
} }
// at least need 2 leaves // at least need 2 leaves
max_cache_size = Common::Max(2, max_cache_size); max_cache_size = std::max(2, max_cache_size);
max_cache_size = Common::Min(max_cache_size, num_leaves_); max_cache_size = std::min(max_cache_size, num_leaves_);
histogram_pool_.ResetSize(max_cache_size, num_leaves_); histogram_pool_.ResetSize(max_cache_size, num_leaves_);
auto histogram_create_function = [this]() { auto histogram_create_function = [this]() {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment