Commit 952099d6 authored by Guolin Ke's avatar Guolin Ke
Browse files

fix bugs in bin finder

parent 9f04f276
......@@ -16,18 +16,6 @@ namespace LightGBM {
namespace Common {
// Returns the larger of the two arguments (behaves like std::max).
template<typename T>
inline static T Max(const T& a, const T& b) {
  if (a > b) {
    return a;
  }
  return b;
}
// Returns the smaller of the two arguments (behaves like std::min).
template<typename T>
inline static T Min(const T& a, const T& b) {
  if (a < b) {
    return a;
  }
  return b;
}
inline static std::string& Trim(std::string& str) {
if (str.size() <= 0) {
return str;
......@@ -329,8 +317,8 @@ inline static std::string Join(const std::vector<T>& strs, size_t start, size_t
if (end - start <= 0) {
return std::string("");
}
start = Min<size_t>(start, static_cast<size_t>(strs.size()) - 1);
end = Min<size_t>(end, static_cast<size_t>(strs.size()));
start = std::min(start, static_cast<size_t>(strs.size()) - 1);
end = std::min(end, static_cast<size_t>(strs.size()));
std::stringstream ss;
ss << strs[start];
for (size_t i = start + 1; i < end; ++i) {
......
......@@ -110,9 +110,9 @@ void DART::DroppingTrees() {
drop_index_ = random_for_drop_.Sample(iter_, 1);
}
// drop trees
for (int i: drop_index_) {
for (auto i: drop_index_) {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
int curr_tree = i * num_class_ + curr_class;
auto curr_tree = i * num_class_ + curr_class;
models_[curr_tree]->Shrinkage(-1.0);
train_score_updater_->AddScore(models_[curr_tree], curr_class);
}
......@@ -122,9 +122,9 @@ void DART::DroppingTrees() {
void DART::Normalize() {
double k = static_cast<double>(drop_index_.size());
for (int i: drop_index_) {
for (auto i: drop_index_) {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
int curr_tree = i * num_class_ + curr_class;
auto curr_tree = i * num_class_ + curr_class;
// update validation score
models_[curr_tree]->Shrinkage(shrinkage_rate_);
for (auto& score_updater : valid_score_updater_) {
......
......@@ -385,7 +385,7 @@ void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filen
model_output_file_ << models_[i]->ToString() << std::endl;
}
saved_model_size_ = Common::Max(saved_model_size_, rest);
saved_model_size_ = std::max(saved_model_size_, rest);
model_output_file_.flush();
// training finished, can close file
......
......@@ -42,29 +42,45 @@ BinMapper::~BinMapper() {
void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin) {
std::vector<double>& ref_values = (*values);
size_t sample_size = total_sample_cnt;
size_t zero_cnt = total_sample_cnt - ref_values.size();
int zero_cnt = static_cast<int>(total_sample_cnt - ref_values.size());
// find distinct_values first
std::vector<double> distinct_values;
std::vector<int> counts;
std::sort(ref_values.begin(), ref_values.end());
// push 0 first
if (zero_cnt > 0) {
distinct_values.push_back(0.0f);
counts.push_back(static_cast<int>(zero_cnt));
// push zero in the front
if (ref_values.size() == 0 || (ref_values[0] > 0.0f && zero_cnt > 0)) {
distinct_values.push_back(0);
counts.push_back(zero_cnt);
}
if (ref_values.size() > 0) {
distinct_values.push_back(ref_values[0]);
counts.push_back(1);
}
for (size_t i = 1; i < ref_values.size(); ++i) {
if (ref_values[i] != ref_values[i - 1]) {
if (ref_values[i - 1] == 0.0f) {
counts.back() += zero_cnt;
} else if (ref_values[i - 1] < 0.0f && ref_values[i] > 0.0f) {
distinct_values.push_back(0);
counts.push_back(zero_cnt);
}
distinct_values.push_back(ref_values[i]);
counts.push_back(1);
} else {
++counts.back();
}
}
// push zero in the back
if (ref_values.size() > 0 && ref_values.back() < 0.0f && zero_cnt > 0) {
distinct_values.push_back(0);
counts.push_back(zero_cnt);
}
int num_values = static_cast<int>(distinct_values.size());
int cnt_in_bin0 = 0;
if (num_values <= max_bin) {
......@@ -78,53 +94,38 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
cnt_in_bin0 = counts[0];
bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
} else {
double min_lower_bound = std::numeric_limits<double>::infinity();
// mean size for one bin
double mean_bin_size = sample_size / static_cast<double>(max_bin);
int rest_sample_cnt = static_cast<int>(sample_size);
int bin_cnt = 0;
std::vector<bool> is_big_count_value(num_values, false);
for (int i = 0; i < num_values; ++i) {
if (counts[i] >= mean_bin_size) { is_big_count_value[i] = true; }
}
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
// sort by count, descent
Common::SortForPair(counts, distinct_values, 0, true);
// fetch big slot as unique bin
while (counts[bin_cnt] > mean_bin_size) {
upper_bounds[bin_cnt] = distinct_values[bin_cnt];
lower_bounds[bin_cnt] = distinct_values[bin_cnt];
if (lower_bounds[bin_cnt] < min_lower_bound) {
min_lower_bound = lower_bounds[bin_cnt];
cnt_in_bin0 = counts[bin_cnt];
}
rest_sample_cnt -= counts[bin_cnt];
++bin_cnt;
}
// process reminder bins
if (bin_cnt < max_bin) {
// sort rest by values
Common::SortForPair<double, int>(distinct_values, counts, bin_cnt, false);
mean_bin_size = rest_sample_cnt / static_cast<double>(max_bin - bin_cnt);
lower_bounds[bin_cnt] = distinct_values[bin_cnt];
int rest_sample_cnt = static_cast<int>(sample_size);
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = bin_cnt; i < num_values - 1; ++i) {
for (int i = 0; i < num_values - 1; ++i) {
rest_sample_cnt -= counts[i];
cur_cnt_inbin += counts[i];
// need a new bin
if (cur_cnt_inbin >= mean_bin_size) {
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
if (lower_bounds[bin_cnt] < min_lower_bound) {
min_lower_bound = lower_bounds[bin_cnt];
if (bin_cnt == 0) {
cnt_in_bin0 = cur_cnt_inbin;
}
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) break;
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
mean_bin_size = rest_sample_cnt / static_cast<double>(max_bin - bin_cnt);
}
}
cur_cnt_inbin += counts[num_values - 1];
}
Common::SortForPair<double, double>(lower_bounds, upper_bounds, 0, false);
//
++bin_cnt;
// update bin upper bound
bin_upper_bound_ = new double[bin_cnt];
num_bin_ = bin_cnt;
......
......@@ -657,7 +657,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
start[0] = 0;
for (int i = 0; i < num_machines - 1; ++i) {
len[i] = Common::Min<int>(step, total_num_feature - start[i]);
len[i] = std::min(step, total_num_feature - start[i]);
start[i + 1] = start[i] + len[i];
}
len[num_machines - 1] = total_num_feature - start[num_machines - 1];
......
#ifndef LIGHTGBM_NETWORK_LINKERS_H_
#define LIGHTGBM_NETWORK_LINKERS_H_
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <LightGBM/network.h>
#include <algorithm>
#include <chrono>
#include <ctime>
#ifdef USE_SOCKET
#include "socket_wrapper.hpp"
#include <LightGBM/utils/common.h>
......@@ -171,9 +172,9 @@ inline const RecursiveHalvingMap& Linkers::recursive_halving_map() {
inline void Linkers::Recv(int rank, char* data, int len) const {
int recv_cnt = 0;
while (recv_cnt < len) {
recv_cnt += linkers_[rank]->Recv(data + recv_cnt ,
recv_cnt += linkers_[rank]->Recv(data + recv_cnt,
//len - recv_cnt
Common::Min<int>(len - recv_cnt, SocketConfig::kMaxReceiveSize)
std::min(len - recv_cnt, SocketConfig::kMaxReceiveSize)
);
}
}
......
......@@ -54,7 +54,7 @@ void Network::Allreduce(char* input, int input_size, int type_size, char* output
}
block_start_[0] = 0;
for (int i = 0; i < num_machines_ - 1; ++i) {
block_len_[i] = Common::Min<int>(step * type_size, input_size - block_start_[i]);
block_len_[i] = std::min(step * type_size, input_size - block_start_[i]);
block_start_[i + 1] = block_start_[i] + block_len_[i];
}
block_len_[num_machines_ - 1] = input_size - block_start_[num_machines_ - 1];
......@@ -108,7 +108,7 @@ void Network::Allgather(char* input, int all_size, int* block_start, int* block_
int accumulated_block = 1;
for (int i = 0; i < bruck_map_.k; ++i) {
// get current local block size
int cur_block_size = Common::Min<int>(1 << i, num_machines_ - accumulated_block);
int cur_block_size = std::min(1 << i, num_machines_ - accumulated_block);
// get out rank
int out_rank = bruck_map_.out_ranks[i];
// get send information
......
......@@ -3,7 +3,7 @@
#ifdef USE_SOCKET
#if defined(_WIN32)
#define NOMINMAX
#include <winsock2.h>
#include <ws2tcpip.h>
#include <iphlpapi.h>
......
......@@ -62,8 +62,8 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
max_cache_size = static_cast<int>(histogram_pool_size_ * 1024 * 1024 / total_histogram_size);
}
// at least need 2 leaves
max_cache_size = Common::Max(2, max_cache_size);
max_cache_size = Common::Min(max_cache_size, num_leaves_);
max_cache_size = std::max(2, max_cache_size);
max_cache_size = std::min(max_cache_size, num_leaves_);
histogram_pool_.ResetSize(max_cache_size, num_leaves_);
auto histogram_create_function = [this]() {
......
Markdown is supported
Attach a file by drag & drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.