Commit c62dcf73 authored by Guolin Ke's avatar Guolin Ke
Browse files

fix merge bugs.

parent 7a82ba4f
......@@ -12,20 +12,20 @@
namespace LightGBM {
enum BinType {
enum BinType {
NumericalBin,
CategoricalBin
};
};
enum MissingType {
enum MissingType {
None,
Zero,
NaN
};
};
/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
public:
/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
public:
/*! \brief Sum of gradients on this bin */
double sum_gradients = 0.0f;
/*! \brief Sum of hessians on this bin */
......@@ -53,12 +53,12 @@ public:
used_size += type_size;
}
}
};
};
/*! \brief This class is used to convert feature values into bins,
* and to store some meta information for the bins */
class BinMapper {
public:
/*! \brief This class is used to convert feature values into bins,
* and to store some meta information for the bins */
class BinMapper {
public:
BinMapper();
BinMapper(const BinMapper& other);
explicit BinMapper(const void* memory);
......@@ -183,7 +183,7 @@ public:
}
}
private:
private:
/*! \brief Number of bins */
int num_bin_;
MissingType missing_type_;
......@@ -205,18 +205,18 @@ private:
double max_val_;
/*! \brief bin value of feature value 0 */
uint32_t default_bin_;
};
/*!
* \brief Interface for ordered bin data. Efficient for constructing histograms, especially for sparse bins.
* There are 2 advantages to using ordered bins:
* 1. Group the data by leaf to improve the cache hit rate.
* 2. Only store the non-zero bins, which can speed up histogram construction for sparse features.
* However, it brings an additional cost: the bins need to be re-ordered after every split, which is expensive for dense features.
* So ordered bins are only used for sparse situations.
*/
class OrderedBin {
public:
};
/*!
* \brief Interface for ordered bin data. Efficient for constructing histograms, especially for sparse bins.
* There are 2 advantages to using ordered bins:
* 1. Group the data by leaf to improve the cache hit rate.
* 2. Only store the non-zero bins, which can speed up histogram construction for sparse features.
* However, it brings an additional cost: the bins need to be re-ordered after every split, which is expensive for dense features.
* So ordered bins are only used for sparse situations.
*/
class OrderedBin {
public:
/*! \brief virtual destructor */
virtual ~OrderedBin() {}
......@@ -260,11 +260,11 @@ public:
virtual void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) = 0;
virtual data_size_t NonZeroCount(int leaf) const = 0;
};
};
/*! \brief Iterator for one bin column */
class BinIterator {
public:
/*! \brief Iterator for one bin column */
class BinIterator {
public:
/*!
* \brief Get bin data on specific row index
* \param idx Index of this data
......@@ -274,16 +274,16 @@ public:
virtual uint32_t RawGet(data_size_t idx) = 0;
virtual void Reset(data_size_t idx) = 0;
virtual ~BinIterator() = default;
};
/*!
* \brief Interface for bin data. This class stores the bin data for one feature.
* Unlike OrderedBin, this class stores the data in its original order.
* Note that this may cause cache misses when constructing histograms,
* but it does not need any re-ordering, so it will be faster than OrderedBin for dense features.
*/
class Bin {
public:
};
/*!
* \brief Interface for bin data. This class stores the bin data for one feature.
* Unlike OrderedBin, this class stores the data in its original order.
* Note that this may cause cache misses when constructing histograms,
* but it does not need any re-ordering, so it will be faster than OrderedBin for dense features.
*/
class Bin {
public:
/*! \brief virtual destructor */
virtual ~Bin() {}
/*!
......@@ -429,9 +429,9 @@ public:
* \return The bin data object
*/
static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
};
};
inline uint32_t BinMapper::ValueToBin(double value) const {
inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) {
return num_bin_ - 1;
......@@ -457,13 +457,17 @@ inline uint32_t BinMapper::ValueToBin(double value) const {
return l;
} else {
int int_value = static_cast<int>(value);
// convert negative value to NaN bin
if (int_value < 0) {
return num_bin_ - 1;
}
if (categorical_2_bin_.count(int_value)) {
return categorical_2_bin_.at(int_value);
} else {
return num_bin_ - 1;
}
}
}
}
} // namespace LightGBM
......
......@@ -409,7 +409,8 @@ inline void Tree::TreeSHAP(const double *feature_values, double *phi,
// internal node
} else {
const int hot_index = Decision(feature_values[split_index], node);
const int hot_index =
decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](feature_values[split_index], threshold_[node]);
const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
const double w = data_count(node);
const double hot_zero_fraction = data_count(hot_index)/w;
......
......@@ -16,11 +16,11 @@
namespace LightGBM {
BinMapper::BinMapper() {
}
BinMapper::BinMapper() {
}
// deep copy function for BinMapper
BinMapper::BinMapper(const BinMapper& other) {
// deep copy function for BinMapper
BinMapper::BinMapper(const BinMapper& other) {
num_bin_ = other.num_bin_;
missing_type_ = other.missing_type_;
is_trival_ = other.is_trival_;
......@@ -35,17 +35,17 @@ BinMapper::BinMapper(const BinMapper& other) {
min_val_ = other.min_val_;
max_val_ = other.max_val_;
default_bin_ = other.default_bin_;
}
}
BinMapper::BinMapper(const void* memory) {
BinMapper::BinMapper(const void* memory) {
CopyFrom(reinterpret_cast<const char*>(memory));
}
}
BinMapper::~BinMapper() {
BinMapper::~BinMapper() {
}
}
bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
bool NeedFilter(const std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
if (bin_type == BinType::NumericalBin) {
int sum_left = 0;
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
......@@ -55,17 +55,21 @@ bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, Bin
}
}
} else {
if (cnt_in_bin.size() <= 2) {
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
int sum_left = cnt_in_bin[i];
if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
return false;
}
}
} else {
return false;
}
}
return true;
}
}
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, size_t total_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
if (num_distinct_values <= max_bin) {
......@@ -134,13 +138,13 @@ std::vector<double> GreedyFindBin(const double* distinct_values, const int* coun
bin_upper_bound[bin_cnt - 1] = std::numeric_limits<double>::infinity();
}
return bin_upper_bound;
}
}
std::vector<double> FindBinWithZeroAsMissing(const double* distinct_values, const int* counts,
std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
int left_cnt_data = 0;
int cnt_missing = 0;
int cnt_zero = 0;
int right_cnt_data = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] <= -kZeroAsMissingValueRange) {
......@@ -148,11 +152,11 @@ std::vector<double> FindBinWithZeroAsMissing(const double* distinct_values, cons
} else if (distinct_values[i] > kZeroAsMissingValueRange) {
right_cnt_data += counts[i];
} else {
cnt_missing += counts[i];
cnt_zero += counts[i];
}
}
int left_cnt = 0;
int left_cnt = -1;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kZeroAsMissingValueRange) {
left_cnt = i;
......@@ -160,8 +164,12 @@ std::vector<double> FindBinWithZeroAsMissing(const double* distinct_values, cons
}
}
if (left_cnt < 0) {
left_cnt = num_distinct_values;
}
if (left_cnt > 0) {
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_missing) * (max_bin - 1));
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
bin_upper_bound.back() = -kZeroAsMissingValueRange;
}
......@@ -184,9 +192,9 @@ std::vector<double> FindBinWithZeroAsMissing(const double* distinct_values, cons
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}
return bin_upper_bound;
}
}
void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
int na_cnt = 0;
int tmp_num_sample_values = 0;
......@@ -204,9 +212,9 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp
missing_type_ = MissingType::None;
} else {
missing_type_ = MissingType::NaN;
}
na_cnt = num_sample_values - tmp_num_sample_values;
}
}
num_sample_values = tmp_num_sample_values;
bin_type_ = bin_type;
......@@ -253,14 +261,14 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp
int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (missing_type_ == MissingType::Zero) {
bin_upper_bound_ = FindBinWithZeroAsMissing(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
if (bin_upper_bound_.size() == 2) {
missing_type_ = MissingType::None;
}
} else if (missing_type_ == MissingType::None) {
bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
} else {
bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
bin_upper_bound_.push_back(NaN);
}
num_bin_ = static_cast<int>(bin_upper_bound_.size());
......@@ -279,8 +287,6 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp
}
CHECK(num_bin_ <= max_bin);
} else {
// No missing handle for categorical features
missing_type_ = MissingType::None;
// convert to int type first
std::vector<int> distinct_values_int;
std::vector<int> counts_int;
......@@ -296,22 +302,52 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp
}
// sort by counts
Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
// avoid first bin is zero
if (distinct_values_int[0] == 0 && counts_int.size() > 1) {
std::swap(counts_int[0], counts_int[1]);
std::swap(distinct_values_int[0], distinct_values_int[1]);
}
// will ignore the categorical of small counts
const int cut_cnt = static_cast<int>(total_sample_cnt * 0.98f);
int cut_cnt = static_cast<int>((total_sample_cnt - na_cnt) * 0.99f);
size_t cur_cat = 0;
categorical_2_bin_.clear();
bin_2_categorical_.clear();
num_bin_ = 0;
int used_cnt = 0;
max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
while (used_cnt < cut_cnt || num_bin_ < max_bin) {
bin_2_categorical_.push_back(distinct_values_int[num_bin_]);
categorical_2_bin_[distinct_values_int[num_bin_]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[num_bin_];
cnt_in_bin.clear();
while (cur_cat < distinct_values_int.size()
&& (used_cnt < cut_cnt || num_bin_ < max_bin)) {
if (distinct_values_int[cur_cat] < 0) {
na_cnt += counts_int[cur_cat];
cut_cnt -= counts_int[cur_cat];
Log::Warning("Met negative value in categorical features, will convert it to NaN");
} else {
bin_2_categorical_.push_back(distinct_values_int[cur_cat]);
categorical_2_bin_[distinct_values_int[cur_cat]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[cur_cat];
cnt_in_bin.push_back(counts_int[cur_cat]);
++num_bin_;
}
cnt_in_bin = counts_int;
counts_int.resize(num_bin_);
counts_int.back() += static_cast<int>(total_sample_cnt - used_cnt);
++cur_cat;
}
// need an additional bin for NaN
if (cur_cat == distinct_values_int.size() && na_cnt > 0) {
// use -1 to represent NaN
bin_2_categorical_.push_back(-1);
categorical_2_bin_[-1] = num_bin_;
cnt_in_bin.push_back(0);
++num_bin_;
}
// Use MissingType::None to represent this bin contains all categoricals
if (cur_cat == distinct_values_int.size() && na_cnt == 0) {
missing_type_ = MissingType::None;
} else if (na_cnt == 0) {
missing_type_ = MissingType::Zero;
} else {
missing_type_ = MissingType::NaN;
}
cnt_in_bin.back() += static_cast<int>(total_sample_cnt - used_cnt);
}
// check trivial (num_bin_ == 1) feature
......@@ -327,13 +363,16 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp
if (!is_trival_) {
default_bin_ = ValueToBin(0);
if (bin_type_ == BinType::CategoricalBin) {
CHECK(default_bin_ > 0);
}
}
// calculate sparse rate
sparse_rate_ = static_cast<double>(cnt_in_bin[default_bin_]) / static_cast<double>(total_sample_cnt);
}
}
int BinMapper::SizeForSpecificBin(int bin) {
int BinMapper::SizeForSpecificBin(int bin) {
int size = 0;
size += sizeof(int);
size += sizeof(MissingType);
......@@ -344,9 +383,9 @@ int BinMapper::SizeForSpecificBin(int bin) {
size += bin * sizeof(double);
size += sizeof(uint32_t);
return size;
}
}
void BinMapper::CopyTo(char * buffer) const {
void BinMapper::CopyTo(char * buffer) const {
std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
buffer += sizeof(num_bin_);
std::memcpy(buffer, &missing_type_, sizeof(missing_type_));
......@@ -368,9 +407,9 @@ void BinMapper::CopyTo(char * buffer) const {
} else {
std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
}
}
}
void BinMapper::CopyFrom(const char * buffer) {
void BinMapper::CopyFrom(const char * buffer) {
std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
buffer += sizeof(num_bin_);
std::memcpy(&missing_type_, buffer, sizeof(missing_type_));
......@@ -398,9 +437,9 @@ void BinMapper::CopyFrom(const char * buffer) {
categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
}
}
}
}
void BinMapper::SaveBinaryToFile(FILE* file) const {
void BinMapper::SaveBinaryToFile(FILE* file) const {
fwrite(&num_bin_, sizeof(num_bin_), 1, file);
fwrite(&missing_type_, sizeof(missing_type_), 1, file);
fwrite(&is_trival_, sizeof(is_trival_), 1, file);
......@@ -414,9 +453,9 @@ void BinMapper::SaveBinaryToFile(FILE* file) const {
} else {
fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
}
}
}
size_t BinMapper::SizesInByte() const {
size_t BinMapper::SizesInByte() const {
size_t ret = sizeof(num_bin_) + sizeof(missing_type_) + sizeof(is_trival_) + sizeof(sparse_rate_)
+ sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
......@@ -425,21 +464,21 @@ size_t BinMapper::SizesInByte() const {
ret += sizeof(int) * num_bin_;
}
return ret;
}
}
template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;
template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
bool is_enable_sparse, double sparse_threshold, bool* is_sparse) {
// sparse threshold
if (sparse_rate >= sparse_threshold && is_enable_sparse) {
......@@ -449,9 +488,9 @@ Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
*is_sparse = false;
return CreateDenseBin(num_data, num_bin);
}
}
}
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 16) {
return new Dense4bitsBin(num_data);
} else if (num_bin <= 256) {
......@@ -461,9 +500,9 @@ Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
} else {
return new DenseBin<uint32_t>(num_data);
}
}
}
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 256) {
return new SparseBin<uint8_t>(num_data);
} else if (num_bin <= 65536) {
......@@ -471,6 +510,6 @@ Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
} else {
return new SparseBin<uint32_t>(num_data);
}
}
}
} // namespace LightGBM
......@@ -239,7 +239,7 @@ void OverallConfig::CheckParamConflict() {
}
// Check max_depth and num_leaves
if (boosting_config.tree_config.max_depth > 0) {
int full_num_leaves = std::pow(2, boosting_config.tree_config.max_depth);
int full_num_leaves = static_cast<int>(std::pow(2, boosting_config.tree_config.max_depth));
if (full_num_leaves > boosting_config.tree_config.num_leaves
&& boosting_config.tree_config.num_leaves == kDefaultNumLeaves) {
Log::Warning("Accuarcy may be bad since you didn't set num_leaves.");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment