Commit c62dcf73 authored by Guolin Ke's avatar Guolin Ke
Browse files

fix merge bugs.

parent 7a82ba4f
......@@ -12,458 +12,462 @@
namespace LightGBM {
/*! \brief Kind of binning applied to a feature (selects which BinMapper tables are used) */
enum BinType {
/*! \brief Numerical feature: BinMapper stores one upper bound per bin */
NumericalBin,
/*! \brief Categorical feature: BinMapper stores a category <-> bin mapping */
CategoricalBin
};
/*! \brief How missing values of a feature are represented */
enum MissingType {
/*! \brief NOTE(review): presumably no dedicated missing-value handling - confirm against BinMapper::FindBin */
None,
/*! \brief NOTE(review): presumably zero is treated as the missing value (see FindBin's zero_as_missing) - confirm */
Zero,
/*! \brief NaN is the missing value; BinMapper::ValueToBin maps NaN to the last bin (num_bin - 1) */
NaN
};
/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
 public:
  /*! \brief Sum of gradients on this bin */
  double sum_gradients = 0.0;
  /*! \brief Sum of hessians on this bin */
  double sum_hessians = 0.0;
  /*! \brief Number of data on this bin */
  data_size_t cnt = 0;
  /*!
  * \brief Sum up (reducer) function for histogram bins: adds every entry of src into the
  *        entry at the same position of dst
  * \param src Buffer holding consecutive HistogramBinEntry records to be added
  * \param dst Buffer holding the records that are updated in place
  * \param len Size in bytes of each buffer. Only whole entries are reduced: trailing bytes
  *        that do not form a complete HistogramBinEntry are ignored instead of being read
  *        and written past the end of the buffers. A non-positive len is a no-op.
  */
  inline static void SumReducer(const char *src, char *dst, int len) {
    const int type_size = static_cast<int>(sizeof(HistogramBinEntry));
    // Integer division floors, so a partial trailing record is never touched.
    const int num_entries = len / type_size;
    const HistogramBinEntry* p1 = reinterpret_cast<const HistogramBinEntry*>(src);
    HistogramBinEntry* p2 = reinterpret_cast<HistogramBinEntry*>(dst);
    for (int i = 0; i < num_entries; ++i) {
      p2[i].cnt += p1[i].cnt;
      p2[i].sum_gradients += p1[i].sum_gradients;
      p2[i].sum_hessians += p1[i].sum_hessians;
    }
  }
};
/*! \brief This class used to convert feature values into bin,
* and store some meta information for bin*/
class BinMapper {
public:
BinMapper();
BinMapper(const BinMapper& other);
explicit BinMapper(const void* memory);
~BinMapper();
bool CheckAlign(const BinMapper& other) const {
if (num_bin_ != other.num_bin_) {
return false;
}
if (missing_type_ != other.missing_type_) {
return false;
/*! \brief Kind of binning applied to a feature (selects which BinMapper tables are used) */
enum BinType {
/*! \brief Numerical feature: BinMapper stores one upper bound per bin */
NumericalBin,
/*! \brief Categorical feature: BinMapper stores a category <-> bin mapping */
CategoricalBin
};
/*! \brief How missing values of a feature are represented */
enum MissingType {
/*! \brief NOTE(review): presumably no dedicated missing-value handling - confirm against BinMapper::FindBin */
None,
/*! \brief NOTE(review): presumably zero is treated as the missing value (see FindBin's zero_as_missing) - confirm */
Zero,
/*! \brief NaN is the missing value; BinMapper::ValueToBin maps NaN to the last bin (num_bin - 1) */
NaN
};
/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
public:
/*! \brief Sum of gradients on this bin */
double sum_gradients = 0.0f;
/*! \brief Sum of hessians on this bin */
double sum_hessians = 0.0f;
/*! \brief Number of data on this bin */
data_size_t cnt = 0;
/*!
* \brief Sum up (reducers) functions for histogram bin
*/
inline static void SumReducer(const char *src, char *dst, int len) {
const int type_size = sizeof(HistogramBinEntry);
int used_size = 0;
const HistogramBinEntry* p1;
HistogramBinEntry* p2;
while (used_size < len) {
// convert
p1 = reinterpret_cast<const HistogramBinEntry*>(src);
p2 = reinterpret_cast<HistogramBinEntry*>(dst);
// add
p2->cnt += p1->cnt;
p2->sum_gradients += p1->sum_gradients;
p2->sum_hessians += p1->sum_hessians;
src += type_size;
dst += type_size;
used_size += type_size;
}
}
if (bin_type_ == BinType::NumericalBin) {
for (int i = 0; i < num_bin_; ++i) {
if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
return false;
}
};
/*! \brief This class used to convert feature values into bin,
* and store some meta information for bin*/
class BinMapper {
public:
BinMapper();
BinMapper(const BinMapper& other);
explicit BinMapper(const void* memory);
~BinMapper();
bool CheckAlign(const BinMapper& other) const {
if (num_bin_ != other.num_bin_) {
return false;
}
} else {
for (int i = 0; i < num_bin_; i++) {
if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
return false;
if (missing_type_ != other.missing_type_) {
return false;
}
if (bin_type_ == BinType::NumericalBin) {
for (int i = 0; i < num_bin_; ++i) {
if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
return false;
}
}
} else {
for (int i = 0; i < num_bin_; i++) {
if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
return false;
}
}
}
return true;
}
return true;
}
/*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; }
/*! \brief Missing Type */
inline MissingType missing_type() const { return missing_type_; }
/*! \brief True if bin is trival (contains only one bin) */
inline bool is_trival() const { return is_trival_; }
/*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
inline double sparse_rate() const { return sparse_rate_; }
/*!
* \brief Save binary data to file
* \param file File want to write
*/
void SaveBinaryToFile(FILE* file) const;
/*!
* \brief Mapping bin into feature value
* \param bin
* \return Feature value of this bin
*/
inline double BinToValue(uint32_t bin) const {
if (bin_type_ == BinType::NumericalBin) {
return bin_upper_bound_[bin];
} else {
return bin_2_categorical_[bin];
/*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; }
/*! \brief Missing Type */
inline MissingType missing_type() const { return missing_type_; }
/*! \brief True if bin is trival (contains only one bin) */
inline bool is_trival() const { return is_trival_; }
/*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
inline double sparse_rate() const { return sparse_rate_; }
/*!
* \brief Save binary data to file
* \param file File want to write
*/
void SaveBinaryToFile(FILE* file) const;
/*!
* \brief Mapping bin into feature value
* \param bin
* \return Feature value of this bin
*/
inline double BinToValue(uint32_t bin) const {
if (bin_type_ == BinType::NumericalBin) {
return bin_upper_bound_[bin];
} else {
return bin_2_categorical_[bin];
}
}
}
/*!
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const;
/*!
* \brief Mapping feature value into bin
* \param value
* \return bin for this feature value
*/
inline uint32_t ValueToBin(double value) const;
/*!
* \brief Get the default bin when value is 0
* \return default bin
*/
inline uint32_t GetDefaultBin() const {
return default_bin_;
}
/*!
* \brief Construct feature value to bin mapper according feature values
* \param values (Sampled) values of this feature, Note: not include zero.
* \param num_values number of values.
* \param total_sample_cnt number of total sample count, equal with values.size() + num_zeros
* \param max_bin The maximal number of bin
* \param min_data_in_bin min number of data in one bin
* \param min_split_data
* \param bin_type Type of this bin
* \param use_missing True to enable missing value handle
* \param zero_as_missing True to use zero as missing value
*/
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
bool use_missing, bool zero_as_missing);
/*!
* \brief Use specific number of bin to calculate the size of this class
* \param bin The number of bin
* \return Size
*/
static int SizeForSpecificBin(int bin);
/*!
* \brief Seirilizing this object to buffer
* \param buffer The destination
*/
void CopyTo(char* buffer) const;
/*!
* \brief Deserilizing this object from buffer
* \param buffer The source
*/
void CopyFrom(const char* buffer);
/*!
* \brief Get bin types
*/
inline BinType bin_type() const { return bin_type_; }
/*!
* \brief Get bin info
*/
inline std::string bin_info() const {
if (bin_type_ == BinType::CategoricalBin) {
return Common::Join(bin_2_categorical_, ":");
} else {
std::stringstream str_buf;
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
str_buf << '[' << min_val_ << ':' << max_val_ << ']';
return str_buf.str();
/*!
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const;
/*!
* \brief Mapping feature value into bin
* \param value
* \return bin for this feature value
*/
inline uint32_t ValueToBin(double value) const;
/*!
* \brief Get the default bin when value is 0
* \return default bin
*/
inline uint32_t GetDefaultBin() const {
return default_bin_;
}
/*!
* \brief Construct feature value to bin mapper according feature values
* \param values (Sampled) values of this feature, Note: not include zero.
* \param num_values number of values.
* \param total_sample_cnt number of total sample count, equal with values.size() + num_zeros
* \param max_bin The maximal number of bin
* \param min_data_in_bin min number of data in one bin
* \param min_split_data
* \param bin_type Type of this bin
* \param use_missing True to enable missing value handle
* \param zero_as_missing True to use zero as missing value
*/
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
bool use_missing, bool zero_as_missing);
/*!
* \brief Use specific number of bin to calculate the size of this class
* \param bin The number of bin
* \return Size
*/
static int SizeForSpecificBin(int bin);
/*!
* \brief Seirilizing this object to buffer
* \param buffer The destination
*/
void CopyTo(char* buffer) const;
/*!
* \brief Deserilizing this object from buffer
* \param buffer The source
*/
void CopyFrom(const char* buffer);
/*!
* \brief Get bin types
*/
inline BinType bin_type() const { return bin_type_; }
/*!
* \brief Get bin info
*/
inline std::string bin_info() const {
if (bin_type_ == BinType::CategoricalBin) {
return Common::Join(bin_2_categorical_, ":");
} else {
std::stringstream str_buf;
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
str_buf << '[' << min_val_ << ':' << max_val_ << ']';
return str_buf.str();
}
}
}
private:
/*! \brief Number of bins */
int num_bin_;
MissingType missing_type_;
/*! \brief Store upper bound for each bin */
std::vector<double> bin_upper_bound_;
/*! \brief True if this feature is trival */
bool is_trival_;
/*! \brief Sparse rate of this bins( num_bin0/num_data ) */
double sparse_rate_;
/*! \brief Type of this bin */
BinType bin_type_;
/*! \brief Mapper from categorical to bin */
std::unordered_map<int, unsigned int> categorical_2_bin_;
/*! \brief Mapper from bin to categorical */
std::vector<int> bin_2_categorical_;
/*! \brief minimal feature vaule */
double min_val_;
/*! \brief maximum feature value */
double max_val_;
/*! \brief bin value of feature value 0 */
uint32_t default_bin_;
};
/*!
* \brief Interface for ordered bin data. Efficient for constructing histograms, especially for sparse bins.
* There are 2 advantages of using ordered bin.
* 1. It groups the data by leaves to improve the cache hit rate.
* 2. It only stores the non-zero bins, which can speed up the histogram construction for sparse features.
* However, it brings additional cost: it needs to re-order the bins after every split, which costs a lot for dense features.
* So we only use ordered bin for sparse situations.
*/
class OrderedBin {
public:
/*! \brief virtual destructor */
virtual ~OrderedBin() {}
/*!
* \brief Initialization logic.
* \param used_indices Marks which data are used: used_indices[i] == true means the i-th data is used
(this logic was built for bagging). NOTE(review): the "use all data" case is presumably
signalled by a null pointer, since a raw pointer has no size() - confirm in implementations.
* \param num_leaves Number of leaves on this iteration
*/
virtual void Init(const char* used_indices, data_size_t num_leaves) = 0;
/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note: not ordered by leaf
* \param hessians Hessians, Note: not ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram by using this bin (overload that takes no hessians)
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note: not ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Split current bin, and perform re-order by leaf
* \param leaf Which leaf to split
* \param right_leaf The new leaf index after performing this split
* \param is_in_leaf is_in_leaf[i] == mark means the i-th data will be on left leaf after split
* \param mark is_in_leaf[i] == mark means the i-th data will be on left leaf after split
*/
virtual void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) = 0;
/*! \brief NOTE(review): presumably the number of non-zero entries stored for the given leaf - confirm in implementations */
virtual data_size_t NonZeroCount(int leaf) const = 0;
};
/*! \brief Iterator for one bin column */
class BinIterator {
public:
/*!
* \brief Get bin data on specific row index
* \param idx Index of this data
* \return Bin data
*/
virtual uint32_t Get(data_size_t idx) = 0;
/*!
* \brief NOTE(review): semantics are not visible in this header; presumably returns the stored
* bin for idx without the [min_bin, max_bin] / default_bin adjustment of Get - confirm.
*/
virtual uint32_t RawGet(data_size_t idx) = 0;
/*! \brief NOTE(review): presumably repositions the iterator at row idx - confirm in implementations */
virtual void Reset(data_size_t idx) = 0;
/*! \brief virtual destructor, so iterators can be deleted through a BinIterator pointer */
virtual ~BinIterator() = default;
};
/*!
* \brief Interface for bin data. This class will store bin data for one feature.
* unlike OrderedBin, this class will store data by original order.
* Note that it may cause cache misses when construct histogram,
* but it doesn't need to re-order operation, So it will be faster than OrderedBin for dense feature
*/
class Bin {
public:
/*! \brief virtual destructor */
virtual ~Bin() {}
/*!
* \brief Push one record
* \pram tid Thread id
* \param idx Index of record
* \param value bin value of record
*/
virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
/*!
* \brief Get bin iterator of this bin for specific feature
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \return Iterator of this bin
*/
virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const = 0;
/*!
* \brief Save binary data to file
* \param file File want to write
*/
virtual void SaveBinaryToFile(FILE* file) const = 0;
/*!
* \brief Load from memory
* \param memory
* \param local_used_indices
*/
virtual void LoadFromMemory(const void* memory,
const std::vector<data_size_t>& local_used_indices) = 0;
/*!
* \brief Get sizes in byte of this object
*/
virtual size_t SizesInByte() const = 0;
/*! \brief Number of all data */
virtual data_size_t num_data() const = 0;
virtual void ReSize(data_size_t num_data) = 0;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
* \param out Output Result
*/
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param out Output Result
*/
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin defualt bin if bin not in [min_bin, max_bin]
* \param missing_type missing type
* \param default_left missing bin will go to left child
* \param threshold The split threshold.
* \param data_indices Used data indices. After called this function. The less than or equal data indices will store on this object.
* \param num_data Number of used data
* \param lte_indices After called this function. The less or equal data indices will store on this object.
* \param gt_indices After called this function. The greater data indices will store on this object.
* \param bin_type type of bin
* \return The number of less than or equal data.
*/
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;
/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual OrderedBin* CreateOrderedBin() const = 0;
/*!
* \brief After pushed all feature data, call this could have better refactor for bin data
*/
virtual void FinishLoad() = 0;
private:
/*! \brief Number of bins */
int num_bin_;
MissingType missing_type_;
/*! \brief Store upper bound for each bin */
std::vector<double> bin_upper_bound_;
/*! \brief True if this feature is trival */
bool is_trival_;
/*! \brief Sparse rate of this bins( num_bin0/num_data ) */
double sparse_rate_;
/*! \brief Type of this bin */
BinType bin_type_;
/*! \brief Mapper from categorical to bin */
std::unordered_map<int, unsigned int> categorical_2_bin_;
/*! \brief Mapper from bin to categorical */
std::vector<int> bin_2_categorical_;
/*! \brief minimal feature vaule */
double min_val_;
/*! \brief maximum feature value */
double max_val_;
/*! \brief bin value of feature value 0 */
uint32_t default_bin_;
};
/*!
* \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
* \param num_data Total number of data
* \param num_bin Number of bin
* \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
* \param is_sparse Will set to true if this bin is sparse
* \param default_bin Default bin for zeros value
* \return The bin data object
* \brief Interface for ordered bin data. efficient for construct histogram, especially for sparse bin
* There are 2 advantages by using ordered bin.
* 1. group the data by leafs to improve the cache hit.
* 2. only store the non-zero bin, which can speed up the histogram consturction for sparse features.
* However it brings additional cost: it need re-order the bins after every split, which will cost much for dense feature.
* So we only using ordered bin for sparse situations.
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse);
class OrderedBin {
public:
/*! \brief virtual destructor */
virtual ~OrderedBin() {}
/*!
* \brief Initialization logic.
* \param used_indices Marks which data are used: used_indices[i] == true means the i-th data is used
(this logic was built for bagging). NOTE(review): the "use all data" case is presumably
signalled by a null pointer, since a raw pointer has no size() - confirm in implementations.
* \param num_leaves Number of leaves on this iteration
*/
virtual void Init(const char* used_indices, data_size_t num_leaves) = 0;
/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note: not ordered by leaf
* \param hessians Hessians, Note: not ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram by using this bin (overload that takes no hessians)
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note: not ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Split current bin, and perform re-order by leaf
* \param leaf Which leaf to split
* \param right_leaf The new leaf index after performing this split
* \param is_in_leaf is_in_leaf[i] == mark means the i-th data will be on left leaf after split
* \param mark is_in_leaf[i] == mark means the i-th data will be on left leaf after split
*/
virtual void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) = 0;
/*! \brief NOTE(review): presumably the number of non-zero entries stored for the given leaf - confirm in implementations */
virtual data_size_t NonZeroCount(int leaf) const = 0;
};
/*! \brief Iterator for one bin column */
class BinIterator {
public:
/*!
* \brief Get bin data on specific row index
* \param idx Index of this data
* \return Bin data
*/
virtual uint32_t Get(data_size_t idx) = 0;
/*!
* \brief NOTE(review): semantics are not visible in this header; presumably returns the stored
* bin for idx without the [min_bin, max_bin] / default_bin adjustment of Get - confirm.
*/
virtual uint32_t RawGet(data_size_t idx) = 0;
/*! \brief NOTE(review): presumably repositions the iterator at row idx - confirm in implementations */
virtual void Reset(data_size_t idx) = 0;
/*! \brief virtual destructor, so iterators can be deleted through a BinIterator pointer */
virtual ~BinIterator() = default;
};
/*!
* \brief Create object for bin data of one feature, used for dense feature
* \param num_data Total number of data
* \param num_bin Number of bin
* \return The bin data object
* \brief Interface for bin data. This class will store bin data for one feature.
* unlike OrderedBin, this class will store data by original order.
* Note that it may cause cache misses when construct histogram,
* but it doesn't need to re-order operation, So it will be faster than OrderedBin for dense feature
*/
static Bin* CreateDenseBin(data_size_t num_data, int num_bin);
/*!
* \brief Create object for bin data of one feature, used for sparse feature
* \param num_data Total number of data
* \param num_bin Number of bin
* \return The bin data object
*/
static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
};
inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) {
return num_bin_ - 1;
} else {
value = 0.0f;
}
}
if (bin_type_ == BinType::NumericalBin) {
// binary search to find bin
int l = 0;
int r = num_bin_ - 1;
if (missing_type_ == MissingType::NaN) {
r -= 1;
}
while (l < r) {
int m = (r + l - 1) / 2;
if (value <= bin_upper_bound_[m]) {
r = m;
class Bin {
public:
/*! \brief virtual destructor */
virtual ~Bin() {}
/*!
* \brief Push one record
* \param tid Thread id
* \param idx Index of record
* \param value bin value of record
*/
virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;
/*!
* \brief NOTE(review): presumably fills this bin with the rows of full_bin selected by used_indices - confirm
* \param full_bin Source bin
* \param used_indices Row indices to take from full_bin
* \param num_used_indices Number of entries in used_indices
*/
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
/*!
* \brief Get bin iterator of this bin for specific feature
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \return Iterator of this bin
*/
virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const = 0;
/*!
* \brief Save binary data to file
* \param file File to write to
*/
virtual void SaveBinaryToFile(FILE* file) const = 0;
/*!
* \brief Load from memory
* \param memory
* \param local_used_indices
*/
virtual void LoadFromMemory(const void* memory,
const std::vector<data_size_t>& local_used_indices) = 0;
/*!
* \brief Get sizes in byte of this object
*/
virtual size_t SizesInByte() const = 0;
/*! \brief Number of all data */
virtual data_size_t num_data() const = 0;
/*! \brief NOTE(review): presumably resizes the storage to hold num_data records - confirm in implementations */
virtual void ReSize(data_size_t num_data) = 0;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
* \param out Output Result
*/
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
/*!
* \brief Overload without data_indices.
* NOTE(review): presumably covers rows [0, num_data) in their stored order - confirm in implementations.
*/
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram of this feature (overload that takes no hessians),
* Note: We use ordered_gradients to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients.
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param out Output Result
*/
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Overload without data_indices and without hessians.
* NOTE(review): presumably covers rows [0, num_data) in their stored order - confirm in implementations.
*/
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \param missing_type missing type
* \param default_left missing bin will go to left child
* \param threshold The split threshold.
* \param data_indices Used data indices. After calling this function, the less than or equal data indices will be stored on this object.
* \param num_data Number of used data
* \param lte_indices After calling this function, the less or equal data indices will be stored on this object.
* \param gt_indices After calling this function, the greater data indices will be stored on this object.
* \param bin_type type of bin
* \return The number of less than or equal data.
*/
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;
/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual OrderedBin* CreateOrderedBin() const = 0;
/*!
* \brief Call this after all feature data has been pushed; gives the bin a chance to finalize its storage
*/
virtual void FinishLoad() = 0;
/*!
* \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
* \param num_data Total number of data
* \param num_bin Number of bin
* \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
* \param is_sparse Will set to true if this bin is sparse
* \return The bin data object
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse);
/*!
* \brief Create object for bin data of one feature, used for dense feature
* \param num_data Total number of data
* \param num_bin Number of bin
* \return The bin data object
*/
static Bin* CreateDenseBin(data_size_t num_data, int num_bin);
/*!
* \brief Create object for bin data of one feature, used for sparse feature
* \param num_data Total number of data
* \param num_bin Number of bin
* \return The bin data object
*/
static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
};
inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) {
return num_bin_ - 1;
} else {
l = m + 1;
value = 0.0f;
}
}
return l;
} else {
int int_value = static_cast<int>(value);
if (categorical_2_bin_.count(int_value)) {
return categorical_2_bin_.at(int_value);
if (bin_type_ == BinType::NumericalBin) {
// binary search to find bin
int l = 0;
int r = num_bin_ - 1;
if (missing_type_ == MissingType::NaN) {
r -= 1;
}
while (l < r) {
int m = (r + l - 1) / 2;
if (value <= bin_upper_bound_[m]) {
r = m;
} else {
l = m + 1;
}
}
return l;
} else {
return num_bin_ - 1;
int int_value = static_cast<int>(value);
// convert negative value to NaN bin
if (int_value < 0) {
return num_bin_ - 1;
}
if (categorical_2_bin_.count(int_value)) {
return categorical_2_bin_.at(int_value);
} else {
return num_bin_ - 1;
}
}
}
}
} // namespace LightGBM
......
......@@ -409,7 +409,8 @@ inline void Tree::TreeSHAP(const double *feature_values, double *phi,
// internal node
} else {
const int hot_index = Decision(feature_values[split_index], node);
const int hot_index =
decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](feature_values[split_index], threshold_[node]);
const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
const double w = data_count(node);
const double hot_zero_fraction = data_count(hot_index)/w;
......
......@@ -16,461 +16,500 @@
namespace LightGBM {
// Default constructor: does nothing. The scalar members are declared without
// in-class initializers, so they hold no defined value until they are filled in
// later (NOTE(review): presumably by FindBin or CopyFrom - confirm at call sites).
BinMapper::BinMapper() {
}
// deep copy function for BinMapper
BinMapper::BinMapper(const BinMapper& other) {
num_bin_ = other.num_bin_;
missing_type_ = other.missing_type_;
is_trival_ = other.is_trival_;
sparse_rate_ = other.sparse_rate_;
bin_type_ = other.bin_type_;
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = other.bin_upper_bound_;
} else {
bin_2_categorical_ = other.bin_2_categorical_;
categorical_2_bin_ = other.categorical_2_bin_;
// Default constructor: does nothing. The scalar members are declared without
// in-class initializers, so they hold no defined value until they are filled in
// later (NOTE(review): presumably by FindBin or CopyFrom - confirm at call sites).
BinMapper::BinMapper() {
}
min_val_ = other.min_val_;
max_val_ = other.max_val_;
default_bin_ = other.default_bin_;
}
// Builds a mapper from a serialized buffer by handing the bytes to CopyFrom.
BinMapper::BinMapper(const void* memory) {
  // CopyFrom takes a byte pointer; view the untyped buffer as const char*.
  const char* buffer = static_cast<const char*>(memory);
  CopyFrom(buffer);
}
// Copy constructor: duplicates every scalar field of `other` and only the
// lookup tables that belong to its bin type.
BinMapper::BinMapper(const BinMapper& other) {
  // The bin type decides which tables are copied below, so take it first.
  bin_type_ = other.bin_type_;
  num_bin_ = other.num_bin_;
  missing_type_ = other.missing_type_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
  min_val_ = other.min_val_;
  max_val_ = other.max_val_;
  default_bin_ = other.default_bin_;

  const bool is_numerical = (bin_type_ == BinType::NumericalBin);
  if (!is_numerical) {
    // Categorical: copy both directions of the category <-> bin mapping.
    bin_2_categorical_ = other.bin_2_categorical_;
    categorical_2_bin_ = other.categorical_2_bin_;
    return;
  }
  // Numerical: only the per-bin upper bounds are needed.
  bin_upper_bound_ = other.bin_upper_bound_;
}
BinMapper::~BinMapper() {
// Builds a mapper from a serialized buffer by handing the bytes to CopyFrom.
BinMapper::BinMapper(const void* memory) {
  // CopyFrom takes a byte pointer; view the untyped buffer as const char*.
  const char* buffer = static_cast<const char*>(memory);
  CopyFrom(buffer);
}
}
BinMapper::~BinMapper() {
bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
if (bin_type == BinType::NumericalBin) {
int sum_left = 0;
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
sum_left += cnt_in_bin[i];
if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
return false;
}
bool NeedFilter(const std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
if (bin_type == BinType::NumericalBin) {
int sum_left = 0;
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
sum_left += cnt_in_bin[i];
if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
return false;
}
}
}
} else {
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
int sum_left = cnt_in_bin[i];
if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
} else {
if (cnt_in_bin.size() <= 2) {
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
int sum_left = cnt_in_bin[i];
if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
return false;
}
}
} else {
return false;
}
}
return true;
}
return true;
}
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, size_t total_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
if (num_distinct_values <= max_bin) {
bin_upper_bound.clear();
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
cur_cnt_inbin += counts[i];
if (cur_cnt_inbin >= min_data_in_bin) {
bin_upper_bound.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
cur_cnt_inbin = 0;
}
}
cur_cnt_inbin += counts[num_distinct_values - 1];
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
} else {
if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
}
double mean_bin_size = static_cast<double>(total_cnt) / max_bin;
// mean size for one bin
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(total_cnt);
std::vector<bool> is_big_count_value(num_distinct_values, false);
for (int i = 0; i < num_distinct_values; ++i) {
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
rest_sample_cnt -= counts[i];
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, size_t total_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
if (num_distinct_values <= max_bin) {
bin_upper_bound.clear();
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
cur_cnt_inbin += counts[i];
if (cur_cnt_inbin >= min_data_in_bin) {
bin_upper_bound.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
cur_cnt_inbin = 0;
}
}
}
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
cur_cnt_inbin += counts[num_distinct_values - 1];
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
} else {
if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
}
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
double mean_bin_size = static_cast<double>(total_cnt) / max_bin;
// mean size for one bin
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(total_cnt);
std::vector<bool> is_big_count_value(num_distinct_values, false);
for (int i = 0; i < num_distinct_values; ++i) {
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
rest_sample_cnt -= counts[i];
}
}
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
}
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
}
}
++bin_cnt;
// update bin upper bound
bin_upper_bound.resize(bin_cnt);
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
}
// last bin upper bound
bin_upper_bound[bin_cnt - 1] = std::numeric_limits<double>::infinity();
}
++bin_cnt;
// update bin upper bound
bin_upper_bound.resize(bin_cnt);
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
}
// last bin upper bound
bin_upper_bound[bin_cnt - 1] = std::numeric_limits<double>::infinity();
return bin_upper_bound;
}
return bin_upper_bound;
}
std::vector<double> FindBinWithZeroAsMissing(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
int left_cnt_data = 0;
int cnt_missing = 0;
int right_cnt_data = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] <= -kZeroAsMissingValueRange) {
left_cnt_data += counts[i];
} else if (distinct_values[i] > kZeroAsMissingValueRange) {
right_cnt_data += counts[i];
} else {
cnt_missing += counts[i];
std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
int left_cnt_data = 0;
int cnt_zero = 0;
int right_cnt_data = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] <= -kZeroAsMissingValueRange) {
left_cnt_data += counts[i];
} else if (distinct_values[i] > kZeroAsMissingValueRange) {
right_cnt_data += counts[i];
} else {
cnt_zero += counts[i];
}
}
}
int left_cnt = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kZeroAsMissingValueRange) {
left_cnt = i;
break;
int left_cnt = -1;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kZeroAsMissingValueRange) {
left_cnt = i;
break;
}
}
}
if (left_cnt > 0) {
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_missing) * (max_bin - 1));
bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
bin_upper_bound.back() = -kZeroAsMissingValueRange;
}
if (left_cnt < 0) {
left_cnt = num_distinct_values;
}
int right_start = -1;
for (int i = left_cnt; i < num_distinct_values; ++i) {
if (distinct_values[i] > kZeroAsMissingValueRange) {
right_start = i;
break;
if (left_cnt > 0) {
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
bin_upper_bound.back() = -kZeroAsMissingValueRange;
}
}
if (right_start >= 0) {
int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
bin_upper_bound.push_back(kZeroAsMissingValueRange);
bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
} else {
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}
return bin_upper_bound;
}
void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
int na_cnt = 0;
int tmp_num_sample_values = 0;
for (int i = 0; i < num_sample_values; ++i) {
if (!std::isnan(values[i])) {
values[tmp_num_sample_values++] = values[i];
int right_start = -1;
for (int i = left_cnt; i < num_distinct_values; ++i) {
if (distinct_values[i] > kZeroAsMissingValueRange) {
right_start = i;
break;
}
}
if (right_start >= 0) {
int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
bin_upper_bound.push_back(kZeroAsMissingValueRange);
bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
} else {
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}
return bin_upper_bound;
}
if (!use_missing) {
missing_type_ = MissingType::None;
} else if (zero_as_missing) {
missing_type_ = MissingType::Zero;
} else {
if (tmp_num_sample_values == num_sample_values) {
void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
int na_cnt = 0;
int tmp_num_sample_values = 0;
for (int i = 0; i < num_sample_values; ++i) {
if (!std::isnan(values[i])) {
values[tmp_num_sample_values++] = values[i];
}
}
if (!use_missing) {
missing_type_ = MissingType::None;
} else if (zero_as_missing) {
missing_type_ = MissingType::Zero;
} else {
missing_type_ = MissingType::NaN;
if (tmp_num_sample_values == num_sample_values) {
missing_type_ = MissingType::None;
} else {
missing_type_ = MissingType::NaN;
na_cnt = num_sample_values - tmp_num_sample_values;
}
}
na_cnt = num_sample_values - tmp_num_sample_values;
}
num_sample_values = tmp_num_sample_values;
num_sample_values = tmp_num_sample_values;
bin_type_ = bin_type;
default_bin_ = 0;
int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values - na_cnt);
// find distinct_values first
std::vector<double> distinct_values;
std::vector<int> counts;
bin_type_ = bin_type;
default_bin_ = 0;
int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values - na_cnt);
// find distinct_values first
std::vector<double> distinct_values;
std::vector<int> counts;
std::sort(values, values + num_sample_values);
std::sort(values, values + num_sample_values);
// push zero in the front
if (num_sample_values == 0 || (values[0] > 0.0f && zero_cnt > 0)) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
// push zero in the front
if (num_sample_values == 0 || (values[0] > 0.0f && zero_cnt > 0)) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
if (num_sample_values > 0) {
distinct_values.push_back(values[0]);
counts.push_back(1);
}
if (num_sample_values > 0) {
distinct_values.push_back(values[0]);
counts.push_back(1);
}
for (int i = 1; i < num_sample_values; ++i) {
if (values[i] != values[i - 1]) {
if (values[i - 1] < 0.0f && values[i] > 0.0f) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
for (int i = 1; i < num_sample_values; ++i) {
if (values[i] != values[i - 1]) {
if (values[i - 1] < 0.0f && values[i] > 0.0f) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
distinct_values.push_back(values[i]);
counts.push_back(1);
} else {
++counts.back();
}
distinct_values.push_back(values[i]);
counts.push_back(1);
} else {
++counts.back();
}
}
// push zero in the back
if (num_sample_values > 0 && values[num_sample_values - 1] < 0.0f && zero_cnt > 0) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
min_val_ = distinct_values.front();
max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin;
int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (missing_type_ == MissingType::Zero) {
bin_upper_bound_ = FindBinWithZeroAsMissing(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
if (bin_upper_bound_.size() == 2) {
missing_type_ = MissingType::None;
// push zero in the back
if (num_sample_values > 0 && values[num_sample_values - 1] < 0.0f && zero_cnt > 0) {
distinct_values.push_back(0.0f);
counts.push_back(zero_cnt);
}
min_val_ = distinct_values.front();
max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin;
int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (missing_type_ == MissingType::Zero) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
if (bin_upper_bound_.size() == 2) {
missing_type_ = MissingType::None;
}
} else if (missing_type_ == MissingType::None) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
} else {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
bin_upper_bound_.push_back(NaN);
}
num_bin_ = static_cast<int>(bin_upper_bound_.size());
{
cnt_in_bin.resize(num_bin_, 0);
int i_bin = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > bin_upper_bound_[i_bin]) {
++i_bin;
}
cnt_in_bin[i_bin] += counts[i];
}
if (missing_type_ == MissingType::NaN) {
cnt_in_bin[num_bin_ - 1] = na_cnt;
}
}
} else if (missing_type_ == MissingType::None) {
bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
CHECK(num_bin_ <= max_bin);
} else {
bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
bin_upper_bound_.push_back(NaN);
}
num_bin_ = static_cast<int>(bin_upper_bound_.size());
{
cnt_in_bin.resize(num_bin_, 0);
int i_bin = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > bin_upper_bound_[i_bin]) {
++i_bin;
}
cnt_in_bin[i_bin] += counts[i];
// convert to int type first
std::vector<int> distinct_values_int;
std::vector<int> counts_int;
distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
counts_int.push_back(counts[0]);
for (size_t i = 1; i < distinct_values.size(); ++i) {
if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
counts_int.push_back(counts[i]);
} else {
counts_int.back() += counts[i];
}
}
if (missing_type_ == MissingType::NaN) {
cnt_in_bin[num_bin_ - 1] = na_cnt;
// sort by counts
Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
// avoid first bin is zero
if (distinct_values_int[0] == 0 && counts_int.size() > 1) {
std::swap(counts_int[0], counts_int[1]);
std::swap(distinct_values_int[0], distinct_values_int[1]);
}
}
CHECK(num_bin_ <= max_bin);
} else {
// No missing handle for categorical features
missing_type_ = MissingType::None;
// convert to int type first
std::vector<int> distinct_values_int;
std::vector<int> counts_int;
distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
counts_int.push_back(counts[0]);
for (size_t i = 1; i < distinct_values.size(); ++i) {
if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
counts_int.push_back(counts[i]);
// will ignore the categorical of small counts
int cut_cnt = static_cast<int>((total_sample_cnt - na_cnt) * 0.99f);
size_t cur_cat = 0;
categorical_2_bin_.clear();
bin_2_categorical_.clear();
num_bin_ = 0;
int used_cnt = 0;
max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
cnt_in_bin.clear();
while (cur_cat < distinct_values_int.size()
&& (used_cnt < cut_cnt || num_bin_ < max_bin)) {
if (distinct_values_int[cur_cat] < 0) {
na_cnt += counts_int[cur_cat];
cut_cnt -= counts_int[cur_cat];
Log::Warning("Met negative value in categorical features, will convert it to NaN");
} else {
bin_2_categorical_.push_back(distinct_values_int[cur_cat]);
categorical_2_bin_[distinct_values_int[cur_cat]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[cur_cat];
cnt_in_bin.push_back(counts_int[cur_cat]);
++num_bin_;
}
++cur_cat;
}
// need an additional bin for NaN
if (cur_cat == distinct_values_int.size() && na_cnt > 0) {
// use -1 to represent NaN
bin_2_categorical_.push_back(-1);
categorical_2_bin_[-1] = num_bin_;
cnt_in_bin.push_back(0);
++num_bin_;
}
// Use MissingType::None to represent this bin contains all categoricals
if (cur_cat == distinct_values_int.size() && na_cnt == 0) {
missing_type_ = MissingType::None;
} else if (na_cnt == 0) {
missing_type_ = MissingType::Zero;
} else {
counts_int.back() += counts[i];
missing_type_ = MissingType::NaN;
}
cnt_in_bin.back() += static_cast<int>(total_sample_cnt - used_cnt);
}
// sort by counts
Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
// will ignore the categorical of small counts
const int cut_cnt = static_cast<int>(total_sample_cnt * 0.98f);
categorical_2_bin_.clear();
bin_2_categorical_.clear();
num_bin_ = 0;
int used_cnt = 0;
max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
while (used_cnt < cut_cnt || num_bin_ < max_bin) {
bin_2_categorical_.push_back(distinct_values_int[num_bin_]);
categorical_2_bin_[distinct_values_int[num_bin_]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[num_bin_];
++num_bin_;
// check trival(num_bin_ == 1) feature
if (num_bin_ <= 1) {
is_trival_ = true;
} else {
is_trival_ = false;
}
// check useless bin
if (!is_trival_ && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
is_trival_ = true;
}
cnt_in_bin = counts_int;
counts_int.resize(num_bin_);
counts_int.back() += static_cast<int>(total_sample_cnt - used_cnt);
}
// check trival(num_bin_ == 1) feature
if (num_bin_ <= 1) {
is_trival_ = true;
} else {
is_trival_ = false;
}
// check useless bin
if (!is_trival_ && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
is_trival_ = true;
if (!is_trival_) {
default_bin_ = ValueToBin(0);
if (bin_type_ == BinType::CategoricalBin) {
CHECK(default_bin_ > 0);
}
}
// calculate sparse rate
sparse_rate_ = static_cast<double>(cnt_in_bin[default_bin_]) / static_cast<double>(total_sample_cnt);
}
if (!is_trival_) {
default_bin_ = ValueToBin(0);
int BinMapper::SizeForSpecificBin(int bin) {
int size = 0;
size += sizeof(int);
size += sizeof(MissingType);
size += sizeof(bool);
size += sizeof(double);
size += sizeof(BinType);
size += 2 * sizeof(double);
size += bin * sizeof(double);
size += sizeof(uint32_t);
return size;
}
// calculate sparse rate
sparse_rate_ = static_cast<double>(cnt_in_bin[default_bin_]) / static_cast<double>(total_sample_cnt);
}
/*!
 * \brief Byte size computed for a mapper holding \a bin bins
 *        (presumably the serialized size of a numerical mapper - confirm against CopyTo)
 * \param bin Number of bins
 * \return Fixed-size part plus one double per bin
 */
int BinMapper::SizeForSpecificBin(int bin) {
  // fixed-size part: int, MissingType, bool, double, BinType, two doubles, uint32_t
  const size_t fixed_bytes = sizeof(int) + sizeof(MissingType) + sizeof(bool) + sizeof(double)
    + sizeof(BinType) + 2 * sizeof(double) + sizeof(uint32_t);
  // variable part: one double per bin
  const size_t table_bytes = bin * sizeof(double);
  return static_cast<int>(fixed_bytes + table_bytes);
}
void BinMapper::CopyTo(char * buffer) const {
std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
buffer += sizeof(num_bin_);
std::memcpy(buffer, &missing_type_, sizeof(missing_type_));
buffer += sizeof(missing_type_);
std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
buffer += sizeof(is_trival_);
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(buffer, &min_val_, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(buffer, &max_val_, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(buffer, &default_bin_, sizeof(default_bin_));
buffer += sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else {
std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
/*!
 * \brief Serialize this mapper into \a buffer as raw bytes.
 *        Field order: num_bin_, missing_type_, is_trival_, sparse_rate_, bin_type_,
 *        min_val_, max_val_, default_bin_, then num_bin_ entries of the bin table
 *        (doubles for numerical bins, ints otherwise).
 * \param buffer Destination; no bounds check is done here
 *        (presumably the caller sizes it with SizesInByte() - confirm)
 */
void BinMapper::CopyTo(char * buffer) const {
  char* cursor = buffer;
  // copy len bytes from src to the write position, then advance it
  auto put = [&cursor](const void* src, size_t len) {
    std::memcpy(cursor, src, len);
    cursor += len;
  };
  // fixed-size fields
  put(&num_bin_, sizeof(num_bin_));
  put(&missing_type_, sizeof(missing_type_));
  put(&is_trival_, sizeof(is_trival_));
  put(&sparse_rate_, sizeof(sparse_rate_));
  put(&bin_type_, sizeof(bin_type_));
  put(&min_val_, sizeof(min_val_));
  put(&max_val_, sizeof(max_val_));
  put(&default_bin_, sizeof(default_bin_));
  // bin table
  if (bin_type_ != BinType::NumericalBin) {
    put(bin_2_categorical_.data(), num_bin_ * sizeof(int));
  } else {
    put(bin_upper_bound_.data(), num_bin_ * sizeof(double));
  }
}
}
void BinMapper::CopyFrom(const char * buffer) {
std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
buffer += sizeof(num_bin_);
std::memcpy(&missing_type_, buffer, sizeof(missing_type_));
buffer += sizeof(missing_type_);
std::memcpy(&is_trival_, buffer, sizeof(is_trival_));
buffer += sizeof(is_trival_);
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
buffer += sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
} else {
bin_2_categorical_ = std::vector<int>(num_bin_);
std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
categorical_2_bin_.clear();
for (int i = 0; i < num_bin_; ++i) {
categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
/*!
 * \brief Restore this mapper from raw bytes, reading fields in the order
 *        num_bin_, missing_type_, is_trival_, sparse_rate_, bin_type_,
 *        min_val_, max_val_, default_bin_, then num_bin_ bin-table entries.
 *        For non-numerical bins the category -> bin map is rebuilt from the table.
 * \param buffer Source bytes; no bounds or sanity check is done here
 */
void BinMapper::CopyFrom(const char * buffer) {
  const char* cursor = buffer;
  // copy len bytes from the read position into dst, then advance it
  auto take = [&cursor](void* dst, size_t len) {
    std::memcpy(dst, cursor, len);
    cursor += len;
  };
  // fixed-size fields
  take(&num_bin_, sizeof(num_bin_));
  take(&missing_type_, sizeof(missing_type_));
  take(&is_trival_, sizeof(is_trival_));
  take(&sparse_rate_, sizeof(sparse_rate_));
  take(&bin_type_, sizeof(bin_type_));
  take(&min_val_, sizeof(min_val_));
  take(&max_val_, sizeof(max_val_));
  take(&default_bin_, sizeof(default_bin_));
  // bin table
  if (bin_type_ != BinType::NumericalBin) {
    bin_2_categorical_ = std::vector<int>(num_bin_);
    take(bin_2_categorical_.data(), num_bin_ * sizeof(int));
    // rebuild the reverse lookup: category value -> bin index
    categorical_2_bin_.clear();
    for (int bin = 0; bin < num_bin_; ++bin) {
      categorical_2_bin_[bin_2_categorical_[bin]] = static_cast<unsigned int>(bin);
    }
  } else {
    bin_upper_bound_ = std::vector<double>(num_bin_);
    take(bin_upper_bound_.data(), num_bin_ * sizeof(double));
  }
}
}
void BinMapper::SaveBinaryToFile(FILE* file) const {
fwrite(&num_bin_, sizeof(num_bin_), 1, file);
fwrite(&missing_type_, sizeof(missing_type_), 1, file);
fwrite(&is_trival_, sizeof(is_trival_), 1, file);
fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
fwrite(&bin_type_, sizeof(bin_type_), 1, file);
fwrite(&min_val_, sizeof(min_val_), 1, file);
fwrite(&max_val_, sizeof(max_val_), 1, file);
fwrite(&default_bin_, sizeof(default_bin_), 1, file);
if (bin_type_ == BinType::NumericalBin) {
fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
} else {
fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
/*!
 * \brief Write this mapper to \a file in binary form: the fixed-size fields first,
 *        then num_bin_ bin-table entries (doubles for numerical bins, ints otherwise).
 *        The results of the fwrite calls are not checked.
 * \param file Output stream to write to
 */
void BinMapper::SaveBinaryToFile(FILE* file) const {
  // thin wrapper so every field goes through the same call shape
  auto emit = [file](const void* data, size_t elem_size, size_t count) {
    fwrite(data, elem_size, count, file);
  };
  // fixed-size fields
  emit(&num_bin_, sizeof(num_bin_), 1);
  emit(&missing_type_, sizeof(missing_type_), 1);
  emit(&is_trival_, sizeof(is_trival_), 1);
  emit(&sparse_rate_, sizeof(sparse_rate_), 1);
  emit(&bin_type_, sizeof(bin_type_), 1);
  emit(&min_val_, sizeof(min_val_), 1);
  emit(&max_val_, sizeof(max_val_), 1);
  emit(&default_bin_, sizeof(default_bin_), 1);
  // bin table
  if (bin_type_ != BinType::NumericalBin) {
    emit(bin_2_categorical_.data(), sizeof(int), num_bin_);
  } else {
    emit(bin_upper_bound_.data(), sizeof(double), num_bin_);
  }
}
}
size_t BinMapper::SizesInByte() const {
size_t ret = sizeof(num_bin_) + sizeof(missing_type_) + sizeof(is_trival_) + sizeof(sparse_rate_)
+ sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
ret += sizeof(double) * num_bin_;
} else {
ret += sizeof(int) * num_bin_;
/*!
 * \brief Number of bytes taken by the fixed-size fields plus the bin table
 * \return Sum of the member sizes and num_bin_ table entries
 *         (sizeof(double) each for numerical bins, sizeof(int) each otherwise)
 */
size_t BinMapper::SizesInByte() const {
  // per-entry size of the bin table depends on the bin type
  const size_t entry_bytes = (bin_type_ == BinType::NumericalBin) ? sizeof(double) : sizeof(int);
  size_t total = sizeof(num_bin_) + sizeof(missing_type_) + sizeof(is_trival_) + sizeof(sparse_rate_)
    + sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
  total += entry_bytes * num_bin_;
  return total;
}
return ret;
}
template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
bool is_enable_sparse, double sparse_threshold, bool* is_sparse) {
// sparse threshold
if (sparse_rate >= sparse_threshold && is_enable_sparse) {
*is_sparse = true;
return CreateSparseBin(num_data, num_bin);
} else {
*is_sparse = false;
return CreateDenseBin(num_data, num_bin);
// Explicit instantiations of the bin containers for the 8-, 16- and 32-bit
// unsigned value types.
// Dense storage
template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;
// Sparse storage
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;
// Ordered sparse storage
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;
/*!
 * \brief Create a bin container, choosing sparse or dense storage
 * \param num_data Number of data points, forwarded to the chosen factory
 * \param num_bin Number of bins, forwarded to the chosen factory
 * \param sparse_rate Sparse rate of the feature
 * \param is_enable_sparse Sparse storage is only chosen when this is true
 * \param sparse_threshold Sparse storage is chosen when sparse_rate >= this value
 * \param is_sparse Output: set to true if sparse storage was chosen, false otherwise
 * \return Newly allocated bin object returned by CreateSparseBin or CreateDenseBin
 */
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
                    bool is_enable_sparse, double sparse_threshold, bool* is_sparse) {
  const bool use_sparse = is_enable_sparse && sparse_rate >= sparse_threshold;
  *is_sparse = use_sparse;
  if (use_sparse) {
    return CreateSparseBin(num_data, num_bin);
  }
  return CreateDenseBin(num_data, num_bin);
}
}
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 16) {
return new Dense4bitsBin(num_data);
} else if (num_bin <= 256) {
return new DenseBin<uint8_t>(num_data);
} else if (num_bin <= 65536) {
return new DenseBin<uint16_t>(num_data);
} else {
return new DenseBin<uint32_t>(num_data);
/*!
 * \brief Create a dense bin container whose storage type depends on \a num_bin
 * \param num_data Number of data points, passed to the container constructor
 * \param num_bin Number of bins: <= 16 gives Dense4bitsBin, <= 256 uint8_t,
 *        <= 65536 uint16_t, anything larger uint32_t
 * \return Newly allocated bin object
 */
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
  if (num_bin <= 16) {
    return new Dense4bitsBin(num_data);
  }
  if (num_bin <= 256) {
    return new DenseBin<uint8_t>(num_data);
  }
  if (num_bin <= 65536) {
    return new DenseBin<uint16_t>(num_data);
  }
  return new DenseBin<uint32_t>(num_data);
}
}
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
if (num_bin <= 256) {
return new SparseBin<uint8_t>(num_data);
} else if (num_bin <= 65536) {
return new SparseBin<uint16_t>(num_data);
} else {
return new SparseBin<uint32_t>(num_data);
/*!
 * \brief Create a sparse bin container whose value type depends on \a num_bin
 * \param num_data Number of data points, passed to the container constructor
 * \param num_bin Number of bins: <= 256 gives uint8_t, <= 65536 uint16_t,
 *        anything larger uint32_t
 * \return Newly allocated bin object
 */
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
  if (num_bin <= 256) {
    return new SparseBin<uint8_t>(num_data);
  }
  if (num_bin <= 65536) {
    return new SparseBin<uint16_t>(num_data);
  }
  return new SparseBin<uint32_t>(num_data);
}
}
} // namespace LightGBM
......@@ -239,7 +239,7 @@ void OverallConfig::CheckParamConflict() {
}
// Check max_depth and num_leaves
if (boosting_config.tree_config.max_depth > 0) {
int full_num_leaves = std::pow(2, boosting_config.tree_config.max_depth);
int full_num_leaves = static_cast<int>(std::pow(2, boosting_config.tree_config.max_depth));
if (full_num_leaves > boosting_config.tree_config.num_leaves
&& boosting_config.tree_config.num_leaves == kDefaultNumLeaves) {
Log::Warning("Accuarcy may be bad since you didn't set num_leaves.");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment