Commit c62dcf73 authored by Guolin Ke's avatar Guolin Ke
Browse files

fix merge bugs.

parent 7a82ba4f
...@@ -12,458 +12,462 @@ ...@@ -12,458 +12,462 @@
namespace LightGBM { namespace LightGBM {
enum BinType { enum BinType {
NumericalBin, NumericalBin,
CategoricalBin CategoricalBin
}; };
enum MissingType { enum MissingType {
None, None,
Zero, Zero,
NaN NaN
}; };
/*! \brief Store data for one histogram bin */ /*! \brief Store data for one histogram bin */
struct HistogramBinEntry { struct HistogramBinEntry {
public: public:
/*! \brief Sum of gradients on this bin */ /*! \brief Sum of gradients on this bin */
double sum_gradients = 0.0f; double sum_gradients = 0.0f;
/*! \brief Sum of hessians on this bin */ /*! \brief Sum of hessians on this bin */
double sum_hessians = 0.0f; double sum_hessians = 0.0f;
/*! \brief Number of data on this bin */ /*! \brief Number of data on this bin */
data_size_t cnt = 0; data_size_t cnt = 0;
/*! /*!
* \brief Sum up (reducers) functions for histogram bin * \brief Sum up (reducers) functions for histogram bin
*/ */
inline static void SumReducer(const char *src, char *dst, int len) { inline static void SumReducer(const char *src, char *dst, int len) {
const int type_size = sizeof(HistogramBinEntry); const int type_size = sizeof(HistogramBinEntry);
int used_size = 0; int used_size = 0;
const HistogramBinEntry* p1; const HistogramBinEntry* p1;
HistogramBinEntry* p2; HistogramBinEntry* p2;
while (used_size < len) { while (used_size < len) {
// convert // convert
p1 = reinterpret_cast<const HistogramBinEntry*>(src); p1 = reinterpret_cast<const HistogramBinEntry*>(src);
p2 = reinterpret_cast<HistogramBinEntry*>(dst); p2 = reinterpret_cast<HistogramBinEntry*>(dst);
// add // add
p2->cnt += p1->cnt; p2->cnt += p1->cnt;
p2->sum_gradients += p1->sum_gradients; p2->sum_gradients += p1->sum_gradients;
p2->sum_hessians += p1->sum_hessians; p2->sum_hessians += p1->sum_hessians;
src += type_size; src += type_size;
dst += type_size; dst += type_size;
used_size += type_size; used_size += type_size;
} }
}
};
/*! \brief This class used to convert feature values into bin,
* and store some meta information for bin*/
class BinMapper {
public:
BinMapper();
BinMapper(const BinMapper& other);
explicit BinMapper(const void* memory);
~BinMapper();
bool CheckAlign(const BinMapper& other) const {
if (num_bin_ != other.num_bin_) {
return false;
}
if (missing_type_ != other.missing_type_) {
return false;
} }
if (bin_type_ == BinType::NumericalBin) { };
for (int i = 0; i < num_bin_; ++i) {
if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) { /*! \brief This class used to convert feature values into bin,
return false; * and store some meta information for bin*/
} class BinMapper {
public:
BinMapper();
BinMapper(const BinMapper& other);
explicit BinMapper(const void* memory);
~BinMapper();
bool CheckAlign(const BinMapper& other) const {
if (num_bin_ != other.num_bin_) {
return false;
} }
} else { if (missing_type_ != other.missing_type_) {
for (int i = 0; i < num_bin_; i++) { return false;
if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) { }
return false; if (bin_type_ == BinType::NumericalBin) {
for (int i = 0; i < num_bin_; ++i) {
if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
return false;
}
}
} else {
for (int i = 0; i < num_bin_; i++) {
if (bin_2_categorical_[i] != other.bin_2_categorical_[i]) {
return false;
}
} }
} }
return true;
} }
return true;
}
/*! \brief Get number of bins */ /*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; } inline int num_bin() const { return num_bin_; }
/*! \brief Missing Type */ /*! \brief Missing Type */
inline MissingType missing_type() const { return missing_type_; } inline MissingType missing_type() const { return missing_type_; }
/*! \brief True if bin is trival (contains only one bin) */ /*! \brief True if bin is trival (contains only one bin) */
inline bool is_trival() const { return is_trival_; } inline bool is_trival() const { return is_trival_; }
/*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */ /*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
inline double sparse_rate() const { return sparse_rate_; } inline double sparse_rate() const { return sparse_rate_; }
/*! /*!
* \brief Save binary data to file * \brief Save binary data to file
* \param file File want to write * \param file File want to write
*/ */
void SaveBinaryToFile(FILE* file) const; void SaveBinaryToFile(FILE* file) const;
/*! /*!
* \brief Mapping bin into feature value * \brief Mapping bin into feature value
* \param bin * \param bin
* \return Feature value of this bin * \return Feature value of this bin
*/ */
inline double BinToValue(uint32_t bin) const { inline double BinToValue(uint32_t bin) const {
if (bin_type_ == BinType::NumericalBin) { if (bin_type_ == BinType::NumericalBin) {
return bin_upper_bound_[bin]; return bin_upper_bound_[bin];
} else { } else {
return bin_2_categorical_[bin]; return bin_2_categorical_[bin];
}
} }
} /*!
/*! * \brief Get sizes in byte of this object
* \brief Get sizes in byte of this object */
*/ size_t SizesInByte() const;
size_t SizesInByte() const; /*!
/*! * \brief Mapping feature value into bin
* \brief Mapping feature value into bin * \param value
* \param value * \return bin for this feature value
* \return bin for this feature value */
*/ inline uint32_t ValueToBin(double value) const;
inline uint32_t ValueToBin(double value) const;
/*!
/*! * \brief Get the default bin when value is 0
* \brief Get the default bin when value is 0 * \return default bin
* \return default bin */
*/ inline uint32_t GetDefaultBin() const {
inline uint32_t GetDefaultBin() const { return default_bin_;
return default_bin_; }
} /*!
/*! * \brief Construct feature value to bin mapper according feature values
* \brief Construct feature value to bin mapper according feature values * \param values (Sampled) values of this feature, Note: not include zero.
* \param values (Sampled) values of this feature, Note: not include zero. * \param num_values number of values.
* \param num_values number of values. * \param total_sample_cnt number of total sample count, equal with values.size() + num_zeros
* \param total_sample_cnt number of total sample count, equal with values.size() + num_zeros * \param max_bin The maximal number of bin
* \param max_bin The maximal number of bin * \param min_data_in_bin min number of data in one bin
* \param min_data_in_bin min number of data in one bin * \param min_split_data
* \param min_split_data * \param bin_type Type of this bin
* \param bin_type Type of this bin * \param use_missing True to enable missing value handle
* \param use_missing True to enable missing value handle * \param zero_as_missing True to use zero as missing value
* \param zero_as_missing True to use zero as missing value */
*/ void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing);
bool use_missing, bool zero_as_missing);
/*!
/*! * \brief Use specific number of bin to calculate the size of this class
* \brief Use specific number of bin to calculate the size of this class * \param bin The number of bin
* \param bin The number of bin * \return Size
* \return Size */
*/ static int SizeForSpecificBin(int bin);
static int SizeForSpecificBin(int bin);
/*!
/*! * \brief Seirilizing this object to buffer
* \brief Seirilizing this object to buffer * \param buffer The destination
* \param buffer The destination */
*/ void CopyTo(char* buffer) const;
void CopyTo(char* buffer) const;
/*!
/*! * \brief Deserilizing this object from buffer
* \brief Deserilizing this object from buffer * \param buffer The source
* \param buffer The source */
*/ void CopyFrom(const char* buffer);
void CopyFrom(const char* buffer);
/*!
/*! * \brief Get bin types
* \brief Get bin types */
*/ inline BinType bin_type() const { return bin_type_; }
inline BinType bin_type() const { return bin_type_; }
/*!
/*! * \brief Get bin info
* \brief Get bin info */
*/ inline std::string bin_info() const {
inline std::string bin_info() const { if (bin_type_ == BinType::CategoricalBin) {
if (bin_type_ == BinType::CategoricalBin) { return Common::Join(bin_2_categorical_, ":");
return Common::Join(bin_2_categorical_, ":"); } else {
} else { std::stringstream str_buf;
std::stringstream str_buf; str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2); str_buf << '[' << min_val_ << ':' << max_val_ << ']';
str_buf << '[' << min_val_ << ':' << max_val_ << ']'; return str_buf.str();
return str_buf.str(); }
} }
}
private:
/*! \brief Number of bins */
int num_bin_;
MissingType missing_type_;
/*! \brief Store upper bound for each bin */
std::vector<double> bin_upper_bound_;
/*! \brief True if this feature is trival */
bool is_trival_;
/*! \brief Sparse rate of this bins( num_bin0/num_data ) */
double sparse_rate_;
/*! \brief Type of this bin */
BinType bin_type_;
/*! \brief Mapper from categorical to bin */
std::unordered_map<int, unsigned int> categorical_2_bin_;
/*! \brief Mapper from bin to categorical */
std::vector<int> bin_2_categorical_;
/*! \brief minimal feature value */
double min_val_;
/*! \brief maximum feature value */
double max_val_;
/*! \brief bin value of feature value 0 */
uint32_t default_bin_;
};
/*!
* \brief Interface for ordered bin data. Efficient for constructing histograms, especially for sparse bins.
* There are 2 advantages to using an ordered bin:
* 1. it groups the data by leaf to improve the cache hit rate.
* 2. it only stores the non-zero bins, which can speed up histogram construction for sparse features.
* However, it brings an additional cost: the bins need to be re-ordered after every split, which is expensive for dense features.
* So ordered bins are only used for sparse situations.
*/
class OrderedBin {
public:
/*! \brief Virtual destructor, so implementations can be destroyed through an OrderedBin pointer */
virtual ~OrderedBin() {}
/*!
* \brief Initialization logic.
* \param used_indices If used_indices.size() == 0 means using all data, otherwise, used_indices[i] == true means i-th data is used
(this logic was built for bagging).
* NOTE(review): the parameter is a raw const char*, which has no size(); presumably a null pointer is the
* "use all data" case - confirm against the implementations.
* \param num_leaves Number of leaves on this iteration
*/
virtual void Init(const char* used_indices, data_size_t num_leaves) = 0;
/*!
* \brief Construct histogram by using this bin (gradients and hessians)
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians,
* because it is hard to know the relative index in one leaf for a sparse bin, since zero bins are skipped.
* \param leaf Which leaf's data to use for construction
* \param gradients Gradients, Note: not ordered by leaf
* \param hessians Hessians, Note: not ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram by using this bin (overload that takes gradients only, no hessians argument)
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians,
* because it is hard to know the relative index in one leaf for a sparse bin, since zero bins are skipped.
* \param leaf Which leaf's data to use for construction
* \param gradients Gradients, Note: not ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Split current bin, and perform re-order by leaf
* \param leaf Which leaf to split
* \param right_leaf The new leaf index after performing this split
* \param is_in_leaf is_in_leaf[i] == mark means the i-th data will be on the left leaf after the split
* \param mark is_in_leaf[i] == mark means the i-th data will be on the left leaf after the split
*/
virtual void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) = 0;
/*!
* \brief NOTE(review): undocumented in the original; presumably returns the number of non-zero-bin
* records held for the given leaf - confirm against the implementations.
* \param leaf Leaf index
*/
virtual data_size_t NonZeroCount(int leaf) const = 0;
};
/*! \brief Iterator for one bin column (abstract interface; all accessors are pure virtual) */
class BinIterator {
public:
/*!
* \brief Get bin data on specific row index
* \param idx Index of this data
* \return Bin data
*/
virtual uint32_t Get(data_size_t idx) = 0;
/*!
* \brief NOTE(review): undocumented in the original; presumably returns the stored bin value for idx
* without the adjustment applied by Get - confirm against the implementations.
* \param idx Index of this data
*/
virtual uint32_t RawGet(data_size_t idx) = 0;
/*!
* \brief NOTE(review): undocumented in the original; presumably repositions the iterator at idx
* before a new pass - confirm against the implementations.
* \param idx Index of the data to reset to
*/
virtual void Reset(data_size_t idx) = 0;
/*! \brief Virtual destructor, so implementations can be destroyed through a BinIterator pointer */
virtual ~BinIterator() = default;
};
/*!
* \brief Interface for bin data. This class will store bin data for one feature.
* Unlike OrderedBin, this class stores data in the original order.
* Note that it may cause cache misses when constructing histograms,
* but it doesn't need a re-order operation, so it will be faster than OrderedBin for dense features.
*/
class Bin {
public:
/*! \brief virtual destructor */
virtual ~Bin() {}
/*!
* \brief Push one record
* \param tid Thread id
* \param idx Index of record
* \param value bin value of record
*/
virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
/*!
* \brief Get bin iterator of this bin for specific feature
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \return Iterator of this bin
*/
virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const = 0;
/*!
* \brief Save binary data to file
* \param file File want to write
*/
virtual void SaveBinaryToFile(FILE* file) const = 0;
/*!
* \brief Load from memory
* \param memory
* \param local_used_indices
*/
virtual void LoadFromMemory(const void* memory,
const std::vector<data_size_t>& local_used_indices) = 0;
/*!
* \brief Get sizes in byte of this object
*/
virtual size_t SizesInByte() const = 0;
/*! \brief Number of all data */
virtual data_size_t num_data() const = 0;
virtual void ReSize(data_size_t num_data) = 0;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
* \param out Output Result
*/
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param out Output Result
*/
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Split data according to threshold: if bin <= threshold, the data is put into left (lte_indices), else into right (gt_indices)
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin default bin if bin not in [min_bin, max_bin]
* \param missing_type missing type
* \param default_left missing bin will go to left child
* \param threshold The split threshold.
* \param data_indices Used data indices. After this function is called, the less-than-or-equal data indices will be stored in this object.
* \param num_data Number of used data
* \param lte_indices After this function is called, the less-than-or-equal data indices will be stored in this object.
* \param gt_indices After this function is called, the greater data indices will be stored in this object.
* \param bin_type type of bin
* \return The number of less-than-or-equal data.
*/
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;
/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual OrderedBin* CreateOrderedBin() const = 0;
/*! private:
* \brief After pushed all feature data, call this could have better refactor for bin data /*! \brief Number of bins */
*/ int num_bin_;
virtual void FinishLoad() = 0; MissingType missing_type_;
/*! \brief Store upper bound for each bin */
std::vector<double> bin_upper_bound_;
/*! \brief True if this feature is trival */
bool is_trival_;
/*! \brief Sparse rate of this bins( num_bin0/num_data ) */
double sparse_rate_;
/*! \brief Type of this bin */
BinType bin_type_;
/*! \brief Mapper from categorical to bin */
std::unordered_map<int, unsigned int> categorical_2_bin_;
/*! \brief Mapper from bin to categorical */
std::vector<int> bin_2_categorical_;
/*! \brief minimal feature vaule */
double min_val_;
/*! \brief maximum feature value */
double max_val_;
/*! \brief bin value of feature value 0 */
uint32_t default_bin_;
};
/*! /*!
* \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse" * \brief Interface for ordered bin data. efficient for construct histogram, especially for sparse bin
* \param num_data Total number of data * There are 2 advantages by using ordered bin.
* \param num_bin Number of bin * 1. group the data by leafs to improve the cache hit.
* \param sparse_rate Sparse rate of this bins( num_bin0/num_data ) * 2. only store the non-zero bin, which can speed up the histogram consturction for sparse features.
* \param is_enable_sparse True if enable sparse feature * However it brings additional cost: it need re-order the bins after every split, which will cost much for dense feature.
* \param sparse_threshold Threshold for treating a feature as a sparse feature * So we only using ordered bin for sparse situations.
* \param is_sparse Will set to true if this bin is sparse
* \param default_bin Default bin for zeros value
* \return The bin data object
*/ */
static Bin* CreateBin(data_size_t num_data, int num_bin, class OrderedBin {
double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse); public:
/*! \brief virtual destructor */
virtual ~OrderedBin() {}
/*!
* \brief Initialization logic.
* \param used_indices If used_indices.size() == 0 means using all data, otherwise, used_indices[i] == true means i-th data is used
(this logic was build for bagging logic)
* \param num_leaves Number of leaves on this iteration
*/
virtual void Init(const char* used_indices, data_size_t num_leaves) = 0;
/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note:non-oredered by leaf
* \param hessians Hessians, Note:non-oredered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because it is hard to know the relative index in one leaf for sparse bin, since we skipped zero bins.
* \param leaf Using which leaf's data to construct
* \param gradients Gradients, Note:non-oredered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Split current bin, and perform re-order by leaf
* \param leaf Using which leaf's to split
* \param right_leaf The new leaf index after perform this split
* \param is_in_leaf is_in_leaf[i] == mark means the i-th data will be on left leaf after split
* \param mark is_in_leaf[i] == mark means the i-th data will be on left leaf after split
*/
virtual void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) = 0;
virtual data_size_t NonZeroCount(int leaf) const = 0;
};
/*! \brief Iterator for one bin column */
class BinIterator {
public:
/*!
* \brief Get bin data on specific row index
* \param idx Index of this data
* \return Bin data
*/
virtual uint32_t Get(data_size_t idx) = 0;
virtual uint32_t RawGet(data_size_t idx) = 0;
virtual void Reset(data_size_t idx) = 0;
virtual ~BinIterator() = default;
};
/*! /*!
* \brief Create object for bin data of one feature, used for dense feature * \brief Interface for bin data. This class will store bin data for one feature.
* \param num_data Total number of data * unlike OrderedBin, this class will store data by original order.
* \param num_bin Number of bin * Note that it may cause cache misses when construct histogram,
* \return The bin data object * but it doesn't need to re-order operation, So it will be faster than OrderedBin for dense feature
*/ */
static Bin* CreateDenseBin(data_size_t num_data, int num_bin); class Bin {
public:
/*! /*! \brief virtual destructor */
* \brief Create object for bin data of one feature, used for sparse feature virtual ~Bin() {}
* \param num_data Total number of data /*!
* \param num_bin Number of bin * \brief Push one record
* \return The bin data object * \pram tid Thread id
*/ * \param idx Index of record
static Bin* CreateSparseBin(data_size_t num_data, int num_bin); * \param value bin value of record
}; */
virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;
inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) { virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
return num_bin_ - 1; /*!
} else { * \brief Get bin iterator of this bin for specific feature
value = 0.0f; * \param min_bin min_bin of current used feature
} * \param max_bin max_bin of current used feature
} * \param default_bin default bin if bin not in [min_bin, max_bin]
if (bin_type_ == BinType::NumericalBin) { * \return Iterator of this bin
// binary search to find bin */
int l = 0; virtual BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const = 0;
int r = num_bin_ - 1;
if (missing_type_ == MissingType::NaN) { /*!
r -= 1; * \brief Save binary data to file
} * \param file File want to write
while (l < r) { */
int m = (r + l - 1) / 2; virtual void SaveBinaryToFile(FILE* file) const = 0;
if (value <= bin_upper_bound_[m]) {
r = m; /*!
* \brief Load from memory
* \param memory
* \param local_used_indices
*/
virtual void LoadFromMemory(const void* memory,
const std::vector<data_size_t>& local_used_indices) = 0;
/*!
* \brief Get sizes in byte of this object
*/
virtual size_t SizesInByte() const = 0;
/*! \brief Number of all data */
virtual data_size_t num_data() const = 0;
virtual void ReSize(data_size_t num_data) = 0;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
* \param out Output Result
*/
virtual void ConstructHistogram(
const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
/*!
* \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
* The naive solution is using gradients[data_indices[i]] for data_indices[i] to get gradients,
which is not cache friendly, since the access of memory is not continuous.
* ordered_gradients and ordered_hessians are preprocessed, and they are re-ordered by data_indices.
* Ordered_gradients[i] is aligned with data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param out Output Result
*/
virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
virtual void ConstructHistogram(data_size_t num_data,
const score_t* ordered_gradients, HistogramBinEntry* out) const = 0;
/*!
* \brief Split data according to threshold, if bin <= threshold, will put into left(lte_indices), else put into right(gt_indices)
* \param min_bin min_bin of current used feature
* \param max_bin max_bin of current used feature
* \param default_bin defualt bin if bin not in [min_bin, max_bin]
* \param missing_type missing type
* \param default_left missing bin will go to left child
* \param threshold The split threshold.
* \param data_indices Used data indices. After called this function. The less than or equal data indices will store on this object.
* \param num_data Number of used data
* \param lte_indices After called this function. The less or equal data indices will store on this object.
* \param gt_indices After called this function. The greater data indices will store on this object.
* \param bin_type type of bin
* \return The number of less than or equal data.
*/
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, MissingType missing_type, bool default_left, uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;
/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual OrderedBin* CreateOrderedBin() const = 0;
/*!
* \brief After pushed all feature data, call this could have better refactor for bin data
*/
virtual void FinishLoad() = 0;
/*!
* \brief Create object for bin data of one feature, will call CreateDenseBin or CreateSparseBin according to "is_sparse"
* \param num_data Total number of data
* \param num_bin Number of bin
* \param sparse_rate Sparse rate of this bins( num_bin0/num_data )
* \param is_enable_sparse True if enable sparse feature
* \param sparse_threshold Threshold for treating a feature as a sparse feature
* \param is_sparse Will set to true if this bin is sparse
* \param default_bin Default bin for zeros value
* \return The bin data object
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, double sparse_threshold, bool* is_sparse);
/*!
* \brief Create object for bin data of one feature, used for dense feature
* \param num_data Total number of data
* \param num_bin Number of bin
* \return The bin data object
*/
static Bin* CreateDenseBin(data_size_t num_data, int num_bin);
/*!
* \brief Create object for bin data of one feature, used for sparse feature
* \param num_data Total number of data
* \param num_bin Number of bin
* \return The bin data object
*/
static Bin* CreateSparseBin(data_size_t num_data, int num_bin);
};
inline uint32_t BinMapper::ValueToBin(double value) const {
if (std::isnan(value)) {
if (missing_type_ == MissingType::NaN) {
return num_bin_ - 1;
} else { } else {
l = m + 1; value = 0.0f;
} }
} }
return l; if (bin_type_ == BinType::NumericalBin) {
} else { // binary search to find bin
int int_value = static_cast<int>(value); int l = 0;
if (categorical_2_bin_.count(int_value)) { int r = num_bin_ - 1;
return categorical_2_bin_.at(int_value); if (missing_type_ == MissingType::NaN) {
r -= 1;
}
while (l < r) {
int m = (r + l - 1) / 2;
if (value <= bin_upper_bound_[m]) {
r = m;
} else {
l = m + 1;
}
}
return l;
} else { } else {
return num_bin_ - 1; int int_value = static_cast<int>(value);
// convert negative value to NaN bin
if (int_value < 0) {
return num_bin_ - 1;
}
if (categorical_2_bin_.count(int_value)) {
return categorical_2_bin_.at(int_value);
} else {
return num_bin_ - 1;
}
} }
} }
}
} // namespace LightGBM } // namespace LightGBM
......
...@@ -409,7 +409,8 @@ inline void Tree::TreeSHAP(const double *feature_values, double *phi, ...@@ -409,7 +409,8 @@ inline void Tree::TreeSHAP(const double *feature_values, double *phi,
// internal node // internal node
} else { } else {
const int hot_index = Decision(feature_values[split_index], node); const int hot_index =
decision_funs[GetDecisionType(decision_type_[node], kCategoricalMask)](feature_values[split_index], threshold_[node]);
const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]); const int cold_index = (hot_index == left_child_[node] ? right_child_[node] : left_child_[node]);
const double w = data_count(node); const double w = data_count(node);
const double hot_zero_fraction = data_count(hot_index)/w; const double hot_zero_fraction = data_count(hot_index)/w;
......
...@@ -16,461 +16,500 @@ ...@@ -16,461 +16,500 @@
namespace LightGBM { namespace LightGBM {
BinMapper::BinMapper() { BinMapper::BinMapper() {
}
// deep copy function for BinMapper
BinMapper::BinMapper(const BinMapper& other) {
num_bin_ = other.num_bin_;
missing_type_ = other.missing_type_;
is_trival_ = other.is_trival_;
sparse_rate_ = other.sparse_rate_;
bin_type_ = other.bin_type_;
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = other.bin_upper_bound_;
} else {
bin_2_categorical_ = other.bin_2_categorical_;
categorical_2_bin_ = other.categorical_2_bin_;
} }
min_val_ = other.min_val_;
max_val_ = other.max_val_;
default_bin_ = other.default_bin_;
}
BinMapper::BinMapper(const void* memory) { // deep copy function for BinMapper
CopyFrom(reinterpret_cast<const char*>(memory)); BinMapper::BinMapper(const BinMapper& other) {
} num_bin_ = other.num_bin_;
missing_type_ = other.missing_type_;
is_trival_ = other.is_trival_;
sparse_rate_ = other.sparse_rate_;
bin_type_ = other.bin_type_;
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = other.bin_upper_bound_;
} else {
bin_2_categorical_ = other.bin_2_categorical_;
categorical_2_bin_ = other.categorical_2_bin_;
}
min_val_ = other.min_val_;
max_val_ = other.max_val_;
default_bin_ = other.default_bin_;
}
BinMapper::~BinMapper() { BinMapper::BinMapper(const void* memory) {
CopyFrom(reinterpret_cast<const char*>(memory));
}
} BinMapper::~BinMapper() {
bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) { }
if (bin_type == BinType::NumericalBin) {
int sum_left = 0; bool NeedFilter(const std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) { if (bin_type == BinType::NumericalBin) {
sum_left += cnt_in_bin[i]; int sum_left = 0;
if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) { for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
return false; sum_left += cnt_in_bin[i];
if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
return false;
}
} }
} } else {
} else { if (cnt_in_bin.size() <= 2) {
for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) { for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
int sum_left = cnt_in_bin[i]; int sum_left = cnt_in_bin[i];
if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) { if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
return false;
}
}
} else {
return false; return false;
} }
} }
return true;
} }
return true;
}
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, size_t total_cnt, int min_data_in_bin) {
std::vector<double> bin_upper_bound;
if (num_distinct_values <= max_bin) {
bin_upper_bound.clear();
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
cur_cnt_inbin += counts[i];
if (cur_cnt_inbin >= min_data_in_bin) {
bin_upper_bound.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
cur_cnt_inbin = 0;
}
}
cur_cnt_inbin += counts[num_distinct_values - 1];
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
} else {
if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_cnt / min_data_in_bin));
max_bin = std::max(max_bin, 1);
}
double mean_bin_size = static_cast<double>(total_cnt) / max_bin;
// mean size for one bin std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
int rest_bin_cnt = max_bin; int num_distinct_values, int max_bin, size_t total_cnt, int min_data_in_bin) {
int rest_sample_cnt = static_cast<int>(total_cnt); std::vector<double> bin_upper_bound;
std::vector<bool> is_big_count_value(num_distinct_values, false); if (num_distinct_values <= max_bin) {
for (int i = 0; i < num_distinct_values; ++i) { bin_upper_bound.clear();
if (counts[i] >= mean_bin_size) { int cur_cnt_inbin = 0;
is_big_count_value[i] = true; for (int i = 0; i < num_distinct_values - 1; ++i) {
--rest_bin_cnt; cur_cnt_inbin += counts[i];
rest_sample_cnt -= counts[i]; if (cur_cnt_inbin >= min_data_in_bin) {
bin_upper_bound.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
cur_cnt_inbin = 0;
}
} }
} cur_cnt_inbin += counts[num_distinct_values - 1];
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt; bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity()); } else {
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity()); if (min_data_in_bin > 0) {
max_bin = std::min(max_bin, static_cast<int>(total_cnt / min_data_in_bin));
int bin_cnt = 0; max_bin = std::max(max_bin, 1);
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
} }
cur_cnt_inbin += counts[i]; double mean_bin_size = static_cast<double>(total_cnt) / max_bin;
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size || // mean size for one bin
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) { int rest_bin_cnt = max_bin;
upper_bounds[bin_cnt] = distinct_values[i]; int rest_sample_cnt = static_cast<int>(total_cnt);
++bin_cnt; std::vector<bool> is_big_count_value(num_distinct_values, false);
lower_bounds[bin_cnt] = distinct_values[i + 1]; for (int i = 0; i < num_distinct_values; ++i) {
if (bin_cnt >= max_bin - 1) { break; } if (counts[i] >= mean_bin_size) {
cur_cnt_inbin = 0; is_big_count_value[i] = true;
if (!is_big_count_value[i]) {
--rest_bin_cnt; --rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt); rest_sample_cnt -= counts[i];
} }
} }
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_distinct_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
}
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
}
}
++bin_cnt;
// update bin upper bound
bin_upper_bound.resize(bin_cnt);
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
}
// last bin upper bound
bin_upper_bound[bin_cnt - 1] = std::numeric_limits<double>::infinity();
} }
++bin_cnt; return bin_upper_bound;
// update bin upper bound
bin_upper_bound.resize(bin_cnt);
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
}
// last bin upper bound
bin_upper_bound[bin_cnt - 1] = std::numeric_limits<double>::infinity();
} }
return bin_upper_bound;
} std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
std::vector<double> FindBinWithZeroAsMissing(const double* distinct_values, const int* counts, std::vector<double> bin_upper_bound;
int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) { int left_cnt_data = 0;
std::vector<double> bin_upper_bound; int cnt_zero = 0;
int left_cnt_data = 0; int right_cnt_data = 0;
int cnt_missing = 0; for (int i = 0; i < num_distinct_values; ++i) {
int right_cnt_data = 0; if (distinct_values[i] <= -kZeroAsMissingValueRange) {
for (int i = 0; i < num_distinct_values; ++i) { left_cnt_data += counts[i];
if (distinct_values[i] <= -kZeroAsMissingValueRange) { } else if (distinct_values[i] > kZeroAsMissingValueRange) {
left_cnt_data += counts[i]; right_cnt_data += counts[i];
} else if (distinct_values[i] > kZeroAsMissingValueRange) { } else {
right_cnt_data += counts[i]; cnt_zero += counts[i];
} else { }
cnt_missing += counts[i];
} }
}
int left_cnt = 0; int left_cnt = -1;
for (int i = 0; i < num_distinct_values; ++i) { for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > -kZeroAsMissingValueRange) { if (distinct_values[i] > -kZeroAsMissingValueRange) {
left_cnt = i; left_cnt = i;
break; break;
}
} }
}
if (left_cnt > 0) { if (left_cnt < 0) {
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_missing) * (max_bin - 1)); left_cnt = num_distinct_values;
bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin); }
bin_upper_bound.back() = -kZeroAsMissingValueRange;
}
int right_start = -1; if (left_cnt > 0) {
for (int i = left_cnt; i < num_distinct_values; ++i) { int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
if (distinct_values[i] > kZeroAsMissingValueRange) { bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
right_start = i; bin_upper_bound.back() = -kZeroAsMissingValueRange;
break;
} }
}
if (right_start >= 0) { int right_start = -1;
int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size()); for (int i = left_cnt; i < num_distinct_values; ++i) {
auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start, if (distinct_values[i] > kZeroAsMissingValueRange) {
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin); right_start = i;
bin_upper_bound.push_back(kZeroAsMissingValueRange); break;
bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end()); }
} else {
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}
return bin_upper_bound;
}
void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
int na_cnt = 0;
int tmp_num_sample_values = 0;
for (int i = 0; i < num_sample_values; ++i) {
if (!std::isnan(values[i])) {
values[tmp_num_sample_values++] = values[i];
} }
if (right_start >= 0) {
int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
bin_upper_bound.push_back(kZeroAsMissingValueRange);
bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
} else {
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}
return bin_upper_bound;
} }
if (!use_missing) {
missing_type_ = MissingType::None; void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
} else if (zero_as_missing) { int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
missing_type_ = MissingType::Zero; int na_cnt = 0;
} else { int tmp_num_sample_values = 0;
if (tmp_num_sample_values == num_sample_values) { for (int i = 0; i < num_sample_values; ++i) {
if (!std::isnan(values[i])) {
values[tmp_num_sample_values++] = values[i];
}
}
if (!use_missing) {
missing_type_ = MissingType::None; missing_type_ = MissingType::None;
} else if (zero_as_missing) {
missing_type_ = MissingType::Zero;
} else { } else {
missing_type_ = MissingType::NaN; if (tmp_num_sample_values == num_sample_values) {
missing_type_ = MissingType::None;
} else {
missing_type_ = MissingType::NaN;
na_cnt = num_sample_values - tmp_num_sample_values;
}
} }
na_cnt = num_sample_values - tmp_num_sample_values; num_sample_values = tmp_num_sample_values;
}
num_sample_values = tmp_num_sample_values;
bin_type_ = bin_type; bin_type_ = bin_type;
default_bin_ = 0; default_bin_ = 0;
int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values - na_cnt); int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values - na_cnt);
// find distinct_values first // find distinct_values first
std::vector<double> distinct_values; std::vector<double> distinct_values;
std::vector<int> counts; std::vector<int> counts;
std::sort(values, values + num_sample_values); std::sort(values, values + num_sample_values);
// push zero in the front // push zero in the front
if (num_sample_values == 0 || (values[0] > 0.0f && zero_cnt > 0)) { if (num_sample_values == 0 || (values[0] > 0.0f && zero_cnt > 0)) {
distinct_values.push_back(0.0f); distinct_values.push_back(0.0f);
counts.push_back(zero_cnt); counts.push_back(zero_cnt);
} }
if (num_sample_values > 0) { if (num_sample_values > 0) {
distinct_values.push_back(values[0]); distinct_values.push_back(values[0]);
counts.push_back(1); counts.push_back(1);
} }
for (int i = 1; i < num_sample_values; ++i) { for (int i = 1; i < num_sample_values; ++i) {
if (values[i] != values[i - 1]) { if (values[i] != values[i - 1]) {
if (values[i - 1] < 0.0f && values[i] > 0.0f) { if (values[i - 1] < 0.0f && values[i] > 0.0f) {
distinct_values.push_back(0.0f); distinct_values.push_back(0.0f);
counts.push_back(zero_cnt); counts.push_back(zero_cnt);
}
distinct_values.push_back(values[i]);
counts.push_back(1);
} else {
++counts.back();
} }
distinct_values.push_back(values[i]);
counts.push_back(1);
} else {
++counts.back();
} }
}
// push zero in the back // push zero in the back
if (num_sample_values > 0 && values[num_sample_values - 1] < 0.0f && zero_cnt > 0) { if (num_sample_values > 0 && values[num_sample_values - 1] < 0.0f && zero_cnt > 0) {
distinct_values.push_back(0.0f); distinct_values.push_back(0.0f);
counts.push_back(zero_cnt); counts.push_back(zero_cnt);
} }
min_val_ = distinct_values.front(); min_val_ = distinct_values.front();
max_val_ = distinct_values.back(); max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin; std::vector<int> cnt_in_bin;
int num_distinct_values = static_cast<int>(distinct_values.size()); int num_distinct_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) { if (bin_type_ == BinType::NumericalBin) {
if (missing_type_ == MissingType::Zero) { if (missing_type_ == MissingType::Zero) {
bin_upper_bound_ = FindBinWithZeroAsMissing(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin); bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
if (bin_upper_bound_.size() == 2) { if (bin_upper_bound_.size() == 2) {
missing_type_ = MissingType::None; missing_type_ = MissingType::None;
}
} else if (missing_type_ == MissingType::None) {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
} else {
bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
bin_upper_bound_.push_back(NaN);
}
num_bin_ = static_cast<int>(bin_upper_bound_.size());
{
cnt_in_bin.resize(num_bin_, 0);
int i_bin = 0;
for (int i = 0; i < num_distinct_values; ++i) {
if (distinct_values[i] > bin_upper_bound_[i_bin]) {
++i_bin;
}
cnt_in_bin[i_bin] += counts[i];
}
if (missing_type_ == MissingType::NaN) {
cnt_in_bin[num_bin_ - 1] = na_cnt;
}
} }
} else if (missing_type_ == MissingType::None) { CHECK(num_bin_ <= max_bin);
bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
} else { } else {
bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin); // convert to int type first
bin_upper_bound_.push_back(NaN); std::vector<int> distinct_values_int;
} std::vector<int> counts_int;
num_bin_ = static_cast<int>(bin_upper_bound_.size()); distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
{ counts_int.push_back(counts[0]);
cnt_in_bin.resize(num_bin_, 0); for (size_t i = 1; i < distinct_values.size(); ++i) {
int i_bin = 0; if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
for (int i = 0; i < num_distinct_values; ++i) { distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
if (distinct_values[i] > bin_upper_bound_[i_bin]) { counts_int.push_back(counts[i]);
++i_bin; } else {
} counts_int.back() += counts[i];
cnt_in_bin[i_bin] += counts[i]; }
} }
if (missing_type_ == MissingType::NaN) { // sort by counts
cnt_in_bin[num_bin_ - 1] = na_cnt; Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
// avoid first bin is zero
if (distinct_values_int[0] == 0 && counts_int.size() > 1) {
std::swap(counts_int[0], counts_int[1]);
std::swap(distinct_values_int[0], distinct_values_int[1]);
} }
} // will ignore the categorical of small counts
CHECK(num_bin_ <= max_bin); int cut_cnt = static_cast<int>((total_sample_cnt - na_cnt) * 0.99f);
} else { size_t cur_cat = 0;
// No missing handle for categorical features categorical_2_bin_.clear();
missing_type_ = MissingType::None; bin_2_categorical_.clear();
// convert to int type first num_bin_ = 0;
std::vector<int> distinct_values_int; int used_cnt = 0;
std::vector<int> counts_int; max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
distinct_values_int.push_back(static_cast<int>(distinct_values[0])); cnt_in_bin.clear();
counts_int.push_back(counts[0]); while (cur_cat < distinct_values_int.size()
for (size_t i = 1; i < distinct_values.size(); ++i) { && (used_cnt < cut_cnt || num_bin_ < max_bin)) {
if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) { if (distinct_values_int[cur_cat] < 0) {
distinct_values_int.push_back(static_cast<int>(distinct_values[i])); na_cnt += counts_int[cur_cat];
counts_int.push_back(counts[i]); cut_cnt -= counts_int[cur_cat];
Log::Warning("Met negative value in categorical features, will convert it to NaN");
} else {
bin_2_categorical_.push_back(distinct_values_int[cur_cat]);
categorical_2_bin_[distinct_values_int[cur_cat]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[cur_cat];
cnt_in_bin.push_back(counts_int[cur_cat]);
++num_bin_;
}
++cur_cat;
}
// need an additional bin for NaN
if (cur_cat == distinct_values_int.size() && na_cnt > 0) {
// use -1 to represent NaN
bin_2_categorical_.push_back(-1);
categorical_2_bin_[-1] = num_bin_;
cnt_in_bin.push_back(0);
++num_bin_;
}
// Use MissingType::None to represent this bin contains all categoricals
if (cur_cat == distinct_values_int.size() && na_cnt == 0) {
missing_type_ = MissingType::None;
} else if (na_cnt == 0) {
missing_type_ = MissingType::Zero;
} else { } else {
counts_int.back() += counts[i]; missing_type_ = MissingType::NaN;
} }
cnt_in_bin.back() += static_cast<int>(total_sample_cnt - used_cnt);
} }
// sort by counts
Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true); // check trival(num_bin_ == 1) feature
// will ignore the categorical of small counts if (num_bin_ <= 1) {
const int cut_cnt = static_cast<int>(total_sample_cnt * 0.98f); is_trival_ = true;
categorical_2_bin_.clear(); } else {
bin_2_categorical_.clear(); is_trival_ = false;
num_bin_ = 0; }
int used_cnt = 0; // check useless bin
max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin); if (!is_trival_ && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
while (used_cnt < cut_cnt || num_bin_ < max_bin) { is_trival_ = true;
bin_2_categorical_.push_back(distinct_values_int[num_bin_]);
categorical_2_bin_[distinct_values_int[num_bin_]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[num_bin_];
++num_bin_;
} }
cnt_in_bin = counts_int;
counts_int.resize(num_bin_);
counts_int.back() += static_cast<int>(total_sample_cnt - used_cnt);
}
// check trival(num_bin_ == 1) feature if (!is_trival_) {
if (num_bin_ <= 1) { default_bin_ = ValueToBin(0);
is_trival_ = true; if (bin_type_ == BinType::CategoricalBin) {
} else { CHECK(default_bin_ > 0);
is_trival_ = false; }
} }
// check useless bin // calculate sparse rate
if (!is_trival_ && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) { sparse_rate_ = static_cast<double>(cnt_in_bin[default_bin_]) / static_cast<double>(total_sample_cnt);
is_trival_ = true;
} }
if (!is_trival_) {
default_bin_ = ValueToBin(0); int BinMapper::SizeForSpecificBin(int bin) {
int size = 0;
size += sizeof(int);
size += sizeof(MissingType);
size += sizeof(bool);
size += sizeof(double);
size += sizeof(BinType);
size += 2 * sizeof(double);
size += bin * sizeof(double);
size += sizeof(uint32_t);
return size;
} }
// calculate sparse rate
sparse_rate_ = static_cast<double>(cnt_in_bin[default_bin_]) / static_cast<double>(total_sample_cnt); void BinMapper::CopyTo(char * buffer) const {
} std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
buffer += sizeof(num_bin_);
std::memcpy(buffer, &missing_type_, sizeof(missing_type_));
int BinMapper::SizeForSpecificBin(int bin) { buffer += sizeof(missing_type_);
int size = 0; std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
size += sizeof(int); buffer += sizeof(is_trival_);
size += sizeof(MissingType); std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
size += sizeof(bool); buffer += sizeof(sparse_rate_);
size += sizeof(double); std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
size += sizeof(BinType); buffer += sizeof(bin_type_);
size += 2 * sizeof(double); std::memcpy(buffer, &min_val_, sizeof(min_val_));
size += bin * sizeof(double); buffer += sizeof(min_val_);
size += sizeof(uint32_t); std::memcpy(buffer, &max_val_, sizeof(max_val_));
return size; buffer += sizeof(max_val_);
} std::memcpy(buffer, &default_bin_, sizeof(default_bin_));
buffer += sizeof(default_bin_);
void BinMapper::CopyTo(char * buffer) const { if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, &num_bin_, sizeof(num_bin_)); std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
buffer += sizeof(num_bin_); } else {
std::memcpy(buffer, &missing_type_, sizeof(missing_type_)); std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
buffer += sizeof(missing_type_); }
std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
buffer += sizeof(is_trival_);
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(buffer, &min_val_, sizeof(min_val_));
buffer += sizeof(min_val_);
std::memcpy(buffer, &max_val_, sizeof(max_val_));
buffer += sizeof(max_val_);
std::memcpy(buffer, &default_bin_, sizeof(default_bin_));
buffer += sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) {
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else {
std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
} }
}
void BinMapper::CopyFrom(const char * buffer) {
void BinMapper::CopyFrom(const char * buffer) { std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
std::memcpy(&num_bin_, buffer, sizeof(num_bin_)); buffer += sizeof(num_bin_);
buffer += sizeof(num_bin_); std::memcpy(&missing_type_, buffer, sizeof(missing_type_));
std::memcpy(&missing_type_, buffer, sizeof(missing_type_)); buffer += sizeof(missing_type_);
buffer += sizeof(missing_type_); std::memcpy(&is_trival_, buffer, sizeof(is_trival_));
std::memcpy(&is_trival_, buffer, sizeof(is_trival_)); buffer += sizeof(is_trival_);
buffer += sizeof(is_trival_); std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_)); buffer += sizeof(sparse_rate_);
buffer += sizeof(sparse_rate_); std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
std::memcpy(&bin_type_, buffer, sizeof(bin_type_)); buffer += sizeof(bin_type_);
buffer += sizeof(bin_type_); std::memcpy(&min_val_, buffer, sizeof(min_val_));
std::memcpy(&min_val_, buffer, sizeof(min_val_)); buffer += sizeof(min_val_);
buffer += sizeof(min_val_); std::memcpy(&max_val_, buffer, sizeof(max_val_));
std::memcpy(&max_val_, buffer, sizeof(max_val_)); buffer += sizeof(max_val_);
buffer += sizeof(max_val_); std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
std::memcpy(&default_bin_, buffer, sizeof(default_bin_)); buffer += sizeof(default_bin_);
buffer += sizeof(default_bin_); if (bin_type_ == BinType::NumericalBin) {
if (bin_type_ == BinType::NumericalBin) { bin_upper_bound_ = std::vector<double>(num_bin_);
bin_upper_bound_ = std::vector<double>(num_bin_); std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double)); } else {
} else { bin_2_categorical_ = std::vector<int>(num_bin_);
bin_2_categorical_ = std::vector<int>(num_bin_); std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int)); categorical_2_bin_.clear();
categorical_2_bin_.clear(); for (int i = 0; i < num_bin_; ++i) {
for (int i = 0; i < num_bin_; ++i) { categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i); }
} }
} }
}
void BinMapper::SaveBinaryToFile(FILE* file) const {
void BinMapper::SaveBinaryToFile(FILE* file) const { fwrite(&num_bin_, sizeof(num_bin_), 1, file);
fwrite(&num_bin_, sizeof(num_bin_), 1, file); fwrite(&missing_type_, sizeof(missing_type_), 1, file);
fwrite(&missing_type_, sizeof(missing_type_), 1, file); fwrite(&is_trival_, sizeof(is_trival_), 1, file);
fwrite(&is_trival_, sizeof(is_trival_), 1, file); fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file); fwrite(&bin_type_, sizeof(bin_type_), 1, file);
fwrite(&bin_type_, sizeof(bin_type_), 1, file); fwrite(&min_val_, sizeof(min_val_), 1, file);
fwrite(&min_val_, sizeof(min_val_), 1, file); fwrite(&max_val_, sizeof(max_val_), 1, file);
fwrite(&max_val_, sizeof(max_val_), 1, file); fwrite(&default_bin_, sizeof(default_bin_), 1, file);
fwrite(&default_bin_, sizeof(default_bin_), 1, file); if (bin_type_ == BinType::NumericalBin) {
if (bin_type_ == BinType::NumericalBin) { fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file); } else {
} else { fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file); }
} }
}
size_t BinMapper::SizesInByte() const {
size_t BinMapper::SizesInByte() const { size_t ret = sizeof(num_bin_) + sizeof(missing_type_) + sizeof(is_trival_) + sizeof(sparse_rate_)
size_t ret = sizeof(num_bin_) + sizeof(missing_type_) + sizeof(is_trival_) + sizeof(sparse_rate_) + sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
+ sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_); if (bin_type_ == BinType::NumericalBin) {
if (bin_type_ == BinType::NumericalBin) { ret += sizeof(double) * num_bin_;
ret += sizeof(double) * num_bin_; } else {
} else { ret += sizeof(int) * num_bin_;
ret += sizeof(int) * num_bin_; }
return ret;
} }
return ret;
} template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint8_t>; template class DenseBin<uint32_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>; template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint8_t>; template class SparseBin<uint32_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>; template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint8_t>; template class OrderedSparseBin<uint32_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>; Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
bool is_enable_sparse, double sparse_threshold, bool* is_sparse) {
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, // sparse threshold
bool is_enable_sparse, double sparse_threshold, bool* is_sparse) { if (sparse_rate >= sparse_threshold && is_enable_sparse) {
// sparse threshold *is_sparse = true;
if (sparse_rate >= sparse_threshold && is_enable_sparse) { return CreateSparseBin(num_data, num_bin);
*is_sparse = true; } else {
return CreateSparseBin(num_data, num_bin); *is_sparse = false;
} else { return CreateDenseBin(num_data, num_bin);
*is_sparse = false; }
return CreateDenseBin(num_data, num_bin);
} }
}
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) { if (num_bin <= 16) {
if (num_bin <= 16) { return new Dense4bitsBin(num_data);
return new Dense4bitsBin(num_data); } else if (num_bin <= 256) {
} else if (num_bin <= 256) { return new DenseBin<uint8_t>(num_data);
return new DenseBin<uint8_t>(num_data); } else if (num_bin <= 65536) {
} else if (num_bin <= 65536) { return new DenseBin<uint16_t>(num_data);
return new DenseBin<uint16_t>(num_data); } else {
} else { return new DenseBin<uint32_t>(num_data);
return new DenseBin<uint32_t>(num_data); }
} }
}
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) { if (num_bin <= 256) {
if (num_bin <= 256) { return new SparseBin<uint8_t>(num_data);
return new SparseBin<uint8_t>(num_data); } else if (num_bin <= 65536) {
} else if (num_bin <= 65536) { return new SparseBin<uint16_t>(num_data);
return new SparseBin<uint16_t>(num_data); } else {
} else { return new SparseBin<uint32_t>(num_data);
return new SparseBin<uint32_t>(num_data); }
} }
}
} // namespace LightGBM } // namespace LightGBM
...@@ -239,7 +239,7 @@ void OverallConfig::CheckParamConflict() { ...@@ -239,7 +239,7 @@ void OverallConfig::CheckParamConflict() {
} }
// Check max_depth and num_leaves // Check max_depth and num_leaves
if (boosting_config.tree_config.max_depth > 0) { if (boosting_config.tree_config.max_depth > 0) {
int full_num_leaves = std::pow(2, boosting_config.tree_config.max_depth); int full_num_leaves = static_cast<int>(std::pow(2, boosting_config.tree_config.max_depth));
if (full_num_leaves > boosting_config.tree_config.num_leaves if (full_num_leaves > boosting_config.tree_config.num_leaves
&& boosting_config.tree_config.num_leaves == kDefaultNumLeaves) { && boosting_config.tree_config.num_leaves == kDefaultNumLeaves) {
Log::Warning("Accuarcy may be bad since you didn't set num_leaves."); Log::Warning("Accuarcy may be bad since you didn't set num_leaves.");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment