Commit 4f77bd28 authored by Guolin Ke's avatar Guolin Ke
Browse files

update to v2.

parent 13d4581b
...@@ -20,7 +20,9 @@ public: ...@@ -20,7 +20,9 @@ public:
LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data); LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
LIGHTGBM_EXPORT Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data); LIGHTGBM_EXPORT Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values,
std::vector<std::vector<int>>& sample_indices,
size_t total_sample_size, data_size_t num_data);
/*! \brief Disable copy */ /*! \brief Disable copy */
DatasetLoader& operator=(const DatasetLoader&) = delete; DatasetLoader& operator=(const DatasetLoader&) = delete;
...@@ -69,8 +71,6 @@ private: ...@@ -69,8 +71,6 @@ private:
std::unordered_set<int> ignore_features_; std::unordered_set<int> ignore_features_;
/*! \brief store feature names */ /*! \brief store feature names */
std::vector<std::string> feature_names_; std::vector<std::string> feature_names_;
/*! \brief Mapper from real feature index to used index*/
std::unordered_set<int> categorical_features_;
}; };
......
#ifndef LIGHTGBM_FEATURE_H_ #ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_H_ #define LIGHTGBM_FEATURE_GROUP_H_
#include <LightGBM/utils/random.h> #include <LightGBM/utils/random.h>
...@@ -12,22 +12,41 @@ ...@@ -12,22 +12,41 @@
namespace LightGBM { namespace LightGBM {
/*! \brief Using to store data and providing some operations on one feature*/ class Dataset;
class Feature { class DatasetLoader;
/*! \brief Using to store data and providing some operations on one feature group*/
class FeatureGroup {
public: public:
friend Dataset;
friend DatasetLoader;
/*! /*!
* \brief Constructor * \brief Constructor
* \param feature_idx Index of this feature * \param num_feature number of features of this group
* \param bin_mapper Bin mapper for this feature * \param bin_mappers Bin mapper for features
* \param num_data Total number of data * \param num_data Total number of data
* \param is_enable_sparse True if enable sparse feature * \param is_enable_sparse True if enable sparse feature
*/ */
Feature(int feature_idx, BinMapper* bin_mapper, FeatureGroup(int num_feature,
data_size_t num_data, bool is_enable_sparse) std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
:bin_mapper_(bin_mapper) { data_size_t num_data, bool is_enable_sparse) : num_feature_(num_feature) {
feature_index_ = feature_idx; CHECK(static_cast<int>(bin_mappers.size()) == num_feature);
bin_data_.reset(Bin::CreateBin(num_data, bin_mapper_->num_bin(), // use bin at zero to store default_bin
bin_mapper_->sparse_rate(), is_enable_sparse, &is_sparse_, bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type())); num_total_bin_ = 1;
bin_offsets_.emplace_back(num_total_bin_);
int cnt_non_zero = 0;
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(bin_mappers[i].release());
auto num_bin = bin_mappers_[i]->num_bin();
if (bin_mappers_[i]->GetDefaultBin() == 0) {
num_bin -= 1;
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers_[i]->sparse_rate()));
}
double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
bin_data_.reset(Bin::CreateBin(num_data, num_total_bin_,
sparse_rate, is_enable_sparse, &is_sparse_));
} }
/*! /*!
* \brief Constructor from memory * \brief Constructor from memory
...@@ -35,39 +54,44 @@ public: ...@@ -35,39 +54,44 @@ public:
* \param num_all_data Number of global data * \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data * \param local_used_indices Local used indices, empty means using all data
*/ */
Feature(const void* memory, data_size_t num_all_data, FeatureGroup(const void* memory, data_size_t num_all_data,
const std::vector<data_size_t>& local_used_indices) { const std::vector<data_size_t>& local_used_indices) {
const char* memory_ptr = reinterpret_cast<const char*>(memory); const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get featuer index
feature_index_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += sizeof(feature_index_);
// get is_sparse // get is_sparse
is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr)); is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_sparse_); memory_ptr += sizeof(is_sparse_);
num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += sizeof(num_feature_);
// get bin mapper // get bin mapper
bin_mapper_.reset(new BinMapper(memory_ptr)); bin_mappers_.clear();
memory_ptr += bin_mapper_->SizesInByte(); bin_offsets_.clear();
// start from 1, due to need to store zero bin in this slot
num_total_bin_ = 1;
bin_offsets_.emplace_back(num_total_bin_);
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(new BinMapper(memory_ptr));
auto num_bin = bin_mappers_[i]->num_bin();
if (bin_mappers_[i]->GetDefaultBin() == 0) {
num_bin -= 1;
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
memory_ptr += bin_mappers_[i]->SizesInByte();
}
data_size_t num_data = num_all_data; data_size_t num_data = num_all_data;
if (!local_used_indices.empty()) { if (!local_used_indices.empty()) {
num_data = static_cast<data_size_t>(local_used_indices.size()); num_data = static_cast<data_size_t>(local_used_indices.size());
} }
if (is_sparse_) { if (is_sparse_) {
bin_data_.reset(Bin::CreateSparseBin(num_data, bin_mapper_->num_bin(), bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type())); bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else { } else {
bin_data_.reset(Bin::CreateDenseBin(num_data, bin_mapper_->num_bin(), bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type())); bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
} }
// get bin data // get bin data
bin_data_->LoadFromMemory(memory_ptr, local_used_indices); bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
} }
/*! \brief Destructor */ /*! \brief Destructor */
~Feature() { ~FeatureGroup() {
}
bool CheckAlign(const Feature& other) const {
if (feature_index_ != other.feature_index_) {
return false;
}
return bin_mapper_->CheckAlign(*(other.bin_mapper_.get()));
} }
/*! /*!
...@@ -76,78 +100,91 @@ public: ...@@ -76,78 +100,91 @@ public:
* \param idx Index of record * \param idx Index of record
* \param value feature value of record * \param value feature value of record
*/ */
inline void PushData(int tid, data_size_t line_idx, double value) { inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
unsigned int bin = bin_mapper_->ValueToBin(value); uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
bin_data_->Push(tid, line_idx, bin); if (bin == bin_mappers_[sub_feature_idx]->GetDefaultBin()) { return; }
} bin += bin_offsets_[sub_feature_idx];
if (bin_mappers_[sub_feature_idx]->GetDefaultBin() == 0) {
inline void PushBin(int tid, data_size_t line_idx, unsigned int bin) { bin -= 1;
}
bin_data_->Push(tid, line_idx, bin); bin_data_->Push(tid, line_idx, bin);
} }
inline void CopySubset(const Feature* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) { inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices); bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
} }
inline void ReSize(data_size_t num_data) { inline BinIterator* SubFetureIterator(int sub_feature) {
bin_data_->ReSize(num_data); uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->GetIterator(min_bin, max_bin, default_bin);
} }
inline bool is_sparse() const { return is_sparse_; } inline data_size_t Split(
int sub_feature,
inline void FinishLoad() { bin_data_->FinishLoad(); } uint32_t threshold,
/*! \brief Index of this feature */ data_size_t* data_indices, data_size_t num_data,
inline int feature_index() const { return feature_index_; } data_size_t* lte_indices, data_size_t* gt_indices) const {
/*! \brief Bin mapper that this feature used */
inline const BinMapper* bin_mapper() const { return bin_mapper_.get(); } uint32_t min_bin = bin_offsets_[sub_feature];
/*! \brief Number of bin of this feature */ uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
inline int num_bin() const { return bin_mapper_->num_bin(); } uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->Split(min_bin, max_bin, default_bin,
inline BinType bin_type() const { return bin_mapper_->bin_type(); } threshold, data_indices, num_data, lte_indices, gt_indices);
/*! \brief Get bin data of this feature */ }
inline const Bin* bin_data() const { return bin_data_.get(); }
/*! /*!
* \brief From bin to feature value * \brief From bin to feature value
* \param bin * \param bin
* \return Feature value of this bin * \return FeatureGroup value of this bin
*/ */
inline double BinToValue(unsigned int bin) inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
const { return bin_mapper_->BinToValue(bin); } return bin_mappers_[sub_feature_idx]->BinToValue(bin);
}
/*! /*!
* \brief Save binary data to file * \brief Save binary data to file
* \param file File want to write * \param file File want to write
*/ */
void SaveBinaryToFile(FILE* file) const { void SaveBinaryToFile(FILE* file) const {
fwrite(&feature_index_, sizeof(feature_index_), 1, file);
fwrite(&is_sparse_, sizeof(is_sparse_), 1, file); fwrite(&is_sparse_, sizeof(is_sparse_), 1, file);
bin_mapper_->SaveBinaryToFile(file); fwrite(&num_feature_, sizeof(num_feature_), 1, file);
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_[i]->SaveBinaryToFile(file);
}
bin_data_->SaveBinaryToFile(file); bin_data_->SaveBinaryToFile(file);
} }
/*! /*!
* \brief Get sizes in byte of this object * \brief Get sizes in byte of this object
*/ */
size_t SizesInByte() const { size_t SizesInByte() const {
return sizeof(feature_index_) + sizeof(is_sparse_) + size_t ret = sizeof(is_sparse_) + sizeof(num_feature_);
bin_mapper_->SizesInByte() + bin_data_->SizesInByte(); for (int i = 0; i < num_feature_; ++i) {
ret += bin_mappers_[i]->SizesInByte();
}
ret += bin_data_->SizesInByte();
return ret;
} }
/*! \brief Disable copy */ /*! \brief Disable copy */
Feature& operator=(const Feature&) = delete; FeatureGroup& operator=(const FeatureGroup&) = delete;
/*! \brief Disable copy */ /*! \brief Disable copy */
Feature(const Feature&) = delete; FeatureGroup(const FeatureGroup&) = delete;
private: private:
/*! \brief Index of this feature */ /*! \brief Number of features */
int feature_index_; int num_feature_;
/*! \brief Bin mapper that this feature used */ /*! \brief Bin mapper for sub features */
std::unique_ptr<BinMapper> bin_mapper_; std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
/*! \brief Bin offsets for sub features */
std::vector<uint32_t> bin_offsets_;
/*! \brief Bin data of this feature */ /*! \brief Bin data of this feature */
std::unique_ptr<Bin> bin_data_; std::unique_ptr<Bin> bin_data_;
/*! \brief True if this feature is sparse */ /*! \brief True if this feature is sparse */
bool is_sparse_; bool is_sparse_;
int num_total_bin_;
}; };
} // namespace LightGBM } // namespace LightGBM
#endif // LightGBM_FEATURE_H_ #endif // LIGHTGBM_FEATURE_GROUP_H_
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
#define LIGHTGBM_TREE_H_ #define LIGHTGBM_TREE_H_
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
#include <LightGBM/feature.h>
#include <LightGBM/dataset.h> #include <LightGBM/dataset.h>
#include <string> #include <string>
...@@ -35,7 +34,6 @@ public: ...@@ -35,7 +34,6 @@ public:
* \brief Performing a split on tree leaves. * \brief Performing a split on tree leaves.
* \param leaf Index of leaf to be split * \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features * \param feature Index of feature; the converted index after removing useless features
* \param bin_type type of this feature, numerical or categorical
* \param threshold Threshold(bin) of split * \param threshold Threshold(bin) of split
* \param real_feature Index of feature, the original index on data * \param real_feature Index of feature, the original index on data
* \param threshold_double Threshold on feature value * \param threshold_double Threshold on feature value
...@@ -46,7 +44,7 @@ public: ...@@ -46,7 +44,7 @@ public:
* \param gain Split gain * \param gain Split gain
* \return The index of new leaf. * \return The index of new leaf.
*/ */
int Split(int leaf, int feature, BinType bin_type, unsigned int threshold, int real_feature, int Split(int leaf, int feature, uint32_t threshold, int real_feature,
double threshold_double, double left_value, double threshold_double, double left_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain); double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain);
...@@ -64,8 +62,9 @@ public: ...@@ -64,8 +62,9 @@ public:
* \param num_data Number of total data * \param num_data Number of total data
* \param score Will add prediction to score * \param score Will add prediction to score
*/ */
void AddPredictionToScore(const Dataset* data, data_size_t num_data, void AddPredictionToScore(const Dataset* data,
double* score) const; data_size_t num_data,
double* score) const;
/*! /*!
* \brief Adding prediction value of this tree model to scorese * \brief Adding prediction value of this tree model to scorese
...@@ -93,7 +92,7 @@ public: ...@@ -93,7 +92,7 @@ public:
inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; } inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }
/*! \brief Get feature of specific split*/ /*! \brief Get feature of specific split*/
inline int split_feature_real(int split_idx) const { return split_feature_real_[split_idx]; } inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
/*! /*!
* \brief Shrinkage for the tree's output * \brief Shrinkage for the tree's output
...@@ -101,8 +100,9 @@ public: ...@@ -101,8 +100,9 @@ public:
* \param rate The factor of shrinkage * \param rate The factor of shrinkage
*/ */
inline void Shrinkage(double rate) { inline void Shrinkage(double rate) {
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_leaves_; ++i) { for (int i = 0; i < num_leaves_; ++i) {
leaf_value_[i] = leaf_value_[i] * rate; leaf_value_[i] *= rate;
} }
} }
...@@ -112,15 +112,6 @@ public: ...@@ -112,15 +112,6 @@ public:
/*! \brief Serialize this object to json*/ /*! \brief Serialize this object to json*/
std::string ToJSON(); std::string ToJSON();
template<typename T>
static bool CategoricalDecision(T fval, T threshold) {
if (static_cast<int>(fval) == static_cast<int>(threshold)) {
return true;
} else {
return false;
}
}
template<typename T> template<typename T>
static bool NumericalDecision(T fval, T threshold) { static bool NumericalDecision(T fval, T threshold) {
if (fval <= threshold) { if (fval <= threshold) {
...@@ -130,26 +121,13 @@ public: ...@@ -130,26 +121,13 @@ public:
} }
} }
static const char* GetDecisionTypeName(int8_t type) { private:
if (type == 0) {
return "no_greater";
} else {
return "is";
}
}
static std::vector<bool(*)(unsigned int, unsigned int)> inner_decision_funs; inline int GetLeaf(std::vector<std::unique_ptr<BinIterator>>& iterators,
static std::vector<bool(*)(double, double)> decision_funs; data_size_t data_idx) const;
private: inline int GetLeafRaw(std::vector<std::unique_ptr<BinIterator>>& iterators,
/*! data_size_t data_idx) const;
* \brief Find leaf index of which record belongs by data
* \param data The dataset
* \param data_idx Index of record
* \return Leaf index
*/
inline int GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const;
/*! /*!
* \brief Find leaf index of which record belongs by features * \brief Find leaf index of which record belongs by features
...@@ -171,15 +149,13 @@ private: ...@@ -171,15 +149,13 @@ private:
/*! \brief A non-leaf node's right child */ /*! \brief A non-leaf node's right child */
std::vector<int> right_child_; std::vector<int> right_child_;
/*! \brief A non-leaf node's split feature */ /*! \brief A non-leaf node's split feature */
std::vector<int> split_feature_; std::vector<int> split_feature_inner;
/*! \brief A non-leaf node's split feature, the original index */ /*! \brief A non-leaf node's split feature, the original index */
std::vector<int> split_feature_real_; std::vector<int> split_feature_;
/*! \brief A non-leaf node's split threshold in bin */ /*! \brief A non-leaf node's split threshold in bin */
std::vector<unsigned int> threshold_in_bin_; std::vector<uint32_t> threshold_in_bin_;
/*! \brief A non-leaf node's split threshold in feature value */ /*! \brief A non-leaf node's split threshold in feature value */
std::vector<double> threshold_; std::vector<double> threshold_;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
std::vector<int8_t> decision_type_;
/*! \brief A non-leaf node's split gain */ /*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_; std::vector<double> split_gain_;
// used for leaf node // used for leaf node
...@@ -208,13 +184,28 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const { ...@@ -208,13 +184,28 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
return leaf; return leaf;
} }
inline int Tree::GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterators, inline int Tree::GetLeaf(std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const { data_size_t data_idx) const {
int node = 0;
while (node >= 0) {
if (NumericalDecision<uint32_t>(
iterators[node]->Get(data_idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else {
node = right_child_[node];
}
}
return ~node;
}
inline int Tree::GetLeafRaw(std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const {
int node = 0; int node = 0;
while (node >= 0) { while (node >= 0) {
if (inner_decision_funs[decision_type_[node]]( if (NumericalDecision<uint32_t>(
iterators[split_feature_[node]]->Get(data_idx), iterators[split_feature_inner[node]]->Get(data_idx),
threshold_in_bin_[node])) { threshold_in_bin_[node])) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
node = right_child_[node]; node = right_child_[node];
...@@ -226,8 +217,8 @@ inline int Tree::GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterat ...@@ -226,8 +217,8 @@ inline int Tree::GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterat
inline int Tree::GetLeaf(const double* feature_values) const { inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0; int node = 0;
while (node >= 0) { while (node >= 0) {
if (decision_funs[decision_type_[node]]( if (NumericalDecision<double>(
feature_values[split_feature_real_[node]], feature_values[split_feature_[node]],
threshold_[node])) { threshold_[node])) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
#include <LightGBM/utils/openmp_wrapper.h>
namespace LightGBM { namespace LightGBM {
...@@ -12,88 +13,136 @@ namespace LightGBM { ...@@ -12,88 +13,136 @@ namespace LightGBM {
template<typename VAL_T> template<typename VAL_T>
class ArrayArgs { class ArrayArgs {
public: public:
inline static size_t ArgMaxMT(const std::vector<VAL_T>& array) {
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
int step = std::max(1, (static_cast<int>(array.size()) + num_threads - 1) / num_threads);
std::vector<size_t> arg_maxs(num_threads, 0);
#pragma omp parallel for schedule(static,1)
for (int i = 0; i < num_threads; ++i) {
size_t start = step * i;
if (start >= array.size()) { continue; }
size_t end = std::min(array.size(), start + step);
size_t arg_max = start;
for (size_t j = start + 1; j < end; ++j) {
if (array[j] > array[arg_max]) {
arg_max = j;
}
}
arg_maxs[i] = arg_max;
}
size_t ret = arg_maxs[0];
for (int i = 1; i < num_threads; ++i) {
if (array[arg_maxs[i]] > array[ret]) {
ret = arg_maxs[i];
}
}
return ret;
}
inline static size_t ArgMax(const std::vector<VAL_T>& array) { inline static size_t ArgMax(const std::vector<VAL_T>& array) {
if (array.empty()) { if (array.empty()) {
return 0; return 0;
} }
size_t argMax = 0; if (array.size() > 100) {
for (size_t i = 1; i < array.size(); ++i) { return ArgMaxMT(array);
if (array[i] > array[argMax]) { } else {
argMax = i; size_t arg_max = 0;
for (size_t i = 1; i < array.size(); ++i) {
if (array[i] > array[arg_max]) {
arg_max = i;
}
} }
return arg_max;
} }
return argMax;
} }
inline static size_t ArgMin(const std::vector<VAL_T>& array) { inline static size_t ArgMin(const std::vector<VAL_T>& array) {
if (array.empty()) { if (array.empty()) {
return 0; return 0;
} }
size_t argMin = 0; size_t arg_min = 0;
for (size_t i = 1; i < array.size(); ++i) { for (size_t i = 1; i < array.size(); ++i) {
if (array[i] < array[argMin]) { if (array[i] < array[arg_min]) {
argMin = i; arg_min = i;
} }
} }
return argMin; return arg_min;
} }
inline static size_t ArgMax(const VAL_T* array, size_t n) { inline static size_t ArgMax(const VAL_T* array, size_t n) {
if (n <= 0) { if (n <= 0) {
return 0; return 0;
} }
size_t argMax = 0; size_t arg_max = 0;
for (size_t i = 1; i < n; ++i) { for (size_t i = 1; i < n; ++i) {
if (array[i] > array[argMax]) { if (array[i] > array[arg_max]) {
argMax = i; arg_max = i;
} }
} }
return argMax; return arg_max;
} }
inline static size_t ArgMin(const VAL_T* array, size_t n) { inline static size_t ArgMin(const VAL_T* array, size_t n) {
if (n <= 0) { if (n <= 0) {
return 0; return 0;
} }
size_t argMin = 0; size_t arg_min = 0;
for (size_t i = 1; i < n; ++i) { for (size_t i = 1; i < n; ++i) {
if (array[i] < array[argMin]) { if (array[i] < array[arg_min]) {
argMin = i; arg_min = i;
} }
} }
return argMin; return arg_min;
} }
inline static size_t Partition(std::vector<VAL_T>* array, size_t start, size_t end) { inline static void Partition(std::vector<VAL_T>* arr, int start, int end, int* l, int* r) {
VAL_T& pivot = (*array)[end - 1]; int i = start - 1;
size_t p_idx = start; int j = end - 1;
for (size_t i = start; i < end - 1; ++i) { int p = i;
if ((*array)[i] > pivot) { int q = j;
std::swap((*array)[p_idx], (*array)[i]); if (start >= end) {
++p_idx; return;
}
} }
std::swap((*array)[p_idx], (*array)[end - 1]); std::vector<VAL_T>& ref = *arr;
return p_idx; VAL_T v = ref[end - 1];
for (;;) {
while (ref[++i] > v);
while (v > ref[--j]) { if (j == start) { break; } }
if (i >= j) { break; }
std::swap(ref[i], ref[j]);
if (ref[i] == v) { p++; std::swap(ref[p], ref[i]); }
if (v == ref[j]) { q--; std::swap(ref[j], ref[q]); }
}
std::swap(ref[i], ref[end - 1]);
j = i - 1;
i = i + 1;
for (int k = start; k <= p; k++, j--) { std::swap(ref[k], ref[j]); }
for (int k = end - 2; k >= q; k--, i++) { std::swap(ref[i], ref[k]); }
*l = j;
*r = i;
}; };
inline static size_t ArgMaxAtK(std::vector<VAL_T>* array, size_t start, size_t end, size_t k) { inline static int ArgMaxAtK(std::vector<VAL_T>* arr, int start, int end, int k) {
if (start == end - 1) { if (start >= end - 1) {
return start; return start;
} }
size_t p_idx = Partition(array, start, end); int l = start;
if (p_idx == k) { int r = end - 1;
return p_idx; Partition(arr, start, end, &l, &r);
} if ((k > l && k < r) || l == 0 || r == end - 1) {
else if (k < p_idx) { return k;
return ArgMaxAtK(array, start, p_idx, k); } else if (k <= l) {
} return ArgMaxAtK(arr, start, l, k);
else { } else {
return ArgMaxAtK(array, p_idx + 1, end, k); return ArgMaxAtK(arr, r, end, k);
} }
} }
inline static void MaxK(const std::vector<VAL_T>& array, size_t k, std::vector<VAL_T>* out) { inline static void MaxK(const std::vector<VAL_T>& array, int k, std::vector<VAL_T>* out) {
out->clear(); out->clear();
if (k <= 0) { if (k <= 0) {
return; return;
...@@ -104,7 +153,7 @@ public: ...@@ -104,7 +153,7 @@ public:
if (k >= array.size()) { if (k >= array.size()) {
return; return;
} }
ArgMaxAtK(out, 0, out->size(), k - 1); ArgMaxAtK(out, 0, static_cast<int>(out->size()), k - 1);
out->erase(out->begin() + k, out->end()); out->erase(out->begin() + k, out->end());
} }
......
...@@ -150,7 +150,7 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -150,7 +150,7 @@ inline static const char* Atof(const char* p, double* out) {
frac = 0; frac = 0;
scale = 1.0; scale = 1.0;
if ((*p == 'e') || (*p == 'E')) { if ((*p == 'e') || (*p == 'E')) {
unsigned int expon; uint32_t expon;
// Get sign of exponent, if any. // Get sign of exponent, if any.
++p; ++p;
if (*p == '-') { if (*p == '-') {
......
...@@ -20,30 +20,41 @@ public: ...@@ -20,30 +20,41 @@ public:
std::random_device rd; std::random_device rd;
auto genrator = std::mt19937(rd()); auto genrator = std::mt19937(rd());
std::uniform_int_distribution<int> distribution(0, x); std::uniform_int_distribution<int> distribution(0, x);
x = static_cast<unsigned int>(distribution(genrator)); x = distribution(genrator);
} }
/*! /*!
* \brief Constructor, with specific seed * \brief Constructor, with specific seed
*/ */
Random(int seed) { Random(int seed) {
x = static_cast<unsigned int>(seed); x = seed;
} }
/*! /*!
* \brief Generate random integer * \brief Generate random integer, int16 range. [0, 65536]
* \param lower_bound lower bound
* \param upper_bound upper bound
* \return The random integer between [lower_bound, upper_bound)
*/
inline int NextShort(int lower_bound, int upper_bound) {
return (RandInt16()) % (upper_bound - lower_bound) + lower_bound;
}
/*!
* \brief Generate random integer, int32 range
* \param lower_bound lower bound * \param lower_bound lower bound
* \param upper_bound upper bound * \param upper_bound upper bound
* \return The random integer between [lower_bound, upper_bound) * \return The random integer between [lower_bound, upper_bound)
*/ */
inline int NextInt(int lower_bound, int upper_bound) { inline int NextInt(int lower_bound, int upper_bound) {
return (fastrand()) % (upper_bound - lower_bound) + lower_bound; return (RandInt32()) % (upper_bound - lower_bound) + lower_bound;
} }
/*! /*!
* \brief Generate random float data * \brief Generate random float data
* \return The random float between [0.0, 1.0) * \return The random float between [0.0, 1.0)
*/ */
inline float NextFloat() { inline float NextFloat() {
// get random float in [0,1) // get random float in [0,1)
return static_cast<float>(fastrand()) / (32768.0f); return static_cast<float>(RandInt16()) / (32768.0f);
} }
/*! /*!
* \brief Sample K data from {0,1,...,N-1} * \brief Sample K data from {0,1,...,N-1}
...@@ -65,10 +76,16 @@ public: ...@@ -65,10 +76,16 @@ public:
return ret; return ret;
} }
private: private:
inline int fastrand() { inline int RandInt16() {
x = (214013 * x + 2531011); x = (214013 * x + 2531011);
return (x >> 16) & 0x7FFF; return (x >> 16) & 0x7FFF;
} }
inline int RandInt32() {
x = (214013 * x + 2531011);
return x & 0x7FFFFFF;
}
int x = 123456789; int x = 123456789;
}; };
......
...@@ -31,9 +31,9 @@ def get_threshold(node_id, prev_node_idx, is_child): ...@@ -31,9 +31,9 @@ def get_threshold(node_id, prev_node_idx, is_child):
def print_simple_predicate(tab_len, node_id, is_left_child, prev_node_idx, is_leaf): def print_simple_predicate(tab_len, node_id, is_left_child, prev_node_idx, is_leaf):
if is_left_child: if is_left_child:
op = 'equal' if decision_type[prev_node_idx] == 1 else 'lessOrEqual' op = 'lessOrEqual'
else: else:
op = 'notEqual' if decision_type[prev_node_idx] == 1 else 'greaterThan' op = 'greaterThan'
out_('\t' * (tab_len + 1) + ("<SimplePredicate field=\"{0}\" " + " operator=\"{1}\" value=\"{2}\" />").format( out_('\t' * (tab_len + 1) + ("<SimplePredicate field=\"{0}\" " + " operator=\"{1}\" value=\"{2}\" />").format(
get_field_name(node_id, prev_node_idx, is_leaf), op, get_threshold(node_id, prev_node_idx, is_leaf))) get_field_name(node_id, prev_node_idx, is_leaf), op, get_threshold(node_id, prev_node_idx, is_leaf)))
...@@ -128,7 +128,6 @@ with open('LightGBM_pmml.xml', 'w') as pmml_out: ...@@ -128,7 +128,6 @@ with open('LightGBM_pmml.xml', 'w') as pmml_out:
split_feature = get_array_ints(next(model_content)) split_feature = get_array_ints(next(model_content))
split_gain = next(model_content) # unused split_gain = next(model_content) # unused
threshold = get_array_strings(next(model_content)) threshold = get_array_strings(next(model_content))
decision_type = get_array_ints(next(model_content))
left_child = get_array_ints(next(model_content)) left_child = get_array_ints(next(model_content))
right_child = get_array_ints(next(model_content)) right_child = get_array_ints(next(model_content))
leaf_parent = get_array_ints(next(model_content)) leaf_parent = get_array_ints(next(model_content))
......
...@@ -221,7 +221,7 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', ...@@ -221,7 +221,7 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'float32': 'float', 'float64': 'float', 'bool': 'int'} 'float32': 'float', 'float64': 'float', 'bool': 'int'}
def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical): def _data_from_pandas(data, feature_name):
if isinstance(data, DataFrame): if isinstance(data, DataFrame):
if feature_name == 'auto' or feature_name is None: if feature_name == 'auto' or feature_name is None:
if all([isinstance(name, integer_types + (np.integer, )) for name in data.columns]): if all([isinstance(name, integer_types + (np.integer, )) for name in data.columns]):
...@@ -229,25 +229,6 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica ...@@ -229,25 +229,6 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
warnings.filterwarnings('once') warnings.filterwarnings('once')
warnings.warn(msg, stacklevel=5) warnings.warn(msg, stacklevel=5)
data = data.rename(columns=str) data = data.rename(columns=str)
cat_cols = data.select_dtypes(include=['category']).columns
if pandas_categorical is None: # train dataset
pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
else:
if len(cat_cols) != len(pandas_categorical):
raise ValueError('train and valid dataset categorical_feature do not match.')
for col, category in zip(cat_cols, pandas_categorical):
if list(data[col].cat.categories) != list(category):
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is pandas Index object
data = data.copy() # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)
if categorical_feature is not None:
if feature_name is None:
feature_name = list(data.columns)
if categorical_feature == 'auto':
categorical_feature = list(cat_cols)
else:
categorical_feature = list(categorical_feature) + list(cat_cols)
if feature_name == 'auto': if feature_name == 'auto':
feature_name = list(data.columns) feature_name = list(data.columns)
data_dtypes = data.dtypes data_dtypes = data.dtypes
...@@ -261,9 +242,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica ...@@ -261,9 +242,7 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
else: else:
if feature_name == 'auto': if feature_name == 'auto':
feature_name = None feature_name = None
if categorical_feature == 'auto': return data, feature_name
categorical_feature = None
return data, feature_name, categorical_feature, pandas_categorical
def _label_from_pandas(label): def _label_from_pandas(label):
...@@ -277,19 +256,6 @@ def _label_from_pandas(label): ...@@ -277,19 +256,6 @@ def _label_from_pandas(label):
return label return label
def _save_pandas_categorical(file_name, pandas_categorical):
with open(file_name, 'a') as f:
f.write('\npandas_categorical:' + json.dumps(pandas_categorical, default=json_default_with_numpy))
def _load_pandas_categorical(file_name):
with open(file_name, 'r') as f:
last_line = f.readlines()[-1]
if last_line.startswith('pandas_categorical:'):
return json.loads(last_line[len('pandas_categorical:'):])
return None
class _InnerPredictor(object): class _InnerPredictor(object):
""" """
A _InnerPredictor of LightGBM. A _InnerPredictor of LightGBM.
...@@ -321,7 +287,6 @@ class _InnerPredictor(object): ...@@ -321,7 +287,6 @@ class _InnerPredictor(object):
ctypes.byref(out_num_class))) ctypes.byref(out_num_class)))
self.num_class = out_num_class.value self.num_class = out_num_class.value
self.num_total_iteration = out_num_iterations.value self.num_total_iteration = out_num_iterations.value
self.pandas_categorical = _load_pandas_categorical(model_file)
elif booster_handle is not None: elif booster_handle is not None:
self.__is_manage_handle = False self.__is_manage_handle = False
self.handle = booster_handle self.handle = booster_handle
...@@ -335,7 +300,6 @@ class _InnerPredictor(object): ...@@ -335,7 +300,6 @@ class _InnerPredictor(object):
self.handle, self.handle,
ctypes.byref(out_num_iterations))) ctypes.byref(out_num_iterations)))
self.num_total_iteration = out_num_iterations.value self.num_total_iteration = out_num_iterations.value
self.pandas_categorical = None
else: else:
raise TypeError('Need Model file or Booster handle to create a predictor') raise TypeError('Need Model file or Booster handle to create a predictor')
...@@ -371,7 +335,7 @@ class _InnerPredictor(object): ...@@ -371,7 +335,7 @@ class _InnerPredictor(object):
""" """
if isinstance(data, Dataset): if isinstance(data, Dataset):
raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead") raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead")
data = _data_from_pandas(data, None, None, self.pandas_categorical)[0] data = _data_from_pandas(data, None)[0]
predict_type = C_API_PREDICT_NORMAL predict_type = C_API_PREDICT_NORMAL
if raw_score: if raw_score:
predict_type = C_API_PREDICT_RAW_SCORE predict_type = C_API_PREDICT_RAW_SCORE
...@@ -532,7 +496,7 @@ class Dataset(object): ...@@ -532,7 +496,7 @@ class Dataset(object):
"""Dataset in LightGBM.""" """Dataset in LightGBM."""
def __init__(self, data, label=None, max_bin=255, reference=None, def __init__(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, silent=False, weight=None, group=None, silent=False,
feature_name='auto', categorical_feature='auto', params=None, feature_name='auto', params=None,
free_raw_data=True): free_raw_data=True):
""" """
Parameters Parameters
...@@ -555,11 +519,6 @@ class Dataset(object): ...@@ -555,11 +519,6 @@ class Dataset(object):
feature_name : list of str, or 'auto' feature_name : list of str, or 'auto'
Feature names Feature names
If 'auto' and data is pandas DataFrame, use data columns name If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional params: dict, optional
Other parameters Other parameters
free_raw_data: Bool free_raw_data: Bool
...@@ -574,12 +533,10 @@ class Dataset(object): ...@@ -574,12 +533,10 @@ class Dataset(object):
self.group = group self.group = group
self.silent = silent self.silent = silent
self.feature_name = feature_name self.feature_name = feature_name
self.categorical_feature = categorical_feature
self.params = params self.params = params
self.free_raw_data = free_raw_data self.free_raw_data = free_raw_data
self.used_indices = None self.used_indices = None
self._predictor = None self._predictor = None
self.pandas_categorical = None
def __del__(self): def __del__(self):
self._free_handle() self._free_handle()
...@@ -592,11 +549,11 @@ class Dataset(object): ...@@ -592,11 +549,11 @@ class Dataset(object):
def _lazy_init(self, data, label=None, max_bin=255, reference=None, def _lazy_init(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, predictor=None, weight=None, group=None, predictor=None,
silent=False, feature_name='auto', silent=False, feature_name='auto',
categorical_feature='auto', params=None): params=None):
if data is None: if data is None:
self.handle = None self.handle = None
return return
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data, feature_name, categorical_feature, self.pandas_categorical) data, feature_name, = _data_from_pandas(data, feature_name)
label = _label_from_pandas(label) label = _label_from_pandas(label)
self.data_has_header = False self.data_has_header = False
"""process for args""" """process for args"""
...@@ -608,23 +565,6 @@ class Dataset(object): ...@@ -608,23 +565,6 @@ class Dataset(object):
params["verbose"] = 0 params["verbose"] = 0
elif "verbose" not in params: elif "verbose" not in params:
params["verbose"] = 1 params["verbose"] = 1
"""get categorical features"""
if categorical_feature is not None:
categorical_indices = set()
feature_dict = {}
if feature_name is not None:
feature_dict = {name: i for i, name in enumerate(feature_name)}
for name in categorical_feature:
if isinstance(name, string_type) and name in feature_dict:
categorical_indices.add(feature_dict[name])
elif isinstance(name, integer_types):
categorical_indices.add(name)
else:
raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature"
.format(type(name).__name__, name))
params['categorical_column'] = sorted(categorical_indices)
params_str = param_dict_to_str(params) params_str = param_dict_to_str(params)
"""process for reference dataset""" """process for reference dataset"""
ref_dataset = None ref_dataset = None
...@@ -784,7 +724,7 @@ class Dataset(object): ...@@ -784,7 +724,7 @@ class Dataset(object):
self._lazy_init(self.data, label=self.label, max_bin=self.max_bin, self._lazy_init(self.data, label=self.label, max_bin=self.max_bin,
weight=self.weight, group=self.group, predictor=self._predictor, weight=self.weight, group=self.group, predictor=self._predictor,
silent=self.silent, feature_name=self.feature_name, silent=self.silent, feature_name=self.feature_name,
categorical_feature=self.categorical_feature, params=self.params) params=self.params)
if self.free_raw_data: if self.free_raw_data:
self.data = None self.data = None
return self return self
...@@ -814,7 +754,6 @@ class Dataset(object): ...@@ -814,7 +754,6 @@ class Dataset(object):
weight=weight, group=group, silent=silent, params=params, weight=weight, group=group, silent=silent, params=params,
free_raw_data=self.free_raw_data) free_raw_data=self.free_raw_data)
ret._predictor = self._predictor ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
return ret return ret
def subset(self, used_indices, params=None): def subset(self, used_indices, params=None):
...@@ -829,9 +768,8 @@ class Dataset(object): ...@@ -829,9 +768,8 @@ class Dataset(object):
Other parameters Other parameters
""" """
ret = Dataset(None, reference=self, feature_name=self.feature_name, ret = Dataset(None, reference=self, feature_name=self.feature_name,
categorical_feature=self.categorical_feature, params=params) params=params)
ret._predictor = self._predictor ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
ret.used_indices = used_indices ret.used_indices = used_indices
return ret return ret
...@@ -939,24 +877,6 @@ class Dataset(object): ...@@ -939,24 +877,6 @@ class Dataset(object):
else: else:
raise TypeError("Unknown type") raise TypeError("Unknown type")
def set_categorical_feature(self, categorical_feature):
"""
Set categorical features
Parameters
----------
categorical_feature : list of int or str
Name/index of categorical features
"""
if self.categorical_feature == categorical_feature:
return
if self.data is not None:
self.categorical_feature = categorical_feature
self._free_handle()
else:
raise LightGBMError("Cannot set categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")
def _set_predictor(self, predictor): def _set_predictor(self, predictor):
""" """
Set predictor for continued training, not recommand for user to call this function. Set predictor for continued training, not recommand for user to call this function.
...@@ -979,7 +899,6 @@ class Dataset(object): ...@@ -979,7 +899,6 @@ class Dataset(object):
reference : Dataset reference : Dataset
Will use reference as template to consturct current dataset Will use reference as template to consturct current dataset
""" """
self.set_categorical_feature(reference.categorical_feature)
self.set_feature_name(reference.feature_name) self.set_feature_name(reference.feature_name)
self._set_predictor(reference._predictor) self._set_predictor(reference._predictor)
if self.reference is reference: if self.reference is reference:
...@@ -1208,7 +1127,6 @@ class Booster(object): ...@@ -1208,7 +1127,6 @@ class Booster(object):
self.__inner_predict_buffer = [None] self.__inner_predict_buffer = [None]
self.__is_predicted_cur_iter = [False] self.__is_predicted_cur_iter = [False]
self.__get_eval_info() self.__get_eval_info()
self.pandas_categorical = train_set.pandas_categorical
elif model_file is not None: elif model_file is not None:
"""Prediction task""" """Prediction task"""
out_num_iterations = ctypes.c_int(0) out_num_iterations = ctypes.c_int(0)
...@@ -1221,7 +1139,6 @@ class Booster(object): ...@@ -1221,7 +1139,6 @@ class Booster(object):
self.handle, self.handle,
ctypes.byref(out_num_class))) ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value self.__num_class = out_num_class.value
self.pandas_categorical = _load_pandas_categorical(model_file)
elif 'model_str' in params: elif 'model_str' in params:
self.__load_model_from_string(params['model_str']) self.__load_model_from_string(params['model_str'])
else: else:
...@@ -1237,7 +1154,6 @@ class Booster(object): ...@@ -1237,7 +1154,6 @@ class Booster(object):
def __deepcopy__(self, _): def __deepcopy__(self, _):
model_str = self.__save_model_to_string() model_str = self.__save_model_to_string()
booster = Booster({'model_str': model_str}) booster = Booster({'model_str': model_str})
booster.pandas_categorical = self.pandas_categorical
return booster return booster
def __getstate__(self): def __getstate__(self):
...@@ -1477,7 +1393,6 @@ class Booster(object): ...@@ -1477,7 +1393,6 @@ class Booster(object):
self.handle, self.handle,
ctypes.c_int(num_iteration), ctypes.c_int(num_iteration),
c_str(filename))) c_str(filename)))
_save_pandas_categorical(filename, self.pandas_categorical)
def __load_model_from_string(self, model_str): def __load_model_from_string(self, model_str):
"""[Private] Load model from string""" """[Private] Load model from string"""
...@@ -1589,7 +1504,6 @@ class Booster(object): ...@@ -1589,7 +1504,6 @@ class Booster(object):
def _to_predictor(self): def _to_predictor(self):
"""Convert to predictor""" """Convert to predictor"""
predictor = _InnerPredictor(booster_handle=self.handle) predictor = _InnerPredictor(booster_handle=self.handle)
predictor.pandas_categorical = self.pandas_categorical
return predictor return predictor
def feature_name(self): def feature_name(self):
......
...@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types, ...@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
def train(params, train_set, num_boost_round=100, def train(params, train_set, num_boost_round=100,
valid_sets=None, valid_names=None, valid_sets=None, valid_names=None,
fobj=None, feval=None, init_model=None, fobj=None, feval=None, init_model=None,
feature_name='auto', categorical_feature='auto', feature_name='auto',
early_stopping_rounds=None, evals_result=None, early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None): verbose_eval=True, learning_rates=None, callbacks=None):
""" """
...@@ -45,11 +45,6 @@ def train(params, train_set, num_boost_round=100, ...@@ -45,11 +45,6 @@ def train(params, train_set, num_boost_round=100,
feature_name : list of str, or 'auto' feature_name : list of str, or 'auto'
Feature names Feature names
If 'auto' and data is pandas DataFrame, use data columns name If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. Activates early stopping.
Requires at least one validation data and one metric Requires at least one validation data and one metric
...@@ -103,7 +98,6 @@ def train(params, train_set, num_boost_round=100, ...@@ -103,7 +98,6 @@ def train(params, train_set, num_boost_round=100,
train_set._update_params(params) train_set._update_params(params)
train_set._set_predictor(predictor) train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name) train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
is_valid_contain_train = False is_valid_contain_train = False
train_data_name = "training" train_data_name = "training"
...@@ -277,7 +271,7 @@ def _agg_cv_result(raw_results): ...@@ -277,7 +271,7 @@ def _agg_cv_result(raw_results):
def cv(params, train_set, num_boost_round=10, def cv(params, train_set, num_boost_round=10,
data_splitter=None, nfold=5, stratified=False, shuffle=True, data_splitter=None, nfold=5, stratified=False, shuffle=True,
metrics=None, fobj=None, feval=None, init_model=None, metrics=None, fobj=None, feval=None, init_model=None,
feature_name='auto', categorical_feature='auto', feature_name='auto',
early_stopping_rounds=None, fpreproc=None, early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0, verbose_eval=None, show_stdv=True, seed=0,
callbacks=None): callbacks=None):
...@@ -311,11 +305,6 @@ def cv(params, train_set, num_boost_round=10, ...@@ -311,11 +305,6 @@ def cv(params, train_set, num_boost_round=10,
feature_name : list of str, or 'auto' feature_name : list of str, or 'auto'
Feature names Feature names
If 'auto' and data is pandas DataFrame, use data columns name If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue. every <early_stopping_rounds> round(s) to continue.
...@@ -354,7 +343,6 @@ def cv(params, train_set, num_boost_round=10, ...@@ -354,7 +343,6 @@ def cv(params, train_set, num_boost_round=10,
train_set._update_params(params) train_set._update_params(params)
train_set._set_predictor(predictor) train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name) train_set.set_feature_name(feature_name)
train_set.set_categorical_feature(categorical_feature)
if metrics: if metrics:
params.setdefault('metric', []) params.setdefault('metric', [])
......
...@@ -257,12 +257,7 @@ def _to_graphviz(graph, tree_info, show_info, feature_names): ...@@ -257,12 +257,7 @@ def _to_graphviz(graph, tree_info, show_info, feature_names):
if info in {'split_gain', 'internal_value', 'internal_count'}: if info in {'split_gain', 'internal_value', 'internal_count'}:
label += '\n' + info + ':' + str(root[info]) label += '\n' + info + ':' + str(root[info])
graph.node(name, label=label) graph.node(name, label=label)
if root['decision_type'] == 'no_greater': l_dec, r_dec = '<=', '>'
l_dec, r_dec = '<=', '>'
elif root['decision_type'] == 'is':
l_dec, r_dec = 'is', "isn't"
else:
raise ValueError('Invalid decision type in tree model.')
add(root['left_child'], name, l_dec) add(root['left_child'], name, l_dec)
add(root['right_child'], name, r_dec) add(root['right_child'], name, r_dec)
else: # leaf else: # leaf
......
...@@ -284,7 +284,7 @@ class LGBMModel(LGBMModelBase): ...@@ -284,7 +284,7 @@ class LGBMModel(LGBMModelBase):
eval_init_score=None, eval_group=None, eval_init_score=None, eval_group=None,
eval_metric=None, eval_metric=None,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name='auto', categorical_feature='auto', feature_name='auto',
callbacks=None): callbacks=None):
""" """
Fit the gradient boosting model Fit the gradient boosting model
...@@ -318,11 +318,6 @@ class LGBMModel(LGBMModelBase): ...@@ -318,11 +318,6 @@ class LGBMModel(LGBMModelBase):
feature_name : list of str, or 'auto' feature_name : list of str, or 'auto'
Feature names Feature names
If 'auto' and data is pandas DataFrame, use data columns name If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions callbacks : list of callback functions
List of callback functions that are applied at each iteration. List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information. See Callbacks in Python-API.md for more information.
...@@ -406,7 +401,6 @@ class LGBMModel(LGBMModelBase): ...@@ -406,7 +401,6 @@ class LGBMModel(LGBMModelBase):
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
evals_result=evals_result, fobj=self.fobj, feval=feval, evals_result=evals_result, fobj=self.fobj, feval=feval,
verbose_eval=verbose, feature_name=feature_name, verbose_eval=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks) callbacks=callbacks)
if evals_result: if evals_result:
...@@ -514,7 +508,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase): ...@@ -514,7 +508,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score=None, eval_init_score=None,
eval_metric="l2", eval_metric="l2",
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name='auto', categorical_feature='auto', callbacks=None): feature_name='auto', callbacks=None):
super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight, super(LGBMRegressor, self).fit(X, y, sample_weight=sample_weight,
init_score=init_score, eval_set=eval_set, init_score=init_score, eval_set=eval_set,
...@@ -523,7 +517,6 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase): ...@@ -523,7 +517,6 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_metric=eval_metric, eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name, verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks) callbacks=callbacks)
return self return self
...@@ -560,7 +553,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -560,7 +553,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_init_score=None, eval_init_score=None,
eval_metric="binary_logloss", eval_metric="binary_logloss",
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name='auto', categorical_feature='auto', feature_name='auto',
callbacks=None): callbacks=None):
self._le = LGBMLabelEncoder().fit(y) self._le = LGBMLabelEncoder().fit(y)
y = self._le.transform(y) y = self._le.transform(y)
...@@ -583,7 +576,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -583,7 +576,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_metric=eval_metric, eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name, verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks) callbacks=callbacks)
return self return self
...@@ -661,7 +653,7 @@ class LGBMRanker(LGBMModel): ...@@ -661,7 +653,7 @@ class LGBMRanker(LGBMModel):
eval_init_score=None, eval_group=None, eval_init_score=None, eval_group=None,
eval_metric='ndcg', eval_at=1, eval_metric='ndcg', eval_at=1,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name='auto', categorical_feature='auto', feature_name='auto',
callbacks=None): callbacks=None):
""" """
Most arguments like common methods except following: Most arguments like common methods except following:
...@@ -692,6 +684,5 @@ class LGBMRanker(LGBMModel): ...@@ -692,6 +684,5 @@ class LGBMRanker(LGBMModel):
eval_metric=eval_metric, eval_metric=eval_metric,
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
verbose=verbose, feature_name=feature_name, verbose=verbose, feature_name=feature_name,
categorical_feature=categorical_feature,
callbacks=callbacks) callbacks=callbacks)
return self return self
#include <LightGBM/boosting.h> #include <LightGBM/boosting.h>
#include "gbdt.h" #include "gbdt.h"
#include "dart.hpp" #include "dart.hpp"
#include "goss.hpp"
namespace LightGBM { namespace LightGBM {
...@@ -31,6 +32,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename ...@@ -31,6 +32,8 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
return new GBDT(); return new GBDT();
} else if (type == std::string("dart")) { } else if (type == std::string("dart")) {
return new DART(); return new DART();
} else if (type == std::string("goss")) {
return new GOSS();
} else { } else {
return nullptr; return nullptr;
} }
...@@ -42,6 +45,10 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename ...@@ -42,6 +45,10 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename
ret.reset(new GBDT()); ret.reset(new GBDT());
} else if (type == std::string("dart")) { } else if (type == std::string("dart")) {
ret.reset(new DART()); ret.reset(new DART());
} else if (type == std::string("goss")) {
ret.reset(new GOSS());
} else {
Log::Fatal("unknow boosting type %s", type.c_str());
} }
LoadFileToBoosting(ret.get(), filename); LoadFileToBoosting(ret.get(), filename);
} else { } else {
......
...@@ -38,6 +38,11 @@ public: ...@@ -38,6 +38,11 @@ public:
random_for_drop_ = Random(gbdt_config_->drop_seed); random_for_drop_ = Random(gbdt_config_->drop_seed);
sum_weight_ = 0.0f; sum_weight_ = 0.0f;
} }
  // Forwards directly to the base GBDT implementation; DART adds no
  // extra state that needs resetting when the training data changes.
  void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
    const std::vector<const Metric*>& training_metrics) override {
    GBDT::ResetTrainingData(config, train_data, object_function, training_metrics);
  }
/*! /*!
* \brief one training iteration * \brief one training iteration
*/ */
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
#include <LightGBM/utils/common.h> #include <LightGBM/utils/common.h>
#include <LightGBM/feature.h>
#include <LightGBM/objective_function.h> #include <LightGBM/objective_function.h>
#include <LightGBM/metric.h> #include <LightGBM/metric.h>
...@@ -37,7 +36,6 @@ GBDT::GBDT() ...@@ -37,7 +36,6 @@ GBDT::GBDT()
} }
GBDT::~GBDT() { GBDT::~GBDT() {
} }
void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function, void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
...@@ -106,16 +104,6 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_ ...@@ -106,16 +104,6 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
label_idx_ = train_data->label_idx(); label_idx_ = train_data->label_idx();
// get feature names // get feature names
feature_names_ = train_data->feature_names(); feature_names_ = train_data->feature_names();
// get feature infos
feature_infos_.clear();
for (int i = 0; i < max_feature_idx_ + 1; ++i) {
int feature_idx = train_data->GetInnerFeatureIndex(i);
if (feature_idx < 0) {
feature_infos_.push_back("trival feature");
} else {
feature_infos_.push_back(train_data->FeatureAt(feature_idx)->bin_mapper()->bin_info());
}
}
} }
if ((train_data_ != train_data && train_data != nullptr) if ((train_data_ != train_data && train_data != nullptr)
...@@ -587,11 +575,6 @@ std::string GBDT::SaveModelToString(int num_iterations) const { ...@@ -587,11 +575,6 @@ std::string GBDT::SaveModelToString(int num_iterations) const {
ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl; ss << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
} }
ss << std::endl << "feature information:" << std::endl;
for (int i = 0; i < max_feature_idx_ + 1; ++i) {
ss << feature_names_[i] << "=" << feature_infos_[i] << std::endl;
}
return ss.str(); return ss.str();
} }
...@@ -651,51 +634,12 @@ bool GBDT::LoadModelFromString(const std::string& model_str) { ...@@ -651,51 +634,12 @@ bool GBDT::LoadModelFromString(const std::string& model_str) {
Log::Fatal("Wrong size of feature_names"); Log::Fatal("Wrong size of feature_names");
return false; return false;
} }
} else { }
else {
Log::Fatal("Model file doesn't contain feature names"); Log::Fatal("Model file doesn't contain feature names");
return false; return false;
} }
// returns offset, or lines.size() if not found.
auto find_string_lineno = [&lines](const std::string &str, size_t start_line)
{
size_t i = start_line;
size_t featinfo_find_pos = std::string::npos;
while (i < lines.size()) {
featinfo_find_pos = lines[i].find(str);
if (featinfo_find_pos != std::string::npos)
break;
++i;
}
return i;
};
// load feature information
{
size_t finfo_line_idx = find_string_lineno("feature information:", 0);
if (finfo_line_idx >= lines.size()) {
Log::Fatal("Model file doesn't contain feature information");
return false;
}
feature_infos_.resize(max_feature_idx_ + 1);
// search for each feature name
for (int i=0; i < max_feature_idx_ + 1; i++) {
const auto feat_name = feature_names_[i];
size_t line_idx = find_string_lineno(feat_name + "=", finfo_line_idx + 1);
if (line_idx >= lines.size()) {
Log::Fatal(("Model file doesn't contain feature information for feature " + feat_name).c_str());
return false;
}
const auto this_line = lines[line_idx];
feature_infos_[i] = this_line.substr((feat_name + "=").size());
}
}
// get tree models // get tree models
size_t i = 0; size_t i = 0;
while (i < lines.size()) { while (i < lines.size()) {
...@@ -725,7 +669,7 @@ std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const { ...@@ -725,7 +669,7 @@ std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0); std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
for (size_t iter = 0; iter < models_.size(); ++iter) { for (size_t iter = 0; iter < models_.size(); ++iter) {
for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
++feature_importances[models_[iter]->split_feature_real(split_idx)]; ++feature_importances[models_[iter]->split_feature(split_idx)];
} }
} }
// store the importance first // store the importance first
......
...@@ -329,8 +329,6 @@ protected: ...@@ -329,8 +329,6 @@ protected:
int num_init_iteration_; int num_init_iteration_;
/*! \brief Feature names */ /*! \brief Feature names */
std::vector<std::string> feature_names_; std::vector<std::string> feature_names_;
/*! \brief Feature informations */
std::vector<std::string> feature_infos_;
/*! \brief number of threads */ /*! \brief number of threads */
int num_threads_; int num_threads_;
/*! \brief Buffer for multi-threading bagging */ /*! \brief Buffer for multi-threading bagging */
......
#ifndef LIGHTGBM_BOOSTING_GOSS_H_
#define LIGHTGBM_BOOSTING_GOSS_H_
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/boosting.h>
#include "score_updater.hpp"
#include "gbdt.h"
#include <cstdio>
#include <vector>
#include <string>
#include <fstream>
#include <chrono>
namespace LightGBM {
class GOSS: public GBDT {
public:
/*!
* \brief Constructor
*/
  GOSS() : GBDT() {
  }
  /*! \brief Destructor */
  ~GOSS() {
  }
void Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) override {
GBDT::Init(config, train_data, object_function, training_metrics);
CHECK(gbdt_config_->top_rate + gbdt_config_->other_rate <= 1.0f);
CHECK(gbdt_config_->top_rate > 0.0f && gbdt_config_->other_rate > 0.0f);
if (gbdt_config_->bagging_freq > 0 && gbdt_config_->bagging_fraction != 1.0f) {
Log::Fatal("cannot used bagging in GOSS");
}
Log::Info("using GOSS");
}
void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) override {
if (config->bagging_freq > 0 && config->bagging_fraction != 1.0f) {
Log::Fatal("cannot used bagging in GOSS");
}
GBDT::ResetTrainingData(config, train_data, object_function, training_metrics);
if (train_data_ == nullptr) { return; }
bag_data_indices_.resize(num_data_);
tmp_indices_.resize(num_data_);
tmp_indice_right_.resize(num_data_);
offsets_buf_.resize(num_threads_);
left_cnts_buf_.resize(num_threads_);
right_cnts_buf_.resize(num_threads_);
left_write_pos_buf_.resize(num_threads_);
right_write_pos_buf_.resize(num_threads_);
is_use_subset_ = false;
if (config->top_rate + config->other_rate <= 0.5) {
auto bag_data_cnt = static_cast<data_size_t>((config->top_rate + config->other_rate) * num_data_);
tmp_subset_.reset(new Dataset(bag_data_cnt));
tmp_subset_->CopyFeatureMapperFrom(train_data_);
is_use_subset_ = true;
}
// flag to not bagging first
bag_data_cnt_ = num_data_;
}
data_size_t BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer, data_size_t* buffer_right) {
std::vector<score_t> tmp_gradients(cnt);
for (data_size_t i = 0; i < cnt; ++i) {
tmp_gradients[i] = std::fabs(gradients_[start + i] * hessians_[start + i]);
}
data_size_t top_k = static_cast<data_size_t>(cnt * gbdt_config_->top_rate);
data_size_t other_k = static_cast<data_size_t>(cnt * gbdt_config_->other_rate);
top_k = std::max(1, top_k);
ArrayArgs<score_t>::ArgMaxAtK(&tmp_gradients, 0, static_cast<int>(tmp_gradients.size()), top_k);
score_t threshold = tmp_gradients[top_k - 1];
score_t multiply = static_cast<score_t>(cnt - top_k) / other_k;
data_size_t cur_left_cnt = 0;
data_size_t cur_right_cnt = 0;
data_size_t big_weight_cnt = 0;
for (data_size_t i = 0; i < cnt; ++i) {
if (std::fabs(gradients_[start + i] * hessians_[start + i]) >= threshold) {
buffer[cur_left_cnt++] = start + i;
++big_weight_cnt;
} else {
data_size_t sampled = cur_left_cnt - big_weight_cnt;
data_size_t rest_need = other_k - sampled;
data_size_t rest_all = (cnt - i) - (top_k - big_weight_cnt);
double prob = (rest_need) / static_cast<double>(rest_all);
if (cur_rand.NextFloat() < prob) {
buffer[cur_left_cnt++] = start + i;
gradients_[start + i] *= multiply;
hessians_[start + i] *= multiply;
} else {
buffer_right[cur_right_cnt++] = start + i;
}
}
}
return cur_left_cnt;
}
void Bagging(int iter) override {
bag_data_cnt_ = num_data_;
// not subsample for first iterations
if (iter < static_cast<int>(1.0f / gbdt_config_->learning_rate)) { return; }
const data_size_t min_inner_size = 1000;
data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
if (inner_size < min_inner_size) { inner_size = min_inner_size; }
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
left_cnts_buf_[i] = 0;
right_cnts_buf_[i] = 0;
data_size_t cur_start = i * inner_size;
if (cur_start > num_data_) { continue; }
data_size_t cur_cnt = inner_size;
if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
Random cur_rand(gbdt_config_->bagging_seed + iter * num_threads_ + i);
data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt,
tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start);
offsets_buf_[i] = cur_start;
left_cnts_buf_[i] = cur_left_count;
right_cnts_buf_[i] = cur_cnt - cur_left_count;
}
data_size_t left_cnt = 0;
left_write_pos_buf_[0] = 0;
right_write_pos_buf_[0] = 0;
for (int i = 1; i < num_threads_; ++i) {
left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
}
left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < num_threads_; ++i) {
if (left_cnts_buf_[i] > 0) {
std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
}
if (right_cnts_buf_[i] > 0) {
std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
tmp_indice_right_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
}
}
bag_data_cnt_ = left_cnt;
// set bagging data to tree learner
if (!is_use_subset_) {
tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
} else {
// get subset
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
tree_learner_->ResetTrainingData(tmp_subset_.get());
}
}
/*!
* \brief Get Type name of this boosting object
*/
const char* SubModelName() const override { return "tree"; }
private:
std::vector<data_size_t> tmp_indice_right_;
};
} // namespace LightGBM
#endif // LIGHTGBM_BOOSTING_GOSS_H_
#ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ #ifndef LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_ #define LIGHTGBM_BOOSTING_SCORE_UPDATER_HPP_
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
#include <LightGBM/dataset.h> #include <LightGBM/dataset.h>
#include <LightGBM/tree.h> #include <LightGBM/tree.h>
......
...@@ -330,20 +330,22 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data, ...@@ -330,20 +330,22 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt); const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt);
auto sample_indices = rand.Sample(nrow, sample_cnt); auto sample_indices = rand.Sample(nrow, sample_cnt);
std::vector<std::vector<double>> sample_values(ncol); std::vector<std::vector<double>> sample_values(ncol);
std::vector<std::vector<int>> sample_idx(ncol);
for (size_t i = 0; i < sample_indices.size(); ++i) { for (size_t i = 0; i < sample_indices.size(); ++i) {
auto idx = sample_indices[i]; auto idx = sample_indices[i];
auto row = get_row_fun(static_cast<int>(idx)); auto row = get_row_fun(static_cast<int>(idx));
for (size_t j = 0; j < row.size(); ++j) { for (size_t j = 0; j < row.size(); ++j) {
if (std::fabs(row[j]) > 1e-15) { if (std::fabs(row[j]) > kEpsilon) {
sample_values[j].push_back(row[j]); sample_values[j].emplace_back(row[j]);
sample_idx[j].emplace_back(static_cast<int>(i));
} }
} }
} }
DatasetLoader loader(io_config, nullptr, 1, nullptr); DatasetLoader loader(io_config, nullptr, 1, nullptr);
ret.reset(loader.CostructFromSampleData(sample_values, sample_cnt, nrow)); ret.reset(loader.CostructFromSampleData(sample_values, sample_idx, sample_cnt, nrow));
} else { } else {
ret.reset(new Dataset(nrow)); ret.reset(new Dataset(nrow));
ret->CopyFeatureMapperFrom( ret->CreateValid(
reinterpret_cast<const Dataset*>(reference)); reinterpret_cast<const Dataset*>(reference));
} }
...@@ -382,29 +384,28 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr, ...@@ -382,29 +384,28 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr,
const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt); const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt);
auto sample_indices = rand.Sample(nrow, sample_cnt); auto sample_indices = rand.Sample(nrow, sample_cnt);
std::vector<std::vector<double>> sample_values; std::vector<std::vector<double>> sample_values;
std::vector<std::vector<int>> sample_idx;
for (size_t i = 0; i < sample_indices.size(); ++i) { for (size_t i = 0; i < sample_indices.size(); ++i) {
auto idx = sample_indices[i]; auto idx = sample_indices[i];
auto row = get_row_fun(static_cast<int>(idx)); auto row = get_row_fun(static_cast<int>(idx));
for (std::pair<int, double>& inner_data : row) { for (std::pair<int, double>& inner_data : row) {
if (static_cast<size_t>(inner_data.first) >= sample_values.size()) { if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
// if need expand feature set sample_values.resize(inner_data.first + 1);
size_t need_size = inner_data.first - sample_values.size() + 1; sample_idx.resize(inner_data.first + 1);
for (size_t j = 0; j < need_size; ++j) {
sample_values.emplace_back();
}
} }
if (std::fabs(inner_data.second) > 1e-15) { if (std::fabs(inner_data.second) > kEpsilon) {
// edit the feature value // edit the feature value
sample_values[inner_data.first].push_back(inner_data.second); sample_values[inner_data.first].emplace_back(inner_data.second);
sample_idx[inner_data.first].emplace_back(static_cast<int>(i));
} }
} }
} }
CHECK(num_col >= static_cast<int>(sample_values.size())); CHECK(num_col >= static_cast<int>(sample_values.size()));
DatasetLoader loader(io_config, nullptr, 1, nullptr); DatasetLoader loader(io_config, nullptr, 1, nullptr);
ret.reset(loader.CostructFromSampleData(sample_values, sample_cnt, nrow)); ret.reset(loader.CostructFromSampleData(sample_values, sample_idx, sample_cnt, nrow));
} else { } else {
ret.reset(new Dataset(nrow)); ret.reset(new Dataset(nrow));
ret->CopyFeatureMapperFrom( ret->CreateValid(
reinterpret_cast<const Dataset*>(reference)); reinterpret_cast<const Dataset*>(reference));
} }
...@@ -442,29 +443,33 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr, ...@@ -442,29 +443,33 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt); const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt);
auto sample_indices = rand.Sample(nrow, sample_cnt); auto sample_indices = rand.Sample(nrow, sample_cnt);
std::vector<std::vector<double>> sample_values(ncol_ptr - 1); std::vector<std::vector<double>> sample_values(ncol_ptr - 1);
std::vector<std::vector<int>> sample_idx(ncol_ptr - 1);
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) { for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i); CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
for (int j = 0; j < sample_cnt; j++) { for (int j = 0; j < sample_cnt; j++) {
auto val = col_it.Get(sample_indices[j]); auto val = col_it.Get(sample_indices[j]);
if (std::fabs(val) > kEpsilon) { if (std::fabs(val) > kEpsilon) {
sample_values[i].push_back(val); sample_values[i].emplace_back(val);
sample_idx[i].emplace_back(j);
} }
} }
} }
DatasetLoader loader(io_config, nullptr, 1, nullptr); DatasetLoader loader(io_config, nullptr, 1, nullptr);
ret.reset(loader.CostructFromSampleData(sample_values, sample_cnt, nrow)); ret.reset(loader.CostructFromSampleData(sample_values, sample_idx, sample_cnt, nrow));
} else { } else {
ret.reset(new Dataset(nrow)); ret.reset(new Dataset(nrow));
ret->CopyFeatureMapperFrom( ret->CreateValid(
reinterpret_cast<const Dataset*>(reference)); reinterpret_cast<const Dataset*>(reference));
} }
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < ncol_ptr - 1; ++i) { for (int i = 0; i < ncol_ptr - 1; ++i) {
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
int feature_idx = ret->GetInnerFeatureIndex(i); int feature_idx = ret->InnerFeatureIndex(i);
if (feature_idx < 0) { continue; } if (feature_idx < 0) { continue; }
int group = ret->Feature2Group(feature_idx);
int sub_feature = ret->Feture2SubFeature(feature_idx);
CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i); CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
int row_idx = 0; int row_idx = 0;
while (row_idx < nrow) { while (row_idx < nrow) {
...@@ -472,7 +477,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr, ...@@ -472,7 +477,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
row_idx = pair.first; row_idx = pair.first;
// no more data // no more data
if (row_idx < 0) { break; } if (row_idx < 0) { break; }
ret->FeatureAt(feature_idx)->PushData(tid, row_idx, pair.second); ret->PushOneData(tid, row_idx, group, sub_feature, pair.second);
} }
} }
ret->FinishLoad(); ret->FinishLoad();
......
...@@ -23,16 +23,10 @@ BinMapper::BinMapper(const BinMapper& other) { ...@@ -23,16 +23,10 @@ BinMapper::BinMapper(const BinMapper& other) {
num_bin_ = other.num_bin_; num_bin_ = other.num_bin_;
is_trival_ = other.is_trival_; is_trival_ = other.is_trival_;
sparse_rate_ = other.sparse_rate_; sparse_rate_ = other.sparse_rate_;
bin_type_ = other.bin_type_; bin_upper_bound_ = other.bin_upper_bound_;
if (bin_type_ == BinType::NumericalBin) {
bin_upper_bound_ = other.bin_upper_bound_;
} else {
bin_2_categorical_ = other.bin_2_categorical_;
categorical_2_bin_ = other.categorical_2_bin_;
}
min_val_ = other.min_val_; min_val_ = other.min_val_;
max_val_ = other.max_val_; max_val_ = other.max_val_;
default_bin_ = other.default_bin_;
} }
BinMapper::BinMapper(const void* memory) { BinMapper::BinMapper(const void* memory) {
...@@ -43,37 +37,48 @@ BinMapper::~BinMapper() { ...@@ -43,37 +37,48 @@ BinMapper::~BinMapper() {
} }
void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin, BinType bin_type) { bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt) {
bin_type_ = bin_type; int sum_left = 0;
std::vector<double>& ref_values = (*values); for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
size_t sample_size = total_sample_cnt; sum_left += cnt_in_bin[i];
int zero_cnt = static_cast<int>(total_sample_cnt - ref_values.size()); if (sum_left >= filter_cnt) {
return false;
} else if (total_cnt - sum_left >= filter_cnt) {
return false;
}
}
return true;
}
void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
int max_bin, int min_data_in_bin, int min_split_data) {
// limit max_bin by min_data_in_bin
std::vector<double>& raw_values = values;
int zero_cnt = static_cast<int>(total_sample_cnt - raw_values.size());
// find distinct_values first // find distinct_values first
std::vector<double> distinct_values; std::vector<double> distinct_values;
std::vector<int> counts; std::vector<int> counts;
std::sort(ref_values.begin(), ref_values.end()); std::sort(raw_values.begin(), raw_values.end());
// push zero in the front // push zero in the front
if (ref_values.empty() || (ref_values[0] > 0.0f && zero_cnt > 0)) { if (raw_values.empty() || (raw_values[0] > 0.0f && zero_cnt > 0)) {
distinct_values.push_back(0); distinct_values.push_back(0.0f);
counts.push_back(zero_cnt); counts.push_back(zero_cnt);
} }
if (!ref_values.empty()) { if (!raw_values.empty()) {
distinct_values.push_back(ref_values[0]); distinct_values.push_back(raw_values[0]);
counts.push_back(1); counts.push_back(1);
} }
for (size_t i = 1; i < ref_values.size(); ++i) { for (size_t i = 1; i < raw_values.size(); ++i) {
if (ref_values[i] != ref_values[i - 1]) { if (raw_values[i] != raw_values[i - 1]) {
if (ref_values[i - 1] == 0.0f) { if (raw_values[i - 1] < 0.0f && raw_values[i] > 0.0f) {
counts.back() += zero_cnt; distinct_values.push_back(0.0f);
} else if (ref_values[i - 1] < 0.0f && ref_values[i] > 0.0f) {
distinct_values.push_back(0);
counts.push_back(zero_cnt); counts.push_back(zero_cnt);
} }
distinct_values.push_back(ref_values[i]); distinct_values.push_back(raw_values[i]);
counts.push_back(1); counts.push_back(1);
} else { } else {
++counts.back(); ++counts.back();
...@@ -81,119 +86,106 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in ...@@ -81,119 +86,106 @@ void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, in
} }
// push zero in the back // push zero in the back
if (!ref_values.empty() && ref_values.back() < 0.0f && zero_cnt > 0) { if (!raw_values.empty() && raw_values.back() < 0.0f && zero_cnt > 0) {
distinct_values.push_back(0); distinct_values.push_back(0.0f);
counts.push_back(zero_cnt); counts.push_back(zero_cnt);
} }
min_val_ = distinct_values.front(); min_val_ = distinct_values.front();
max_val_ = distinct_values.back(); max_val_ = distinct_values.back();
std::vector<int> cnt_in_bin; std::vector<int> cnt_in_bin;
int num_values = static_cast<int>(distinct_values.size()); int num_values = static_cast<int>(distinct_values.size());
if (bin_type_ == BinType::NumericalBin) {
if (num_values <= max_bin) {
std::sort(distinct_values.begin(), distinct_values.end());
// use distinct value is enough
num_bin_ = num_values;
bin_upper_bound_ = std::vector<double>(num_values);
for (int i = 0; i < num_values - 1; ++i) {
bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
}
cnt_in_bin = counts;
bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
} else {
// mean size for one bin
double mean_bin_size = sample_size / static_cast<double>(max_bin);
int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(sample_size);
std::vector<bool> is_big_count_value(num_values, false);
for (int i = 0; i < num_values; ++i) {
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
rest_sample_cnt -= counts[i];
}
}
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0; if (num_values <= max_bin) {
lower_bounds[bin_cnt] = distinct_values[0]; // use distinct value is enough
int cur_cnt_inbin = 0; bin_upper_bound_.clear();
for (int i = 0; i < num_values - 1; ++i) { int cur_cnt_inbin = 0;
if (!is_big_count_value[i]) { for (int i = 0; i < num_values - 1; ++i) {
rest_sample_cnt -= counts[i]; cur_cnt_inbin += counts[i];
} if (cur_cnt_inbin >= min_data_in_bin) {
cur_cnt_inbin += counts[i]; bin_upper_bound_.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
// need a new bin cnt_in_bin.push_back(cur_cnt_inbin);
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size || cur_cnt_inbin = 0;
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
}
} }
++bin_cnt;
// update bin upper bound
bin_upper_bound_ = std::vector<double>(bin_cnt);
num_bin_ = bin_cnt;
for (int i = 0; i < bin_cnt - 1; ++i) {
bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
}
// last bin upper bound
bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
} }
cur_cnt_inbin += counts.back();
cnt_in_bin.push_back(cur_cnt_inbin);
bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
num_bin_ = static_cast<int>(bin_upper_bound_.size());
} else { } else {
// convert to int type first if (min_data_in_bin > 0) {
std::vector<int> distinct_values_int; max_bin = std::min(max_bin, static_cast<int>(total_sample_cnt / min_data_in_bin));
std::vector<int> counts_int; max_bin = std::max(max_bin, 1);
distinct_values_int.push_back(static_cast<int>(distinct_values[0])); }
counts_int.push_back(counts[0]); double mean_bin_size = static_cast<double>(total_sample_cnt) / max_bin;
for (size_t i = 1; i < distinct_values.size(); ++i) { if (zero_cnt > mean_bin_size) {
if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) { int non_zero_cnt = static_cast<int>(raw_values.size());
distinct_values_int.push_back(static_cast<int>(distinct_values[i])); max_bin = std::min(max_bin, 1 + static_cast<int>(non_zero_cnt / min_data_in_bin));
counts_int.push_back(counts[i]); }
} else { // mean size for one bin
counts_int.back() += counts[i]; int rest_bin_cnt = max_bin;
int rest_sample_cnt = static_cast<int>(total_sample_cnt);
std::vector<bool> is_big_count_value(num_values, false);
for (int i = 0; i < num_values; ++i) {
if (counts[i] >= mean_bin_size) {
is_big_count_value[i] = true;
--rest_bin_cnt;
rest_sample_cnt -= counts[i];
}
}
mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
int bin_cnt = 0;
lower_bounds[bin_cnt] = distinct_values[0];
int cur_cnt_inbin = 0;
for (int i = 0; i < num_values - 1; ++i) {
if (!is_big_count_value[i]) {
rest_sample_cnt -= counts[i];
}
cur_cnt_inbin += counts[i];
// need a new bin
if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
(is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
upper_bounds[bin_cnt] = distinct_values[i];
cnt_in_bin.push_back(cur_cnt_inbin);
++bin_cnt;
lower_bounds[bin_cnt] = distinct_values[i + 1];
if (bin_cnt >= max_bin - 1) { break; }
cur_cnt_inbin = 0;
if (!is_big_count_value[i]) {
--rest_bin_cnt;
mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
}
} }
} }
// sort by counts cur_cnt_inbin += counts.back();
Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true); cnt_in_bin.push_back(cur_cnt_inbin);
// will ingore the categorical of small counts ++bin_cnt;
const int cut_cnt = static_cast<int>(sample_size * 0.95f); // update bin upper bound
categorical_2_bin_.clear(); bin_upper_bound_ = std::vector<double>(bin_cnt);
bin_2_categorical_.clear(); num_bin_ = bin_cnt;
num_bin_ = 0; for (int i = 0; i < bin_cnt - 1; ++i) {
int used_cnt = 0; bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
while (used_cnt < cut_cnt || num_bin_ < max_bin ) {
bin_2_categorical_.push_back(distinct_values_int[num_bin_]);
categorical_2_bin_[distinct_values_int[num_bin_]] = static_cast<unsigned int>(num_bin_);
used_cnt += counts_int[num_bin_];
++num_bin_;
} }
cnt_in_bin = counts_int; // last bin upper bound
cnt_in_bin[0] += static_cast<int>(sample_size) - used_cnt; bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
} }
// check trival(num_bin_ == 1) feature // check trival(num_bin_ == 1) feature
if (num_bin_ <= 1) { if (num_bin_ <= 1) {
is_trival_ = true; is_trival_ = true;
default_bin_ = 0;
} else { } else {
is_trival_ = false; is_trival_ = false;
default_bin_ = ValueToBin(0);
}
if (NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data)) {
is_trival_ = true;
} }
// calculate sparse rate // calculate sparse rate
CHECK(num_bin_ <= max_bin); CHECK(num_bin_ <= max_bin);
sparse_rate_ = static_cast<double>(cnt_in_bin[GetDefaultBin()]) / static_cast<double>(sample_size); sparse_rate_ = static_cast<double>(cnt_in_bin[GetDefaultBin()]) / static_cast<double>(total_sample_cnt);
} }
...@@ -202,8 +194,9 @@ int BinMapper::SizeForSpecificBin(int bin) { ...@@ -202,8 +194,9 @@ int BinMapper::SizeForSpecificBin(int bin) {
size += sizeof(int); size += sizeof(int);
size += sizeof(bool); size += sizeof(bool);
size += sizeof(double); size += sizeof(double);
size += sizeof(BinType); size += 2 * sizeof(double);
size += bin * sizeof(double); size += bin * sizeof(double);
size += sizeof(uint32_t);
return size; return size;
} }
...@@ -214,18 +207,13 @@ void BinMapper::CopyTo(char * buffer) { ...@@ -214,18 +207,13 @@ void BinMapper::CopyTo(char * buffer) {
buffer += sizeof(is_trival_); buffer += sizeof(is_trival_);
std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_)); std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_); buffer += sizeof(sparse_rate_);
std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(&min_val_, buffer, sizeof(min_val_)); std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_); buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_)); std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_); buffer += sizeof(max_val_);
std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
if (bin_type_ == BinType::NumericalBin) { buffer += sizeof(default_bin_);
std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double)); std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
} else {
std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
}
} }
void BinMapper::CopyFrom(const char * buffer) { void BinMapper::CopyFrom(const char * buffer) {
...@@ -235,48 +223,30 @@ void BinMapper::CopyFrom(const char * buffer) { ...@@ -235,48 +223,30 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer += sizeof(is_trival_); buffer += sizeof(is_trival_);
std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_)); std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
buffer += sizeof(sparse_rate_); buffer += sizeof(sparse_rate_);
std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
buffer += sizeof(bin_type_);
std::memcpy(&min_val_, buffer, sizeof(min_val_)); std::memcpy(&min_val_, buffer, sizeof(min_val_));
buffer += sizeof(min_val_); buffer += sizeof(min_val_);
std::memcpy(&max_val_, buffer, sizeof(max_val_)); std::memcpy(&max_val_, buffer, sizeof(max_val_));
buffer += sizeof(max_val_); buffer += sizeof(max_val_);
std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
if (bin_type_ == BinType::NumericalBin) { buffer += sizeof(default_bin_);
bin_upper_bound_ = std::vector<double>(num_bin_); bin_upper_bound_ = std::vector<double>(num_bin_);
std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double)); std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
} else {
bin_2_categorical_ = std::vector<int>(num_bin_);
std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
categorical_2_bin_.clear();
for (int i = 0; i < num_bin_; ++i) {
categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
}
}
} }
void BinMapper::SaveBinaryToFile(FILE* file) const { void BinMapper::SaveBinaryToFile(FILE* file) const {
fwrite(&num_bin_, sizeof(num_bin_), 1, file); fwrite(&num_bin_, sizeof(num_bin_), 1, file);
fwrite(&is_trival_, sizeof(is_trival_), 1, file); fwrite(&is_trival_, sizeof(is_trival_), 1, file);
fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file); fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
fwrite(&bin_type_, sizeof(bin_type_), 1, file);
fwrite(&min_val_, sizeof(min_val_), 1, file); fwrite(&min_val_, sizeof(min_val_), 1, file);
fwrite(&max_val_, sizeof(max_val_), 1, file); fwrite(&max_val_, sizeof(max_val_), 1, file);
if (bin_type_ == BinType::NumericalBin) { fwrite(&default_bin_, sizeof(default_bin_), 1, file);
fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file); fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
} else {
fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
}
} }
size_t BinMapper::SizesInByte() const { size_t BinMapper::SizesInByte() const {
size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_) size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
+ sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_); + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
if (bin_type_ == BinType::NumericalBin) { ret += sizeof(double) * num_bin_;
ret += sizeof(double) * num_bin_;
} else {
ret += sizeof(int) * num_bin_;
}
return ret; return ret;
} }
...@@ -284,73 +254,46 @@ template class DenseBin<uint8_t>; ...@@ -284,73 +254,46 @@ template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>; template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>; template class DenseBin<uint32_t>;
template class DenseCategoricalBin<uint8_t>;
template class DenseCategoricalBin<uint16_t>;
template class DenseCategoricalBin<uint32_t>;
template class SparseBin<uint8_t>; template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>; template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>; template class SparseBin<uint32_t>;
template class SparseCategoricalBin<uint8_t>;
template class SparseCategoricalBin<uint16_t>;
template class SparseCategoricalBin<uint32_t>;
template class OrderedSparseBin<uint8_t>; template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>; template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>; template class OrderedSparseBin<uint32_t>;
double BinMapper::kSparseThreshold = 0.8f;
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
bool is_enable_sparse, bool* is_sparse, uint32_t default_bin, BinType bin_type) { bool is_enable_sparse, bool* is_sparse) {
// sparse threshold // sparse threshold
const double kSparseThreshold = 0.8f; if (sparse_rate >= BinMapper::kSparseThreshold && is_enable_sparse) {
if (sparse_rate >= kSparseThreshold && is_enable_sparse) {
*is_sparse = true; *is_sparse = true;
return CreateSparseBin(num_data, num_bin, default_bin, bin_type); return CreateSparseBin(num_data, num_bin);
} else { } else {
*is_sparse = false; *is_sparse = false;
return CreateDenseBin(num_data, num_bin, default_bin, bin_type); return CreateDenseBin(num_data, num_bin);
} }
} }
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, uint32_t default_bin, BinType bin_type) { Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
if (bin_type == BinType::NumericalBin) { if (num_bin <= 256) {
if (num_bin <= 255) { return new DenseBin<uint8_t>(num_data);
return new DenseBin<uint8_t>(num_data, default_bin); } else if (num_bin <= 65536) {
} else if (num_bin <= 65535) { return new DenseBin<uint16_t>(num_data);
return new DenseBin<uint16_t>(num_data, default_bin);
} else {
return new DenseBin<uint32_t>(num_data, default_bin);
}
} else { } else {
if (num_bin <= 255) { return new DenseBin<uint32_t>(num_data);
return new DenseCategoricalBin<uint8_t>(num_data, default_bin);
} else if (num_bin <= 65535) {
return new DenseCategoricalBin<uint16_t>(num_data, default_bin);
} else {
return new DenseCategoricalBin<uint32_t>(num_data, default_bin);
}
} }
} }
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin, uint32_t default_bin, BinType bin_type) { Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
if (bin_type == BinType::NumericalBin) { if (num_bin <= 256) {
if (num_bin <= 255) { return new SparseBin<uint8_t>(num_data);
return new SparseBin<uint8_t>(num_data, default_bin); } else if (num_bin <= 65536) {
} else if (num_bin <= 65535) { return new SparseBin<uint16_t>(num_data);
return new SparseBin<uint16_t>(num_data, default_bin);
} else {
return new SparseBin<uint32_t>(num_data, default_bin);
}
} else { } else {
if (num_bin <= 255) { return new SparseBin<uint32_t>(num_data);
return new SparseCategoricalBin<uint8_t>(num_data, default_bin);
} else if (num_bin <= 65535) {
return new SparseCategoricalBin<uint16_t>(num_data, default_bin);
} else {
return new SparseCategoricalBin<uint32_t>(num_data, default_bin);
}
} }
} }
......
...@@ -39,11 +39,11 @@ void OverallConfig::Set(const std::unordered_map<std::string, std::string>& para ...@@ -39,11 +39,11 @@ void OverallConfig::Set(const std::unordered_map<std::string, std::string>& para
// generate seeds by seed. // generate seeds by seed.
if (GetInt(params, "seed", &seed)) { if (GetInt(params, "seed", &seed)) {
Random rand(seed); Random rand(seed);
int int_max = std::numeric_limits<int>::max(); int int_max = std::numeric_limits<short>::max();
io_config.data_random_seed = static_cast<int>(rand.NextInt(0, int_max)); io_config.data_random_seed = static_cast<int>(rand.NextShort(0, int_max));
boosting_config.bagging_seed = static_cast<int>(rand.NextInt(0, int_max)); boosting_config.bagging_seed = static_cast<int>(rand.NextShort(0, int_max));
boosting_config.drop_seed = static_cast<int>(rand.NextInt(0, int_max)); boosting_config.drop_seed = static_cast<int>(rand.NextShort(0, int_max));
boosting_config.tree_config.feature_fraction_seed = static_cast<int>(rand.NextInt(0, int_max)); boosting_config.tree_config.feature_fraction_seed = static_cast<int>(rand.NextShort(0, int_max));
} }
GetTaskType(params); GetTaskType(params);
GetBoostingType(params); GetBoostingType(params);
...@@ -79,6 +79,8 @@ void OverallConfig::GetBoostingType(const std::unordered_map<std::string, std::s ...@@ -79,6 +79,8 @@ void OverallConfig::GetBoostingType(const std::unordered_map<std::string, std::s
boosting_type = "gbdt"; boosting_type = "gbdt";
} else if (value == std::string("dart")) { } else if (value == std::string("dart")) {
boosting_type = "dart"; boosting_type = "dart";
} else if (value == std::string("goss")) {
boosting_type = "goss";
} else { } else {
Log::Fatal("Unknown boosting type %s", value.c_str()); Log::Fatal("Unknown boosting type %s", value.c_str());
} }
...@@ -214,7 +216,11 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) { ...@@ -214,7 +216,11 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString(params, "weight_column", &weight_column); GetString(params, "weight_column", &weight_column);
GetString(params, "group_column", &group_column); GetString(params, "group_column", &group_column);
GetString(params, "ignore_column", &ignore_column); GetString(params, "ignore_column", &ignore_column);
GetString(params, "categorical_column", &categorical_column); GetInt(params, "min_data_in_leaf", &min_data_in_leaf);
GetInt(params, "min_data_in_bin", &min_data_in_bin);
GetDouble(params, "max_conflict_rate", &max_conflict_rate);
GetBool(params, "enable_bundle", &enable_bundle);
GetBool(params, "adjacent_bundle", &adjacent_bundle);
} }
...@@ -323,6 +329,8 @@ void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& par ...@@ -323,6 +329,8 @@ void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& par
GetInt(params, "max_drop", &max_drop); GetInt(params, "max_drop", &max_drop);
GetBool(params, "xgboost_dart_mode", &xgboost_dart_mode); GetBool(params, "xgboost_dart_mode", &xgboost_dart_mode);
GetBool(params, "uniform_drop", &uniform_drop); GetBool(params, "uniform_drop", &uniform_drop);
GetDouble(params, "top_rate", &top_rate);
GetDouble(params, "other_rate", &other_rate);
CHECK(drop_rate <= 1.0 && drop_rate >= 0.0); CHECK(drop_rate <= 1.0 && drop_rate >= 0.0);
CHECK(skip_drop <= 1.0 && skip_drop >= 0.0); CHECK(skip_drop <= 1.0 && skip_drop >= 0.0);
GetTreeLearnerType(params); GetTreeLearnerType(params);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment