Commit eade219e authored by Qiwei Ye's avatar Qiwei Ye
Browse files

merge conflict

parents f23e6083 060bd316
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
#include <LightGBM/config.h> #include <LightGBM/config.h>
#include <LightGBM/feature.h> #include <LightGBM/feature_group.h>
#include <vector> #include <vector>
#include <utility> #include <utility>
...@@ -19,7 +19,6 @@ namespace LightGBM { ...@@ -19,7 +19,6 @@ namespace LightGBM {
/*! \brief forward declaration */ /*! \brief forward declaration */
class DatasetLoader; class DatasetLoader;
/*! /*!
* \brief This class is used to store some meta(non-feature) data for training data, * \brief This class is used to store some meta(non-feature) data for training data,
 * e.g. labels, weights, initial scores, query level information. * e.g. labels, weights, initial scores, query level information.
...@@ -88,8 +87,6 @@ public: ...@@ -88,8 +87,6 @@ public:
void SetQuery(const data_size_t* query, data_size_t len); void SetQuery(const data_size_t* query, data_size_t len);
void SetQueryId(const data_size_t* query_id, data_size_t len);
/*! /*!
* \brief Set initial scores * \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score. * \param init_score Initial scores, this class will manage memory for init_score.
...@@ -175,7 +172,7 @@ public: ...@@ -175,7 +172,7 @@ public:
* \brief Get Number of queries * \brief Get Number of queries
* \return Number of queries * \return Number of queries
*/ */
inline const data_size_t num_queries() const { return num_queries_; } inline data_size_t num_queries() const { return num_queries_; }
/*! /*!
* \brief Get weights for queries, if not exists, will return nullptr * \brief Get weights for queries, if not exists, will return nullptr
...@@ -244,6 +241,9 @@ private: ...@@ -244,6 +241,9 @@ private:
std::vector<data_size_t> queries_; std::vector<data_size_t> queries_;
/*! \brief mutex for threading safe call */ /*! \brief mutex for threading safe call */
std::mutex mutex_; std::mutex mutex_;
bool weight_load_from_file_;
bool query_load_from_file_;
bool init_score_load_from_file_;
}; };
...@@ -280,14 +280,20 @@ class Dataset { ...@@ -280,14 +280,20 @@ class Dataset {
public: public:
friend DatasetLoader; friend DatasetLoader;
Dataset(); LIGHTGBM_EXPORT Dataset();
Dataset(data_size_t num_data); LIGHTGBM_EXPORT Dataset(data_size_t num_data);
void Construct(
std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
const std::vector<std::vector<int>>& sample_non_zero_indices,
size_t total_sample_cnt,
const IOConfig& io_config);
/*! \brief Destructor */ /*! \brief Destructor */
~Dataset(); LIGHTGBM_EXPORT ~Dataset();
bool CheckAlign(const Dataset& other) const { LIGHTGBM_EXPORT bool CheckAlign(const Dataset& other) const {
if (num_features_ != other.num_features_) { if (num_features_ != other.num_features_) {
return false; return false;
} }
...@@ -298,7 +304,7 @@ public: ...@@ -298,7 +304,7 @@ public:
return false; return false;
} }
for (int i = 0; i < num_features_; ++i) { for (int i = 0; i < num_features_; ++i) {
if (!features_[i]->CheckAlign(*(other.features_[i].get()))) { if (!FeatureBinMapper(i)->CheckAlign(*(other.FeatureBinMapper(i)))) {
return false; return false;
} }
} }
...@@ -306,57 +312,140 @@ public: ...@@ -306,57 +312,140 @@ public:
} }
inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) { inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
if (is_finish_load_) { return; }
for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) { for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
int feature_idx = used_feature_map_[i]; int feature_idx = used_feature_map_[i];
if (feature_idx >= 0) { if (feature_idx >= 0) {
features_[feature_idx]->PushData(tid, row_idx, feature_values[i]); const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
feature_groups_[group]->PushData(tid, sub_feature, row_idx, feature_values[i]);
} }
} }
} }
inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<std::pair<int, double>>& feature_values) { inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<std::pair<int, double>>& feature_values) {
if (is_finish_load_) { return; }
for (auto& inner_data : feature_values) { for (auto& inner_data : feature_values) {
if (inner_data.first >= num_total_features_) { continue; } if (inner_data.first >= num_total_features_) { continue; }
int feature_idx = used_feature_map_[inner_data.first]; int feature_idx = used_feature_map_[inner_data.first];
if (feature_idx >= 0) { if (feature_idx >= 0) {
features_[feature_idx]->PushData(tid, row_idx, inner_data.second); const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
feature_groups_[group]->PushData(tid, sub_feature, row_idx, inner_data.second);
}
} }
} }
inline void PushOneData(int tid, data_size_t row_idx, int group, int sub_feature, double value) {
feature_groups_[group]->PushData(tid, sub_feature, row_idx, value);
}
inline int RealFeatureIndex(int fidx) const {
return real_feature_idx_[fidx];
} }
inline int GetInnerFeatureIndex(int col_idx) const { inline int InnerFeatureIndex(int col_idx) const {
return used_feature_map_[col_idx]; return used_feature_map_[col_idx];
} }
inline int Feature2Group(int feature_idx) const {
return feature2group_[feature_idx];
}
inline int Feture2SubFeature(int feature_idx) const {
return feature2subfeature_[feature_idx];
}
inline uint64_t NumTotalBin() const {
return group_bin_boundaries_.back();
}
Dataset* Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const; void ReSize(data_size_t num_data);
void FinishLoad(); void CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data);
bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element); LIGHTGBM_EXPORT void FinishLoad();
bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element); LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element); LIGHTGBM_EXPORT bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element);
bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr); LIGHTGBM_EXPORT bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element);
bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr); LIGHTGBM_EXPORT bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr);
bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr); LIGHTGBM_EXPORT bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr);
LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr);
/*! /*!
* \brief Save current dataset into binary file, will save to "filename.bin" * \brief Save current dataset into binary file, will save to "filename.bin"
*/ */
void SaveBinaryFile(const char* bin_filename); LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename);
LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset);
LIGHTGBM_EXPORT void CreateValid(const Dataset* dataset);
void ConstructHistograms(
const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data,
int leaf_idx,
std::vector<std::unique_ptr<OrderedBin>>& ordered_bins,
const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
HistogramBinEntry* histogram_data) const;
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
HistogramBinEntry* data) const;
inline data_size_t Split(
int feature,
uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
const int group = feature2group_[feature];
const int sub_feature = feature2subfeature_[feature];
return feature_groups_[group]->Split(sub_feature, threshold, data_indices, num_data, lte_indices, gt_indices);
}
void CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_sparse); inline int SubFeatureBinOffset(int i) const {
const int sub_feature = feature2subfeature_[i];
if (sub_feature == 0) {
return 1;
} else {
return 0;
}
}
/*! inline int FeatureNumBin(int i) const {
* \brief Get a feature pointer for specific index const int group = feature2group_[i];
* \param i Index for feature const int sub_feature = feature2subfeature_[i];
* \return Pointer of feature return feature_groups_[group]->bin_mappers_[sub_feature]->num_bin();
*/ }
inline Feature* FeatureAt(int i) const { return features_[i].get(); }
inline const BinMapper* FeatureBinMapper(int i) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
return feature_groups_[group]->bin_mappers_[sub_feature].get();
}
inline BinIterator* FeatureIterator(int i) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
return feature_groups_[group]->SubFeatureIterator(sub_feature);
}
inline double RealThreshold(int i, uint32_t threshold) const {
const int group = feature2group_[i];
const int sub_feature = feature2subfeature_[i];
return feature_groups_[group]->bin_mappers_[sub_feature]->BinToValue(threshold);
}
inline void CreateOrderedBins(std::vector<std::unique_ptr<OrderedBin>>* ordered_bins) const {
ordered_bins->resize(num_groups_);
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_groups_; ++i) {
ordered_bins->at(i).reset(feature_groups_[i]->bin_data_->CreateOrderedBin());
}
}
/*! /*!
* \brief Get meta data pointer * \brief Get meta data pointer
...@@ -384,6 +473,20 @@ public: ...@@ -384,6 +473,20 @@ public:
feature_names_ = std::vector<std::string>(feature_names); feature_names_ = std::vector<std::string>(feature_names);
} }
inline std::vector<std::string> feature_infos() const {
std::vector<std::string> bufs;
for (int i = 0; i < num_total_features_; i++) {
int fidx = used_feature_map_[i];
if (fidx == -1) {
bufs.push_back("none");
} else {
const auto bin_mapper = FeatureBinMapper(fidx);
bufs.push_back(bin_mapper->bin_info());
}
}
return bufs;
}
/*! \brief Get Number of data */ /*! \brief Get Number of data */
inline data_size_t num_data() const { return num_data_; } inline data_size_t num_data() const { return num_data_; }
...@@ -395,7 +498,7 @@ public: ...@@ -395,7 +498,7 @@ public:
private: private:
const char* data_filename_; const char* data_filename_;
/*! \brief Store used features */ /*! \brief Store used features */
std::vector<std::unique_ptr<Feature>> features_; std::vector<std::unique_ptr<FeatureGroup>> feature_groups_;
/*! \brief Mapper from real feature index to used index*/ /*! \brief Mapper from real feature index to used index*/
std::vector<int> used_feature_map_; std::vector<int> used_feature_map_;
/*! \brief Number of used features*/ /*! \brief Number of used features*/
...@@ -412,6 +515,14 @@ private: ...@@ -412,6 +515,14 @@ private:
std::vector<std::string> feature_names_; std::vector<std::string> feature_names_;
/*! \brief store feature names */ /*! \brief store feature names */
static const char* binary_file_token; static const char* binary_file_token;
int num_groups_;
std::vector<int> real_feature_idx_;
std::vector<int> feature2group_;
std::vector<int> feature2subfeature_;
std::vector<uint64_t> group_bin_boundaries_;
std::vector<int> group_feature_start_;
std::vector<int> group_feature_cnt_;
bool is_finish_load_;
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -8,21 +8,21 @@ namespace LightGBM { ...@@ -8,21 +8,21 @@ namespace LightGBM {
class DatasetLoader { class DatasetLoader {
public: public:
DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun, int num_class, const char* filename); LIGHTGBM_EXPORT DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun, int num_class, const char* filename);
~DatasetLoader(); LIGHTGBM_EXPORT ~DatasetLoader();
Dataset* LoadFromFile(const char* filename, int rank, int num_machines); LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename, int rank, int num_machines);
Dataset* LoadFromFile(const char* filename) { LIGHTGBM_EXPORT Dataset* LoadFromFile(const char* filename) {
return LoadFromFile(filename, 0, 1); return LoadFromFile(filename, 0, 1);
} }
Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data); LIGHTGBM_EXPORT Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines); LIGHTGBM_EXPORT Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values,
std::vector<std::vector<int>>& sample_indices,
Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data); size_t total_sample_size, data_size_t num_data);
/*! \brief Disable copy */ /*! \brief Disable copy */
DatasetLoader& operator=(const DatasetLoader&) = delete; DatasetLoader& operator=(const DatasetLoader&) = delete;
...@@ -31,6 +31,8 @@ public: ...@@ -31,6 +31,8 @@ public:
private: private:
Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
void SetHeader(const char* filename); void SetHeader(const char* filename);
void CheckDataset(const Dataset* dataset); void CheckDataset(const Dataset* dataset);
...@@ -71,7 +73,6 @@ private: ...@@ -71,7 +73,6 @@ private:
std::vector<std::string> feature_names_; std::vector<std::string> feature_names_;
/*! \brief Mapper from real feature index to used index*/ /*! \brief Mapper from real feature index to used index*/
std::unordered_set<int> categorical_features_; std::unordered_set<int> categorical_features_;
}; };
} }
......
#ifndef LIGHTGBM_EXPORT_H_
#define LIGHTGBM_EXPORT_H_
/** Macros for exporting symbols in MSVC/GCC/CLANG **/
/* LIGHTGBM_EXTERN_C gives a declaration C linkage when this header is
 * compiled as C++, so the symbol can be resolved from C callers; it
 * expands to nothing when compiled as plain C. */
#ifdef __cplusplus
#define LIGHTGBM_EXTERN_C extern "C"
#else
#define LIGHTGBM_EXTERN_C
#endif
#ifdef _MSC_VER
/* MSVC: symbols are hidden from a DLL unless explicitly exported. */
#define LIGHTGBM_EXPORT __declspec(dllexport)
#define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C __declspec(dllexport)
#else
/* GCC/Clang export symbols by default, so no decoration is needed. */
#define LIGHTGBM_EXPORT
#define LIGHTGBM_C_EXPORT LIGHTGBM_EXTERN_C
#endif
#endif /** LIGHTGBM_EXPORT_H_ **/
#ifndef LIGHTGBM_FEATURE_H_ #ifndef LIGHTGBM_FEATURE_GROUP_H_
#define LIGHTGBM_FEATURE_H_ #define LIGHTGBM_FEATURE_GROUP_H_
#include <LightGBM/utils/random.h> #include <LightGBM/utils/random.h>
...@@ -12,22 +12,41 @@ ...@@ -12,22 +12,41 @@
namespace LightGBM { namespace LightGBM {
/*! \brief Using to store data and providing some operations on one feature*/ class Dataset;
class Feature { class DatasetLoader;
/*! \brief Using to store data and providing some operations on one feature group*/
class FeatureGroup {
public: public:
friend Dataset;
friend DatasetLoader;
/*! /*!
* \brief Constructor * \brief Constructor
* \param feature_idx Index of this feature * \param num_feature number of features of this group
* \param bin_mapper Bin mapper for this feature * \param bin_mappers Bin mapper for features
* \param num_data Total number of data * \param num_data Total number of data
* \param is_enable_sparse True if enable sparse feature * \param is_enable_sparse True if enable sparse feature
*/ */
Feature(int feature_idx, BinMapper* bin_mapper, FeatureGroup(int num_feature,
data_size_t num_data, bool is_enable_sparse) std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
:bin_mapper_(bin_mapper) { data_size_t num_data, bool is_enable_sparse) : num_feature_(num_feature) {
feature_index_ = feature_idx; CHECK(static_cast<int>(bin_mappers.size()) == num_feature);
bin_data_.reset(Bin::CreateBin(num_data, bin_mapper_->num_bin(), // use bin at zero to store default_bin
bin_mapper_->sparse_rate(), is_enable_sparse, &is_sparse_, bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type())); num_total_bin_ = 1;
bin_offsets_.emplace_back(num_total_bin_);
int cnt_non_zero = 0;
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(bin_mappers[i].release());
auto num_bin = bin_mappers_[i]->num_bin();
if (bin_mappers_[i]->GetDefaultBin() == 0) {
num_bin -= 1;
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers_[i]->sparse_rate()));
}
double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
bin_data_.reset(Bin::CreateBin(num_data, num_total_bin_,
sparse_rate, is_enable_sparse, &is_sparse_));
} }
/*! /*!
* \brief Constructor from memory * \brief Constructor from memory
...@@ -35,39 +54,44 @@ public: ...@@ -35,39 +54,44 @@ public:
* \param num_all_data Number of global data * \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data * \param local_used_indices Local used indices, empty means using all data
*/ */
Feature(const void* memory, data_size_t num_all_data, FeatureGroup(const void* memory, data_size_t num_all_data,
const std::vector<data_size_t>& local_used_indices) { const std::vector<data_size_t>& local_used_indices) {
const char* memory_ptr = reinterpret_cast<const char*>(memory); const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get feature index
feature_index_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += sizeof(feature_index_);
// get is_sparse // get is_sparse
is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr)); is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_sparse_); memory_ptr += sizeof(is_sparse_);
num_feature_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += sizeof(num_feature_);
// get bin mapper // get bin mapper
bin_mapper_.reset(new BinMapper(memory_ptr)); bin_mappers_.clear();
memory_ptr += bin_mapper_->SizesInByte(); bin_offsets_.clear();
// start from 1, due to need to store zero bin in this slot
num_total_bin_ = 1;
bin_offsets_.emplace_back(num_total_bin_);
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_.emplace_back(new BinMapper(memory_ptr));
auto num_bin = bin_mappers_[i]->num_bin();
if (bin_mappers_[i]->GetDefaultBin() == 0) {
num_bin -= 1;
}
num_total_bin_ += num_bin;
bin_offsets_.emplace_back(num_total_bin_);
memory_ptr += bin_mappers_[i]->SizesInByte();
}
data_size_t num_data = num_all_data; data_size_t num_data = num_all_data;
if (!local_used_indices.empty()) { if (!local_used_indices.empty()) {
num_data = static_cast<data_size_t>(local_used_indices.size()); num_data = static_cast<data_size_t>(local_used_indices.size());
} }
if (is_sparse_) { if (is_sparse_) {
bin_data_.reset(Bin::CreateSparseBin(num_data, bin_mapper_->num_bin(), bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type())); bin_data_.reset(Bin::CreateSparseBin(num_data, num_total_bin_));
} else { } else {
bin_data_.reset(Bin::CreateDenseBin(num_data, bin_mapper_->num_bin(), bin_mapper_->GetDefaultBin(), bin_mapper_->bin_type())); bin_data_.reset(Bin::CreateDenseBin(num_data, num_total_bin_));
} }
// get bin data // get bin data
bin_data_->LoadFromMemory(memory_ptr, local_used_indices); bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
} }
/*! \brief Destructor */ /*! \brief Destructor */
~Feature() { ~FeatureGroup() {
}
bool CheckAlign(const Feature& other) const {
if (feature_index_ != other.feature_index_) {
return false;
}
return bin_mapper_->CheckAlign(*(other.bin_mapper_.get()));
} }
/*! /*!
...@@ -76,66 +100,91 @@ public: ...@@ -76,66 +100,91 @@ public:
* \param idx Index of record * \param idx Index of record
* \param value feature value of record * \param value feature value of record
*/ */
inline void PushData(int tid, data_size_t line_idx, double value) { inline void PushData(int tid, int sub_feature_idx, data_size_t line_idx, double value) {
unsigned int bin = bin_mapper_->ValueToBin(value); uint32_t bin = bin_mappers_[sub_feature_idx]->ValueToBin(value);
bin_data_->Push(tid, line_idx, bin); if (bin == bin_mappers_[sub_feature_idx]->GetDefaultBin()) { return; }
bin += bin_offsets_[sub_feature_idx];
if (bin_mappers_[sub_feature_idx]->GetDefaultBin() == 0) {
bin -= 1;
} }
inline void PushBin(int tid, data_size_t line_idx, unsigned int bin) {
bin_data_->Push(tid, line_idx, bin); bin_data_->Push(tid, line_idx, bin);
} }
inline void FinishLoad() { bin_data_->FinishLoad(); }
/*! \brief Index of this feature */
inline int feature_index() const { return feature_index_; }
/*! \brief Bin mapper that this feature used */
inline const BinMapper* bin_mapper() const { return bin_mapper_.get(); }
/*! \brief Number of bin of this feature */
inline int num_bin() const { return bin_mapper_->num_bin(); }
inline BinType bin_type() const { return bin_mapper_->bin_type(); } inline void CopySubset(const FeatureGroup* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
/*! \brief Get bin data of this feature */ bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
inline const Bin* bin_data() const { return bin_data_.get(); } }
inline BinIterator* SubFeatureIterator(int sub_feature) {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->GetIterator(min_bin, max_bin, default_bin);
}
inline data_size_t Split(
int sub_feature,
uint32_t threshold,
data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const {
uint32_t min_bin = bin_offsets_[sub_feature];
uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
return bin_data_->Split(min_bin, max_bin, default_bin,
threshold, data_indices, num_data, lte_indices, gt_indices, bin_mappers_[sub_feature]->bin_type());
}
/*! /*!
* \brief From bin to feature value * \brief From bin to feature value
* \param bin * \param bin
 * \return Feature value of this bin * \return Feature value of this bin
*/ */
inline double BinToValue(unsigned int bin) inline double BinToValue(int sub_feature_idx, uint32_t bin) const {
const { return bin_mapper_->BinToValue(bin); } return bin_mappers_[sub_feature_idx]->BinToValue(bin);
}
/*! /*!
* \brief Save binary data to file * \brief Save binary data to file
* \param file File want to write * \param file File want to write
*/ */
void SaveBinaryToFile(FILE* file) const { void SaveBinaryToFile(FILE* file) const {
fwrite(&feature_index_, sizeof(feature_index_), 1, file);
fwrite(&is_sparse_, sizeof(is_sparse_), 1, file); fwrite(&is_sparse_, sizeof(is_sparse_), 1, file);
bin_mapper_->SaveBinaryToFile(file); fwrite(&num_feature_, sizeof(num_feature_), 1, file);
for (int i = 0; i < num_feature_; ++i) {
bin_mappers_[i]->SaveBinaryToFile(file);
}
bin_data_->SaveBinaryToFile(file); bin_data_->SaveBinaryToFile(file);
} }
/*! /*!
* \brief Get sizes in byte of this object * \brief Get sizes in byte of this object
*/ */
size_t SizesInByte() const { size_t SizesInByte() const {
return sizeof(feature_index_) + sizeof(is_sparse_) + size_t ret = sizeof(is_sparse_) + sizeof(num_feature_);
bin_mapper_->SizesInByte() + bin_data_->SizesInByte(); for (int i = 0; i < num_feature_; ++i) {
ret += bin_mappers_[i]->SizesInByte();
}
ret += bin_data_->SizesInByte();
return ret;
} }
/*! \brief Disable copy */ /*! \brief Disable copy */
Feature& operator=(const Feature&) = delete; FeatureGroup& operator=(const FeatureGroup&) = delete;
/*! \brief Disable copy */ /*! \brief Disable copy */
Feature(const Feature&) = delete; FeatureGroup(const FeatureGroup&) = delete;
private: private:
/*! \brief Index of this feature */ /*! \brief Number of features */
int feature_index_; int num_feature_;
/*! \brief Bin mapper that this feature used */ /*! \brief Bin mapper for sub features */
std::unique_ptr<BinMapper> bin_mapper_; std::vector<std::unique_ptr<BinMapper>> bin_mappers_;
/*! \brief Bin offsets for sub features */
std::vector<uint32_t> bin_offsets_;
/*! \brief Bin data of this feature */ /*! \brief Bin data of this feature */
std::unique_ptr<Bin> bin_data_; std::unique_ptr<Bin> bin_data_;
/*! \brief True if this feature is sparse */ /*! \brief True if this feature is sparse */
bool is_sparse_; bool is_sparse_;
int num_total_bin_;
}; };
} // namespace LightGBM } // namespace LightGBM
#endif // LightGBM_FEATURE_H_ #endif // LIGHTGBM_FEATURE_GROUP_H_
...@@ -46,7 +46,7 @@ public: ...@@ -46,7 +46,7 @@ public:
* \param type Specific type of metric * \param type Specific type of metric
* \param config Config for metric * \param config Config for metric
*/ */
static Metric* CreateMetric(const std::string& type, const MetricConfig& config); LIGHTGBM_EXPORT static Metric* CreateMetric(const std::string& type, const MetricConfig& config);
}; };
......
...@@ -41,7 +41,7 @@ public: ...@@ -41,7 +41,7 @@ public:
* When number of machines is not power of 2, need group machines into power of 2 group. * When number of machines is not power of 2, need group machines into power of 2 group.
* And we can let each group has at most 2 machines. * And we can let each group has at most 2 machines.
* if the group only has 1 machine. this machine is the normal node * if the group only has 1 machine. this machine is the normal node
* if the grou has 2 machines, this group will have two type of nodes, one is the leader. * if the group has 2 machines, this group will have two type of nodes, one is the leader.
* leader will represent this group and communication with others. * leader will represent this group and communication with others.
*/ */
enum RecursiveHalvingNodeType { enum RecursiveHalvingNodeType {
......
...@@ -44,7 +44,7 @@ public: ...@@ -44,7 +44,7 @@ public:
* \param type Specific type of objective function * \param type Specific type of objective function
* \param config Config for objective function * \param config Config for objective function
*/ */
static ObjectiveFunction* CreateObjectiveFunction(const std::string& type, LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& type,
const ObjectiveConfig& config); const ObjectiveConfig& config);
}; };
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
#define LIGHTGBM_TREE_H_ #define LIGHTGBM_TREE_H_
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
#include <LightGBM/feature.h>
#include <LightGBM/dataset.h> #include <LightGBM/dataset.h>
#include <string> #include <string>
...@@ -46,7 +45,7 @@ public: ...@@ -46,7 +45,7 @@ public:
* \param gain Split gain * \param gain Split gain
* \return The index of new leaf. * \return The index of new leaf.
*/ */
int Split(int leaf, int feature, BinType bin_type, unsigned int threshold, int real_feature, int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
double threshold_double, double left_value, double threshold_double, double left_value,
double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain); double right_value, data_size_t left_cnt, data_size_t right_cnt, double gain);
...@@ -64,7 +63,8 @@ public: ...@@ -64,7 +63,8 @@ public:
* \param num_data Number of total data * \param num_data Number of total data
* \param score Will add prediction to score * \param score Will add prediction to score
*/ */
void AddPredictionToScore(const Dataset* data, data_size_t num_data, void AddPredictionToScore(const Dataset* data,
data_size_t num_data,
double* score) const; double* score) const;
/*! /*!
...@@ -93,7 +93,7 @@ public: ...@@ -93,7 +93,7 @@ public:
inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; } inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }
/*! \brief Get feature of specific split*/ /*! \brief Get feature of specific split*/
inline int split_feature_real(int split_idx) const { return split_feature_real_[split_idx]; } inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
/*! /*!
* \brief Shrinkage for the tree's output * \brief Shrinkage for the tree's output
...@@ -101,9 +101,11 @@ public: ...@@ -101,9 +101,11 @@ public:
* \param rate The factor of shrinkage * \param rate The factor of shrinkage
*/ */
inline void Shrinkage(double rate) { inline void Shrinkage(double rate) {
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_leaves_; ++i) { for (int i = 0; i < num_leaves_; ++i) {
leaf_value_[i] = leaf_value_[i] * rate; leaf_value_[i] *= rate;
} }
shrinkage_ *= rate;
} }
/*! \brief Serialize this object to string*/ /*! \brief Serialize this object to string*/
...@@ -138,18 +140,10 @@ public: ...@@ -138,18 +140,10 @@ public:
} }
} }
static std::vector<std::function<bool(unsigned int, unsigned int)>> inner_decision_funs; static std::vector<bool(*)(uint32_t, uint32_t)> inner_decision_funs;
static std::vector<std::function<bool(double, double)>> decision_funs; static std::vector<bool(*)(double, double)> decision_funs;
private: private:
/*!
* \brief Find leaf index of which record belongs by data
* \param data The dataset
* \param data_idx Index of record
* \return Leaf index
*/
inline int GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const;
/*! /*!
* \brief Find leaf index of which record belongs by features * \brief Find leaf index of which record belongs by features
...@@ -171,11 +165,11 @@ private: ...@@ -171,11 +165,11 @@ private:
/*! \brief A non-leaf node's right child */ /*! \brief A non-leaf node's right child */
std::vector<int> right_child_; std::vector<int> right_child_;
/*! \brief A non-leaf node's split feature */ /*! \brief A non-leaf node's split feature */
std::vector<int> split_feature_; std::vector<int> split_feature_inner;
/*! \brief A non-leaf node's split feature, the original index */ /*! \brief A non-leaf node's split feature, the original index */
std::vector<int> split_feature_real_; std::vector<int> split_feature_;
/*! \brief A non-leaf node's split threshold in bin */ /*! \brief A non-leaf node's split threshold in bin */
std::vector<unsigned int> threshold_in_bin_; std::vector<uint32_t> threshold_in_bin_;
/*! \brief A non-leaf node's split threshold in feature value */ /*! \brief A non-leaf node's split threshold in feature value */
std::vector<double> threshold_; std::vector<double> threshold_;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */ /*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
...@@ -195,39 +189,34 @@ private: ...@@ -195,39 +189,34 @@ private:
std::vector<data_size_t> internal_count_; std::vector<data_size_t> internal_count_;
/*! \brief Depth for leaves */ /*! \brief Depth for leaves */
std::vector<int> leaf_depth_; std::vector<int> leaf_depth_;
double shrinkage_;
bool has_categorical_;
}; };
inline double Tree::Predict(const double* feature_values) const { inline double Tree::Predict(const double* feature_values) const {
if (num_leaves_ > 1) {
int leaf = GetLeaf(feature_values); int leaf = GetLeaf(feature_values);
return LeafOutput(leaf); return LeafOutput(leaf);
} else {
return 0.0f;
}
} }
inline int Tree::PredictLeafIndex(const double* feature_values) const { inline int Tree::PredictLeafIndex(const double* feature_values) const {
if (num_leaves_ > 1) {
int leaf = GetLeaf(feature_values); int leaf = GetLeaf(feature_values);
return leaf; return leaf;
}
inline int Tree::GetLeaf(const std::vector<std::unique_ptr<BinIterator>>& iterators,
data_size_t data_idx) const {
int node = 0;
while (node >= 0) {
if (inner_decision_funs[decision_type_[node]](
iterators[split_feature_[node]]->Get(data_idx),
threshold_in_bin_[node])) {
node = left_child_[node];
} else { } else {
node = right_child_[node]; return 0;
} }
}
return ~node;
} }
inline int Tree::GetLeaf(const double* feature_values) const { inline int Tree::GetLeaf(const double* feature_values) const {
int node = 0; int node = 0;
while (node >= 0) { while (node >= 0) {
if (decision_funs[decision_type_[node]]( if (decision_funs[decision_type_[node]](
feature_values[split_feature_real_[node]], feature_values[split_feature_[node]],
threshold_[node])) { threshold_[node])) {
node = left_child_[node]; node = left_child_[node];
} else { } else {
......
...@@ -27,6 +27,8 @@ public: ...@@ -27,6 +27,8 @@ public:
*/ */
virtual void Init(const Dataset* train_data) = 0; virtual void Init(const Dataset* train_data) = 0;
virtual void ResetTrainingData(const Dataset* train_data) = 0;
/*! /*!
* \brief Reset tree configs * \brief Reset tree configs
* \param tree_config config of tree * \param tree_config config of tree
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
#include <LightGBM/utils/openmp_wrapper.h>
namespace LightGBM { namespace LightGBM {
...@@ -12,88 +13,136 @@ namespace LightGBM { ...@@ -12,88 +13,136 @@ namespace LightGBM {
template<typename VAL_T> template<typename VAL_T>
class ArrayArgs { class ArrayArgs {
public: public:
inline static size_t ArgMaxMT(const std::vector<VAL_T>& array) {
int num_threads = 1;
#pragma omp parallel
#pragma omp master
{
num_threads = omp_get_num_threads();
}
int step = std::max(1, (static_cast<int>(array.size()) + num_threads - 1) / num_threads);
std::vector<size_t> arg_maxs(num_threads, 0);
#pragma omp parallel for schedule(static,1)
for (int i = 0; i < num_threads; ++i) {
size_t start = step * i;
if (start >= array.size()) { continue; }
size_t end = std::min(array.size(), start + step);
size_t arg_max = start;
for (size_t j = start + 1; j < end; ++j) {
if (array[j] > array[arg_max]) {
arg_max = j;
}
}
arg_maxs[i] = arg_max;
}
size_t ret = arg_maxs[0];
for (int i = 1; i < num_threads; ++i) {
if (array[arg_maxs[i]] > array[ret]) {
ret = arg_maxs[i];
}
}
return ret;
}
inline static size_t ArgMax(const std::vector<VAL_T>& array) { inline static size_t ArgMax(const std::vector<VAL_T>& array) {
if (array.empty()) { if (array.empty()) {
return 0; return 0;
} }
size_t argMax = 0; if (array.size() > 100) {
return ArgMaxMT(array);
} else {
size_t arg_max = 0;
for (size_t i = 1; i < array.size(); ++i) { for (size_t i = 1; i < array.size(); ++i) {
if (array[i] > array[argMax]) { if (array[i] > array[arg_max]) {
argMax = i; arg_max = i;
}
} }
return arg_max;
} }
return argMax;
} }
inline static size_t ArgMin(const std::vector<VAL_T>& array) { inline static size_t ArgMin(const std::vector<VAL_T>& array) {
if (array.empty()) { if (array.empty()) {
return 0; return 0;
} }
size_t argMin = 0; size_t arg_min = 0;
for (size_t i = 1; i < array.size(); ++i) { for (size_t i = 1; i < array.size(); ++i) {
if (array[i] < array[argMin]) { if (array[i] < array[arg_min]) {
argMin = i; arg_min = i;
} }
} }
return argMin; return arg_min;
} }
inline static size_t ArgMax(const VAL_T* array, size_t n) { inline static size_t ArgMax(const VAL_T* array, size_t n) {
if (n <= 0) { if (n <= 0) {
return 0; return 0;
} }
size_t argMax = 0; size_t arg_max = 0;
for (size_t i = 1; i < n; ++i) { for (size_t i = 1; i < n; ++i) {
if (array[i] > array[argMax]) { if (array[i] > array[arg_max]) {
argMax = i; arg_max = i;
} }
} }
return argMax; return arg_max;
} }
inline static size_t ArgMin(const VAL_T* array, size_t n) { inline static size_t ArgMin(const VAL_T* array, size_t n) {
if (n <= 0) { if (n <= 0) {
return 0; return 0;
} }
size_t argMin = 0; size_t arg_min = 0;
for (size_t i = 1; i < n; ++i) { for (size_t i = 1; i < n; ++i) {
if (array[i] < array[argMin]) { if (array[i] < array[arg_min]) {
argMin = i; arg_min = i;
} }
} }
return argMin; return arg_min;
} }
inline static size_t Partition(std::vector<VAL_T>* array, size_t start, size_t end) { inline static void Partition(std::vector<VAL_T>* arr, int start, int end, int* l, int* r) {
VAL_T& pivot = (*array)[end - 1]; int i = start - 1;
size_t p_idx = start; int j = end - 1;
for (size_t i = start; i < end - 1; ++i) { int p = i;
if ((*array)[i] > pivot) { int q = j;
std::swap((*array)[p_idx], (*array)[i]); if (start >= end) {
++p_idx; return;
}
} }
std::swap((*array)[p_idx], (*array)[end - 1]); std::vector<VAL_T>& ref = *arr;
return p_idx; VAL_T v = ref[end - 1];
for (;;) {
while (ref[++i] > v);
while (v > ref[--j]) { if (j == start) { break; } }
if (i >= j) { break; }
std::swap(ref[i], ref[j]);
if (ref[i] == v) { p++; std::swap(ref[p], ref[i]); }
if (v == ref[j]) { q--; std::swap(ref[j], ref[q]); }
}
std::swap(ref[i], ref[end - 1]);
j = i - 1;
i = i + 1;
for (int k = start; k <= p; k++, j--) { std::swap(ref[k], ref[j]); }
for (int k = end - 2; k >= q; k--, i++) { std::swap(ref[i], ref[k]); }
*l = j;
*r = i;
}; };
inline static size_t ArgMaxAtK(std::vector<VAL_T>* array, size_t start, size_t end, size_t k) { inline static int ArgMaxAtK(std::vector<VAL_T>* arr, int start, int end, int k) {
if (start == end - 1) { if (start >= end - 1) {
return start; return start;
} }
size_t p_idx = Partition(array, start, end); int l = start;
if (p_idx == k) { int r = end - 1;
return p_idx; Partition(arr, start, end, &l, &r);
} if ((k > l && k < r) || l == 0 || r == end - 1) {
else if (k < p_idx) { return k;
return ArgMaxAtK(array, start, p_idx, k); } else if (k <= l) {
} return ArgMaxAtK(arr, start, l, k);
else { } else {
return ArgMaxAtK(array, p_idx + 1, end, k); return ArgMaxAtK(arr, r, end, k);
} }
} }
inline static void MaxK(const std::vector<VAL_T>& array, size_t k, std::vector<VAL_T>* out) { inline static void MaxK(const std::vector<VAL_T>& array, int k, std::vector<VAL_T>* out) {
out->clear(); out->clear();
if (k <= 0) { if (k <= 0) {
return; return;
...@@ -101,10 +150,10 @@ public: ...@@ -101,10 +150,10 @@ public:
for (auto val : array) { for (auto val : array) {
out->push_back(val); out->push_back(val);
} }
if (k >= array.size()) { if (static_cast<size_t>(k) >= array.size()) {
return; return;
} }
ArgMaxAtK(out, 0, out->size(), k - 1); ArgMaxAtK(out, 0, static_cast<int>(out->size()), k - 1);
out->erase(out->begin() + k, out->end()); out->erase(out->begin() + k, out->end());
} }
......
...@@ -150,7 +150,7 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -150,7 +150,7 @@ inline static const char* Atof(const char* p, double* out) {
frac = 0; frac = 0;
scale = 1.0; scale = 1.0;
if ((*p == 'e') || (*p == 'E')) { if ((*p == 'e') || (*p == 'E')) {
unsigned int expon; uint32_t expon;
// Get sign of exponent, if any. // Get sign of exponent, if any.
++p; ++p;
if (*p == '-') { if (*p == '-') {
...@@ -273,6 +273,9 @@ inline static std::string ArrayToString(const std::vector<T>& arr, size_t n, cha ...@@ -273,6 +273,9 @@ inline static std::string ArrayToString(const std::vector<T>& arr, size_t n, cha
template<typename T> template<typename T>
inline static std::vector<T> StringToArray(const std::string& str, char delimiter, size_t n) { inline static std::vector<T> StringToArray(const std::string& str, char delimiter, size_t n) {
if (n == 0) {
return std::vector<T>();
}
std::vector<std::string> strs = Split(str.c_str(), delimiter); std::vector<std::string> strs = Split(str.c_str(), delimiter);
if (strs.size() != n) { if (strs.size() != n) {
Log::Fatal("StringToArray error, size doesn't match."); Log::Fatal("StringToArray error, size doesn't match.");
......
...@@ -45,6 +45,10 @@ public: ...@@ -45,6 +45,10 @@ public:
GetLevel() = level; GetLevel() = level;
} }
static void ResetUseException(bool use_ex) {
UseException() = use_ex;
}
static void Debug(const char *format, ...) { static void Debug(const char *format, ...) {
va_list val; va_list val;
va_start(val, format); va_start(val, format);
...@@ -73,7 +77,13 @@ public: ...@@ -73,7 +77,13 @@ public:
vsprintf(str_buf, format, val); vsprintf(str_buf, format, val);
#endif #endif
va_end(val); va_end(val);
fprintf(stderr, "[LightGBM] [Fatal] %s\n", str_buf);
fflush(stderr);
if (UseException()) {
throw std::runtime_error(std::string(str_buf)); throw std::runtime_error(std::string(str_buf));
} else {
std::exit(-1);
}
} }
private: private:
...@@ -96,6 +106,8 @@ private: ...@@ -96,6 +106,8 @@ private:
static LogLevel& GetLevel() { static thread_local LogLevel level = LogLevel::Info; return level; } static LogLevel& GetLevel() { static thread_local LogLevel level = LogLevel::Info; return level; }
#endif #endif
static bool& UseException() { static bool use_ex = false; return use_ex; }
}; };
} // namespace LightGBM } // namespace LightGBM
......
#ifndef LIGHTGBM_OPENMP_WRAPPER_H_
#define LIGHTGBM_OPENMP_WRAPPER_H_
#ifdef _OPENMP
#include <omp.h>
#else
#ifdef _MSC_VER
#pragma warning( disable : 4068 ) // disable unknown pragma warning
#endif
#ifdef __cplusplus
extern "C" {
#endif
/** Fall here if no OPENMP support, so just
simulate a single thread running.
All #pragma omp should be ignored by the compiler **/
inline void omp_set_num_threads(int) {}
inline int omp_get_num_threads() {return 1;}
inline int omp_get_thread_num() {return 0;}
#ifdef __cplusplus
}; // extern "C"
#endif
#endif
#endif /* LIGHTGBM_OPENMP_WRAPPER_H_ */
...@@ -20,30 +20,41 @@ public: ...@@ -20,30 +20,41 @@ public:
std::random_device rd; std::random_device rd;
auto genrator = std::mt19937(rd()); auto genrator = std::mt19937(rd());
std::uniform_int_distribution<int> distribution(0, x); std::uniform_int_distribution<int> distribution(0, x);
x = static_cast<unsigned int>(distribution(genrator)); x = distribution(genrator);
} }
/*! /*!
* \brief Constructor, with specific seed * \brief Constructor, with specific seed
*/ */
Random(int seed) { Random(int seed) {
x = static_cast<unsigned int>(seed); x = seed;
} }
/*! /*!
* \brief Generate random integer * \brief Generate random integer, int16 range. [0, 65536]
* \param lower_bound lower bound
* \param upper_bound upper bound
* \return The random integer between [lower_bound, upper_bound)
*/
inline int NextShort(int lower_bound, int upper_bound) {
return (RandInt16()) % (upper_bound - lower_bound) + lower_bound;
}
/*!
* \brief Generate random integer, int32 range
* \param lower_bound lower bound * \param lower_bound lower bound
* \param upper_bound upper bound * \param upper_bound upper bound
* \return The random integer between [lower_bound, upper_bound) * \return The random integer between [lower_bound, upper_bound)
*/ */
inline int NextInt(int lower_bound, int upper_bound) { inline int NextInt(int lower_bound, int upper_bound) {
return (next()) % (upper_bound - lower_bound + 1) + lower_bound; return (RandInt32()) % (upper_bound - lower_bound) + lower_bound;
} }
/*! /*!
* \brief Generate random float data * \brief Generate random float data
* \return The random float between [0.0, 1.0) * \return The random float between [0.0, 1.0)
*/ */
inline double NextDouble() { inline float NextFloat() {
// get random float in [0,1) // get random float in [0,1)
return static_cast<double>(next() % 2047) / 2047.0f; return static_cast<float>(RandInt16()) / (32768.0f);
} }
/*! /*!
* \brief Sample K data from {0,1,...,N-1} * \brief Sample K data from {0,1,...,N-1}
...@@ -58,26 +69,24 @@ public: ...@@ -58,26 +69,24 @@ public:
} }
for (int i = 0; i < N; ++i) { for (int i = 0; i < N; ++i) {
double prob = (K - ret.size()) / static_cast<double>(N - i); double prob = (K - ret.size()) / static_cast<double>(N - i);
if (NextDouble() < prob) { if (NextFloat() < prob) {
ret.push_back(i); ret.push_back(i);
} }
} }
return ret; return ret;
} }
private: private:
unsigned next() { inline int RandInt16() {
x ^= x << 16; x = (214013 * x + 2531011);
x ^= x >> 5; return (x >> 16) & 0x7FFF;
x ^= x << 1;
auto t = x;
x = y;
y = z;
z = t ^ x ^ y;
return z;
} }
unsigned int x = 123456789;
unsigned int y = 362436069; inline int RandInt32() {
unsigned int z = 521288629; x = (214013 * x + 2531011);
return x & 0x7FFFFFF;
}
int x = 123456789;
}; };
......
#ifndef LIGHTGBM_UTILS_THREADING_H_ #ifndef LIGHTGBM_UTILS_THREADING_H_
#define LIGHTGBM_UTILS_THREADING_H_ #define LIGHTGBM_UTILS_THREADING_H_
#include <omp.h> #include <LightGBM/utils/openmp_wrapper.h>
#include <vector> #include <vector>
#include <functional> #include <functional>
......
...@@ -6,13 +6,19 @@ Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors ...@@ -6,13 +6,19 @@ Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
from __future__ import absolute_import from __future__ import absolute_import
from .basic import Dataset, Booster from .basic import Booster, Dataset
from .engine import train, cv from .callback import (early_stopping, print_evaluation, record_evaluation,
from .callback import print_evaluation, record_evaluation, reset_parameter, early_stopping reset_parameter)
from .engine import cv, train
try: try:
from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
except ImportError: except ImportError:
pass pass
try:
from .plotting import plot_importance, plot_metric, plot_tree
except ImportError:
pass
__version__ = 0.1 __version__ = 0.1
...@@ -20,4 +26,5 @@ __version__ = 0.1 ...@@ -20,4 +26,5 @@ __version__ = 0.1
__all__ = ['Dataset', 'Booster', __all__ = ['Dataset', 'Booster',
'train', 'cv', 'train', 'cv',
'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker', 'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker',
'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping'] 'print_evaluation', 'record_evaluation', 'reset_parameter', 'early_stopping',
'plot_importance', 'plot_metric', 'plot_tree']
This diff is collapsed.
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
from __future__ import absolute_import from __future__ import absolute_import
import collections import collections
from operator import gt, lt
from .compat import range_ from .compat import range_
...@@ -159,48 +160,50 @@ def early_stopping(stopping_rounds, verbose=True): ...@@ -159,48 +160,50 @@ def early_stopping(stopping_rounds, verbose=True):
callback : function callback : function
The requested callback function. The requested callback function.
""" """
factor_to_bigger_better = {} best_score = []
best_score = {} best_iter = []
best_iter = {} best_msg = []
best_msg = {} cmp_op = []
def init(env): def init(env):
"""internal function""" """internal function"""
if not env.evaluation_result_list: if not env.evaluation_result_list:
raise ValueError('For early stopping, at least one dataset or eval metric is required for evaluation') raise ValueError('For early stopping, at least one dataset and eval metric is required for evaluation')
if verbose: if verbose:
msg = "Train until valid scores didn't improve in {} rounds." msg = "Train until valid scores didn't improve in {} rounds."
print(msg.format(stopping_rounds)) print(msg.format(stopping_rounds))
for i in range_(len(env.evaluation_result_list)): for eval_ret in env.evaluation_result_list:
best_score[i] = float('-inf') best_iter.append(0)
best_iter[i] = 0
if verbose: if verbose:
best_msg[i] = "" best_msg.append(None)
factor_to_bigger_better[i] = 1.0 if env.evaluation_result_list[i][3] else -1.0 if eval_ret[3]:
best_score.append(float('-inf'))
cmp_op.append(gt)
else:
best_score.append(float('inf'))
cmp_op.append(lt)
def callback(env): def callback(env):
"""internal function""" """internal function"""
if not best_score: if not cmp_op:
init(env) init(env)
best_msg_buffer = None
for i in range_(len(env.evaluation_result_list)): for i in range_(len(env.evaluation_result_list)):
score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i] score = env.evaluation_result_list[i][2]
if score > best_score[i]: if cmp_op[i](score, best_score[i]):
best_score[i] = score best_score[i] = score
best_iter[i] = env.iteration best_iter[i] = env.iteration
if verbose: if verbose:
best_msg[i] = '[%d]\t%s' % ( if not best_msg_buffer:
env.iteration + 1, '\t'.join( best_msg_buffer = '[%d]\t%s' % (
[_format_eval_result(x) for x in env.evaluation_result_list] env.iteration + 1, '\t'.join([_format_eval_result(x) for x in env.evaluation_result_list]))
) best_msg[i] = best_msg_buffer
) elif env.iteration - best_iter[i] >= stopping_rounds:
else:
if env.iteration - best_iter[i] >= stopping_rounds:
env.model.set_attr(best_iteration=str(best_iter[i])) env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose: if verbose:
print('Early stopping, best iteration is:') print('Early stopping, best iteration is:\n' + best_msg[i])
print(best_msg[i])
raise EarlyStopException(best_iter[i]) raise EarlyStopException(best_iter[i])
callback.order = 30 callback.order = 30
return callback return callback
...@@ -6,13 +6,15 @@ from __future__ import absolute_import ...@@ -6,13 +6,15 @@ from __future__ import absolute_import
import inspect import inspect
import sys import sys
import numpy as np
is_py3 = (sys.version_info[0] == 3) is_py3 = (sys.version_info[0] == 3)
"""compatibility between python2 and python3""" """compatibility between python2 and python3"""
if is_py3: if is_py3:
string_type = str string_type = str
numeric_types = (int, float, bool) numeric_types = (int, float, bool)
integer_types = int integer_types = (int, )
range_ = range range_ = range
def argc_(func): def argc_(func):
...@@ -36,6 +38,16 @@ except (ImportError, SyntaxError): ...@@ -36,6 +38,16 @@ except (ImportError, SyntaxError):
# because of u'...' Unicode literals. # because of u'...' Unicode literals.
import json import json
def json_default_with_numpy(obj):
if isinstance(obj, (np.integer, np.floating, np.bool_)):
return obj.item()
elif isinstance(obj, np.ndarray):
return obj.tolist()
else:
return obj
"""pandas""" """pandas"""
try: try:
from pandas import Series, DataFrame from pandas import Series, DataFrame
...@@ -69,5 +81,4 @@ except ImportError: ...@@ -69,5 +81,4 @@ except ImportError:
LGBMClassifierBase = object LGBMClassifierBase = object
LGBMRegressorBase = object LGBMRegressorBase = object
LGBMLabelEncoder = None LGBMLabelEncoder = None
LGBMDeprecated = None
LGBMStratifiedKFold = None LGBMStratifiedKFold = None
...@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types, ...@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
def train(params, train_set, num_boost_round=100, def train(params, train_set, num_boost_round=100,
valid_sets=None, valid_names=None, valid_sets=None, valid_names=None,
fobj=None, feval=None, init_model=None, fobj=None, feval=None, init_model=None,
feature_name=None, categorical_feature=None, feature_name='auto', categorical_feature='auto',
early_stopping_rounds=None, evals_result=None, early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None): verbose_eval=True, learning_rates=None, callbacks=None):
""" """
...@@ -42,12 +42,14 @@ def train(params, train_set, num_boost_round=100, ...@@ -42,12 +42,14 @@ def train(params, train_set, num_boost_round=100,
Note: should return (eval_name, eval_result, is_higher_better) of list of this Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance init_model : file name of lightgbm model or 'Booster' instance
model used for continued train model used for continued train
feature_name : list of str feature_name : list of str, or 'auto'
Feature names Feature names
categorical_feature : list of str or int If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features, Categorical features,
type int represents index, type int represents index,
type str represents feature names (need to specify feature_name as well) type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. Activates early stopping.
Requires at least one validation data and one metric Requires at least one validation data and one metric
...@@ -96,7 +98,7 @@ def train(params, train_set, num_boost_round=100, ...@@ -96,7 +98,7 @@ def train(params, train_set, num_boost_round=100,
init_iteration = predictor.num_total_iteration if predictor is not None else 0 init_iteration = predictor.num_total_iteration if predictor is not None else 0
"""check dataset""" """check dataset"""
if not isinstance(train_set, Dataset): if not isinstance(train_set, Dataset):
raise TypeError("Traninig only accepts Dataset object") raise TypeError("Training only accepts Dataset object")
train_set._update_params(params) train_set._update_params(params)
train_set._set_predictor(predictor) train_set._set_predictor(predictor)
...@@ -219,28 +221,35 @@ class CVBooster(object): ...@@ -219,28 +221,35 @@ class CVBooster(object):
return handlerFunction return handlerFunction
def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=False, shuffle=True): def _make_n_folds(full_data, data_splitter, nfold, params, seed, fpreproc=None, stratified=False, shuffle=True):
""" """
Make an n-fold list of Booster from random indices. Make an n-fold list of Booster from random indices.
""" """
np.random.seed(seed) np.random.seed(seed)
if stratified: num_data = full_data.construct().num_data()
if SKLEARN_INSTALLED: if data_splitter is not None:
sfk = LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed) if not hasattr(data_splitter, 'split'):
idset = [x[1] for x in sfk.split(X=full_data.get_label(), y=full_data.get_label())] raise AttributeError("data_splitter has no method 'split'")
else: folds = data_splitter.split(np.arange(num_data))
elif stratified:
if not SKLEARN_INSTALLED:
raise LightGBMError('Scikit-learn is required for stratified cv') raise LightGBMError('Scikit-learn is required for stratified cv')
sfk = LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
folds = sfk.split(X=np.zeros(num_data), y=full_data.get_label())
else: else:
full_data.construct()
if shuffle: if shuffle:
randidx = np.random.permutation(full_data.num_data()) randidx = np.random.permutation(num_data)
kstep = int(len(randidx) / nfold) else:
idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range_(nfold)] randidx = np.arange(num_data)
kstep = int(num_data / nfold)
test_id = [randidx[i: i + kstep] for i in range_(0, num_data, kstep)]
train_id = [np.concatenate([test_id[i] for i in range_(nfold) if k != i]) for k in range_(nfold)]
folds = zip(train_id, test_id)
ret = CVBooster() ret = CVBooster()
for k in range_(nfold): for train_idx, test_idx in folds:
train_set = full_data.subset(np.concatenate([idset[i] for i in range_(nfold) if k != i])) train_set = full_data.subset(train_idx)
valid_set = full_data.subset(idset[k]) valid_set = full_data.subset(test_idx)
# run preprocessing on the data set if needed # run preprocessing on the data set if needed
if fpreproc is not None: if fpreproc is not None:
train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy()) train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy())
...@@ -265,9 +274,10 @@ def _agg_cv_result(raw_results): ...@@ -265,9 +274,10 @@ def _agg_cv_result(raw_results):
return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()] return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]
def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, def cv(params, train_set, num_boost_round=10,
shuffle=True, metrics=None, fobj=None, feval=None, init_model=None, data_splitter=None, nfold=5, stratified=False, shuffle=True,
feature_name=None, categorical_feature=None, metrics=None, fobj=None, feval=None, init_model=None,
feature_name='auto', categorical_feature='auto',
early_stopping_rounds=None, fpreproc=None, early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0, verbose_eval=None, show_stdv=True, seed=0,
callbacks=None): callbacks=None):
...@@ -282,14 +292,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, ...@@ -282,14 +292,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
Data to be trained. Data to be trained.
num_boost_round : int num_boost_round : int
Number of boosting iterations. Number of boosting iterations.
data_splitter : an instance with split(X) method
Instance with split(X) method.
nfold : int nfold : int
Number of folds in CV. Number of folds in CV.
stratified : bool stratified : bool
Perform stratified sampling. Perform stratified sampling.
shuffle: bool shuffle: bool
Whether shuffle before split data Whether shuffle before split data
folds : a KFold or StratifiedKFold instance
Sklearn KFolds or StratifiedKFolds.
metrics : string or list of strings metrics : string or list of strings
Evaluation metrics to be watched in CV. Evaluation metrics to be watched in CV.
fobj : function fobj : function
...@@ -298,11 +308,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, ...@@ -298,11 +308,14 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
Custom evaluation function. Custom evaluation function.
init_model : file name of lightgbm model or 'Booster' instance init_model : file name of lightgbm model or 'Booster' instance
model used for continued train model used for continued train
feature_name : list of str feature_name : list of str, or 'auto'
Feature names Feature names
categorical_feature : list of str or int If 'auto' and data is pandas DataFrame, use data columns name
Categorical features, type int represents index, categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well) type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue. every <early_stopping_rounds> round(s) to continue.
...@@ -351,7 +364,10 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, ...@@ -351,7 +364,10 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
params['metric'].extend(metrics) params['metric'].extend(metrics)
results = collections.defaultdict(list) results = collections.defaultdict(list)
cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified, shuffle) cvfolds = _make_n_folds(train_set, data_splitter=data_splitter,
nfold=nfold, params=params, seed=seed,
fpreproc=fpreproc, stratified=stratified,
shuffle=shuffle)
# setup callbacks # setup callbacks
if callbacks is None: if callbacks is None:
...@@ -380,7 +396,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, ...@@ -380,7 +396,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
begin_iteration=0, begin_iteration=0,
end_iteration=num_boost_round, end_iteration=num_boost_round,
evaluation_result_list=None)) evaluation_result_list=None))
cvfolds.update(fobj) cvfolds.update(fobj=fobj)
res = _agg_cv_result(cvfolds.eval_valid(feval)) res = _agg_cv_result(cvfolds.eval_valid(feval))
for _, key, mean, _, std in res: for _, key, mean, _, std in res:
results[key + '-mean'].append(mean) results[key + '-mean'].append(mean)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment