Commit dce329e5 authored by Hui Xue

Merge remote-tracking branch 'upstream/master'

# Conflicts:
#	src/io/dataset.cpp
#	src/io/ordered_sparse_bin.hpp
#	src/treelearner/leaf_splits.hpp
#	src/treelearner/serial_tree_learner.cpp
parents 0b9fe27a a6a75fe9
@@ -267,3 +267,4 @@ _Pvt_Extensions
*.out
*.app
/windows/LightGBM.VC.db
lightgbm
@@ -4,15 +4,15 @@ LightGBM, Light Gradient Boosting Machine
LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:
- Fast training speed and high efficiency
- Lower memory usage
- Better accuracy
- Parallel learning supported
- Capability of handling large-scale data
For more details, please refer to [Features](https://github.com/Microsoft/LightGBM/wiki/Features).
The [experiments](https://github.com/Microsoft/LightGBM/wiki/Experiments#comparison-experiment) on public datasets show that LightGBM outperforms other existing boosting tools in both efficiency and accuracy, with significantly lower memory consumption. Furthermore, the [experiments](https://github.com/Microsoft/LightGBM/wiki/Experiments#parallel-experiment) show that LightGBM can achieve a linear speed-up by using multiple machines for training in specific settings.
Get Started
------------
......
@@ -8,17 +8,17 @@
namespace LightGBM {
/*! \brief forward declaration */
class Dataset;
class Boosting;
class ObjectiveFunction;
class Metric;
/*!
* \brief The main entrance of LightGBM. This application has two tasks:
* Train and Predict.
* The Train task will train a new model;
* the Predict task will predict the scores of test data using an existing model,
* and save the scores to disk.
*/
class Application {
public:
@@ -32,9 +32,9 @@ public:
private:
/*!
* \brief Global sync-up by min; returns the minimal value of T across nodes
* \param local Local data
* \return Minimal value across nodes
*/
template<typename T>
T GlobalSyncUpByMin(T& local);
@@ -45,19 +45,19 @@ private:
/*! \brief Load data, including training data and validation data */
void LoadData();
/*! \brief Initialization before training */
void InitTrain();
/*! \brief Main training logic */
void Train();
/*! \brief Initialization before prediction */
void InitPredict();
/*! \brief Load model from local disk */
void LoadModel();
/*! \brief Main prediction logic */
void Predict();
/*! \brief All configs */
......
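GlobalSyncUpByMin's body is collapsed above; conceptually it is just a min-reduce across the training machines. Below is a minimal sketch of the idea, written in the same raw-byte-buffer reducer style as the SumReducer shown in bin.h further down (my illustration of the concept, not the upstream implementation):

// Hedged sketch: a min-reducer over raw byte buffers, plus a loop that
// simulates the cross-node reduction in-process. In the real code the
// reducer would be handed to the network allreduce primitive instead.
template<typename T>
void MinReducer(const char* src, char* dst, int len) {
  const int type_size = sizeof(T);
  for (int i = 0; i < len / type_size; ++i) {
    const T* s = reinterpret_cast<const T*>(src + i * type_size);
    T* d = reinterpret_cast<T*>(dst + i * type_size);
    if (*s < *d) { *d = *s; }
  }
}

int SimulateGlobalMinAcrossThreeNodes() {
  int locals[3] = {7, 3, 9};  // each node's local value
  int global = locals[0];
  for (int node = 1; node < 3; ++node) {
    MinReducer<int>(reinterpret_cast<const char*>(&locals[node]),
                    reinterpret_cast<char*>(&global), sizeof(int));
  }
  return global;  // every node would end up with 3
}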
@@ -20,7 +20,7 @@ public:
data_size_t cnt = 0;
/*!
* \brief Sum-up reduce function for histogram bins
*/
inline static void SumReducer(const char *src, char *dst, int len) {
const int type_size = sizeof(HistogramBinEntry);
@@ -42,8 +42,8 @@ public:
}
};
/*! \brief This class is used to convert feature values into bins,
* and to store some meta information for the bins */
class BinMapper {
public:
BinMapper();
@@ -53,9 +53,9 @@ public:
/*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; }
/*! \brief True if this bin is trivial (contains only one bin) */
inline bool is_trival() const { return is_trival_; }
/*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
inline double sparse_rate() const { return sparse_rate_; }
/*!
* \brief Save binary data to file
@@ -63,9 +63,9 @@ public:
*/
void SaveBinaryToFile(FILE* file) const;
/*!
* \brief Map a bin to its feature value
* \param bin
* \return Feature value of this bin
*/
inline double BinToValue(unsigned int bin) const {
return bin_upper_bound_[bin];
@@ -75,7 +75,7 @@ public:
*/
size_t SizesInByte() const;
/*!
* \brief Map a feature value to a bin
* \param value
* \return Bin for this feature value
*/
@@ -96,13 +96,13 @@ public:
static int SizeForSpecificBin(int bin);
/*!
* \brief Serialize this object into a buffer
* \param buffer The destination
*/
void CopyTo(char* buffer);
/*!
* \brief Deserialize this object from a buffer
* \param buffer The source
*/
void CopyFrom(const char* buffer);
@@ -119,12 +119,12 @@ private:
};
/*!
* \brief Interface for ordered bin data. Efficient for histogram construction, especially for sparse bins.
* There are two advantages to using ordered bins:
* 1. The data are grouped by leaf, which improves the cache hit rate.
* 2. Only the non-zero bins are stored, which speeds up histogram construction for sparse features.
* However, it brings an additional cost: the bins need to be re-ordered after every split, which is expensive for dense features.
* So ordered bins are only used for sparse features.
*/
class OrderedBin {
public:
@@ -132,11 +132,12 @@ public:
virtual ~OrderedBin() {}
/*!
* \brief Initialization logic, called before training each tree
* \param used_indices If used_indices == nullptr, all data are used; otherwise, used_indices[i] != 0 means the i-th record is used (for the bagging logic)
* \param num_leaves Number of leaves in this iteration
*/
virtual void Init(const char* used_indices, data_size_t num_leaves) = 0;
/*!
* \brief Construct histogram by using this bin
@@ -173,8 +174,8 @@ public:
/*!
* \brief Interface for bin data. This class stores the bin data for one feature.
* Unlike OrderedBin, this class stores the data in their original order.
* Note that this may cause cache misses when constructing the histogram,
* but it does not need the re-order operation, so it is still faster than OrderedBin for dense features.
*/
class Bin {
public:
@@ -218,10 +219,11 @@ public:
/*!
* \brief Construct the histogram of this feature.
* Note: ordered_gradients and ordered_hessians are used to improve the cache hit rate.
* The naive solution is to use gradients[data_indices[i]] to get the gradient of data_indices[i],
* which is not cache friendly, since the memory accesses are not contiguous.
* ordered_gradients and ordered_hessians are preprocessed: they are re-ordered by data_indices,
* so ordered_gradients[i] is aligned with data_indices[i]'s gradient (same for ordered_hessians).
* \param data_indices Used data indices in the current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients; the data_indices[i]-th record's gradient is ordered_gradients[i]
@@ -265,32 +267,34 @@ public:
* \param sparse_rate Sparse rate of this bin ( num_bin0 / num_data )
* \param is_enable_sparse True if sparse features are enabled
* \param is_sparse Will be set to true if this bin is sparse
* \param default_bin Default bin for zero values
* \return The bin data object
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, bool* is_sparse, int default_bin);
/*!
* \brief Create an object for the bin data of one feature, used for dense features
* \param num_data Total number of data
* \param num_bin Number of bins
* \param default_bin Default bin for zero values
* \return The bin data object
*/
static Bin* CreateDenseBin(data_size_t num_data, int num_bin, int default_bin);
/*!
* \brief Create an object for the bin data of one feature, used for sparse features
* \param num_data Total number of data
* \param num_bin Number of bins
* \param default_bin Default bin for zero values
* \return The bin data object
*/
static Bin* CreateSparseBin(data_size_t num_data,
int num_bin, int default_bin);
};
inline unsigned int BinMapper::ValueToBin(double value) const {
// binary search to find the bin
int l = 0;
int r = num_bin_ - 1;
while (l < r) {
......
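The binary search body is collapsed above; for reference, a sketch of how such a loop typically completes over the sorted bin_upper_bound_ array (an assumption consistent with BinToValue above, not necessarily the exact upstream code):

// Hedged sketch of the collapsed loop: find the first bin whose upper bound
// is >= value by bisecting the sorted upper-bound array.
unsigned int ValueToBinSketch(double value, const double* bin_upper_bound, int num_bin) {
  int l = 0;
  int r = num_bin - 1;
  while (l < r) {
    int m = (l + r) / 2;
    if (value <= bin_upper_bound[m]) {
      r = m;      // value falls into bin m or an earlier one
    } else {
      l = m + 1;  // value belongs to a later bin
    }
  }
  return static_cast<unsigned int>(l);
}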
@@ -23,8 +23,8 @@ public:
virtual ~Boosting() {}
/*!
* \brief Initialization logic
* \param config Configs for boosting
* \param train_data Training data
* \param object_function Training objective function
* \param training_metrics Training metrics
@@ -54,12 +54,19 @@ public:
virtual double PredictRaw(const double * feature_values) const = 0;
/*!
* \brief Prediction for one record; a sigmoid transformation is applied if needed
* \param feature_values Feature values of this record
* \return Prediction result for this record
*/
virtual double Predict(const double * feature_values) const = 0;
/*!
* \brief Prediction of leaf indices for one record
* \param feature_values Feature values of this record
* \return The predicted leaf index of each tree for this record
*/
virtual std::vector<int> PredictLeafIndex(const double * feature_values) const = 0;
/*!
* \brief Serialize models into a string
* \return String output of the trained model
......
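The new PredictLeafIndex interface returns one leaf index per tree. A common downstream use is to treat those indices as categorical features (the familiar "GBDT as feature transformer" trick); a small sketch of that consumption, with num_leaves_per_tree as an assumed bound:

#include <vector>

// Hedged usage sketch: one-hot encode the per-tree leaf indices returned by
// PredictLeafIndex() so a downstream linear model can consume them.
std::vector<int> LeafIndicesToOneHot(const std::vector<int>& leaf_indices,
                                     int num_leaves_per_tree) {
  std::vector<int> one_hot(leaf_indices.size() * num_leaves_per_tree, 0);
  for (size_t tree = 0; tree < leaf_indices.size(); ++tree) {
    one_hot[tree * num_leaves_per_tree + leaf_indices[tree]] = 1;
  }
  return one_hot;
}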
@@ -93,6 +93,8 @@ public:
std::string output_result = "LightGBM_predict_result.txt";
std::string input_model = "";
std::string input_init_score = "";
int verbosity = 1;
std::string log_file = "";
int num_model_predict = -1;
bool is_pre_partition = false;
bool is_enable_sparse = true;
@@ -120,6 +122,7 @@ public:
struct MetricConfig: public ConfigBase {
public:
virtual ~MetricConfig() {}
int early_stopping_round = 0;
int output_freq = 1;
double sigmoid = 1;
bool is_provide_training_metric = false;
@@ -134,9 +137,17 @@ struct TreeConfig: public ConfigBase {
public:
int min_data_in_leaf = 100;
double min_sum_hessian_in_leaf = 10.0f;
// should be > 1; a tree with only one leaf has nothing to learn
int num_leaves = 127;
int feature_fraction_seed = 2;
double feature_fraction = 1.0;
// max cache size (unit: MB) for the historical histograms. < 0 means no limit
double histogram_pool_size = -1;
// max depth of the tree model.
// Trees are still grown leaf-wise, but the max depth is limited to avoid over-fitting.
// The max number of leaves will be min(num_leaves, pow(2, max_depth - 1)).
// max_depth < 0 means no limit.
int max_depth = -1;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
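To make the leaf cap in the max_depth comment concrete, a quick worked example (hypothetical helper, just illustrating the arithmetic): with num_leaves = 127 and max_depth = 6, growth stops at min(127, 2^(6-1)) = 32 leaves.

#include <algorithm>
#include <cmath>

// Hedged helper mirroring the comment above: the effective leaf limit is
// min(num_leaves, 2^(max_depth - 1)); max_depth < 0 disables the depth cap.
int EffectiveMaxLeaves(int num_leaves, int max_depth) {
  if (max_depth < 0) return num_leaves;
  return std::min(num_leaves,
                  static_cast<int>(std::pow(2.0, max_depth - 1)));
}
// e.g. EffectiveMaxLeaves(127, 6) == 32, EffectiveMaxLeaves(127, -1) == 127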
@@ -155,6 +166,7 @@ public:
double bagging_fraction = 1.0;
int bagging_seed = 3;
int bagging_freq = 0;
int early_stopping_round = 0;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
@@ -189,6 +201,7 @@ public:
int num_threads = 0;
bool is_parallel = false;
bool is_parallel_find_bin = false;
bool predict_leaf_index = false;
IOConfig io_config;
BoostingType boosting_type = BoostingType::kGBDT;
BoostingConfig* boosting_config;
@@ -308,7 +321,10 @@ struct ParameterAlias {
{ "two_round", "use_two_round_loading" },
{ "mlist", "machine_list_file" },
{ "is_save_binary", "is_save_binary_file" },
{ "save_binary", "is_save_binary_file" },
{ "early_stopping_rounds", "early_stopping_round" },
{ "early_stopping", "early_stopping_round" },
{ "verbosity", "verbose" }
});
std::unordered_map<std::string, std::string> tmp_map;
for (const auto& pair : *params) {
......
@@ -22,7 +22,7 @@ class Feature;
*
* Some details:
* 1. Label, used for training.
* 2. Weights, the weights of records, optional.
* 3. Query boundaries, necessary for lambdarank.
* The documents of the i-th query are in [ query_boundaries[i], query_boundaries[i+1] ).
* 4. Query weights, calculated automatically from the weights and query_boundaries (if both exist).
@@ -36,7 +36,7 @@ public:
*/
Metadata();
/*!
* \brief Initialization; loads query-level information, since it is needed for sampling data
* \param data_filename Filename of data
* \param init_score_filename Filename of the initial scores
* \param is_int_label True if the label is of int type
......
@@ -12,7 +12,7 @@
namespace LightGBM {
/*! \brief Used to store the data of one feature and to provide some operations on it */
class Feature {
public:
/*!
@@ -27,7 +27,7 @@ public:
:bin_mapper_(bin_mapper) {
feature_index_ = feature_idx;
bin_data_ = Bin::CreateBin(num_data, bin_mapper_->num_bin(),
bin_mapper_->sparse_rate(), is_enable_sparse, &is_sparse_, bin_mapper_->ValueToBin(0));
}
/*!
* \brief Constructor from memory
@@ -52,9 +52,9 @@ public:
num_data = static_cast<data_size_t>(local_used_indices.size());
}
if (is_sparse_) {
bin_data_ = Bin::CreateSparseBin(num_data, bin_mapper_->num_bin(), bin_mapper_->ValueToBin(0));
} else {
bin_data_ = Bin::CreateDenseBin(num_data, bin_mapper_->num_bin(), bin_mapper_->ValueToBin(0));
}
// get bin data
bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
......
@@ -28,11 +28,11 @@ public:
const Metadata& metadata, data_size_t num_data) = 0;
/*!
* \brief Calculate and print the metric result, and return the loss
* \param iter Current iteration
* \param score Current prediction score
*/
virtual score_t PrintAndGetLoss(int iter, const score_t* score) const = 0;
/*!
* \brief Create object of metrics
@@ -40,6 +40,9 @@ public:
* \param config Config for metric
*/
static Metric* CreateMetric(const std::string& type, const MetricConfig& config);
bool the_bigger_the_better = false;
int early_stopping_round_ = 0;
};
/*!
@@ -55,7 +58,7 @@ public:
/*!
* \brief Calculate the DCG score at position k
* \param k The position to evaluate at
* \param label Pointer to the labels
* \param score Pointer to the scores
* \param num_data Number of data
@@ -66,7 +69,7 @@ public:
/*!
* \brief Calculate the DCG score at multiple positions
* \param ks The positions to evaluate at
* \param label Pointer to the labels
* \param score Pointer to the scores
* \param num_data Number of data
......
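The new the_bigger_the_better flag and the score_t return value of PrintAndGetLoss exist to drive the early_stopping_round option added in config.h above. A sketch of the loop a trainer could build on this interface (my illustration of the intent, not the upstream trainer code):

#include <cmath>

// Hedged sketch: stop when the validation loss has not improved for
// early_stopping_round consecutive iterations. MetricT stands for any type
// exposing PrintAndGetLoss / the_bigger_the_better as declared above.
template<typename MetricT, typename ScoreT>
int TrainWithEarlyStoppingSketch(MetricT& metric, const ScoreT* scores,
                                 int max_iter, int early_stopping_round) {
  double best_loss = INFINITY;
  int best_iter = 0;
  for (int iter = 0; iter < max_iter; ++iter) {
    // ... one boosting iteration would update "scores" here ...
    double loss = metric.PrintAndGetLoss(iter, scores);
    if (metric.the_bigger_the_better) loss = -loss;  // normalize to "smaller is better"
    if (loss < best_loss) {
      best_loss = loss;
      best_iter = iter;
    } else if (iter - best_iter >= early_stopping_round) {
      break;  // no improvement for early_stopping_round iterations
    }
  }
  return best_iter;
}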
@@ -14,7 +14,7 @@ namespace LightGBM {
/*! \brief forward declaration */
class Linkers;
/*! \brief The network structure for all_gather */
class BruckMap {
public:
/*! \brief The communication times for one all_gather operation */
@@ -98,7 +98,7 @@ public:
static inline int num_machines();
/*!
* \brief Perform all_reduce. If the data size is small, AllreduceByAllGather
* will be used; otherwise ReduceScatter followed by Allgather is called
* \param input Input data
* \param input_size The size of input data
@@ -110,7 +110,7 @@ public:
char* output, const ReduceFunction& reducer);
/*!
* \brief Perform all_reduce by using all_gather. It can be used to reduce communication time when the data is small
* \param input Input data
* \param input_size The size of input data
* \param output Output result
@@ -120,8 +120,9 @@ public:
const ReduceFunction& reducer);
/*!
* \brief Perform all_gather by using the Bruck algorithm.
* The number of communication rounds is O(log(n)), and the communication cost is O(send_size * number_machine).
* It can be used when all nodes have the same input size.
* \param input Input data
* \param send_size The size of input data
* \param output Output result
@@ -129,8 +130,9 @@ public:
static void Allgather(char* input, int send_size, char* output);
/*!
* \brief Perform all_gather by using the Bruck algorithm.
* The number of communication rounds is O(log(n)), and the communication cost is O(all_size).
* It can be used when nodes have different input sizes.
* \param input Input data
* \param all_size The size of input data
* \param block_start The block start for different machines
@@ -141,7 +143,8 @@ public:
int* block_len, char* output);
/*!
* \brief Perform reduce scatter by using the recursive halving algorithm.
* The number of communication rounds is O(log(n)), and the communication cost is O(input_size)
* \param input Input data
* \param input_size The size of input data
* \param block_start The block start for different machines
......
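The O(log(n)) claim for the Bruck all_gather comes from its doubling round structure: in round k each node sends every block it currently knows to the node 2^k away, so the number of known blocks doubles per round. A sketch of that schedule in general (textbook Bruck, not LightGBM's implementation):

#include <cstdio>

// Hedged sketch: print the Bruck all_gather schedule for one node. The block
// count doubles each round, so ceil(log2(num_machines)) rounds suffice.
void PrintBruckSchedule(int num_machines, int rank) {
  int known_blocks = 1;  // start with only this node's own block
  for (int k = 0; known_blocks < num_machines; ++k) {
    int dist = 1 << k;  // 2^k
    int send_to = (rank - dist + num_machines) % num_machines;
    int recv_from = (rank + dist) % num_machines;
    printf("round %d: node %d sends %d block(s) to node %d, receives from node %d\n",
           k, rank, known_blocks, send_to, recv_from);
    known_blocks *= 2;  // the final round may carry a partial batch
  }
}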
@@ -9,7 +9,6 @@ namespace LightGBM {
/*!
* \brief The interface of the objective function.
*/
class ObjectiveFunction {
public:
@@ -24,8 +23,8 @@ public:
virtual void Init(const Metadata& metadata, data_size_t num_data) = 0;
/*!
* \brief Calculate the first order derivative of the loss function
* \param score Prediction scores of this round
* \param gradients Output gradients
* \param hessians Output hessians
*/
......
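To make the gradient interface concrete for one objective: for squared error loss L = 0.5 * (score - label)^2, the first derivative is (score - label) and the second is the constant 1. A hedged sketch (generic float/double types stand in for score_t):

#include <cstddef>

// Hedged example of what an objective's gradient computation does, using
// squared error: gradient_i = score_i - label_i, hessian_i = 1.
void SquaredLossGradients(const float* labels, const double* scores,
                          size_t num_data, float* gradients, float* hessians) {
  for (size_t i = 0; i < num_data; ++i) {
    gradients[i] = static_cast<float>(scores[i] - labels[i]);
    hessians[i] = 1.0f;
  }
}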
@@ -31,9 +31,9 @@ public:
~Tree();
/*!
* \brief Perform a split on a tree leaf
* \param leaf Index of the leaf to be split
* \param feature Index of the feature; the converted index after removing useless features
* \param threshold Threshold (bin) of the split
* \param real_feature Index of the feature, the original index in the data
* \param threshold_double Threshold on the feature value
@@ -50,7 +50,7 @@ public:
inline score_t LeafOutput(int leaf) const { return leaf_value_[leaf]; }
/*!
* \brief Add the prediction of this tree model to the scores
* \param data The dataset
* \param num_data Number of total data
* \param score The prediction will be added to this score array
@@ -59,7 +59,7 @@ public:
score_t* score) const;
/*!
* \brief Add the prediction of this tree model to the scores
* \param data The dataset
* \param used_data_indices Indices of used data
* \param num_data Number of total data
@@ -70,17 +70,22 @@ public:
data_size_t num_data, score_t* score) const;
/*!
* \brief Prediction for one record
* \param feature_values Feature values of this record
* \return Prediction result
*/
inline score_t Predict(const double* feature_values) const;
inline int PredictLeafIndex(const double* feature_values) const;
/*! \brief Get number of leaves */
inline int num_leaves() const { return num_leaves_; }
/*! \brief Get the depth of a specific leaf */
inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }
/*!
* \brief Shrinkage for the tree's output;
* the shrinkage rate (a.k.a. learning rate) is used to tune the training process
* \param rate The factor of shrinkage
*/
inline void Shrinkage(double rate) {
@@ -98,7 +103,7 @@ public:
Tree(const Tree&) = delete;
private:
/*!
* \brief Find the index of the leaf that a record belongs to, by iterating the dataset's bins
* \param data The dataset
* \param data_idx Index of the record
* \return Leaf index
@@ -107,7 +112,7 @@ private:
data_size_t data_idx) const;
/*!
* \brief Find the index of the leaf that a record belongs to, by its feature values
* \param feature_values Feature values of this record
* \return Leaf index
*/
@@ -137,14 +142,21 @@ private:
int* leaf_parent_;
/*! \brief Output of leaves */
score_t* leaf_value_;
/*! \brief Depth of each leaf */
int* leaf_depth_;
};
inline score_t Tree::Predict(const double* feature_values) const {
int leaf = GetLeaf(feature_values);
return LeafOutput(leaf);
}
inline int Tree::PredictLeafIndex(const double* feature_values) const {
int leaf = GetLeaf(feature_values);
return leaf;
}
inline int Tree::GetLeaf(const std::vector<BinIterator*>& iterators,
data_size_t data_idx) const {
int node = 0;
......
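The GetLeaf traversal body is collapsed above; here is a sketch of the usual traversal for such array-based trees, under the assumed convention that negative child indices mark leaves via bitwise complement (an assumption, not confirmed by this diff):

// Hedged sketch: walk from the root, taking the left child when the feature
// value is at or below the node threshold, until a leaf is reached.
int GetLeafSketch(const double* feature_values,
                  const int* split_feature, const double* threshold,
                  const int* left_child, const int* right_child) {
  int node = 0;  // root
  while (node >= 0) {  // non-negative = internal node
    if (feature_values[split_feature[node]] <= threshold[node]) {
      node = left_child[node];
    } else {
      node = right_child[node];
    }
  }
  return ~node;  // decode the leaf index
}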
@@ -22,14 +22,13 @@ public:
virtual ~TreeLearner() {}
/*!
* \brief Initialize the tree learner with the training dataset and configs
* \param train_data The used training data
*/
virtual void Init(const Dataset* train_data) = 0;
/*!
* \brief Train a tree model on the dataset
* \param gradients The first order gradients
* \param hessians The second order gradients
* \return A trained tree
@@ -45,7 +44,7 @@ public:
data_size_t num_data) = 0;
/*!
* \brief Use the last trained tree to predict the training scores and add them to out_score
* \param out_score Output score
*/
virtual void AddPredictionToScore(score_t *out_score) const = 0;
......
@@ -8,6 +8,7 @@
#include <vector>
#include <sstream>
#include <cstdint>
#include <algorithm>
namespace LightGBM {
@@ -80,7 +81,7 @@ inline static const char* Atoi(const char* p, int* out) {
inline static const char* Atof(const char* p, double* out) {
int frac;
double sign, value, scale;
*out = 0;
// Skip leading white space, if any.
while (*p == ' ') {
++p;
@@ -140,34 +141,25 @@ inline static const char* Atof(const char* p, double* out) {
// Return signed and scaled floating point result.
*out = sign * (frac ? (value / scale) : (value * scale));
} else {
size_t cnt = 0;
while (*(p + cnt) != '\0' && *(p + cnt) != ' '
&& *(p + cnt) != '\t' && *(p + cnt) != ','
&& *(p + cnt) != '\n' && *(p + cnt) != '\r'
&& *(p + cnt) != ':') {
++cnt;
}
if (cnt > 0) {
std::string tmp_str(p, cnt);
std::transform(tmp_str.begin(), tmp_str.end(), tmp_str.begin(), ::tolower);
if (tmp_str == std::string("na") || tmp_str == std::string("nan")) {
// default: convert nan to 0
*out = 0;
} else if (tmp_str == std::string("inf") || tmp_str == std::string("infinity")) {
*out = sign * 1e308;
} else {
Log::Fatal("Unknown token %s in data file", tmp_str.c_str());
}
p += cnt;
}
}
@@ -209,7 +201,7 @@ inline static std::string ArrayToString(const T* arr, int n, char delimiter) {
inline static void StringToIntArray(const std::string& str, char delimiter, size_t n, int* out) {
std::vector<std::string> strs = Split(str.c_str(), delimiter);
if (strs.size() != n) {
Log::Fatal("StringToIntArray error: sizes don't match.");
}
for (size_t i = 0; i < strs.size(); ++i) {
strs[i] = Trim(strs[i]);
@@ -220,7+212,7 @@ inline static void StringToIntArray(const std::string& str, char delimiter, size
inline static void StringToDoubleArray(const std::string& str, char delimiter, size_t n, double* out) {
std::vector<std::string> strs = Split(str.c_str(), delimiter);
if (strs.size() != n) {
Log::Fatal("StringToDoubleArray error: sizes don't match.");
}
for (size_t i = 0; i < strs.size(); ++i) {
strs[i] = Trim(strs[i]);
@@ -231,7 +223,7 @@ inline static void StringToDoubleArray(const std::string& str, char delimiter, s
inline static void StringToDoubleArray(const std::string& str, char delimiter, size_t n, float* out) {
std::vector<std::string> strs = Split(str.c_str(), delimiter);
if (strs.size() != n) {
Log::Fatal("StringToDoubleArray error: sizes don't match.");
}
double tmp;
for (size_t i = 0; i < strs.size(); ++i) {
......
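To make the new token handling concrete, a short usage sketch of Atof as declared in this header (the outcomes follow directly from the branch above):

// Hedged usage sketch (assumes it is compiled alongside the header above).
void AtofExamples() {
  double v;
  Atof("3.14", &v);  // v == 3.14
  Atof("NaN", &v);   // v == 0: na/nan tokens default to 0, case-insensitively
  Atof("-inf", &v);  // v == -1e308: inf/infinity become a signed 1e308
  // any other non-numeric token triggers Log::Fatal("Unknown token ...")
}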
@@ -8,42 +8,89 @@
namespace LightGBM {
#ifndef CHECK
#define CHECK(condition) \
if (!(condition)) Log::Fatal("Check failed: " #condition \
" at %s, line %d .\n", __FILE__, __LINE__);
#endif
#ifndef CHECK_NOTNULL
#define CHECK_NOTNULL(pointer) \
if ((pointer) == nullptr) LightGBM::Log::Fatal(#pointer " can't be NULL");
#endif
enum class LogLevel: int {
Fatal = -1,
Error = 0,
Info = 1,
Debug = 2,
};
/*!
* \brief A static Log class
*/
class Log {
public:
/*!
* \brief Resets the minimal log level. It is INFO by default.
* \param level The new minimal log level.
*/
static void ResetLogLevel(LogLevel level) {
GetLevel() = level;
}
static void Debug(const char *format, ...) {
va_list val;
va_start(val, format);
Write(LogLevel::Debug, "Debug", format, val);
va_end(val);
}
static void Info(const char *format, ...) {
va_list val;
va_start(val, format);
Write(LogLevel::Info, "Info", format, val);
va_end(val);
}
static void Error(const char *format, ...) {
va_list val;
va_start(val, format);
Write(LogLevel::Error, "Error", format, val);
va_end(val);
}
static void Fatal(const char *format, ...) {
va_list val;
va_start(val, format);
fprintf(stderr, "[LightGBM] [Fatal] ");
vfprintf(stderr, format, val);
fprintf(stderr, "\n");
fflush(stderr);
va_end(val);
exit(1);
}
private:
static void Write(LogLevel level, const char* level_str, const char *format, va_list val) {
if (level <= GetLevel()) {  // omit messages below the minimal level
// write to STDOUT
printf("[LightGBM] [%s] ", level_str);
vprintf(format, val);
printf("\n");
fflush(stdout);
}
}
// a trick to keep a static variable in a header file;
// maybe not ideal, but it avoids an additional .cpp file
static LogLevel& GetLevel() {
static LogLevel level;
return level;
}
};
} // namespace LightGBM
#endif // LightGBM_UTILS_LOG_H_
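A short usage sketch of the new logging interface, using only the calls declared above:

// Hedged usage sketch for the Log class above.
void LogUsageExample() {
  LightGBM::Log::ResetLogLevel(LightGBM::LogLevel::Info);
  LightGBM::Log::Debug("omitted: below the minimal level");
  LightGBM::Log::Info("loaded %d rows", 1000);
  LightGBM::Log::Error("printed, but execution continues");
  // LightGBM::Log::Fatal("...") would print to stderr and exit(1)
}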
#ifndef LIGHTGBM_UTILS_LRU_POOL_H_
#define LIGHTGBM_UTILS_LRU_POOL_H_
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <cstring>
namespace LightGBM {
/*!
* \brief An LRU-cached object pool, used to store historical histograms
*/
template<typename T>
class LRUPool {
public:
/*!
* \brief Constructor
*/
LRUPool() {
}
/*!
* \brief Destructor
*/
~LRUPool() {
FreeAll();
}
/*!
* \brief Reset pool size
* \param cache_size Max cache size
* \param total_size Total size that will be used
*/
void ResetSize(int cache_size, int total_size) {
// free old memory
FreeAll();
cache_size_ = cache_size;
// need at least 2 buckets, to store the smaller and the larger leaf
CHECK(cache_size_ >= 2);
total_size_ = total_size;
pool_ = new T[cache_size];
mapper_ = new int[total_size_];
inverse_mapper_ = new int[cache_size_];
last_used_time_ = new int[cache_size_];
ResetMap();
}
/*!
* \brief Return true if this pool is enough to store all data
*/
bool IsEnough() {
return cache_size_ == total_size_;
}
/*!
* \brief Reset mapper
*/
void ResetMap() {
cur_time_ = 0;
memset(mapper_, -1, sizeof(int)*total_size_);
memset(inverse_mapper_, -1, sizeof(int)*cache_size_);
memset(last_used_time_, 0, sizeof(int)*cache_size_);
}
/*!
* \brief Set the data in the pool at a specific index
* \param idx The index to set
* \param data
*/
void Set(int idx, const T& data) {
pool_[idx] = data;
}
/*!
* \brief Get the data at a specific index
* \param idx The index to get
* \param out The output data will be stored here
* \return True if this index is in the pool, false otherwise
*/
bool Get(int idx, T* out) {
if (mapper_[idx] >= 0) {
int slot = mapper_[idx];
*out = pool_[slot];
last_used_time_[slot] = ++cur_time_;
return true;
} else {
// evict the least recently used slot
int slot = static_cast<int>(ArrayArgs<int>::ArgMin(last_used_time_, cache_size_));
*out = pool_[slot];
last_used_time_[slot] = ++cur_time_;
// reset previous mapper
if (inverse_mapper_[slot] >= 0) mapper_[inverse_mapper_[slot]] = -1;
// update current mapper
mapper_[idx] = slot;
inverse_mapper_[slot] = idx;
return false;
}
}
/*!
* \brief Move data from one index to another index
* \param src_idx
* \param dst_idx
*/
void Move(int src_idx, int dst_idx) {
if (mapper_[src_idx] < 0) {
return;
}
// get slot of src idx
int slot = mapper_[src_idx];
// reset src_idx
mapper_[src_idx] = -1;
// move to dst idx
mapper_[dst_idx] = slot;
last_used_time_[slot] = ++cur_time_;
inverse_mapper_[slot] = dst_idx;
}
private:
void FreeAll() {
if (pool_ != nullptr) {
delete[] pool_;
}
if (mapper_ != nullptr) {
delete[] mapper_;
}
if (inverse_mapper_ != nullptr) {
delete[] inverse_mapper_;
}
if (last_used_time_ != nullptr) {
delete[] last_used_time_;
}
}
T* pool_ = nullptr;
int cache_size_;
int total_size_;
int* mapper_ = nullptr;
int* inverse_mapper_ = nullptr;
int* last_used_time_ = nullptr;
int cur_time_ = 0;
};
}  // namespace LightGBM
#endif // LIGHTGBM_UTILS_LRU_POOL_H_
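A usage sketch built only on the interface above. Note that Set() writes a pool slot directly (it pre-fills the cache_size slots), while Get() maps a logical index onto a slot, evicting the least recently used binding on a miss:

// Hedged usage sketch for LRUPool (include path assumed from the guard above).
void LRUPoolExample() {
  LightGBM::LRUPool<int> pool;  // int stands in for a histogram handle
  pool.ResetSize(/*cache_size=*/2, /*total_size=*/8);
  pool.Set(0, 100);  // pre-fill slot 0
  pool.Set(1, 200);  // pre-fill slot 1
  int handle = 0;
  pool.Get(3, &handle);  // miss: returns false, binds index 3 to an LRU slot
  pool.Get(3, &handle);  // hit: returns true and refreshes the timestamp
  pool.Get(7, &handle);  // miss: evicts the least recently used binding
}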
@@ -87,7 +87,7 @@ public:
});
// if the last line of the file doesn't contain an end-of-line character
if (last_line_.size() > 0) {
Log::Info("Warning: last line of file %s doesn't contain an end-of-line character; the application will still use this line", filename_);
process_fun(total_cnt, last_line_.c_str(), last_line_.size());
++total_cnt;
last_line_ = "";
@@ -224,7 +224,7 @@ public:
});
// if the last line of the file doesn't contain an end-of-line character
if (last_line_.size() > 0) {
Log::Info("Warning: last line of file %s doesn't contain an end-of-line character; the application will still use this line", filename_);
if (filter_fun(used_cnt, total_cnt)) {
lines_.push_back(last_line_);
process_fun(used_cnt, lines_);
......
@@ -69,7 +69,7 @@ void Application::LoadParameters(int argc, char** argv) {
params[key] = value;
}
else {
Log::Error("Unknown parameter in command line: %s", argv[i]);
}
}
// check for alias
@@ -101,11 +101,11 @@ void Application::LoadParameters(int argc, char** argv) {
}
}
else {
Log::Error("Unknown parameter in config file: %s", line.c_str());
}
}
} else {
Log::Error("Config file %s doesn't exist; it will be ignored",
params["config_file"].c_str());
}
}
@@ -113,7 +113,7 @@ void Application::LoadParameters(int argc, char** argv) {
ParameterAlias::KeyAliasTransform(&params);
// load configs
config_.Set(params);
Log::Info("Finished loading parameters");
}
void Application::LoadData() {
@@ -125,7 +125,7 @@ void Application::LoadData() {
if (config_.io_config.input_model.size() > 0) {
LoadModel();
if (boosting_->NumberOfSubModels() > 0) {
predictor = new Predictor(boosting_, config_.io_config.is_sigmoid, config_.predict_leaf_index);
predict_fun =
[&predictor](const std::vector<std::pair<int, double>>& features) {
return predictor->PredictRawOneLine(features);
@@ -201,7 +201,7 @@ void Application::LoadData() {
}
auto end_time = std::chrono::high_resolution_clock::now();
// output the time used for loading data
Log::Info("Finished loading data, used %f seconds",
std::chrono::duration<double, std::milli>(end_time - start_time).count() * 1e-3);
}
@@ -209,7 +209,7 @@ void Application::InitTrain() {
if (config_.is_parallel) {
// need to init network
Network::Init(config_.network_config);
Log::Info("Finished network initialization");
// sync global random seed for feature partition
if (config_.boosting_type == BoostingType::kGBDT) {
GBDTConfig* gbdt_config =
@@ -240,28 +240,28 @@ void Application::InitTrain() {
boosting_->AddDataset(valid_datas_[i],
ConstPtrInVectorWarpper<Metric>(valid_metrics_[i]));
}
Log::Info("Finished training initialization");
}
void Application::Train() {
Log::Info("Started training");
boosting_->Train();
Log::Info("Finished training");
}
void Application::Predict() {
// create predictor
Predictor predictor(boosting_, config_.io_config.is_sigmoid, config_.predict_leaf_index);
predictor.Predict(config_.io_config.data_filename.c_str(), config_.io_config.output_result.c_str());
Log::Info("Finished prediction");
}
void Application::InitPredict() {
boosting_ =
Boosting::CreateBoosting(config_.boosting_type, config_.boosting_config);
LoadModel();
Log::Info("Finished prediction initialization");
}
void Application::LoadModel() {
......
@@ -26,9 +26,10 @@ public:
* \brief Constructor
* \param boosting Input boosting model
* \param is_sigmoid True if the result needs a sigmoid transform (e.g., for binary classification)
* \param predict_leaf_index True if leaf indices should be output instead of prediction scores
*/
Predictor(const Boosting* boosting, bool is_sigmoid, bool predict_leaf_index)
: is_sigmoid_(is_sigmoid), predict_leaf_index(predict_leaf_index) {
boosting_ = boosting;
num_features_ = boosting_->MaxFeatureIdx() + 1;
#pragma omp parallel
@@ -54,44 +55,39 @@ public:
}
/*!
* \brief Prediction for one record; raw result only (without sigmoid transformation)
* \param features Features of this record
* \return Prediction result
*/
double PredictRawOneLine(const std::vector<std::pair<int, double>>& features) {
const int tid = PutFeatureValuesToBuffer(features);
// get result without sigmoid transformation
return boosting_->PredictRaw(features_[tid]);
}
/*!
* \brief Prediction of leaf indices for one record
* \param features Features of this record
* \return Predicted leaf indices
*/
std::vector<int> PredictLeafIndexOneLine(const std::vector<std::pair<int, double>>& features) {
const int tid = PutFeatureValuesToBuffer(features);
// get the leaf index of each tree
return boosting_->PredictLeafIndex(features_[tid]);
}
/*!
* \brief Prediction for one record; a sigmoid transformation is applied if needed (currently only for binary classification)
* \param features Features of this record
* \return Prediction result
*/
double PredictOneLine(const std::vector<std::pair<int, double>>& features) {
const int tid = PutFeatureValuesToBuffer(features);
// get result, with sigmoid transformation if needed
return boosting_->Predict(features_[tid]);
}
/*!
* \brief Predict on a data file, then save the results to disk
* \param data_filename Filename of data
* \param has_label True if this data contains labels
* \param result_filename Filename of output result
@@ -106,13 +102,13 @@ public:
#endif
if (result_file == NULL) {
Log::Fatal("Prediction result file %s cannot be opened", result_filename);
}
bool has_label = false;
Parser* parser = Parser::CreateParser(data_filename, num_features_, &has_label);
if (parser == nullptr) {
Log::Fatal("Failed to recognize the input data format, filename %s", data_filename);
}
// function for parsing data
@@ -124,30 +120,46 @@ public:
(const char* buffer, std::vector<std::pair<int, double>>* feature) {
parser->ParseOneLine(buffer, feature, &tmp_label);
};
Log::Info("Start prediction for data %s with labels", data_filename);
} else {
// parse function without label
parser_fun = [this, &parser]
(const char* buffer, std::vector<std::pair<int, double>>* feature) {
parser->ParseOneLine(buffer, feature);
};
Log::Info("Start prediction for data %s without labels", data_filename);
}
std::function<std::string(const std::vector<std::pair<int, double>>&)> predict_fun;
if (predict_leaf_index) {
predict_fun = [this](const std::vector<std::pair<int, double>>& features) {
std::vector<int> predicted_leaf_index = PredictLeafIndexOneLine(features);
std::stringstream result_ss;
for (size_t i = 0; i < predicted_leaf_index.size(); ++i) {
if (i > 0) {
result_ss << '\t';
}
result_ss << predicted_leaf_index[i];
}
return result_ss.str();
};
} else {
if (is_sigmoid_) {
predict_fun = [this](const std::vector<std::pair<int, double>>& features) {
return std::to_string(PredictOneLine(features));
};
} else {
predict_fun = [this](const std::vector<std::pair<int, double>>& features) {
return std::to_string(PredictRawOneLine(features));
};
}
}
std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
[this, &parser_fun, &predict_fun, &result_file]
(data_size_t, const std::vector<std::string>& lines) {
std::vector<std::pair<int, double>> oneline_features;
std::vector<std::string> pred_result(lines.size(), "");
#pragma omp parallel for schedule(static) private(oneline_features)
for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); i++) {
oneline_features.clear();
@@ -158,10 +170,9 @@ public:
}
for (size_t i = 0; i < pred_result.size(); ++i) {
fprintf(result_file, "%s\n", pred_result[i].c_str());
}
};
TextReader<data_size_t> predict_data_reader(data_filename);
predict_data_reader.ReadAllAndProcessParallel(process_fun);
@@ -170,6 +181,18 @@ public:
}
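// Annotation (not part of this commit): with the changes above, each line of
// the result file is now a string -- a single score in normal mode, or one
// tab-separated leaf id per tree in leaf-index mode. Below is a minimal,
// standalone sketch of parsing one leaf-index line back; the function name is
// an illustrative assumption, not LightGBM API:
#include <sstream>
#include <string>
#include <vector>
std::vector<int> ParseLeafIndexLine(const std::string& line) {
  // field k holds the index of the leaf that the record reached in tree k
  std::vector<int> leaf_ids;
  std::stringstream ss(line);
  std::string field;
  while (std::getline(ss, field, '\t')) {
    leaf_ids.push_back(std::stoi(field));
  }
  return leaf_ids;
}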
private:
int PutFeatureValuesToBuffer(const std::vector<std::pair<int, double>>& features) {
int tid = omp_get_thread_num();
// init feature value
std::memset(features_[tid], 0, sizeof(double)*num_features_);
// put feature value
for (const auto& p : features) {
if (p.first < num_features_) {
features_[tid][p.first] = p.second;
}
}
return tid;
}
/*! \brief Boosting model */
const Boosting* boosting_;
/*! \brief Buffer for feature values */
@@ -180,6 +203,8 @@ private:
bool is_sigmoid_;
/*! \brief Number of threads */
int num_threads_;
/*! \brief True if outputting leaf indices instead of prediction scores */
bool predict_leaf_index;
};
} // namespace LightGBM
...
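Side note on the predictor above: it keeps one dense feature array per OpenMP thread, so parallel per-line prediction neither allocates nor races -- PutFeatureValuesToBuffer zeroes the calling thread's slot and scatters the sparse (index, value) pairs into it. A self-contained sketch of the same pattern, with illustrative names that are not code from this commit:

#include <cstring>
#include <omp.h>
#include <utility>
#include <vector>

// one dense buffer per thread, allocated once before any parallel region
static std::vector<std::vector<double>> thread_buffers;

void InitBuffers(int num_threads, int num_features) {
  thread_buffers.assign(num_threads, std::vector<double>(num_features, 0.0));
}

// scatter one sparse row into the calling thread's buffer and return it;
// safe under OpenMP because each thread only writes its own slot
const double* Densify(const std::vector<std::pair<int, double>>& sparse_row,
                      int num_features) {
  std::vector<double>& buf = thread_buffers[omp_get_thread_num()];
  std::memset(buf.data(), 0, sizeof(double) * num_features);
  for (const auto& p : sparse_row) {
    if (p.first < num_features) {  // ignore out-of-range feature indices
      buf[p.first] = p.second;
    }
  }
  return buf.data();
}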
@@ -13,7 +13,6 @@
#include <string>
#include <vector>
namespace LightGBM {
GBDT::GBDT(const BoostingConfig* config)
@@ -22,6 +21,7 @@ GBDT::GBDT(const BoostingConfig* config)
out_of_bag_data_indices_(nullptr), bag_data_indices_(nullptr) {
max_feature_idx_ = 0;
gbdt_config_ = dynamic_cast<const GBDTConfig*>(config);
early_stopping_round_ = gbdt_config_->early_stopping_round;
}
GBDT::~GBDT() {
@@ -92,8 +92,12 @@ void GBDT::AddDataset(const Dataset* valid_data,
// for a validation dataset, we need its score and metric
valid_score_updater_.push_back(new ScoreUpdater(valid_data));
valid_metrics_.emplace_back();
best_iter_.emplace_back();
best_score_.emplace_back();
for (const auto& metric : valid_metrics) {
valid_metrics_.back().push_back(metric);
best_iter_.back().push_back(0);
best_score_.back().push_back(-1);
}
}
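// Annotation (not commit code): best_iter_ and best_score_ are parallel 2-D
// arrays indexed as [validation dataset][metric]; the 0 / -1 pushed above are
// "nothing recorded yet" sentinels that OutputMetric() overwrites later.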
@@ -145,7 +149,7 @@ void GBDT::Bagging(int iter) {
bag_data_cnt_ = cur_left_cnt;
out_of_bag_data_cnt_ = num_data_ - bag_data_cnt_;
}
Log::Info("Re-bagging, using %d data points to train", bag_data_cnt_);
// set bagging data to tree learner
tree_learner_->SetBaggingData(bag_data_indices_, bag_data_cnt_);
}
@@ -171,7 +175,7 @@ void GBDT::Train() {
Tree* new_tree = TrainOneTree();
// if cannot learn a new tree, then stop
if (new_tree->num_leaves() <= 1) {
Log::Info("Can't train anymore: no leaf meets the split requirements");
break;
}
// shrinkage by learning rate
@@ -180,19 +184,44 @@ void GBDT::Train() {
UpdateScore(new_tree);
UpdateScoreOutOfBag(new_tree);
// print message for metric
bool is_early_stopping = OutputMetric(iter + 1);
// add model
models_.push_back(new_tree);
// save model to file per iteration
if (early_stopping_round_ > 0) {
// with early stopping, lag the output: write only the tree built early_stopping_round_ iterations ago
if (iter >= early_stopping_round_) {
fprintf(output_model_file, "Tree=%d\n", iter - early_stopping_round_);
Tree* printing_tree = models_.at(iter - early_stopping_round_);
fprintf(output_model_file, "%s\n", printing_tree->ToString().c_str());
fflush(output_model_file);
}
}
else {
fprintf(output_model_file, "Tree=%d\n", iter);
fprintf(output_model_file, "%s\n", new_tree->ToString().c_str());
fflush(output_model_file);
}
auto end_time = std::chrono::high_resolution_clock::now();
// output used time per iteration
Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::duration<double,
std::milli>(end_time - start_time).count() * 1e-3, iter + 1);
if (is_early_stopping) {
// stop training: report the best round and close the model file
Log::Info("Early stopping at iteration %d, the best iteration round is %d", iter + 1, iter + 1 - early_stopping_round_);
fclose(output_model_file);
return;
}
}
// close file
if (early_stopping_round_ > 0) {
// save remaining models
for (int iter = gbdt_config_->num_iterations - early_stopping_round_; iter < static_cast<int>(models_.size()); ++iter) {
fprintf(output_model_file, "Tree=%d\n", iter);
fprintf(output_model_file, "%s\n", models_.at(iter)->ToString().c_str());
}
fflush(output_model_file);
}
fclose(output_model_file);
}
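// Worked example of the lagged write in Train() above (annotation, not commit
// code): with early_stopping_round_ = 10, the tree built at iteration i is
// written only once iteration i + 10 completes, so tree 0 first reaches disk
// after iteration 10. If early stopping fires at 1-based iteration 25, the
// reported best round is 25 - 10 = 15, and exactly the first 15 trees are
// already in the file -- the model ends at the best iteration with no
// rewriting needed. If training instead finishes all iterations, the final
// loop above flushes the last early_stopping_round_ trees still held back.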
@@ -209,17 +238,31 @@ void GBDT::UpdateScore(const Tree* tree) {
}
}
bool GBDT::OutputMetric(int iter) {
bool ret = false;
// print training metric
for (auto& sub_metric : training_metrics_) {
sub_metric->PrintAndGetLoss(iter, train_score_updater_->score());
}
// print validation metric
for (size_t i = 0; i < valid_metrics_.size(); ++i) {
for (size_t j = 0; j < valid_metrics_[i].size(); ++j) {
score_t test_score = valid_metrics_[i][j]->PrintAndGetLoss(iter, valid_score_updater_[i]->score());
if (!ret && early_stopping_round_ > 0) {
bool bigger_is_better = valid_metrics_[i][j]->the_bigger_the_better;
if (best_score_[i][j] < 0
|| (!bigger_is_better && test_score < best_score_[i][j])
|| (bigger_is_better && test_score > best_score_[i][j])) {
best_score_[i][j] = test_score;
best_iter_[i][j] = iter;
}
else {
if (iter - best_iter_[i][j] >= early_stopping_round_) ret = true;
}
}
}
}
return ret;
}
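// The improvement test used above, isolated as a sketch (illustrative, not
// commit code). A best_score_ of -1 is the "no score yet" sentinel, which
// assumes metric values are non-negative; the direction of "better" comes
// from the metric itself:
static bool ImprovedScore(double score, double best, bool bigger_is_better) {
  return best < 0  // nothing recorded yet
      || (bigger_is_better ? score > best : score < best);
}
// Training stops once any validation metric has gone early_stopping_round_
// consecutive iterations without improving.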
void GBDT::Boosting() {
@@ -264,7 +307,7 @@ void GBDT::ModelsFromString(const std::string& model_str, int num_used_model) {
}
}
if (i == lines.size()) {
Log::Fatal("Model file doesn't contain max_feature_idx");
return;
}
// get sigmoid parameter
@@ -303,7 +346,7 @@ void GBDT::ModelsFromString(const std::string& model_str, int num_used_model) {
}
}
Log::Info("%d models have been loaded\n", static_cast<int>(models_.size()));
}
double GBDT::PredictRaw(const double* value) const {
@@ -321,7 +364,15 @@ double GBDT::Predict(const double* value) const {
}
// if need sigmoid transform
if (sigmoid_ > 0) {
ret = 1.0 / (1.0 + std::exp(-2.0f * sigmoid_ * ret));
}
return ret;
}
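// Annotation (not commit code): the transform above is
// p = 1 / (1 + exp(-2 * sigmoid_ * s)) for a raw score s. This commit doubles
// the slope compared with the previous exp(-sigmoid_ * s) form, so a raw
// score of 0 still maps to 0.5 but predictions saturate faster. Standalone
// sketch (requires <cmath>):
static double SigmoidTransform(double raw_score, double sigmoid) {
  return 1.0 / (1.0 + std::exp(-2.0 * sigmoid * raw_score));
}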
std::vector<int> GBDT::PredictLeafIndex(const double* value) const {
std::vector<int> ret;
for (size_t i = 0; i < models_.size(); ++i) {
ret.push_back(models_[i]->PredictLeafIndex(value));
}
return ret;
}
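// Annotation (not commit code): PredictLeafIndex returns one entry per tree
// (ret.size() == models_.size()); these are the per-tree leaf ids that the
// new leaf-index prediction mode joins with '\t' when writing result lines.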
...