Commit 2a8d38c5 authored by Qiwei Ye's avatar Qiwei Ye
Browse files

Merge branches 'master' and 'master' of https://github.com/Microsoft/LightGBM

parents 351b3d7e ed958eb2
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
task = predict
data = multiclass.test
input_model = LightGBM_model.txt
# task type, support train and predict
task = train
# boosting type, support gbdt for now, alias: boosting, boost
boosting_type = gbdt
# application type, support following application
# regression , regression task
# binary , binary classification task
# lambdarank , lambdarank task
# multiclass
# alias: application, app
objective = multiclass
# eval metrics, support multiple metrics, delimited by ',' , support following metrics
# l1
# l2 , default metric for regression
# ndcg , default metric for lambdarank
# auc
# binary_logloss , default metric for binary
# binary_error
# multi_logloss
# multi_error
metric = multi_logloss
# number of class, for multiclass classification
num_class = 5
# frequency for metric output
metric_freq = 1
# true if need output metric for training data, alias: tranining_metric, train_metric
is_training_metric = true
# number of bins for feature bucket, 255 is a recommended setting; it can save memory and also gives good accuracy.
max_bin = 255
# training data
# if a weight file exists, it should be named "regression.train.weight"
# alias: train_data, train
data = multiclass.train
# valid data
valid_data = multiclass.test
# round for early stopping
early_stopping = 10
# number of trees(iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds
num_trees = 100
# shrinkage rate , alias: shrinkage_rate
learning_rate = 0.05
# number of leaves for one tree, alias: num_leaf
num_leaves = 31
...@@ -54,9 +54,6 @@ private: ...@@ -54,9 +54,6 @@ private:
/*! \brief Initializations before prediction */ /*! \brief Initializations before prediction */
void InitPredict(); void InitPredict();
/*! \brief Load model from local disk */
void LoadModel();
/*! \brief Main predicting logic */ /*! \brief Main predicting logic */
void Predict(); void Predict();
......
...@@ -13,9 +13,9 @@ namespace LightGBM { ...@@ -13,9 +13,9 @@ namespace LightGBM {
struct HistogramBinEntry { struct HistogramBinEntry {
public: public:
/*! \brief Sum of gradients on this bin */ /*! \brief Sum of gradients on this bin */
score_t sum_gradients = 0.0; double sum_gradients = 0.0;
/*! \brief Sum of hessians on this bin */ /*! \brief Sum of hessians on this bin */
score_t sum_hessians = 0.0; double sum_hessians = 0.0;
/*! \brief Number of data on this bin */ /*! \brief Number of data on this bin */
data_size_t cnt = 0; data_size_t cnt = 0;
...@@ -56,7 +56,7 @@ public: ...@@ -56,7 +56,7 @@ public:
/*! \brief True if bin is trival (contains only one bin) */ /*! \brief True if bin is trival (contains only one bin) */
inline bool is_trival() const { return is_trival_; } inline bool is_trival() const { return is_trival_; }
/*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */ /*! \brief Sparsity of this bin ( num_zero_bins / num_data ) */
inline double sparse_rate() const { return sparse_rate_; } inline float sparse_rate() const { return sparse_rate_; }
/*! /*!
* \brief Save binary data to file * \brief Save binary data to file
* \param file File want to write * \param file File want to write
...@@ -67,7 +67,7 @@ public: ...@@ -67,7 +67,7 @@ public:
* \param bin * \param bin
* \return Feature value of this bin * \return Feature value of this bin
*/ */
inline double BinToValue(unsigned int bin) const { inline float BinToValue(unsigned int bin) const {
return bin_upper_bound_[bin]; return bin_upper_bound_[bin];
} }
/*! /*!
...@@ -79,14 +79,14 @@ public: ...@@ -79,14 +79,14 @@ public:
* \param value * \param value
* \return bin for this feature value * \return bin for this feature value
*/ */
inline unsigned int ValueToBin(double value) const; inline unsigned int ValueToBin(float value) const;
/*! /*!
* \brief Construct feature value to bin mapper according feature values * \brief Construct feature value to bin mapper according feature values
* \param values (Sampled) values of this feature * \param values (Sampled) values of this feature
* \param max_bin The maximal number of bin * \param max_bin The maximal number of bin
*/ */
void FindBin(std::vector<double>* values, int max_bin); void FindBin(std::vector<float>* values, int max_bin);
/*! /*!
* \brief Use specific number of bin to calculate the size of this class * \brief Use specific number of bin to calculate the size of this class
...@@ -111,11 +111,11 @@ private: ...@@ -111,11 +111,11 @@ private:
/*! \brief Number of bins */ /*! \brief Number of bins */
int num_bin_; int num_bin_;
/*! \brief Store upper bound for each bin */ /*! \brief Store upper bound for each bin */
double* bin_upper_bound_; float* bin_upper_bound_;
/*! \brief True if this feature is trival */ /*! \brief True if this feature is trival */
bool is_trival_; bool is_trival_;
/*! \brief Sparse rate of this bins( num_bin0/num_data ) */ /*! \brief Sparse rate of this bins( num_bin0/num_data ) */
double sparse_rate_; float sparse_rate_;
}; };
/*! /*!
...@@ -271,7 +271,7 @@ public: ...@@ -271,7 +271,7 @@ public:
* \return The bin data object * \return The bin data object
*/ */
static Bin* CreateBin(data_size_t num_data, int num_bin, static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, bool* is_sparse, int default_bin); float sparse_rate, bool is_enable_sparse, bool* is_sparse, int default_bin);
/*! /*!
* \brief Create object for bin data of one feature, used for dense feature * \brief Create object for bin data of one feature, used for dense feature
...@@ -293,7 +293,7 @@ public: ...@@ -293,7 +293,7 @@ public:
int num_bin, int default_bin); int num_bin, int default_bin);
}; };
inline unsigned int BinMapper::ValueToBin(double value) const { inline unsigned int BinMapper::ValueToBin(float value) const {
// binary search to find bin // binary search to find bin
int l = 0; int l = 0;
int r = num_bin_ - 1; int r = num_bin_ - 1;
......
...@@ -28,12 +28,12 @@ public: ...@@ -28,12 +28,12 @@ public:
* \param train_data Training data * \param train_data Training data
* \param object_function Training objective function * \param object_function Training objective function
* \param training_metrics Training metric * \param training_metrics Training metric
* \param output_model_filename Filename of output model
*/ */
virtual void Init(const Dataset* train_data, virtual void Init(
const BoostingConfig* config,
const Dataset* train_data,
const ObjectiveFunction* object_function, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics, const std::vector<const Metric*>& training_metrics) = 0;
const char* output_model_filename) = 0;
/*! /*!
* \brief Add a validation data * \brief Add a validation data
...@@ -44,40 +44,59 @@ public: ...@@ -44,40 +44,59 @@ public:
const std::vector<const Metric*>& valid_metrics) = 0; const std::vector<const Metric*>& valid_metrics) = 0;
/*! \brief Training logic */ /*! \brief Training logic */
virtual void Train() = 0; virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) = 0;
/*! \brief Get eval result */
virtual std::vector<std::string> EvalCurrent(bool is_eval_train) const = 0 ;
/*! \brief Get prediction result */
virtual const std::vector<const score_t*> PredictCurrent(bool is_predict_train) const = 0;
/*! /*!
* \brief Prediction for one record, not sigmoid transform * \brief Prediction for one record, not sigmoid transform
* \param feature_values Feature value on this record * \param feature_values Feature value on this record
* \param num_used_model Number of used model
* \return Prediction result for this record * \return Prediction result for this record
*/ */
virtual double PredictRaw(const double * feature_values) const = 0; virtual float PredictRaw(const float* feature_values,
int num_used_model) const = 0;
/*! /*!
* \brief Prediction for one record, sigmoid transformation will be used if needed * \brief Prediction for one record, sigmoid transformation will be used if needed
* \param feature_values Feature value on this record * \param feature_values Feature value on this record
* \param num_used_model Number of used model
* \return Prediction result for this record * \return Prediction result for this record
*/ */
virtual double Predict(const double * feature_values) const = 0; virtual float Predict(const float* feature_values,
int num_used_model) const = 0;
/*! /*!
* \brief Predtion for one record with leaf index * \brief Predtion for one record with leaf index
* \param feature_values Feature value on this record * \param feature_values Feature value on this record
* \param num_used_model Number of used model
* \return Predicted leaf index for this record * \return Predicted leaf index for this record
*/ */
virtual std::vector<int> PredictLeafIndex(const double * feature_values) const = 0; virtual std::vector<int> PredictLeafIndex(
const float* feature_values,
int num_used_model) const = 0;
/*!
* \brief Predtion for multiclass classification
* \param feature_values Feature value on this record
* \return Prediction result, num_class numbers per line
*/
virtual std::vector<float> PredictMulticlass(const float* value, int num_used_model) const = 0;
/*! /*!
* \brief Serialize models by string * \brief save model to file
* \return String output of tranined model
*/ */
virtual std::string ModelsToString() const = 0; virtual void SaveModelToFile(bool is_finish, const char* filename) = 0;
/*! /*!
* \brief Restore from a serialized string * \brief Restore from a serialized string
* \param model_str The string of model * \param model_str The string of model
*/ */
virtual void ModelsFromString(const std::string& model_str, int num_used_model) = 0; virtual void ModelsFromString(const std::string& model_str) = 0;
/*! /*!
* \brief Get max feature index of this model * \brief Get max feature index of this model
...@@ -85,19 +104,45 @@ public: ...@@ -85,19 +104,45 @@ public:
*/ */
virtual int MaxFeatureIdx() const = 0; virtual int MaxFeatureIdx() const = 0;
/*!
* \brief Get index of label column
* \return index of label column
*/
virtual int LabelIdx() const = 0;
/*! /*!
* \brief Get number of weak sub-models * \brief Get number of weak sub-models
* \return Number of weak sub-models * \return Number of weak sub-models
*/ */
virtual int NumberOfSubModels() const = 0; virtual int NumberOfSubModels() const = 0;
/*!
* \brief Get number of classes
* \return Number of classes
*/
virtual int NumberOfClass() const = 0;
/*!
* \brief Get Type name of this boosting object
*/
virtual const char* Name() const = 0;
/*! /*!
* \brief Create boosting object * \brief Create boosting object
* \param type Type of boosting * \param type Type of boosting
* \param config config for boosting
* \param filename name of model file, if existing will continue to train from this model
* \return The boosting object * \return The boosting object
*/ */
static Boosting* CreateBoosting(BoostingType type, static Boosting* CreateBoosting(BoostingType type, const char* filename);
const BoostingConfig* config);
/*!
* \brief Create boosting object from model file
* \param filename name of model file
* \return The boosting object
*/
static Boosting* CreateBoosting(const char* filename);
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -49,15 +49,15 @@ public: ...@@ -49,15 +49,15 @@ public:
const std::string& name, int* out); const std::string& name, int* out);
/*! /*!
* \brief Get double value by specific name of key * \brief Get float value by specific name of key
* \param params Store the key and value for params * \param params Store the key and value for params
* \param name Name of key * \param name Name of key
* \param out Value will assign to out if key exists * \param out Value will assign to out if key exists
* \return True if key exists * \return True if key exists
*/ */
inline bool GetDouble( inline bool GetFloat(
const std::unordered_map<std::string, std::string>& params, const std::unordered_map<std::string, std::string>& params,
const std::string& name, double* out); const std::string& name, float* out);
/*! /*!
* \brief Get bool value by specific name of key * \brief Get bool value by specific name of key
...@@ -73,7 +73,7 @@ public: ...@@ -73,7 +73,7 @@ public:
/*! \brief Types of boosting */ /*! \brief Types of boosting */
enum BoostingType { enum BoostingType {
kGBDT kGBDT, kUnknow
}; };
...@@ -94,13 +94,26 @@ public: ...@@ -94,13 +94,26 @@ public:
std::string input_model = ""; std::string input_model = "";
std::string input_init_score = ""; std::string input_init_score = "";
int verbosity = 1; int verbosity = 1;
std::string log_file = "";
int num_model_predict = -1; int num_model_predict = -1;
bool is_pre_partition = false; bool is_pre_partition = false;
bool is_enable_sparse = true; bool is_enable_sparse = true;
bool use_two_round_loading = false; bool use_two_round_loading = false;
bool is_save_binary_file = false; bool is_save_binary_file = false;
bool is_sigmoid = true; bool is_sigmoid = true;
bool has_header = false;
/*! \brief Index or column name of label, default is the first column
* And add an prefix "name:" while using column name */
std::string label_column = "";
/*! \brief Index or column name of weight, < 0 means not used
* And add an prefix "name:" while using column name */
std::string weight_column = "";
/*! \brief Index or column name of group, < 0 means not used */
std::string group_column = "";
/*! \brief ignored features, separate by ','
* e.g. name:column_name1,column_name2 */
std::string ignore_column = "";
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -108,13 +121,15 @@ public: ...@@ -108,13 +121,15 @@ public:
struct ObjectiveConfig: public ConfigBase { struct ObjectiveConfig: public ConfigBase {
public: public:
virtual ~ObjectiveConfig() {} virtual ~ObjectiveConfig() {}
double sigmoid = 1; float sigmoid = 1.0f;
// for lambdarank // for lambdarank
std::vector<double> label_gain; std::vector<float> label_gain;
// for lambdarank // for lambdarank
int max_position = 20; int max_position = 20;
// for binary // for binary
bool is_unbalance = false; bool is_unbalance = false;
// for multiclass
int num_class = 1;
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -122,11 +137,9 @@ public: ...@@ -122,11 +137,9 @@ public:
struct MetricConfig: public ConfigBase { struct MetricConfig: public ConfigBase {
public: public:
virtual ~MetricConfig() {} virtual ~MetricConfig() {}
int early_stopping_round = 0; int num_class = 1;
int output_freq = 1; float sigmoid = 1.0f;
double sigmoid = 1; std::vector<float> label_gain;
bool is_provide_training_metric = false;
std::vector<double> label_gain;
std::vector<int> eval_at; std::vector<int> eval_at;
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -136,12 +149,18 @@ public: ...@@ -136,12 +149,18 @@ public:
struct TreeConfig: public ConfigBase { struct TreeConfig: public ConfigBase {
public: public:
int min_data_in_leaf = 100; int min_data_in_leaf = 100;
double min_sum_hessian_in_leaf = 10.0f; float min_sum_hessian_in_leaf = 10.0f;
// should > 1, only one leaf means not need to learning
int num_leaves = 127; int num_leaves = 127;
int feature_fraction_seed = 2; int feature_fraction_seed = 2;
double feature_fraction = 1.0; float feature_fraction = 1.0f;
// max cache size(unit:MB) for historical histogram. < 0 means not limit // max cache size(unit:MB) for historical histogram. < 0 means not limit
double histogram_pool_size = -1; float histogram_pool_size = -1.0f;
// max depth of tree model.
// Still grow tree by leaf-wise, but limit the max depth to avoid over-fitting
// And the max leaves will be min(num_leaves, pow(2, max_depth - 1))
// max_depth < 0 means not limit
int max_depth = -1;
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -155,12 +174,15 @@ enum TreeLearnerType { ...@@ -155,12 +174,15 @@ enum TreeLearnerType {
struct BoostingConfig: public ConfigBase { struct BoostingConfig: public ConfigBase {
public: public:
virtual ~BoostingConfig() {} virtual ~BoostingConfig() {}
int output_freq = 1;
bool is_provide_training_metric = false;
int num_iterations = 10; int num_iterations = 10;
double learning_rate = 0.1; float learning_rate = 0.1f;
double bagging_fraction = 1.0; float bagging_fraction = 1.0f;
int bagging_seed = 3; int bagging_seed = 3;
int bagging_freq = 0; int bagging_freq = 0;
int early_stopping_round = 0; int early_stopping_round = 0;
int num_class = 1;
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -207,7 +229,7 @@ public: ...@@ -207,7 +229,7 @@ public:
delete boosting_config; delete boosting_config;
} }
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
void LoadFromString(const char* str);
private: private:
void GetBoostingType(const std::unordered_map<std::string, std::string>& params); void GetBoostingType(const std::unordered_map<std::string, std::string>& params);
...@@ -235,17 +257,23 @@ inline bool ConfigBase::GetInt( ...@@ -235,17 +257,23 @@ inline bool ConfigBase::GetInt(
const std::unordered_map<std::string, std::string>& params, const std::unordered_map<std::string, std::string>& params,
const std::string& name, int* out) { const std::string& name, int* out) {
if (params.count(name) > 0) { if (params.count(name) > 0) {
Common::Atoi(params.at(name).c_str(), out); if (!Common::AtoiAndCheck(params.at(name).c_str(), out)) {
Log::Fatal("Parameter %s should be int type, passed is [%s]",
name.c_str(), params.at(name).c_str());
}
return true; return true;
} }
return false; return false;
} }
inline bool ConfigBase::GetDouble( inline bool ConfigBase::GetFloat(
const std::unordered_map<std::string, std::string>& params, const std::unordered_map<std::string, std::string>& params,
const std::string& name, double* out) { const std::string& name, float* out) {
if (params.count(name) > 0) { if (params.count(name) > 0) {
Common::Atof(params.at(name).c_str(), out); if (!Common::AtofAndCheck(params.at(name).c_str(), out)) {
Log::Fatal("Parameter %s should be float type, passed is [%s]",
name.c_str(), params.at(name).c_str());
}
return true; return true;
} }
return false; return false;
...@@ -257,10 +285,13 @@ inline bool ConfigBase::GetBool( ...@@ -257,10 +285,13 @@ inline bool ConfigBase::GetBool(
if (params.count(name) > 0) { if (params.count(name) > 0) {
std::string value = params.at(name); std::string value = params.at(name);
std::transform(value.begin(), value.end(), value.begin(), ::tolower); std::transform(value.begin(), value.end(), value.begin(), ::tolower);
if (value == std::string("false")) { if (value == std::string("false") || value == std::string("-")) {
*out = false; *out = false;
} else { } else if (value == std::string("true") || value == std::string("+")) {
*out = true; *out = true;
} else {
Log::Fatal("Parameter %s should be \"true\"/\"+\" or \"false\"/\"-\", passed is [%s]",
name.c_str(), params.at(name).c_str());
} }
return true; return true;
} }
...@@ -318,7 +349,15 @@ struct ParameterAlias { ...@@ -318,7 +349,15 @@ struct ParameterAlias {
{ "save_binary", "is_save_binary_file" }, { "save_binary", "is_save_binary_file" },
{ "early_stopping_rounds", "early_stopping_round"}, { "early_stopping_rounds", "early_stopping_round"},
{ "early_stopping", "early_stopping_round"}, { "early_stopping", "early_stopping_round"},
{ "verbosity", "verbose" } { "verbosity", "verbose" },
{ "header", "has_header" },
{ "label", "label_column" },
{ "weight", "weight_column" },
{ "group", "group_column" },
{ "query", "group_column" },
{ "query_column", "group_column" },
{ "ignore_feature", "ignore_column" },
{ "blacklist", "ignore_column" }
}); });
std::unordered_map<std::string, std::string> tmp_map; std::unordered_map<std::string, std::string> tmp_map;
for (const auto& pair : *params) { for (const auto& pair : *params) {
......
...@@ -5,11 +5,13 @@ ...@@ -5,11 +5,13 @@
#include <LightGBM/utils/text_reader.h> #include <LightGBM/utils/text_reader.h>
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <vector> #include <vector>
#include <utility> #include <utility>
#include <functional> #include <functional>
#include <string> #include <string>
#include <unordered_set>
namespace LightGBM { namespace LightGBM {
...@@ -56,10 +58,12 @@ public: ...@@ -56,10 +58,12 @@ public:
~Metadata(); ~Metadata();
/*! /*!
* \brief Initial work, will auto load weight, inital scores * \brief Initial work, will allocate space for label, weight(if exists) and query(if exists)
* \param num_data Number of training data * \param num_data Number of training data
* \param weight_idx Index of weight column, < 0 means doesn't exists
* \param query_idx Index of query id column, < 0 means doesn't exists
*/ */
void InitLabel(data_size_t num_data); void Init(data_size_t num_data, int weight_idx, int query_idx);
/*! /*!
* \brief Partition label by used indices * \brief Partition label by used indices
...@@ -79,7 +83,7 @@ public: ...@@ -79,7 +83,7 @@ public:
* \brief Set initial scores * \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score. * \param init_score Initial scores, this class will manage memory for init_score.
*/ */
void SetInitScore(score_t* init_score); void SetInitScore(const float* init_score, data_size_t len);
/*! /*!
...@@ -104,9 +108,29 @@ public: ...@@ -104,9 +108,29 @@ public:
* \param idx Index of this record * \param idx Index of this record
* \param value Label value of this record * \param value Label value of this record
*/ */
inline void SetLabelAt(data_size_t idx, double value) inline void SetLabelAt(data_size_t idx, float value)
{ {
label_[idx] = static_cast<float>(value); label_[idx] = value;
}
/*!
* \brief Set Weight for one record
* \param idx Index of this record
* \param value Weight value of this record
*/
inline void SetWeightAt(data_size_t idx, float value)
{
weights_[idx] = value;
}
/*!
* \brief Set Query Id for one record
* \param idx Index of this record
* \param value Query Id value of this record
*/
inline void SetQueryAt(data_size_t idx, float value)
{
queries_[idx] = static_cast<data_size_t>(value);
} }
/*! /*!
...@@ -142,7 +166,7 @@ public: ...@@ -142,7 +166,7 @@ public:
* \brief Get initial scores, if not exists, will return nullptr * \brief Get initial scores, if not exists, will return nullptr
* \return Pointer of initial scores * \return Pointer of initial scores
*/ */
inline const score_t* init_score() const { return init_score_; } inline const float* init_score() const { return init_score_; }
/*! \brief Load initial scores from file */ /*! \brief Load initial scores from file */
void LoadInitialScore(); void LoadInitialScore();
...@@ -177,46 +201,40 @@ private: ...@@ -177,46 +201,40 @@ private:
/*! \brief Number of Initial score, used to check correct weight file */ /*! \brief Number of Initial score, used to check correct weight file */
data_size_t num_init_score_; data_size_t num_init_score_;
/*! \brief Initial score */ /*! \brief Initial score */
score_t* init_score_; float* init_score_;
/*! \brief Queries data */
data_size_t* queries_;
}; };
/*! \brief Interface for Parser */ /*! \brief Interface for Parser */
class Parser { class Parser {
public: public:
/*! \brief virtual destructor */ /*! \brief virtual destructor */
virtual ~Parser() {} virtual ~Parser() {}
/*!
* \brief Parse one line with label
* \param str One line record, string format, should end with '\0'
* \param out_features Output features, store in (feature_idx, feature_value)
* \param out_label Output label
*/
virtual void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features,
double* out_label) const = 0;
/*! /*!
* \brief Parse one line with label * \brief Parse one line with label
* \param str One line record, string format, should end with '\0' * \param str One line record, string format, should end with '\0'
* \param out_features Output features, store in (feature_idx, feature_value) * \param out_features Output columns, store in (column_idx, values)
* \param out_label Output label * \param out_label Label will store to this if exists
*/ */
virtual void ParseOneLine(const char* str, virtual void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features) const = 0; std::vector<std::pair<int, float>>* out_features, float* out_label) const = 0;
/*! /*!
* \brief Create a object of parser, will auto choose the format depend on file * \brief Create a object of parser, will auto choose the format depend on file
* \param filename One Filename of data * \param filename One Filename of data
* \param num_features Pass num_features of this data file if you know, <=0 means don't know * \param num_features Pass num_features of this data file if you know, <=0 means don't know
* \param has_label output, if num_features > 0, will output this data has label or not * \param label_idx index of label column
* \return Object of parser * \return Object of parser
*/ */
static Parser* CreateParser(const char* filename, int num_features, bool* has_label); static Parser* CreateParser(const char* filename, bool has_header, int num_features, int label_idx);
}; };
using PredictFunction = using PredictFunction =
std::function<double(const std::vector<std::pair<int, double>>&)>; std::function<float(const std::vector<std::pair<int, float>>&)>;
/*! \brief The main class of data set, /*! \brief The main class of data set,
* which are used to traning or validation * which are used to traning or validation
...@@ -227,29 +245,21 @@ public: ...@@ -227,29 +245,21 @@ public:
* \brief Constructor * \brief Constructor
* \param data_filename Filename of dataset * \param data_filename Filename of dataset
* \param init_score_filename Filename of initial score * \param init_score_filename Filename of initial score
* \param is_int_label True if label is int type * \param io_config configs for IO
* \param max_bin The maximal number of bin that feature values will bucket in
* \param random_seed The seed for random generator
* \param is_enable_sparse True for sparse feature
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score * \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/ */
Dataset(const char* data_filename, const char* init_score_filename, Dataset(const char* data_filename, const char* init_score_filename,
int max_bin, int random_seed, bool is_enable_sparse, const PredictFunction& predict_fun); const IOConfig& io_config, const PredictFunction& predict_fun);
/*! /*!
* \brief Constructor * \brief Constructor
* \param data_filename Filename of dataset * \param data_filename Filename of dataset
* \param is_int_label True if label is int type * \param io_config configs for IO
* \param max_bin The maximal number of bin that feature values will bucket in
* \param random_seed The seed for random generator
* \param is_enable_sparse True for sparse feature
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score * \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/ */
Dataset(const char* data_filename, Dataset(const char* data_filename,
int max_bin, int random_seed, bool is_enable_sparse, const IOConfig& io_config, const PredictFunction& predict_fun)
const PredictFunction& predict_fun) : Dataset(data_filename, "", io_config, predict_fun) {
: Dataset(data_filename, "", max_bin, random_seed,
is_enable_sparse, predict_fun) {
} }
/*! \brief Destructor */ /*! \brief Destructor */
...@@ -304,6 +314,12 @@ public: ...@@ -304,6 +314,12 @@ public:
/*! \brief Get Number of total features */ /*! \brief Get Number of total features */
inline int num_total_features() const { return num_total_features_; } inline int num_total_features() const { return num_total_features_; }
/*! \brief Get the index of label column */
inline int label_idx() const { return label_idx_; }
/*! \brief Get names of current data set */
inline std::vector<std::string> feature_names() const { return feature_names_; }
/*! \brief Get Number of data */ /*! \brief Get Number of data */
inline data_size_t num_data() const { return num_data_; } inline data_size_t num_data() const { return num_data_; }
...@@ -394,10 +410,20 @@ private: ...@@ -394,10 +410,20 @@ private:
bool is_loading_from_binfile_; bool is_loading_from_binfile_;
/*! \brief Number of global data, used for distributed learning */ /*! \brief Number of global data, used for distributed learning */
size_t global_num_data_ = 0; size_t global_num_data_ = 0;
// used to local used data indices /*! \brief used to local used data indices */
std::vector<data_size_t> used_data_indices_; std::vector<data_size_t> used_data_indices_;
// prediction function for initial model /*! \brief prediction function for initial model */
const PredictFunction& predict_fun_; const PredictFunction& predict_fun_;
/*! \brief index of label column */
int label_idx_ = 0;
/*! \brief index of weight column */
int weight_idx_ = -1;
/*! \brief index of group column */
int group_idx_ = -1;
/*! \brief Mapper from real feature index to used index*/
std::unordered_set<int> ignore_features_;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -71,7 +71,7 @@ public: ...@@ -71,7 +71,7 @@ public:
* \param idx Index of record * \param idx Index of record
* \param value feature value of record * \param value feature value of record
*/ */
inline void PushData(int tid, data_size_t line_idx, double value) { inline void PushData(int tid, data_size_t line_idx, float value) {
unsigned int bin = bin_mapper_->ValueToBin(value); unsigned int bin = bin_mapper_->ValueToBin(value);
bin_data_->Push(tid, line_idx, bin); bin_data_->Push(tid, line_idx, bin);
} }
...@@ -89,7 +89,7 @@ public: ...@@ -89,7 +89,7 @@ public:
* \param bin * \param bin
* \return Feature value of this bin * \return Feature value of this bin
*/ */
inline double BinToValue(unsigned int bin) inline float BinToValue(unsigned int bin)
const { return bin_mapper_->BinToValue(bin); } const { return bin_mapper_->BinToValue(bin); }
/*! /*!
......
...@@ -12,7 +12,7 @@ namespace LightGBM { ...@@ -12,7 +12,7 @@ namespace LightGBM {
/*! \brief Type of data size, it is better to use signed type*/ /*! \brief Type of data size, it is better to use signed type*/
typedef int32_t data_size_t; typedef int32_t data_size_t;
/*! \brief Type of score, and gradients */ /*! \brief Type of score, and gradients */
typedef double score_t; typedef float score_t;
const score_t kMinScore = -std::numeric_limits<score_t>::infinity(); const score_t kMinScore = -std::numeric_limits<score_t>::infinity();
......
...@@ -11,7 +11,7 @@ namespace LightGBM { ...@@ -11,7 +11,7 @@ namespace LightGBM {
/*! /*!
* \brief The interface of metric. * \brief The interface of metric.
* Metric is used to calculate and output metric result on training / validation data. * Metric is used to calculate metric result
*/ */
class Metric { class Metric {
public: public:
...@@ -27,12 +27,14 @@ public: ...@@ -27,12 +27,14 @@ public:
virtual void Init(const char* test_name, virtual void Init(const char* test_name,
const Metadata& metadata, data_size_t num_data) = 0; const Metadata& metadata, data_size_t num_data) = 0;
virtual const char* GetName() const = 0;
virtual bool is_bigger_better() const = 0;
/*! /*!
* \brief Calcaluting and printing metric result * \brief Calcaluting and printing metric result
* \param iter Current iteration
* \param score Current prediction score * \param score Current prediction score
*/ */
virtual score_t PrintAndGetLoss(int iter, const score_t* score) const = 0; virtual std::vector<float> Eval(const score_t* score) const = 0;
/*! /*!
* \brief Create object of metrics * \brief Create object of metrics
...@@ -41,8 +43,6 @@ public: ...@@ -41,8 +43,6 @@ public:
*/ */
static Metric* CreateMetric(const std::string& type, const MetricConfig& config); static Metric* CreateMetric(const std::string& type, const MetricConfig& config);
bool the_bigger_the_better = false;
int early_stopping_round_ = 0;
}; };
/*! /*!
...@@ -54,7 +54,7 @@ public: ...@@ -54,7 +54,7 @@ public:
* \brief Initial logic * \brief Initial logic
* \param label_gain Gain for labels, default is 2^i - 1 * \param label_gain Gain for labels, default is 2^i - 1
*/ */
static void Init(std::vector<double> label_gain); static void Init(std::vector<float> label_gain);
/*! /*!
* \brief Calculate the DCG score at position k * \brief Calculate the DCG score at position k
...@@ -64,7 +64,7 @@ public: ...@@ -64,7 +64,7 @@ public:
* \param num_data Number of data * \param num_data Number of data
* \return The DCG score * \return The DCG score
*/ */
static double CalDCGAtK(data_size_t k, const float* label, static float CalDCGAtK(data_size_t k, const float* label,
const score_t* score, data_size_t num_data); const score_t* score, data_size_t num_data);
/*! /*!
...@@ -77,7 +77,7 @@ public: ...@@ -77,7 +77,7 @@ public:
*/ */
static void CalDCG(const std::vector<data_size_t>& ks, static void CalDCG(const std::vector<data_size_t>& ks,
const float* label, const score_t* score, const float* label, const score_t* score,
data_size_t num_data, std::vector<double>* out); data_size_t num_data, std::vector<float>* out);
/*! /*!
* \brief Calculate the Max DCG score at position k * \brief Calculate the Max DCG score at position k
...@@ -86,7 +86,7 @@ public: ...@@ -86,7 +86,7 @@ public:
* \param num_data Number of data * \param num_data Number of data
* \return The max DCG score * \return The max DCG score
*/ */
static double CalMaxDCGAtK(data_size_t k, static float CalMaxDCGAtK(data_size_t k,
const float* label, data_size_t num_data); const float* label, data_size_t num_data);
/*! /*!
...@@ -97,22 +97,22 @@ public: ...@@ -97,22 +97,22 @@ public:
* \param out Output result * \param out Output result
*/ */
static void CalMaxDCG(const std::vector<data_size_t>& ks, static void CalMaxDCG(const std::vector<data_size_t>& ks,
const float* label, data_size_t num_data, std::vector<double>* out); const float* label, data_size_t num_data, std::vector<float>* out);
/*! /*!
* \brief Get discount score of position k * \brief Get discount score of position k
* \param k The position * \param k The position
* \return The discount of this position * \return The discount of this position
*/ */
inline static double GetDiscount(data_size_t k) { return discount_[k]; } inline static float GetDiscount(data_size_t k) { return discount_[k]; }
private: private:
/*! \brief True if inited, avoid init multi times */ /*! \brief True if inited, avoid init multi times */
static bool is_inited_; static bool is_inited_;
/*! \brief store gains for different label */ /*! \brief store gains for different label */
static std::vector<double> label_gain_; static std::vector<float> label_gain_;
/*! \brief store discount score for different position */ /*! \brief store discount score for different position */
static std::vector<double> discount_; static std::vector<float> discount_;
/*! \brief max position for eval */ /*! \brief max position for eval */
static const data_size_t kMaxPosition; static const data_size_t kMaxPosition;
}; };
......
...@@ -36,7 +36,7 @@ public: ...@@ -36,7 +36,7 @@ public:
* This function is used for prediction task, if has sigmoid param, the prediction value will be transform by sigmoid function. * This function is used for prediction task, if has sigmoid param, the prediction value will be transform by sigmoid function.
* \return Sigmoid param, if <=0.0 means don't use sigmoid transform on this objective. * \return Sigmoid param, if <=0.0 means don't use sigmoid transform on this objective.
*/ */
virtual double GetSigmoid() const = 0; virtual float GetSigmoid() const = 0;
/*! /*!
* \brief Create object of objective function * \brief Create object of objective function
......
...@@ -36,18 +36,18 @@ public: ...@@ -36,18 +36,18 @@ public:
* \param feature Index of feature; the converted index after removing useless features * \param feature Index of feature; the converted index after removing useless features
* \param threshold Threshold(bin) of split * \param threshold Threshold(bin) of split
* \param real_feature Index of feature, the original index on data * \param real_feature Index of feature, the original index on data
* \param threshold_double Threshold on feature value * \param threshold_float Threshold on feature value
* \param left_value Model Left child output * \param left_value Model Left child output
* \param right_value Model Right child output * \param right_value Model Right child output
* \param gain Split gain * \param gain Split gain
* \return The index of new leaf. * \return The index of new leaf.
*/ */
int Split(int leaf, int feature, unsigned int threshold, int real_feature, int Split(int leaf, int feature, unsigned int threshold, int real_feature,
double threshold_double, score_t left_value, float threshold_float, float left_value,
score_t right_value, double gain); float right_value, float gain);
/*! \brief Get the output of one leave */ /*! \brief Get the output of one leave */
inline score_t LeafOutput(int leaf) const { return leaf_value_[leaf]; } inline float LeafOutput(int leaf) const { return leaf_value_[leaf]; }
/*! /*!
* \brief Adding prediction value of this tree model to scores * \brief Adding prediction value of this tree model to scores
...@@ -74,20 +74,26 @@ public: ...@@ -74,20 +74,26 @@ public:
* \param feature_values Feature value of this record * \param feature_values Feature value of this record
* \return Prediction result * \return Prediction result
*/ */
inline score_t Predict(const double* feature_values) const; inline float Predict(const float* feature_values) const;
inline int PredictLeafIndex(const double* feature_values) const; inline int PredictLeafIndex(const float* feature_values) const;
/*! \brief Get Number of leaves*/ /*! \brief Get Number of leaves*/
inline int num_leaves() const { return num_leaves_; } inline int num_leaves() const { return num_leaves_; }
/*! \brief Get depth of specific leaf*/
inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }
/*! \brief Get feature of specific split*/
inline int split_feature_real(int split_idx) const { return split_feature_real_[split_idx]; }
/*! /*!
* \brief Shrinkage for the tree's output * \brief Shrinkage for the tree's output
* shrinkage rate (a.k.a learning rate) is used to tune the traning process * shrinkage rate (a.k.a learning rate) is used to tune the traning process
* \param rate The factor of shrinkage * \param rate The factor of shrinkage
*/ */
inline void Shrinkage(double rate) { inline void Shrinkage(float rate) {
for (int i = 0; i < num_leaves_; ++i) { for (int i = 0; i < num_leaves_; ++i) {
leaf_value_[i] = static_cast<score_t>(leaf_value_[i] * rate); leaf_value_[i] = leaf_value_[i] * rate;
} }
} }
...@@ -113,7 +119,7 @@ private: ...@@ -113,7 +119,7 @@ private:
* \param feature_values Feature value of this record * \param feature_values Feature value of this record
* \return Leaf index * \return Leaf index
*/ */
inline int GetLeaf(const double* feature_values) const; inline int GetLeaf(const float* feature_values) const;
/*! \brief Number of max leaves*/ /*! \brief Number of max leaves*/
int max_leaves_; int max_leaves_;
...@@ -131,23 +137,25 @@ private: ...@@ -131,23 +137,25 @@ private:
/*! \brief A non-leaf node's split threshold in bin */ /*! \brief A non-leaf node's split threshold in bin */
unsigned int* threshold_in_bin_; unsigned int* threshold_in_bin_;
/*! \brief A non-leaf node's split threshold in feature value */ /*! \brief A non-leaf node's split threshold in feature value */
double* threshold_; float* threshold_;
/*! \brief A non-leaf node's split gain */ /*! \brief A non-leaf node's split gain */
double* split_gain_; float* split_gain_;
// used for leaf node // used for leaf node
/*! \brief The parent of leaf */ /*! \brief The parent of leaf */
int* leaf_parent_; int* leaf_parent_;
/*! \brief Output of leaves */ /*! \brief Output of leaves */
score_t* leaf_value_; float* leaf_value_;
/*! \brief Depth for leaves */
int* leaf_depth_;
}; };
inline score_t Tree::Predict(const double* feature_values) const { inline float Tree::Predict(const float* feature_values) const {
int leaf = GetLeaf(feature_values); int leaf = GetLeaf(feature_values);
return LeafOutput(leaf); return LeafOutput(leaf);
} }
inline int Tree::PredictLeafIndex(const double* feature_values) const { inline int Tree::PredictLeafIndex(const float* feature_values) const {
int leaf = GetLeaf(feature_values); int leaf = GetLeaf(feature_values);
return leaf; return leaf;
} }
...@@ -166,7 +174,7 @@ inline int Tree::GetLeaf(const std::vector<BinIterator*>& iterators, ...@@ -166,7 +174,7 @@ inline int Tree::GetLeaf(const std::vector<BinIterator*>& iterators,
return ~node; return ~node;
} }
inline int Tree::GetLeaf(const double* feature_values) const { inline int Tree::GetLeaf(const float* feature_values) const {
int node = 0; int node = 0;
while (node >= 0) { while (node >= 0) {
if (feature_values[split_feature_real_[node]] <= threshold_[node]) { if (feature_values[split_feature_real_[node]] <= threshold_[node]) {
......
...@@ -43,14 +43,39 @@ inline static std::string& RemoveQuotationSymbol(std::string& str) { ...@@ -43,14 +43,39 @@ inline static std::string& RemoveQuotationSymbol(std::string& str) {
str.erase(0, str.find_first_not_of("'\"")); str.erase(0, str.find_first_not_of("'\""));
return str; return str;
} }
inline static bool StartsWith(const std::string& str, const std::string prefix) {
if (str.substr(0, prefix.size()) == prefix) {
return true;
} else {
return false;
}
}
inline static std::vector<std::string> Split(const char* c_str, char delimiter) {
std::vector<std::string> ret;
std::string str(c_str);
size_t i = 0;
size_t pos = str.find(delimiter);
while (pos != std::string::npos) {
ret.push_back(str.substr(i, pos - i));
i = ++pos;
pos = str.find(delimiter, pos);
}
ret.push_back(str.substr(i));
return ret;
}
inline static std::vector<std::string> Split(const char* str, char delimiter) { inline static std::vector<std::string> Split(const char* c_str, const char* delimiters) {
std::stringstream ss(str); // will split when met any chars in delimiters
std::string tmp_str;
std::vector<std::string> ret; std::vector<std::string> ret;
while (std::getline(ss, tmp_str, delimiter)) { std::string str(c_str);
ret.push_back(tmp_str); size_t i = 0;
size_t pos = str.find_first_of(delimiters);
while (pos != std::string::npos) {
ret.push_back(str.substr(i, pos - i));
i = ++pos;
pos = str.find_first_of(delimiters, pos);
} }
ret.push_back(str.substr(i));
return ret; return ret;
} }
...@@ -78,9 +103,9 @@ inline static const char* Atoi(const char* p, int* out) { ...@@ -78,9 +103,9 @@ inline static const char* Atoi(const char* p, int* out) {
} }
//ref to http://www.leapsecond.com/tools/fast_atof.c //ref to http://www.leapsecond.com/tools/fast_atof.c
inline static const char* Atof(const char* p, double* out) { inline static const char* Atof(const char* p, float* out) {
int frac; int frac;
double sign, value, scale; float sign, value, scale;
*out = 0; *out = 0;
// Skip leading white space, if any. // Skip leading white space, if any.
while (*p == ' ') { while (*p == ' ') {
...@@ -88,9 +113,9 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -88,9 +113,9 @@ inline static const char* Atof(const char* p, double* out) {
} }
// Get sign, if any. // Get sign, if any.
sign = 1.0; sign = 1.0f;
if (*p == '-') { if (*p == '-') {
sign = -1.0; sign = -1.0f;
++p; ++p;
} }
else if (*p == '+') { else if (*p == '+') {
...@@ -100,24 +125,24 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -100,24 +125,24 @@ inline static const char* Atof(const char* p, double* out) {
// is a number // is a number
if ((*p >= '0' && *p <= '9') || *p == '.' || *p == 'e' || *p == 'E') { if ((*p >= '0' && *p <= '9') || *p == '.' || *p == 'e' || *p == 'E') {
// Get digits before decimal point or exponent, if any. // Get digits before decimal point or exponent, if any.
for (value = 0.0; *p >= '0' && *p <= '9'; ++p) { for (value = 0.0f; *p >= '0' && *p <= '9'; ++p) {
value = value * 10.0 + (*p - '0'); value = value * 10.0f + (*p - '0');
} }
// Get digits after decimal point, if any. // Get digits after decimal point, if any.
if (*p == '.') { if (*p == '.') {
double pow10 = 10.0; float pow10 = 10.0f;
++p; ++p;
while (*p >= '0' && *p <= '9') { while (*p >= '0' && *p <= '9') {
value += (*p - '0') / pow10; value += (*p - '0') / pow10;
pow10 *= 10.0; pow10 *= 10.0f;
++p; ++p;
} }
} }
// Handle exponent, if any. // Handle exponent, if any.
frac = 0; frac = 0;
scale = 1.0; scale = 1.0f;
if ((*p == 'e') || (*p == 'E')) { if ((*p == 'e') || (*p == 'E')) {
unsigned int expon; unsigned int expon;
// Get sign of exponent, if any. // Get sign of exponent, if any.
...@@ -132,11 +157,9 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -132,11 +157,9 @@ inline static const char* Atof(const char* p, double* out) {
for (expon = 0; *p >= '0' && *p <= '9'; ++p) { for (expon = 0; *p >= '0' && *p <= '9'; ++p) {
expon = expon * 10 + (*p - '0'); expon = expon * 10 + (*p - '0');
} }
if (expon > 308) expon = 308; if (expon > 38) expon = 38;
// Calculate scaling factor.
while (expon >= 50) { scale *= 1E50; expon -= 50; }
while (expon >= 8) { scale *= 1E8; expon -= 8; } while (expon >= 8) { scale *= 1E8; expon -= 8; }
while (expon > 0) { scale *= 10.0; expon -= 1; } while (expon > 0) { scale *= 10.0f; expon -= 1; }
} }
// Return signed and scaled floating point result. // Return signed and scaled floating point result.
*out = sign * (frac ? (value / scale) : (value * scale)); *out = sign * (frac ? (value / scale) : (value * scale));
...@@ -152,9 +175,9 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -152,9 +175,9 @@ inline static const char* Atof(const char* p, double* out) {
std::string tmp_str(p, cnt); std::string tmp_str(p, cnt);
std::transform(tmp_str.begin(), tmp_str.end(), tmp_str.begin(), ::tolower); std::transform(tmp_str.begin(), tmp_str.end(), tmp_str.begin(), ::tolower);
if (tmp_str == std::string("na") || tmp_str == std::string("nan")) { if (tmp_str == std::string("na") || tmp_str == std::string("nan")) {
*out = 0; *out = 0.0f;
} else if( tmp_str == std::string("inf") || tmp_str == std::string("infinity")) { } else if( tmp_str == std::string("inf") || tmp_str == std::string("infinity")) {
*out = sign * 1e308; *out = sign * static_cast<float>(1e38);
} }
else { else {
Log::Fatal("Unknow token %s in data file", tmp_str.c_str()); Log::Fatal("Unknow token %s in data file", tmp_str.c_str());
...@@ -170,6 +193,22 @@ inline static const char* Atof(const char* p, double* out) { ...@@ -170,6 +193,22 @@ inline static const char* Atof(const char* p, double* out) {
return p; return p;
} }
inline bool AtoiAndCheck(const char* p, int* out) {
const char* after = Atoi(p, out);
if (*after != '\0') {
return false;
}
return true;
}
inline bool AtofAndCheck(const char* p, float* out) {
const char* after = Atof(p, out);
if (*after != '\0') {
return false;
}
return true;
}
inline static const char* SkipSpaceAndTab(const char* p) { inline static const char* SkipSpaceAndTab(const char* p) {
while (*p == ' ' || *p == '\t') { while (*p == ' ' || *p == '\t') {
++p; ++p;
...@@ -189,56 +228,57 @@ inline static std::string ArrayToString(const T* arr, int n, char delimiter) { ...@@ -189,56 +228,57 @@ inline static std::string ArrayToString(const T* arr, int n, char delimiter) {
if (n <= 0) { if (n <= 0) {
return std::string(""); return std::string("");
} }
std::stringstream ss; std::stringstream str_buf;
ss << arr[0]; str_buf << arr[0];
for (int i = 1; i < n; ++i) { for (int i = 1; i < n; ++i) {
ss << delimiter; str_buf << delimiter;
ss << arr[i]; str_buf << arr[i];
} }
return ss.str(); return str_buf.str();
} }
inline static void StringToIntArray(const std::string& str, char delimiter, size_t n, int* out) { template<typename T>
std::vector<std::string> strs = Split(str.c_str(), delimiter); inline static std::string ArrayToString(std::vector<T> arr, char delimiter) {
if (strs.size() != n) { if (arr.size() <= 0) {
Log::Fatal("StringToIntArray error, size doesn't matched."); return std::string("");
} }
for (size_t i = 0; i < strs.size(); ++i) { std::stringstream str_buf;
strs[i] = Trim(strs[i]); str_buf << arr[0];
Atoi(strs[i].c_str(), &out[i]); for (size_t i = 1; i < arr.size(); ++i) {
str_buf << delimiter;
str_buf << arr[i];
} }
return str_buf.str();
} }
inline static void StringToDoubleArray(const std::string& str, char delimiter, size_t n, double* out) { inline static void StringToIntArray(const std::string& str, char delimiter, size_t n, int* out) {
std::vector<std::string> strs = Split(str.c_str(), delimiter); std::vector<std::string> strs = Split(str.c_str(), delimiter);
if (strs.size() != n) { if (strs.size() != n) {
Log::Fatal("StringToDoubleArray error, size doesn't matched."); Log::Fatal("StringToIntArray error, size doesn't matched.");
} }
for (size_t i = 0; i < strs.size(); ++i) { for (size_t i = 0; i < strs.size(); ++i) {
strs[i] = Trim(strs[i]); strs[i] = Trim(strs[i]);
Atof(strs[i].c_str(), &out[i]); Atoi(strs[i].c_str(), &out[i]);
} }
} }
inline static void StringToDoubleArray(const std::string& str, char delimiter, size_t n, float* out) { inline static void StringToFloatArray(const std::string& str, char delimiter, size_t n, float* out) {
std::vector<std::string> strs = Split(str.c_str(), delimiter); std::vector<std::string> strs = Split(str.c_str(), delimiter);
if (strs.size() != n) { if (strs.size() != n) {
Log::Fatal("StringToDoubleArray error, size doesn't matched."); Log::Fatal("StringToFloatArray error, size doesn't matched.");
} }
double tmp;
for (size_t i = 0; i < strs.size(); ++i) { for (size_t i = 0; i < strs.size(); ++i) {
strs[i] = Trim(strs[i]); strs[i] = Trim(strs[i]);
Atof(strs[i].c_str(), &tmp); Atof(strs[i].c_str(), &out[i]);
out[i] = static_cast<float>(tmp);
} }
} }
inline static std::vector<double> StringToDoubleArray(const std::string& str, char delimiter) { inline static std::vector<float> StringToFloatArray(const std::string& str, char delimiter) {
std::vector<std::string> strs = Split(str.c_str(), delimiter); std::vector<std::string> strs = Split(str.c_str(), delimiter);
std::vector<double> ret; std::vector<float> ret;
for (size_t i = 0; i < strs.size(); ++i) { for (size_t i = 0; i < strs.size(); ++i) {
strs[i] = Trim(strs[i]); strs[i] = Trim(strs[i]);
double val = 0.0; float val = 0.0f;
Atof(strs[i].c_str(), &val); Atof(strs[i].c_str(), &val);
ret.push_back(val); ret.push_back(val);
} }
...@@ -296,6 +336,26 @@ static inline int64_t Pow2RoundUp(int64_t x) { ...@@ -296,6 +336,26 @@ static inline int64_t Pow2RoundUp(int64_t x) {
return 0; return 0;
} }
/*!
* \brief Do inplace softmax transformaton on p_rec
* \param p_rec The input/output vector of the values.
*/
inline void Softmax(std::vector<float>* p_rec) {
std::vector<float> &rec = *p_rec;
float wmax = rec[0];
for (size_t i = 1; i < rec.size(); ++i) {
wmax = std::max(rec[i], wmax);
}
float wsum = 0.0f;
for (size_t i = 0; i < rec.size(); ++i) {
rec[i] = std::exp(rec[i] - wmax);
wsum += rec[i];
}
for (size_t i = 0; i < rec.size(); ++i) {
rec[i] /= static_cast<float>(wsum);
}
}
} // namespace Common } // namespace Common
} // namespace LightGBM } // namespace LightGBM
......
...@@ -85,11 +85,8 @@ private: ...@@ -85,11 +85,8 @@ private:
// a trick to use static variable in header file. // a trick to use static variable in header file.
// May be not good, but avoid to use an additional cpp file // May be not good, but avoid to use an additional cpp file
static LogLevel& GetLevel() { static LogLevel& GetLevel() { static LogLevel level; return level; }
static LogLevel level;
return level;
};
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include <LightGBM/utils/log.h> #include <LightGBM/utils/log.h>
#include <cstring> #include <cstring>
#include <functional>
namespace LightGBM { namespace LightGBM {
...@@ -38,40 +39,41 @@ public: ...@@ -38,40 +39,41 @@ public:
cache_size_ = cache_size; cache_size_ = cache_size;
// at least need 2 bucket to store smaller leaf and larger leaf // at least need 2 bucket to store smaller leaf and larger leaf
CHECK(cache_size_ >= 2); CHECK(cache_size_ >= 2);
total_size_ = total_size; total_size_ = total_size;
if (cache_size_ > total_size_) {
pool_ = new T[cache_size]; cache_size_ = total_size_;
mapper_ = new int[total_size_]; }
inverse_mapper_ = new int[cache_size_]; is_enough_ = (cache_size_ == total_size_);
last_used_time_ = new int[cache_size_]; pool_ = new T[cache_size_];
ResetMap(); if (!is_enough_) {
mapper_ = new int[total_size_];
inverse_mapper_ = new int[cache_size_];
last_used_time_ = new int[cache_size_];
ResetMap();
}
} }
/*!
* \brief Return true if this pool is enough to store all data
*/
bool IsEnough() {
return cache_size_ == total_size_;
}
/*! /*!
* \brief Reset mapper * \brief Reset mapper
*/ */
void ResetMap() { void ResetMap() {
cur_time_ = 0; if (!is_enough_) {
memset(mapper_, -1, sizeof(int)*total_size_); cur_time_ = 0;
memset(inverse_mapper_, -1, sizeof(int)*cache_size_); memset(mapper_, -1, sizeof(int)*total_size_);
memset(last_used_time_, 0, sizeof(int)*cache_size_); memset(inverse_mapper_, -1, sizeof(int)*cache_size_);
memset(last_used_time_, 0, sizeof(int)*cache_size_);
}
} }
/*! /*!
* \brief Set data for the pool for specific index * \brief Fill the pool
* \param idx which index want to set to * \param obj_create_fun that used to generate object
* \param data
*/ */
void Set(int idx, const T& data) { void Fill(std::function<T()> obj_create_fun) {
pool_[idx] = data; for (int i = 0; i < cache_size_; ++i) {
pool_[i] = obj_create_fun();
}
} }
/*! /*!
...@@ -81,7 +83,11 @@ public: ...@@ -81,7 +83,11 @@ public:
* \return True if this index is in the pool, False if this index is not in the pool * \return True if this index is in the pool, False if this index is not in the pool
*/ */
bool Get(int idx, T* out) { bool Get(int idx, T* out) {
if (mapper_[idx] >= 0) { if (is_enough_) {
*out = pool_[idx];
return true;
}
else if (mapper_[idx] >= 0) {
int slot = mapper_[idx]; int slot = mapper_[idx];
*out = pool_[slot]; *out = pool_[slot];
last_used_time_[slot] = ++cur_time_; last_used_time_[slot] = ++cur_time_;
...@@ -108,6 +114,10 @@ public: ...@@ -108,6 +114,10 @@ public:
* \param dst_idx * \param dst_idx
*/ */
void Move(int src_idx, int dst_idx) { void Move(int src_idx, int dst_idx) {
if (is_enough_) {
std::swap(pool_[src_idx], pool_[dst_idx]);
return;
}
if (mapper_[src_idx] < 0) { if (mapper_[src_idx] < 0) {
return; return;
} }
...@@ -122,6 +132,7 @@ public: ...@@ -122,6 +132,7 @@ public:
inverse_mapper_[slot] = dst_idx; inverse_mapper_[slot] = dst_idx;
} }
private: private:
void FreeAll(){ void FreeAll(){
if (pool_ != nullptr) { if (pool_ != nullptr) {
delete[] pool_; delete[] pool_;
...@@ -139,6 +150,7 @@ private: ...@@ -139,6 +150,7 @@ private:
T* pool_ = nullptr; T* pool_ = nullptr;
int cache_size_; int cache_size_;
int total_size_; int total_size_;
bool is_enough_ = false;
int* mapper_ = nullptr; int* mapper_ = nullptr;
int* inverse_mapper_ = nullptr; int* inverse_mapper_ = nullptr;
int* last_used_time_ = nullptr; int* last_used_time_ = nullptr;
......
...@@ -21,7 +21,7 @@ public: ...@@ -21,7 +21,7 @@ public:
* \param filename Filename of data * \param filename Filename of data
* \process_fun Process function * \process_fun Process function
*/ */
static size_t Read(const char* filename, const std::function<size_t (const char*, size_t)>& process_fun) { static size_t Read(const char* filename, int skip_bytes, const std::function<size_t (const char*, size_t)>& process_fun) {
FILE* file; FILE* file;
#ifdef _MSC_VER #ifdef _MSC_VER
...@@ -38,8 +38,13 @@ public: ...@@ -38,8 +38,13 @@ public:
char* buffer_process = new char[buffer_size]; char* buffer_process = new char[buffer_size];
// buffer used for the file reading // buffer used for the file reading
char* buffer_read = new char[buffer_size]; char* buffer_read = new char[buffer_size];
size_t read_cnt = 0;
if (skip_bytes > 0) {
// skip first k bytes
read_cnt = fread(buffer_process, 1, skip_bytes, file);
}
// read first block // read first block
size_t read_cnt = fread(buffer_process, 1, buffer_size, file); read_cnt = fread(buffer_process, 1, buffer_size, file);
size_t last_read_cnt = 0; size_t last_read_cnt = 0;
while (read_cnt > 0) { while (read_cnt > 0) {
// strat read thread // strat read thread
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <LightGBM/utils/random.h> #include <LightGBM/utils/random.h>
#include <cstdio> #include <cstdio>
#include <sstream>
#include <vector> #include <vector>
#include <string> #include <string>
...@@ -22,9 +23,41 @@ public: ...@@ -22,9 +23,41 @@ public:
/*! /*!
* \brief Constructor * \brief Constructor
* \param filename Filename of data * \param filename Filename of data
* \param is_skip_first_line True if need to skip header
*/ */
TextReader(const char* filename): TextReader(const char* filename, bool is_skip_first_line):
filename_(filename){ filename_(filename), is_skip_first_line_(is_skip_first_line){
if (is_skip_first_line_) {
FILE* file;
#ifdef _MSC_VER
fopen_s(&file, filename, "r");
#else
file = fopen(filename, "r");
#endif
std::stringstream str_buf;
int read_c = -1;
read_c = fgetc(file);
while (read_c != EOF) {
char tmp_ch = static_cast<char>(read_c);
if (tmp_ch == '\n' || tmp_ch == '\r') {
break;
}
str_buf << tmp_ch;
++skip_bytes_;
read_c = fgetc(file);
}
if (static_cast<char>(read_c) == '\r') {
read_c = fgetc(file);
++skip_bytes_;
}
if (static_cast<char>(read_c) == '\n') {
read_c = fgetc(file);
++skip_bytes_;
}
fclose(file);
first_line_ = str_buf.str();
Log::Debug("skip header:\"%s\" in file %s", first_line_.c_str(), filename_);
}
} }
/*! /*!
* \brief Destructor * \brief Destructor
...@@ -40,6 +73,12 @@ public: ...@@ -40,6 +73,12 @@ public:
lines_.shrink_to_fit(); lines_.shrink_to_fit();
} }
/*! /*!
* \brief return first line of data
*/
inline std::string first_line() {
return first_line_;
}
/*!
* \brief Get text data that read from file * \brief Get text data that read from file
* \return Text data, store in std::vector by line * \return Text data, store in std::vector by line
*/ */
...@@ -48,7 +87,7 @@ public: ...@@ -48,7 +87,7 @@ public:
INDEX_T ReadAllAndProcess(const std::function<void(INDEX_T, const char*, size_t)>& process_fun) { INDEX_T ReadAllAndProcess(const std::function<void(INDEX_T, const char*, size_t)>& process_fun) {
last_line_ = ""; last_line_ = "";
INDEX_T total_cnt = 0; INDEX_T total_cnt = 0;
PipelineReader::Read(filename_, PipelineReader::Read(filename_, skip_bytes_,
[this, &total_cnt, &process_fun] [this, &total_cnt, &process_fun]
(const char* buffer_process, size_t read_cnt) { (const char* buffer_process, size_t read_cnt) {
size_t cnt = 0; size_t cnt = 0;
...@@ -73,7 +112,7 @@ public: ...@@ -73,7 +112,7 @@ public:
++i; ++i;
++total_cnt; ++total_cnt;
// skip end of line // skip end of line
while (buffer_process[i] == '\n' || buffer_process[i] == '\r') { ++i; } while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) { ++i; }
last_i = i; last_i = i;
} }
else { else {
...@@ -176,7 +215,7 @@ public: ...@@ -176,7 +215,7 @@ public:
last_line_ = ""; last_line_ = "";
INDEX_T total_cnt = 0; INDEX_T total_cnt = 0;
INDEX_T used_cnt = 0; INDEX_T used_cnt = 0;
PipelineReader::Read(filename_, PipelineReader::Read(filename_, skip_bytes_,
[this, &total_cnt, &process_fun,&used_cnt, &filter_fun] [this, &total_cnt, &process_fun,&used_cnt, &filter_fun]
(const char* buffer_process, size_t read_cnt) { (const char* buffer_process, size_t read_cnt) {
size_t cnt = 0; size_t cnt = 0;
...@@ -208,7 +247,7 @@ public: ...@@ -208,7 +247,7 @@ public:
++i; ++i;
++total_cnt; ++total_cnt;
// skip end of line // skip end of line
while (buffer_process[i] == '\n' || buffer_process[i] == '\r') { ++i; } while ((buffer_process[i] == '\n' || buffer_process[i] == '\r') && i < read_cnt) { ++i; }
last_i = i; last_i = i;
} }
else { else {
...@@ -242,7 +281,7 @@ public: ...@@ -242,7 +281,7 @@ public:
} }
INDEX_T ReadPartAndProcessParallel(const std::vector<INDEX_T>& used_data_indices, const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) { INDEX_T ReadPartAndProcessParallel(const std::vector<INDEX_T>& used_data_indices, const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun) {
return ReadAllAndProcessParallelWithFilter(process_fun, return ReadAllAndProcessParallelWithFilter(process_fun,
[&used_data_indices](INDEX_T used_cnt ,INDEX_T total_cnt) { [&used_data_indices](INDEX_T used_cnt ,INDEX_T total_cnt) {
if (used_cnt < used_data_indices.size() && total_cnt == used_data_indices[used_cnt]) { if (used_cnt < used_data_indices.size() && total_cnt == used_data_indices[used_cnt]) {
return true; return true;
...@@ -260,6 +299,12 @@ private: ...@@ -260,6 +299,12 @@ private:
std::vector<std::string> lines_; std::vector<std::string> lines_;
/*! \brief Buffer for last line */ /*! \brief Buffer for last line */
std::string last_line_; std::string last_line_;
/*! \brief first line */
std::string first_line_="";
/*! \brief is skip first line */
bool is_skip_first_line_ = false;
/*! \brief is skip first line */
int skip_bytes_ = 0;
}; };
} // namespace LightGBM } // namespace LightGBM
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment