Commit 1c774687 authored by Guolin Ke

first commit
192.168.1.101 12400
192.168.1.102 12400
task = predict
data = binary.test
input_model = LightGBM_model.txt
# task type, supports train and predict
task = train
# boosting type, supports gbdt for now, alias: boosting, boost
boosting_type = gbdt
# application type, supports the following applications:
# regression , regression task
# binary , binary classification task
# lambdarank , lambdarank task
# alias: application, app
objective = binary
# eval metrics, supports multiple metrics delimited by ',', supports the following metrics:
# l1
# l2 , default metric for regression
# ndcg , default metric for lambdarank
# auc
# binary_logloss , default metric for binary
# binary_error
metric = binary_logloss,auc
# frequency of metric output
metric_freq = 1
# true if metrics should also be output for the training data, alias: tranining_metric, train_metric
is_training_metric = true
# number of bins for feature bucketing; 255 is a recommended setting, it saves memory and still gives good accuracy
max_bin = 255
# training data
# if a weight file exists, it should be named "binary.train.weight"
# alias: train_data, train
data = binary.train
# validation data, supports multiple validation sets, separated by ','
# if a weight file exists, it should be named "binary.test.weight"
# alias: valid, test, test_data
valid_data = binary.test
# number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds
num_trees = 100
# shrinkage rate, alias: shrinkage_rate
learning_rate = 0.1
# number of leaves for one tree, alias: num_leaf
num_leaves = 63
# type of tree learner, supports the following types:
# serial , single machine version
# feature , use feature parallel to train
# data , use data parallel to train
# voting , use voting based parallel to train
# alias: tree
tree_learner = feature
# number of threads for multi-threading. One thread will use one CPU; the default is set to the number of CPUs.
# num_threads = 8
# feature sub-sampling, will randomly select 80% of the features to train on each iteration
# alias: sub_feature
feature_fraction = 0.8
# supports bagging (data sub-sampling), will perform bagging every 5 iterations
bagging_freq = 5
# bagging fraction, will randomly select 80% of the data for each bagging round
# alias: sub_row
bagging_fraction = 0.8
# minimal number of data points in one leaf, use this to deal with over-fitting
# alias: min_data_per_leaf, min_data
min_data_in_leaf = 50
# minimal sum of hessians in one leaf, use this to deal with over-fitting
min_sum_hessian_in_leaf = 5.0
# saves memory and gives faster speed for sparse features, alias: is_sparse
is_enable_sparse = true
# set this to true when the data is bigger than memory size; otherwise, false gives faster speed
# alias: two_round_loading, two_round
use_two_round_loading = false
# true to save the data to a binary file; the application will auto-load from the binary file next time
# alias: is_save_binary, save_binary
is_save_binary_file = false
# output model file
output_model = LightGBM_model.txt
# to continue training from a trained gbdt model
# input_model= trained_model.txt
# output prediction file for predict task
# output_result= prediction.txt
# to continue training from an initial score file
# input_init_score= init_score.txt
# number of machines in parallel training, alias: num_machine
num_machines = 2
# local listening port in parallel training, alias: local_port
local_listen_port = 12400
# machine list file for parallel training, alias: mlist
machine_list_file = mlist.txt
Regression Example
=====================
Here is an example of using LightGBM to run a regression task.
***You should copy the executable file to this folder first.***
#### Training
For Windows, run the following command in this folder:
```
LightGBM.exe config=train.conf
```
For Linux, run the following command in this folder:
```
./LightGBM config=train.conf
```
#### Prediction
You should finish the training step first.
For Windows, run the following command in this folder:
```
LightGBM.exe config=predict.conf
```
For Linux, run the following command in this folder:
```
./LightGBM config=predict.conf
```
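Parameters can also be passed on the command line in the same `key=value` form used by the config file (a sketch, assuming command-line pairs are merged with the values from `train.conf`):
```
./LightGBM config=train.conf num_trees=200
```
The trained model is written to `LightGBM_model.txt` (the `output_model` setting in `train.conf`), and the predict task writes its results to `LightGBM_predict_result.txt` by default, unless `output_result` is set in `predict.conf`.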
task = predict
data = regression.test
input_model = LightGBM_model.txt
# task type, supports train and predict
task = train
# boosting type, supports gbdt for now, alias: boosting, boost
boosting_type = gbdt
# application type, supports the following applications:
# regression , regression task
# binary , binary classification task
# lambdarank , lambdarank task
# alias: application, app
objective = regression
# eval metrics, supports multiple metrics delimited by ',', supports the following metrics:
# l1
# l2 , default metric for regression
# ndcg , default metric for lambdarank
# auc
# binary_logloss , default metric for binary
# binary_error
metric = l2
# frequency of metric output
metric_freq = 1
# true if metrics should also be output for the training data, alias: tranining_metric, train_metric
is_training_metric = true
# number of bins for feature bucketing; 255 is a recommended setting, it saves memory and still gives good accuracy
max_bin = 255
# training data
# if a weight file exists, it should be named "regression.train.weight"
# alias: train_data, train
data = regression.train
# validation data, supports multiple validation sets, separated by ','
# if a weight file exists, it should be named "regression.test.weight"
# alias: valid, test, test_data
valid_data = regression.test
# number of trees (iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds
num_trees = 100
# shrinkage rate, alias: shrinkage_rate
learning_rate = 0.05
# number of leaves for one tree, alias: num_leaf
num_leaves = 31
# type of tree learner, supports the following types:
# serial , single machine version
# feature , use feature parallel to train
# data , use data parallel to train
# voting , use voting based parallel to train
# alias: tree
tree_learner = serial
# number of threads for multi-threading. One thread will use one CPU; the default is set to the number of CPUs.
# num_threads = 8
# feature sub-sampling, will randomly select 90% of the features to train on each iteration
# alias: sub_feature
feature_fraction = 0.9
# supports bagging (data sub-sampling), will perform bagging every 5 iterations
bagging_freq = 5
# bagging fraction, will randomly select 80% of the data for each bagging round
# alias: sub_row
bagging_fraction = 0.8
# minimal number of data points in one leaf, use this to deal with over-fitting
# alias: min_data_per_leaf, min_data
min_data_in_leaf = 100
# minimal sum of hessians in one leaf, use this to deal with over-fitting
min_sum_hessian_in_leaf = 5.0
# saves memory and gives faster speed for sparse features, alias: is_sparse
is_enable_sparse = true
# set this to true when the data is bigger than memory size; otherwise, false gives faster speed
# alias: two_round_loading, two_round
use_two_round_loading = false
# true to save the data to a binary file; the application will auto-load from the binary file next time
# alias: is_save_binary, save_binary
is_save_binary_file = false
# output model file
output_model = LightGBM_model.txt
# to continue training from a trained gbdt model
# input_model= trained_model.txt
# output prediction file for predict task
# output_result= prediction.txt
# to continue training from an initial score file
# input_init_score= init_score.txt
# number of machines in parallel training, alias: num_machine
num_machines = 1
# local listening port in parallel training, alias: local_port
local_listen_port = 12400
# machine list file for parallel training, alias: mlist
machine_list_file = mlist.txt
#ifndef LIGHTGBM_APPLICATION_H_
#define LIGHTGBM_APPLICATION_H_
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <vector>
namespace LightGBM {
/*! \brief forward declaration */
class Dataset;
class Boosting;
class ObjectiveFunction;
class Metric;
/*!
* \brief The entrance of LightGBM. This application has two tasks:
* Train and Predict.
* The Train task trains a new model.
* The Predict task predicts the scores of the test data and saves them to local disk.
*/
class Application {
public:
Application(int argc, char** argv);
/*! \brief Destructor */
~Application();
/*! \brief Call this function to run the application */
inline void Run();
private:
/*!
* \brief Global sync by minimum; returns the global minimum
* \param local Local data
* \return The global minimum
*/
template<typename T>
T GlobalSyncUpByMin(T& local);
/*! \brief Load parameters from command line and config file */
void LoadParameters(int argc, char** argv);
/*! \brief Load data, including training data and validation data */
void LoadData();
/*! \brief Initialization work before training */
void InitTrain();
/*! \brief The training logic */
void Train();
/*! \brief Initialize the environment needed by prediction */
void InitPredict();
/*! \brief Load model */
void LoadModel();
/*! \brief The prediction logic */
void Predict();
/*! \brief All configs */
OverallConfig config_;
/*! \brief Training data */
Dataset* train_data_;
/*! \brief Validation data */
std::vector<Dataset*> valid_datas_;
/*! \brief Metric for training data */
std::vector<Metric*> train_metric_;
/*! \brief Metrics for validation data */
std::vector<std::vector<Metric*>> valid_metrics_;
/*! \brief Boosting object */
Boosting* boosting_;
/*! \brief Training objective function */
ObjectiveFunction* objective_fun_;
};
inline void Application::Run() {
if (config_.task_type == TaskType::kPredict) {
InitPredict();
Predict();
} else {
InitTrain();
Train();
}
}
} // namespace LightGBM
#endif  // LIGHTGBM_APPLICATION_H_
#ifndef LIGHTGBM_BIN_H_
#define LIGHTGBM_BIN_H_
#include <LightGBM/meta.h>
#include <vector>
#include <functional>
namespace LightGBM {
/*! \brief Store data for one histogram bin */
struct HistogramBinEntry {
public:
/*! \brief Sum of gradients on this bin */
score_t sum_gradients = 0.0;
/*! \brief Sum of hessians on this bin */
score_t sum_hessians = 0.0;
/*! \brief Number of data on this bin */
data_size_t cnt = 0;
/*!
* \brief Sum up reduce function for histogram bin
*/
inline static void SumReducer(const char *src, char *dst, int len) {
const int type_size = sizeof(HistogramBinEntry);
int used_size = 0;
const HistogramBinEntry* p1;
HistogramBinEntry* p2;
while (used_size < len) {
// convert
p1 = reinterpret_cast<const HistogramBinEntry*>(src);
p2 = reinterpret_cast<HistogramBinEntry*>(dst);
// add
p2->cnt += p1->cnt;
p2->sum_gradients += p1->sum_gradients;
p2->sum_hessians += p1->sum_hessians;
src += type_size;
dst += type_size;
used_size += type_size;
}
}
};
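/*!
* A minimal usage sketch: SumReducer matches the ReduceFunction signature from
* meta.h, so two packed histogram buffers can be merged element-wise
* ("kNumBins" here is a hypothetical constant):
*
*   HistogramBinEntry local[kNumBins], merged[kNumBins];
*   // ... fill both buffers ...
*   HistogramBinEntry::SumReducer(
*       reinterpret_cast<const char*>(local), reinterpret_cast<char*>(merged),
*       static_cast<int>(kNumBins * sizeof(HistogramBinEntry)));
*   // each merged[i] now accumulates local[i]'s cnt, sum_gradients, sum_hessians
*/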
/*! \brief This class is used to convert feature values to bins,
* and stores some meta information for the bins */
class BinMapper {
public:
BinMapper();
BinMapper(const BinMapper& other);
explicit BinMapper(const void* memory);
~BinMapper();
/*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; }
/*! \brief True if this bin is trivial (contains only one bin) */
inline bool is_trival() const { return is_trival_; }
/*! \brief Sparse rate of this feature's bins (num_zero_bins / num_data) */
inline double sparse_rate() const { return sparse_rate_; }
/*!
* \brief Save binary data to file
* \param file File to write to
*/
void SaveBinaryToFile(FILE* file) const;
/*!
* \brief Map bin to feature value
* \param bin
* \return Feature value for this bin
*/
inline double BinToValue(unsigned int bin) const {
return bin_upper_bound_[bin];
}
/*!
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const;
/*!
* \brief Map feature value to bin
* \param value
* \return bin for this feature value
*/
inline unsigned int ValueToBin(double value) const;
/*!
* \brief Construct the feature-value-to-bin mapper according to the feature values
* \param values (Sampled) values of this feature
* \param max_bin The maximal number of bins
*/
void FindBin(std::vector<double>* values, int max_bin);
/*!
* \brief Calculate the size of this class for a specific number of bins
* \param bin The number of bins
* \return Size
*/
static int SizeForSpecificBin(int bin);
/*!
* \brief Copy this object to buffer
* \param buffer The destination
*/
void CopyTo(char* buffer);
/*!
* \brief Restore this object from buffer
* \param buffer The source
*/
void CopyFrom(const char* buffer);
private:
/*! \brief Number of bins */
int num_bin_;
/*! \brief Store upper bound for each bin */
double* bin_upper_bound_;
/*! \brief True if this feature is trivial */
bool is_trival_;
/*! \brief Sparse rate of this feature's bins (num_zero_bins / num_data) */
double sparse_rate_;
};
/*!
* \brief Interface for ordered bin data, efficient for constructing histograms, especially for sparse bins
* There are 2 advantages to using ordered bins:
* 1. the data is grouped by leaf, which improves cache hits.
* 2. only the non-zero bins are stored, which speeds up histogram construction for sparse features.
* But there is an additional cost: the bins need to be re-ordered after each leaf split, which is expensive for dense features.
* So we only use ordered bins for sparse features for now.
*/
class OrderedBin {
public:
/*! \brief virtual destructor */
virtual ~OrderedBin() {}
/*!
* \brief Initialization logic, called before training one tree
* \param used_indices used_indices == nullptr means all data is used; otherwise, used_indices[i] != 0 means the i-th record is used (for the bagging logic)
* \param num_leaves Number of leaves on this iteration
*/
virtual void Init(const char* used_indices, data_size_t num_leaves) = 0;
/*!
* \brief Construct histogram by using this bin
* Note: Unlike Bin, OrderedBin doesn't use ordered gradients and ordered hessians.
* Because we skipped the zero bins, it is hard to know the relative index within one leaf for a sparse bin.
* \param leaf Which leaf's data to use for construction
* \param gradients Gradients, note: not ordered by leaf
* \param hessians Hessians, note: not ordered by leaf
* \param out Output Result
*/
virtual void ConstructHistogram(int leaf, const score_t* gradients,
const score_t* hessians, HistogramBinEntry* out) const = 0;
/*!
* \brief Split the current bin, and perform re-ordering by leaf
* \param leaf The leaf to split
* \param right_leaf The new leaf index after perform this split
* \param left_indices left_indices[i] != 0 means the i-th data will be on left leaf after split
*/
virtual void Split(int leaf, int right_leaf, const char* left_indices) = 0;
};
/*! \brief Iterator for one bin column */
class BinIterator {
public:
/*!
* \brief Get bin data on specific row index
* \param idx Index of this data
* \return Bin data
*/
virtual uint32_t Get(data_size_t idx) = 0;
};
/*!
* \brief Interface for bin data. This class will store bin data for one feature.
* unlike OrderedBin, this class will store data by original order.
* Though it may have many cache miss when construct histogram,
* but it doesn't need to re-order operation, So it is still faster than OrderedBin for dense feature
*/
class Bin {
public:
/*! \brief virtual destructor */
virtual ~Bin() {}
/*!
* \brief Push one record
* \param tid Thread id
* \param idx Index of record
* \param value bin value of record
*/
virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;
/*!
* \brief Get the bin iterator of this bin
* \param start_idx Start index of the iterator
* \return Iterator of this bin
*/
virtual BinIterator* GetIterator(data_size_t start_idx) const = 0;
/*!
* \brief Save binary data to file
* \param file File to write to
*/
virtual void SaveBinaryToFile(FILE* file) const = 0;
/*!
* \brief Load from memory
* \param memory Pointer to the memory
* \param local_used_indices Local used indices; empty means using all data
*/
virtual void LoadFromMemory(const void* memory,
const std::vector<data_size_t>& local_used_indices) = 0;
/*!
* \brief Get sizes in byte of this object
*/
virtual size_t SizesInByte() const = 0;
/*! \brief Number of all data */
virtual data_size_t num_data() const = 0;
/*!
* \brief Construct histogram of this feature,
* Note: ordered_gradients and ordered_hessians are used here to improve the cache hit rate
* The naive solution is to use gradients[data_indices[i]] to get the gradient of data_indices[i], which is not cache friendly, since the memory accesses are not contiguous.
* ordered_gradients and ordered_hessians are preprocessed, they are re-ordered by data_indices.
* It uses ordered_gradients[i] for data_indices[i]'s gradients (same for ordered_hessians).
* \param data_indices Used data indices in current leaf
* \param num_data Number of used data
* \param ordered_gradients Pointer to gradients, the data_indices[i]-th data's gradient is ordered_gradients[i]
* \param ordered_hessians Pointer to hessians, the data_indices[i]-th data's hessian is ordered_hessians[i]
* \param out Output Result
*/
virtual void ConstructHistogram(
data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const = 0;
/*!
* \brief Split data according to a threshold: if bin <= threshold, the record goes to the left (lte_indices), else to the right (gt_indices)
* \param threshold The split threshold
* \param data_indices Used data indices of the current leaf
* \param num_data Number of used data
* \param lte_indices After calling this function, stores the data indices with bin <= threshold
* \param gt_indices After calling this function, stores the data indices with bin > threshold
* \return The number of records with bin <= threshold
*/
virtual data_size_t Split(
unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const = 0;
/*!
* \brief Create the ordered bin for this bin
* \return Pointer to ordered bin
*/
virtual OrderedBin* CreateOrderedBin() const = 0;
/*!
* \brief After all feature data has been pushed, call this to finalize the bin data for a better layout
*/
virtual void FinishLoad() = 0;
/*!
* \brief Create an object of bin data for one feature; will call CreateDenseBin or CreateSparseBin according to the sparse rate and "is_enable_sparse"
* \param num_data Total number of data
* \param num_bin Number of bins
* \param sparse_rate Sparse rate of this feature's bins (num_zero_bins / num_data)
* \param is_enable_sparse True if sparse features are enabled
* \param is_sparse Will be set to true if this bin is sparse
* \return The bin data object
*/
static Bin* CreateBin(data_size_t num_data, int num_bin,
double sparse_rate, bool is_enable_sparse, bool* is_sparse);
/*!
* \brief Create an object of bin data for one feature, used for dense features
* \param num_data Total number of data
* \param num_bin Number of bins
* \return The bin data object
*/
static Bin* CreateDenseBin(data_size_t num_data, int num_bin);
/*!
* \brief Create an object of bin data for one feature, used for sparse features
* \param num_data Total number of data
* \param num_bin Number of bins
* \return The bin data object
*/
static Bin* CreateSparseBin(data_size_t num_data,
int num_bin);
};
inline unsigned int BinMapper::ValueToBin(double value) const {
// use binary search to find bin
int l = 0;
int r = num_bin_ - 1;
while (l < r) {
int m = (r + l - 1) / 2;
if (value <= bin_upper_bound_[m]) {
r = m;
} else {
l = m + 1;
}
}
return l;
}
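/*!
* An illustrative trace: ValueToBin is a lower-bound search over
* bin_upper_bound_, returning the first bin whose upper bound is >= value.
* E.g. with bin_upper_bound_ = {0.5, 1.5, +inf} (num_bin_ = 3):
*   ValueToBin(0.2) == 0, ValueToBin(1.5) == 1, ValueToBin(7.0) == 2
*/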
} // namespace LightGBM
#endif  // LIGHTGBM_BIN_H_
#ifndef LIGHTGBM_BOOSTING_H_
#define LIGHTGBM_BOOSTING_H_
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <vector>
#include <string>
namespace LightGBM {
/*! \brief forward declaration */
class Dataset;
class ObjectiveFunction;
class Metric;
/*!
* \brief The interface for Boosting
*/
class Boosting {
public:
/*! \brief virtual destructor */
virtual ~Boosting() {}
/*!
* \brief Initial logic
* \param config Config for boosting
* \param train_data Training data
* \param object_function Training objective function
* \param training_metrics Metrics for training data
* \param output_model_filename Filename of output model
*/
virtual void Init(const Dataset* train_data,
const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics,
const char* output_model_filename) = 0;
/*!
* \brief Add a validation dataset
* \param valid_data Validation data
* \param valid_metrics Metrics for validation data
*/
virtual void AddDataset(const Dataset* valid_data,
const std::vector<const Metric*>& valid_metrics) = 0;
/*! \brief Training logic */
virtual void Train() = 0;
/*!
* \brief Prediction for one record, without sigmoid transform
* \param feature_values Feature value on this record
* \return Prediction result for this record
*/
virtual double PredictRaw(const double * feature_values) const = 0;
/*!
* \brief Prediction for one record; will apply the sigmoid transform if needed
* \param feature_values Feature value on this record
* \return Prediction result for this record
*/
virtual double Predict(const double * feature_values) const = 0;
/*!
* \brief Serialize the models to a string
* \return String representation of the trained model
*/
virtual std::string ModelsToString() const = 0;
/*!
* \brief Restore from a serialized string
* \param model_str The string of the model
* \param num_used_model Number of sub-models to use when restoring
*/
virtual void ModelsFromString(const std::string& model_str, int num_used_model) = 0;
/*!
* \brief Get max feature index of this model
* \return Max feature index of this model
*/
virtual int MaxFeatureIdx() const = 0;
/*!
* \brief Get number of weak sub-models
* \return Number of weak sub-models
*/
virtual int NumberOfSubModels() const = 0;
/*!
* \brief Create boosting object
* \param type Type of boosting
* \return The boosting object
*/
static Boosting* CreateBoosting(BoostingType type,
const BoostingConfig* config);
};
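/*!
* A minimal sketch of the training flow implied by this interface
* (error handling omitted; the variable names are hypothetical):
*
*   Boosting* booster = Boosting::CreateBoosting(BoostingType::kGBDT, gbdt_config);
*   booster->Init(train_data, objective_fun, train_metrics, "LightGBM_model.txt");
*   booster->AddDataset(valid_data, valid_metrics);
*   booster->Train();
*/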
} // namespace LightGBM
#endif  // LIGHTGBM_BOOSTING_H_
#ifndef LIGHTGBM_CONFIG_H_
#define LIGHTGBM_CONFIG_H_
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>
#include <vector>
#include <string>
#include <unordered_map>
#include <algorithm>
namespace LightGBM {
/*!
* \brief The interface for Config
*/
struct ConfigBase {
public:
/*! \brief virtual destructor */
virtual ~ConfigBase() {}
/*!
* \brief Set the current config object by params
* \param params Store the key and value for params
*/
virtual void Set(
const std::unordered_map<std::string, std::string>& params) = 0;
/*!
* \brief Get string value by specific name of key
* \param params Store the key and value for params
* \param name Name of key
* \param out Value will be assigned to out if the key exists
* \return True if key exists
*/
inline bool GetString(
const std::unordered_map<std::string, std::string>& params,
const std::string& name, std::string* out);
/*!
* \brief Get int value by specific name of key
* \param params Store the key and value for params
* \param name Name of key
* \param out Value will be assigned to out if the key exists
* \return True if key exists
*/
inline bool GetInt(
const std::unordered_map<std::string, std::string>& params,
const std::string& name, int* out);
/*!
* \brief Get double value by specific name of key
* \param params Store the key and value for params
* \param name Name of key
* \param out Value will be assigned to out if the key exists
* \return True if key exists
*/
inline bool GetDouble(
const std::unordered_map<std::string, std::string>& params,
const std::string& name, double* out);
/*!
* \brief Get bool value by specific name of key
* \param params Store the key and value for params
* \param name Name of key
* \param out Value will be assigned to out if the key exists
* \return True if key exists
*/
inline bool GetBool(
const std::unordered_map<std::string, std::string>& params,
const std::string& name, bool* out);
};
/*! \brief Types of boosting */
enum BoostingType {
kGBDT
};
/*! \brief Types of tasks */
enum TaskType {
kTrain, kPredict
};
/*! \brief Config for input and output files */
struct IOConfig: public ConfigBase {
public:
int max_bin = 255;
int data_random_seed = 1;
std::string data_filename = "";
bool data_has_label = true;
std::vector<std::string> valid_data_filenames;
std::string output_model = "LightGBM_model.txt";
std::string output_result = "LightGBM_predict_result.txt";
std::string input_model = "";
std::string input_init_score = "";
int num_model_predict = -1;
bool is_pre_partition = false;
bool is_enable_sparse = true;
bool use_two_round_loading = false;
bool is_save_binary_file = false;
bool is_sigmoid = true;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
/*! \brief Config for objective function */
struct ObjectiveConfig: public ConfigBase {
public:
virtual ~ObjectiveConfig() {}
double sigmoid = 1;
// for lambdarank
std::vector<double> label_gain;
// for lambdarank
int max_position = 20;
// for binary
bool is_unbalance = false;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
/*! \brief Config for metrics interface*/
struct MetricConfig: public ConfigBase {
public:
virtual ~MetricConfig() {}
int output_freq = 1;
double sigmoid = 1;
bool is_provide_training_metric = false;
std::vector<double> label_gain;
std::vector<int> eval_at;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
/*! \brief Config for tree model */
struct TreeConfig: public ConfigBase {
public:
int min_data_in_leaf = 100;
double min_sum_hessian_in_leaf = 10.0f;
int num_leaves = 127;
int feature_fraction_seed = 2;
double feature_fraction = 1.0;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
/*! \brief Types of tree learning algorithms */
enum TreeLearnerType {
kSerialTreeLearner, kFeatureParallelTreelearner,
kDataParallelTreeLearner
};
/*! \brief Config for Boosting */
struct BoostingConfig: public ConfigBase {
public:
virtual ~BoostingConfig() {}
int num_iterations = 10;
double learning_rate = 0.1;
double bagging_fraction = 1.0;
int bagging_seed = 3;
int bagging_freq = 0;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
/*! \brief Config for GBDT */
struct GBDTConfig: public BoostingConfig {
public:
TreeLearnerType tree_learner_type = TreeLearnerType::kSerialTreeLearner;
TreeConfig tree_config;
void Set(const std::unordered_map<std::string, std::string>& params) override;
private:
void GetTreeLearnerType(const std::unordered_map<std::string,
std::string>& params);
};
/*! \brief Config for Network */
struct NetworkConfig: public ConfigBase {
public:
int num_machines = 1;
int local_listen_port = 12400;
int time_out = 120; // in minutes
std::string machine_list_filename = "";
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
/*! \brief Overall config; all configs are gathered in this class */
struct OverallConfig: public ConfigBase {
public:
TaskType task_type = TaskType::kTrain;
NetworkConfig network_config;
int num_threads = 0;
bool is_parallel = false;
bool is_parallel_find_bin = false;
IOConfig io_config;
BoostingType boosting_type = BoostingType::kGBDT;
BoostingConfig* boosting_config;
std::string objective_type = "regression";
ObjectiveConfig objective_config;
std::vector<std::string> metric_types;
MetricConfig metric_config;
~OverallConfig() {
delete boosting_config;
}
void Set(const std::unordered_map<std::string, std::string>& params) override;
private:
void GetBoostingType(const std::unordered_map<std::string, std::string>& params);
void GetObjectiveType(const std::unordered_map<std::string, std::string>& params);
void GetMetricType(const std::unordered_map<std::string, std::string>& params);
void GetTaskType(const std::unordered_map<std::string, std::string>& params);
void CheckParamConflict();
};
inline bool ConfigBase::GetString(
const std::unordered_map<std::string, std::string>& params,
const std::string& name, std::string* out) {
if (params.count(name) > 0) {
*out = params.at(name);
return true;
}
return false;
}
inline bool ConfigBase::GetInt(
const std::unordered_map<std::string, std::string>& params,
const std::string& name, int* out) {
if (params.count(name) > 0) {
Common::Atoi(params.at(name).c_str(), out);
return true;
}
return false;
}
inline bool ConfigBase::GetDouble(
const std::unordered_map<std::string, std::string>& params,
const std::string& name, double* out) {
if (params.count(name) > 0) {
Common::Atof(params.at(name).c_str(), out);
return true;
}
return false;
}
inline bool ConfigBase::GetBool(
const std::unordered_map<std::string, std::string>& params,
const std::string& name, bool* out) {
if (params.count(name) > 0) {
std::string value = params.at(name);
std::transform(value.begin(), value.end(), value.begin(), ::tolower);
if (value == std::string("false")) {
*out = false;
} else {
*out = true;
}
return true;
}
return false;
}
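/*!
* A minimal usage sketch: a Set() override typically reads each known key with
* the typed getters, leaving the member at its default when the key is absent.
*
*   // inside a ConfigBase subclass, with params = {{"num_leaves", "31"}}
*   int num_leaves = 127;                        // default
*   GetInt(params, "num_leaves", &num_leaves);   // returns true; num_leaves == 31
*   GetInt(params, "missing_key", &num_leaves);  // returns false; value unchanged
*/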
struct ParameterAlias {
static void KeyAliasTransform(std::unordered_map<std::string, std::string>* params) {
std::unordered_map<std::string, std::string> alias_table(
{
{ "config", "config_file" },
{ "nthread", "num_threads" },
{ "num_thread", "num_threads" },
{ "boosting", "boosting_type" },
{ "boost", "boosting_type" },
{ "application", "objective" },
{ "app", "objective" },
{ "train_data", "data" },
{ "train", "data" },
{ "has_label", "data_has_label" },
{ "is_data_has_label", "data_has_label" },
{ "model_output", "output_model" },
{ "model_out", "output_model" },
{ "model_input", "input_model" },
{ "model_in", "input_model" },
{ "init_score", "input_init_score"},
{ "predict_result", "output_result" },
{ "prediction_result", "output_result" },
{ "valid", "valid_data" },
{ "test_data", "valid_data" },
{ "test", "valid_data" },
{ "is_sparse", "is_enable_sparse" },
{ "tranining_metric", "is_training_metric" },
{ "train_metric", "is_training_metric" },
{ "ndcg_at", "ndcg_eval_at" },
{ "min_data_per_leaf", "min_data_in_leaf" },
{ "min_data", "min_data_in_leaf" },
{ "min_sum_hessian_per_leaf", "min_sum_hessian_in_leaf" },
{ "min_sum_hessian", "min_sum_hessian_in_leaf" },
{ "min_hessian", "min_sum_hessian_in_leaf" },
{ "num_leaf", "num_leaves" },
{ "sub_feature", "feature_fraction" },
{ "num_iteration", "num_iterations" },
{ "num_tree", "num_iterations" },
{ "num_round", "num_iterations" },
{ "num_trees", "num_iterations" },
{ "num_rounds", "num_iterations" },
{ "sub_row", "bagging_fraction" },
{ "shrinkage_rate", "learning_rate" },
{ "tree", "tree_learner" },
{ "num_machine", "num_machines" },
{ "local_port", "local_listen_port" },
{ "two_round_loading", "use_two_round_loading"},
{ "two_round", "use_two_round_loading" },
{ "mlist", "machine_list_file" },
{ "is_save_binary", "is_save_binary_file" },
{ "save_binary", "is_save_binary_file" }
});
std::unordered_map<std::string, std::string> tmp_map;
for (const auto& pair : *params) {
if (alias_table.count(pair.first) > 0) {
tmp_map[alias_table[pair.first]] = pair.second;
}
}
for (const auto& pair : tmp_map) {
if (params->count(pair.first) == 0) {
params->insert(std::make_pair(pair.first, pair.second));
}
}
}
};
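/*!
* An illustrative trace of KeyAliasTransform: each alias adds its canonical
* key, but an explicitly given canonical key always wins:
*
*   { "num_tree": "100" }                          ->  adds { "num_iterations": "100" }
*   { "num_tree": "100", "num_iterations": "50" }  ->  "num_iterations" stays "50"
*
* The alias entries themselves are left in the map; keys that no Set()
* implementation asks for are simply ignored.
*/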
} // namespace LightGBM
#endif  // LIGHTGBM_CONFIG_H_
#ifndef LIGHTGBM_DATA_H_
#define LIGHTGBM_DATA_H_
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/meta.h>
#include <vector>
#include <utility>
#include <functional>
#include <string>
namespace LightGBM {
/*! \brief forward declaration */
class Feature;
/*!
* \brief This class is used to store some meta (non-feature) data for the training data,
* e.g. labels, weights, initial scores, query-level information.
*
* Some details:
* 1. Label, used for training.
* 2. Weights, the weights of the records, optional.
* 3. Query Boundaries, necessary for lambdarank.
*    The documents of the i-th query are in [ query_boundaries[i], query_boundaries[i+1] ).
* 4. Query Weights, auto-calculated from weights and query_boundaries (if both exist);
*    the weight for the i-th query is sum(weights[query_boundaries[i]], ..., weights[query_boundaries[i+1] - 1]) / (query_boundaries[i+1] - query_boundaries[i]).
* 5. Initial score, optional. If it exists, the model will boost from this score, otherwise it will start from 0.
*/
class Metadata {
public:
/*!
* \brief Null constructor
*/
Metadata();
/*!
* \brief Initialize, will load query-level information, since it is needed for sampling data
* \param data_filename Filename of the data
* \param init_score_filename Filename of the initial scores
*/
void Init(const char* data_filename, const char* init_score_filename);
/*!
* \brief Initialize, only load initial score
* \param init_score_filename Filename of initial score
*/
void Init(const char* init_score_filename);
/*!
* \brief Initialize from binary memory
* \param memory Pointer to memory
*/
void LoadFromMemory(const void* memory);
/*! \brief Destructor */
~Metadata();
/*!
* \brief Initialization work; will auto-load weights and initial scores
* \param num_data Number of training data
*/
void InitLabel(data_size_t num_data);
/*!
* \brief Partition the labels by used indices
* \param used_indices Indices of the locally used data
*/
void PartitionLabel(const std::vector<data_size_t>& used_indices);
/*!
* \brief Partition meta data according to local used indices, if needed
* \param num_all_data Number of total training data, including other machines' data on parallel learning
* \param used_data_indices Indices of local used training data
*/
void CheckOrPartition(data_size_t num_all_data,
const std::vector<data_size_t>& used_data_indices);
/*!
* \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score.
*/
void SetInitScore(score_t* init_score);
/*!
* \brief Save binary data to file
* \param file File to write to
*/
void SaveBinaryToFile(FILE* file) const;
/*!
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const;
/*!
* \brief Get pointer of label
* \return Pointer of label
*/
inline const float* label() const { return label_; }
/*!
* \brief Set label for one record
* \param idx Index of this record
* \param value Label value of this record
*/
inline void SetLabelAt(data_size_t idx, double value)
{
label_[idx] = static_cast<float>(value);
}
/*!
* \brief Get weights; if they do not exist, returns nullptr
* \return Pointer of weights
*/
inline const float* weights()
const { return weights_; }
/*!
* \brief Get the data boundaries of the queries; if they do not exist, returns nullptr
* We assume the data is ordered by query;
* the interval [ query_boundaries[i], query_boundaries[i+1] )
* contains the data indices for query i.
* \return Pointer of data boundaries on queries
*/
inline const data_size_t* query_boundaries()
const { return query_boundaries_; }
/*!
* \brief Get Number of queries
* \return Number of queries
*/
inline const data_size_t num_queries() const { return num_queries_; }
/*!
* \brief Get the weights for queries; if they do not exist, returns nullptr
* \return Pointer of weights for queries
*/
inline const float* query_weights() const { return query_weights_; }
/*!
* \brief Get initial scores; if they do not exist, returns nullptr
* \return Pointer of initial scores
*/
inline const score_t* init_score() const { return init_score_; }
/*! \brief Load initial scores from file */
void LoadInitialScore();
private:
/*! \brief Load weights from file */
void LoadWeights();
/*! \brief Load query boundaries from file */
void LoadQueryBoundaries();
/*! \brief Load query weights */
void LoadQueryWeights();
/*! \brief Filename of current data */
const char* data_filename_;
/*! \brief Filename of initial scores */
const char* init_score_filename_;
/*! \brief Number of data */
data_size_t num_data_;
/*! \brief Number of weights, used to check that the weight file is correct */
data_size_t num_weights_;
/*! \brief Label data */
float* label_;
/*! \brief Label data, int type */
int16_t* label_int_;
/*! \brief Weights data */
float* weights_;
/*! \brief Query boundaries */
data_size_t* query_boundaries_;
/*! \brief Query weights */
float* query_weights_;
/*! \brief Number of queries */
data_size_t num_queries_;
/*! \brief Number of initial scores, used to check that the initial score file is correct */
data_size_t num_init_score_;
/*! \brief Initial score */
score_t* init_score_;
};
/*! \brief Interface for Parser */
class Parser {
public:
/*! \brief virtual destructor */
virtual ~Parser() {}
/*!
* \brief Parse one line with label
* \param str One line record, string format, should end with '\0'
* \param out_features Output features, store in (feature_idx, feature_value)
* \param out_label Output label
*/
virtual void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features,
double* out_label) const = 0;
/*!
* \brief Parse one line without label
* \param str One line record, string format, should end with '\0'
* \param out_features Output features, stored as (feature_idx, feature_value)
*/
virtual void ParseOneLine(const char* str,
std::vector<std::pair<int, double>>* out_features) const = 0;
/*!
* \brief Create an object of a parser; will auto-detect the format from the file
* \param filename Filename of the data
* \return Object of parser
*/
static Parser* CreateParser(const char* filename);
};
using PredictFunction =
std::function<double(const std::vector<std::pair<int, double>>&)>;
/*! \brief The main class of dataset,
* used for training or validation
*/
class Dataset {
public:
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param init_score_filename Filename of initial score
* \param max_bin The maximal number of bins that feature values will be bucketed into
* \param random_seed The seed for the random generator
* \param is_enable_sparse True to enable sparse features
* \param predict_fun Used for the initial model; gives a prediction score based on this function, which is then set as the initial score
*/
Dataset(const char* data_filename, const char* init_score_filename,
int max_bin, int random_seed, bool is_enable_sparse, const PredictFunction& predict_fun);
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param max_bin The maximal number of bins that feature values will be bucketed into
* \param random_seed The seed for the random generator
* \param is_enable_sparse True to enable sparse features
* \param predict_fun Used for the initial model; gives a prediction score based on this function, which is then set as the initial score
*/
Dataset(const char* data_filename,
int max_bin, int random_seed, bool is_enable_sparse,
const PredictFunction& predict_fun)
: Dataset(data_filename, "", max_bin, random_seed,
is_enable_sparse, predict_fun) {
}
/*! \brief Destructor */
~Dataset();
/*!
* \brief Load training data on parallel training
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
* \param use_two_round_loading True if need to use two round loading
*/
void LoadTrainData(int rank, int num_machines, bool is_pre_partition,
bool use_two_round_loading);
/*!
* \brief Load training data on single machine training
* \param use_two_round_loading True if need to use two round loading
*/
inline void LoadTrainData(bool use_two_round_loading) {
LoadTrainData(0, 1, false, use_two_round_loading);
}
/*!
* \brief Load data using the bin mappers from another dataset; generally this function is used to extract features for validation data
* \param train_set Other loaded data set
* \param use_two_round_loading True if need to use two round loading
*/
void LoadValidationData(const Dataset* train_set, bool use_two_round_loading);
/*!
* \brief Save current dataset into binary file, will save to "filename.bin"
*/
void SaveBinaryFile();
/*!
* \brief Get a feature pointer for specific index
* \param i Index for feature
* \return Pointer of feature
*/
inline const Feature* FeatureAt(int i) const { return features_[i]; }
/*!
* \brief Get meta data pointer
* \return Pointer of meta data
*/
inline const Metadata& metadata() const { return metadata_; }
/*! \brief Get Number of used features */
inline int num_features() const { return num_features_; }
/*! \brief Get Number of data */
inline data_size_t num_data() const { return num_data_; }
/*! \brief Disable copy */
Dataset& operator=(const Dataset&) = delete;
/*! \brief Disable copy */
Dataset(const Dataset&) = delete;
private:
/*!
* \brief Load the data content into memory; if num_machines > 1 and !is_pre_partition, will partition the data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void LoadDataToMemory(int rank, int num_machines, bool is_pre_partition);
/*!
* \brief Sample data from memory; requires the data to be loaded into memory first
* \param out_data Store the sampled data
*/
void SampleDataFromMemory(std::vector<std::string>* out_data);
/*!
* \brief Sample data from file
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
* \param out_data Store the sampled data
*/
void SampleDataFromFile(int rank, int num_machines,
bool is_pre_partition, std::vector<std::string>* out_data);
/*!
* \brief Get the feature bin mappers from sampled data.
* If num_machines > 1, different machines will construct bin mappers for different features, then perform a global sync-up
* \param rank Rank of local machine
* \param num_machines Total number of all machines
*/
void ConstructBinMappers(int rank, int num_machines,
const std::vector<std::string>& sample_data);
/*! \brief Extract local features from memory */
void ExtractFeaturesFromMemory();
/*! \brief Extract local features from file */
void ExtractFeaturesFromFile();
/*! \brief Check whether loading from the binary file is possible */
void CheckCanLoadFromBin();
/*!
* \brief Load data set from binary file
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void LoadDataFromBinFile(int rank, int num_machines, bool is_pre_partition);
/*! \brief Check whether this dataset is null */
void CheckDataset();
/*! \brief Filename of data */
const char* data_filename_;
/*! \brief A reader class that can read text data */
TextReader<data_size_t>* text_reader_;
/*! \brief A parser class that can parse data */
Parser* parser_;
/*! \brief Store used features */
std::vector<Feature*> features_;
/*! \brief Mapper from real feature index to used index*/
std::vector<int> used_feature_map_;
/*! \brief Number of used features*/
int num_features_;
/*! \brief Number of total data*/
data_size_t num_data_;
/*! \brief Store some label level data*/
Metadata metadata_;
/*! \brief Random generator*/
Random random_;
/*! \brief The maximal number of bins that feature values will be bucketed into */
int max_bin_;
/*! \brief True if enable sparse */
bool is_enable_sparse_;
/*! \brief True if dataset is loaded from binary file */
bool is_loading_from_binfile_;
/*! \brief Number of global data, used for distributed learning */
size_t global_num_data_ = 0;
// locally used data indices
std::vector<data_size_t> used_data_indices_;
// prediction function for initial model
const PredictFunction& predict_fun_;
};
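/*!
* A minimal single-machine loading sketch based on the declarations above
* (the variable names are hypothetical):
*
*   Dataset* train_data = new Dataset("binary.train", max_bin, seed,
*                                     is_enable_sparse, predict_fun);
*   train_data->LoadTrainData(use_two_round_loading);
*   Dataset* valid_data = new Dataset("binary.test", max_bin, seed,
*                                     is_enable_sparse, predict_fun);
*   valid_data->LoadValidationData(train_data, use_two_round_loading);
*/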
} // namespace LightGBM
#endif  // LIGHTGBM_DATA_H_
#ifndef LIGHTGBM_FEATURE_H_
#define LIGHTGBM_FEATURE_H_
#include <LightGBM/utils/random.h>
#include <LightGBM/meta.h>
#include <LightGBM/bin.h>
#include <cstdio>
#include <vector>
namespace LightGBM {
/*! \brief Used to store data and provide some operations on one feature*/
class Feature {
public:
/*!
* \brief Constructor
* \param feature_idx Index of this feature
* \param bin_mapper Bin mapper for this feature
* \param num_data Total number of data
* \param is_enable_sparse True if enable sparse feature
*/
Feature(int feature_idx, BinMapper* bin_mapper,
data_size_t num_data, bool is_enable_sparse)
:bin_mapper_(bin_mapper) {
feature_index_ = feature_idx;
bin_data_ = Bin::CreateBin(num_data, bin_mapper_->num_bin(),
bin_mapper_->sparse_rate(), is_enable_sparse, &is_sparse_);
}
/*!
* \brief Constructor from memory
* \param memory Pointer of memory
* \param num_all_data Number of global data
* \param local_used_indices Local used indices, empty means using all data
*/
Feature(const void* memory, data_size_t num_all_data,
const std::vector<data_size_t>& local_used_indices) {
const char* memory_ptr = reinterpret_cast<const char*>(memory);
// get feature index
feature_index_ = *(reinterpret_cast<const int*>(memory_ptr));
memory_ptr += sizeof(feature_index_);
// get is_sparse
is_sparse_ = *(reinterpret_cast<const bool*>(memory_ptr));
memory_ptr += sizeof(is_sparse_);
// get bin mapper
bin_mapper_ = new BinMapper(memory_ptr);
memory_ptr += bin_mapper_->SizesInByte();
data_size_t num_data = num_all_data;
if (local_used_indices.size() > 0) {
num_data = static_cast<data_size_t>(local_used_indices.size());
}
if (is_sparse_) {
bin_data_ = Bin::CreateSparseBin(num_data, bin_mapper_->num_bin());
} else {
bin_data_ = Bin::CreateDenseBin(num_data, bin_mapper_->num_bin());
}
// get bin data
bin_data_->LoadFromMemory(memory_ptr, local_used_indices);
}
/*! \brief Destructor */
~Feature() {
delete bin_mapper_;
delete bin_data_;
}
/*!
* \brief Push one record; will auto-convert the value to a bin and push it to the bin data
* \param tid Thread id
* \param line_idx Index of the record
* \param value Feature value of the record
*/
inline void PushData(int tid, data_size_t line_idx, double value) {
unsigned int bin = bin_mapper_->ValueToBin(value);
bin_data_->Push(tid, line_idx, bin);
}
inline void FinishLoad() { bin_data_->FinishLoad(); }
/*! \brief Index of this feature */
inline int feature_index() const { return feature_index_; }
/*! \brief Bin mapper that this feature used */
inline const BinMapper* bin_mapper() const { return bin_mapper_; }
/*! \brief Number of bin of this feature */
inline int num_bin() const { return bin_mapper_->num_bin(); }
/*! \brief Get bin data of this feature */
inline const Bin* bin_data() const { return bin_data_; }
/*!
* \brief From bin to feature value
* \param bin
* \return Feature value of this bin
*/
inline double BinToValue(unsigned int bin)
const { return bin_mapper_->BinToValue(bin); }
/*!
* \brief Save binary data to file
* \param file File to write to
*/
void SaveBinaryToFile(FILE* file) const {
fwrite(&feature_index_, sizeof(feature_index_), 1, file);
fwrite(&is_sparse_, sizeof(is_sparse_), 1, file);
bin_mapper_->SaveBinaryToFile(file);
bin_data_->SaveBinaryToFile(file);
}
/*!
* \brief Get sizes in byte of this object
*/
size_t SizesInByte() const {
return sizeof(feature_index_) + sizeof(is_sparse_) +
bin_mapper_->SizesInByte() + bin_data_->SizesInByte();
}
/*! \brief Disable copy */
Feature& operator=(const Feature&) = delete;
/*! \brief Disable copy */
Feature(const Feature&) = delete;
private:
/*! \brief Index of this feature */
int feature_index_;
/*! \brief Bin mapper that this feature used */
BinMapper* bin_mapper_;
/*! \brief Bin data of this feature */
Bin* bin_data_;
/*! \brief True if this feature is sparse */
bool is_sparse_;
};
} // namespace LightGBM
#endif  // LIGHTGBM_FEATURE_H_
#ifndef LIGHTGBM_META_H_
#define LIGHTGBM_META_H_
#include <cstdint>
#include <limits>
#include <vector>
#include <functional>
namespace LightGBM {
/*! \brief Type of data size, it is better to use signed type*/
typedef int32_t data_size_t;
/*! \brief Type of score, and gradients */
typedef double score_t;
const score_t kMinScore = -std::numeric_limits<score_t>::infinity();
const score_t kEpsilon = 1e-15f;
template<typename T>
std::vector<const T*> ConstPtrInVectorWarpper(std::vector<T*> input) {
return std::vector<const T*>(input.begin(), input.end());
}
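// Illustrative: this wrapper converts e.g. std::vector<Metric*> into
// std::vector<const Metric*>, so callers holding mutable containers can pass
// them to interfaces that take read-only element pointers, such as Boosting::Init.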
using ReduceFunction = std::function<void(const char*, char*, int)>;
} // namespace LightGBM
#endif  // LIGHTGBM_META_H_
#ifndef LIGHTGBM_METRIC_H_
#define LIGHTGBM_METRIC_H_
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <LightGBM/dataset.h>
#include <vector>
namespace LightGBM {
/*!
* \brief The interface of metric.
* Metric is used to calculate and output metric result on training / validation data.
*/
class Metric {
public:
/*! \brief virtual destructor */
virtual ~Metric() {}
/*!
* \brief Initialize
* \param test_name Specific name of this metric, will be output in the log
* \param metadata Label data
* \param num_data Number of data
*/
virtual void Init(const char* test_name,
const Metadata& metadata, data_size_t num_data) = 0;
/*!
* \brief Calculate and print the metric result
* \param iter Current iteration
* \param score Current prediction score
*/
virtual void Print(int iter, const score_t* score) const = 0;
/*!
* \brief Create object of metrics
* \param type Specific type of metric
* \param config Config for metric
*/
static Metric* CreateMetric(const std::string& type, const MetricConfig& config);
};
/*!
* \brief Static class, used to calculate DCG score
*/
class DCGCalculator {
public:
/*!
* \brief Initial logic
* \param label_gain Gain for labels, default is 2^i - 1
*/
static void Init(std::vector<double> label_gain);
/*!
* \brief Calculate the DCG score at position k
* \param k The position to evaluate at
* \param label Pointer of label
* \param score Pointer of score
* \param num_data Number of data
* \return The DCG score
*/
static double CalDCGAtK(data_size_t k, const float* label,
const score_t* score, data_size_t num_data);
/*!
* \brief Calculate the DCG score at multi position
* \param ks The positions to evaluate at
* \param label Pointer of label
* \param score Pointer of score
* \param num_data Number of data
* \param out Output result
*/
static void CalDCG(const std::vector<data_size_t>& ks,
const float* label, const score_t* score,
data_size_t num_data, std::vector<double>* out);
/*!
* \brief Calculate the Max DCG score at position k
* \param k The position to evaluate at
* \param label Pointer of label
* \param num_data Number of data
* \return The max DCG score
*/
static double CalMaxDCGAtK(data_size_t k,
const float* label, data_size_t num_data);
/*!
* \brief Calculate the Max DCG score at multi position
* \param ks The positions to evaluate at
* \param label Pointer of label
* \param num_data Number of data
* \param out Output result
*/
static void CalMaxDCG(const std::vector<data_size_t>& ks,
const float* label, data_size_t num_data, std::vector<double>* out);
/*!
* \brief Get discount score of position k
* \param k The position
* \return The discount of this position
*/
inline static double GetDiscount(data_size_t k) { return discount_[k]; }
private:
/*! \brief True if initialized, to avoid initializing multiple times */
static bool is_inited_;
/*! \brief store gains for different label */
static std::vector<double> label_gain_;
/*! \brief store discount score for different position */
static std::vector<double> discount_;
/*! \brief max position for eval */
static const data_size_t kMaxPosition;
};
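/*!
* The formula implied by the members above, assuming the standard DCG discount
* 1 / log2(position + 2): with the default gain 2^label - 1 and items sorted by
* descending score,
*   DCG@k = sum_{i=0}^{k-1} label_gain_[label_i] * discount_[i],
* and CalMaxDCGAtK computes the same sum with the labels themselves sorted in
* descending order, which NDCG uses as the normalizer.
*/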
} // namespace LightGBM
#endif  // LIGHTGBM_METRIC_H_
#ifndef LIGHTGBM_NETWORK_H_
#define LIGHTGBM_NETWORK_H_
#include <LightGBM/utils/log.h>
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <functional>
#include <vector>
namespace LightGBM {
/*! \brief forward declaration */
class Linkers;
/*! \brief The network structure for all gather */
class BruckMap {
public:
/*! \brief The communication times for one all gather operation */
int k;
/*! \brief in_ranks[i] means the incoming rank on the i-th communication */
std::vector<int> in_ranks;
/*! \brief out_ranks[i] means the outgoing rank on the i-th communication */
std::vector<int> out_ranks;
BruckMap();
explicit BruckMap(int n);
/*!
* \brief Create the object of bruck map
* \param rank Rank of this machine
* \param num_machines The total number of machines
* \return The object of bruck map
*/
static BruckMap Construct(int rank, int num_machines);
};
/*!
* \brief Node type in the recursive halving algorithm
* When the number of machines is not a power of 2, we need to group machines into a power-of-2 number of groups,
* and we can let each group have at most 2 machines.
* If a group has only 1 machine, this machine is a normal node.
* If a group has 2 machines, the group has two types of nodes; one is the leader.
* The leader represents its group and communicates with the others.
*/
enum RecursiveHalvingNodeType {
Normal, // normal node, its group has only 1 machine
GroupLeader, // leader of group when number of machines in this group is 2.
Other // non-leader machines in group
};
/*! \brief Network structure for recursive halving algorithm */
class RecursiveHalvingMap {
public:
/*! \brief Communication times for one recursive halving run */
int k;
/*! \brief Node type */
RecursiveHalvingNodeType type;
/*! \brief Neighbor, only used for non-normal node*/
int neighbor;
/*! \brief ranks[i] means the machines that will communicate with on i-th communication*/
std::vector<int> ranks;
/*! \brief send_block_start[i] means the send block start index at the i-th communication */
std::vector<int> send_block_start;
/*! \brief send_block_len[i] means the send block size at the i-th communication */
std::vector<int> send_block_len;
/*! \brief recv_block_start[i] means the recv block start index at the i-th communication */
std::vector<int> recv_block_start;
/*! \brief recv_block_len[i] means the recv block size at the i-th communication */
std::vector<int> recv_block_len;
RecursiveHalvingMap();
RecursiveHalvingMap(RecursiveHalvingNodeType _type, int n);
/*!
* \brief Create the object of recursive halving map
* \param rank Rank of this machine
* \param num_machines The total number of machines
* \return The object of recursive halving map
*/
static RecursiveHalvingMap Construct(int rank, int num_machines);
};
/*! \brief A static class that contains some collective communication algorithm */
class Network {
public:
/*!
* \brief Initialize
* \param config Config of network setting
*/
static void Init(NetworkConfig config);
/*! \brief Free this static class */
static void Dispose();
/*! \brief Get rank of this machine */
static inline int rank();
/*! \brief Get total number of machines */
static inline int num_machines();
/*!
* \brief Perform allreduce. If the data size is small,
* will perform AllreduceByAllGather, else will call ReduceScatter followed by Allgather
* \param input Input data
* \param input_size The size of input data
* \param type_size The size of one object in the reduce function
* \param output Output result
* \param reducer Reduce function
*/
static void Allreduce(char* input, int input_size, int type_size,
char* output, const ReduceFunction& reducer);
/*!
* \brief Perform allreduce using allgather; when the data is small, this can reduce the number of communications
* \param input Input data
* \param input_size The size of input data
* \param output Output result
* \param reducer Reduce function
*/
static void AllreduceByAllGather(char* input, int input_size, char* output,
const ReduceFunction& reducer);
/*!
* \brief Perform allgather using the Bruck algorithm. Communication times are O(log(n)), and the communication cost is O(send_size * num_machines)
* This function can be called when all machines have the same input size
* \param input Input data
* \param send_size The size of input data
* \param output Output result
*/
static void Allgather(char* input, int send_size, char* output);
/*!
* \brief Perform allgather using the Bruck algorithm. Communication times are O(log(n)), and the communication cost is O(all_size)
* This function can be called when machines have different input sizes
* \param input Input data
* \param all_size The size of input data
* \param block_start The block start for different machines
* \param block_len The block size for different machines
* \param output Output result
*/
static void Allgather(char* input, int all_size, int* block_start,
int* block_len, char* output);
/*!
* \brief Perform reduce-scatter using the recursive halving algorithm. Communication times are O(log(n)), and the communication cost is O(input_size)
* \param input Input data
* \param input_size The size of input data
* \param block_start The block start for different machines
* \param block_len The block size for different machines
* \param output Output result
* \param reducer Reduce function
*/
static void ReduceScatter(char* input, int input_size,
int* block_start, int* block_len, char* output,
const ReduceFunction& reducer);
private:
/*! \brief Number of all machines */
static int num_machines_;
/*! \brief Rank of local machine */
static int rank_;
/*! \brief The network interface, provide send/recv functions */
static Linkers *linkers_;
/*! \brief Bruck map for all gather algorithm*/
static BruckMap bruck_map_;
/*! \brief Recursive halving map for reduce scatter */
static RecursiveHalvingMap recursive_halving_map_;
/*! \brief Buffer to store block start index */
static int* block_start_;
/*! \brief Buffer to store block size */
static int* block_len_;
/*! \brief Buffer */
static char* buffer_;
/*! \brief Size of buffer_ */
static int buffer_size_;
};
inline int Network::rank() {
return rank_;
}
inline int Network::num_machines() {
return num_machines_;
}
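/*!
* A minimal usage sketch: any function matching ReduceFunction from meta.h can
* drive Allreduce, e.g. merging per-machine histograms with
* HistogramBinEntry::SumReducer from LightGBM/bin.h
* ("local_hist", "global_hist" and "num_bins" are hypothetical):
*
*   Network::Allreduce(reinterpret_cast<char*>(local_hist),
*                      static_cast<int>(num_bins * sizeof(HistogramBinEntry)),
*                      static_cast<int>(sizeof(HistogramBinEntry)),
*                      reinterpret_cast<char*>(global_hist),
*                      HistogramBinEntry::SumReducer);
*/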
} // namespace LightGBM
#endif  // LIGHTGBM_NETWORK_H_
#ifndef LIGHTGBM_OBJECTIVE_FUNCTION_H_
#define LIGHTGBM_OBJECTIVE_FUNCTION_H_
#include <LightGBM/meta.h>
#include <LightGBM/config.h>
#include <LightGBM/dataset.h>
namespace LightGBM {
/*!
* \brief The interface of Objective Function.
* Objective function is used to get gradients
*/
class ObjectiveFunction {
public:
/*! \brief virtual destructor */
virtual ~ObjectiveFunction() {}
/*!
* \brief Initialize
* \param metadata Label data
* \param num_data Number of data
*/
virtual void Init(const Metadata& metadata, data_size_t num_data) = 0;
/*!
* \brief Calculate the first- and second-order derivatives of the loss function
* \param score Current prediction score
* \param gradients Output gradients
* \param hessians Output hessians
*/
virtual void GetGradients(const score_t* score,
score_t* gradients, score_t* hessians) const = 0;
/*!
* \brief Get the sigmoid param of this objective, if it has one.
* This function is used by the prediction task: if there is a sigmoid param, the prediction values will be transformed by the sigmoid function.
* \return The sigmoid param; <= 0.0 means no sigmoid transform is used for this objective.
*/
virtual double GetSigmoid() const = 0;
/*!
* \brief Create object of objective function
* \param type Specific type of objective function
* \param config Config for objective function
*/
static ObjectiveFunction* CreateObjectiveFunction(const std::string& type,
const ObjectiveConfig& config);
};
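/*!
* A minimal sketch of how a boosting iteration uses this interface
* (the variable names are hypothetical):
*
*   ObjectiveFunction* obj =
*       ObjectiveFunction::CreateObjectiveFunction("binary", objective_config);
*   obj->Init(train_data->metadata(), train_data->num_data());
*   // each iteration: refresh gradients/hessians from the current model scores
*   obj->GetGradients(scores, gradients, hessians);
*/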
} // namespace LightGBM
#endif  // LIGHTGBM_OBJECTIVE_FUNCTION_H_