Commit 504d400c authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

Merge pull request #56 from guolinke/master to dev

merge to dev
parents e2fe9283 b25bbcc2
#ifndef LIGHTGBM_C_API_H_
#define LIGHTGBM_C_API_H_
#include<cstdint>
/* Use C linkage when compiled as C++ so exported symbols are not mangled. */
#ifdef __cplusplus
#define DLL_EXTERN_C extern "C"
#else
#define DLL_EXTERN_C
#endif
/* On MSVC the symbols must additionally be marked for DLL export. */
#ifdef _MSC_VER
#define DllExport DLL_EXTERN_C __declspec(dllexport)
#else
#define DllExport DLL_EXTERN_C
#endif
/* Opaque handle to a Dataset.
 * NOTE(review): "Dateset" is a typo, but it is part of the public API name;
 * renaming it would break every existing caller, so it is kept as-is. */
typedef void* DatesetHandle;
/* Opaque handle to a Booster (boosting learner). */
typedef void* BoosterHandle;
/*!
* \brief get string message of the last error
* all functions in this file will return 0 on success
* and -1 when an error occurred
* \return const char* error information
*/
DllExport const char* LGBM_GetLastError();
// --- start Dataset interfaces
/*!
* \brief load data set from file like the command_line LightGBM does
* \param filename the name of the file
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means not used
* \param out a loaded dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromFile(const char* filename,
const char* parameters,
const DatesetHandle* reference,
DatesetHandle* out);
/*!
* \brief load data set from binary file like the command_line LightGBM does
* \param filename the name of the file
* \param out a loaded dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromBinaryFile(const char* filename,
DatesetHandle* out);
/*!
* \brief create a dataset from CSR format
* \param indptr pointer to row headers
* \param indices feature indices (column indices of the nonzero elements)
* \param data feature values of the nonzero elements
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns; when it's set to 0, then guess from data
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means not used
* \param out created dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromCSR(const uint64_t* indptr,
const unsigned* indices,
const float* data,
uint64_t nindptr,
uint64_t nelem,
uint64_t num_col,
const char* parameters,
const DatesetHandle* reference,
DatesetHandle* out);
/*!
* \brief create a dataset from CSC format
* \param col_ptr pointer to col headers
* \param indices feature indices (row indices of the nonzero elements)
* \param data feature values of the nonzero elements
* \param nindptr number of columns in the matrix + 1 (length of col_ptr)
* \param nelem number of nonzero elements in the matrix
* \param num_row number of rows; when it's set to 0, then guess from data
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means not used
* \param out created dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromCSC(const uint64_t* col_ptr,
const unsigned* indices,
const float* data,
uint64_t nindptr,
uint64_t nelem,
uint64_t num_row,
const char* parameters,
const DatesetHandle* reference,
DatesetHandle* out);
/*!
* \brief create dataset from dense matrix
* \param data pointer to the data space
* \param nrow number of rows
* \param ncol number of columns
* \param missing which value represents a missing value
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means not used
* \param out created dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromMat(const float* data,
uint64_t nrow,
uint64_t ncol,
float missing,
const char* parameters,
const DatesetHandle* reference,
DatesetHandle* out);
/*!
* \brief free space for dataset
* \param handle pointer to the handle of the dataset to free
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_DatasetFree(DatesetHandle* handle);
/*!
* \brief save dataset to binary file
* \param handle an instance of dataset
* \param filename file name
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_DatasetSaveBinary(DatesetHandle handle,
const char* filename);
/*!
* \brief set vector content of a meta info field
* \param handle an instance of dataset
* \param field_name field name, can be label, weight, init_score, group
* \param field_data pointer to the data vector (float or int, see type)
* \param num_element number of elements in field_data
* \param type float:0, int:1
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_DatasetSetField(DatesetHandle handle,
const char* field_name,
const void* field_data,
uint64_t num_element,
int type);
/*!
* \brief get info vector from dataset
* \param handle an instance of dataset
* \param field_name field name
* \param out_len used to set result length
* \param out_ptr pointer to the result
* \param out_type float:0, int:1
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_DatasetGetField(DatesetHandle handle,
const char* field_name,
uint64_t* out_len,
const void** out_ptr,
int* out_type);
/*!
* \brief get number of data records.
* \param handle the handle to the dataset
* \param out the address to hold the number of data records
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
uint64_t* out);
/*!
* \brief get number of features
* \param handle the handle to the dataset
* \param out the address to hold the number of features
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
uint64_t* out);
// --- start Booster interfaces
/*!
* \brief create a new boosting learner
* \param train_data training data set
* \param valid_datas validation data sets
* \param valid_names names of validation data sets
* \param n_valid_datas number of validation data sets
* \param parameters format: 'key1=value1 key2=value2'
* \param out handle of created Booster
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterCreate(DatesetHandle train_data,
DatesetHandle valid_datas[],
const char* valid_names[],
int n_valid_datas,
const char* parameters,
BoosterHandle* out);
/*!
* \brief load an existing boosting model from file
* \param filename filename of the model
* \param out handle of the loaded Booster
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterLoadFromModelfile(
const char* filename,
BoosterHandle* out);
/*!
* \brief free the object referenced by the handle
* \param handle handle to be freed
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterFree(BoosterHandle handle);
/*!
* \brief update the model in one round
* \param handle handle
* \param is_finished 1 means finished (training cannot continue)
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished);
/*!
* \brief update the model by directly specifying gradient and second order gradient,
* this can be used to support customized loss functions
* \param handle handle
* \param grad gradient statistics
* \param hess second order gradient statistics
* \param is_finished 1 means finished (training cannot continue)
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
float* grad,
float* hess,
int* is_finished);
/*!
* \brief get evaluation for training data and validation data sets
* \param handle handle
* \param data 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_result the string containing evaluation statistics
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterEval(BoosterHandle handle,
int data,
const char** out_result);
/*!
* \brief make prediction for training data and validation data sets;
* this can be used to support customized eval functions
* \param handle handle
* \param data 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param predict_type
* 0:raw score
* 1:with sigmoid transform(if needed)
* 2:leaf index
* \param out_result used to set a pointer to the result array
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterPredict(BoosterHandle handle,
int data,
int predict_type,
const float** out_result);
/*!
* \brief make prediction for a new data set in CSR format
* \param handle handle
* \param indptr pointer to row headers
* \param indices feature indices (column indices of the nonzero elements)
* \param data feature values of the nonzero elements
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns; when it's set to 0, then guess from data
* \param predict_type
* 0:raw score
* 1:with sigmoid transform(if needed)
* 2:leaf index
* \param n_used_trees number of trees used for prediction
* \param out_result used to set a pointer to the result array
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
const uint64_t* indptr,
const unsigned* indices,
const float* data,
uint64_t nindptr,
uint64_t nelem,
uint64_t num_col,
int predict_type,
uint64_t n_used_trees,
const float** out_result);
/*!
* \brief make prediction for a new data set in CSC format
* \param handle handle
* \param col_ptr pointer to col headers
* \param indices feature indices (row indices of the nonzero elements)
* \param data feature values of the nonzero elements
* \param nindptr number of columns in the matrix + 1 (length of col_ptr)
* \param nelem number of nonzero elements in the matrix
* \param num_row number of rows; when it's set to 0, then guess from data
* \param predict_type
* 0:raw score
* 1:with sigmoid transform(if needed)
* 2:leaf index
* \param n_used_trees number of trees used for prediction
* \param out_result used to set a pointer to the result array
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForCSC(BoosterHandle handle,
const uint64_t* col_ptr,
const unsigned* indices,
const float* data,
uint64_t nindptr,
uint64_t nelem,
uint64_t num_row,
int predict_type,
uint64_t n_used_trees,
const float** out_result);
/*!
* \brief make prediction for a new data set in dense format
* \param handle handle
* \param data pointer to the data space
* \param nrow number of rows
* \param ncol number of columns
* \param missing which value represents a missing value
* \param predict_type
* 0:raw score
* 1:with sigmoid transform(if needed)
* 2:leaf index
* \param n_used_trees number of trees used for prediction
* \param out_result used to set a pointer to the result array
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
const float* data,
uint64_t nrow,
uint64_t ncol,
float missing,
int predict_type,
uint64_t n_used_trees,
const float** out_result);
/*!
* \brief save model into file
* \param handle handle
* \param is_finished 1 means finished (training cannot continue)
* \param filename file name
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
int is_finished,
const char* filename);
#endif // LIGHTGBM_C_API_H_
...@@ -99,6 +99,8 @@ public: ...@@ -99,6 +99,8 @@ public:
bool is_enable_sparse = true; bool is_enable_sparse = true;
bool use_two_round_loading = false; bool use_two_round_loading = false;
bool is_save_binary_file = false; bool is_save_binary_file = false;
bool enable_load_from_binary_file = true;
int bin_construct_sample_cnt = 50000;
bool is_sigmoid = true; bool is_sigmoid = true;
bool has_header = false; bool has_header = false;
......
...@@ -17,6 +17,7 @@ namespace LightGBM { ...@@ -17,6 +17,7 @@ namespace LightGBM {
/*! \brief forward declaration */ /*! \brief forward declaration */
class Feature; class Feature;
class BinMapper;
/*! /*!
* \brief This class is used to store some meta(non-feature) data for training data, * \brief This class is used to store some meta(non-feature) data for training data,
...@@ -79,6 +80,13 @@ public: ...@@ -79,6 +80,13 @@ public:
void CheckOrPartition(data_size_t num_all_data, void CheckOrPartition(data_size_t num_all_data,
const std::vector<data_size_t>& used_data_indices); const std::vector<data_size_t>& used_data_indices);
void SetLabel(const float* label, data_size_t len);
void SetWeights(const float* weights, data_size_t len);
void SetQueryBoundaries(const data_size_t* QueryBoundaries, data_size_t len);
/*! /*!
* \brief Set initial scores * \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score. * \param init_score Initial scores, this class will manage memory for init_score.
...@@ -188,8 +196,6 @@ private: ...@@ -188,8 +196,6 @@ private:
data_size_t num_weights_; data_size_t num_weights_;
/*! \brief Label data */ /*! \brief Label data */
float* label_; float* label_;
/*! \brief Label data, int type */
int16_t* label_int_;
/*! \brief Weights data */ /*! \brief Weights data */
float* weights_; float* weights_;
/*! \brief Query boundaries */ /*! \brief Query boundaries */
...@@ -262,9 +268,24 @@ public: ...@@ -262,9 +268,24 @@ public:
: Dataset(data_filename, "", io_config, predict_fun) { : Dataset(data_filename, "", io_config, predict_fun) {
} }
/*!
* \brief Constructor, without filename, used to load data from memory
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset(const IOConfig& io_config, const PredictFunction& predict_fun);
/*! \brief Destructor */ /*! \brief Destructor */
~Dataset(); ~Dataset();
/*! \brief Init Dataset with specific binmapper */
void InitByBinMapper(std::vector<const BinMapper*> bin_mappers, data_size_t num_data);
/*! \brief push raw data into dataset */
void PushData(const std::vector<std::vector<std::pair<int, float>>>& datas, data_size_t start_idx, bool is_finished);
void SetField(const char* field_name, const void* field_data, data_size_t num_element, int type);
/*! /*!
* \brief Load training data on parallel training * \brief Load training data on parallel training
* \param rank Rank of local machine * \param rank Rank of local machine
...@@ -290,10 +311,21 @@ public: ...@@ -290,10 +311,21 @@ public:
*/ */
void LoadValidationData(const Dataset* train_set, bool use_two_round_loading); void LoadValidationData(const Dataset* train_set, bool use_two_round_loading);
/*!
* \brief Load data set from binary file
* \param bin_filename filename of bin data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void LoadDataFromBinFile(const char* bin_filename, int rank, int num_machines, bool is_pre_partition);
/*! /*!
* \brief Save current dataset into binary file, will save to "filename.bin" * \brief Save current dataset into binary file, will save to "filename.bin"
*/ */
void SaveBinaryFile(); void SaveBinaryFile(const char* bin_filename);
std::vector<const BinMapper*> GetBinMappers() const;
/*! /*!
* \brief Get a feature pointer for specific index * \brief Get a feature pointer for specific index
...@@ -371,14 +403,6 @@ private: ...@@ -371,14 +403,6 @@ private:
/*! \brief Check can load from binary file */ /*! \brief Check can load from binary file */
void CheckCanLoadFromBin(); void CheckCanLoadFromBin();
/*!
* \brief Load data set from binary file
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void LoadDataFromBinFile(int rank, int num_machines, bool is_pre_partition);
/*! \brief Check this data set is null or not */ /*! \brief Check this data set is null or not */
void CheckDataset(); void CheckDataset();
...@@ -424,6 +448,8 @@ private: ...@@ -424,6 +448,8 @@ private:
std::unordered_set<int> ignore_features_; std::unordered_set<int> ignore_features_;
/*! \brief store feature names */ /*! \brief store feature names */
std::vector<std::string> feature_names_; std::vector<std::string> feature_names_;
/*! \brief store feature names */
int bin_construct_sample_cnt_;
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -150,7 +150,7 @@ void Application::LoadData() { ...@@ -150,7 +150,7 @@ void Application::LoadData() {
} }
// need save binary file // need save binary file
if (config_.io_config.is_save_binary_file) { if (config_.io_config.is_save_binary_file) {
train_data_->SaveBinaryFile(); train_data_->SaveBinaryFile(nullptr);
} }
// create training metric // create training metric
if (config_.boosting_config->is_provide_training_metric) { if (config_.boosting_config->is_provide_training_metric) {
...@@ -175,7 +175,7 @@ void Application::LoadData() { ...@@ -175,7 +175,7 @@ void Application::LoadData() {
config_.io_config.use_two_round_loading); config_.io_config.use_two_round_loading);
// need save binary file // need save binary file
if (config_.io_config.is_save_binary_file) { if (config_.io_config.is_save_binary_file) {
valid_datas_.back()->SaveBinaryFile(); valid_datas_.back()->SaveBinaryFile(nullptr);
} }
// add metric for validation data // add metric for validation data
......
...@@ -25,7 +25,7 @@ void LoadFileToBoosting(Boosting* boosting, const char* filename) { ...@@ -25,7 +25,7 @@ void LoadFileToBoosting(Boosting* boosting, const char* filename) {
} }
Boosting* Boosting::CreateBoosting(BoostingType type, const char* filename) { Boosting* Boosting::CreateBoosting(BoostingType type, const char* filename) {
if (filename[0] == '\0') { if (filename == nullptr || filename[0] == '\0') {
if (type == BoostingType::kGBDT) { if (type == BoostingType::kGBDT) {
return new GBDT(); return new GBDT();
} else { } else {
......
#include <LightGBM/c_api.h>
#include <LightGBM/dataset.h>
#include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
#include <LightGBM/config.h>
#include <cstdio>
#include <vector>
#include <string>
#include <cstring>
namespace LightGBM {
/*!
* \brief C-API wrapper that owns a Boosting object together with its
*        objective function and metrics, and wires them to the datasets.
*        Datasets themselves are NOT owned (the destructor never deletes them).
*/
class Booster {
public:
// Load an existing model from file; only the boosting object is created.
// NOTE(review): elsewhere CreateBoosting is called as (type, filename);
// confirm a single-argument overload exists.
explicit Booster(const char* filename):
boosting_(Boosting::CreateBoosting(filename)) {
}
// Build a booster for training: parse parameters, create the boosting
// object, objective function and metrics, then attach the datasets.
Booster(const Dataset* train_data,
std::vector<const Dataset*> valid_data,
std::vector<std::string> valid_names,
const char* parameters)
:train_data_(train_data), valid_datas_(valid_data) {
config_.LoadFromString(parameters);
// create boosting; continued training from a saved model is rejected here
if (config_.io_config.input_model.size() > 0) {
Log::Error("continued train from model is not support for c_api, \
please use continued train with input score");
}
boosting_ = Boosting::CreateBoosting(config_.boosting_type, "");
// create objective function
objective_fun_ =
ObjectiveFunction::CreateObjectiveFunction(config_.objective_type,
config_.objective_config);
// create training metric
if (config_.boosting_config->is_provide_training_metric) {
for (auto metric_type : config_.metric_types) {
Metric* metric =
Metric::CreateMetric(metric_type, config_.metric_config);
// CreateMetric may return nullptr for unsupported types; skip those
if (metric == nullptr) { continue; }
metric->Init("training", train_data_->metadata(),
train_data_->num_data());
train_metric_.push_back(metric);
}
}
// add metrics for each validation data set
for (size_t i = 0; i < valid_datas_.size(); ++i) {
valid_metrics_.emplace_back();
for (auto metric_type : config_.metric_types) {
Metric* metric = Metric::CreateMetric(metric_type, config_.metric_config);
if (metric == nullptr) { continue; }
metric->Init(valid_names[i].c_str(),
valid_datas_[i]->metadata(),
valid_datas_[i]->num_data());
valid_metrics_.back().push_back(metric);
}
}
// initialize the objective function
objective_fun_->Init(train_data_->metadata(), train_data_->num_data());
// initialize the boosting
boosting_->Init(config_.boosting_config, train_data_, objective_fun_,
ConstPtrInVectorWarpper<Metric>(train_metric_));
// add validation data into boosting
for (size_t i = 0; i < valid_datas_.size(); ++i) {
boosting_->AddDataset(valid_datas_[i],
ConstPtrInVectorWarpper<Metric>(valid_metrics_[i]));
}
}
// Frees the owned metrics, boosting object and objective function;
// train/validation datasets are left alive for the caller to free.
~Booster() {
for (auto& metric : train_metric_) {
if (metric != nullptr) { delete metric; }
}
for (auto& metric : valid_metrics_) {
for (auto& sub_metric : metric) {
if (sub_metric != nullptr) { delete sub_metric; }
}
}
valid_metrics_.clear();
if (boosting_ != nullptr) { delete boosting_; }
if (objective_fun_ != nullptr) { delete objective_fun_; }
}
private:
/*! \brief Boosting object (owned) */
Boosting* boosting_;
/*! \brief All configs */
OverallConfig config_;
/*! \brief Training data (not owned) */
const Dataset* train_data_;
/*! \brief Validation data (not owned) */
std::vector<const Dataset*> valid_datas_;
/*! \brief Metrics for training data (owned) */
std::vector<Metric*> train_metric_;
/*! \brief Metrics for validation data (owned) */
std::vector<std::vector<Metric*>> valid_metrics_;
/*! \brief Training objective function (owned) */
ObjectiveFunction* objective_fun_;
};
}
...@@ -191,10 +191,12 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) { ...@@ -191,10 +191,12 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
} }
GetInt(params, "verbose", &verbosity); GetInt(params, "verbose", &verbosity);
GetInt(params, "num_model_predict", &num_model_predict); GetInt(params, "num_model_predict", &num_model_predict);
GetInt(params, "bin_construct_sample_cnt", &bin_construct_sample_cnt);
GetBool(params, "is_pre_partition", &is_pre_partition); GetBool(params, "is_pre_partition", &is_pre_partition);
GetBool(params, "is_enable_sparse", &is_enable_sparse); GetBool(params, "is_enable_sparse", &is_enable_sparse);
GetBool(params, "use_two_round_loading", &use_two_round_loading); GetBool(params, "use_two_round_loading", &use_two_round_loading);
GetBool(params, "is_save_binary_file", &is_save_binary_file); GetBool(params, "is_save_binary_file", &is_save_binary_file);
GetBool(params, "enable_load_from_binary_file", &enable_load_from_binary_file);
GetBool(params, "is_sigmoid", &is_sigmoid); GetBool(params, "is_sigmoid", &is_sigmoid);
GetString(params, "output_model", &output_model); GetString(params, "output_model", &output_model);
GetString(params, "input_model", &input_model); GetString(params, "input_model", &input_model);
......
...@@ -18,9 +18,11 @@ namespace LightGBM { ...@@ -18,9 +18,11 @@ namespace LightGBM {
Dataset::Dataset(const char* data_filename, const char* init_score_filename, Dataset::Dataset(const char* data_filename, const char* init_score_filename,
const IOConfig& io_config, const PredictFunction& predict_fun) const IOConfig& io_config, const PredictFunction& predict_fun)
:data_filename_(data_filename), random_(io_config.data_random_seed), :data_filename_(data_filename), random_(io_config.data_random_seed),
max_bin_(io_config.max_bin), is_enable_sparse_(io_config.is_enable_sparse), predict_fun_(predict_fun) { max_bin_(io_config.max_bin), is_enable_sparse_(io_config.is_enable_sparse),
predict_fun_(predict_fun), bin_construct_sample_cnt_(io_config.bin_construct_sample_cnt) {
CheckCanLoadFromBin(); if (io_config.enable_load_from_binary_file) {
CheckCanLoadFromBin();
}
if (is_loading_from_binfile_ && predict_fun != nullptr) { if (is_loading_from_binfile_ && predict_fun != nullptr) {
Log::Info("Cannot performing initialization of prediction by using binary file, using text file instead"); Log::Info("Cannot performing initialization of prediction by using binary file, using text file instead");
is_loading_from_binfile_ = false; is_loading_from_binfile_ = false;
...@@ -160,6 +162,17 @@ Dataset::Dataset(const char* data_filename, const char* init_score_filename, ...@@ -160,6 +162,17 @@ Dataset::Dataset(const char* data_filename, const char* init_score_filename,
} }
Dataset::Dataset(const IOConfig& io_config, const PredictFunction& predict_fun)
:data_filename_(""), random_(io_config.data_random_seed),
max_bin_(io_config.max_bin), is_enable_sparse_(io_config.is_enable_sparse),
predict_fun_(predict_fun), bin_construct_sample_cnt_(io_config.bin_construct_sample_cnt) {
parser_ = nullptr;
text_reader_ = nullptr;
}
Dataset::~Dataset() { Dataset::~Dataset() {
if (parser_ != nullptr) { delete parser_; } if (parser_ != nullptr) { delete parser_; }
if (text_reader_ != nullptr) { delete text_reader_; } if (text_reader_ != nullptr) { delete text_reader_; }
...@@ -216,7 +229,7 @@ void Dataset::LoadDataToMemory(int rank, int num_machines, bool is_pre_partition ...@@ -216,7 +229,7 @@ void Dataset::LoadDataToMemory(int rank, int num_machines, bool is_pre_partition
} }
void Dataset::SampleDataFromMemory(std::vector<std::string>* out_data) { void Dataset::SampleDataFromMemory(std::vector<std::string>* out_data) {
const size_t sample_cnt = static_cast<size_t>(num_data_ < 50000 ? num_data_ : 50000); const size_t sample_cnt = static_cast<size_t>(num_data_ < bin_construct_sample_cnt_ ? num_data_ : bin_construct_sample_cnt_);
std::vector<size_t> sample_indices = random_.Sample(num_data_, sample_cnt); std::vector<size_t> sample_indices = random_.Sample(num_data_, sample_cnt);
out_data->clear(); out_data->clear();
for (size_t i = 0; i < sample_indices.size(); ++i) { for (size_t i = 0; i < sample_indices.size(); ++i) {
...@@ -228,7 +241,7 @@ void Dataset::SampleDataFromMemory(std::vector<std::string>* out_data) { ...@@ -228,7 +241,7 @@ void Dataset::SampleDataFromMemory(std::vector<std::string>* out_data) {
void Dataset::SampleDataFromFile(int rank, int num_machines, bool is_pre_partition, void Dataset::SampleDataFromFile(int rank, int num_machines, bool is_pre_partition,
std::vector<std::string>* out_data) { std::vector<std::string>* out_data) {
used_data_indices_.clear(); used_data_indices_.clear();
const size_t sample_cnt = 50000; const data_size_t sample_cnt = static_cast<data_size_t>(bin_construct_sample_cnt_);
if (num_machines == 1 || is_pre_partition) { if (num_machines == 1 || is_pre_partition) {
num_data_ = static_cast<data_size_t>(text_reader_->SampleFromFile(random_, sample_cnt, out_data)); num_data_ = static_cast<data_size_t>(text_reader_->SampleFromFile(random_, sample_cnt, out_data));
global_num_data_ = num_data_; global_num_data_ = num_data_;
...@@ -272,6 +285,83 @@ void Dataset::SampleDataFromFile(int rank, int num_machines, bool is_pre_partiti ...@@ -272,6 +285,83 @@ void Dataset::SampleDataFromFile(int rank, int num_machines, bool is_pre_partiti
} }
} }
/*!
* \brief Initialize this dataset with pre-constructed bin mappers
*        (used when building a dataset in memory, e.g. aligned to a reference dataset).
* \param bin_mappers per-original-feature bin mappers; nullptr entries mark unused features
* \param num_data number of rows the dataset will hold
*/
void Dataset::InitByBinMapper(std::vector<const BinMapper*> bin_mappers, data_size_t num_data) {
  num_data_ = num_data;
  global_num_data_ = num_data_;
  // initialize metadata (label etc.) for num_data_ records
  metadata_.Init(num_data_, -1, -1);
  // free old memory
  for (auto& feature : features_) {
    delete feature;
  }
  features_.clear();
  // -1 marks original feature indices that are not used
  used_feature_map_ = std::vector<int>(bin_mappers.size(), -1);
  for (size_t i = 0; i < bin_mappers.size(); ++i) {
    if (bin_mappers[i] != nullptr) {
      // Record the inner index BEFORE push_back: the previous code stored
      // features_.size() after pushing, yielding a 1-based index, so
      // PushData indexed the wrong feature (and out of bounds for the last one).
      used_feature_map_[i] = static_cast<int>(features_.size());
      features_.push_back(new Feature(static_cast<int>(i), new BinMapper(bin_mappers[i]), num_data_, is_enable_sparse_));
    }
  }
  num_features_ = static_cast<int>(features_.size());
}
std::vector<const BinMapper*> Dataset::GetBinMappers() const {
std::vector<const BinMapper*> ret(num_total_features_, nullptr);
for (const auto feature : features_) {
ret[feature->feature_index()] = feature->bin_mapper();
}
return ret;
}
/*!
* \brief Push rows of sparse (original feature index, value) pairs into the feature bins.
* \param datas one entry per row; each row is a list of (original feature index, value) pairs
* \param start_idx row offset of datas[0] inside the full dataset
* \param is_finished when true, all data has been pushed; finalize every feature
*/
void Dataset::PushData(const std::vector<std::vector<std::pair<int, float>>>& datas, data_size_t start_idx, bool is_finished) {
// rows are independent, so they can be binned in parallel
#pragma omp parallel for schedule(guided)
for (data_size_t i = 0; i < static_cast<int>(datas.size()); ++i) {
const int tid = omp_get_thread_num();
for (auto& inner_data : datas[i]) {
// map the original feature index to the inner (used) feature index; -1 = unused
int feature_idx = used_feature_map_[inner_data.first];
if (feature_idx >= 0) {
// if is used feature
features_[feature_idx]->PushData(tid, start_idx + i, inner_data.second);
}
}
}
if (is_finished) {
#pragma omp parallel for schedule(guided)
for (int i = 0; i < num_features_; ++i) {
features_[i]->FinishLoad();
}
}
}
/*!
* \brief Set one meta-data field of the dataset from a raw buffer.
* \param field_name one of: label/target, weight/weights, init_score, query/group
* \param field_data pointer to the field values
* \param num_element number of elements in field_data
* \param type element type: 0 = float, 1 = int
*/
void Dataset::SetField(const char* field_name, const void* field_data, data_size_t num_element, int type) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
    if (type != 0) {
      Log::Fatal("type of label should be float");
    }
    metadata_.SetLabel(static_cast<const float*>(field_data), num_element);
  } else if (name == std::string("weight") || name == std::string("weights")) {
    if (type != 0) {
      Log::Fatal("type of weights should be float");
    }
    metadata_.SetWeights(static_cast<const float*>(field_data), num_element);
  } else if (name == std::string("init_score")) {
    if (type != 0) {
      Log::Fatal("type of init_score should be float");
    }
    metadata_.SetInitScore(static_cast<const float*>(field_data), num_element);
  } else if (name == std::string("query") || name == std::string("group")) {
    if (type != 1) {
      // fixed copy-pasted message: this branch validates query/group, not init_score
      Log::Fatal("type of query boundaries should be int");
    }
    metadata_.SetQueryBoundaries(static_cast<const data_size_t*>(field_data), num_element);
  } else {
    Log::Fatal("unknown field name: %s", field_name);
  }
}
void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<std::string>& sample_data) { void Dataset::ConstructBinMappers(int rank, int num_machines, const std::vector<std::string>& sample_data) {
// sample_values[i][j], means the value of j-th sample on i-th feature // sample_values[i][j], means the value of j-th sample on i-th feature
std::vector<std::vector<float>> sample_values; std::vector<std::vector<float>> sample_values;
...@@ -452,8 +542,10 @@ void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, b ...@@ -452,8 +542,10 @@ void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, b
ExtractFeaturesFromFile(); ExtractFeaturesFromFile();
} }
} else { } else {
std::string bin_filename(data_filename_);
bin_filename.append(".bin");
// load data from binary file // load data from binary file
LoadDataFromBinFile(rank, num_machines, is_pre_partition); LoadDataFromBinFile(bin_filename.c_str(), rank, num_machines, is_pre_partition);
} }
// check meta data // check meta data
metadata_.CheckOrPartition(static_cast<data_size_t>(global_num_data_), used_data_indices_); metadata_.CheckOrPartition(static_cast<data_size_t>(global_num_data_), used_data_indices_);
...@@ -497,8 +589,10 @@ void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_lo ...@@ -497,8 +589,10 @@ void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_lo
ExtractFeaturesFromFile(); ExtractFeaturesFromFile();
} }
} else { } else {
std::string bin_filename(data_filename_);
bin_filename.append(".bin");
// load from binary file // load from binary file
LoadDataFromBinFile(0, 1, false); LoadDataFromBinFile(bin_filename.c_str(), 0, 1, false);
} }
// not need to check validation data // not need to check validation data
// check meta data // check meta data
...@@ -646,19 +740,23 @@ void Dataset::ExtractFeaturesFromFile() { ...@@ -646,19 +740,23 @@ void Dataset::ExtractFeaturesFromFile() {
} }
} }
void Dataset::SaveBinaryFile() { void Dataset::SaveBinaryFile(const char* bin_filename) {
// if is loaded from binary file, not need to save
if (!is_loading_from_binfile_) { if (!is_loading_from_binfile_) {
std::string bin_filename(data_filename_); // if not pass a filename, just append ".bin" of original file
bin_filename.append(".bin"); if (bin_filename == nullptr || bin_filename[0] == '\0') {
std::string bin_filename_str(data_filename_);
bin_filename_str.append(".bin");
bin_filename = bin_filename_str.c_str();
}
FILE* file; FILE* file;
#ifdef _MSC_VER #ifdef _MSC_VER
fopen_s(&file, bin_filename.c_str(), "wb"); fopen_s(&file, bin_filename, "wb");
#else #else
file = fopen(bin_filename.c_str(), "wb"); file = fopen(bin_filename, "wb");
#endif #endif
if (file == NULL) { if (file == NULL) {
Log::Fatal("Cannot write binary data to %s ", bin_filename.c_str()); Log::Fatal("Cannot write binary data to %s ", bin_filename);
} }
Log::Info("Saving data to binary file: %s", data_filename_); Log::Info("Saving data to binary file: %s", data_filename_);
...@@ -715,20 +813,18 @@ void Dataset::CheckCanLoadFromBin() { ...@@ -715,20 +813,18 @@ void Dataset::CheckCanLoadFromBin() {
} }
} }
void Dataset::LoadDataFromBinFile(int rank, int num_machines, bool is_pre_partition) { void Dataset::LoadDataFromBinFile(const char* bin_filename, int rank, int num_machines, bool is_pre_partition) {
std::string bin_filename(data_filename_);
bin_filename.append(".bin");
FILE* file; FILE* file;
#ifdef _MSC_VER #ifdef _MSC_VER
fopen_s(&file, bin_filename.c_str(), "rb"); fopen_s(&file, bin_filename, "rb");
#else #else
file = fopen(bin_filename.c_str(), "rb"); file = fopen(bin_filename, "rb");
#endif #endif
if (file == NULL) { if (file == NULL) {
Log::Fatal("Cannot read binary data from %s", bin_filename.c_str()); Log::Fatal("Cannot read binary data from %s", bin_filename);
} }
// buffer to read binary file // buffer to read binary file
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
namespace LightGBM { namespace LightGBM {
Metadata::Metadata() Metadata::Metadata()
:label_(nullptr), label_int_(nullptr), weights_(nullptr), :label_(nullptr), weights_(nullptr),
query_boundaries_(nullptr), query_boundaries_(nullptr),
query_weights_(nullptr), init_score_(nullptr), queries_(nullptr){ query_weights_(nullptr), init_score_(nullptr), queries_(nullptr){
...@@ -225,6 +225,48 @@ void Metadata::SetInitScore(const float* init_score, data_size_t len) { ...@@ -225,6 +225,48 @@ void Metadata::SetInitScore(const float* init_score, data_size_t len) {
} }
} }
void Metadata::SetLabel(const float* label, data_size_t len) {
if (num_data_ != len) {
Log::Fatal("len of label is not same with #data");
}
if (label_ != nullptr) { delete[] label_; }
label_ = new float[num_data_];
for (data_size_t i = 0; i < num_data_; ++i) {
label_[i] = label[i];
}
}
void Metadata::SetWeights(const float* weights, data_size_t len) {
if (num_data_ != len) {
Log::Fatal("len of weights is not same with #data");
}
if (weights_ != nullptr) { delete[] weights_; }
num_weights_ = num_data_;
weights_ = new float[num_weights_];
for (data_size_t i = 0; i < num_weights_; ++i) {
weights_[i] = weights[i];
}
LoadQueryWeights();
}
// Set the per-query grouping information from a caller-supplied array.
// NOTE(review): despite the parameter name, the validation below treats each
// entry as a per-query *count* (they are summed and checked against #data),
// yet the raw counts are copied directly into `query_boundaries_` without
// being converted to cumulative offsets. If consumers expect prefix-sum
// boundaries, this copy is a bug — verify against the readers of
// `query_boundaries_` before relying on this function.
void Metadata::SetQueryBoundaries(const data_size_t* query_boundaries, data_size_t len) {
  // Validate: the counts must account for every data point exactly once.
  data_size_t sum = 0;
  for (data_size_t i = 0; i < len; ++i) {
    sum += query_boundaries[i];
  }
  if (num_data_ != sum) {
    Log::Fatal("sum of query counts is not same with #data");
  }
  // Release any previously stored grouping before copying the new one.
  if (query_boundaries_ != nullptr) { delete[] query_boundaries_; }
  num_queries_ = len;
  query_boundaries_ = new data_size_t[num_queries_];
  for (data_size_t i = 0; i < num_queries_; ++i) {
    query_boundaries_[i] = query_boundaries[i];
  }
  // Recompute derived per-query weights from the new grouping.
  LoadQueryWeights();
}
void Metadata::LoadWeights() { void Metadata::LoadWeights() {
num_weights_ = 0; num_weights_ = 0;
std::string weight_filename(data_filename_); std::string weight_filename(data_filename_);
......
...@@ -54,7 +54,7 @@ public: ...@@ -54,7 +54,7 @@ public:
void ConstructHistogram(data_size_t*, data_size_t , const score_t* , void ConstructHistogram(data_size_t*, data_size_t , const score_t* ,
const score_t* , HistogramBinEntry*) const override { const score_t* , HistogramBinEntry*) const override {
// Will use OrderedSparseBin->ConstructHistogram() instead // Will use OrderedSparseBin->ConstructHistogram() instead
Log::Info("Should use OrderedSparseBin->ConstructHistogram() instead"); Log::Fatal("Should use OrderedSparseBin->ConstructHistogram() instead");
} }
data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data, data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
......
...@@ -159,6 +159,7 @@ ...@@ -159,6 +159,7 @@
<ClInclude Include="..\include\LightGBM\bin.h" /> <ClInclude Include="..\include\LightGBM\bin.h" />
<ClInclude Include="..\include\LightGBM\boosting.h" /> <ClInclude Include="..\include\LightGBM\boosting.h" />
<ClInclude Include="..\include\LightGBM\config.h" /> <ClInclude Include="..\include\LightGBM\config.h" />
<ClInclude Include="..\include\LightGBM\c_api.h" />
<ClInclude Include="..\include\LightGBM\dataset.h" /> <ClInclude Include="..\include\LightGBM\dataset.h" />
<ClInclude Include="..\include\LightGBM\feature.h" /> <ClInclude Include="..\include\LightGBM\feature.h" />
<ClInclude Include="..\include\LightGBM\meta.h" /> <ClInclude Include="..\include\LightGBM\meta.h" />
...@@ -203,6 +204,7 @@ ...@@ -203,6 +204,7 @@
<ClCompile Include="..\src\application\application.cpp" /> <ClCompile Include="..\src\application\application.cpp" />
<ClCompile Include="..\src\boosting\boosting.cpp" /> <ClCompile Include="..\src\boosting\boosting.cpp" />
<ClCompile Include="..\src\boosting\gbdt.cpp" /> <ClCompile Include="..\src\boosting\gbdt.cpp" />
<ClCompile Include="..\src\c_api.cpp" />
<ClCompile Include="..\src\io\bin.cpp" /> <ClCompile Include="..\src\io\bin.cpp" />
<ClCompile Include="..\src\io\config.cpp" /> <ClCompile Include="..\src\io\config.cpp" />
<ClCompile Include="..\src\io\dataset.cpp" /> <ClCompile Include="..\src\io\dataset.cpp" />
......
...@@ -165,6 +165,9 @@ ...@@ -165,6 +165,9 @@
<ClInclude Include="..\include\LightGBM\utils\lru_pool.h"> <ClInclude Include="..\include\LightGBM\utils\lru_pool.h">
<Filter>include\LightGBM\utils</Filter> <Filter>include\LightGBM\utils</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="..\include\LightGBM\c_api.h">
<Filter>include\LightGBM</Filter>
</ClInclude>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClCompile Include="..\src\application\application.cpp"> <ClCompile Include="..\src\application\application.cpp">
...@@ -230,5 +233,8 @@ ...@@ -230,5 +233,8 @@
<ClCompile Include="..\src\main.cpp"> <ClCompile Include="..\src\main.cpp">
<Filter>src</Filter> <Filter>src</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\src\c_api.cpp">
<Filter>src</Filter>
</ClCompile>
</ItemGroup> </ItemGroup>
</Project> </Project>
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment