Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
504d400c
Commit
504d400c
authored
Nov 01, 2016
by
Guolin Ke
Committed by
GitHub
Nov 01, 2016
Browse files
Merge pull request #56 from guolinke/master to dev
merge to dev
parents
e2fe9283
b25bbcc2
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
671 additions
and
36 deletions
+671
-36
include/LightGBM/c_api.h
include/LightGBM/c_api.h
+357
-0
include/LightGBM/config.h
include/LightGBM/config.h
+2
-0
include/LightGBM/dataset.h
include/LightGBM/dataset.h
+37
-11
src/application/application.cpp
src/application/application.cpp
+2
-2
src/boosting/boosting.cpp
src/boosting/boosting.cpp
+1
-1
src/c_api.cpp
src/c_api.cpp
+102
-0
src/io/config.cpp
src/io/config.cpp
+2
-0
src/io/dataset.cpp
src/io/dataset.cpp
+116
-20
src/io/metadata.cpp
src/io/metadata.cpp
+43
-1
src/io/sparse_bin.hpp
src/io/sparse_bin.hpp
+1
-1
windows/LightGBM.vcxproj
windows/LightGBM.vcxproj
+2
-0
windows/LightGBM.vcxproj.filters
windows/LightGBM.vcxproj.filters
+6
-0
No files found.
include/LightGBM/c_api.h
0 → 100644
View file @
504d400c
#ifndef LIGHTGBM_C_API_H_
#define LIGHTGBM_C_API_H_

#include <cstdint>

#ifdef __cplusplus
#define DLL_EXTERN_C extern "C"
#else
#define DLL_EXTERN_C
#endif

#ifdef _MSC_VER
#define DllExport DLL_EXTERN_C __declspec(dllexport)
#else
#define DllExport DLL_EXTERN_C
#endif

/*! \brief Handle of dataset.
 *  NOTE(review): "Dateset" is a historical misspelling; it is part of the
 *  public C API, so it is kept for source compatibility with existing callers. */
typedef void* DatesetHandle;
/*! \brief Handle of booster. */
typedef void* BoosterHandle;

/*!
* \brief get string message of the last error
*  all functions in this file will return 0 on success
*  and -1 when an error occurred
* \return const char* error information
*/
DllExport const char* LGBM_GetLastError();

// --- start Dataset interfaces

/*!
* \brief load data set from file like the command_line LightGBM does
* \param filename the name of the file
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means not used
* \param out a loaded dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromFile(const char* filename,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out);

/*!
* \brief load data set from binary file like the command_line LightGBM does
* \param filename the name of the file
* \param out a loaded dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromBinaryFile(const char* filename,
  DatesetHandle* out);

/*!
* \brief create a dataset from CSR format
* \param indptr pointer to row headers
* \param indices findex
* \param data fvalue
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns; when it's set to 0, then guess from data
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means not used
* \param out created dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromCSR(const uint64_t* indptr,
  const unsigned* indices,
  const float* data,
  uint64_t nindptr,
  uint64_t nelem,
  uint64_t num_col,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out);

/*!
* \brief create a dataset from CSC format
* \param col_ptr pointer to col headers
* \param indices findex
* \param data fvalue
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_row number of rows; when it's set to 0, then guess from data
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means not used
* \param out created dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromCSC(const uint64_t* col_ptr,
  const unsigned* indices,
  const float* data,
  uint64_t nindptr,
  uint64_t nelem,
  uint64_t num_row,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out);

/*!
* \brief create dataset from dense matrix
* \param data pointer to the data space
* \param nrow number of rows
* \param ncol number of columns
* \param missing which value represents a missing value
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means not used
* \param out created dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromMat(const float* data,
  uint64_t nrow,
  uint64_t ncol,
  float missing,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out);

/*!
* \brief free space for dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_DatasetFree(DatesetHandle* handle);

/*!
* \brief save dataset to binary file
* \param handle an instance of dataset
* \param filename file name
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_DatasetSaveBinary(DatesetHandle handle,
  const char* filename);

/*!
* \brief set vector to a content in info
* \param handle an instance of dataset
* \param field_name field name, can be label, weight, group
* \param field_data pointer to float vector
* \param num_element number of elements in field_data
* \param type float:0, int:1
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_DatasetSetField(DatesetHandle handle,
  const char* field_name,
  const void* field_data,
  uint64_t num_element,
  int type);

/*!
* \brief get float info vector from dataset
* \param handle an instance of data matrix
* \param field_name field name
* \param out_len used to set result length
* \param out_ptr pointer to the result
* \param out_type float:0, int:1
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_DatasetGetField(DatesetHandle handle,
  const char* field_name,
  uint64_t* out_len,
  const void** out_ptr,
  int* out_type);

/*!
* \brief get number of data.
* \param handle the handle to the dataset
* \param out The address to hold number of data
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
  uint64_t* out);

/*!
* \brief get number of features
* \param handle the handle to the dataset
* \param out The output of number of features
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
  uint64_t* out);

// --- start Booster interfaces

/*!
* \brief create a new boosting learner
* \param train_data training data set
* \param valid_datas validation data sets
* \param valid_names names of validation data sets
* \param n_valid_datas number of validation sets
* \param parameters format: 'key1=value1 key2=value2'
* \param out handle of created Booster
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterCreate(DatesetHandle train_data,
  DatesetHandle valid_datas[],
  const char* valid_names[],
  int n_valid_datas,
  const char* parameters,
  BoosterHandle* out);

/*!
* \brief load an existing boosting from model file
* \param filename filename of model
* \param out handle of created Booster
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterLoadFromModelfile(const char* filename,
  BoosterHandle* out);

/*!
* \brief free obj in handle
* \param handle handle to be freed
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterFree(BoosterHandle handle);

/*!
* \brief update the model in one round
* \param handle handle
* \param is_finished 1 means finished
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle,
  int* is_finished);

/*!
* \brief update the model by directly specifying gradient and second order gradient;
*  this can be used to support customized loss functions
* \param handle handle
* \param grad gradient statistics
* \param hess second order gradient statistics
* \param is_finished 1 means finished
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
  float* grad,
  float* hess,
  int* is_finished);

/*!
* \brief get evaluation for training data and validation datas
* \param handle handle
* \param data 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_result the string containing evaluation statistics
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterEval(BoosterHandle handle,
  int data,
  const char** out_result);

/*!
* \brief make prediction for training data and validation datas;
*  this can be used to support customized eval functions
* \param handle handle
* \param data 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param predict_type
*          0:raw score
*          1:with sigmoid transform(if needed)
*          2:leaf index
* \param out_result used to set a pointer to array
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterPredict(BoosterHandle handle,
  int data,
  int predict_type,
  const float** out_result);

/*!
* \brief make prediction for a new data set
* \param handle handle
* \param indptr pointer to row headers
* \param indices findex
* \param data fvalue
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns; when it's set to 0, then guess from data
* \param predict_type
*          0:raw score
*          1:with sigmoid transform(if needed)
*          2:leaf index
* \param n_used_trees number of used trees
* \param out_result used to set a pointer to array
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
  const uint64_t* indptr,
  const unsigned* indices,
  const float* data,
  uint64_t nindptr,
  uint64_t nelem,
  uint64_t num_col,
  int predict_type,
  uint64_t n_used_trees,
  const float** out_result);

/*!
* \brief make prediction for a new data set
* \param handle handle
* \param col_ptr pointer to col headers
* \param indices findex
* \param data fvalue
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_row number of rows; when it's set to 0, then guess from data
* \param predict_type
*          0:raw score
*          1:with sigmoid transform(if needed)
*          2:leaf index
* \param n_used_trees number of used trees
* \param out_result used to set a pointer to array
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForCSC(BoosterHandle handle,
  const uint64_t* col_ptr,
  const unsigned* indices,
  const float* data,
  uint64_t nindptr,
  uint64_t nelem,
  uint64_t num_row,
  int predict_type,
  uint64_t n_used_trees,
  const float** out_result);

/*!
* \brief make prediction for a new data set
* \param handle handle
* \param data pointer to the data space
* \param nrow number of rows
* \param ncol number of columns
* \param missing which value represents a missing value
* \param predict_type
*          0:raw score
*          1:with sigmoid transform(if needed)
*          2:leaf index
* \param n_used_trees number of used trees
* \param out_result used to set a pointer to array
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
  const float* data,
  uint64_t nrow,
  uint64_t ncol,
  float missing,
  int predict_type,
  uint64_t n_used_trees,
  const float** out_result);

/*!
* \brief save model into file
* \param handle handle
* \param is_finished 1 means finished
* \param filename file name
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
  int is_finished,
  const char* filename);

#endif // LIGHTGBM_C_API_H_
include/LightGBM/config.h
View file @
504d400c
...
...
@@ -99,6 +99,8 @@ public:
bool
is_enable_sparse
=
true
;
bool
use_two_round_loading
=
false
;
bool
is_save_binary_file
=
false
;
bool
enable_load_from_binary_file
=
true
;
int
bin_construct_sample_cnt
=
50000
;
bool
is_sigmoid
=
true
;
bool
has_header
=
false
;
...
...
include/LightGBM/dataset.h
View file @
504d400c
...
...
@@ -17,6 +17,7 @@ namespace LightGBM {
/*! \brief forward declaration */
class
Feature
;
class
BinMapper
;
/*!
* \brief This class is used to store some meta(non-feature) data for training data,
...
...
@@ -79,6 +80,13 @@ public:
void
CheckOrPartition
(
data_size_t
num_all_data
,
const
std
::
vector
<
data_size_t
>&
used_data_indices
);
void
SetLabel
(
const
float
*
label
,
data_size_t
len
);
void
SetWeights
(
const
float
*
weights
,
data_size_t
len
);
void
SetQueryBoundaries
(
const
data_size_t
*
QueryBoundaries
,
data_size_t
len
);
/*!
* \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score.
...
...
@@ -188,8 +196,6 @@ private:
data_size_t
num_weights_
;
/*! \brief Label data */
float
*
label_
;
/*! \brief Label data, int type */
int16_t
*
label_int_
;
/*! \brief Weights data */
float
*
weights_
;
/*! \brief Query boundaries */
...
...
@@ -262,9 +268,24 @@ public:
:
Dataset
(
data_filename
,
""
,
io_config
,
predict_fun
)
{
}
/*!
* \brief Constructor, without filename, used to load data from memory
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
);
/*! \brief Destructor */
~
Dataset
();
/*! \brief Init Dataset with specific binmapper */
void
InitByBinMapper
(
std
::
vector
<
const
BinMapper
*>
bin_mappers
,
data_size_t
num_data
);
/*! \brief push raw data into dataset */
void
PushData
(
const
std
::
vector
<
std
::
vector
<
std
::
pair
<
int
,
float
>>>&
datas
,
data_size_t
start_idx
,
bool
is_finished
);
void
SetField
(
const
char
*
field_name
,
const
void
*
field_data
,
data_size_t
num_element
,
int
type
);
/*!
* \brief Load training data on parallel training
* \param rank Rank of local machine
...
...
@@ -290,10 +311,21 @@ public:
*/
void
LoadValidationData
(
const
Dataset
*
train_set
,
bool
use_two_round_loading
);
/*!
* \brief Load data set from binary file
* \param bin_filename filename of bin data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void
LoadDataFromBinFile
(
const
char
*
bin_filename
,
int
rank
,
int
num_machines
,
bool
is_pre_partition
);
/*!
* \brief Save current dataset into binary file, will save to "filename.bin"
*/
void
SaveBinaryFile
();
void
SaveBinaryFile
(
const
char
*
bin_filename
);
std
::
vector
<
const
BinMapper
*>
GetBinMappers
()
const
;
/*!
* \brief Get a feature pointer for specific index
...
...
@@ -371,14 +403,6 @@ private:
/*! \brief Check can load from binary file */
void
CheckCanLoadFromBin
();
/*!
* \brief Load data set from binary file
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void
LoadDataFromBinFile
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
);
/*! \brief Check this data set is null or not */
void
CheckDataset
();
...
...
@@ -424,6 +448,8 @@ private:
std
::
unordered_set
<
int
>
ignore_features_
;
/*! \brief store feature names */
std
::
vector
<
std
::
string
>
feature_names_
;
/*! \brief store feature names */
int
bin_construct_sample_cnt_
;
};
}
// namespace LightGBM
...
...
src/application/application.cpp
View file @
504d400c
...
...
@@ -150,7 +150,7 @@ void Application::LoadData() {
}
// need save binary file
if
(
config_
.
io_config
.
is_save_binary_file
)
{
train_data_
->
SaveBinaryFile
();
train_data_
->
SaveBinaryFile
(
nullptr
);
}
// create training metric
if
(
config_
.
boosting_config
->
is_provide_training_metric
)
{
...
...
@@ -175,7 +175,7 @@ void Application::LoadData() {
config_
.
io_config
.
use_two_round_loading
);
// need save binary file
if
(
config_
.
io_config
.
is_save_binary_file
)
{
valid_datas_
.
back
()
->
SaveBinaryFile
();
valid_datas_
.
back
()
->
SaveBinaryFile
(
nullptr
);
}
// add metric for validation data
...
...
src/boosting/boosting.cpp
View file @
504d400c
...
...
@@ -25,7 +25,7 @@ void LoadFileToBoosting(Boosting* boosting, const char* filename) {
}
Boosting
*
Boosting
::
CreateBoosting
(
BoostingType
type
,
const
char
*
filename
)
{
if
(
filename
[
0
]
==
'\0'
)
{
if
(
filename
==
nullptr
||
filename
[
0
]
==
'\0'
)
{
if
(
type
==
BoostingType
::
kGBDT
)
{
return
new
GBDT
();
}
else
{
...
...
src/c_api.cpp
0 → 100644
View file @
504d400c
#include <LightGBM/c_api.h>

#include <LightGBM/dataset.h>
#include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
#include <LightGBM/config.h>

#include <cstdio>
#include <vector>
#include <string>
#include <cstring>

namespace LightGBM {

/*!
* \brief Internal wrapper used by the C API: owns a Boosting object together
*        with its objective function and metrics, and (non-owning) references
*        to the training / validation datasets.
*/
class Booster {
public:
  /*! \brief Construct a booster by loading an existing model from file. */
  explicit Booster(const char* filename)
    : boosting_(Boosting::CreateBoosting(filename)) {
  }

  /*!
  * \brief Construct a booster for training.
  * \param train_data training dataset (not owned)
  * \param valid_data validation datasets (not owned)
  * \param valid_names display names for the validation datasets
  * \param parameters config string, format: 'key1=value1 key2=value2'
  */
  Booster(const Dataset* train_data,
    std::vector<const Dataset*> valid_data,
    std::vector<std::string> valid_names,
    const char* parameters)
    : train_data_(train_data), valid_datas_(valid_data) {
    config_.LoadFromString(parameters);
    // continued training from a model file is not supported through this path
    if (config_.io_config.input_model.size() > 0) {
      Log::Error("continued train from model is not support for c_api, \
please use continued train with input score");
    }
    boosting_ = Boosting::CreateBoosting(config_.boosting_type, "");
    // objective function drives gradient computation during training
    objective_fun_ = ObjectiveFunction::CreateObjectiveFunction(config_.objective_type,
      config_.objective_config);
    // optionally evaluate metrics on the training data as well
    if (config_.boosting_config->is_provide_training_metric) {
      for (auto metric_type : config_.metric_types) {
        Metric* metric = Metric::CreateMetric(metric_type, config_.metric_config);
        if (metric == nullptr) { continue; }
        metric->Init("training", train_data_->metadata(), train_data_->num_data());
        train_metric_.push_back(metric);
      }
    }
    // one metric list per validation dataset
    for (size_t i = 0; i < valid_datas_.size(); ++i) {
      valid_metrics_.emplace_back();
      for (auto metric_type : config_.metric_types) {
        Metric* metric = Metric::CreateMetric(metric_type, config_.metric_config);
        if (metric == nullptr) { continue; }
        metric->Init(valid_names[i].c_str(), valid_datas_[i]->metadata(),
          valid_datas_[i]->num_data());
        valid_metrics_.back().push_back(metric);
      }
    }
    // wire everything together: objective first, then boosting, then valid sets
    objective_fun_->Init(train_data_->metadata(), train_data_->num_data());
    boosting_->Init(config_.boosting_config, train_data_, objective_fun_,
      ConstPtrInVectorWarpper<Metric>(train_metric_));
    for (size_t i = 0; i < valid_datas_.size(); ++i) {
      boosting_->AddDataset(valid_datas_[i],
        ConstPtrInVectorWarpper<Metric>(valid_metrics_[i]));
    }
  }

  /*! \brief Release every owned object; datasets are NOT owned and are left alone. */
  ~Booster() {
    for (auto& metric : train_metric_) {
      if (metric != nullptr) { delete metric; }
    }
    for (auto& metric : valid_metrics_) {
      for (auto& sub_metric : metric) {
        if (sub_metric != nullptr) { delete sub_metric; }
      }
    }
    valid_metrics_.clear();
    if (boosting_ != nullptr) { delete boosting_; }
    if (objective_fun_ != nullptr) { delete objective_fun_; }
  }

private:
  Boosting* boosting_;
  /*! \brief All configs */
  OverallConfig config_;
  /*! \brief Training data */
  const Dataset* train_data_;
  /*! \brief Validation data */
  std::vector<const Dataset*> valid_datas_;
  /*! \brief Metric for training data */
  std::vector<Metric*> train_metric_;
  /*! \brief Metrics for validation data */
  std::vector<std::vector<Metric*>> valid_metrics_;
  /*! \brief Training objective function */
  ObjectiveFunction* objective_fun_;
};

}
src/io/config.cpp
View file @
504d400c
...
...
@@ -191,10 +191,12 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
}
GetInt
(
params
,
"verbose"
,
&
verbosity
);
GetInt
(
params
,
"num_model_predict"
,
&
num_model_predict
);
GetInt
(
params
,
"bin_construct_sample_cnt"
,
&
bin_construct_sample_cnt
);
GetBool
(
params
,
"is_pre_partition"
,
&
is_pre_partition
);
GetBool
(
params
,
"is_enable_sparse"
,
&
is_enable_sparse
);
GetBool
(
params
,
"use_two_round_loading"
,
&
use_two_round_loading
);
GetBool
(
params
,
"is_save_binary_file"
,
&
is_save_binary_file
);
GetBool
(
params
,
"enable_load_from_binary_file"
,
&
enable_load_from_binary_file
);
GetBool
(
params
,
"is_sigmoid"
,
&
is_sigmoid
);
GetString
(
params
,
"output_model"
,
&
output_model
);
GetString
(
params
,
"input_model"
,
&
input_model
);
...
...
src/io/dataset.cpp
View file @
504d400c
...
...
@@ -18,9 +18,11 @@ namespace LightGBM {
Dataset
::
Dataset
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
)
:
data_filename_
(
data_filename
),
random_
(
io_config
.
data_random_seed
),
max_bin_
(
io_config
.
max_bin
),
is_enable_sparse_
(
io_config
.
is_enable_sparse
),
predict_fun_
(
predict_fun
)
{
max_bin_
(
io_config
.
max_bin
),
is_enable_sparse_
(
io_config
.
is_enable_sparse
),
predict_fun_
(
predict_fun
),
bin_construct_sample_cnt_
(
io_config
.
bin_construct_sample_cnt
)
{
if
(
io_config
.
enable_load_from_binary_file
)
{
CheckCanLoadFromBin
();
}
if
(
is_loading_from_binfile_
&&
predict_fun
!=
nullptr
)
{
Log
::
Info
(
"Cannot performing initialization of prediction by using binary file, using text file instead"
);
is_loading_from_binfile_
=
false
;
...
...
@@ -160,6 +162,17 @@ Dataset::Dataset(const char* data_filename, const char* init_score_filename,
}
Dataset
::
Dataset
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
)
:
data_filename_
(
""
),
random_
(
io_config
.
data_random_seed
),
max_bin_
(
io_config
.
max_bin
),
is_enable_sparse_
(
io_config
.
is_enable_sparse
),
predict_fun_
(
predict_fun
),
bin_construct_sample_cnt_
(
io_config
.
bin_construct_sample_cnt
)
{
parser_
=
nullptr
;
text_reader_
=
nullptr
;
}
Dataset
::~
Dataset
()
{
if
(
parser_
!=
nullptr
)
{
delete
parser_
;
}
if
(
text_reader_
!=
nullptr
)
{
delete
text_reader_
;
}
...
...
@@ -216,7 +229,7 @@ void Dataset::LoadDataToMemory(int rank, int num_machines, bool is_pre_partition
}
void
Dataset
::
SampleDataFromMemory
(
std
::
vector
<
std
::
string
>*
out_data
)
{
const
size_t
sample_cnt
=
static_cast
<
size_t
>
(
num_data_
<
50000
?
num_data_
:
50000
);
const
size_t
sample_cnt
=
static_cast
<
size_t
>
(
num_data_
<
bin_construct_sample_cnt_
?
num_data_
:
bin_construct_sample_cnt_
);
std
::
vector
<
size_t
>
sample_indices
=
random_
.
Sample
(
num_data_
,
sample_cnt
);
out_data
->
clear
();
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
...
...
@@ -228,7 +241,7 @@ void Dataset::SampleDataFromMemory(std::vector<std::string>* out_data) {
void
Dataset
::
SampleDataFromFile
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
,
std
::
vector
<
std
::
string
>*
out_data
)
{
used_data_indices_
.
clear
();
const
size_t
sample_cnt
=
50000
;
const
data_
size_t
sample_cnt
=
static_cast
<
data_size_t
>
(
bin_construct_sample_cnt_
)
;
if
(
num_machines
==
1
||
is_pre_partition
)
{
num_data_
=
static_cast
<
data_size_t
>
(
text_reader_
->
SampleFromFile
(
random_
,
sample_cnt
,
out_data
));
global_num_data_
=
num_data_
;
...
...
@@ -272,6 +285,83 @@ void Dataset::SampleDataFromFile(int rank, int num_machines, bool is_pre_partiti
}
}
/*!
* \brief Initialize the dataset with pre-built bin mappers (used by the C API
*        when aligning a new dataset with a reference dataset).
* \param bin_mappers one mapper per original feature; nullptr means the feature is unused
* \param num_data number of rows the dataset will hold
*/
void Dataset::InitByBinMapper(std::vector<const BinMapper*> bin_mappers, data_size_t num_data) {
  num_data_ = num_data;
  global_num_data_ = num_data_;
  // initialize label
  metadata_.Init(num_data_, -1, -1);
  // free old memory
  for (auto& feature : features_) {
    delete feature;
  }
  features_.clear();
  // -1 marks "feature not used"
  used_feature_map_ = std::vector<int>(bin_mappers.size(), -1);
  for (size_t i = 0; i < bin_mappers.size(); ++i) {
    if (bin_mappers[i] != nullptr) {
      // copy the mapper so this dataset owns its own instance
      features_.push_back(new Feature(static_cast<int>(i),
        new BinMapper(*bin_mappers[i]), num_data_, is_enable_sparse_));
      // fix: the element just pushed lives at index size() - 1; storing size()
      // would make PushData address the wrong (or an out-of-range) feature
      used_feature_map_[i] = static_cast<int>(features_.size()) - 1;
    }
  }
  num_features_ = static_cast<int>(features_.size());
}
/*!
* \brief Collect the bin mapper of every used feature, indexed by the
*        feature's original column position; unused columns stay nullptr.
*/
std::vector<const BinMapper*> Dataset::GetBinMappers() const {
  std::vector<const BinMapper*> mappers(num_total_features_, nullptr);
  for (const auto feature : features_) {
    mappers[feature->feature_index()] = feature->bin_mapper();
  }
  return mappers;
}
void
Dataset
::
PushData
(
const
std
::
vector
<
std
::
vector
<
std
::
pair
<
int
,
float
>>>&
datas
,
data_size_t
start_idx
,
bool
is_finished
)
{
// if doesn't need to prediction with initial model
#pragma omp parallel for schedule(guided)
for
(
data_size_t
i
=
0
;
i
<
static_cast
<
int
>
(
datas
.
size
());
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
for
(
auto
&
inner_data
:
datas
[
i
])
{
int
feature_idx
=
used_feature_map_
[
inner_data
.
first
];
if
(
feature_idx
>=
0
)
{
// if is used feature
features_
[
feature_idx
]
->
PushData
(
tid
,
start_idx
+
i
,
inner_data
.
second
);
}
}
}
if
(
is_finished
)
{
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
)
{
features_
[
i
]
->
FinishLoad
();
}
}
}
/*!
* \brief Set one metadata field (label / weight / init_score / query) from a raw buffer.
* \param field_name case-sensitive field name; surrounding whitespace is trimmed
* \param field_data pointer to the values; interpreted according to \a type
* \param num_element number of elements in \a field_data
* \param type element type tag: float:0, int:1
*/
void Dataset::SetField(const char* field_name, const void* field_data,
  data_size_t num_element, int type) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
    if (type != 0) {
      Log::Fatal("type of label should be float");
    }
    metadata_.SetLabel(static_cast<const float*>(field_data), num_element);
  } else if (name == std::string("weight") || name == std::string("weights")) {
    if (type != 0) {
      Log::Fatal("type of weights should be float");
    }
    metadata_.SetWeights(static_cast<const float*>(field_data), num_element);
  } else if (name == std::string("init_score")) {
    if (type != 0) {
      Log::Fatal("type of init_score should be float");
    }
    metadata_.SetInitScore(static_cast<const float*>(field_data), num_element);
  } else if (name == std::string("query") || name == std::string("group")) {
    // fix: this message previously said "init_score", which is misleading here
    if (type != 1) {
      Log::Fatal("type of query should be int");
    }
    metadata_.SetQueryBoundaries(static_cast<const data_size_t*>(field_data), num_element);
  } else {
    // fix: "unknow" -> "unknown"
    Log::Fatal("unknown field name: %s", field_name);
  }
}
void
Dataset
::
ConstructBinMappers
(
int
rank
,
int
num_machines
,
const
std
::
vector
<
std
::
string
>&
sample_data
)
{
// sample_values[i][j], means the value of j-th sample on i-th feature
std
::
vector
<
std
::
vector
<
float
>>
sample_values
;
...
...
@@ -452,8 +542,10 @@ void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, b
ExtractFeaturesFromFile
();
}
}
else
{
std
::
string
bin_filename
(
data_filename_
);
bin_filename
.
append
(
".bin"
);
// load data from binary file
LoadDataFromBinFile
(
rank
,
num_machines
,
is_pre_partition
);
LoadDataFromBinFile
(
bin_filename
.
c_str
(),
rank
,
num_machines
,
is_pre_partition
);
}
// check meta data
metadata_
.
CheckOrPartition
(
static_cast
<
data_size_t
>
(
global_num_data_
),
used_data_indices_
);
...
...
@@ -497,8 +589,10 @@ void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_lo
ExtractFeaturesFromFile
();
}
}
else
{
std
::
string
bin_filename
(
data_filename_
);
bin_filename
.
append
(
".bin"
);
// load from binary file
LoadDataFromBinFile
(
0
,
1
,
false
);
LoadDataFromBinFile
(
bin_filename
.
c_str
(),
0
,
1
,
false
);
}
// not need to check validation data
// check meta data
...
...
@@ -646,19 +740,23 @@ void Dataset::ExtractFeaturesFromFile() {
}
}
void
Dataset
::
SaveBinaryFile
()
{
// if is loaded from binary file, not need to save
void
Dataset
::
SaveBinaryFile
(
const
char
*
bin_filename
)
{
if
(
!
is_loading_from_binfile_
)
{
std
::
string
bin_filename
(
data_filename_
);
bin_filename
.
append
(
".bin"
);
// if not pass a filename, just append ".bin" of original file
if
(
bin_filename
==
nullptr
||
bin_filename
[
0
]
==
'\0'
)
{
std
::
string
bin_filename_str
(
data_filename_
);
bin_filename_str
.
append
(
".bin"
);
bin_filename
=
bin_filename_str
.
c_str
();
}
FILE
*
file
;
#ifdef _MSC_VER
fopen_s
(
&
file
,
bin_filename
.
c_str
()
,
"wb"
);
fopen_s
(
&
file
,
bin_filename
,
"wb"
);
#else
file
=
fopen
(
bin_filename
.
c_str
()
,
"wb"
);
file
=
fopen
(
bin_filename
,
"wb"
);
#endif
if
(
file
==
NULL
)
{
Log
::
Fatal
(
"Cannot write binary data to %s "
,
bin_filename
.
c_str
()
);
Log
::
Fatal
(
"Cannot write binary data to %s "
,
bin_filename
);
}
Log
::
Info
(
"Saving data to binary file: %s"
,
data_filename_
);
...
...
@@ -715,20 +813,18 @@ void Dataset::CheckCanLoadFromBin() {
}
}
void
Dataset
::
LoadDataFromBinFile
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
)
{
std
::
string
bin_filename
(
data_filename_
);
bin_filename
.
append
(
".bin"
);
void
Dataset
::
LoadDataFromBinFile
(
const
char
*
bin_filename
,
int
rank
,
int
num_machines
,
bool
is_pre_partition
)
{
FILE
*
file
;
#ifdef _MSC_VER
fopen_s
(
&
file
,
bin_filename
.
c_str
()
,
"rb"
);
fopen_s
(
&
file
,
bin_filename
,
"rb"
);
#else
file
=
fopen
(
bin_filename
.
c_str
()
,
"rb"
);
file
=
fopen
(
bin_filename
,
"rb"
);
#endif
if
(
file
==
NULL
)
{
Log
::
Fatal
(
"Cannot read binary data from %s"
,
bin_filename
.
c_str
()
);
Log
::
Fatal
(
"Cannot read binary data from %s"
,
bin_filename
);
}
// buffer to read binary file
...
...
src/io/metadata.cpp
View file @
504d400c
...
...
@@ -8,7 +8,7 @@
namespace
LightGBM
{
Metadata
::
Metadata
()
:
label_
(
nullptr
),
label_int_
(
nullptr
),
weights_
(
nullptr
),
:
label_
(
nullptr
),
weights_
(
nullptr
),
query_boundaries_
(
nullptr
),
query_weights_
(
nullptr
),
init_score_
(
nullptr
),
queries_
(
nullptr
){
...
...
@@ -225,6 +225,48 @@ void Metadata::SetInitScore(const float* init_score, data_size_t len) {
}
}
void
Metadata
::
SetLabel
(
const
float
*
label
,
data_size_t
len
)
{
if
(
num_data_
!=
len
)
{
Log
::
Fatal
(
"len of label is not same with #data"
);
}
if
(
label_
!=
nullptr
)
{
delete
[]
label_
;
}
label_
=
new
float
[
num_data_
];
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
label_
[
i
]
=
label
[
i
];
}
}
void
Metadata
::
SetWeights
(
const
float
*
weights
,
data_size_t
len
)
{
if
(
num_data_
!=
len
)
{
Log
::
Fatal
(
"len of weights is not same with #data"
);
}
if
(
weights_
!=
nullptr
)
{
delete
[]
weights_
;
}
num_weights_
=
num_data_
;
weights_
=
new
float
[
num_weights_
];
for
(
data_size_t
i
=
0
;
i
<
num_weights_
;
++
i
)
{
weights_
[
i
]
=
weights
[
i
];
}
LoadQueryWeights
();
}
void
Metadata
::
SetQueryBoundaries
(
const
data_size_t
*
query_boundaries
,
data_size_t
len
)
{
data_size_t
sum
=
0
;
for
(
data_size_t
i
=
0
;
i
<
len
;
++
i
)
{
sum
+=
query_boundaries
[
i
];
}
if
(
num_data_
!=
sum
)
{
Log
::
Fatal
(
"sum of query counts is not same with #data"
);
}
if
(
query_boundaries_
!=
nullptr
)
{
delete
[]
query_boundaries_
;
}
num_queries_
=
len
;
query_boundaries_
=
new
data_size_t
[
num_queries_
];
for
(
data_size_t
i
=
0
;
i
<
num_queries_
;
++
i
)
{
query_boundaries_
[
i
]
=
query_boundaries
[
i
];
}
LoadQueryWeights
();
}
void
Metadata
::
LoadWeights
()
{
num_weights_
=
0
;
std
::
string
weight_filename
(
data_filename_
);
...
...
src/io/sparse_bin.hpp
View file @
504d400c
...
...
@@ -54,7 +54,7 @@ public:
void
ConstructHistogram
(
data_size_t
*
,
data_size_t
,
const
score_t
*
,
const
score_t
*
,
HistogramBinEntry
*
)
const
override
{
// Will use OrderedSparseBin->ConstructHistogram() instead
Log
::
Info
(
"Should use OrderedSparseBin->ConstructHistogram() instead"
);
Log
::
Fatal
(
"Should use OrderedSparseBin->ConstructHistogram() instead"
);
}
data_size_t
Split
(
unsigned
int
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
...
...
windows/LightGBM.vcxproj
View file @
504d400c
...
...
@@ -159,6 +159,7 @@
<ClInclude
Include=
"..\include\LightGBM\bin.h"
/>
<ClInclude
Include=
"..\include\LightGBM\boosting.h"
/>
<ClInclude
Include=
"..\include\LightGBM\config.h"
/>
<ClInclude
Include=
"..\include\LightGBM\c_api.h"
/>
<ClInclude
Include=
"..\include\LightGBM\dataset.h"
/>
<ClInclude
Include=
"..\include\LightGBM\feature.h"
/>
<ClInclude
Include=
"..\include\LightGBM\meta.h"
/>
...
...
@@ -203,6 +204,7 @@
<ClCompile
Include=
"..\src\application\application.cpp"
/>
<ClCompile
Include=
"..\src\boosting\boosting.cpp"
/>
<ClCompile
Include=
"..\src\boosting\gbdt.cpp"
/>
<ClCompile
Include=
"..\src\c_api.cpp"
/>
<ClCompile
Include=
"..\src\io\bin.cpp"
/>
<ClCompile
Include=
"..\src\io\config.cpp"
/>
<ClCompile
Include=
"..\src\io\dataset.cpp"
/>
...
...
windows/LightGBM.vcxproj.filters
View file @
504d400c
...
...
@@ -165,6 +165,9 @@
<ClInclude
Include=
"..\include\LightGBM\utils\lru_pool.h"
>
<Filter>
include\LightGBM\utils
</Filter>
</ClInclude>
<ClInclude
Include=
"..\include\LightGBM\c_api.h"
>
<Filter>
include\LightGBM
</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile
Include=
"..\src\application\application.cpp"
>
...
...
@@ -230,5 +233,8 @@
<ClCompile
Include=
"..\src\main.cpp"
>
<Filter>
src
</Filter>
</ClCompile>
<ClCompile
Include=
"..\src\c_api.cpp"
>
<Filter>
src
</Filter>
</ClCompile>
</ItemGroup>
</Project>
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment