"...git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "2b18b44f0b1c621ca73974e50b15a8baec2f7816"
Commit 422c0ef7 authored by Guolin Ke's avatar Guolin Ke
Browse files

almost finish, need some tests

parent fc383361
...@@ -73,7 +73,7 @@ public: ...@@ -73,7 +73,7 @@ public:
* \param result used to store prediction result, should allocate memory before call this function * \param result used to store prediction result, should allocate memory before call this function
* \param out_len lenght of returned score * \param out_len lenght of returned score
*/ */
virtual void GetPredictAt(int data_idx, score_t* result, data_size_t* out_len) const = 0; virtual void GetPredictAt(int data_idx, score_t* result, data_size_t* out_len) = 0;
/*! /*!
* \brief Prediction for one record, not sigmoid transform * \brief Prediction for one record, not sigmoid transform
...@@ -127,7 +127,7 @@ public: ...@@ -127,7 +127,7 @@ public:
* \brief Get number of weak sub-models * \brief Get number of weak sub-models
* \return Number of weak sub-models * \return Number of weak sub-models
*/ */
virtual int NumberOfSubModels() const = 0; virtual int NumberOfTotalModel() const = 0;
/*! /*!
* \brief Get number of classes * \brief Get number of classes
...@@ -138,7 +138,7 @@ public: ...@@ -138,7 +138,7 @@ public:
/*! /*!
* \brief Set number of used model for prediction * \brief Set number of used model for prediction
*/ */
virtual void SetNumUsedModel(int num_used_model) = 0; virtual void SetNumIterationForPred(int num_iteration) = 0;
/*! /*!
* \brief Get Type name of this boosting object * \brief Get Type name of this boosting object
......
...@@ -230,11 +230,13 @@ DllExport int LGBM_BoosterCreate(const DatesetHandle train_data, ...@@ -230,11 +230,13 @@ DllExport int LGBM_BoosterCreate(const DatesetHandle train_data,
/*! /*!
* \brief load an existing boosting from model file * \brief load an existing boosting from model file
* \param filename filename of model * \param filename filename of model
* \param out_num_total_model number of total models
* \param out handle of created Booster * \param out handle of created Booster
* \return 0 when success, -1 when failure happens * \return 0 when success, -1 when failure happens
*/ */
DllExport int LGBM_BoosterCreateFromModelfile( DllExport int LGBM_BoosterCreateFromModelfile(
const char* filename, const char* filename,
int64_t* out_num_total_model,
BoosterHandle* out); BoosterHandle* out);
/*! /*!
...@@ -244,6 +246,12 @@ DllExport int LGBM_BoosterCreateFromModelfile( ...@@ -244,6 +246,12 @@ DllExport int LGBM_BoosterCreateFromModelfile(
*/ */
DllExport int LGBM_BoosterFree(BoosterHandle handle); DllExport int LGBM_BoosterFree(BoosterHandle handle);
/*!
* \brief Get number of class
* \return number of class
*/
DllExport int LGBM_BoosterGetNumClasses(BoosterHandle handle, int64_t* out_len);
/*! /*!
* \brief update the model in one round * \brief update the model in one round
* \param handle handle * \param handle handle
...@@ -276,7 +284,7 @@ DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len); ...@@ -276,7 +284,7 @@ DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len);
* \brief Get number of eval * \brief Get number of eval
* \return total number of eval result * \return total number of eval result
*/ */
DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, const char*** out_strs); DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, char** out_strs);
/*! /*!
* \brief get evaluation for training data and validation data * \brief get evaluation for training data and validation data
...@@ -291,17 +299,6 @@ DllExport int LGBM_BoosterGetEval(BoosterHandle handle, ...@@ -291,17 +299,6 @@ DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
int64_t* out_len, int64_t* out_len,
float* out_results); float* out_results);
/*!
* \brief get raw score for training data, used to calculate gradients outside
* \param handle handle
* \param out_len len of output result
* \param out_result used to set a pointer to array
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_BoosterGetTrainingScore(BoosterHandle handle,
int64_t* out_len,
const float** out_result);
/*! /*!
* \brief Get prediction for training data and validation data * \brief Get prediction for training data and validation data
this can be used to support customized eval function this can be used to support customized eval function
...@@ -319,21 +316,21 @@ DllExport int LGBM_BoosterGetPredict(BoosterHandle handle, ...@@ -319,21 +316,21 @@ DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
/*! /*!
* \brief make prediction for file * \brief make prediction for file
* \param handle handle * \param handle handle
* \param data_filename filename of data file
* \param data_has_header data file has header or not
* \param predict_type * \param predict_type
* 0:raw score * 0:raw score
* 1:with transform(if needed) * 1:with transform(if needed)
* 2:leaf index * 2:leaf index
* \param n_used_trees number of used tree * \param num_iteration number of iteration for prediction
* \param data_has_header data file has header or not
* \param data_filename filename of data file
* \param result_filename filename of result file * \param result_filename filename of result file
* \return 0 when success, -1 when failure happens * \return 0 when success, -1 when failure happens
*/ */
DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle, DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle,
int predict_type,
int64_t n_used_trees,
int data_has_header,
const char* data_filename, const char* data_filename,
int data_has_header,
int predict_type,
int64_t num_iteration,
const char* result_filename); const char* result_filename);
/*! /*!
...@@ -351,7 +348,8 @@ DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle, ...@@ -351,7 +348,8 @@ DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle,
* 0:raw score * 0:raw score
* 1:with transform(if needed) * 1:with transform(if needed)
* 2:leaf index * 2:leaf index
* \param n_used_trees number of used tree * \param num_iteration number of iteration for prediction
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function * \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens * \return 0 when success, -1 when failure happens
*/ */
...@@ -365,8 +363,9 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle, ...@@ -365,8 +363,9 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int64_t nelem, int64_t nelem,
int64_t num_col, int64_t num_col,
int predict_type, int predict_type,
int64_t n_used_trees, int64_t num_iteration,
double* out_result); int64_t* out_len,
float* out_result);
/*! /*!
* \brief make prediction for an new data set * \brief make prediction for an new data set
...@@ -380,7 +379,8 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle, ...@@ -380,7 +379,8 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
* 0:raw score * 0:raw score
* 1:with transform(if needed) * 1:with transform(if needed)
* 2:leaf index * 2:leaf index
* \param n_used_trees number of used tree * \param num_iteration number of iteration for prediction
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function * \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens * \return 0 when success, -1 when failure happens
*/ */
...@@ -391,18 +391,19 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle, ...@@ -391,18 +391,19 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
int32_t ncol, int32_t ncol,
int is_row_major, int is_row_major,
int predict_type, int predict_type,
int64_t n_used_trees, int64_t num_iteration,
double* out_result); int64_t* out_len,
float* out_result);
/*! /*!
* \brief save model into file * \brief save model into file
* \param handle handle * \param handle handle
* \param num_used_model * \param num_iteration
* \param filename file name * \param filename file name
* \return 0 when success, -1 when failure happens * \return 0 when success, -1 when failure happens
*/ */
DllExport int LGBM_BoosterSaveModel(BoosterHandle handle, DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
int num_used_model, int num_iteration,
const char* filename); const char* filename);
......
...@@ -97,7 +97,7 @@ public: ...@@ -97,7 +97,7 @@ public:
std::string output_result = "LightGBM_predict_result.txt"; std::string output_result = "LightGBM_predict_result.txt";
std::string input_model = ""; std::string input_model = "";
int verbosity = 1; int verbosity = 1;
int num_model_predict = NO_LIMIT; int num_iteration_predict = NO_LIMIT;
bool is_pre_partition = false; bool is_pre_partition = false;
bool is_enable_sparse = true; bool is_enable_sparse = true;
bool use_two_round_loading = false; bool use_two_round_loading = false;
......
...@@ -6,6 +6,7 @@ import os ...@@ -6,6 +6,7 @@ import os
import ctypes import ctypes
import collections import collections
import re import re
import tempfile
import numpy as np import numpy as np
import scipy.sparse import scipy.sparse
...@@ -111,7 +112,7 @@ def c_array(ctype, values): ...@@ -111,7 +112,7 @@ def c_array(ctype, values):
return (ctype * len(values))(*values) return (ctype * len(values))(*values)
def dict_to_str(data): def dict_to_str(data):
if len(data) == 0: if data is None or len(data) == 0:
return "" return ""
pairs = [] pairs = []
for key in data: for key in data:
...@@ -131,10 +132,10 @@ def c_float_array(data): ...@@ -131,10 +132,10 @@ def c_float_array(data):
data = np.array(data, copy=False) data = np.array(data, copy=False)
if is_numpy_1d_array(data): if is_numpy_1d_array(data):
if data.dtype == np.float32: if data.dtype == np.float32:
ptr_data = c_array(ctypes.c_float, data) ptr_data = data.ctypes.data_as(ctypes.c_float)
type_data = C_API_DTYPE_FLOAT32 type_data = C_API_DTYPE_FLOAT32
elif data.dtype == np.float64: elif data.dtype == np.float64:
ptr_data = c_array(ctypes.c_double, data) ptr_data = data.ctypes.data_as(ctypes.c_double)
type_data = C_API_DTYPE_FLOAT64 type_data = C_API_DTYPE_FLOAT64
else: else:
raise TypeError("expected np.float32 or np.float64, met type({})".format(data.dtype)) raise TypeError("expected np.float32 or np.float64, met type({})".format(data.dtype))
...@@ -148,10 +149,10 @@ def c_int_array(data): ...@@ -148,10 +149,10 @@ def c_int_array(data):
data = np.array(data, copy=False) data = np.array(data, copy=False)
if is_numpy_1d_array(data): if is_numpy_1d_array(data):
if data.dtype == np.int32: if data.dtype == np.int32:
ptr_data = c_array(ctypes.c_int32, data) ptr_data = data.ctypes.data_as(ctypes.c_int32)
type_data = C_API_DTYPE_INT32 type_data = C_API_DTYPE_INT32
elif data.dtype == np.int64: elif data.dtype == np.int64:
ptr_data = c_array(ctypes.c_int64, data) ptr_data = data.ctypes.data_as(ctypes.c_int64)
type_data = C_API_DTYPE_INT64 type_data = C_API_DTYPE_INT64
else: else:
raise TypeError("expected np.int32 or np.int64, met type({})".format(data.dtype)) raise TypeError("expected np.int32 or np.int64, met type({})".format(data.dtype))
...@@ -206,6 +207,7 @@ class Dataset(object): ...@@ -206,6 +207,7 @@ class Dataset(object):
self.raw_data = data self.raw_data = data
else: else:
self.raw_data = None self.raw_data = None
self.data_has_header = False
"""process for args""" """process for args"""
params = {} params = {}
params["max_bin"] = max_bin params["max_bin"] = max_bin
...@@ -223,6 +225,10 @@ class Dataset(object): ...@@ -223,6 +225,10 @@ class Dataset(object):
raise TypeError('Reference dataset should be None or dataset instance') raise TypeError('Reference dataset should be None or dataset instance')
"""start construct data""" """start construct data"""
if is_str(data): if is_str(data):
"""check data has header or not"""
if "has_header" in params or "header" in params:
if params["has_header"].lower() == "true" or params["header"].lower() == "true":
data_has_header = True
self.handle = ctypes.c_void_p() self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_CreateDatasetFromFile( _safe_call(_LIB.LGBM_CreateDatasetFromFile(
c_str(data), c_str(data),
...@@ -230,17 +236,21 @@ class Dataset(object): ...@@ -230,17 +236,21 @@ class Dataset(object):
ref_dataset, ref_dataset,
ctypes.byref(self.handle))) ctypes.byref(self.handle)))
elif isinstance(data, scipy.sparse.csr_matrix): elif isinstance(data, scipy.sparse.csr_matrix):
self._init_from_csr(data, params_str, ref_dataset) self.__init_from_csr(data, params_str, ref_dataset)
elif isinstance(data, scipy.sparse.csc_matrix):
self._init_from_csc(data, params_str, ref_dataset)
elif isinstance(data, np.ndarray): elif isinstance(data, np.ndarray):
self._init_from_npy2d(data, params_str, ref_dataset) self.__init_from_np2d(data, params_str, ref_dataset)
else: else:
try: try:
csr = scipy.sparse.csr_matrix(data) csr = scipy.sparse.csr_matrix(data)
self._init_from_csr(csr) if self.raw_data is not None:
self.raw_data = csr
self.__init_from_csr(csr)
except: except:
raise TypeError('can not initialize Dataset from {}'.format(type(data).__name__)) raise TypeError('can not initialize Dataset from {}'.format(type(data).__name__))
self.__label = None
self.__weight = None
self.__init_score = None
self.__group = None
if label is not None: if label is not None:
self.set_label(label) self.set_label(label)
if weight is not None: if weight is not None:
...@@ -252,55 +262,7 @@ class Dataset(object): ...@@ -252,55 +262,7 @@ class Dataset(object):
def free_raw_data(self): def free_raw_data(self):
self.raw_data = None self.raw_data = None
def _init_from_csr(self, csr, params_str, ref_dataset): def __init_from_np2d(self, mat, params_str, ref_dataset):
"""
Initialize data from a CSR matrix.
"""
if len(csr.indices) != len(csr.data):
raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
self.handle = ctypes.c_void_p()
ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
ptr_data, type_ptr_data = c_float_array(csr.data)
_safe_call(_LIB.LGBM_CreateDatasetFromCSR(
ptr_indptr,
type_ptr_indptr,
c_array(ctypes.c_int32, csr.indices),
ptr_data,
type_ptr_data,
len(csr.indptr),
len(csr.data),
csr.shape[1],
c_str(params_str),
ref_dataset,
ctypes.byref(self.handle)))
def _init_from_csc(self, csr, params_str, ref_dataset):
"""
Initialize data from a CSC matrix.
"""
if len(csc.indices) != len(csc.data):
raise ValueError('length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data)))
self.handle = ctypes.c_void_p()
ptr_indptr, type_ptr_indptr = c_int_array(csc.indptr)
ptr_data, type_ptr_data = c_float_array(csc.data)
_safe_call(_LIB.LGBM_CreateDatasetFromCSC(
ptr_indptr,
type_ptr_indptr,
c_array(ctypes.c_int32, csc.indices),
ptr_data,
type_ptr_data,
len(csc.indptr),
len(csc.data),
csc.shape[0],
c_str(params_str),
ref_dataset,
ctypes.byref(self.handle)))
def _init_from_npy2d(self, mat, params_str, ref_dataset):
""" """
Initialize data from a 2-D numpy matrix. Initialize data from a 2-D numpy matrix.
""" """
...@@ -325,6 +287,30 @@ class Dataset(object): ...@@ -325,6 +287,30 @@ class Dataset(object):
ref_dataset, ref_dataset,
ctypes.byref(self.handle))) ctypes.byref(self.handle)))
def __init_from_csr(self, csr, params_str, ref_dataset):
"""
Initialize data from a CSR matrix.
"""
if len(csr.indices) != len(csr.data):
raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
self.handle = ctypes.c_void_p()
ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
ptr_data, type_ptr_data = c_float_array(csr.data)
_safe_call(_LIB.LGBM_CreateDatasetFromCSR(
ptr_indptr,
type_ptr_indptr,
csr.indices.ctypes.data_as(ctypes.c_int32),
ptr_data,
type_ptr_data,
len(csr.indptr),
len(csr.data),
csr.shape[1],
c_str(params_str),
ref_dataset,
ctypes.byref(self.handle)))
def __del__(self): def __del__(self):
_safe_call(_LIB.LGBM_DatasetFree(self.handle)) _safe_call(_LIB.LGBM_DatasetFree(self.handle))
...@@ -371,10 +357,10 @@ class Dataset(object): ...@@ -371,10 +357,10 @@ class Dataset(object):
if not is_numpy_1d_array(data): if not is_numpy_1d_array(data):
raise TypeError("Unknow type({})".format(type(data).__name__)) raise TypeError("Unknow type({})".format(type(data).__name__))
if data.dtype == np.float32: if data.dtype == np.float32:
ptr_data = c_array(ctypes.c_float, data) ptr_data = data.ctypes.data_as(ctypes.c_float)
type_data = C_API_DTYPE_FLOAT32 type_data = C_API_DTYPE_FLOAT32
elif data.dtype == np.int32: elif data.dtype == np.int32:
ptr_data = c_array(ctypes.c_int32, data) ptr_data = data.ctypes.data_as(ctypes.c_int32)
type_data = C_API_DTYPE_INT32 type_data = C_API_DTYPE_INT32
else: else:
raise TypeError("excepted np.float32 or np.int32, met type({})".format(data.dtype)) raise TypeError("excepted np.float32 or np.int32, met type({})".format(data.dtype))
...@@ -409,6 +395,7 @@ class Dataset(object): ...@@ -409,6 +395,7 @@ class Dataset(object):
label = list_to_1d_numpy(label, np.float32) label = list_to_1d_numpy(label, np.float32)
if label.dtype != np.float32: if label.dtype != np.float32:
label = label.astype(np.float32, copy=False) label = label.astype(np.float32, copy=False)
self.__label = label
self.set_field('label', label) self.set_field('label', label)
def set_weight(self, weight): def set_weight(self, weight):
...@@ -422,6 +409,7 @@ class Dataset(object): ...@@ -422,6 +409,7 @@ class Dataset(object):
weight = list_to_1d_numpy(weight, np.float32) weight = list_to_1d_numpy(weight, np.float32)
if weight.dtype != np.float32: if weight.dtype != np.float32:
weight = weight.astype(np.float32, copy=False) weight = weight.astype(np.float32, copy=False)
self.__weight = weight
self.set_field('weight', weight) self.set_field('weight', weight)
def set_init_score(self, score): def set_init_score(self, score):
...@@ -434,6 +422,7 @@ class Dataset(object): ...@@ -434,6 +422,7 @@ class Dataset(object):
score = list_to_1d_numpy(score, np.float32) score = list_to_1d_numpy(score, np.float32)
if score.dtype != np.float32: if score.dtype != np.float32:
score = score.astype(np.float32, copy=False) score = score.astype(np.float32, copy=False)
self.__init_score = init_score
self.set_field('init_score', score) self.set_field('init_score', score)
def set_group(self, group): def set_group(self, group):
...@@ -447,6 +436,7 @@ class Dataset(object): ...@@ -447,6 +436,7 @@ class Dataset(object):
group = list_to_1d_numpy(group, np.int32) group = list_to_1d_numpy(group, np.int32)
if group.dtype != np.int32: if group.dtype != np.int32:
group = group.astype(np.int32, copy=False) group = group.astype(np.int32, copy=False)
self.__group = group
self.set_field('group', group) self.set_field('group', group)
def set_group_id(self, group_id): def set_group_id(self, group_id):
...@@ -470,7 +460,9 @@ class Dataset(object): ...@@ -470,7 +460,9 @@ class Dataset(object):
------- -------
label : array label : array
""" """
return self.get_field('label') if self.__label is None:
self.__label = self.get_field('label')
return self.__label
def get_weight(self): def get_weight(self):
"""Get the weight of the Dataset. """Get the weight of the Dataset.
...@@ -479,7 +471,9 @@ class Dataset(object): ...@@ -479,7 +471,9 @@ class Dataset(object):
------- -------
weight : array weight : array
""" """
return self.get_field('weight') if self.__weight is None:
self.__weight = self.get_field('weight')
return self.__weight
def get_init_score(self): def get_init_score(self):
"""Get the initial score of the Dataset. """Get the initial score of the Dataset.
...@@ -488,7 +482,20 @@ class Dataset(object): ...@@ -488,7 +482,20 @@ class Dataset(object):
------- -------
init_score : array init_score : array
""" """
return self.get_field('init_score') if self.__init_score is None:
self.__init_score = self.get_field('init_score')
return self.__init_score
def get_group(self):
"""Get the initial score of the Dataset.
Returns
-------
init_score : array
"""
if self.__group is None:
self.__group = self.get_field('group')
return self.__group
def num_data(self): def num_data(self):
"""Get the number of rows in the Dataset. """Get the number of rows in the Dataset.
...@@ -553,6 +560,9 @@ class Dataset(object): ...@@ -553,6 +560,9 @@ class Dataset(object):
else: else:
self._feature_names = None self._feature_names = None
C_API_PREDICT_NORMAL =0
C_API_PREDICT_RAW_SCORE =1
C_API_PREDICT_LEAF_INDEX =2
class Booster(object): class Booster(object):
""""A Booster of of LightGBM. """"A Booster of of LightGBM.
...@@ -560,12 +570,9 @@ class Booster(object): ...@@ -560,12 +570,9 @@ class Booster(object):
feature_names = None feature_names = None
def __init__(self, params=None, def __init__(self,params=None,
train_set=None, train_set=None, valid_sets=None,
valid_sets=None, name_valid_sets=None, model_file=None):
name_valid_sets=None,
model_file=None,
fobj=None):
# pylint: disable=invalid-name # pylint: disable=invalid-name
"""Initialize the Booster. """Initialize the Booster.
...@@ -581,14 +588,16 @@ class Booster(object): ...@@ -581,14 +588,16 @@ class Booster(object):
name of validation datasets name of validation datasets
model_file : string model_file : string
Path to the model file. Path to the model file.
If tarin_set is not None, used for continued train.
else used for loading model prediction task
""" """
self.handle = ctypes.c_void_p() self.handle = ctypes.c_void_p()
if train_set is not None: if train_set is not None:
"""Training task"""
if not isinstance(train_set, Dataset): if not isinstance(train_set, Dataset):
raise TypeError('training data should be Dataset instance, met{}'.format(type(train_set).__name__)) raise TypeError('training data should be Dataset instance, met{}'.format(type(train_set).__name__))
valid_handles = None valid_handles = None
valid_cnames = None
n_valid = 0 n_valid = 0
if valid_sets is not None: if valid_sets is not None:
for valid in valid_sets: for valid in valid_sets:
...@@ -596,36 +605,364 @@ class Booster(object): ...@@ -596,36 +605,364 @@ class Booster(object):
raise TypeError('valid data should be Dataset instance, met{}'.format(type(valid).__name__)) raise TypeError('valid data should be Dataset instance, met{}'.format(type(valid).__name__))
valid_handles = c_array(ctypes.c_void_p, [valid.handle for valid in valid_sets]) valid_handles = c_array(ctypes.c_void_p, [valid.handle for valid in valid_sets])
if name_valid_sets is None: if name_valid_sets is None:
name_valid_sets = ["valid_{}".format(x) for x in range(len(valid_sets)) ] name_valid_sets = ["valid_{}".format(x+1) for x in range(len(valid_sets)) ]
if len(valid_sets) != len(name_valid_sets): if len(valid_sets) != len(name_valid_sets):
raise Exception('len of valid_sets should be equal with len of name_valid_sets') raise Exception('len of valid_sets should be equal with len of name_valid_sets')
valid_cnames = c_array(ctypes.c_char_p, [c_str(x) for x in name_valid_sets])
n_valid = len(valid_sets) n_valid = len(valid_sets)
ref_input_model = None ref_input_model = None
params_str = dict_to_str(params) params_str = dict_to_str(params)
if model_file is not None: if model_file is not None:
ref_input_model = c_str(model_file) ref_input_model = c_str(model_file)
"""construct booster object""" """construct booster object"""
_safe_call(LIB.LGBM_BoosterCreate( _safe_call(_LIB.LGBM_BoosterCreate(
train_set.handle, train_set.handle,
valid_handles, valid_handles,
valid_cnames,
n_valid, n_valid,
params_str, c_str(params_str),
ref_input_model, ref_input_model,
ctypes.byref(self.handle))) ctypes.byref(self.handle)))
"""if need to continue train""" """if need to continue train"""
if model_file is not None: if model_file is not None:
self.init_continue_train(train_set) self.__init_continue_train(train_set)
if valid_sets is not None: if valid_sets is not None:
for valid in valid_sets: for valid in valid_sets:
self.init_continue_train(valid) self.__init_continue_train(valid)
"""save reference to data"""
self.train_set = train_set
self.valid_sets = valid_sets
self.name_valid_sets = name_valid_sets
self.__num_dataset = 1 + n_valid
self.__training_score = None
out_len = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterGetNumClasses(
self.handle,
ctypes.byref(out_len)))
self.__num_class = out_len.value
"""buffer for inner predict"""
self.__inner_predict_buffer = [None for _ in range(self.__num_dataset)]
"""Get num of inner evals"""
_safe_call(_LIB.LGBM_BoosterGetEvalCounts(
self.handle,
ctypes.byref(out_len)))
self.__num_inner_eval = out_len.value
if self.__num_inner_eval > 0:
"""Get name of evals"""
string_buffers = [ctypes.create_string_buffer(255) for i in range(self.__num_inner_eval)]
ptr_string_buffers = (ctypes.c_char_p*self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
_safe_call(_LIB.LGBM_BoosterGetEvalNames(
self.handle,
ctypes.byref(out_len),
ptr_string_buffers))
if self.__num_inner_eval != out_len.value:
raise ValueError("size of eval names doesn't equal with num_evals")
self.__name_inner_eval = []
for i in range(self.__num_inner_eval):
self.__name_inner_eval.append(string_buffers[i].value.decode())
elif model_file is not None: elif model_file is not None:
_safe_call(_LIB.LGBM_BoosterCreateFromModelfile(c_str(model_file), ctypes.byref(self.handle))) """Prediction task"""
out_num_total_model = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
c_str(model_file),
ctypes.byref(out_num_total_model),
ctypes.byref(self.handle)))
self.__num_total_model = out_num_total_model.value
out_len = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterGetNumClasses(
self.handle,
ctypes.byref(out_len)))
self.__num_class = out_len.value
else: else:
raise TypeError('At least need training dataset or model file to create booster instance') raise TypeError('At least need training dataset or model file to create booster instance')
def __del__(self): def __del__(self):
_LIB.LGBM_BoosterFree(self.handle) _safe_call(_LIB.LGBM_BoosterFree(self.handle))
def update(self, fobj=None):
"""
Update for one iteration
Note: for multi-class task, the score is group by class_id first, then group by row_id
if you want to get i-th row score in j-th class, the access way is score[j*num_data+i]
and you should group grad and hess in this way as well
Parameters
----------
fobj : function
Customized objective function.
Returns
-------
is_finished, bool
"""
is_finished = ctypes.c_int(0)
if fobj is None:
_safe_call(_LIB.LGBM_BoosterUpdateOneIter(
self.handle,
ctypes.byref(is_finished)))
return is_finished.value == 1
else:
grad, hess = fobj(self.__inner_predict(0), self.train_set)
return self.boost(grad, hess)
def boost(self, grad, hess):
"""
Boost the booster for one iteration, with customized gradient statistics.
Note: for multi-class task, the score is group by class_id first, then group by row_id
if you want to get i-th row score in j-th class, the access way is score[j*num_data+i]
and you should group grad and hess in this way as well
Parameters
----------
grad : 1d numpy with dtype=float32
The first order of gradient.
hess : 1d numpy with dtype=float32
The second order of gradient.
Returns
-------
is_finished, bool
"""
if not is_numpy_1d_array(grad) and not is_numpy_1d_array(hess):
raise TypeError('type of grad / hess should be 1d numpy object')
if not grad.dtype == np.float32 and not hess.dtype == np.float32:
raise TypeError('type of grad / hess should be np.float32')
if len(grad) != len(hess):
raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
is_finished = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
self.handle,
grad.ctypes.data_as(ctypes.c_float),
hess.ctypes.data_as(ctypes.c_float),
ctypes.byref(is_finished)))
return is_finished.value == 1
def eval_train(self, feval=None):
"""Evaluate for training data
Parameters
----------
feval : function
Custom evaluation function.
Returns
-------
result: str
Evaluation result string.
"""
return self.__inner_eval("training", 0, feval)
def eval_valid(self, feval=None):
"""Evaluate for validation data
Parameters
----------
feval : function
Custom evaluation function.
Returns
-------
result: str
Evaluation result string.
"""
ret = []
for i in range(1, self.__num_dataset):
ret.append(self.__inner_eval(self.name_valid_sets[i-1], i, feval))
return '\n'.join(ret)
def save_model(self, filename, num_iteration=-1):
_safe_call(_LIB.LGBM_BoosterSaveModel(
self.handle,
num_iteration,
c_str(filename)))
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
if isinstance(data, Dataset):
raise TypeError("cannot use Dataset instance for prediction, please use raw data instead")
predict_type = C_API_PREDICT_NORMAL
if raw_score:
predict_type = cC_API_PREDICT_RAW_SCORE
if pred_leaf:
predict_type = C_API_PREDICT_LEAF_INDEX
int_data_has_header = 0
if data_has_header:
int_data_has_header = 1
if is_str(data):
tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name
_safe_call(_LIB.LGBM_BoosterPredictForFile(
self.handle,
c_str(data),
int_data_has_header,
predict_type,
num_iteration,
c_str(tmp_pred_fname)))
lines = open(tmp_pred_fname,"r").readlines()
nrow = len(lines)
preds = []
for line in lines:
for token in line.split('\t'):
preds.append(float(token))
preds = np.array(preds, copy=False)
os.remove(tmp_pred_fname)
elif isinstance(data, scipy.sparse.csr_matrix):
preds, nrow = self.__pred_for_csr(data, num_iteration, predict_type)
elif isinstance(data, np.ndarray):
preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
else:
try:
csr = scipy.sparse.csr_matrix(data)
res = self.__pred_for_csr(csr, num_iteration, predict_type)
except:
raise TypeError('can not predict data for type {}'.format(type(data).__name__))
if pred_leaf:
preds = preds.astype(np.int32)
if preds.size != nrow and is_reshape:
if preds.size % nrow == 0:
ncol = int(preds.size / nrow)
preds = preds.reshape(nrow, ncol)
else:
raise ValueError('len of predict result(%d) cannot be divide nrow(%d)' %(preds.size, nrow) )
return preds
def __pred_for_np2d(self, mat, num_iteration, predict_type):
"""
Predict for a 2-D numpy matrix.
"""
if len(mat.shape) != 2:
raise ValueError('Input numpy.ndarray must be 2 dimensional')
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
"""change non-float data to float data, need to copy"""
data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data = c_float_array(data)
n_preds = self.__num_class * mat.shape[0]
if predict_type == C_API_PREDICT_LEAF_INDEX:
if num_iteration > 0:
n_preds *= num_iteration
else:
used_iteration = self.__num_total_model / self.__num_class
n_preds *= used_iteration
preds = np.zeros(n_preds, dtype=np.float32)
out_num_preds = ctypes.c_int64(0)
_safe_call(LIB.LGBM_BoosterPredictForMat(
self.handle,
ptr_data,
type_ptr_data,
mat.shape[0],
mat.shape[1],
C_API_IS_ROW_MAJOR,
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
))
if n_preds != out_num_preds.value:
raise ValueError("incorrect number for predict result")
return preds, mat.shape[0]
def __pred_for_csr(self, csr, num_iteration, predict_type):
"""
Predict for a csr data
"""
nrow = len(csr.indptr) - 1
n_preds = self.__num_class * nrow
if predict_type == C_API_PREDICT_LEAF_INDEX:
if num_iteration > 0:
n_preds *= num_iteration
else:
used_iteration = self.__num_total_model / self.__num_class
n_preds *= used_iteration
preds = np.zeros(n_preds, dtype=np.float32)
out_num_preds = ctypes.c_int64(0)
ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
ptr_data, type_ptr_data = c_float_array(csr.data)
_safe_call(LIB.LGBM_BoosterPredictForCSR(
self.handle,
ptr_indptr,
type_ptr_indptr,
csr.indices.ctypes.data_as(ctypes.c_int32),
ptr_data,
type_ptr_data,
len(csr.indptr),
len(csr.data),
csr.shape[1],
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
))
if n_preds != out_num_preds.value:
raise ValueError("incorrect number for predict result")
return preds, nrow
def __inner_eval(self, data_name, data_idx, feval=None):
if data_idx >= self.__num_dataset:
raise ValueError("data_idx should be smaller than number of dataset")
ret = []
if self.__num_inner_eval > 0:
result = np.array([0.0 for _ in range(self.__num_inner_eval)], dtype=np.float32)
out_len = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterGetEval(
self.handle,
data_idx,
ctypes.byref(out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
if out_len.value != self.__num_inner_eval:
raise ValueError("incorrect number of eval results")
for i in range(self.__num_inner_eval):
ret.append('%s %s : %f' %(data_name, self.__name_inner_eval[i], result[i]))
if feval is not None:
if data_idx == 0:
cur_data = self.train_set
else:
cur_data = self.valid_sets[data_idx - 1]
feval_ret = feval(self.__inner_predict(data_idx), cur_data)
if isinstance(feval_ret, list):
for name, val in feval_ret:
ret.append('%s %s : %f' % (data_name, name, val))
else:
name, val = feval_ret
ret.append('%s %s : %f' % (data_name, name, val))
return '\t'.join(ret)
def __inner_predict(self, data_idx):
if data_idx >= self.__num_dataset:
raise ValueError("data_idx should be smaller than number of dataset")
if self.__inner_predict_buffer[data_idx] is None:
if data_idx == 0:
num_data = self.train_set.num_data() * self.__num_class
else:
num_data = self.valid_sets[data_idx - 1].num_data() * self.__num_class
self.__inner_predict_buffer[data_idx] = \
np.array([0.0 for _ in range(num_data)], dtype=np.float32, copy=False)
out_len = ctypes.c_int64(0)
data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_float))
_safe_call(_LIB.LGBM_BoosterGetPredict(
self.handle,
data_idx,
ctypes.byref(out_len),
data_ptr))
if out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError("incorrect number of predict results for data %d" %(data_idx) )
return self.__inner_predict_buffer[data_idx]
def __init_continue_train(self, dataset):
if dataset.raw_data is None:
raise ValueError("should set is_continue_train=True in dataset while need to continue train")
init_score = self.predict(dataset.raw_data, raw_score=True,data_has_header=dataset.data_has_header, is_reshape=False)
dataset.set_init_score(init_score)
dataset.free_raw_data()
#tmp test
# Quick manual smoke test: train 100 iterations on the bundled binary
# classification example and print evaluation results and predictions.
train_data = Dataset('../../examples/binary_classification/binary.train')
test_data = Dataset('../../examples/binary_classification/binary.test', reference=train_data)
param = {"metric": "l2,l1"}
lgb = Booster(train_set=train_data, valid_sets=[test_data], params=param)
for _ in range(100):
    lgb.update()
    print(lgb.eval_valid())
    print(lgb.eval_train())
print(lgb.predict('../../examples/binary_classification/binary.train'))
\ No newline at end of file
...@@ -108,7 +108,7 @@ void Application::LoadData() { ...@@ -108,7 +108,7 @@ void Application::LoadData() {
// prediction is needed if using input initial model(continued train) // prediction is needed if using input initial model(continued train)
PredictFunction predict_fun = nullptr; PredictFunction predict_fun = nullptr;
// need to continue training // need to continue training
if (boosting_->NumberOfSubModels() > 0) { if (boosting_->NumberOfTotalModel() > 0) {
Predictor predictor(boosting_.get(), true, false); Predictor predictor(boosting_.get(), true, false);
predict_fun = predictor.GetPredictFunction(); predict_fun = predictor.GetPredictFunction();
} }
...@@ -235,7 +235,7 @@ void Application::Train() { ...@@ -235,7 +235,7 @@ void Application::Train() {
void Application::Predict() { void Application::Predict() {
boosting_->SetNumUsedModel(config_.io_config.num_model_predict); boosting_->SetNumIterationForPred(config_.io_config.num_iteration_predict);
// create predictor // create predictor
Predictor predictor(boosting_.get(), config_.io_config.is_predict_raw_score, Predictor predictor(boosting_.get(), config_.io_config.is_predict_raw_score,
config_.io_config.is_predict_leaf_index); config_.io_config.is_predict_leaf_index);
......
...@@ -43,6 +43,7 @@ public: ...@@ -43,6 +43,7 @@ public:
* \brief one training iteration * \brief one training iteration
*/ */
bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override { bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) override {
is_update_score_cur_iter_ = false;
GBDT::TrainOneIter(gradient, hessian, false); GBDT::TrainOneIter(gradient, hessian, false);
// normalize // normalize
Normalize(); Normalize();
...@@ -58,20 +59,24 @@ public: ...@@ -58,20 +59,24 @@ public:
* \return training score * \return training score
*/ */
const score_t* GetTrainingScore(data_size_t* out_len) override { const score_t* GetTrainingScore(data_size_t* out_len) override {
if (!is_update_score_cur_iter_) {
// only drop one time in one iteration
DroppingTrees(); DroppingTrees();
is_update_score_cur_iter_ = true;
}
*out_len = train_score_updater_->num_data() * num_class_; *out_len = train_score_updater_->num_data() * num_class_;
return train_score_updater_->score(); return train_score_updater_->score();
} }
/*! /*!
* \brief save model to file * \brief save model to file
* \param num_used_model number of model that want to save, -1 means save all * \param num_iteration -1 means save all
* \param is_finish is training finished or not * \param is_finish is training finished or not
* \param filename filename that want to save to * \param filename filename that want to save to
*/ */
void SaveModelToFile(int num_used_model, bool is_finish, const char* filename) override { void SaveModelToFile(int num_iteration, bool is_finish, const char* filename) override {
// only save model once when is_finish = true // only save model once when is_finish = true
if (is_finish && saved_model_size_ < 0) { if (is_finish && saved_model_size_ < 0) {
GBDT::SaveModelToFile(num_used_model, is_finish, filename); GBDT::SaveModelToFile(num_iteration, is_finish, filename);
} }
} }
/*! /*!
...@@ -133,6 +138,8 @@ private: ...@@ -133,6 +138,8 @@ private:
double drop_rate_; double drop_rate_;
/*! \brief Random generator, used to select dropping trees */ /*! \brief Random generator, used to select dropping trees */
Random random_for_drop_; Random random_for_drop_;
/*! \brief Flag that the score is update on current iter or not*/
bool is_update_score_cur_iter_;
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
namespace LightGBM { namespace LightGBM {
GBDT::GBDT() : saved_model_size_(-1), num_used_model_(0) { GBDT::GBDT() : saved_model_size_(-1), num_iteration_for_pred_(0) {
} }
...@@ -29,7 +29,7 @@ void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const O ...@@ -29,7 +29,7 @@ void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const O
gbdt_config_ = config; gbdt_config_ = config;
iter_ = 0; iter_ = 0;
saved_model_size_ = -1; saved_model_size_ = -1;
num_used_model_ = 0; num_iteration_for_pred_ = 0;
max_feature_idx_ = 0; max_feature_idx_ = 0;
early_stopping_round_ = gbdt_config_->early_stopping_round; early_stopping_round_ = gbdt_config_->early_stopping_round;
shrinkage_rate_ = gbdt_config_->learning_rate; shrinkage_rate_ = gbdt_config_->learning_rate;
...@@ -296,24 +296,23 @@ const score_t* GBDT::GetTrainingScore(data_size_t* out_len) { ...@@ -296,24 +296,23 @@ const score_t* GBDT::GetTrainingScore(data_size_t* out_len) {
return train_score_updater_->score(); return train_score_updater_->score();
} }
void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) const { void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) {
CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size())); CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size()));
std::vector<double> ret; std::vector<double> ret;
const score_t* raw_scores = nullptr; const score_t* raw_scores = nullptr;
data_size_t num_data = 0; data_size_t num_data = 0;
if (data_idx == 0) { if (data_idx == 0) {
raw_scores = train_score_updater_->score(); raw_scores = GetTrainingScore(out_len);
num_data = train_score_updater_->num_data(); num_data = train_score_updater_->num_data();
} else { } else {
auto used_idx = data_idx - 1; auto used_idx = data_idx - 1;
raw_scores = valid_score_updater_[used_idx]->score(); raw_scores = valid_score_updater_[used_idx]->score();
num_data = valid_score_updater_[used_idx]->num_data(); num_data = valid_score_updater_[used_idx]->num_data();
}
*out_len = num_data * num_class_; *out_len = num_data * num_class_;
}
if (num_class_ > 1) { if (num_class_ > 1) {
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
std::vector<double> tmp_result; std::vector<double> tmp_result;
for (int j = 0; j < num_class_; ++j) { for (int j = 0; j < num_class_; ++j) {
...@@ -325,12 +324,12 @@ void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) ...@@ -325,12 +324,12 @@ void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len)
} }
} }
} else if(sigmoid_ > 0.0f){ } else if(sigmoid_ > 0.0f){
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
out_result[i] = static_cast<score_t>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i]))); out_result[i] = static_cast<score_t>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i])));
} }
} else { } else {
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
out_result[i] = raw_scores[i]; out_result[i] = raw_scores[i];
} }
...@@ -348,7 +347,7 @@ void GBDT::Boosting() { ...@@ -348,7 +347,7 @@ void GBDT::Boosting() {
GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data()); GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
} }
void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filename) { void GBDT::SaveModelToFile(int num_iteration, bool is_finish, const char* filename) {
// first time to this function, open file // first time to this function, open file
if (saved_model_size_ < 0) { if (saved_model_size_ < 0) {
model_output_file_.open(filename); model_output_file_.open(filename);
...@@ -373,10 +372,11 @@ void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filen ...@@ -373,10 +372,11 @@ void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filen
if (!model_output_file_.is_open()) { if (!model_output_file_.is_open()) {
return; return;
} }
if (num_used_model == NO_LIMIT) { int num_used_model = 0;
if (num_iteration == NO_LIMIT) {
num_used_model = static_cast<int>(models_.size()); num_used_model = static_cast<int>(models_.size());
} else { } else {
num_used_model = num_used_model * num_class_; num_used_model = num_iteration * num_class_;
} }
int rest = num_used_model - early_stopping_round_ * num_class_; int rest = num_used_model - early_stopping_round_ * num_class_;
// output tree models // output tree models
...@@ -452,7 +452,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) { ...@@ -452,7 +452,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
} }
} }
Log::Info("Finished loading %d models", models_.size()); Log::Info("Finished loading %d models", models_.size());
num_used_model_ = static_cast<int>(models_.size()) / num_class_; num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
} }
std::string GBDT::FeatureImportance() const { std::string GBDT::FeatureImportance() const {
...@@ -486,7 +486,7 @@ std::string GBDT::FeatureImportance() const { ...@@ -486,7 +486,7 @@ std::string GBDT::FeatureImportance() const {
std::vector<double> GBDT::PredictRaw(const double* value) const { std::vector<double> GBDT::PredictRaw(const double* value) const {
std::vector<double> ret(num_class_, 0.0f); std::vector<double> ret(num_class_, 0.0f);
for (int i = 0; i < num_used_model_; ++i) { for (int i = 0; i < num_iteration_for_pred_; ++i) {
for (int j = 0; j < num_class_; ++j) { for (int j = 0; j < num_class_; ++j) {
ret[j] += models_[i * num_class_ + j]->Predict(value); ret[j] += models_[i * num_class_ + j]->Predict(value);
} }
...@@ -496,7 +496,7 @@ std::vector<double> GBDT::PredictRaw(const double* value) const { ...@@ -496,7 +496,7 @@ std::vector<double> GBDT::PredictRaw(const double* value) const {
std::vector<double> GBDT::Predict(const double* value) const { std::vector<double> GBDT::Predict(const double* value) const {
std::vector<double> ret(num_class_, 0.0f); std::vector<double> ret(num_class_, 0.0f);
for (int i = 0; i < num_used_model_; ++i) { for (int i = 0; i < num_iteration_for_pred_; ++i) {
for (int j = 0; j < num_class_; ++j) { for (int j = 0; j < num_class_; ++j) {
ret[j] += models_[i * num_class_ + j]->Predict(value); ret[j] += models_[i * num_class_ + j]->Predict(value);
} }
...@@ -512,7 +512,7 @@ std::vector<double> GBDT::Predict(const double* value) const { ...@@ -512,7 +512,7 @@ std::vector<double> GBDT::Predict(const double* value) const {
std::vector<int> GBDT::PredictLeafIndex(const double* value) const { std::vector<int> GBDT::PredictLeafIndex(const double* value) const {
std::vector<int> ret; std::vector<int> ret;
for (int i = 0; i < num_used_model_; ++i) { for (int i = 0; i < num_iteration_for_pred_; ++i) {
for (int j = 0; j < num_class_; ++j) { for (int j = 0; j < num_class_; ++j) {
ret.push_back(models_[i * num_class_ + j]->PredictLeafIndex(value)); ret.push_back(models_[i * num_class_ + j]->PredictLeafIndex(value));
} }
......
...@@ -73,7 +73,7 @@ public: ...@@ -73,7 +73,7 @@ public:
* \param result used to store prediction result, should allocate memory before call this function * \param result used to store prediction result, should allocate memory before call this function
* \param out_len lenght of returned score * \param out_len lenght of returned score
*/ */
void GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) const override; void GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) override;
/*! /*!
* \brief Predtion for one record without sigmoid transformation * \brief Predtion for one record without sigmoid transformation
...@@ -98,11 +98,11 @@ public: ...@@ -98,11 +98,11 @@ public:
/*! /*!
* \brief save model to file * \brief save model to file
* \param num_used_model number of model that want to save, -1 means save all * \param num_iteration -1 means save all
* \param is_finish is training finished or not * \param is_finish is training finished or not
* \param filename filename that want to save to * \param filename filename that want to save to
*/ */
virtual void SaveModelToFile(int num_used_model, bool is_finish, const char* filename) override; virtual void SaveModelToFile(int num_iteration, bool is_finish, const char* filename) override;
/*! /*!
* \brief Restore from a serialized string * \brief Restore from a serialized string
*/ */
...@@ -119,11 +119,12 @@ public: ...@@ -119,11 +119,12 @@ public:
*/ */
inline int LabelIdx() const override { return label_idx_; } inline int LabelIdx() const override { return label_idx_; }
/*! /*!
* \brief Get number of weak sub-models * \brief Get number of weak sub-models
* \return Number of weak sub-models * \return Number of weak sub-models
*/ */
inline int NumberOfSubModels() const override { return static_cast<int>(models_.size()); } inline int NumberOfTotalModel() const override { return static_cast<int>(models_.size()); }
/*! /*!
* \brief Get number of classes * \brief Get number of classes
...@@ -132,11 +133,13 @@ public: ...@@ -132,11 +133,13 @@ public:
inline int NumberOfClasses() const override { return num_class_; } inline int NumberOfClasses() const override { return num_class_; }
/*! /*!
* \brief Set number of used model for prediction * \brief Set number of iterations for prediction
*/ */
inline void SetNumUsedModel(int num_used_model) { inline void SetNumIterationForPred(int num_iteration) override {
if (num_used_model >= 0) { if (num_iteration > 0) {
num_used_model_ = static_cast<int>(num_used_model / num_class_); num_iteration_for_pred_ = num_iteration;
} else {
num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
} }
} }
...@@ -236,7 +239,7 @@ protected: ...@@ -236,7 +239,7 @@ protected:
/*! \brief File to write models */ /*! \brief File to write models */
std::ofstream model_output_file_; std::ofstream model_output_file_;
/*! \brief number of used model */ /*! \brief number of used model */
int num_used_model_; int num_iteration_for_pred_;
/*! \brief Shrinkage rate for one iteration */ /*! \brief Shrinkage rate for one iteration */
double shrinkage_rate_; double shrinkage_rate_;
}; };
......
...@@ -95,8 +95,8 @@ public: ...@@ -95,8 +95,8 @@ public:
return boosting_->TrainOneIter(gradients, hessians, false); return boosting_->TrainOneIter(gradients, hessians, false);
} }
void PrepareForPrediction(int num_used_model, int predict_type) { void PrepareForPrediction(int num_iteration, int predict_type) {
boosting_->SetNumUsedModel(num_used_model); boosting_->SetNumIterationForPred(num_iteration);
bool is_predict_leaf = false; bool is_predict_leaf = false;
bool is_raw_score = false; bool is_raw_score = false;
if (predict_type == C_API_PREDICT_LEAF_INDEX) { if (predict_type == C_API_PREDICT_LEAF_INDEX) {
...@@ -109,6 +109,10 @@ public: ...@@ -109,6 +109,10 @@ public:
predictor_.reset(new Predictor(boosting_.get(), is_raw_score, is_predict_leaf)); predictor_.reset(new Predictor(boosting_.get(), is_raw_score, is_predict_leaf));
} }
void GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) {
boosting_->GetPredictAt(data_idx, out_result, out_len);
}
std::vector<double> Predict(const std::vector<std::pair<int, double>>& features) { std::vector<double> Predict(const std::vector<std::pair<int, double>>& features) {
return predictor_->GetPredictFunction()(features); return predictor_->GetPredictFunction()(features);
} }
...@@ -117,8 +121,8 @@ public: ...@@ -117,8 +121,8 @@ public:
predictor_->Predict(data_filename, result_filename, data_has_header); predictor_->Predict(data_filename, result_filename, data_has_header);
} }
void SaveModelToFile(int num_used_model, const char* filename) { void SaveModelToFile(int num_iteration, const char* filename) {
boosting_->SaveModelToFile(num_used_model, true, filename); boosting_->SaveModelToFile(num_iteration, true, filename);
} }
int GetEvalCounts() const { int GetEvalCounts() const {
...@@ -129,11 +133,18 @@ public: ...@@ -129,11 +133,18 @@ public:
return ret; return ret;
} }
int GetEvalNames(const char*** out_strs) const { int GetEvalNames(char** out_strs) const {
int idx = 0; int idx = 0;
for (const auto& metric : train_metric_) { for (const auto& metric : train_metric_) {
for (const auto& name : metric->GetName()) { for (const auto& name : metric->GetName()) {
*(out_strs[idx++]) = name.c_str(); int j = 0;
auto name_cstr = name.c_str();
while (name_cstr[j] != '\0') {
out_strs[idx][j] = name_cstr[j];
++j;
}
out_strs[idx][j] = '\0';
++idx;
} }
} }
return idx; return idx;
...@@ -141,10 +152,6 @@ public: ...@@ -141,10 +152,6 @@ public:
const Boosting* GetBoosting() const { return boosting_.get(); } const Boosting* GetBoosting() const { return boosting_.get(); }
const float* GetTrainingScore(int* out_len) const { return boosting_->GetTrainingScore(out_len); }
const inline int NumberOfClasses() const { return boosting_->NumberOfClasses(); }
private: private:
std::unique_ptr<Boosting> boosting_; std::unique_ptr<Boosting> boosting_;
...@@ -449,9 +456,12 @@ DllExport int LGBM_BoosterCreate(const DatesetHandle train_data, ...@@ -449,9 +456,12 @@ DllExport int LGBM_BoosterCreate(const DatesetHandle train_data,
DllExport int LGBM_BoosterCreateFromModelfile( DllExport int LGBM_BoosterCreateFromModelfile(
const char* filename, const char* filename,
int64_t* num_total_model,
BoosterHandle* out) { BoosterHandle* out) {
API_BEGIN(); API_BEGIN();
*out = new Booster(filename); auto ret = std::unique_ptr<Booster>(new Booster(filename));
*num_total_model = static_cast<int64_t>(ret->GetBoosting()->NumberOfTotalModel());
*out = ret.release();
API_END(); API_END();
} }
...@@ -461,6 +471,13 @@ DllExport int LGBM_BoosterFree(BoosterHandle handle) { ...@@ -461,6 +471,13 @@ DllExport int LGBM_BoosterFree(BoosterHandle handle) {
API_END(); API_END();
} }
DllExport int LGBM_BoosterGetNumClasses(BoosterHandle handle, int64_t* out_len) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
*out_len = ref_booster->GetBoosting()->NumberOfClasses();
API_END();
}
DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished) { DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
...@@ -501,7 +518,7 @@ DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len) ...@@ -501,7 +518,7 @@ DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len)
* \brief Get number of eval * \brief Get number of eval
* \return total number of eval result * \return total number of eval result
*/ */
DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, const char*** out_strs) { DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, char** out_strs) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
*out_len = ref_booster->GetEvalNames(out_strs); *out_len = ref_booster->GetEvalNames(out_strs);
...@@ -524,39 +541,27 @@ DllExport int LGBM_BoosterGetEval(BoosterHandle handle, ...@@ -524,39 +541,27 @@ DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
API_END(); API_END();
} }
DllExport int LGBM_BoosterGetTrainingScore(BoosterHandle handle,
int64_t* out_len,
const float** out_result) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
int len = 0;
*out_result = ref_booster->GetTrainingScore(&len);
*out_len = static_cast<int64_t>(len);
API_END();
}
DllExport int LGBM_BoosterGetPredict(BoosterHandle handle, DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
int data, int data,
int64_t* out_len, int64_t* out_len,
float* out_result) { float* out_result) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
auto boosting = ref_booster->GetBoosting();
int len = 0; int len = 0;
boosting->GetPredictAt(data, out_result, &len); ref_booster->GetPredictAt(data, out_result, &len);
*out_len = static_cast<int64_t>(len); *out_len = static_cast<int64_t>(len);
API_END(); API_END();
} }
DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle, DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle,
int predict_type,
int64_t n_used_trees,
int data_has_header,
const char* data_filename, const char* data_filename,
int data_has_header,
int predict_type,
int64_t num_iteration,
const char* result_filename) { const char* result_filename) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->PrepareForPrediction(static_cast<int>(n_used_trees), predict_type); ref_booster->PrepareForPrediction(static_cast<int>(num_iteration), predict_type);
bool bool_data_has_header = data_has_header > 0 ? true : false; bool bool_data_has_header = data_has_header > 0 ? true : false;
ref_booster->PredictForFile(data_filename, result_filename, bool_data_has_header); ref_booster->PredictForFile(data_filename, result_filename, bool_data_has_header);
API_END(); API_END();
...@@ -572,23 +577,32 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle, ...@@ -572,23 +577,32 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int64_t nelem, int64_t nelem,
int64_t, int64_t,
int predict_type, int predict_type,
int64_t n_used_trees, int64_t num_iteration,
double* out_result) { int64_t* out_len,
float* out_result) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->PrepareForPrediction(static_cast<int>(n_used_trees), predict_type); ref_booster->PrepareForPrediction(static_cast<int>(num_iteration), predict_type);
auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
int num_class = ref_booster->NumberOfClasses(); int num_preb_in_one_row = ref_booster->GetBoosting()->NumberOfClasses();
if (predict_type == C_API_PREDICT_LEAF_INDEX) {
if (num_iteration > 0) {
num_preb_in_one_row *= static_cast<int>(num_iteration);
} else {
num_preb_in_one_row *= ref_booster->GetBoosting()->NumberOfTotalModel() / num_preb_in_one_row;
}
}
int nrow = static_cast<int>(nindptr - 1); int nrow = static_cast<int>(nindptr - 1);
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < nrow; ++i) { for (int i = 0; i < nrow; ++i) {
auto one_row = get_row_fun(i); auto one_row = get_row_fun(i);
auto predicton_result = ref_booster->Predict(one_row); auto predicton_result = ref_booster->Predict(one_row);
for (int j = 0; j < num_class; ++j) { for (int j = 0; j < static_cast<int>(predicton_result.size()); ++j) {
out_result[i * num_class + j] = predicton_result[j]; out_result[i * num_preb_in_one_row + j] = static_cast<float>(predicton_result[j]);
} }
} }
*out_len = nrow * num_preb_in_one_row;
API_END(); API_END();
} }
...@@ -599,31 +613,40 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle, ...@@ -599,31 +613,40 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
int32_t ncol, int32_t ncol,
int is_row_major, int is_row_major,
int predict_type, int predict_type,
int64_t n_used_trees, int64_t num_iteration,
double* out_result) { int64_t* out_len,
float* out_result) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->PrepareForPrediction(static_cast<int>(n_used_trees), predict_type); ref_booster->PrepareForPrediction(static_cast<int>(num_iteration), predict_type);
auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major); auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
int num_class = ref_booster->NumberOfClasses(); int num_preb_in_one_row = ref_booster->GetBoosting()->NumberOfClasses();
if (predict_type == C_API_PREDICT_LEAF_INDEX) {
if (num_iteration > 0) {
num_preb_in_one_row *= static_cast<int>(num_iteration);
} else {
num_preb_in_one_row *= ref_booster->GetBoosting()->NumberOfTotalModel() / num_preb_in_one_row;
}
}
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < nrow; ++i) { for (int i = 0; i < nrow; ++i) {
auto one_row = get_row_fun(i); auto one_row = get_row_fun(i);
auto predicton_result = ref_booster->Predict(one_row); auto predicton_result = ref_booster->Predict(one_row);
for (int j = 0; j < num_class; ++j) { for (int j = 0; j < static_cast<int>(predicton_result.size()); ++j) {
out_result[i * num_class + j] = predicton_result[j]; out_result[i * num_preb_in_one_row + j] = static_cast<float>(predicton_result[j]);
} }
} }
*out_len = nrow * num_preb_in_one_row;
API_END(); API_END();
} }
DllExport int LGBM_BoosterSaveModel(BoosterHandle handle, DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
int num_used_model, int num_iteration,
const char* filename) { const char* filename) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->SaveModelToFile(num_used_model, filename); ref_booster->SaveModelToFile(num_iteration, filename);
API_END(); API_END();
} }
......
...@@ -183,7 +183,7 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) { ...@@ -183,7 +183,7 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetInt(params, "data_random_seed", &data_random_seed); GetInt(params, "data_random_seed", &data_random_seed);
GetString(params, "data", &data_filename); GetString(params, "data", &data_filename);
GetInt(params, "verbose", &verbosity); GetInt(params, "verbose", &verbosity);
GetInt(params, "num_model_predict", &num_model_predict); GetInt(params, "num_iteration_predict", &num_iteration_predict);
GetInt(params, "bin_construct_sample_cnt", &bin_construct_sample_cnt); GetInt(params, "bin_construct_sample_cnt", &bin_construct_sample_cnt);
GetBool(params, "is_pre_partition", &is_pre_partition); GetBool(params, "is_pre_partition", &is_pre_partition);
GetBool(params, "is_enable_sparse", &is_enable_sparse); GetBool(params, "is_enable_sparse", &is_enable_sparse);
......
...@@ -190,14 +190,16 @@ def test_booster(): ...@@ -190,14 +190,16 @@ def test_booster():
test_free_dataset(train) test_free_dataset(train)
test_free_dataset(test[0]) test_free_dataset(test[0])
booster2 = ctypes.c_void_p() booster2 = ctypes.c_void_p()
LIB.LGBM_BoosterCreateFromModelfile(c_str('model.txt'), ctypes.byref(booster2)) num_total_model = ctypes.c_long()
LIB.LGBM_BoosterCreateFromModelfile(c_str('model.txt'), ctypes.byref(num_total_model), ctypes.byref(booster2))
data = [] data = []
inp = open('../../examples/binary_classification/binary.test', 'r') inp = open('../../examples/binary_classification/binary.test', 'r')
for line in inp.readlines(): for line in inp.readlines():
data.append( [float(x) for x in line.split('\t')[1:]] ) data.append( [float(x) for x in line.split('\t')[1:]] )
inp.close() inp.close()
mat = np.array(data) mat = np.array(data)
preb = np.zeros(( mat.shape[0],1 ), dtype=np.float64) preb = np.zeros(mat.shape[0], dtype=np.float32)
num_preb = ctypes.c_long()
data = np.array(mat.reshape(mat.size), copy=False) data = np.array(mat.reshape(mat.size), copy=False)
LIB.LGBM_BoosterPredictForMat(booster2, LIB.LGBM_BoosterPredictForMat(booster2,
data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)), data.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p)),
...@@ -207,8 +209,9 @@ def test_booster(): ...@@ -207,8 +209,9 @@ def test_booster():
1, 1,
1, 1,
50, 50,
ctypes.byref(num_preb),
preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double))) preb.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
LIB.LGBM_BoosterPredictForFile(booster2, 1, 50, 0, c_str('../../examples/binary_classification/binary.test'), c_str('preb.txt')) LIB.LGBM_BoosterPredictForFile(booster2,c_str('../../examples/binary_classification/binary.test'),0 , 0, 50, c_str('preb.txt'))
LIB.LGBM_BoosterFree(booster2) LIB.LGBM_BoosterFree(booster2)
test_dataset() test_dataset()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment