Commit 72c2d790 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

some refine for c_api (#152)

1. add csc support
2. some data type from float to double
parent bd7274ba
...@@ -96,13 +96,19 @@ public: ...@@ -96,13 +96,19 @@ public:
*/ */
virtual const score_t* GetTrainingScore(int64_t* out_len) = 0; virtual const score_t* GetTrainingScore(int64_t* out_len) = 0;
/*!
* \brief Get the number of prediction values for the dataset selected by data_idx
* \param data_idx 0: training data, 1: 1st validation data
* \return Number of prediction values (presumably the length a caller should allocate before calling GetPredictAt)
*/
virtual int64_t GetNumPredictAt(int data_idx) const = 0;
/*! /*!
* \brief Get prediction result at data_idx data * \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data * \param data_idx 0: training data, 1: 1st validation data
* \param result used to store prediction result, should allocate memory before call this function * \param result used to store prediction result, should allocate memory before call this function
* \param out_len length of returned score * \param out_len length of returned score
*/ */
virtual void GetPredictAt(int data_idx, score_t* result, int64_t* out_len) = 0; virtual void GetPredictAt(int data_idx, double* result, int64_t* out_len) = 0;
/*! /*!
* \brief Prediction for one record, not sigmoid transform * \brief Prediction for one record, not sigmoid transform
......
...@@ -370,7 +370,20 @@ DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, c ...@@ -370,7 +370,20 @@ DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, c
DllExport int LGBM_BoosterGetEval(BoosterHandle handle, DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
int data_idx, int data_idx,
int64_t* out_len, int64_t* out_len,
float* out_results); double* out_results);
/*!
* \brief Get the number of predictions for an inner dataset (training or validation data);
this can be used to support customized eval function
Note: the length is expected to equal num_class * num_data, i.e. the size a caller
should pre-allocate for the result buffer of LGBM_BoosterGetPredict (NOTE(review): confirm against the implementation)
* \param handle handle
* \param data_idx 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len used to return the number of predictions
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetNumPredict(BoosterHandle handle,
int data_idx,
int64_t* out_len);
/*! /*!
* \brief Get prediction for training data and validation data * \brief Get prediction for training data and validation data
...@@ -385,7 +398,7 @@ DllExport int LGBM_BoosterGetEval(BoosterHandle handle, ...@@ -385,7 +398,7 @@ DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
DllExport int LGBM_BoosterGetPredict(BoosterHandle handle, DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
int data_idx, int data_idx,
int64_t* out_len, int64_t* out_len,
float* out_result); double* out_result);
/*! /*!
* \brief make prediction for file * \brief make prediction for file
...@@ -407,6 +420,24 @@ DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle, ...@@ -407,6 +420,24 @@ DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle,
int64_t num_iteration, int64_t num_iteration,
const char* result_filename); const char* result_filename);
/*!
* \brief Get the number of prediction values for a given number of rows and prediction settings
* \param handle handle
* \param num_row number of rows to predict
* \param predict_type
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len used to return the length of the prediction result
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
int64_t num_row,
int predict_type,
int64_t num_iteration,
int64_t* out_len);
/*! /*!
* \brief make prediction for an new data set * \brief make prediction for an new data set
* Note: should pre-allocate memory for out_result, * Note: should pre-allocate memory for out_result,
...@@ -442,7 +473,44 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle, ...@@ -442,7 +473,44 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int predict_type, int predict_type,
int64_t num_iteration, int64_t num_iteration,
int64_t* out_len, int64_t* out_len,
float* out_result); double* out_result);
/*!
* \brief make prediction for a new data set given in CSC (compressed sparse column) format
* Note: should pre-allocate memory for out_result,
* for normal and raw score: its length is equal to num_class * num_data
* for leaf index, its length is equal to num_class * num_data * num_iteration
* \param handle handle
* \param col_ptr pointer to col headers
* \param col_ptr_type type of col_ptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices row index of each stored element (read as int32_t)
* \param data value of each stored element
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param ncol_ptr number of cols in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_row number of rows
* \param predict_type
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len used to return the length of the output result
* \param out_result buffer that receives the predictions, should allocate memory before call this function
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForCSC(BoosterHandle handle,
const void* col_ptr,
int col_ptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t ncol_ptr,
int64_t nelem,
int64_t num_row,
int predict_type,
int64_t num_iteration,
int64_t* out_len,
double* out_result);
/*! /*!
* \brief make prediction for an new data set * \brief make prediction for an new data set
...@@ -473,7 +541,7 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle, ...@@ -473,7 +541,7 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
int predict_type, int predict_type,
int64_t num_iteration, int64_t num_iteration,
int64_t* out_len, int64_t* out_len,
float* out_result); double* out_result);
/*! /*!
* \brief save model into file * \brief save model into file
...@@ -497,7 +565,7 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle, ...@@ -497,7 +565,7 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle, DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
int buffer_len, int buffer_len,
int64_t* out_len, int64_t* out_len,
char** out_str); char* out_str);
/*! /*!
* \brief Get leaf value * \brief Get leaf value
...@@ -510,7 +578,7 @@ DllExport int LGBM_BoosterDumpModel(BoosterHandle handle, ...@@ -510,7 +578,7 @@ DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
DllExport int LGBM_BoosterGetLeafValue(BoosterHandle handle, DllExport int LGBM_BoosterGetLeafValue(BoosterHandle handle,
int tree_idx, int tree_idx,
int leaf_idx, int leaf_idx,
float* out_val); double* out_val);
/*! /*!
* \brief Set leaf value * \brief Set leaf value
...@@ -523,26 +591,7 @@ DllExport int LGBM_BoosterGetLeafValue(BoosterHandle handle, ...@@ -523,26 +591,7 @@ DllExport int LGBM_BoosterGetLeafValue(BoosterHandle handle,
DllExport int LGBM_BoosterSetLeafValue(BoosterHandle handle, DllExport int LGBM_BoosterSetLeafValue(BoosterHandle handle,
int tree_idx, int tree_idx,
int leaf_idx, int leaf_idx,
float val); double val);
// some help functions used to convert data
std::function<std::vector<double>(int row_idx)>
RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major);
std::function<std::vector<std::pair<int, double>>(int row_idx)>
RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major);
std::function<std::vector<std::pair<int, double>>(int idx)>
RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
const void* data, int data_type, int64_t nindptr, int64_t nelem);
std::function<std::vector<std::pair<int, double>>(int idx)>
ColumnFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indices,
const void* data, int data_type, int64_t ncol_ptr, int64_t nelem);
std::vector<double>
SampleFromOneColumn(const std::vector<std::pair<int, double>>& data, const std::vector<int>& indices);
#if defined(_MSC_VER) #if defined(_MSC_VER)
// exception handle and error msg // exception handle and error msg
......
...@@ -324,14 +324,8 @@ public: ...@@ -324,14 +324,8 @@ public:
} }
} }
inline void PushOneColumn(int tid, data_size_t col_idx, const std::vector<std::pair<int, double>>& feature_values) { inline int GetInnerFeatureIndex(int col_idx) const {
if (col_idx >= num_total_features_) { return; } return used_feature_map_[col_idx];
int feature_idx = used_feature_map_[col_idx];
if (feature_idx >= 0) {
for (auto& inner_data : feature_values) {
features_[feature_idx]->PushData(tid, inner_data.first, inner_data.second);
}
}
} }
Dataset* Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const; Dataset* Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const;
...@@ -358,7 +352,7 @@ public: ...@@ -358,7 +352,7 @@ public:
* \param i Index for feature * \param i Index for feature
* \return Pointer of feature * \return Pointer of feature
*/ */
inline const Feature* FeatureAt(int i) const { return features_[i].get(); } inline Feature* FeatureAt(int i) const { return features_[i].get(); }
/*! /*!
* \brief Get meta data pointer * \brief Get meta data pointer
......
...@@ -288,10 +288,13 @@ class _InnerPredictor(object): ...@@ -288,10 +288,13 @@ class _InnerPredictor(object):
lines = tmp_file.readlines() lines = tmp_file.readlines()
nrow = len(lines) nrow = len(lines)
preds = [float(token) for line in lines for token in line.split('\t')] preds = [float(token) for line in lines for token in line.split('\t')]
preds = np.array(preds, dtype=np.float32, copy=False) preds = np.array(preds, dtype=np.float64, copy=False)
elif isinstance(data, scipy.sparse.csr_matrix): elif isinstance(data, scipy.sparse.csr_matrix):
preds, nrow = self.__pred_for_csr(data, num_iteration, preds, nrow = self.__pred_for_csr(data, num_iteration,
predict_type) predict_type)
elif isinstance(data, scipy.sparse.csc_matrix):
preds, nrow = self.__pred_for_csc(data, num_iteration,
predict_type)
elif isinstance(data, np.ndarray): elif isinstance(data, np.ndarray):
preds, nrow = self.__pred_for_np2d(data, num_iteration, preds, nrow = self.__pred_for_np2d(data, num_iteration,
predict_type) predict_type)
...@@ -319,13 +322,14 @@ class _InnerPredictor(object): ...@@ -319,13 +322,14 @@ class _InnerPredictor(object):
""" """
Get size of prediction result Get size of prediction result
""" """
n_preds = self.num_class * nrow n_preds = ctypes.c_int64(0)
if predict_type == C_API_PREDICT_LEAF_INDEX: _safe_call(_LIB.LGBM_BoosterCalcNumPredict(
if num_iteration > 0: self.handle,
n_preds *= min(num_iteration, self.num_total_iteration) nrow,
else: predict_type,
n_preds *= self.num_total_iteration num_iteration,
return n_preds ctypes.byref(n_preds)))
return n_preds.value
def __pred_for_np2d(self, mat, num_iteration, predict_type): def __pred_for_np2d(self, mat, num_iteration, predict_type):
""" """
...@@ -342,7 +346,7 @@ class _InnerPredictor(object): ...@@ -342,7 +346,7 @@ class _InnerPredictor(object):
ptr_data, type_ptr_data = c_float_array(data) ptr_data, type_ptr_data = c_float_array(data)
n_preds = self.__get_num_preds(num_iteration, mat.shape[0], n_preds = self.__get_num_preds(num_iteration, mat.shape[0],
predict_type) predict_type)
preds = np.zeros(n_preds, dtype=np.float32) preds = np.zeros(n_preds, dtype=np.float64)
out_num_preds = ctypes.c_int64(0) out_num_preds = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterPredictForMat( _safe_call(_LIB.LGBM_BoosterPredictForMat(
self.handle, self.handle,
...@@ -354,7 +358,7 @@ class _InnerPredictor(object): ...@@ -354,7 +358,7 @@ class _InnerPredictor(object):
predict_type, predict_type,
num_iteration, num_iteration,
ctypes.byref(out_num_preds), ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
)) ))
if n_preds != out_num_preds.value: if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results") raise ValueError("Wrong length for predict results")
...@@ -366,7 +370,7 @@ class _InnerPredictor(object): ...@@ -366,7 +370,7 @@ class _InnerPredictor(object):
""" """
nrow = len(csr.indptr) - 1 nrow = len(csr.indptr) - 1
n_preds = self.__get_num_preds(num_iteration, nrow, predict_type) n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
preds = np.zeros(n_preds, dtype=np.float32) preds = np.zeros(n_preds, dtype=np.float64)
out_num_preds = ctypes.c_int64(0) out_num_preds = ctypes.c_int64(0)
ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr) ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
...@@ -385,7 +389,38 @@ class _InnerPredictor(object): ...@@ -385,7 +389,38 @@ class _InnerPredictor(object):
predict_type, predict_type,
num_iteration, num_iteration,
ctypes.byref(out_num_preds), ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
))
if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results")
return preds, nrow
def __pred_for_csc(self, csc, num_iteration, predict_type):
"""
Predict for a csc data
"""
nrow = csc.shape[0]
n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
preds = np.zeros(n_preds, dtype=np.float64)
out_num_preds = ctypes.c_int64(0)
ptr_indptr, type_ptr_indptr = c_int_array(csc.indptr)
ptr_data, type_ptr_data = c_float_array(csc.data)
_safe_call(_LIB.LGBM_BoosterPredictForCSC(
self.handle,
ptr_indptr,
type_ptr_indptr,
csc.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ptr_data,
type_ptr_data,
len(csc.indptr),
len(csc.data),
csc.shape[0],
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
)) ))
if n_preds != out_num_preds.value: if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results") raise ValueError("Wrong length for predict results")
...@@ -511,6 +546,8 @@ class _InnerDataset(object): ...@@ -511,6 +546,8 @@ class _InnerDataset(object):
ctypes.byref(self.handle))) ctypes.byref(self.handle)))
elif isinstance(data, scipy.sparse.csr_matrix): elif isinstance(data, scipy.sparse.csr_matrix):
self.__init_from_csr(data, params_str, ref_dataset) self.__init_from_csr(data, params_str, ref_dataset)
elif isinstance(data, scipy.sparse.csc_matrix):
self.__init_from_csc(data, params_str, ref_dataset)
elif isinstance(data, np.ndarray): elif isinstance(data, np.ndarray):
self.__init_from_np2d(data, params_str, ref_dataset) self.__init_from_np2d(data, params_str, ref_dataset)
else: else:
...@@ -541,6 +578,7 @@ class _InnerDataset(object): ...@@ -541,6 +578,7 @@ class _InnerDataset(object):
for j in range(self.predictor.num_class): for j in range(self.predictor.num_class):
new_init_score[j * num_data + i] = init_score[i * self.predictor.num_class + j] new_init_score[j * num_data + i] = init_score[i * self.predictor.num_class + j]
init_score = new_init_score init_score = new_init_score
init_score = init_score.astype(dtype=np.float32, copy=False)
self.set_init_score(init_score) self.set_init_score(init_score)
elif self.predictor is not None: elif self.predictor is not None:
raise TypeError('wrong predictor type {}'.format(type(self.predictor).__name__)) raise TypeError('wrong predictor type {}'.format(type(self.predictor).__name__))
...@@ -655,6 +693,30 @@ class _InnerDataset(object): ...@@ -655,6 +693,30 @@ class _InnerDataset(object):
ref_dataset, ref_dataset,
ctypes.byref(self.handle))) ctypes.byref(self.handle)))
def __init_from_csc(self, csc, params_str, ref_dataset):
    """
    Initialize data from a csc matrix.

    csc is expected to expose indptr / indices / data / shape like
    scipy.sparse.csc_matrix; params_str and ref_dataset are forwarded
    unchanged to LGBM_DatasetCreateFromCSC. On success self.handle holds
    the newly created dataset handle.

    Raises ValueError if indices and data differ in length, or if the row
    count cannot be represented by the int32 row indices the C API reads.
    """
    if len(csc.indices) != len(csc.data):
        raise ValueError('Length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data)))
    # The C API reads `indices` through an int32_t pointer, but scipy may
    # store them with a wider dtype (e.g. int64). Reinterpreting such a
    # buffer as int32 would silently feed garbage row indices to the
    # library, so reject matrices that cannot fit and convert otherwise.
    if csc.shape[0] > np.iinfo(np.int32).max:
        raise ValueError('Too many rows for int32 row indices: {}'.format(csc.shape[0]))
    # No copy is made when the array is already contiguous int32; the local
    # name keeps any converted buffer alive for the duration of the call.
    indices = np.ascontiguousarray(csc.indices, dtype=np.int32)
    self.handle = ctypes.c_void_p()

    ptr_indptr, type_ptr_indptr = c_int_array(csc.indptr)
    ptr_data, type_ptr_data = c_float_array(csc.data)

    _safe_call(_LIB.LGBM_DatasetCreateFromCSC(
        ptr_indptr,
        type_ptr_indptr,
        indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        ptr_data,
        type_ptr_data,
        len(csc.indptr),
        len(csc.data),
        csc.shape[0],
        c_str(params_str),
        ref_dataset,
        ctypes.byref(self.handle)))
def __del__(self): def __del__(self):
_safe_call(_LIB.LGBM_DatasetFree(self.handle)) _safe_call(_LIB.LGBM_DatasetFree(self.handle))
...@@ -1498,7 +1560,7 @@ class Booster(object): ...@@ -1498,7 +1560,7 @@ class Booster(object):
self.handle, self.handle,
buffer_len, buffer_len,
ctypes.byref(tmp_out_len), ctypes.byref(tmp_out_len),
ctypes.byref(ptr_string_buffer))) ptr_string_buffer))
actual_len = tmp_out_len.value actual_len = tmp_out_len.value
'''if buffer length is not long enough, reallocate a buffer''' '''if buffer length is not long enough, reallocate a buffer'''
if actual_len > buffer_len: if actual_len > buffer_len:
...@@ -1577,13 +1639,13 @@ class Booster(object): ...@@ -1577,13 +1639,13 @@ class Booster(object):
self.__get_eval_info() self.__get_eval_info()
ret = [] ret = []
if self.__num_inner_eval > 0: if self.__num_inner_eval > 0:
result = np.array([0.0 for _ in range(self.__num_inner_eval)], dtype=np.float32) result = np.array([0.0 for _ in range(self.__num_inner_eval)], dtype=np.float64)
tmp_out_len = ctypes.c_int64(0) tmp_out_len = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterGetEval( _safe_call(_LIB.LGBM_BoosterGetEval(
self.handle, self.handle,
data_idx, data_idx,
ctypes.byref(tmp_out_len), ctypes.byref(tmp_out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))) result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if tmp_out_len.value != self.__num_inner_eval: if tmp_out_len.value != self.__num_inner_eval:
raise ValueError("Wrong length of eval results") raise ValueError("Wrong length of eval results")
for i in range(self.__num_inner_eval): for i in range(self.__num_inner_eval):
...@@ -1614,11 +1676,11 @@ class Booster(object): ...@@ -1614,11 +1676,11 @@ class Booster(object):
else: else:
n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
self.__inner_predict_buffer[data_idx] = \ self.__inner_predict_buffer[data_idx] = \
np.array([0.0 for _ in range(n_preds)], dtype=np.float32, copy=False) np.array([0.0 for _ in range(n_preds)], dtype=np.float64, copy=False)
"""avoid to predict many time in one iteration""" """avoid to predict many time in one iteration"""
if not self.__is_predicted_cur_iter[data_idx]: if not self.__is_predicted_cur_iter[data_idx]:
tmp_out_len = ctypes.c_int64(0) tmp_out_len = ctypes.c_int64(0)
data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_float)) data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double))
_safe_call(_LIB.LGBM_BoosterGetPredict( _safe_call(_LIB.LGBM_BoosterGetPredict(
self.handle, self.handle,
data_idx, data_idx,
......
...@@ -350,7 +350,7 @@ std::string GBDT::OutputMetric(int iter) { ...@@ -350,7 +350,7 @@ std::string GBDT::OutputMetric(int iter) {
/*! \brief Get eval result */ /*! \brief Get eval result */
std::vector<double> GBDT::GetEvalAt(int data_idx) const { std::vector<double> GBDT::GetEvalAt(int data_idx) const {
CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size())); CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
std::vector<double> ret; std::vector<double> ret;
if (data_idx == 0) { if (data_idx == 0) {
for (auto& sub_metric : training_metrics_) { for (auto& sub_metric : training_metrics_) {
...@@ -378,8 +378,8 @@ const score_t* GBDT::GetTrainingScore(int64_t* out_len) { ...@@ -378,8 +378,8 @@ const score_t* GBDT::GetTrainingScore(int64_t* out_len) {
return train_score_updater_->score(); return train_score_updater_->score();
} }
void GBDT::GetPredictAt(int data_idx, score_t* out_result, int64_t* out_len) { void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size())); CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
const score_t* raw_scores = nullptr; const score_t* raw_scores = nullptr;
data_size_t num_data = 0; data_size_t num_data = 0;
...@@ -401,18 +401,18 @@ void GBDT::GetPredictAt(int data_idx, score_t* out_result, int64_t* out_len) { ...@@ -401,18 +401,18 @@ void GBDT::GetPredictAt(int data_idx, score_t* out_result, int64_t* out_len) {
} }
Common::Softmax(&tmp_result); Common::Softmax(&tmp_result);
for (int j = 0; j < num_class_; ++j) { for (int j = 0; j < num_class_; ++j) {
out_result[j * num_data + i] = static_cast<score_t>(tmp_result[j]); out_result[j * num_data + i] = static_cast<double>(tmp_result[j]);
} }
} }
} else if(sigmoid_ > 0.0f){ } else if(sigmoid_ > 0.0f){
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
out_result[i] = static_cast<score_t>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i]))); out_result[i] = static_cast<double>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i])));
} }
} else { } else {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
out_result[i] = raw_scores[i]; out_result[i] = static_cast<double>(raw_scores[i]);
} }
} }
......
...@@ -107,13 +107,21 @@ public: ...@@ -107,13 +107,21 @@ public:
*/ */
virtual const score_t* GetTrainingScore(int64_t* out_len) override; virtual const score_t* GetTrainingScore(int64_t* out_len) override;
/*!
* \brief Get the number of prediction values for the dataset selected by data_idx
* \param data_idx 0: training data, 1: 1st validation data
* \return num_data of the selected dataset multiplied by num_class_, computed in 64-bit
*/
virtual int64_t GetNumPredictAt(int data_idx) const override {
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
  // data_idx == 0 selects the training data; k > 0 selects the (k-1)-th validation score updater
  data_size_t num_data = train_data_->num_data();
  if (data_idx > 0) {
    num_data = valid_score_updater_[data_idx - 1]->num_data();
  }
  // Widen before multiplying: if data_size_t and num_class_ are narrower than
  // int64_t, their product would otherwise be computed in the narrower type
  // and could overflow before being converted to the return type.
  return static_cast<int64_t>(num_data) * num_class_;
}
/*! /*!
* \brief Get prediction result at data_idx data * \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data * \param data_idx 0: training data, 1: 1st validation data
* \param result used to store prediction result, should allocate memory before call this function * \param result used to store prediction result, should allocate memory before call this function
* \param out_len length of returned score * \param out_len length of returned score
*/ */
void GetPredictAt(int data_idx, score_t* out_result, int64_t* out_len) override; void GetPredictAt(int data_idx, double* out_result, int64_t* out_len) override;
/*! /*!
* \brief Prediction for one record without sigmoid transformation * \brief Prediction for one record without sigmoid transformation
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include <LightGBM/utils/common.h> #include <LightGBM/utils/common.h>
#include <LightGBM/utils/random.h> #include <LightGBM/utils/random.h>
#include <LightGBM/utils/threading.h>
#include <LightGBM/c_api.h> #include <LightGBM/c_api.h>
#include <LightGBM/dataset_loader.h> #include <LightGBM/dataset_loader.h>
#include <LightGBM/dataset.h> #include <LightGBM/dataset.h>
...@@ -17,6 +18,7 @@ ...@@ -17,6 +18,7 @@
#include <memory> #include <memory>
#include <stdexcept> #include <stdexcept>
#include <mutex> #include <mutex>
#include <functional>
#include "./application/predictor.hpp" #include "./application/predictor.hpp"
#include "./boosting/gbdt.h" #include "./boosting/gbdt.h"
...@@ -29,7 +31,7 @@ public: ...@@ -29,7 +31,7 @@ public:
boosting_.reset(Boosting::CreateBoosting(filename)); boosting_.reset(Boosting::CreateBoosting(filename));
} }
Booster(const Dataset* train_data, Booster(const Dataset* train_data,
const char* parameters) { const char* parameters) {
auto param = ConfigBase::Str2Map(parameters); auto param = ConfigBase::Str2Map(parameters);
config_.Set(param); config_.Set(param);
...@@ -85,7 +87,7 @@ public: ...@@ -85,7 +87,7 @@ public:
} }
train_metric_.shrink_to_fit(); train_metric_.shrink_to_fit();
// reset the boosting // reset the boosting
boosting_->ResetTrainingData(&config_.boosting_config, train_data_, boosting_->ResetTrainingData(&config_.boosting_config, train_data_,
objective_fun_.get(), Common::ConstPtrInVectorWrapper<Metric>(train_metric_)); objective_fun_.get(), Common::ConstPtrInVectorWrapper<Metric>(train_metric_));
} }
...@@ -122,7 +124,7 @@ public: ...@@ -122,7 +124,7 @@ public:
boosting_->ResetTrainingData(&config_.boosting_config, train_data_, boosting_->ResetTrainingData(&config_.boosting_config, train_data_,
objective_fun_.get(), Common::ConstPtrInVectorWrapper<Metric>(train_metric_)); objective_fun_.get(), Common::ConstPtrInVectorWrapper<Metric>(train_metric_));
} }
void AddValidData(const Dataset* valid_data) { void AddValidData(const Dataset* valid_data) {
...@@ -171,7 +173,7 @@ public: ...@@ -171,7 +173,7 @@ public:
return Predictor(boosting_.get(), is_raw_score, is_predict_leaf); return Predictor(boosting_.get(), is_raw_score, is_predict_leaf);
} }
void GetPredictAt(int data_idx, score_t* out_result, int64_t* out_len) { void GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
boosting_->GetPredictAt(data_idx, out_result, out_len); boosting_->GetPredictAt(data_idx, out_result, out_len);
} }
...@@ -212,7 +214,7 @@ public: ...@@ -212,7 +214,7 @@ public:
} }
const Boosting* GetBoosting() const { return boosting_.get(); } const Boosting* GetBoosting() const { return boosting_.get(); }
private: private:
const Dataset* train_data_; const Dataset* train_data_;
...@@ -233,6 +235,38 @@ private: ...@@ -233,6 +235,38 @@ private:
using namespace LightGBM; using namespace LightGBM;
// Helper functions used to convert caller-supplied raw buffers into per-row accessors.
// NOTE(review): the definitions are not in this part of the file; the summaries below
// are read off the signatures only.

// Returns a callable mapping a row index to a std::vector<double> for a dense matrix.
std::function<std::vector<double>(int row_idx)>
RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major);
// Returns a callable mapping a row index to a vector of (int, double) pairs for a dense matrix
// (presumably (column index, value) — confirm against the definition).
std::function<std::vector<std::pair<int, double>>(int row_idx)>
RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major);
// Returns a callable mapping an index to a vector of (int, double) pairs for a CSR matrix
// described by indptr / indices / data.
std::function<std::vector<std::pair<int, double>>(int idx)>
RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
const void* data, int data_type, int64_t nindptr, int64_t nelem);
// Row iterator over one column of a CSC matrix
class CSC_RowIterator {
public:
// Build an iterator for column col_idx of the CSC matrix described by
// col_ptr / indices / data (col_ptr_type and data_type give the element types of the void* buffers).
CSC_RowIterator(const void* col_ptr, int col_ptr_type, const int32_t* indices,
const void* data, int data_type, int64_t ncol_ptr, int64_t nelem, int col_idx);
~CSC_RowIterator() {}
// return the value at row idx; must be called with idx in ascending order
double Get(int idx);
// return the next non-zero (index, value) pair; an index < 0 means there is no more data
std::pair<int, double> NextNonZero();
private:
// NOTE(review): the roles below are inferred from the member names; the method
// definitions are outside this view — confirm before relying on them.
// presumably the position of the next stored element to fetch through iter_fun_
int nonzero_idx_ = 0;
// presumably the row index of the most recently fetched element (-1 before any fetch)
int cur_idx_ = -1;
// presumably the value of the most recently fetched element
double cur_val_ = 0.0f;
// presumably set once the column has no more stored elements
bool is_end_ = false;
// callable returning an (int, double) pair for a given position
std::function<std::pair<int, double>(int idx)> iter_fun_;
};
// start of c_api functions
DllExport const char* LGBM_GetLastError() { DllExport const char* LGBM_GetLastError() {
return LastErrorMsg(); return LastErrorMsg();
} }
...@@ -382,10 +416,8 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr, ...@@ -382,10 +416,8 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
IOConfig io_config; IOConfig io_config;
io_config.Set(param); io_config.Set(param);
std::unique_ptr<Dataset> ret; std::unique_ptr<Dataset> ret;
auto get_col_fun = ColumnFunctionFromCSC(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem);
int32_t nrow = static_cast<int32_t>(num_row); int32_t nrow = static_cast<int32_t>(num_row);
if (reference == nullptr) { if (reference == nullptr) {
Log::Warning("Construct from CSC format is not efficient");
// sample data first // sample data first
Random rand(io_config.data_random_seed); Random rand(io_config.data_random_seed);
const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt); const int sample_cnt = static_cast<int>(nrow < io_config.bin_construct_sample_cnt ? nrow : io_config.bin_construct_sample_cnt);
...@@ -393,8 +425,13 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr, ...@@ -393,8 +425,13 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
std::vector<std::vector<double>> sample_values(ncol_ptr - 1); std::vector<std::vector<double>> sample_values(ncol_ptr - 1);
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) { for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
auto cur_col = get_col_fun(i); CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
sample_values[i] = SampleFromOneColumn(cur_col, sample_indices); for (int j = 0; j < sample_cnt; j++) {
auto val = col_it.Get(sample_indices[j]);
if (std::fabs(val) > kEpsilon) {
sample_values[i].push_back(val);
}
}
} }
DatasetLoader loader(io_config, nullptr, 1, nullptr); DatasetLoader loader(io_config, nullptr, 1, nullptr);
ret.reset(loader.CostructFromSampleData(sample_values, sample_cnt, nrow)); ret.reset(loader.CostructFromSampleData(sample_values, sample_cnt, nrow));
...@@ -408,8 +445,17 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr, ...@@ -408,8 +445,17 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < ncol_ptr - 1; ++i) { for (int i = 0; i < ncol_ptr - 1; ++i) {
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
auto one_col = get_col_fun(i); int feature_idx = ret->GetInnerFeatureIndex(i);
ret->PushOneColumn(tid, i, one_col); if (feature_idx < 0) { continue; }
CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i);
int row_idx = 0;
while (row_idx < nrow) {
auto pair = col_it.NextNonZero();
row_idx = pair.first;
// no more data
if (row_idx < 0) { break; }
ret->FeatureAt(feature_idx)->PushData(tid, row_idx, pair.second);
}
} }
ret->FinishLoad(); ret->FinishLoad();
*out = ret.release(); *out = ret.release();
...@@ -429,7 +475,7 @@ DllExport int LGBM_DatasetGetSubset( ...@@ -429,7 +475,7 @@ DllExport int LGBM_DatasetGetSubset(
auto full_dataset = reinterpret_cast<const Dataset*>(handle); auto full_dataset = reinterpret_cast<const Dataset*>(handle);
auto ret = std::unique_ptr<Dataset>( auto ret = std::unique_ptr<Dataset>(
full_dataset->Subset(used_row_indices, full_dataset->Subset(used_row_indices,
num_used_row_indices, num_used_row_indices,
io_config.is_enable_sparse)); io_config.is_enable_sparse));
ret->FinishLoad(); ret->FinishLoad();
*out = ret.release(); *out = ret.release();
...@@ -517,7 +563,6 @@ DllExport int LGBM_DatasetGetNumFeature(DatasetHandle handle, ...@@ -517,7 +563,6 @@ DllExport int LGBM_DatasetGetNumFeature(DatasetHandle handle,
API_END(); API_END();
} }
// ---- start of booster // ---- start of booster
DllExport int LGBM_BoosterCreate(const DatasetHandle train_data, DllExport int LGBM_BoosterCreate(const DatasetHandle train_data,
...@@ -627,10 +672,7 @@ DllExport int LGBM_BoosterGetCurrentIteration(BoosterHandle handle, int64_t* out ...@@ -627,10 +672,7 @@ DllExport int LGBM_BoosterGetCurrentIteration(BoosterHandle handle, int64_t* out
*out_iteration = ref_booster->GetBoosting()->GetCurrentIteration(); *out_iteration = ref_booster->GetBoosting()->GetCurrentIteration();
API_END(); API_END();
} }
/*!
* \brief Get number of eval
* \return total number of eval result
*/
DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len) { DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
...@@ -638,10 +680,6 @@ DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len) ...@@ -638,10 +680,6 @@ DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len)
API_END(); API_END();
} }
/*!
* \brief Get number of eval
* \return total number of eval result
*/
DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, char** out_strs) { DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, char** out_strs) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
...@@ -649,26 +687,34 @@ DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, c ...@@ -649,26 +687,34 @@ DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, c
API_END(); API_END();
} }
DllExport int LGBM_BoosterGetEval(BoosterHandle handle, DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
int data_idx, int data_idx,
int64_t* out_len, int64_t* out_len,
float* out_results) { double* out_results) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
auto boosting = ref_booster->GetBoosting(); auto boosting = ref_booster->GetBoosting();
auto result_buf = boosting->GetEvalAt(data_idx); auto result_buf = boosting->GetEvalAt(data_idx);
*out_len = static_cast<int64_t>(result_buf.size()); *out_len = static_cast<int64_t>(result_buf.size());
for (size_t i = 0; i < result_buf.size(); ++i) { for (size_t i = 0; i < result_buf.size(); ++i) {
(out_results)[i] = static_cast<float>(result_buf[i]); (out_results)[i] = static_cast<double>(result_buf[i]);
} }
API_END(); API_END();
} }
/*!
 * \brief Query how many prediction values the booster holds for one of its inner datasets
 * \param handle Booster handle; reinterpreted as a Booster*
 * \param data_idx 0: training data, 1: 1st validation data
 * \param out_len Receives the value reported by GetNumPredictAt(data_idx)
 * \return Status produced by the API_BEGIN / API_END wrapper (defined elsewhere in this file)
 */
DllExport int LGBM_BoosterGetNumPredict(BoosterHandle handle,
                                        int data_idx,
                                        int64_t* out_len) {
  API_BEGIN();
  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
  *out_len = ref_booster->GetBoosting()->GetNumPredictAt(data_idx);
  API_END();
}
DllExport int LGBM_BoosterGetPredict(BoosterHandle handle, DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
int data_idx, int data_idx,
int64_t* out_len, int64_t* out_len,
float* out_result) { double* out_result) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->GetPredictAt(data_idx, out_result, out_len); ref_booster->GetPredictAt(data_idx, out_result, out_len);
...@@ -689,6 +735,30 @@ DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle, ...@@ -689,6 +735,30 @@ DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle,
API_END(); API_END();
} }
/*!
 * \brief Number of output values produced per input row for the given prediction type
 * \param ref_booster Booster whose boosting object is queried
 * \param predict_type Prediction type; only C_API_PREDICT_LEAF_INDEX changes the result
 * \param num_iteration Requested iteration count; values <= 0 mean "use the current iteration count"
 * \return NumberOfClasses() for ordinary predictions; for leaf-index prediction,
 *         NumberOfClasses() multiplied by the number of iterations actually used
 */
int GetNumPredOneRow(const Booster* ref_booster, int predict_type, int64_t num_iteration) {
  const int num_class = ref_booster->GetBoosting()->NumberOfClasses();
  if (predict_type != C_API_PREDICT_LEAF_INDEX) {
    return num_class;
  }
  // leaf-index prediction emits one value per class per iteration
  const int64_t max_iteration = ref_booster->GetBoosting()->GetCurrentIteration();
  if (num_iteration > 0) {
    // never report more iterations than the model currently has
    const int used_iteration = static_cast<int>(std::min(max_iteration, num_iteration));
    return num_class * used_iteration;
  }
  // product is formed in 64 bits and then narrowed to int
  return static_cast<int>(num_class * max_iteration);
}
/*!
 * \brief Compute the total number of prediction values for num_row input rows
 * \param handle Booster handle; reinterpreted as a Booster*
 * \param num_row Number of rows that will be predicted
 * \param predict_type Prediction type, forwarded to GetNumPredOneRow
 * \param num_iteration Requested iteration count, forwarded to GetNumPredOneRow
 * \param out_len Receives num_row multiplied by the per-row prediction count
 * \return Status produced by the API_BEGIN / API_END wrapper (defined elsewhere in this file)
 */
DllExport int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
                                         int64_t num_row,
                                         int predict_type,
                                         int64_t num_iteration,
                                         int64_t* out_len) {
  API_BEGIN();
  const int num_pred_per_row =
      GetNumPredOneRow(reinterpret_cast<Booster*>(handle), predict_type, num_iteration);
  // num_row is int64_t, so the multiplication is carried out in 64 bits
  *out_len = num_row * num_pred_per_row;
  API_END();
}
DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle, DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
const void* indptr, const void* indptr,
int indptr_type, int indptr_type,
...@@ -701,32 +771,70 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle, ...@@ -701,32 +771,70 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int predict_type, int predict_type,
int64_t num_iteration, int64_t num_iteration,
int64_t* out_len, int64_t* out_len,
float* out_result) { double* out_result) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
auto predictor = ref_booster->NewPredictor(static_cast<int>(num_iteration), predict_type); auto predictor = ref_booster->NewPredictor(static_cast<int>(num_iteration), predict_type);
auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
int num_preb_in_one_row = ref_booster->GetBoosting()->NumberOfClasses(); int num_preb_in_one_row = GetNumPredOneRow(ref_booster, predict_type, num_iteration);
if (predict_type == C_API_PREDICT_LEAF_INDEX) {
if (num_iteration > 0) {
num_preb_in_one_row *= static_cast<int>(num_iteration);
} else {
num_preb_in_one_row *= ref_booster->GetBoosting()->NumberOfTotalModel() / num_preb_in_one_row;
}
}
int nrow = static_cast<int>(nindptr - 1); int nrow = static_cast<int>(nindptr - 1);
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < nrow; ++i) { for (int i = 0; i < nrow; ++i) {
auto one_row = get_row_fun(i); auto one_row = get_row_fun(i);
auto predicton_result = predictor.GetPredictFunction()(one_row); auto predicton_result = predictor.GetPredictFunction()(one_row);
for (int j = 0; j < static_cast<int>(predicton_result.size()); ++j) { for (int j = 0; j < static_cast<int>(predicton_result.size()); ++j) {
out_result[i * num_preb_in_one_row + j] = static_cast<float>(predicton_result[j]); out_result[i * num_preb_in_one_row + j] = static_cast<double>(predicton_result[j]);
} }
} }
*out_len = nrow * num_preb_in_one_row; *out_len = nrow * num_preb_in_one_row;
API_END(); API_END();
} }
/*!
 * \brief Make predictions for a matrix supplied in CSC (column-compressed) form
 *        Note: out_result is written directly, so it must be allocated by the caller;
 *        num_row * GetNumPredOneRow(...) values are stored.
 * \param handle Booster handle; reinterpreted as a Booster*
 * \param col_ptr Column pointer array, forwarded to CSC_RowIterator
 * \param col_ptr_type Type code of col_ptr, forwarded to CSC_RowIterator
 * \param indices Row indices of the stored elements
 * \param data Stored element values
 * \param data_type Type code of data, forwarded to CSC_RowIterator
 * \param ncol_ptr Length of col_ptr; the matrix has ncol_ptr - 1 columns
 * \param nelem Number of stored elements
 * \param num_row Number of rows to predict
 * \param predict_type Prediction type, forwarded to NewPredictor and GetNumPredOneRow
 * \param num_iteration Requested iteration count, forwarded to NewPredictor and GetNumPredOneRow
 * \param out_len Receives num_row multiplied by the per-row prediction count
 * \param out_result Output buffer; values of row i start at out_result[i * per-row count]
 * \return Status produced by the API_BEGIN / API_END wrapper (defined elsewhere in this file)
 */
DllExport int LGBM_BoosterPredictForCSC(BoosterHandle handle,
                                        const void* col_ptr,
                                        int col_ptr_type,
                                        const int32_t* indices,
                                        const void* data,
                                        int data_type,
                                        int64_t ncol_ptr,
                                        int64_t nelem,
                                        int64_t num_row,
                                        int predict_type,
                                        int64_t num_iteration,
                                        int64_t* out_len,
                                        double* out_result) {
  API_BEGIN();
  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
  auto predictor = ref_booster->NewPredictor(static_cast<int>(num_iteration), predict_type);
  int num_preb_in_one_row = GetNumPredOneRow(ref_booster, predict_type, num_iteration);
  int ncol = static_cast<int>(ncol_ptr - 1);
  // The range bounds are declared int64_t to match Threading::For<int64_t>; the previous
  // data_size_t parameters forced an implicit conversion of the chunk bounds.
  // NOTE(review): assumes data_size_t is narrower than int64_t - confirm against its typedef.
  Threading::For<int64_t>(0, num_row,
    [&predictor, out_result, num_preb_in_one_row, ncol, col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem]
    (int, int64_t start, int64_t end) {
    // each invocation builds its own iterators, one per column, so no iterator state is shared
    std::vector<CSC_RowIterator> iterators;
    for (int j = 0; j < ncol; ++j) {
      iterators.emplace_back(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, j);
    }
    std::vector<std::pair<int, double>> one_row;
    for (int64_t i = start; i < end; ++i) {
      // assemble row i as (column, value) pairs, dropping values with magnitude <= kEpsilon
      one_row.clear();
      for (int j = 0; j < ncol; ++j) {
        // CSC_RowIterator::Get takes an int row index, hence the cast
        auto val = iterators[j].Get(static_cast<int>(i));
        if (std::fabs(val) > kEpsilon) {
          one_row.emplace_back(j, val);
        }
      }
      auto predicton_result = predictor.GetPredictFunction()(one_row);
      for (int j = 0; j < static_cast<int>(predicton_result.size()); ++j) {
        out_result[i * num_preb_in_one_row + j] = static_cast<double>(predicton_result[j]);
      }
    }
  });
  *out_len = num_row * num_preb_in_one_row;
  API_END();
}
DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle, DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
const void* data, const void* data,
int data_type, int data_type,
...@@ -736,25 +844,18 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle, ...@@ -736,25 +844,18 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
int predict_type, int predict_type,
int64_t num_iteration, int64_t num_iteration,
int64_t* out_len, int64_t* out_len,
float* out_result) { double* out_result) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
auto predictor = ref_booster->NewPredictor(static_cast<int>(num_iteration), predict_type); auto predictor = ref_booster->NewPredictor(static_cast<int>(num_iteration), predict_type);
auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major); auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
int num_preb_in_one_row = ref_booster->GetBoosting()->NumberOfClasses(); int num_preb_in_one_row = GetNumPredOneRow(ref_booster, predict_type, num_iteration);
if (predict_type == C_API_PREDICT_LEAF_INDEX) {
if (num_iteration > 0) {
num_preb_in_one_row *= static_cast<int>(num_iteration);
} else {
num_preb_in_one_row *= ref_booster->GetBoosting()->NumberOfTotalModel() / num_preb_in_one_row;
}
}
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < nrow; ++i) { for (int i = 0; i < nrow; ++i) {
auto one_row = get_row_fun(i); auto one_row = get_row_fun(i);
auto predicton_result = predictor.GetPredictFunction()(one_row); auto predicton_result = predictor.GetPredictFunction()(one_row);
for (int j = 0; j < static_cast<int>(predicton_result.size()); ++j) { for (int j = 0; j < static_cast<int>(predicton_result.size()); ++j) {
out_result[i * num_preb_in_one_row + j] = static_cast<float>(predicton_result[j]); out_result[i * num_preb_in_one_row + j] = static_cast<double>(predicton_result[j]);
} }
} }
*out_len = nrow * num_preb_in_one_row; *out_len = nrow * num_preb_in_one_row;
...@@ -773,37 +874,34 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle, ...@@ -773,37 +874,34 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle, DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
int buffer_len, int buffer_len,
int64_t* out_len, int64_t* out_len,
char** out_str) { char* out_str) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
std::string model = ref_booster->DumpModel(); std::string model = ref_booster->DumpModel();
*out_len = static_cast<int64_t>(model.size()) + 1; *out_len = static_cast<int64_t>(model.size()) + 1;
if (*out_len <= buffer_len) { if (*out_len <= buffer_len) {
std::strcpy(*out_str, model.c_str()); std::strcpy(out_str, model.c_str());
} }
API_END(); API_END();
} }
DllExport int LGBM_BoosterGetLeafValue(BoosterHandle handle, DllExport int LGBM_BoosterGetLeafValue(BoosterHandle handle,
int tree_idx, int tree_idx,
int leaf_idx, int leaf_idx,
float* out_val) { double* out_val) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
*out_val = static_cast<float>(ref_booster->GetLeafValue(tree_idx, leaf_idx)); *out_val = static_cast<double>(ref_booster->GetLeafValue(tree_idx, leaf_idx));
API_END(); API_END();
} }
DllExport int LGBM_BoosterSetLeafValue(BoosterHandle handle, DllExport int LGBM_BoosterSetLeafValue(BoosterHandle handle,
int tree_idx, int tree_idx,
int leaf_idx, int leaf_idx,
float val) { double val) {
API_BEGIN(); API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle); Booster* ref_booster = reinterpret_cast<Booster*>(handle);
ref_booster->SetLeafValue(tree_idx, leaf_idx, static_cast<double>(val)); ref_booster->SetLeafValue(tree_idx, leaf_idx, val);
API_END(); API_END();
} }
...@@ -924,77 +1022,108 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices, ...@@ -924,77 +1022,108 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
} }
return ret; return ret;
}; };
} }
} }
throw std::runtime_error("unknown data type in RowFunctionFromCSR"); throw std::runtime_error("unknown data type in RowFunctionFromCSR");
} }
std::function<std::vector<std::pair<int, double>>(int idx)> std::function<std::pair<int, double>(int idx)>
ColumnFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indices, const void* data, int data_type, int64_t ncol_ptr, int64_t nelem) { IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indices, const void* data, int data_type, int64_t ncol_ptr, int64_t nelem, int col_idx) {
CHECK(col_idx < ncol_ptr && col_idx >= 0);
if (data_type == C_API_DTYPE_FLOAT32) { if (data_type == C_API_DTYPE_FLOAT32) {
const float* data_ptr = reinterpret_cast<const float*>(data); const float* data_ptr = reinterpret_cast<const float*>(data);
if (col_ptr_type == C_API_DTYPE_INT32) { if (col_ptr_type == C_API_DTYPE_INT32) {
const int32_t* ptr_col_ptr = reinterpret_cast<const int32_t*>(col_ptr); const int32_t* ptr_col_ptr = reinterpret_cast<const int32_t*>(col_ptr);
return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) { int64_t start = ptr_col_ptr[col_idx];
std::vector<std::pair<int, double>> ret; int64_t end = ptr_col_ptr[col_idx + 1];
int64_t start = ptr_col_ptr[idx]; return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem, start, end](int bias) {
int64_t end = ptr_col_ptr[idx + 1]; int64_t i = static_cast<int64_t>(start + bias);
for (int64_t i = start; i < end; ++i) { if (i >= end) {
ret.emplace_back(indices[i], data_ptr[i]); return std::make_pair(-1, 0.0);
} }
return ret; int idx = static_cast<int>(indices[i]);
double val = static_cast<double>(data_ptr[i]);
return std::make_pair(idx, val);
}; };
} else if (col_ptr_type == C_API_DTYPE_INT64) { } else if (col_ptr_type == C_API_DTYPE_INT64) {
const int64_t* ptr_col_ptr = reinterpret_cast<const int64_t*>(col_ptr); const int64_t* ptr_col_ptr = reinterpret_cast<const int64_t*>(col_ptr);
return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) { int64_t start = ptr_col_ptr[col_idx];
std::vector<std::pair<int, double>> ret; int64_t end = ptr_col_ptr[col_idx + 1];
int64_t start = ptr_col_ptr[idx]; return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem, start, end](int bias) {
int64_t end = ptr_col_ptr[idx + 1]; int64_t i = static_cast<int64_t>(start + bias);
for (int64_t i = start; i < end; ++i) { if (i >= end) {
ret.emplace_back(indices[i], data_ptr[i]); return std::make_pair(-1, 0.0);
} }
return ret; int idx = static_cast<int>(indices[i]);
double val = static_cast<double>(data_ptr[i]);
return std::make_pair(idx, val);
}; };
} }
} else if (data_type == C_API_DTYPE_FLOAT64) { } else if (data_type == C_API_DTYPE_FLOAT64) {
const double* data_ptr = reinterpret_cast<const double*>(data); const double* data_ptr = reinterpret_cast<const double*>(data);
if (col_ptr_type == C_API_DTYPE_INT32) { if (col_ptr_type == C_API_DTYPE_INT32) {
const int32_t* ptr_col_ptr = reinterpret_cast<const int32_t*>(col_ptr); const int32_t* ptr_col_ptr = reinterpret_cast<const int32_t*>(col_ptr);
return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) { int64_t start = ptr_col_ptr[col_idx];
std::vector<std::pair<int, double>> ret; int64_t end = ptr_col_ptr[col_idx + 1];
int64_t start = ptr_col_ptr[idx]; return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem, start, end](int bias) {
int64_t end = ptr_col_ptr[idx + 1]; int64_t i = static_cast<int64_t>(start + bias);
for (int64_t i = start; i < end; ++i) { if (i >= end) {
ret.emplace_back(indices[i], data_ptr[i]); return std::make_pair(-1, 0.0);
} }
return ret; int idx = static_cast<int>(indices[i]);
double val = static_cast<double>(data_ptr[i]);
return std::make_pair(idx, val);
}; };
} else if (col_ptr_type == C_API_DTYPE_INT64) { } else if (col_ptr_type == C_API_DTYPE_INT64) {
const int64_t* ptr_col_ptr = reinterpret_cast<const int64_t*>(col_ptr); const int64_t* ptr_col_ptr = reinterpret_cast<const int64_t*>(col_ptr);
return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) { int64_t start = ptr_col_ptr[col_idx];
std::vector<std::pair<int, double>> ret; int64_t end = ptr_col_ptr[col_idx + 1];
int64_t start = ptr_col_ptr[idx]; return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem, start, end](int bias) {
int64_t end = ptr_col_ptr[idx + 1]; int64_t i = static_cast<int64_t>(start + bias);
for (int64_t i = start; i < end; ++i) { if (i >= end) {
ret.emplace_back(indices[i], data_ptr[i]); return std::make_pair(-1, 0.0);
} }
return ret; int idx = static_cast<int>(indices[i]);
double val = static_cast<double>(data_ptr[i]);
return std::make_pair(idx, val);
}; };
} }
} }
throw std::runtime_error("unknown data type in ColumnFunctionFromCSC"); throw std::runtime_error("unknown data type in CSC matrix");
} }
std::vector<double> SampleFromOneColumn(const std::vector<std::pair<int, double>>& data, const std::vector<int>& indices) { CSC_RowIterator::CSC_RowIterator(const void* col_ptr, int col_ptr_type, const int32_t* indices,
size_t j = 0; const void* data, int data_type, int64_t ncol_ptr, int64_t nelem, int col_idx) {
std::vector<double> ret; iter_fun_ = IterateFunctionFromCSC(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, col_idx);
for (auto row_idx : indices) { }
while (j < data.size() && data[j].first < static_cast<int>(row_idx)) {
++j; double CSC_RowIterator::Get(int idx) {
while (idx > cur_idx_ && !is_end_) {
auto ret = iter_fun_(nonzero_idx_);
if (ret.first < 0) {
is_end_ = true;
break;
} }
if (j < data.size() && data[j].first == static_cast<int>(row_idx)) { cur_idx_ = ret.first;
ret.push_back(data[j].second); cur_val_ = ret.second;
++nonzero_idx_;
}
if (idx == cur_idx_) {
return cur_val_;
} else {
return 0.0f;
}
}
std::pair<int, double> CSC_RowIterator::NextNonZero() {
if (!is_end_) {
auto ret = iter_fun_(nonzero_idx_);
++nonzero_idx_;
if (ret.first < 0) {
is_end_ = true;
} }
return ret;
} else {
return std::make_pair(-1, 0.0);
} }
return ret;
} }
...@@ -175,9 +175,9 @@ def test_booster(): ...@@ -175,9 +175,9 @@ def test_booster():
is_finished = ctypes.c_int(0) is_finished = ctypes.c_int(0)
for i in range(100): for i in range(100):
LIB.LGBM_BoosterUpdateOneIter(booster,ctypes.byref(is_finished)) LIB.LGBM_BoosterUpdateOneIter(booster,ctypes.byref(is_finished))
result = np.array([0.0], dtype=np.float32) result = np.array([0.0], dtype=np.float64)
out_len = ctypes.c_ulong(0) out_len = ctypes.c_ulong(0)
LIB.LGBM_BoosterGetEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))) LIB.LGBM_BoosterGetEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
print ('%d Iteration test AUC %f' %(i, result[0])) print ('%d Iteration test AUC %f' %(i, result[0]))
LIB.LGBM_BoosterSaveModel(booster, -1, c_str('model.txt')) LIB.LGBM_BoosterSaveModel(booster, -1, c_str('model.txt'))
LIB.LGBM_BoosterFree(booster) LIB.LGBM_BoosterFree(booster)
...@@ -192,7 +192,7 @@ def test_booster(): ...@@ -192,7 +192,7 @@ def test_booster():
data.append( [float(x) for x in line.split('\t')[1:]] ) data.append( [float(x) for x in line.split('\t')[1:]] )
inp.close() inp.close()
mat = np.array(data) mat = np.array(data)
preb = np.zeros(mat.shape[0], dtype=np.float32) preb = np.zeros(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_long() num_preb = ctypes.c_long()
data = np.array(mat.reshape(mat.size), copy=False) data = np.array(mat.reshape(mat.size), copy=False)
LIB.LGBM_BoosterPredictForMat(booster2, LIB.LGBM_BoosterPredictForMat(booster2,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment