Commit 72c2d790 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

some refine for c_api (#152)

1. add csc support
2. some data type from float to double
parent bd7274ba
......@@ -96,13 +96,19 @@ public:
*/
virtual const score_t* GetTrainingScore(int64_t* out_len) = 0;
/*!
* \brief Get number of prediction results at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
* \return length of the prediction result, i.e. the buffer size needed by GetPredictAt
*/
virtual int64_t GetNumPredictAt(int data_idx) const = 0;
/*!
* \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
* \param result used to store prediction result, should allocate memory before call this function
* \param out_len length of returned score
*/
virtual void GetPredictAt(int data_idx, score_t* result, int64_t* out_len) = 0;
virtual void GetPredictAt(int data_idx, double* result, int64_t* out_len) = 0;
/*!
* \brief Prediction for one record, not sigmoid transform
......
......@@ -370,7 +370,20 @@ DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, c
DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
int data_idx,
int64_t* out_len,
float* out_results);
double* out_results);
/*!
* \brief Get number of predictions for inner dataset
this can be used to support customized eval function
Note: the returned length is the size to pre-allocate for out_result of LGBM_BoosterGetPredict, it is equal to num_class * num_data
* \param handle handle
* \param data_idx 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len len of output result
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetNumPredict(BoosterHandle handle,
int data_idx,
int64_t* out_len);
/*!
* \brief Get prediction for training data and validation data
......@@ -385,7 +398,7 @@ DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
int data_idx,
int64_t* out_len,
float* out_result);
double* out_result);
/*!
* \brief make prediction for file
......@@ -407,6 +420,24 @@ DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle,
int64_t num_iteration,
const char* result_filename);
/*!
* \brief Get number of prediction
* \param handle handle
* \param num_row number of rows
* \param predict_type
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len length of prediction
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
int64_t num_row,
int predict_type,
int64_t num_iteration,
int64_t* out_len);
/*!
* \brief make prediction for an new data set
* Note: should pre-allocate memory for out_result,
......@@ -442,7 +473,44 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int predict_type,
int64_t num_iteration,
int64_t* out_len,
float* out_result);
double* out_result);
/*!
* \brief make prediction for a new data set
* Note: should pre-allocate memory for out_result,
* for normal and raw score: its length is equal to num_class * num_data
* for leaf index, its length is equal to num_class * num_data * num_iteration
* \param handle handle
* \param col_ptr pointer to col headers
* \param col_ptr_type type of col_ptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices row indices of the nonzero elements
* \param data values of the nonzero elements
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param ncol_ptr number of cols in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_row number of rows
* \param predict_type
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len len of output result
* \param out_result pointer to a pre-allocated array used to store the prediction result, memory should be allocated before calling this function
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForCSC(BoosterHandle handle,
const void* col_ptr,
int col_ptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t ncol_ptr,
int64_t nelem,
int64_t num_row,
int predict_type,
int64_t num_iteration,
int64_t* out_len,
double* out_result);
/*!
* \brief make prediction for an new data set
......@@ -473,7 +541,7 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
int predict_type,
int64_t num_iteration,
int64_t* out_len,
float* out_result);
double* out_result);
/*!
* \brief save model into file
......@@ -497,7 +565,7 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
int buffer_len,
int64_t* out_len,
char** out_str);
char* out_str);
/*!
* \brief Get leaf value
......@@ -510,7 +578,7 @@ DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
DllExport int LGBM_BoosterGetLeafValue(BoosterHandle handle,
int tree_idx,
int leaf_idx,
float* out_val);
double* out_val);
/*!
* \brief Set leaf value
......@@ -523,26 +591,7 @@ DllExport int LGBM_BoosterGetLeafValue(BoosterHandle handle,
DllExport int LGBM_BoosterSetLeafValue(BoosterHandle handle,
int tree_idx,
int leaf_idx,
float val);
// some help functions used to convert data
std::function<std::vector<double>(int row_idx)>
RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major);
std::function<std::vector<std::pair<int, double>>(int row_idx)>
RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major);
std::function<std::vector<std::pair<int, double>>(int idx)>
RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
const void* data, int data_type, int64_t nindptr, int64_t nelem);
std::function<std::vector<std::pair<int, double>>(int idx)>
ColumnFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indices,
const void* data, int data_type, int64_t ncol_ptr, int64_t nelem);
std::vector<double>
SampleFromOneColumn(const std::vector<std::pair<int, double>>& data, const std::vector<int>& indices);
double val);
#if defined(_MSC_VER)
// exception handle and error msg
......
......@@ -324,14 +324,8 @@ public:
}
}
inline void PushOneColumn(int tid, data_size_t col_idx, const std::vector<std::pair<int, double>>& feature_values) {
if (col_idx >= num_total_features_) { return; }
int feature_idx = used_feature_map_[col_idx];
if (feature_idx >= 0) {
for (auto& inner_data : feature_values) {
features_[feature_idx]->PushData(tid, inner_data.first, inner_data.second);
}
}
inline int GetInnerFeatureIndex(int col_idx) const {
return used_feature_map_[col_idx];
}
Dataset* Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const;
......@@ -358,7 +352,7 @@ public:
* \param i Index for feature
* \return Pointer of feature
*/
inline const Feature* FeatureAt(int i) const { return features_[i].get(); }
inline Feature* FeatureAt(int i) const { return features_[i].get(); }
/*!
* \brief Get meta data pointer
......
......@@ -288,10 +288,13 @@ class _InnerPredictor(object):
lines = tmp_file.readlines()
nrow = len(lines)
preds = [float(token) for line in lines for token in line.split('\t')]
preds = np.array(preds, dtype=np.float32, copy=False)
preds = np.array(preds, dtype=np.float64, copy=False)
elif isinstance(data, scipy.sparse.csr_matrix):
preds, nrow = self.__pred_for_csr(data, num_iteration,
predict_type)
elif isinstance(data, scipy.sparse.csc_matrix):
preds, nrow = self.__pred_for_csc(data, num_iteration,
predict_type)
elif isinstance(data, np.ndarray):
preds, nrow = self.__pred_for_np2d(data, num_iteration,
predict_type)
......@@ -319,13 +322,14 @@ class _InnerPredictor(object):
"""
Get size of prediction result
"""
n_preds = self.num_class * nrow
if predict_type == C_API_PREDICT_LEAF_INDEX:
if num_iteration > 0:
n_preds *= min(num_iteration, self.num_total_iteration)
else:
n_preds *= self.num_total_iteration
return n_preds
n_preds = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterCalcNumPredict(
self.handle,
nrow,
predict_type,
num_iteration,
ctypes.byref(n_preds)))
return n_preds.value
def __pred_for_np2d(self, mat, num_iteration, predict_type):
"""
......@@ -342,7 +346,7 @@ class _InnerPredictor(object):
ptr_data, type_ptr_data = c_float_array(data)
n_preds = self.__get_num_preds(num_iteration, mat.shape[0],
predict_type)
preds = np.zeros(n_preds, dtype=np.float32)
preds = np.zeros(n_preds, dtype=np.float64)
out_num_preds = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterPredictForMat(
self.handle,
......@@ -354,7 +358,7 @@ class _InnerPredictor(object):
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
))
if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results")
......@@ -366,7 +370,7 @@ class _InnerPredictor(object):
"""
nrow = len(csr.indptr) - 1
n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
preds = np.zeros(n_preds, dtype=np.float32)
preds = np.zeros(n_preds, dtype=np.float64)
out_num_preds = ctypes.c_int64(0)
ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
......@@ -385,7 +389,38 @@ class _InnerPredictor(object):
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
))
if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results")
return preds, nrow
def __pred_for_csc(self, csc, num_iteration, predict_type):
"""
Predict for a csc data
"""
nrow = csc.shape[0]
n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
preds = np.zeros(n_preds, dtype=np.float64)
out_num_preds = ctypes.c_int64(0)
ptr_indptr, type_ptr_indptr = c_int_array(csc.indptr)
ptr_data, type_ptr_data = c_float_array(csc.data)
_safe_call(_LIB.LGBM_BoosterPredictForCSC(
self.handle,
ptr_indptr,
type_ptr_indptr,
csc.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ptr_data,
type_ptr_data,
len(csc.indptr),
len(csc.data),
csc.shape[0],
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
))
if n_preds != out_num_preds.value:
raise ValueError("Wrong length for predict results")
......@@ -511,6 +546,8 @@ class _InnerDataset(object):
ctypes.byref(self.handle)))
elif isinstance(data, scipy.sparse.csr_matrix):
self.__init_from_csr(data, params_str, ref_dataset)
elif isinstance(data, scipy.sparse.csc_matrix):
self.__init_from_csc(data, params_str, ref_dataset)
elif isinstance(data, np.ndarray):
self.__init_from_np2d(data, params_str, ref_dataset)
else:
......@@ -541,6 +578,7 @@ class _InnerDataset(object):
for j in range(self.predictor.num_class):
new_init_score[j * num_data + i] = init_score[i * self.predictor.num_class + j]
init_score = new_init_score
init_score = init_score.astype(dtype=np.float32, copy=False)
self.set_init_score(init_score)
elif self.predictor is not None:
raise TypeError('wrong predictor type {}'.format(type(self.predictor).__name__))
......@@ -655,6 +693,30 @@ class _InnerDataset(object):
ref_dataset,
ctypes.byref(self.handle)))
def __init_from_csc(self, csc, params_str, ref_dataset):
"""
Initialize data from a csc matrix.

Creates a new dataset through LGBM_DatasetCreateFromCSC and stores the
resulting handle in self.handle.

Parameters
----------
csc : object exposing indptr, indices, data and shape
    (this method is dispatched for scipy.sparse.csc_matrix inputs).
params_str : parameter string, converted with c_str before the call.
ref_dataset : reference dataset argument, forwarded unchanged to the C API.

Raises
------
ValueError
    If len(csc.indices) != len(csc.data).
"""
# indices and data must describe the same set of nonzero entries
if len(csc.indices) != len(csc.data):
raise ValueError('Length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data)))
# handle is filled in by the C API call below (passed by reference)
self.handle = ctypes.c_void_p()
ptr_indptr, type_ptr_indptr = c_int_array(csc.indptr)
ptr_data, type_ptr_data = c_float_array(csc.data)
# NOTE(review): csc.indices is reinterpreted as an int32 pointer without any
# dtype check or conversion; assumes csc.indices has dtype int32 - TODO confirm
_safe_call(_LIB.LGBM_DatasetCreateFromCSC(
ptr_indptr,
type_ptr_indptr,
csc.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ptr_data,
type_ptr_data,
len(csc.indptr),
len(csc.data),
csc.shape[0],
c_str(params_str),
ref_dataset,
ctypes.byref(self.handle)))
def __del__(self):
_safe_call(_LIB.LGBM_DatasetFree(self.handle))
......@@ -1498,7 +1560,7 @@ class Booster(object):
self.handle,
buffer_len,
ctypes.byref(tmp_out_len),
ctypes.byref(ptr_string_buffer)))
ptr_string_buffer))
actual_len = tmp_out_len.value
'''if buffer length is not long enough, reallocate a buffer'''
if actual_len > buffer_len:
......@@ -1577,13 +1639,13 @@ class Booster(object):
self.__get_eval_info()
ret = []
if self.__num_inner_eval > 0:
result = np.array([0.0 for _ in range(self.__num_inner_eval)], dtype=np.float32)
result = np.array([0.0 for _ in range(self.__num_inner_eval)], dtype=np.float64)
tmp_out_len = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterGetEval(
self.handle,
data_idx,
ctypes.byref(tmp_out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if tmp_out_len.value != self.__num_inner_eval:
raise ValueError("Wrong length of eval results")
for i in range(self.__num_inner_eval):
......@@ -1614,11 +1676,11 @@ class Booster(object):
else:
n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
self.__inner_predict_buffer[data_idx] = \
np.array([0.0 for _ in range(n_preds)], dtype=np.float32, copy=False)
np.array([0.0 for _ in range(n_preds)], dtype=np.float64, copy=False)
"""avoid to predict many time in one iteration"""
if not self.__is_predicted_cur_iter[data_idx]:
tmp_out_len = ctypes.c_int64(0)
data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_float))
data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double))
_safe_call(_LIB.LGBM_BoosterGetPredict(
self.handle,
data_idx,
......
......@@ -350,7 +350,7 @@ std::string GBDT::OutputMetric(int iter) {
/*! \brief Get eval result */
std::vector<double> GBDT::GetEvalAt(int data_idx) const {
CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size()));
CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
std::vector<double> ret;
if (data_idx == 0) {
for (auto& sub_metric : training_metrics_) {
......@@ -378,8 +378,8 @@ const score_t* GBDT::GetTrainingScore(int64_t* out_len) {
return train_score_updater_->score();
}
void GBDT::GetPredictAt(int data_idx, score_t* out_result, int64_t* out_len) {
CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_metrics_.size()));
void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
const score_t* raw_scores = nullptr;
data_size_t num_data = 0;
......@@ -401,18 +401,18 @@ void GBDT::GetPredictAt(int data_idx, score_t* out_result, int64_t* out_len) {
}
Common::Softmax(&tmp_result);
for (int j = 0; j < num_class_; ++j) {
out_result[j * num_data + i] = static_cast<score_t>(tmp_result[j]);
out_result[j * num_data + i] = static_cast<double>(tmp_result[j]);
}
}
} else if(sigmoid_ > 0.0f){
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
out_result[i] = static_cast<score_t>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i])));
out_result[i] = static_cast<double>(1.0f / (1.0f + std::exp(-2.0f * sigmoid_ * raw_scores[i])));
}
} else {
#pragma omp parallel for schedule(static)
for (data_size_t i = 0; i < num_data; ++i) {
out_result[i] = raw_scores[i];
out_result[i] = static_cast<double>(raw_scores[i]);
}
}
......
......@@ -107,13 +107,21 @@ public:
*/
virtual const score_t* GetTrainingScore(int64_t* out_len) override;
/*!
* \brief Get number of prediction results at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
* \return num_data * num_class of the selected data, i.e. the buffer length needed by GetPredictAt
*/
virtual int64_t GetNumPredictAt(int data_idx) const override {
  CHECK(data_idx >= 0 && data_idx <= static_cast<int>(valid_score_updater_.size()));
  // index 0 is the training data; index k > 0 is the (k - 1)-th validation score updater
  data_size_t num_data = train_data_->num_data();
  if (data_idx > 0) {
    num_data = valid_score_updater_[data_idx - 1]->num_data();
  }
  // Widen before multiplying: the operands may be narrower than int64_t, and
  // multiplying them in their own type could overflow before the conversion
  // to the 64-bit return type.
  return static_cast<int64_t>(num_data) * num_class_;
}
/*!
* \brief Get prediction result at data_idx data
* \param data_idx 0: training data, 1: 1st validation data
* \param out_result used to store prediction result, memory should be allocated before calling this function
* \param out_len length of returned score
*/
void GetPredictAt(int data_idx, score_t* out_result, int64_t* out_len) override;
void GetPredictAt(int data_idx, double* out_result, int64_t* out_len) override;
/*!
* \brief Prediction for one record without sigmoid transformation
......
This diff is collapsed.
......@@ -175,9 +175,9 @@ def test_booster():
is_finished = ctypes.c_int(0)
for i in range(100):
LIB.LGBM_BoosterUpdateOneIter(booster,ctypes.byref(is_finished))
result = np.array([0.0], dtype=np.float32)
result = np.array([0.0], dtype=np.float64)
out_len = ctypes.c_ulong(0)
LIB.LGBM_BoosterGetEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))
LIB.LGBM_BoosterGetEval(booster, 0, ctypes.byref(out_len), result.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))
print ('%d Iteration test AUC %f' %(i, result[0]))
LIB.LGBM_BoosterSaveModel(booster, -1, c_str('model.txt'))
LIB.LGBM_BoosterFree(booster)
......@@ -192,7 +192,7 @@ def test_booster():
data.append( [float(x) for x in line.split('\t')[1:]] )
inp.close()
mat = np.array(data)
preb = np.zeros(mat.shape[0], dtype=np.float32)
preb = np.zeros(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_long()
data = np.array(mat.reshape(mat.size), copy=False)
LIB.LGBM_BoosterPredictForMat(booster2,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment