Commit 629fc047 authored by Guolin Ke's avatar Guolin Ke
Browse files

more flexible python basic object

parent b41e0f0a
......@@ -37,6 +37,7 @@ public:
/*!
* \brief Merge model from other boosting object
Will insert to the front of current boosting object
* \param other The other boosting object to merge from
*/
virtual void MergeFrom(const Boosting* other) = 0;
......
......@@ -126,16 +126,27 @@ C_API_DTYPE_INT64 =3
"""Matric is row major in python"""
C_API_IS_ROW_MAJOR =1
C_API_PREDICT_NORMAL =0
C_API_PREDICT_RAW_SCORE =1
C_API_PREDICT_LEAF_INDEX =2
FIELD_TYPE_MAPPER = {"label":C_API_DTYPE_FLOAT32,
"wegiht":C_API_DTYPE_FLOAT32,
"init_score":C_API_DTYPE_FLOAT32,
"group_id":C_API_DTYPE_INT32,
"group":C_API_DTYPE_INT32,
}
def c_float_array(data):
"""Convert numpy array / list to c float array."""
if isinstance(data, list):
data = np.array(data, copy=False)
if is_numpy_1d_array(data):
if data.dtype == np.float32:
ptr_data = data.ctypes.data_as(ctypes.c_float)
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
type_data = C_API_DTYPE_FLOAT32
elif data.dtype == np.float64:
ptr_data = data.ctypes.data_as(ctypes.c_double)
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
type_data = C_API_DTYPE_FLOAT64
else:
raise TypeError("expected np.float32 or np.float64, met type({})".format(data.dtype))
......@@ -149,10 +160,10 @@ def c_int_array(data):
data = np.array(data, copy=False)
if is_numpy_1d_array(data):
if data.dtype == np.int32:
ptr_data = data.ctypes.data_as(ctypes.c_int32)
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
type_data = C_API_DTYPE_INT32
elif data.dtype == np.int64:
ptr_data = data.ctypes.data_as(ctypes.c_int64)
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
type_data = C_API_DTYPE_INT64
else:
raise TypeError("expected np.int32 or np.int64, met type({})".format(data.dtype))
......@@ -160,19 +171,188 @@ def c_int_array(data):
raise TypeError("Unknow type({})".format(type(data).__name__))
return (ptr_data, type_data)
class Predictor(object):
    """A Predictor of LightGBM.

    Wraps a native booster handle and runs prediction over text files,
    2-D numpy arrays and scipy CSR matrices.
    """
    def __init__(self, model_file=None, params=None, booster_handle=None, is_manage_handle=True):
        # pylint: disable=invalid-name
        """Initialize the Predictor.

        Parameters
        ----------
        model_file : string, optional
            Path to the model file; mutually exclusive with booster_handle.
        params : dict, optional
            Parameters for boosters (currently unused here).
        booster_handle : ctypes.c_void_p, optional
            An existing booster handle to wrap instead of loading a file.
        is_manage_handle : bool
            Whether this object owns booster_handle and frees it on delete.
        """
        self.handle = ctypes.c_void_p()
        self.__is_manage_handle = True
        if model_file is not None:
            # Prediction task: create a booster from the model file.
            out_num_total_model = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(model_file),
                ctypes.byref(out_num_total_model),
                ctypes.byref(self.handle)))
            self.__num_total_model = out_num_total_model.value
            tmp_out_len = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(tmp_out_len)))
            self.num_class = tmp_out_len.value
        elif booster_handle is not None:
            # Wrap an existing handle (e.g. one shared with a Booster object).
            self.__is_manage_handle = is_manage_handle
            self.handle = booster_handle
            tmp_out_len = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(tmp_out_len)))
            self.num_class = tmp_out_len.value
            _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
                self.handle,
                ctypes.byref(tmp_out_len)))
            self.__num_total_model = self.num_class * tmp_out_len.value
        else:
            raise TypeError('Need Model file to create a booster')

    def __del__(self):
        # Only free the native handle if this object owns it.
        if self.__is_manage_handle:
            _safe_call(_LIB.LGBM_BoosterFree(self.handle))

    def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
        """Predict for data.

        Parameters
        ----------
        data : string / numpy 2-D array / scipy.sparse.csr_matrix
            A file path or a raw feature matrix; Dataset is not accepted.
        num_iteration : int
            Number of iterations to use; <= 0 means use all.
        raw_score : bool
            Output raw (untransformed) scores.
        pred_leaf : bool
            Output leaf indices instead of scores (takes precedence over
            raw_score when both are set).
        data_has_header : bool
            Only used for file input: whether the file has a header line.
        is_reshape : bool
            Reshape the flat result to (nrow, ncol) when possible.

        Returns
        -------
        preds : numpy array of predictions
        """
        if isinstance(data, Dataset):
            raise TypeError("cannot use Dataset instance for prediction, please use raw data instead")
        predict_type = C_API_PREDICT_NORMAL
        if raw_score:
            predict_type = C_API_PREDICT_RAW_SCORE
        if pred_leaf:
            predict_type = C_API_PREDICT_LEAF_INDEX
        int_data_has_header = 1 if data_has_header else 0
        if is_str(data):
            # File input: the C API writes predictions to a temp file that
            # is parsed back (tab-separated floats, one row per line).
            tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name
            _safe_call(_LIB.LGBM_BoosterPredictForFile(
                self.handle,
                c_str(data),
                int_data_has_header,
                predict_type,
                num_iteration,
                c_str(tmp_pred_fname)))
            # bug fix: close the file handle instead of leaking it
            with open(tmp_pred_fname, "r") as tmp_file:
                lines = tmp_file.readlines()
            nrow = len(lines)
            preds = []
            for line in lines:
                for token in line.split('\t'):
                    preds.append(float(token))
            preds = np.array(preds)
            os.remove(tmp_pred_fname)
        elif isinstance(data, scipy.sparse.csr_matrix):
            preds, nrow = self.__pred_for_csr(data, num_iteration, predict_type)
        elif isinstance(data, np.ndarray):
            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
        else:
            # Last resort: try to coerce anything else to CSR.
            try:
                csr = scipy.sparse.csr_matrix(data)
                # bug fix: bind the result to preds/nrow (was assigned to a
                # dead local 'res', causing a NameError below)
                preds, nrow = self.__pred_for_csr(csr, num_iteration, predict_type)
            except BaseException:
                raise TypeError('can not predict data for type {}'.format(type(data).__name__))
        if pred_leaf:
            preds = preds.astype(np.int32)
        if preds.size != nrow and is_reshape:
            if preds.size % nrow == 0:
                ncol = int(preds.size / nrow)
                preds = preds.reshape(nrow, ncol)
            else:
                raise ValueError('len of predict result(%d) cannot be divide nrow(%d)' %(preds.size, nrow) )
        return preds

    def __pred_for_np2d(self, mat, num_iteration, predict_type):
        """
        Predict for a 2-D numpy matrix.
        """
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray must be 2 dimensional')
        if mat.dtype == np.float32 or mat.dtype == np.float64:
            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
        else:
            # change non-float data to float data, need to copy
            data = np.array(mat.reshape(mat.size), dtype=np.float32)
        ptr_data, type_ptr_data = c_float_array(data)
        n_preds = self.num_class * mat.shape[0]
        if predict_type == C_API_PREDICT_LEAF_INDEX:
            if num_iteration > 0:
                n_preds *= num_iteration
            else:
                # bug fix: integer division so n_preds stays an int for np.zeros
                used_iteration = self.__num_total_model // self.num_class
                n_preds *= used_iteration
        preds = np.zeros(n_preds, dtype=np.float32)
        out_num_preds = ctypes.c_int64(0)
        # bug fix: the loaded library is module-private (_LIB), not LIB
        _safe_call(_LIB.LGBM_BoosterPredictForMat(
            self.handle,
            ptr_data,
            type_ptr_data,
            mat.shape[0],
            mat.shape[1],
            C_API_IS_ROW_MAJOR,
            predict_type,
            num_iteration,
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
            ))
        if n_preds != out_num_preds.value:
            raise ValueError("incorrect number for predict result")
        return preds, mat.shape[0]

    def __pred_for_csr(self, csr, num_iteration, predict_type):
        """
        Predict for a csr data
        """
        nrow = len(csr.indptr) - 1
        n_preds = self.num_class * nrow
        if predict_type == C_API_PREDICT_LEAF_INDEX:
            if num_iteration > 0:
                n_preds *= num_iteration
            else:
                # bug fix: integer division so n_preds stays an int for np.zeros
                used_iteration = self.__num_total_model // self.num_class
                n_preds *= used_iteration
        preds = np.zeros(n_preds, dtype=np.float32)
        out_num_preds = ctypes.c_int64(0)
        ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
        ptr_data, type_ptr_data = c_float_array(csr.data)
        # bug fix: the loaded library is module-private (_LIB), not LIB
        _safe_call(_LIB.LGBM_BoosterPredictForCSR(
            self.handle,
            ptr_indptr,
            type_ptr_indptr,
            csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            type_ptr_data,
            len(csr.indptr),
            len(csr.data),
            csr.shape[1],
            predict_type,
            num_iteration,
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
            ))
        if n_preds != out_num_preds.value:
            raise ValueError("incorrect number for predict result")
        return preds, nrow
class Dataset(object):
"""Dataset used in LightGBM.
Dataset is a internal data structure that used by LightGBM
You can construct Dataset from numpy.arrays
"""
_feature_names = None
def __init__(self, data, max_bin=255, reference=None,
label=None, weight=None, group_id=None,
silent=False, feature_names=None,
other_params=None, is_continue_train=False):
def __init__(self, data, label=None, max_bin=255, reference=None,
weight=None, group_id=None, predictor=None,
silent=False, params=None):
"""
Dataset used in LightGBM.
......@@ -181,41 +361,35 @@ class Dataset(object):
data : string/numpy array/scipy.sparse
Data source of Dataset.
When data is string type, it represents the path of txt file,
label : list or numpy 1-D array, optional
Label of the data
max_bin : int, required
max number of discrete bin for features
reference : Other Dataset, optional
If this dataset validation, need to use training data as reference
label : list or numpy 1-D array, optional
Label of the training data.
weight : list or numpy 1-D array , optional
Weight for each instance.
group_id : list or numpy 1-D array , optional
group/query id for each instance. Note: if having group/query id, data should group by this id
silent : boolean, optional
Whether print messages during construction
feature_names : list, optional
Set names for features.
other_params: dict, optional
params: dict, optional
other parameters
"""
if data is None:
self.handle = None
return
"""save raw data for continue train """
if is_continue_train:
self.raw_data = data
else:
self.raw_data = None
self.data_has_header = False
"""process for args"""
params = {}
if params is None:
params = {}
self.max_bin = max_bin
self.predictor = predictor
params["max_bin"] = max_bin
if silent:
params["verbose"] = 0
if other_params:
other_params.update(params)
params = other_params
else:
params["verbose"] = 1
params_str = dict_to_str(params)
"""process for reference dataset"""
ref_dataset = None
......@@ -228,7 +402,7 @@ class Dataset(object):
"""check data has header or not"""
if "has_header" in params or "header" in params:
if params["has_header"].lower() == "true" or params["header"].lower() == "true":
data_has_header = True
self.data_has_header = True
self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_CreateDatasetFromFile(
c_str(data),
......@@ -242,8 +416,6 @@ class Dataset(object):
else:
try:
csr = scipy.sparse.csr_matrix(data)
if self.raw_data is not None:
self.raw_data = csr
self.__init_from_csr(csr)
except:
raise TypeError('can not initialize Dataset from {}'.format(type(data).__name__))
......@@ -253,14 +425,52 @@ class Dataset(object):
self.__group = None
if label is not None:
self.set_label(label)
if self.get_label() is None:
raise ValueError("label should not be None")
if weight is not None:
self.set_weight(weight)
if group_id is not None:
self.set_group_id(group_id)
self.feature_names = feature_names
# load init score
if self.predictor is not None and isinstance(self.predictor, Predictor):
init_score = self.predictor.predict(data,
raw_score=True,
data_has_header=self.data_has_header,
is_reshape=False)
if self.predictor.num_class > 1:
# need re group init score
new_init_score = np.zeros(init_score.size(), dtype=np.float32)
num_data = self.num_data()
for i in range(num_data):
for j in range(self.predictor.num_class):
new_init_score[j * num_data + i] = init_score[i * self.predictor.num_class + j]
init_score = new_init_score
self.set_init_score(init_score)
def new_valid_dataset(self, data, label=None, weight=None, group_id=None,
silent=False, params=None):
"""
Create validation data align with current dataset
def free_raw_data(self):
self.raw_data = None
Parameters
----------
data : string/numpy array/scipy.sparse
Data source of Dataset.
When data is string type, it represents the path of txt file,
label : list or numpy 1-D array, optional
Label of the training data.
weight : list or numpy 1-D array , optional
Weight for each instance.
group_id : list or numpy 1-D array , optional
group/query id for each instance. Note: if having group/query id, data should group by this id
silent : boolean, optional
Whether print messages during construction
other_params: dict, optional
other parameters
"""
return Dataset(data, label=label, max_bin=self.max_bin, reference=self,
weight=weight, group_id=group_id, predictor=self.predictor,
silent=silent, params=params)
def __init_from_np2d(self, mat, params_str, ref_dataset):
"""
......@@ -301,7 +511,7 @@ class Dataset(object):
_safe_call(_LIB.LGBM_CreateDatasetFromCSR(
ptr_indptr,
type_ptr_indptr,
csr.indices.ctypes.data_as(ctypes.c_int32),
csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ptr_data,
type_ptr_data,
len(csr.indptr),
......@@ -327,19 +537,23 @@ class Dataset(object):
info : array
a numpy array of information of the data
"""
out_len = ctypes.c_int32()
tmp_out_len = ctypes.c_int64()
out_type = ctypes.c_int32()
ret = ctypes.POINTER(ctypes.c_void_p)()
_safe_call(_LIB.LGBM_DatasetGetField(
self.handle,
c_str(field_name),
ctypes.byref(out_len),
ctypes.byref(tmp_out_len),
ctypes.byref(ret),
ctypes.byref(out_type)))
if out_type.value != FIELD_TYPE_MAPPER[field_name]:
raise TypeError("Return type error for get_field")
if tmp_out_len.value == 0:
return None
if out_type.value == C_API_DTYPE_INT32:
return cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(c_int32), out_len.value))
return cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value)
elif out_type.value == C_API_DTYPE_FLOAT32:
return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(c_float), out_len.value))
return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
else:
raise TypeError("unknow type")
......@@ -351,19 +565,29 @@ class Dataset(object):
field_name: str
The field name of the information
data: numpy array or list
data: numpy array or list or None
The array ofdata to be set
"""
if data is None:
_safe_call(_LIB.LGBM_DatasetSetField(
self.handle,
c_str(field_name),
None,
0,
FIELD_TYPE_MAPPER[field_name]))
return
if not is_numpy_1d_array(data):
raise TypeError("Unknow type({})".format(type(data).__name__))
if data.dtype == np.float32:
ptr_data = data.ctypes.data_as(ctypes.c_float)
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
type_data = C_API_DTYPE_FLOAT32
elif data.dtype == np.int32:
ptr_data = data.ctypes.data_as(ctypes.c_int32)
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
type_data = C_API_DTYPE_INT32
else:
raise TypeError("excepted np.float32 or np.int32, met type({})".format(data.dtype))
if type_data != FIELD_TYPE_MAPPER[field_name]:
raise TypeError("type error for set_field")
_safe_call(_LIB.LGBM_DatasetSetField(
self.handle,
c_str(field_name),
......@@ -406,9 +630,10 @@ class Dataset(object):
weight : array like
Weight for each data point
"""
weight = list_to_1d_numpy(weight, np.float32)
if weight.dtype != np.float32:
weight = weight.astype(np.float32, copy=False)
if weight is not None:
weight = list_to_1d_numpy(weight, np.float32)
if weight.dtype != np.float32:
weight = weight.astype(np.float32, copy=False)
self.__weight = weight
self.set_field('weight', weight)
......@@ -419,10 +644,11 @@ class Dataset(object):
score: array like
"""
score = list_to_1d_numpy(score, np.float32)
if score.dtype != np.float32:
score = score.astype(np.float32, copy=False)
self.__init_score = init_score
if score is not None:
score = list_to_1d_numpy(score, np.float32)
if score.dtype != np.float32:
score = score.astype(np.float32, copy=False)
self.__init_score = score
self.set_field('init_score', score)
def set_group(self, group):
......@@ -433,9 +659,10 @@ class Dataset(object):
group : array like
Group size of each group
"""
group = list_to_1d_numpy(group, np.int32)
if group.dtype != np.int32:
group = group.astype(np.int32, copy=False)
if group is not None:
group = list_to_1d_numpy(group, np.int32)
if group.dtype != np.int32:
group = group.astype(np.int32, copy=False)
self.__group = group
self.set_field('group', group)
......@@ -448,9 +675,10 @@ class Dataset(object):
group : array like
group_id of Dataset (used for ranking).
"""
group_id = list_to_1d_numpy(group_id, np.int32)
if group_id.dtype != np.int32:
group_id = group_id.astype(np.int32, copy=False)
if group_id is not None:
group_id = list_to_1d_numpy(group_id, np.int32)
if group_id.dtype != np.int32:
group_id = group_id.astype(np.int32, copy=False)
self.set_field('group_id', group_id)
def get_label(self):
......@@ -462,6 +690,8 @@ class Dataset(object):
"""
if self.__label is None:
self.__label = self.get_field('label')
if self.__label is None:
raise TypeError("label should not be None")
return self.__label
def get_weight(self):
......@@ -521,58 +751,11 @@ class Dataset(object):
ctypes.byref(ret)))
return ret.value
@property
def feature_names(self):
    """Feature names (column labels).

    Lazily generated as ``Column_i`` placeholders when none were set.

    Returns
    -------
    feature_names : list
    """
    names = self._feature_names
    if names is None:
        names = ['Column_{0}'.format(idx) for idx in range(self.num_col())]
        self._feature_names = names
    return names
@feature_names.setter
def feature_names(self, feature_names):
    """Set feature names (column labels).

    Parameters
    ----------
    feature_names : list
        Labels for features; must be unique strings, match the column
        count, and avoid characters that break parsing ('[', ']', '<').
    """
    if feature_names is None:
        self._feature_names = None
        return
    # validate feature name
    if not isinstance(feature_names, list):
        feature_names = list(feature_names)
    if len(set(feature_names)) != len(feature_names):
        raise ValueError('feature_names must be unique')
    if len(feature_names) != self.num_col():
        raise ValueError('feature_names must have the same length as data')
    # prohibit to use symbols may affect to parse. e.g. []<
    forbidden = set(('[', ']', '<'))
    for name in feature_names:
        if not isinstance(name, STRING_TYPES) or any(ch in name for ch in forbidden):
            raise ValueError('feature_names may not contain [, ] or <')
    self._feature_names = feature_names
C_API_PREDICT_NORMAL =0
C_API_PREDICT_RAW_SCORE =1
C_API_PREDICT_LEAF_INDEX =2
class Booster(object):
""""A Booster of of LightGBM.
"""
feature_names = None
def __init__(self,params=None,
train_set=None, valid_sets=None,
name_valid_sets=None, model_file=None):
def __init__(self, params=None, train_set=None, model_file=None, silent=False):
# pylint: disable=invalid-name
"""Initialize the Booster.
......@@ -582,83 +765,46 @@ class Booster(object):
Parameters for boosters.
train_set : Dataset
training dataset
valid_sets : List of Dataset or None
validation datasets
name_valid_sets : List of string
name of validation datasets
model_file : string
Path to the model file.
If tarin_set is not None, used for continued train.
else used for loading model prediction task
"""
self.handle = ctypes.c_void_p()
self.__need_reload_eval_info = True
self.__is_manage_handle = True
if params is None:
params = {}
if silent:
params["verbose"] = 0
else:
params["verbose"] = 1
if train_set is not None:
"""Training task"""
if not isinstance(train_set, Dataset):
raise TypeError('training data should be Dataset instance, met{}'.format(type(train_set).__name__))
valid_handles = None
n_valid = 0
if valid_sets is not None:
for valid in valid_sets:
if not isinstance(valid, Dataset):
raise TypeError('valid data should be Dataset instance, met{}'.format(type(valid).__name__))
valid_handles = c_array(ctypes.c_void_p, [valid.handle for valid in valid_sets])
if name_valid_sets is None:
name_valid_sets = ["valid_{}".format(x+1) for x in range(len(valid_sets)) ]
if len(valid_sets) != len(name_valid_sets):
raise Exception('len of valid_sets should be equal with len of name_valid_sets')
n_valid = len(valid_sets)
ref_input_model = None
params_str = dict_to_str(params)
if model_file is not None:
ref_input_model = c_str(model_file)
"""construct booster object"""
_safe_call(_LIB.LGBM_BoosterCreate(
train_set.handle,
valid_handles,
n_valid,
c_str(params_str),
ref_input_model,
ctypes.byref(self.handle)))
"""if need to continue train"""
if model_file is not None:
self.__init_continue_train(train_set)
if valid_sets is not None:
for valid in valid_sets:
self.__init_continue_train(valid)
"""save reference to data"""
self.train_set = train_set
self.valid_sets = valid_sets
self.name_valid_sets = name_valid_sets
self.__num_dataset = 1 + n_valid
self.__training_score = None
out_len = ctypes.c_int64(0)
self.valid_sets = []
self.name_valid_sets = []
self.__num_dataset = 1
self.init_predictor = train_set.predictor
if self.init_predictor is not None:
_safe_call(_LIB.LGBM_BoosterMerge(
self.handle,
self.init_predictor.handle))
out_num_class = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterGetNumClasses(
self.handle,
ctypes.byref(out_len)))
self.__num_class = out_len.value
ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value
"""buffer for inner predict"""
self.__inner_predict_buffer = [None for _ in range(self.__num_dataset)]
"""Get num of inner evals"""
_safe_call(_LIB.LGBM_BoosterGetEvalCounts(
self.handle,
ctypes.byref(out_len)))
self.__num_inner_eval = out_len.value
if self.__num_inner_eval > 0:
"""Get name of evals"""
string_buffers = [ctypes.create_string_buffer(255) for i in range(self.__num_inner_eval)]
ptr_string_buffers = (ctypes.c_char_p*self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
_safe_call(_LIB.LGBM_BoosterGetEvalNames(
self.handle,
ctypes.byref(out_len),
ptr_string_buffers))
if self.__num_inner_eval != out_len.value:
raise ValueError("size of eval names doesn't equal with num_evals")
self.__name_inner_eval = []
for i in range(self.__num_inner_eval):
self.__name_inner_eval.append(string_buffers[i].value.decode())
self.__inner_predict_buffer = [None]
self.__get_eval_info()
elif model_file is not None:
"""Prediction task"""
out_num_total_model = ctypes.c_int64(0)
......@@ -667,18 +813,40 @@ class Booster(object):
ctypes.byref(out_num_total_model),
ctypes.byref(self.handle)))
self.__num_total_model = out_num_total_model.value
out_len = ctypes.c_int64(0)
out_num_class = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterGetNumClasses(
self.handle,
ctypes.byref(out_len)))
self.__num_class = out_len.value
ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value
else:
raise TypeError('At least need training dataset or model file to create booster instance')
def __del__(self):
_safe_call(_LIB.LGBM_BoosterFree(self.handle))
if self.handle is not None and self.__is_manage_handle:
_safe_call(_LIB.LGBM_BoosterFree(self.handle))
def add_valid_data(self, data, name):
if data.predictor is not self.init_predictor:
raise Exception("Add validation data failed, you should use same predictor for these data")
_safe_call(_LIB.LGBM_BoosterAddValidData(
self.handle,
data.handle))
self.valid_sets.append(data)
self.name_valid_sets.append(name)
self.__num_dataset += 1
def update(self, fobj=None):
def ResetParameter(self, params, silent=False):
self.__need_reload_eval_info = True
if silent:
params["verbose"] = 0
else:
params["verbose"] = 1
params_str = dict_to_str(params)
_safe_call(_LIB.LGBM_BoosterResetParameter(
self.handle,
c_str(params_str)))
def update(self, train_set=None, fobj=None):
"""
Update for one iteration
Note: for multi-class task, the score is group by class_id first, then group by row_id
......@@ -686,6 +854,7 @@ class Booster(object):
and you should group grad and hess in this way as well
Parameters
----------
train_set : training data, None means use last training data
fobj : function
Customized objective function.
......@@ -693,6 +862,15 @@ class Booster(object):
-------
is_finished, bool
"""
"""need reset training data"""
if train_set is not None and train_set is not self.train_set:
if train_set.predictor is not self.init_predictor:
raise Exception("Replace training data failed, you should use same predictor for these data")
self.train_set = train_set
_safe_call(_LIB.LGBM_BoosterResetTrainingData(
self.handle,
self.train_set.handle))
self.__inner_predict_buffer[0] = None
is_finished = ctypes.c_int(0)
if fobj is None:
_safe_call(_LIB.LGBM_BoosterUpdateOneIter(
......@@ -701,9 +879,9 @@ class Booster(object):
return is_finished.value == 1
else:
grad, hess = fobj(self.__inner_predict(0), self.train_set)
return self.boost(grad, hess)
return self.__boost(grad, hess)
def boost(self, grad, hess):
def __boost(self, grad, hess):
"""
Boost the booster for one iteration, with customized gradient statistics.
Note: for multi-class task, the score is group by class_id first, then group by row_id
......@@ -729,11 +907,53 @@ class Booster(object):
is_finished = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
self.handle,
grad.ctypes.data_as(ctypes.c_float),
hess.ctypes.data_as(ctypes.c_float),
grad.ctypes.data_as(ctypes.ctypes.POINTER(ctypes.c_float)),
hess.ctypes.data_as(ctypes.ctypes.POINTER(ctypes.c_float)),
ctypes.byref(is_finished)))
return is_finished.value == 1
def rollback_one_iter(self):
_safe_call(_LIB.LGBM_BoosterRollbackOneIter(
self.handle))
def current_iteration(self):
out_cur_iter = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
self.handle,
ctypes.byref(out_cur_iter)))
return out_cur_iter.value
def eval(self, data, name, feval=None):
"""Evaluate for data
Parameters
----------
data : Dataset object
name : name of data
feval : function
Custom evaluation function.
Returns
-------
result: str
Evaluation result string.
"""
if not isinstance(data, Dataset):
raise TypeError("Can only eval for Dataset instance")
data_idx = -1
if data is self.train_set:
data_idx = 0
else:
for i in range(len(self.valid_sets)):
if data is self.valid_sets[i]:
data_idx = i + 1
break
"""need push new valid data"""
if data_idx == -1:
self.add_valid_data(data, name)
data_idx = self.__num_dataset - 1
return self.__inner_eval(name, data_idx, feval)
def eval_train(self, feval=None):
"""Evaluate for training data
......@@ -774,141 +994,28 @@ class Booster(object):
c_str(filename)))
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
if isinstance(data, Dataset):
raise TypeError("cannot use Dataset instance for prediction, please use raw data instead")
predict_type = C_API_PREDICT_NORMAL
if raw_score:
predict_type = cC_API_PREDICT_RAW_SCORE
if pred_leaf:
predict_type = C_API_PREDICT_LEAF_INDEX
int_data_has_header = 0
if data_has_header:
int_data_has_header = 1
if is_str(data):
tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name
_safe_call(_LIB.LGBM_BoosterPredictForFile(
self.handle,
c_str(data),
int_data_has_header,
predict_type,
num_iteration,
c_str(tmp_pred_fname)))
lines = open(tmp_pred_fname,"r").readlines()
nrow = len(lines)
preds = []
for line in lines:
for token in line.split('\t'):
preds.append(float(token))
preds = np.array(preds, copy=False)
os.remove(tmp_pred_fname)
elif isinstance(data, scipy.sparse.csr_matrix):
preds, nrow = self.__pred_for_csr(data, num_iteration, predict_type)
elif isinstance(data, np.ndarray):
preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
else:
try:
csr = scipy.sparse.csr_matrix(data)
res = self.__pred_for_csr(csr, num_iteration, predict_type)
except:
raise TypeError('can not predict data for type {}'.format(type(data).__name__))
if pred_leaf:
preds = preds.astype(np.int32)
if preds.size != nrow and is_reshape:
if preds.size % nrow == 0:
ncol = int(preds.size / nrow)
preds = preds.reshape(nrow, ncol)
else:
raise ValueError('len of predict result(%d) cannot be divide nrow(%d)' %(preds.size, nrow) )
return preds
predictor = Predictor(booster_handle=self.handle, is_manage_handle=False)
return predictor.predict(data, num_iteration, raw_score, pred_leaf, data_has_header, is_reshape)
def __pred_for_np2d(self, mat, num_iteration, predict_type):
"""
Predict for a 2-D numpy matrix.
"""
if len(mat.shape) != 2:
raise ValueError('Input numpy.ndarray must be 2 dimensional')
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
"""change non-float data to float data, need to copy"""
data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data = c_float_array(data)
n_preds = self.__num_class * mat.shape[0]
if predict_type == C_API_PREDICT_LEAF_INDEX:
if num_iteration > 0:
n_preds *= num_iteration
else:
used_iteration = self.__num_total_model / self.__num_class
n_preds *= used_iteration
preds = np.zeros(n_preds, dtype=np.float32)
out_num_preds = ctypes.c_int64(0)
_safe_call(LIB.LGBM_BoosterPredictForMat(
self.handle,
ptr_data,
type_ptr_data,
mat.shape[0],
mat.shape[1],
C_API_IS_ROW_MAJOR,
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
))
if n_preds != out_num_preds.value:
raise ValueError("incorrect number for predict result")
return preds, mat.shape[0]
def __pred_for_csr(self, csr, num_iteration, predict_type):
"""
Predict for a csr data
"""
nrow = len(csr.indptr) - 1
n_preds = self.__num_class * nrow
if predict_type == C_API_PREDICT_LEAF_INDEX:
if num_iteration > 0:
n_preds *= num_iteration
else:
used_iteration = self.__num_total_model / self.__num_class
n_preds *= used_iteration
preds = np.zeros(n_preds, dtype=np.float32)
out_num_preds = ctypes.c_int64(0)
ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
ptr_data, type_ptr_data = c_float_array(csr.data)
_safe_call(LIB.LGBM_BoosterPredictForCSR(
self.handle,
ptr_indptr,
type_ptr_indptr,
csr.indices.ctypes.data_as(ctypes.c_int32),
ptr_data,
type_ptr_data,
len(csr.indptr),
len(csr.data),
csr.shape[1],
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
))
if n_preds != out_num_preds.value:
raise ValueError("incorrect number for predict result")
return preds, nrow
def to_predictor(self):
predictor = Predictor(booster_handle=self.handle, is_manage_handle=True)
self.__is_manage_handle = False
return predictor
def __inner_eval(self, data_name, data_idx, feval=None):
if data_idx >= self.__num_dataset:
raise ValueError("data_idx should be smaller than number of dataset")
self.__get_eval_info()
ret = []
if self.__num_inner_eval > 0:
result = np.array([0.0 for _ in range(self.__num_inner_eval)], dtype=np.float32)
out_len = ctypes.c_int64(0)
tmp_out_len = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterGetEval(
self.handle,
data_idx,
ctypes.byref(out_len),
ctypes.byref(tmp_out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
if out_len.value != self.__num_inner_eval:
if tmp_out_len.value != self.__num_inner_eval:
raise ValueError("incorrect number of eval results")
for i in range(self.__num_inner_eval):
ret.append('%s %s : %f' %(data_name, self.__name_inner_eval[i], result[i]))
......@@ -936,33 +1043,37 @@ class Booster(object):
num_data = self.valid_sets[data_idx - 1].num_data() * self.__num_class
self.__inner_predict_buffer[data_idx] = \
np.array([0.0 for _ in range(num_data)], dtype=np.float32, copy=False)
out_len = ctypes.c_int64(0)
tmp_out_len = ctypes.c_int64(0)
data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_float))
_safe_call(_LIB.LGBM_BoosterGetPredict(
self.handle,
data_idx,
ctypes.byref(out_len),
ctypes.byref(tmp_out_len),
data_ptr))
if out_len.value != len(self.__inner_predict_buffer[data_idx]):
if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError("incorrect number of predict results for data %d" %(data_idx) )
return self.__inner_predict_buffer[data_idx]
def __init_continue_train(self, dataset):
if dataset.raw_data is None:
raise ValueError("should set is_continue_train=True in dataset while need to continue train")
init_score = self.predict(dataset.raw_data, raw_score=True,data_has_header=dataset.data_has_header, is_reshape=False)
dataset.set_init_score(init_score)
dataset.free_raw_data()
# tmp test: manual smoke test of training + eval on the bundled
# binary-classification example data (run from this file's directory)
train_data = Dataset('../../examples/binary_classification/binary.train')
test_data = Dataset('../../examples/binary_classification/binary.test', reference = train_data)
# track two regression metrics on the validation set
param = {"metric":"l2,l1"}
lgb = Booster(train_set=train_data, valid_sets=[test_data], params=param)
# run 100 boosting iterations, then print eval results and predictions
for i in range(100):
    lgb.update()
print(lgb.eval_valid())
print(lgb.eval_train())
print(lgb.predict('../../examples/binary_classification/binary.train'))
\ No newline at end of file
def __get_eval_info(self):
    """Fetch and cache the count and names of the inner eval metrics.

    Re-queries the native library only when the reload flag is set
    (e.g. after construction or a parameter reset).
    """
    if self.__need_reload_eval_info:
        self.__need_reload_eval_info = False
        out_num_eval = ctypes.c_int64(0)
        """Get num of inner evals"""
        _safe_call(_LIB.LGBM_BoosterGetEvalCounts(
            self.handle,
            ctypes.byref(out_num_eval)))
        self.__num_inner_eval = out_num_eval.value
        if self.__num_inner_eval > 0:
            """Get name of evals"""
            tmp_out_len = ctypes.c_int64(0)
            # one 255-byte buffer per metric; the native side copies each
            # metric name into its buffer through the char* array below
            string_buffers = [ctypes.create_string_buffer(255) for i in range(self.__num_inner_eval)]
            ptr_string_buffers = (ctypes.c_char_p*self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
            _safe_call(_LIB.LGBM_BoosterGetEvalNames(
                self.handle,
                ctypes.byref(tmp_out_len),
                ptr_string_buffers))
            # sanity check: the library must report as many names as metrics
            if self.__num_inner_eval != tmp_out_len.value:
                raise ValueError("size of eval names doesn't equal with num_evals")
            self.__name_inner_eval = []
            for i in range(self.__num_inner_eval):
                # buffers hold NUL-terminated bytes; decode to str
                self.__name_inner_eval.append(string_buffers[i].value.decode())
......@@ -46,12 +46,12 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
gbdt_config_ = config;
early_stopping_round_ = gbdt_config_->early_stopping_round;
shrinkage_rate_ = gbdt_config_->learning_rate;
train_data_ = train_data;
random_ = Random(gbdt_config_->bagging_seed);
// create tree learner
tree_learner_.clear();
for (int i = 0; i < num_class_; ++i) {
auto new_tree_learner = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(gbdt_config_->tree_learner_type, gbdt_config_->tree_config));
new_tree_learner->Init(train_data_);
new_tree_learner->Init(train_data);
// init tree learner
tree_learner_.push_back(std::move(new_tree_learner));
}
......@@ -63,42 +63,45 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
training_metrics_.push_back(metric);
}
training_metrics_.shrink_to_fit();
// create score tracker
train_score_updater_.reset(new ScoreUpdater(train_data_, num_class_));
num_data_ = train_data_->num_data();
// create buffer for gradients and hessians
if (object_function_ != nullptr) {
gradients_ = std::vector<score_t>(num_data_ * num_class_);
hessians_ = std::vector<score_t>(num_data_ * num_class_);
}
sigmoid_ = -1.0f;
if (object_function_ != nullptr
&& std::string(object_function_->GetName()) == std::string("binary")) {
// only binary classification need sigmoid transform
sigmoid_ = gbdt_config_->sigmoid;
}
// get max feature index
max_feature_idx_ = train_data_->num_total_features() - 1;
// get label index
label_idx_ = train_data_->label_idx();
// if need bagging, create buffer
if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
out_of_bag_data_indices_ = std::vector<data_size_t>(num_data_);
bag_data_indices_ = std::vector<data_size_t>(num_data_);
} else {
out_of_bag_data_cnt_ = 0;
out_of_bag_data_indices_.clear();
bag_data_cnt_ = num_data_;
bag_data_indices_.clear();
}
random_ = Random(gbdt_config_->bagging_seed);
// update score
for (int i = 0; i < iter_; ++i) {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
auto curr_tree = i * num_class_ + curr_class;
train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
if (train_data_ != train_data) {
// not same training data, need reset score and others
// create score tracker
train_score_updater_.reset(new ScoreUpdater(train_data, num_class_));
// update score
for (int i = 0; i < iter_; ++i) {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
train_score_updater_->AddScore(models_[curr_tree].get(), curr_class);
}
}
num_data_ = train_data->num_data();
// create buffer for gradients and hessians
if (object_function_ != nullptr) {
gradients_ = std::vector<score_t>(num_data_ * num_class_);
hessians_ = std::vector<score_t>(num_data_ * num_class_);
}
// get max feature index
max_feature_idx_ = train_data->num_total_features() - 1;
// get label index
label_idx_ = train_data->label_idx();
// if need bagging, create buffer
if (gbdt_config_->bagging_fraction < 1.0 && gbdt_config_->bagging_freq > 0) {
out_of_bag_data_indices_ = std::vector<data_size_t>(num_data_);
bag_data_indices_ = std::vector<data_size_t>(num_data_);
} else {
out_of_bag_data_cnt_ = 0;
out_of_bag_data_indices_.clear();
bag_data_cnt_ = num_data_;
bag_data_indices_.clear();
}
}
train_data_ = train_data;
}
void GBDT::AddValidDataset(const Dataset* valid_data,
......@@ -111,7 +114,7 @@ void GBDT::AddValidDataset(const Dataset* valid_data,
// update score
for (int i = 0; i < iter_; ++i) {
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
auto curr_tree = i * num_class_ + curr_class;
auto curr_tree = (i + num_init_iteration_) * num_class_ + curr_class;
new_score_updater->AddScore(models_[curr_tree].get(), curr_class);
}
}
......@@ -232,7 +235,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
void GBDT::RollbackOneIter() {
if (iter_ == 0) { return; }
int cur_iter = iter_ - 1;
int cur_iter = iter_ + num_init_iteration_ - 1;
// reset score
for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
auto curr_tree = cur_iter * num_class_ + curr_class;
......
......@@ -36,12 +36,28 @@ public:
const std::vector<const Metric*>& training_metrics)
override;
/*!
* \brief Merge model from other boosting object
Will insert to the front of current boosting object
* \param other
*/
void MergeFrom(const Boosting* other) override {
auto other_gbdt = reinterpret_cast<const GBDT*>(other);
// tmp move to other vector
auto original_models = std::move(models_);
models_ = std::vector<std::unique_ptr<Tree>>();
// push model from other first
for (const auto& tree : other_gbdt->models_) {
auto new_tree = std::unique_ptr<Tree>(new Tree(*(tree.get())));
models_.push_back(std::move(new_tree));
}
num_init_iteration_ = static_cast<int>(models_.size()) / num_class_;
// push model in current object
for (const auto& tree : original_models) {
auto new_tree = std::unique_ptr<Tree>(new Tree(*(tree.get())));
models_.push_back(std::move(new_tree));
}
num_iteration_for_pred_ = static_cast<int>(models_.size()) / num_class_;
}
/*!
......@@ -266,6 +282,7 @@ protected:
int num_iteration_for_pred_;
/*! \brief Shrinkage rate for one iteration */
double shrinkage_rate_;
/*! \brief Number of loaded initial models */
int num_init_iteration_;
};
......
......@@ -36,7 +36,7 @@ public:
Log::Warning("continued train from model is not support for c_api, \
please use continued train with input score");
}
boosting_.reset(Boosting::CreateBoosting(config_.boosting_type, ""));
boosting_.reset(Boosting::CreateBoosting(config_.boosting_type, nullptr));
ConstructObjectAndTrainingMetrics(train_data);
// initialize the boosting
boosting_->Init(&config_.boosting_config, train_data, objective_fun_.get(),
......@@ -114,6 +114,10 @@ public:
return boosting_->TrainOneIter(gradients, hessians, false);
}
  // Undo the most recent boosting iteration; forwards to the boosting object.
  void RollbackOneIter() {
    boosting_->RollbackOneIter();
  }
void PrepareForPrediction(int num_iteration, int predict_type) {
boosting_->SetNumIterationForPred(num_iteration);
bool is_predict_leaf = false;
......@@ -156,24 +160,13 @@ public:
int idx = 0;
for (const auto& metric : train_metric_) {
for (const auto& name : metric->GetName()) {
int j = 0;
auto name_cstr = name.c_str();
while (name_cstr[j] != '\0') {
out_strs[idx][j] = name_cstr[j];
++j;
}
out_strs[idx][j] = '\0';
std::strcpy(out_strs[idx], name.c_str());
++idx;
}
}
return idx;
}
void RollbackOneIter() {
boosting_->RollbackOneIter();
}
const Boosting* GetBoosting() const { return boosting_.get(); }
private:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment