Commit d8ecdaf5 authored by Guolin Ke
Browse files

change metric name. update some comments

parent f059d0fe
...@@ -187,8 +187,7 @@ class Predictor(object): ...@@ -187,8 +187,7 @@ class Predictor(object):
""""A Predictor of LightGBM. """"A Predictor of LightGBM.
""" """
def __init__(self,model_file=None, params=None, booster_handle=None, is_manage_handle=True): def __init__(self,model_file=None, params=None, booster_handle=None, is_manage_handle=True):
# pylint: disable=invalid-name """Initialize the Predictor.
"""Initialize the Booster.
Parameters Parameters
---------- ----------
...@@ -233,6 +232,29 @@ class Predictor(object): ...@@ -233,6 +232,29 @@ class Predictor(object):
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True): def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
"""
Predict logic
Parameters
----------
data : string/numpy array/scipy.sparse
Data source for prediction
When data is string type, it represents the path of a txt file.
num_iteration :
used iteration for prediction
raw_score : bool
True for predict raw score
pred_leaf : bool
True for predict leaf index
data_has_header : bool
Used for txt data
is_reshape : bool
True for reshape to [nrow, ...]
Returns
-------
Prediction result
"""
if isinstance(data, Dataset): if isinstance(data, Dataset):
raise TypeError("cannot use Dataset instance for prediction, please use raw data instead") raise TypeError("cannot use Dataset instance for prediction, please use raw data instead")
predict_type = C_API_PREDICT_NORMAL predict_type = C_API_PREDICT_NORMAL
...@@ -400,7 +422,7 @@ class Dataset(object): ...@@ -400,7 +422,7 @@ class Dataset(object):
params["max_bin"] = max_bin params["max_bin"] = max_bin
if silent: if silent:
params["verbose"] = 0 params["verbose"] = 0
else: elif "verbose" not in params:
params["verbose"] = 1 params["verbose"] = 1
params_str = dict_to_str(params) params_str = dict_to_str(params)
"""process for reference dataset""" """process for reference dataset"""
...@@ -477,7 +499,7 @@ class Dataset(object): ...@@ -477,7 +499,7 @@ class Dataset(object):
group/query id for each instance. Note: if having group/query id, data should group by this id group/query id for each instance. Note: if having group/query id, data should group by this id
silent : boolean, optional silent : boolean, optional
Whether print messages during construction Whether print messages during construction
other_params: dict, optional params: dict, optional
other parameters other parameters
""" """
return Dataset(data, label=label, max_bin=self.max_bin, reference=self, return Dataset(data, label=label, max_bin=self.max_bin, reference=self,
...@@ -758,7 +780,6 @@ class Booster(object): ...@@ -758,7 +780,6 @@ class Booster(object):
""""A Booster of of LightGBM. """"A Booster of of LightGBM.
""" """
def __init__(self, params=None, train_set=None, model_file=None, silent=False): def __init__(self, params=None, train_set=None, model_file=None, silent=False):
# pylint: disable=invalid-name
"""Initialize the Booster. """Initialize the Booster.
Parameters Parameters
...@@ -769,6 +790,8 @@ class Booster(object): ...@@ -769,6 +790,8 @@ class Booster(object):
training dataset training dataset
model_file : string model_file : string
Path to the model file. Path to the model file.
silent : boolean, optional
Whether print messages during construction
""" """
self.handle = ctypes.c_void_p() self.handle = ctypes.c_void_p()
self.__need_reload_eval_info = True self.__need_reload_eval_info = True
...@@ -777,7 +800,7 @@ class Booster(object): ...@@ -777,7 +800,7 @@ class Booster(object):
params = {} params = {}
if silent: if silent:
params["verbose"] = 0 params["verbose"] = 0
else: elif "verbose" not in params:
params["verbose"] = 1 params["verbose"] = 1
if train_set is not None: if train_set is not None:
"""Training task""" """Training task"""
...@@ -806,6 +829,7 @@ class Booster(object): ...@@ -806,6 +829,7 @@ class Booster(object):
self.__num_class = out_num_class.value self.__num_class = out_num_class.value
"""buffer for inner predict""" """buffer for inner predict"""
self.__inner_predict_buffer = [None] self.__inner_predict_buffer = [None]
self.__is_predicted_cur_iter = [False]
self.__get_eval_info() self.__get_eval_info()
elif model_file is not None: elif model_file is not None:
"""Prediction task""" """Prediction task"""
...@@ -828,6 +852,15 @@ class Booster(object): ...@@ -828,6 +852,15 @@ class Booster(object):
_safe_call(_LIB.LGBM_BoosterFree(self.handle)) _safe_call(_LIB.LGBM_BoosterFree(self.handle))
def add_valid(self, data, name): def add_valid(self, data, name):
"""Add a validation dataset
Parameters
----------
data : Dataset
validation data
name : String
name of validation data
"""
if data.predictor is not self.init_predictor: if data.predictor is not self.init_predictor:
raise Exception("Add validation data failed, you should use same predictor for these data") raise Exception("Add validation data failed, you should use same predictor for these data")
_safe_call(_LIB.LGBM_BoosterAddValidData( _safe_call(_LIB.LGBM_BoosterAddValidData(
...@@ -836,12 +869,23 @@ class Booster(object): ...@@ -836,12 +869,23 @@ class Booster(object):
self.valid_sets.append(data) self.valid_sets.append(data)
self.name_valid_sets.append(name) self.name_valid_sets.append(name)
self.__num_dataset += 1 self.__num_dataset += 1
self.__inner_predict_buffer.append(None)
self.__is_predicted_cur_iter.append(False)
def reset_parameter(self, params, silent=False): def reset_parameter(self, params, silent=False):
"""Reset parameters for booster
Parameters
----------
params : dict
params
silent : boolean, optional
Whether print messages during construction
"""
self.__need_reload_eval_info = True self.__need_reload_eval_info = True
if silent: if silent:
params["verbose"] = 0 params["verbose"] = 0
else: elif "verbose" not in params:
params["verbose"] = 1 params["verbose"] = 1
params_str = dict_to_str(params) params_str = dict_to_str(params)
_safe_call(_LIB.LGBM_BoosterResetParameter( _safe_call(_LIB.LGBM_BoosterResetParameter(
...@@ -864,6 +908,7 @@ class Booster(object): ...@@ -864,6 +908,7 @@ class Booster(object):
------- -------
is_finished, bool is_finished, bool
""" """
"""need reset training data""" """need reset training data"""
if train_set is not None and train_set is not self.train_set: if train_set is not None and train_set is not self.train_set:
if train_set.predictor is not self.init_predictor: if train_set.predictor is not self.init_predictor:
...@@ -878,6 +923,7 @@ class Booster(object): ...@@ -878,6 +923,7 @@ class Booster(object):
_safe_call(_LIB.LGBM_BoosterUpdateOneIter( _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
self.handle, self.handle,
ctypes.byref(is_finished))) ctypes.byref(is_finished)))
self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
return is_finished.value == 1 return is_finished.value == 1
else: else:
grad, hess = fobj(self.__inner_predict(0), self.train_set) grad, hess = fobj(self.__inner_predict(0), self.train_set)
...@@ -891,9 +937,9 @@ class Booster(object): ...@@ -891,9 +937,9 @@ class Booster(object):
and you should group grad and hess in this way as well and you should group grad and hess in this way as well
Parameters Parameters
---------- ----------
grad : 1d numpy or list grad : 1d numpy or 1d list
The first order of gradient. The first order of gradient.
hess : 1d numpy or list hess : 1d numpy or 1d list
The second order of gradient. The second order of gradient.
Returns Returns
...@@ -922,11 +968,16 @@ class Booster(object): ...@@ -922,11 +968,16 @@ class Booster(object):
grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
hess.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), hess.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
ctypes.byref(is_finished))) ctypes.byref(is_finished)))
self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
return is_finished.value == 1 return is_finished.value == 1
def rollback_one_iter(self): def rollback_one_iter(self):
"""
Rollback one iteration
"""
_safe_call(_LIB.LGBM_BoosterRollbackOneIter( _safe_call(_LIB.LGBM_BoosterRollbackOneIter(
self.handle)) self.handle))
self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
def current_iteration(self): def current_iteration(self):
out_cur_iter = ctypes.c_int64(0) out_cur_iter = ctypes.c_int64(0)
...@@ -946,8 +997,8 @@ class Booster(object): ...@@ -946,8 +997,8 @@ class Booster(object):
Custom evaluation function. Custom evaluation function.
Returns Returns
------- -------
result: str result: list
Evaluation result string. Evaluation result list.
""" """
if not isinstance(data, Dataset): if not isinstance(data, Dataset):
raise TypeError("Can only eval for Dataset instance") raise TypeError("Can only eval for Dataset instance")
...@@ -977,7 +1028,7 @@ class Booster(object): ...@@ -977,7 +1028,7 @@ class Booster(object):
Returns Returns
------- -------
result: str result: str
Evaluation result string. Evaluation result list.
""" """
return self.__inner_eval("training", 0, feval) return self.__inner_eval("training", 0, feval)
...@@ -992,29 +1043,67 @@ class Booster(object): ...@@ -992,29 +1043,67 @@ class Booster(object):
Returns Returns
------- -------
result: str result: str
Evaluation result string. Evaluation result list.
""" """
ret = [] ret = []
for i in range(1, self.__num_dataset): for i in range(1, self.__num_dataset):
ret.append(self.__inner_eval(self.name_valid_sets[i-1], i, feval)) ret.extend(self.__inner_eval(self.name_valid_sets[i-1], i, feval))
return '\n'.join(ret) return ret
def save_model(self, filename, num_iteration=-1): def save_model(self, filename, num_iteration=-1):
"""Save model of booster to file
Parameters
----------
filename : str
filename to save
num_iteration: int
number of iterations to save; a value < 0 means save all
"""
_safe_call(_LIB.LGBM_BoosterSaveModel( _safe_call(_LIB.LGBM_BoosterSaveModel(
self.handle, self.handle,
num_iteration, num_iteration,
c_str(filename))) c_str(filename)))
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True): def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
"""
Predict logic
Parameters
----------
data : string/numpy array/scipy.sparse
Data source for prediction
When data is string type, it represents the path of a txt file.
num_iteration :
used iteration for prediction
raw_score : bool
True for predict raw score
pred_leaf : bool
True for predict leaf index
data_has_header : bool
Used for txt data
is_reshape : bool
True for reshape to [nrow, ...]
Returns
-------
Prediction result
"""
predictor = Predictor(booster_handle=self.handle, is_manage_handle=False) predictor = Predictor(booster_handle=self.handle, is_manage_handle=False)
return predictor.predict(data, num_iteration, raw_score, pred_leaf, data_has_header, is_reshape) return predictor.predict(data, num_iteration, raw_score, pred_leaf, data_has_header, is_reshape)
def to_predictor(self): def to_predictor(self):
"""Convert to predictor
Note: Predictor will manage the handle after doing this
"""
predictor = Predictor(booster_handle=self.handle, is_manage_handle=True) predictor = Predictor(booster_handle=self.handle, is_manage_handle=True)
self.__is_manage_handle = False self.__is_manage_handle = False
return predictor return predictor
def __inner_eval(self, data_name, data_idx, feval=None): def __inner_eval(self, data_name, data_idx, feval=None):
"""
Evaluate training or validation data
"""
if data_idx >= self.__num_dataset: if data_idx >= self.__num_dataset:
raise ValueError("data_idx should be smaller than number of dataset") raise ValueError("data_idx should be smaller than number of dataset")
self.__get_eval_info() self.__get_eval_info()
...@@ -1030,7 +1119,7 @@ class Booster(object): ...@@ -1030,7 +1119,7 @@ class Booster(object):
if tmp_out_len.value != self.__num_inner_eval: if tmp_out_len.value != self.__num_inner_eval:
raise ValueError("incorrect number of eval results") raise ValueError("incorrect number of eval results")
for i in range(self.__num_inner_eval): for i in range(self.__num_inner_eval):
ret.append('%s %s : %f' %(data_name, self.__name_inner_eval[i], result[i])) ret.append((data_name, self.__name_inner_eval[i], result[i]))
if feval is not None: if feval is not None:
if data_idx == 0: if data_idx == 0:
cur_data = self.train_set cur_data = self.train_set
...@@ -1038,14 +1127,17 @@ class Booster(object): ...@@ -1038,14 +1127,17 @@ class Booster(object):
cur_data = self.valid_sets[data_idx - 1] cur_data = self.valid_sets[data_idx - 1]
feval_ret = feval(self.__inner_predict(data_idx), cur_data) feval_ret = feval(self.__inner_predict(data_idx), cur_data)
if isinstance(feval_ret, list): if isinstance(feval_ret, list):
for name, val in feval_ret: for eval_name, val in feval_ret:
ret.append('%s %s : %f' % (data_name, name, val)) ret.append((data_name, eval_name, val))
else: else:
name, val = feval_ret eval_name, val = feval_ret
ret.append('%s %s : %f' % (data_name, name, val)) ret.append((data_name, eval_name, val))
return '\t'.join(ret) return ret
def __inner_predict(self, data_idx): def __inner_predict(self, data_idx):
"""
Predict for training and validation dataset
"""
if data_idx >= self.__num_dataset: if data_idx >= self.__num_dataset:
raise ValueError("data_idx should be smaller than number of dataset") raise ValueError("data_idx should be smaller than number of dataset")
if self.__inner_predict_buffer[data_idx] is None: if self.__inner_predict_buffer[data_idx] is None:
...@@ -1055,6 +1147,8 @@ class Booster(object): ...@@ -1055,6 +1147,8 @@ class Booster(object):
num_data = self.valid_sets[data_idx - 1].num_data() * self.__num_class num_data = self.valid_sets[data_idx - 1].num_data() * self.__num_class
self.__inner_predict_buffer[data_idx] = \ self.__inner_predict_buffer[data_idx] = \
np.array([0.0 for _ in range(num_data)], dtype=np.float32, copy=False) np.array([0.0 for _ in range(num_data)], dtype=np.float32, copy=False)
"""avoid predicting more than once in the same iteration"""
if not self.__is_predicted_cur_iter[data_idx]:
tmp_out_len = ctypes.c_int64(0) tmp_out_len = ctypes.c_int64(0)
data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_float)) data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_float))
_safe_call(_LIB.LGBM_BoosterGetPredict( _safe_call(_LIB.LGBM_BoosterGetPredict(
...@@ -1064,9 +1158,13 @@ class Booster(object): ...@@ -1064,9 +1158,13 @@ class Booster(object):
data_ptr)) data_ptr))
if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]): if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError("incorrect number of predict results for data %d" %(data_idx) ) raise ValueError("incorrect number of predict results for data %d" %(data_idx) )
self.__is_predicted_cur_iter[data_idx] = True
return self.__inner_predict_buffer[data_idx] return self.__inner_predict_buffer[data_idx]
def __get_eval_info(self): def __get_eval_info(self):
"""
Get inner evaluation count and names
"""
if self.__need_reload_eval_info: if self.__need_reload_eval_info:
self.__need_reload_eval_info = False self.__need_reload_eval_info = False
out_num_eval = ctypes.c_int64(0) out_num_eval = ctypes.c_int64(0)
......
...@@ -116,7 +116,7 @@ public: ...@@ -116,7 +116,7 @@ public:
} }
inline static const char* Name() { inline static const char* Name() {
return "log loss"; return "logloss";
} }
}; };
/*! /*!
...@@ -135,7 +135,7 @@ public: ...@@ -135,7 +135,7 @@ public:
} }
inline static const char* Name() { inline static const char* Name() {
return "error rate"; return "error";
} }
}; };
...@@ -160,7 +160,7 @@ public: ...@@ -160,7 +160,7 @@ public:
} }
void Init(const Metadata& metadata, data_size_t num_data) override { void Init(const Metadata& metadata, data_size_t num_data) override {
name_.emplace_back("AUC"); name_.emplace_back("auc");
num_data_ = num_data; num_data_ = num_data;
// get label // get label
......
...@@ -109,7 +109,7 @@ public: ...@@ -109,7 +109,7 @@ public:
} }
inline static const char* Name() { inline static const char* Name() {
return "multi error"; return "multi_error";
} }
}; };
...@@ -129,7 +129,7 @@ public: ...@@ -129,7 +129,7 @@ public:
} }
inline static const char* Name() { inline static const char* Name() {
return "multi logloss"; return "multi_logloss";
} }
}; };
......
...@@ -35,7 +35,7 @@ public: ...@@ -35,7 +35,7 @@ public:
} }
void Init(const Metadata& metadata, data_size_t num_data) override { void Init(const Metadata& metadata, data_size_t num_data) override {
for (auto k : eval_at_) { for (auto k : eval_at_) {
name_.emplace_back(std::string("NDCG@") + std::to_string(k)); name_.emplace_back(std::string("ndcg@") + std::to_string(k));
} }
num_data_ = num_data; num_data_ = num_data;
// get label // get label
......
...@@ -101,7 +101,7 @@ public: ...@@ -101,7 +101,7 @@ public:
} }
inline static const char* Name() { inline static const char* Name() {
return "l2 loss"; return "l2";
} }
}; };
...@@ -114,7 +114,7 @@ public: ...@@ -114,7 +114,7 @@ public:
return std::fabs(score - label); return std::fabs(score - label);
} }
inline static const char* Name() { inline static const char* Name() {
return "l1 loss"; return "l1";
} }
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment