"...git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "742d72f8bb051105484fd5cca11620493ffb0b2b"
Commit 5b539788 authored by Guolin Ke's avatar Guolin Ke
Browse files

fix some pep8 check

parent 1a8c23ed
...@@ -4,8 +4,6 @@ from __future__ import absolute_import ...@@ -4,8 +4,6 @@ from __future__ import absolute_import
import sys import sys
import os import os
import ctypes import ctypes
import collections
import re
import tempfile import tempfile
import numpy as np import numpy as np
...@@ -59,7 +57,7 @@ def is_1d_list(data): ...@@ -59,7 +57,7 @@ def is_1d_list(data):
if not isinstance(data, list): if not isinstance(data, list):
return False return False
if len(data) > 0: if len(data) > 0:
if not isinstance(data[0], (int, float, bool) ): if not isinstance(data[0], (int, float, bool)):
return False return False
return True return True
...@@ -108,29 +106,29 @@ def param_dict_to_str(data): ...@@ -108,29 +106,29 @@ def param_dict_to_str(data):
if is_str(val): if is_str(val):
pairs.append(str(key)+'='+str(val)) pairs.append(str(key)+'='+str(val))
elif isinstance(val, (list, tuple)): elif isinstance(val, (list, tuple)):
pairs.append(str(key)+'='+','.join(map(str,val))) pairs.append(str(key)+'='+','.join(map(str, val)))
elif isinstance(val, (int, float, bool)): elif isinstance(val, (int, float, bool)):
pairs.append(str(key)+'='+str(val)) pairs.append(str(key)+'='+str(val))
else: else:
raise TypeError('unknow type of parameter:%s , got:%s' %(key, type(val).__name__)) raise TypeError('unknow type of parameter:%s , got:%s'
% (key, type(val).__name__))
return ' '.join(pairs) return ' '.join(pairs)
"""marco definition of data type in c_api of LightGBM""" """marco definition of data type in c_api of LightGBM"""
C_API_DTYPE_FLOAT32 =0 C_API_DTYPE_FLOAT32 = 0
C_API_DTYPE_FLOAT64 =1 C_API_DTYPE_FLOAT64 = 1
C_API_DTYPE_INT32 =2 C_API_DTYPE_INT32 = 2
C_API_DTYPE_INT64 =3 C_API_DTYPE_INT64 = 3
"""Matric is row major in python""" """Matric is row major in python"""
C_API_IS_ROW_MAJOR =1 C_API_IS_ROW_MAJOR = 1
C_API_PREDICT_NORMAL =0 C_API_PREDICT_NORMAL = 0
C_API_PREDICT_RAW_SCORE =1 C_API_PREDICT_RAW_SCORE = 1
C_API_PREDICT_LEAF_INDEX =2 C_API_PREDICT_LEAF_INDEX = 2
FIELD_TYPE_MAPPER = {"label":C_API_DTYPE_FLOAT32, FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
"weight":C_API_DTYPE_FLOAT32, "weight": C_API_DTYPE_FLOAT32,
"init_score":C_API_DTYPE_FLOAT32, "init_score": C_API_DTYPE_FLOAT32,
"group":C_API_DTYPE_INT32, "group": C_API_DTYPE_INT32}
}
def c_float_array(data): def c_float_array(data):
"""Convert numpy array / list to c float array.""" """Convert numpy array / list to c float array."""
...@@ -144,7 +142,8 @@ def c_float_array(data): ...@@ -144,7 +142,8 @@ def c_float_array(data):
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)) ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
type_data = C_API_DTYPE_FLOAT64 type_data = C_API_DTYPE_FLOAT64
else: else:
raise TypeError("expected np.float32 or np.float64, met type({})".format(data.dtype)) raise TypeError("expected np.float32 or np.float64, met type({})"
.format(data.dtype))
else: else:
raise TypeError("Unknow type({})".format(type(data).__name__)) raise TypeError("Unknow type({})".format(type(data).__name__))
return (ptr_data, type_data) return (ptr_data, type_data)
...@@ -161,7 +160,8 @@ def c_int_array(data): ...@@ -161,7 +160,8 @@ def c_int_array(data):
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
type_data = C_API_DTYPE_INT64 type_data = C_API_DTYPE_INT64
else: else:
raise TypeError("expected np.int32 or np.int64, met type({})".format(data.dtype)) raise TypeError("expected np.int32 or np.int64, met type({})"
.format(data.dtype))
else: else:
raise TypeError("Unknow type({})".format(type(data).__name__)) raise TypeError("Unknow type({})".format(type(data).__name__))
return (ptr_data, type_data) return (ptr_data, type_data)
...@@ -169,13 +169,13 @@ def c_int_array(data): ...@@ -169,13 +169,13 @@ def c_int_array(data):
class Predictor(object): class Predictor(object):
""""A Predictor of LightGBM. """"A Predictor of LightGBM.
""" """
def __init__(self,model_file=None, booster_handle=None, is_manage_handle=True): def __init__(self, model_file=None, booster_handle=None, is_manage_handle=True):
"""Initialize the Predictor. """Initialize the Predictor.
Parameters Parameters
---------- ----------
model_file : string model_file : string
Path to the model file. Path to the model file.
""" """
self.handle = ctypes.c_void_p() self.handle = ctypes.c_void_p()
self.__is_manage_handle = True self.__is_manage_handle = True
...@@ -191,7 +191,7 @@ class Predictor(object): ...@@ -191,7 +191,7 @@ class Predictor(object):
self.handle, self.handle,
ctypes.byref(out_num_class))) ctypes.byref(out_num_class)))
self.num_class = out_num_class.value self.num_class = out_num_class.value
self.__num_total_iteration = out_num_iterations.value self.__num_total_iteration = out_num_iterations.value
elif booster_handle is not None: elif booster_handle is not None:
self.__is_manage_handle = is_manage_handle self.__is_manage_handle = is_manage_handle
self.handle = booster_handle self.handle = booster_handle
...@@ -204,7 +204,7 @@ class Predictor(object): ...@@ -204,7 +204,7 @@ class Predictor(object):
_safe_call(_LIB.LGBM_BoosterGetCurrentIteration( _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
self.handle, self.handle,
ctypes.byref(out_num_iterations))) ctypes.byref(out_num_iterations)))
self.__num_total_iteration = out_num_iterations.value self.__num_total_iteration = out_num_iterations.value
else: else:
raise TypeError('Need Model file to create a booster') raise TypeError('Need Model file to create a booster')
...@@ -213,7 +213,9 @@ class Predictor(object): ...@@ -213,7 +213,9 @@ class Predictor(object):
_safe_call(_LIB.LGBM_BoosterFree(self.handle)) _safe_call(_LIB.LGBM_BoosterFree(self.handle))
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True): def predict(self, data, num_iteration=-1,
raw_score=False, pred_leaf=False, data_has_header=False,
is_reshape=True):
""" """
Predict logic Predict logic
...@@ -222,23 +224,24 @@ class Predictor(object): ...@@ -222,23 +224,24 @@ class Predictor(object):
data : string/numpy array/scipy.sparse data : string/numpy array/scipy.sparse
Data source for prediction Data source for prediction
When data is string type, it represents the path of txt file, When data is string type, it represents the path of txt file,
num_iteration : num_iteration : int
used iteration for prediction used iteration for prediction
raw_score : bool raw_score : bool
True for predict raw score True for predict raw score
pred_leaf : bool pred_leaf : bool
True for predict leaf index True for predict leaf index
data_has_header : bool data_has_header : bool
Used for txt data Used for txt data
is_reshape : bool is_reshape : bool
True for reshape to [nrow, ...] True for reshape to [nrow, ...]
Returns Returns
------- -------
Prediction result Prediction result
""" """
if isinstance(data, Dataset): if isinstance(data, Dataset):
raise TypeError("cannot use Dataset instance for prediction, please use raw data instead") raise TypeError("cannot use Dataset instance for prediction, \
please use raw data instead")
predict_type = C_API_PREDICT_NORMAL predict_type = C_API_PREDICT_NORMAL
if raw_score: if raw_score:
predict_type = C_API_PREDICT_RAW_SCORE predict_type = C_API_PREDICT_RAW_SCORE
...@@ -251,12 +254,12 @@ class Predictor(object): ...@@ -251,12 +254,12 @@ class Predictor(object):
tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name
_safe_call(_LIB.LGBM_BoosterPredictForFile( _safe_call(_LIB.LGBM_BoosterPredictForFile(
self.handle, self.handle,
c_str(data), c_str(data),
int_data_has_header, int_data_has_header,
predict_type, predict_type,
num_iteration, num_iteration,
c_str(tmp_pred_fname))) c_str(tmp_pred_fname)))
tmp_file = open(tmp_pred_fname,"r") tmp_file = open(tmp_pred_fname, "r")
lines = tmp_file.readlines() lines = tmp_file.readlines()
tmp_file.close() tmp_file.close()
nrow = len(lines) nrow = len(lines)
...@@ -267,15 +270,19 @@ class Predictor(object): ...@@ -267,15 +270,19 @@ class Predictor(object):
preds = np.array(preds, copy=False) preds = np.array(preds, copy=False)
os.remove(tmp_pred_fname) os.remove(tmp_pred_fname)
elif isinstance(data, scipy.sparse.csr_matrix): elif isinstance(data, scipy.sparse.csr_matrix):
preds, nrow = self.__pred_for_csr(data, num_iteration, predict_type) preds, nrow = self.__pred_for_csr(data, num_iteration,
predict_type)
elif isinstance(data, np.ndarray): elif isinstance(data, np.ndarray):
preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type) preds, nrow = self.__pred_for_np2d(data, num_iteration,
predict_type)
else: else:
try: try:
csr = scipy.sparse.csr_matrix(data) csr = scipy.sparse.csr_matrix(data)
preds, nrow = self.__pred_for_csr(csr, num_iteration, predict_type) preds, nrow = self.__pred_for_csr(csr, num_iteration,
predict_type)
except: except:
raise TypeError('can not predict data for type {}'.format(type(data).__name__)) raise TypeError('can not predict data for type {}'.
format(type(data).__name__))
if pred_leaf: if pred_leaf:
preds = preds.astype(np.int32) preds = preds.astype(np.int32)
if preds.size != nrow and is_reshape: if preds.size != nrow and is_reshape:
...@@ -283,7 +290,8 @@ class Predictor(object): ...@@ -283,7 +290,8 @@ class Predictor(object):
ncol = int(preds.size / nrow) ncol = int(preds.size / nrow)
preds = preds.reshape(nrow, ncol) preds = preds.reshape(nrow, ncol)
else: else:
raise ValueError('len of predict result(%d) cannot be divide nrow(%d)' %(preds.size, nrow) ) raise ValueError('len of predict result(%d) cannot be divide nrow (%d)'
% (preds.size, nrow))
return preds return preds
def __get_num_preds(self, num_iteration, nrow, predict_type): def __get_num_preds(self, num_iteration, nrow, predict_type):
...@@ -308,12 +316,13 @@ class Predictor(object): ...@@ -308,12 +316,13 @@ class Predictor(object):
"""change non-float data to float data, need to copy""" """change non-float data to float data, need to copy"""
data = np.array(mat.reshape(mat.size), dtype=np.float32) data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data = c_float_array(data) ptr_data, type_ptr_data = c_float_array(data)
n_preds = self.__get_num_preds(num_iteration, mat.shape[0], predict_type) n_preds = self.__get_num_preds(num_iteration, mat.shape[0],
predict_type)
preds = np.zeros(n_preds, dtype=np.float32) preds = np.zeros(n_preds, dtype=np.float32)
out_num_preds = ctypes.c_int64(0) out_num_preds = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterPredictForMat( _safe_call(_LIB.LGBM_BoosterPredictForMat(
self.handle, self.handle,
ptr_data, ptr_data,
type_ptr_data, type_ptr_data,
mat.shape[0], mat.shape[0],
mat.shape[1], mat.shape[1],
...@@ -341,12 +350,12 @@ class Predictor(object): ...@@ -341,12 +350,12 @@ class Predictor(object):
_safe_call(_LIB.LGBM_BoosterPredictForCSR( _safe_call(_LIB.LGBM_BoosterPredictForCSR(
self.handle, self.handle,
ptr_indptr, ptr_indptr,
type_ptr_indptr, type_ptr_indptr,
csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ptr_data, ptr_data,
type_ptr_data, type_ptr_data,
len(csr.indptr), len(csr.indptr),
len(csr.data), len(csr.data),
csr.shape[1], csr.shape[1],
predict_type, predict_type,
...@@ -365,10 +374,10 @@ except ImportError: ...@@ -365,10 +374,10 @@ except ImportError:
class DataFrame(object): class DataFrame(object):
pass pass
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int', PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int', 'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'float16': 'float', 'float32': 'float', 'float64': 'float', 'uint32': 'int', 'uint64': 'int', 'float16': 'float',
'bool': 'i'} 'float32': 'float', 'float64': 'float', 'bool': 'i'}
def _data_from_pandas(data): def _data_from_pandas(data):
if isinstance(data, DataFrame): if isinstance(data, DataFrame):
...@@ -399,8 +408,8 @@ class Dataset(object): ...@@ -399,8 +408,8 @@ class Dataset(object):
""" """
def __init__(self, data, label=None, max_bin=255, reference=None, def __init__(self, data, label=None, max_bin=255, reference=None,
weight=None, group=None, predictor=None, weight=None, group=None, predictor=None,
silent=False, params=None): silent=False, params=None):
""" """
Dataset used in LightGBM. Dataset used in LightGBM.
...@@ -412,7 +421,7 @@ class Dataset(object): ...@@ -412,7 +421,7 @@ class Dataset(object):
label : list or numpy 1-D array, optional label : list or numpy 1-D array, optional
Label of the data Label of the data
max_bin : int, required max_bin : int, required
max number of discrete bin for features max number of discrete bin for features
reference : Other Dataset, optional reference : Other Dataset, optional
If this dataset validation, need to use training data as reference If this dataset validation, need to use training data as reference
weight : list or numpy 1-D array , optional weight : list or numpy 1-D array , optional
...@@ -482,10 +491,10 @@ class Dataset(object): ...@@ -482,10 +491,10 @@ class Dataset(object):
self.set_group(group) self.set_group(group)
# load init score # load init score
if self.predictor is not None and isinstance(self.predictor, Predictor): if self.predictor is not None and isinstance(self.predictor, Predictor):
init_score = self.predictor.predict(data, init_score = self.predictor.predict(data,
raw_score=True, raw_score=True,
data_has_header=self.data_has_header, data_has_header=self.data_has_header,
is_reshape=False) is_reshape=False)
if self.predictor.num_class > 1: if self.predictor.num_class > 1:
# need re group init score # need re group init score
new_init_score = np.zeros(init_score.size(), dtype=np.float32) new_init_score = np.zeros(init_score.size(), dtype=np.float32)
...@@ -496,8 +505,8 @@ class Dataset(object): ...@@ -496,8 +505,8 @@ class Dataset(object):
init_score = new_init_score init_score = new_init_score
self.set_init_score(init_score) self.set_init_score(init_score)
def create_valid(self, data, label=None, weight=None, group=None, def create_valid(self, data, label=None, weight=None, group=None,
silent=False, params=None): silent=False, params=None):
""" """
Create validation data align with current dataset Create validation data align with current dataset
...@@ -518,8 +527,8 @@ class Dataset(object): ...@@ -518,8 +527,8 @@ class Dataset(object):
other parameters other parameters
""" """
return Dataset(data, label=label, max_bin=self.max_bin, reference=self, return Dataset(data, label=label, max_bin=self.max_bin, reference=self,
weight=weight, group=group, predictor=self.predictor, weight=weight, group=group, predictor=self.predictor,
silent=silent, params=params) silent=silent, params=params)
def subset(self, used_indices, params=None): def subset(self, used_indices, params=None):
""" """
...@@ -530,10 +539,10 @@ class Dataset(object): ...@@ -530,10 +539,10 @@ class Dataset(object):
ret.handle = ctypes.c_void_p() ret.handle = ctypes.c_void_p()
params_str = param_dict_to_str(params) params_str = param_dict_to_str(params)
_safe_call(_LIB.LGBM_DatasetGetSubset( _safe_call(_LIB.LGBM_DatasetGetSubset(
ctypes.byref(self.handle), ctypes.byref(self.handle),
used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
used_indices.shape[0], used_indices.shape[0],
c_str(params_str), c_str(params_str),
ctypes.byref(ret.handle))) ctypes.byref(ret.handle)))
ret.max_bin = self.max_bin ret.max_bin = self.max_bin
ret.predictor = self.predictor ret.predictor = self.predictor
...@@ -557,13 +566,13 @@ class Dataset(object): ...@@ -557,13 +566,13 @@ class Dataset(object):
ptr_data, type_ptr_data = c_float_array(data) ptr_data, type_ptr_data = c_float_array(data)
_safe_call(_LIB.LGBM_DatasetCreateFromMat( _safe_call(_LIB.LGBM_DatasetCreateFromMat(
ptr_data, ptr_data,
type_ptr_data, type_ptr_data,
mat.shape[0], mat.shape[0],
mat.shape[1], mat.shape[1],
C_API_IS_ROW_MAJOR, C_API_IS_ROW_MAJOR,
c_str(params_str), c_str(params_str),
ref_dataset, ref_dataset,
ctypes.byref(self.handle))) ctypes.byref(self.handle)))
def __init_from_csr(self, csr, params_str, ref_dataset): def __init_from_csr(self, csr, params_str, ref_dataset):
...@@ -578,16 +587,16 @@ class Dataset(object): ...@@ -578,16 +587,16 @@ class Dataset(object):
ptr_data, type_ptr_data = c_float_array(csr.data) ptr_data, type_ptr_data = c_float_array(csr.data)
_safe_call(_LIB.LGBM_DatasetCreateFromCSR( _safe_call(_LIB.LGBM_DatasetCreateFromCSR(
ptr_indptr, ptr_indptr,
type_ptr_indptr, type_ptr_indptr,
csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
ptr_data, ptr_data,
type_ptr_data, type_ptr_data,
len(csr.indptr), len(csr.indptr),
len(csr.data), len(csr.data),
csr.shape[1], csr.shape[1],
c_str(params_str), c_str(params_str),
ref_dataset, ref_dataset,
ctypes.byref(self.handle))) ctypes.byref(self.handle)))
def __del__(self): def __del__(self):
...@@ -784,7 +793,7 @@ class Dataset(object): ...@@ -784,7 +793,7 @@ class Dataset(object):
""" """
ret = ctypes.c_int64() ret = ctypes.c_int64()
_safe_call(_LIB.LGBM_DatasetGetNumData(self.handle, _safe_call(_LIB.LGBM_DatasetGetNumData(self.handle,
ctypes.byref(ret))) ctypes.byref(ret)))
return ret.value return ret.value
def num_feature(self): def num_feature(self):
...@@ -796,7 +805,7 @@ class Dataset(object): ...@@ -796,7 +805,7 @@ class Dataset(object):
""" """
ret = ctypes.c_int64() ret = ctypes.c_int64()
_safe_call(_LIB.LGBM_DatasetGetNumFeature(self.handle, _safe_call(_LIB.LGBM_DatasetGetNumFeature(self.handle,
ctypes.byref(ret))) ctypes.byref(ret)))
return ret.value return ret.value
class Booster(object): class Booster(object):
...@@ -812,7 +821,7 @@ class Booster(object): ...@@ -812,7 +821,7 @@ class Booster(object):
train_set : Dataset train_set : Dataset
training dataset training dataset
model_file : string model_file : string
Path to the model file. Path to the model file.
silent : boolean, optional silent : boolean, optional
Whether print messages during construction Whether print messages during construction
""" """
...@@ -833,7 +842,7 @@ class Booster(object): ...@@ -833,7 +842,7 @@ class Booster(object):
params_str = param_dict_to_str(params) params_str = param_dict_to_str(params)
"""construct booster object""" """construct booster object"""
_safe_call(_LIB.LGBM_BoosterCreate( _safe_call(_LIB.LGBM_BoosterCreate(
train_set.handle, train_set.handle,
c_str(params_str), c_str(params_str),
ctypes.byref(self.handle))) ctypes.byref(self.handle)))
"""save reference to data""" """save reference to data"""
...@@ -859,7 +868,7 @@ class Booster(object): ...@@ -859,7 +868,7 @@ class Booster(object):
"""Prediction task""" """Prediction task"""
out_num_iterations = ctypes.c_int64(0) out_num_iterations = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterCreateFromModelfile( _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
c_str(model_file), c_str(model_file),
ctypes.byref(out_num_iterations), ctypes.byref(out_num_iterations),
ctypes.byref(self.handle))) ctypes.byref(self.handle)))
out_num_class = ctypes.c_int64(0) out_num_class = ctypes.c_int64(0)
...@@ -939,13 +948,13 @@ class Booster(object): ...@@ -939,13 +948,13 @@ class Booster(object):
raise Exception("Replace training data failed, you should use same predictor for these data") raise Exception("Replace training data failed, you should use same predictor for these data")
self.train_set = train_set self.train_set = train_set
_safe_call(_LIB.LGBM_BoosterResetTrainingData( _safe_call(_LIB.LGBM_BoosterResetTrainingData(
self.handle, self.handle,
self.train_set.handle)) self.train_set.handle))
self.__inner_predict_buffer[0] = None self.__inner_predict_buffer[0] = None
is_finished = ctypes.c_int(0) is_finished = ctypes.c_int(0)
if fobj is None: if fobj is None:
_safe_call(_LIB.LGBM_BoosterUpdateOneIter( _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
self.handle, self.handle,
ctypes.byref(is_finished))) ctypes.byref(is_finished)))
self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)] self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
return is_finished.value == 1 return is_finished.value == 1
...@@ -1080,7 +1089,7 @@ class Booster(object): ...@@ -1080,7 +1089,7 @@ class Booster(object):
Parameters Parameters
---------- ----------
filename : str filename : str
filename to save filename to save
num_iteration: int num_iteration: int
number of iteration that want to save. < 0 means save all number of iteration that want to save. < 0 means save all
""" """
...@@ -1098,16 +1107,16 @@ class Booster(object): ...@@ -1098,16 +1107,16 @@ class Booster(object):
data : string/numpy array/scipy.sparse data : string/numpy array/scipy.sparse
Data source for prediction Data source for prediction
When data is string type, it represents the path of txt file, When data is string type, it represents the path of txt file,
num_iteration : num_iteration : int
used iteration for prediction used iteration for prediction
raw_score : bool raw_score : bool
True for predict raw score True for predict raw score
pred_leaf : bool pred_leaf : bool
True for predict leaf index True for predict leaf index
data_has_header : bool data_has_header : bool
Used for txt data Used for txt data
is_reshape : bool is_reshape : bool
True for reshape to [nrow, ...] True for reshape to [nrow, ...]
Returns Returns
------- -------
...@@ -1136,8 +1145,8 @@ class Booster(object): ...@@ -1136,8 +1145,8 @@ class Booster(object):
result = np.array([0.0 for _ in range(self.__num_inner_eval)], dtype=np.float32) result = np.array([0.0 for _ in range(self.__num_inner_eval)], dtype=np.float32)
tmp_out_len = ctypes.c_int64(0) tmp_out_len = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterGetEval( _safe_call(_LIB.LGBM_BoosterGetEval(
self.handle, self.handle,
data_idx, data_idx,
ctypes.byref(tmp_out_len), ctypes.byref(tmp_out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_float)))) result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
if tmp_out_len.value != self.__num_inner_eval: if tmp_out_len.value != self.__num_inner_eval:
...@@ -1176,12 +1185,12 @@ class Booster(object): ...@@ -1176,12 +1185,12 @@ class Booster(object):
tmp_out_len = ctypes.c_int64(0) tmp_out_len = ctypes.c_int64(0)
data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_float)) data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_float))
_safe_call(_LIB.LGBM_BoosterGetPredict( _safe_call(_LIB.LGBM_BoosterGetPredict(
self.handle, self.handle,
data_idx, data_idx,
ctypes.byref(tmp_out_len), ctypes.byref(tmp_out_len),
data_ptr)) data_ptr))
if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]): if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError("incorrect number of predict results for data %d" %(data_idx) ) raise ValueError("incorrect number of predict results for data %d" % (data_idx) )
self.__is_predicted_cur_iter[data_idx] = True self.__is_predicted_cur_iter[data_idx] = True
return self.__inner_predict_buffer[data_idx] return self.__inner_predict_buffer[data_idx]
......
...@@ -148,7 +148,6 @@ def early_stop(stopping_rounds, verbose=True): ...@@ -148,7 +148,6 @@ def early_stop(stopping_rounds, verbose=True):
callback : function callback : function
The requested callback function. The requested callback function.
""" """
state = {}
factor_to_bigger_better = {} factor_to_bigger_better = {}
best_score = {} best_score = {}
best_iter = {} best_iter = {}
...@@ -172,23 +171,21 @@ def early_stop(stopping_rounds, verbose=True): ...@@ -172,23 +171,21 @@ def early_stop(stopping_rounds, verbose=True):
factor_to_bigger_better[i] = -1.0 factor_to_bigger_better[i] = -1.0
if env.evaluation_result_list[i][3]: if env.evaluation_result_list[i][3]:
factor_to_bigger_better[i] = 1.0 factor_to_bigger_better[i] = 1.0
state['best_iter'] = 0
def callback(env): def callback(env):
"""internal function""" """internal function"""
if len(best_score) == 0: if len(best_score) == 0:
init(env) init(env)
for i in range(len(env.evaluation_result_list)): for i in range(len(env.evaluation_result_list)):
score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i] score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i]
if score > best_score[i]: if score > best_score[i]:
best_score[i] = score best_score[i] = score
best_iter[i] = env.iteration best_iter[i] = env.iteration
if verbose: if verbose:
best_msg[i] = '[%d]\t%s' % ( env.iteration, best_msg[i] = '[%d]\t%s' % ( env.iteration,
'\t'.join([_format_eval_result(x) for x in env.evaluation_result_list])) '\t'.join([_format_eval_result(x) for x in env.evaluation_result_list]))
else: else:
if env.iteration - best_iter[i] >= stopping_rounds: if env.iteration - best_iter[i] >= stopping_rounds:
state['best_iter'] = best_iter[i]
if env.model is not None: if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i])) env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose: if verbose:
......
"""Training Library containing training routines of LightGBM.""" """Training Library containing training routines of LightGBM."""
from __future__ import absolute_import from __future__ import absolute_import
import collections
import numpy as np import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from . import callback from . import callback
def _construct_dataset(X_y, reference=None, def _construct_dataset(X_y, reference=None,
params=None, other_fields=None, predictor=None): params=None, other_fields=None,
predictor=None):
if 'max_bin' in params: if 'max_bin' in params:
max_bin = int(params['max_bin']) max_bin = int(params['max_bin'])
else: else:
...@@ -31,20 +31,22 @@ def _construct_dataset(X_y, reference=None, ...@@ -31,20 +31,22 @@ def _construct_dataset(X_y, reference=None,
label = X_y[1] label = X_y[1]
if reference is None: if reference is None:
ret = Dataset(data, label=label, max_bin=max_bin, ret = Dataset(data, label=label, max_bin=max_bin,
weight=weight, group=group, predictor=predictor, params=params) weight=weight, group=group,
predictor=predictor, params=params)
else: else:
ret = reference.create_valid(data, label=label, weight=weight, group=group, params=params) ret = reference.create_valid(data, label=label, weight=weight,
group=group, params=params)
if init_score is not None: if init_score is not None:
ret.set_init_score(init_score) ret.set_init_score(init_score)
return ret return ret
def train(params, train_data, num_boost_round=100, def train(params, train_data, num_boost_round=100,
valid_datas=None, valid_names=None, valid_datas=None, valid_names=None,
fobj=None, feval=None, init_model=None, fobj=None, feval=None, init_model=None,
train_fields=None, valid_fields=None, train_fields=None, valid_fields=None,
early_stopping_rounds=None, evals_result=None, early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None): verbose_eval=True, learning_rates=None, callbacks=None):
"""Train with given parameters. """Train with given parameters.
Parameters Parameters
...@@ -134,9 +136,9 @@ def train(params, train_data, num_boost_round=100, ...@@ -134,9 +136,9 @@ def train(params, train_data, num_boost_round=100,
continue continue
valid_set = _construct_dataset( valid_set = _construct_dataset(
valid_datas[i], valid_datas[i],
train_set, train_set,
params, params,
other_fields, other_fields,
predictor) predictor)
valid_sets.append(valid_set) valid_sets.append(valid_set)
if valid_names is not None: if valid_names is not None:
...@@ -182,11 +184,11 @@ def train(params, train_data, num_boost_round=100, ...@@ -182,11 +184,11 @@ def train(params, train_data, num_boost_round=100,
for i in range(num_boost_round): for i in range(num_boost_round):
for cb in callbacks_before_iter: for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=booster, cb(callback.CallbackEnv(model=booster,
cvfolds=None, cvfolds=None,
iteration=i, iteration=i,
begin_iteration=0, begin_iteration=0,
end_iteration=num_boost_round, end_iteration=num_boost_round,
evaluation_result_list=None)) evaluation_result_list=None))
booster.update(fobj=fobj) booster.update(fobj=fobj)
...@@ -199,11 +201,11 @@ def train(params, train_data, num_boost_round=100, ...@@ -199,11 +201,11 @@ def train(params, train_data, num_boost_round=100,
try: try:
for cb in callbacks_after_iter: for cb in callbacks_after_iter:
cb(callback.CallbackEnv(model=booster, cb(callback.CallbackEnv(model=booster,
cvfolds=None, cvfolds=None,
iteration=i, iteration=i,
begin_iteration=0, begin_iteration=0,
end_iteration=num_boost_round, end_iteration=num_boost_round,
evaluation_result_list=evaluation_result_list)) evaluation_result_list=evaluation_result_list))
except callback.EarlyStopException: except callback.EarlyStopException:
break break
if booster.attr('best_iteration') is not None: if booster.attr('best_iteration') is not None:
...@@ -384,11 +386,11 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -384,11 +386,11 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
for i in range(num_boost_round): for i in range(num_boost_round):
for cb in callbacks_before_iter: for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=None, cb(callback.CallbackEnv(model=None,
cvfolds=cvfolds, cvfolds=cvfolds,
iteration=i, iteration=i,
begin_iteration=0, begin_iteration=0,
end_iteration=num_boost_round, end_iteration=num_boost_round,
evaluation_result_list=None)) evaluation_result_list=None))
for fold in cvfolds: for fold in cvfolds:
fold.update(fobj) fold.update(fobj)
res = _agg_cv_result([f.eval(feval) for f in cvfolds]) res = _agg_cv_result([f.eval(feval) for f in cvfolds])
...@@ -402,13 +404,13 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -402,13 +404,13 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
try: try:
for cb in callbacks_after_iter: for cb in callbacks_after_iter:
cb(callback.CallbackEnv(model=None, cb(callback.CallbackEnv(model=None,
cvfolds=cvfolds, cvfolds=cvfolds,
iteration=i, iteration=i,
begin_iteration=0, begin_iteration=0,
end_iteration=num_boost_round, end_iteration=num_boost_round,
evaluation_result_list=res)) evaluation_result_list=res))
except callback.EarlyStopException as e: except callback.EarlyStopException as e:
for k in results.keys(): for k in results.keys():
results[k] = results[k][:(e.state['best_iter'] + 1)] results[k] = results[k][:(e.best_iteration + 1)]
break break
return results return results
...@@ -12,9 +12,9 @@ def find_lib_path(): ...@@ -12,9 +12,9 @@ def find_lib_path():
""" """
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
dll_path = [curr_path, os.path.join(curr_path, '../../lib/'), dll_path = [curr_path, os.path.join(curr_path, '../../lib/'),
os.path.join(curr_path, '../../'), os.path.join(curr_path, '../../'),
os.path.join(curr_path, './lib/'), os.path.join(curr_path, './lib/'),
os.path.join(sys.prefix, 'lightgbm')] os.path.join(sys.prefix, 'lightgbm')]
if os.name == 'nt': if os.name == 'nt':
dll_path.append(os.path.join(curr_path, '../../windows/x64/Dll/')) dll_path.append(os.path.join(curr_path, '../../windows/x64/Dll/'))
dll_path.append(os.path.join(curr_path, './windows/x64/Dll/')) dll_path.append(os.path.join(curr_path, './windows/x64/Dll/'))
......
...@@ -194,7 +194,8 @@ class LGBMModel(LGBMModelBase): ...@@ -194,7 +194,8 @@ class LGBMModel(LGBMModelBase):
return params return params
def fit(self, X, y, eval_set=None, eval_metric=None, def fit(self, X, y, eval_set=None, eval_metric=None,
early_stopping_rounds=None, verbose=True, train_fields=None, valid_fields=None, other_params=None): early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None, other_params=None):
""" """
Fit the gradient boosting model Fit the gradient boosting model
...@@ -308,7 +309,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -308,7 +309,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:]) """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y, eval_set=None, eval_metric=None, def fit(self, X, y, eval_set=None, eval_metric=None,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None, other_params=None): train_fields=None, valid_fields=None, other_params=None):
self.classes_ = np.unique(y) self.classes_ = np.unique(y)
...@@ -328,8 +329,10 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -328,8 +329,10 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
if eval_set is not None: if eval_set is not None:
eval_set = list( (x[0], self._le.transform(x[1])) for x in eval_set ) eval_set = list( (x[0], self._le.transform(x[1])) for x in eval_set )
super(LGBMClassifier, self).fit(X, training_labels, eval_set, eval_metric, super(LGBMClassifier, self).fit(X, training_labels, eval_set,
early_stopping_rounds, verbose, train_fields, valid_fields, other_params) eval_metric, early_stopping_rounds,
verbose, train_fields, valid_fields,
other_params)
return self return self
def predict(self, data, raw_score=False, num_iteration=0): def predict(self, data, raw_score=False, num_iteration=0):
...@@ -405,7 +408,7 @@ class LGBMRanker(LGBMModel): ...@@ -405,7 +408,7 @@ class LGBMRanker(LGBMModel):
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:]) """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y, eval_set=None, eval_metric=None, def fit(self, X, y, eval_set=None, eval_metric=None,
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None, other_params=None): train_fields=None, valid_fields=None, other_params=None):
"""check group data""" """check group data"""
...@@ -428,6 +431,8 @@ class LGBMRanker(LGBMModel): ...@@ -428,6 +431,8 @@ class LGBMRanker(LGBMModel):
self.objective = "lambdarank" self.objective = "lambdarank"
self.fobj = None self.fobj = None
super(LGBMRanker, self).fit(X, y, eval_set, eval_metric, super(LGBMRanker, self).fit(X, y, eval_set, eval_metric,
early_stopping_rounds, verbose, train_fields, valid_fields, other_params) early_stopping_rounds, verbose,
train_fields, valid_fields,
other_params)
return self return self
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment