basic.py

"""Wrapper c_api of LightGBM"""
from __future__ import absolute_import

import sys
import os
import ctypes
import collections
import re
import tempfile

import numpy as np
import scipy.sparse


IS_PY3 = (sys.version_info[0] == 3)


def find_lib_path():
    """Find the path to LightGBM library files.
    Returns
    -------
    lib_path: list(string)
       List of all found library path to LightGBM
    """
    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    dll_path = [curr_path, os.path.join(curr_path, '../../lib/'),
                os.path.join(curr_path, './lib/'),
                os.path.join(sys.prefix, 'lightgbm')]
    if os.name == 'nt':
        dll_path.append(os.path.join(curr_path, '../../windows/x64/Dll/'))
        dll_path.append(os.path.join(curr_path, './windows/x64/Dll/'))
        dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path]
    else:
        dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path]
    lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
    if not lib_path:
        raise Exception('Cannot find lightgbm Library')
    return lib_path

def _load_lib():
    """Load LightGBM Library."""
    lib_path = find_lib_path()
    if len(lib_path) == 0:
        return None
    lib = ctypes.cdll.LoadLibrary(lib_path[0])
    lib.LGBM_GetLastError.restype = ctypes.c_char_p
    return lib

_LIB = _load_lib()

class LightGBMError(Exception):
    """Error throwed by LightGBM"""
    pass

def _safe_call(ret):
    """Check the return value of C API call
    Parameters
    ----------
    ret : int
        return value from API calls
    """
    if ret != 0:
        raise LightGBMError(_LIB.LGBM_GetLastError())

def is_str(s):
    if IS_PY3:
        return isinstance(s, str)
    else:
        return isinstance(s, basestring)

def is_numpy_object(data):
    return type(data).__module__ == np.__name__

def is_numpy_1d_array(data):
    if isinstance(data, np.ndarray) and len(data.shape) == 1:
        return True
    else:
        return False

def list_to_1d_numpy(data, dtype):
    if is_numpy_1d_array(data):
        return data
    elif isinstance(data, list):
        return np.array(data, dtype=dtype, copy=False)
    else:
        raise TypeError("Unknow type({})".format(type(data).__name__))

def cfloat32_array_to_numpy(cptr, length):
    """Convert a ctypes float pointer array to a numpy array.
    """
    if isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
        res = np.fromiter(cptr, dtype=np.float32, count=length)
        return res
    else:
        raise RuntimeError('expected float pointer')

def cint32_array_to_numpy(cptr, length):
    """Convert a ctypes float pointer array to a numpy array.
    """
    if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
        res = np.fromiter(cptr, dtype=np.int32, count=length)
        return res
    else:
        raise RuntimeError('expected int pointer')

def c_str(string):
    """Convert a python string to cstring."""
    return ctypes.c_char_p(string.encode('utf-8'))

def c_array(ctype, values):
    """Convert a python array to c array."""
    return (ctype * len(values))(*values)

def dict_to_str(data):
    if data is None or len(data) == 0:
        return ""
    pairs = []
    for key in data:
        pairs.append(str(key)+'='+str(data[key]))
    return ' '.join(pairs)
"""marco definition of data type in c_api of LightGBM"""
C_API_DTYPE_FLOAT32 =0
C_API_DTYPE_FLOAT64 =1
C_API_DTYPE_INT32   =2
C_API_DTYPE_INT64   =3
"""Matric is row major in python"""
C_API_IS_ROW_MAJOR  =1

def c_float_array(data):
    """Convert numpy array / list to c float array."""
    if isinstance(data, list):
        data = np.array(data, copy=False)
    if is_numpy_1d_array(data):
        if data.dtype == np.float32:
            ptr_data = data.ctypes.data_as(ctypes.c_float)
            type_data = C_API_DTYPE_FLOAT32
        elif data.dtype == np.float64:
            ptr_data = data.ctypes.data_as(ctypes.c_double)
            type_data = C_API_DTYPE_FLOAT64
        else:
            raise TypeError("expected np.float32 or np.float64, met type({})".format(data.dtype))
    else:
        raise TypeError("Unknow type({})".format(type(data).__name__))
    return (ptr_data, type_data)

def c_int_array(data):
    """Convert numpy array to c int array."""
    if isinstance(data, list):
        data = np.array(data, copy=False)
    if is_numpy_1d_array(data):
        if data.dtype == np.int32:
            ptr_data = data.ctypes.data_as(ctypes.c_int32)
            type_data = C_API_DTYPE_INT32
        elif data.dtype == np.int64:
            ptr_data = data.ctypes.data_as(ctypes.c_int64)
            type_data = C_API_DTYPE_INT64
        else:
            raise TypeError("expected np.int32 or np.int64, met type({})".format(data.dtype))
    else:
        raise TypeError("Unknow type({})".format(type(data).__name__))
    return (ptr_data, type_data)

class Dataset(object):
    """Dataset used in LightGBM.

    Dataset is a internal data structure that used by LightGBM
    You can construct Dataset from numpy.arrays
    """

    _feature_names = None

    def __init__(self, data, max_bin=255, reference=None,
        label=None, weight=None, group_id=None, 
        silent=False, feature_names=None, 
        other_params=None, is_continue_train=False):
        """
        Dataset used in LightGBM.

        Parameters
        ----------
        data : string/numpy array/scipy.sparse
            Data source of Dataset.
            When data is string type, it represents the path of txt file,
        max_bin : int, required
            max number of discrete bin for features 
        reference : Other Dataset, optional
            If this dataset validation, need to use training data as reference
        label : list or numpy 1-D array, optional
            Label of the training data.
        weight : list or numpy 1-D array , optional
            Weight for each instance.
        group_id : list or numpy 1-D array , optional
            group/query id for each instance. Note: if having group/query id, data should group by this id
        silent : boolean, optional
            Whether print messages during construction
        feature_names : list, optional
            Set names for features.
        other_params: dict, optional
            other parameters
        """

        if data is None:
            self.handle = None
            return
        """save raw data for continue train """
        if is_continue_train:
            self.raw_data = data
        else:
            self.raw_data = None
        self.data_has_header = False
        """process for args"""
        params = {}
        params["max_bin"] = max_bin
        if silent:
            params["verbose"] = 0
        if other_params:
            other_params.update(params)
            params = other_params
        params_str = dict_to_str(params)
        """process for reference dataset"""
        ref_dataset = None
        if isinstance(reference, Dataset):
            ref_dataset = ctypes.byref(reference.handle)
        elif reference is not None:
            raise TypeError('Reference dataset should be None or dataset instance')
        """start construct data"""
        if is_str(data):
            """check data has header or not"""
            if "has_header" in params or "header" in params:
                if params["has_header"].lower() == "true" or params["header"].lower() == "true":
                    data_has_header = True
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_CreateDatasetFromFile(
                c_str(data), 
                c_str(params_str), 
                ref_dataset,
                ctypes.byref(self.handle)))
        elif isinstance(data, scipy.sparse.csr_matrix):
            self.__init_from_csr(data, params_str, ref_dataset)
        elif isinstance(data, np.ndarray):
            self.__init_from_np2d(data, params_str, ref_dataset)
        else:
            try:
                csr = scipy.sparse.csr_matrix(data)
                if self.raw_data is not None:
                    self.raw_data = csr
                self.__init_from_csr(csr)
            except:
                raise TypeError('can not initialize Dataset from {}'.format(type(data).__name__))
        self.__label = None
        self.__weight = None
        self.__init_score = None
        self.__group = None
        if label is not None:
            self.set_label(label)
        if weight is not None:
            self.set_weight(weight)
        if group_id is not None:
            self.set_group_id(group_id)
        self.feature_names = feature_names

    def free_raw_data(self):
        self.raw_data = None

    def __init_from_np2d(self, mat, params_str, ref_dataset):
        """
        Initialize data from a 2-D numpy matrix.
        """
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray must be 2 dimensional')

        self.handle = ctypes.c_void_p()
        if mat.dtype == np.float32 or mat.dtype == np.float64:
            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
        else:
            """change non-float data to float data, need to copy"""
            data = np.array(mat.reshape(mat.size), dtype=np.float32)

        ptr_data, type_ptr_data = c_float_array(data)
        _safe_call(LIB.LGBM_CreateDatasetFromMat(
            ptr_data, 
            type_ptr_data,
            mat.shape[0],
            mat.shape[1],
            C_API_IS_ROW_MAJOR,
            c_str(params_str), 
            ref_dataset, 
            ctypes.byref(self.handle)))

    def __init_from_csr(self, csr, params_str, ref_dataset):
        """
        Initialize data from a CSR matrix.
        """
        if len(csr.indices) != len(csr.data):
            raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
        self.handle = ctypes.c_void_p()

        ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
        ptr_data, type_ptr_data = c_float_array(csr.data)

        _safe_call(_LIB.LGBM_CreateDatasetFromCSR(
            ptr_indptr, 
            type_ptr_indptr,
            csr.indices.ctypes.data_as(ctypes.c_int32),
            ptr_data,
            type_ptr_data, 
            len(csr.indptr), 
            len(csr.data),
            csr.shape[1], 
            c_str(params_str), 
            ref_dataset, 
            ctypes.byref(self.handle)))

    def __del__(self):
        _safe_call(_LIB.LGBM_DatasetFree(self.handle))

    def get_field(self, field_name):
        """Get property from the Dataset.

        Parameters
        ----------
        field_name: str
            The field name of the information

        Returns
        -------
        info : array
            a numpy array of information of the data
        """
        out_len = ctypes.c_int32()
        out_type = ctypes.c_int32()
        ret = ctypes.POINTER(ctypes.c_void_p)()
        _safe_call(_LIB.LGBM_DatasetGetField(
            self.handle,
            c_str(field_name),
            ctypes.byref(out_len),
            ctypes.byref(ret),
            ctypes.byref(out_type)))
        if out_type.value == C_API_DTYPE_INT32:
            return cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(c_int32), out_len.value))
        elif out_type.value == C_API_DTYPE_FLOAT32:
            return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(c_float), out_len.value))
        else:
            raise TypeError("unknow type")

    def set_field(self, field_name, data):
        """Set property into the Dataset.

        Parameters
        ----------
        field_name: str
            The field name of the information

        data: numpy array or list
            The array ofdata to be set
        """
        if not is_numpy_1d_array(data):
            raise TypeError("Unknow type({})".format(type(data).__name__))
        if data.dtype == np.float32:
            ptr_data = data.ctypes.data_as(ctypes.c_float)
            type_data = C_API_DTYPE_FLOAT32
        elif data.dtype == np.int32:
            ptr_data = data.ctypes.data_as(ctypes.c_int32)
            type_data = C_API_DTYPE_INT32
        else:
            raise TypeError("excepted np.float32 or np.int32, met type({})".format(data.dtype))
        _safe_call(_LIB.LGBM_DatasetSetField(
            self.handle,
            c_str(field_name),
            ptr_data,
            len(data),
            type_data))


    def save_binary(self, filename):
        """Save Dataset to binary file

        Parameters
        ----------
        filename : string
            Name of the output file.
        """
        _safe_call(_LIB.LGBM_DatasetSaveBinary(
            self.handle,
            c_str(filename)))

    def set_label(self, label):
        """Set label of Dataset

        Parameters
        ----------
        label: array like
            The label information to be set into Dataset
        """
        label = list_to_1d_numpy(label, np.float32)
        if label.dtype != np.float32:
            label = label.astype(np.float32, copy=False)
        self.__label = label
        self.set_field('label', label)

    def set_weight(self, weight):
        """ Set weight of each instance.

        Parameters
        ----------
        weight : array like
            Weight for each data point
        """
        weight = list_to_1d_numpy(weight, np.float32)
        if weight.dtype != np.float32:
            weight = weight.astype(np.float32, copy=False)
        self.__weight = weight
        self.set_field('weight', weight)

    def set_init_score(self, score):
        """ Set init score of booster to start from.
        Parameters
        ----------
        score: array like

        """
        score = list_to_1d_numpy(score, np.float32)
        if score.dtype != np.float32:
            score = score.astype(np.float32, copy=False)
        self.__init_score = init_score
        self.set_field('init_score', score)

    def set_group(self, group):
        """Set group size of Dataset (used for ranking).

        Parameters
        ----------
        group : array like
            Group size of each group
        """
        group = list_to_1d_numpy(group, np.int32)
        if group.dtype != np.int32:
            group = group.astype(np.int32, copy=False)
        self.__group = group
        self.set_field('group', group)

    def set_group_id(self, group_id):

        """Set group_id of Dataset (used for ranking).

        Parameters
        ----------
        group : array like
            group_id of Dataset (used for ranking).
        """
        group_id = list_to_1d_numpy(group_id, np.int32)
        if group_id.dtype != np.int32:
            group_id = group_id.astype(np.int32, copy=False)
        self.set_field('group_id', group_id)

    def get_label(self):
        """Get the label of the Dataset.

        Returns
        -------
        label : array
        """
        if self.__label is None:
            self.__label = self.get_field('label')
        return self.__label

    def get_weight(self):
        """Get the weight of the Dataset.

        Returns
        -------
        weight : array
        """
        if self.__weight is None:
            self.__weight = self.get_field('weight')
        return self.__weight

    def get_init_score(self):
        """Get the initial score of the Dataset.

        Returns
        -------
        init_score : array
        """
        if self.__init_score is None:
            self.__init_score = self.get_field('init_score')
        return self.__init_score

    def get_group(self):
        """Get the initial score of the Dataset.

        Returns
        -------
        init_score : array
        """
        if self.__group is None:
            self.__group = self.get_field('group')
        return self.__group

    def num_data(self):
        """Get the number of rows in the Dataset.

        Returns
        -------
        number of rows : int
        """
        ret = ctypes.c_int64()
        _safe_call(_LIB.LGBM_DatasetGetNumData(self.handle,
                                         ctypes.byref(ret)))
        return ret.value

    def num_feature(self):
        """Get the number of columns (features) in the Dataset.

        Returns
        -------
        number of columns : int
        """
        ret = ctypes.c_int64()
        _safe_call(_LIB.LGBM_DatasetGetNumFeature(self.handle,
                                         ctypes.byref(ret)))
        return ret.value

    @property
    def feature_names(self):
        """Get feature names (column labels).

        Returns
        -------
        feature_names : list
        """
        if self._feature_names is None:
            self._feature_names = ['Column_{0}'.format(i) for i in range(self.num_col())]
        return self._feature_names

    @feature_names.setter
    def feature_names(self, feature_names):
        """Set feature names (column labels).

        Parameters
        ----------
        feature_names : list
            Labels for features
        """
        if feature_names is not None:
            # validate feature name
            if not isinstance(feature_names, list):
                feature_names = list(feature_names)
            if len(feature_names) != len(set(feature_names)):
                raise ValueError('feature_names must be unique')
            if len(feature_names) != self.num_col():
                msg = 'feature_names must have the same length as data'
                raise ValueError(msg)
            # prohibit to use symbols may affect to parse. e.g. []<
            if not all(isinstance(f, STRING_TYPES) and
                       not any(x in f for x in set(('[', ']', '<')))
                       for f in feature_names):
                raise ValueError('feature_names may not contain [, ] or <')
            self._feature_names = feature_names
        else:
            self._feature_names = None

C_API_PREDICT_NORMAL     =0
C_API_PREDICT_RAW_SCORE  =1
C_API_PREDICT_LEAF_INDEX =2

class Booster(object):
    """"A Booster of of LightGBM.
    """

    feature_names = None

    def __init__(self,params=None, 
        train_set=None, valid_sets=None, 
        name_valid_sets=None, model_file=None):
        # pylint: disable=invalid-name
        """Initialize the Booster.

        Parameters
        ----------
        params : dict
            Parameters for boosters.
        train_set : Dataset
            training dataset
        valid_sets : List of Dataset or None
            validation datasets
        name_valid_sets : List of string
            name of validation datasets
        model_file : string
            Path to the model file. 
            If tarin_set is not None, used for continued train. 
            else used for loading model prediction task
        """
        self.handle = ctypes.c_void_p()
        if train_set is not None:
            """Training task"""
            if not isinstance(train_set, Dataset):
                raise TypeError('training data should be Dataset instance, met{}'.format(type(train_set).__name__))

            valid_handles = None
            n_valid = 0
            if valid_sets is not None:
                for valid in valid_sets:
                    if not isinstance(valid, Dataset):
                        raise TypeError('valid data should be Dataset instance, met{}'.format(type(valid).__name__))
                valid_handles = c_array(ctypes.c_void_p, [valid.handle for valid in valid_sets])
                if name_valid_sets is None:
                    name_valid_sets = ["valid_{}".format(x+1) for x in range(len(valid_sets)) ]
                if len(valid_sets) != len(name_valid_sets):
                    raise Exception('len of valid_sets should be equal with len of name_valid_sets')
                n_valid = len(valid_sets)
            ref_input_model = None
            params_str = dict_to_str(params)
            if model_file is not None:
                ref_input_model = c_str(model_file)
            """construct booster object"""
            _safe_call(_LIB.LGBM_BoosterCreate(
                train_set.handle, 
                valid_handles, 
                n_valid, 
                c_str(params_str),
                ref_input_model, 
                ctypes.byref(self.handle)))
            """if need to continue train"""
            if model_file is not None:
                self.__init_continue_train(train_set)
                if valid_sets is not None:
                    for valid in valid_sets:
                        self.__init_continue_train(valid)
            """save reference to data"""
            self.train_set = train_set
            self.valid_sets = valid_sets
            self.name_valid_sets = name_valid_sets
            self.__num_dataset = 1 + n_valid
            self.__training_score = None
            out_len = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_len)))
            self.__num_class = out_len.value
            """buffer for inner predict"""
            self.__inner_predict_buffer = [None for _ in range(self.__num_dataset)]
            """Get num of inner evals"""
            _safe_call(_LIB.LGBM_BoosterGetEvalCounts(
                self.handle,
                ctypes.byref(out_len)))
            self.__num_inner_eval = out_len.value
            if self.__num_inner_eval > 0:
                """Get name of evals"""
                string_buffers = [ctypes.create_string_buffer(255) for i in range(self.__num_inner_eval)]
                ptr_string_buffers = (ctypes.c_char_p*self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
                _safe_call(_LIB.LGBM_BoosterGetEvalNames(
                    self.handle,
                    ctypes.byref(out_len),
                    ptr_string_buffers))
                if self.__num_inner_eval != out_len.value:
                    raise ValueError("size of eval names doesn't equal with num_evals")
                self.__name_inner_eval = []
                for i in range(self.__num_inner_eval):
                    self.__name_inner_eval.append(string_buffers[i].value.decode())

        elif model_file is not None:
            """Prediction task"""
            out_num_total_model = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(model_file), 
                ctypes.byref(out_num_total_model),
                ctypes.byref(self.handle)))
            self.__num_total_model = out_num_total_model.value
            out_len = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_len)))
            self.__num_class = out_len.value
        else:
            raise TypeError('At least need training dataset or model file to create booster instance')

    def __del__(self):
        _safe_call(_LIB.LGBM_BoosterFree(self.handle))

    def update(self, fobj=None):
        """
        Update for one iteration
        Note: for multi-class task, the score is group by class_id first, then group by row_id
              if you want to get i-th row score in j-th class, the access way is score[j*num_data+i]
              and you should group grad and hess in this way as well
        Parameters
        ----------
        fobj : function
            Customized objective function.

        Returns
        -------
        is_finished, bool
        """
        is_finished = ctypes.c_int(0)
        if fobj is None:
            _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
                self.handle, 
                ctypes.byref(is_finished)))
            return is_finished.value == 1
        else:
            grad, hess = fobj(self.__inner_predict(0), self.train_set)
            return self.boost(grad, hess)

    def boost(self, grad, hess):
        """
        Boost the booster for one iteration, with customized gradient statistics.
        Note: for multi-class task, the score is group by class_id first, then group by row_id
              if you want to get i-th row score in j-th class, the access way is score[j*num_data+i]
              and you should group grad and hess in this way as well
        Parameters
        ----------
        grad : 1d numpy with dtype=float32
            The first order of gradient.
        hess : 1d numpy with dtype=float32
            The second order of gradient.

        Returns
        -------
        is_finished, bool
        """
        if not is_numpy_1d_array(grad) and not is_numpy_1d_array(hess):
            raise TypeError('type of grad / hess should be 1d numpy object')
        if not grad.dtype == np.float32 and not hess.dtype == np.float32:
            raise TypeError('type of grad / hess should be np.float32')
        if len(grad) != len(hess):
            raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
        is_finished = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
            self.handle,
            grad.ctypes.data_as(ctypes.c_float),
            hess.ctypes.data_as(ctypes.c_float),
            ctypes.byref(is_finished)))
        return is_finished.value == 1

    def eval_train(self, feval=None):
        """Evaluate for training data

        Parameters
        ----------
        feval : function
            Custom evaluation function.

        Returns
        -------
        result: str
            Evaluation result string.
        """
        return self.__inner_eval("training", 0, feval)

    def eval_valid(self, feval=None):
        """Evaluate for validation data

        Parameters
        ----------
        feval : function
            Custom evaluation function.

        Returns
        -------
        result: str
            Evaluation result string.
        """
        ret = []
        for i in range(1, self.__num_dataset):
            ret.append(self.__inner_eval(self.name_valid_sets[i-1], i, feval))
        return '\n'.join(ret)

    def save_model(self, filename, num_iteration=-1):
        _safe_call(_LIB.LGBM_BoosterSaveModel(
            self.handle,
            num_iteration,
            c_str(filename)))

    def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
        if isinstance(data, Dataset):
            raise TypeError("cannot use Dataset instance for prediction, please use raw data instead")
        predict_type = C_API_PREDICT_NORMAL
        if raw_score:
            predict_type = cC_API_PREDICT_RAW_SCORE
        if pred_leaf:
            predict_type = C_API_PREDICT_LEAF_INDEX
        int_data_has_header = 0
        if data_has_header:
            int_data_has_header = 1
        if is_str(data):
            tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name
            _safe_call(_LIB.LGBM_BoosterPredictForFile(
                self.handle,
                c_str(data), 
                int_data_has_header,
                predict_type,
                num_iteration,
                c_str(tmp_pred_fname)))
            lines = open(tmp_pred_fname,"r").readlines()
            nrow = len(lines)
            preds = []
            for line in lines:
                for token in line.split('\t'):
                    preds.append(float(token))
            preds = np.array(preds, copy=False)
            os.remove(tmp_pred_fname)
        elif isinstance(data, scipy.sparse.csr_matrix):
            preds, nrow = self.__pred_for_csr(data, num_iteration, predict_type)
        elif isinstance(data, np.ndarray):
            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
        else:
            try:
                csr = scipy.sparse.csr_matrix(data)
                res = self.__pred_for_csr(csr, num_iteration, predict_type)
            except:
                raise TypeError('can not predict data for type {}'.format(type(data).__name__))
        if pred_leaf:
            preds = preds.astype(np.int32)
        if preds.size != nrow and is_reshape:
            if preds.size % nrow == 0:
                ncol = int(preds.size / nrow)
                preds = preds.reshape(nrow, ncol)
            else:
                raise ValueError('len of predict result(%d) cannot be divide nrow(%d)' %(preds.size, nrow) )
        return preds

    def __pred_for_np2d(self, mat, num_iteration, predict_type):
        """
        Predict for a 2-D numpy matrix.
        """
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray must be 2 dimensional')

        if mat.dtype == np.float32 or mat.dtype == np.float64:
            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
        else:
            """change non-float data to float data, need to copy"""
            data = np.array(mat.reshape(mat.size), dtype=np.float32)
        ptr_data, type_ptr_data = c_float_array(data)
        n_preds = self.__num_class * mat.shape[0]
        if predict_type == C_API_PREDICT_LEAF_INDEX:
            if num_iteration > 0:
                n_preds *= num_iteration
            else:
                used_iteration = self.__num_total_model / self.__num_class
                n_preds *= used_iteration
        preds = np.zeros(n_preds, dtype=np.float32)
        out_num_preds = ctypes.c_int64(0)
        _safe_call(LIB.LGBM_BoosterPredictForMat(
            self.handle,
            ptr_data, 
            type_ptr_data,
            mat.shape[0],
            mat.shape[1],
            C_API_IS_ROW_MAJOR,
            predict_type,
            num_iteration,
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
            ))
        if n_preds != out_num_preds.value:
            raise ValueError("incorrect number for predict result")
        return preds, mat.shape[0]

    def __pred_for_csr(self, csr, num_iteration, predict_type):
        """
        Predict for a csr data
        """
        nrow = len(csr.indptr) - 1
        n_preds = self.__num_class * nrow
        if predict_type == C_API_PREDICT_LEAF_INDEX:
            if num_iteration > 0:
                n_preds *= num_iteration
            else:
                used_iteration = self.__num_total_model / self.__num_class
                n_preds *= used_iteration
        preds = np.zeros(n_preds, dtype=np.float32)
        out_num_preds = ctypes.c_int64(0)

        ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
        ptr_data, type_ptr_data = c_float_array(csr.data)

        _safe_call(LIB.LGBM_BoosterPredictForCSR(
            self.handle,
            ptr_indptr, 
            type_ptr_indptr,
            csr.indices.ctypes.data_as(ctypes.c_int32),
            ptr_data,
            type_ptr_data, 
            len(csr.indptr), 
            len(csr.data),
            csr.shape[1], 
            predict_type,
            num_iteration,
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
            ))
        if n_preds != out_num_preds.value:
            raise ValueError("incorrect number for predict result")
        return preds, nrow

    def __inner_eval(self, data_name, data_idx, feval=None):
        if data_idx >= self.__num_dataset:
            raise ValueError("data_idx should be smaller than number of dataset")
        ret = []
        if self.__num_inner_eval > 0:
            result = np.array([0.0 for _ in range(self.__num_inner_eval)], dtype=np.float32)
            out_len = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetEval(
                self.handle, 
                data_idx, 
                ctypes.byref(out_len), 
                result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
            if out_len.value != self.__num_inner_eval:
                raise ValueError("incorrect number of eval results")
            for i in range(self.__num_inner_eval):
                ret.append('%s %s : %f' %(data_name, self.__name_inner_eval[i], result[i]))
        if feval is not None:
            if data_idx == 0:
                cur_data = self.train_set
            else:
                cur_data = self.valid_sets[data_idx - 1]
            feval_ret = feval(self.__inner_predict(data_idx), cur_data)
            if isinstance(feval_ret, list):
                for name, val in feval_ret:
                    ret.append('%s %s : %f' % (data_name, name, val))
            else:
                name, val = feval_ret
                ret.append('%s %s : %f' % (data_name, name, val))
        return '\t'.join(ret)

    def __inner_predict(self, data_idx):
        if data_idx >= self.__num_dataset:
            raise ValueError("data_idx should be smaller than number of dataset")
        if self.__inner_predict_buffer[data_idx] is None:
            if data_idx == 0:
                num_data = self.train_set.num_data() * self.__num_class
            else:
                num_data = self.valid_sets[data_idx - 1].num_data() * self.__num_class
            self.__inner_predict_buffer[data_idx] = \
                np.array([0.0 for _ in range(num_data)], dtype=np.float32, copy=False)
        out_len = ctypes.c_int64(0)
        data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        _safe_call(_LIB.LGBM_BoosterGetPredict(
            self.handle, 
            data_idx, 
            ctypes.byref(out_len), 
            data_ptr))
        if out_len.value != len(self.__inner_predict_buffer[data_idx]):
            raise ValueError("incorrect number of predict results for data %d" %(data_idx) )
        return self.__inner_predict_buffer[data_idx]


    def __init_continue_train(self, dataset):
        if dataset.raw_data is None:
            raise ValueError("should set is_continue_train=True in dataset while need to continue train")
        init_score = self.predict(dataset.raw_data, raw_score=True,data_has_header=dataset.data_has_header, is_reshape=False)
        dataset.set_init_score(init_score)
        dataset.free_raw_data()


#tmp test
train_data = Dataset('../../examples/binary_classification/binary.train')
test_data = Dataset('../../examples/binary_classification/binary.test', reference = train_data)
param = {"metric":"l2,l1"}
lgb = Booster(train_set=train_data, valid_sets=[test_data], params=param)
for i in range(100):
    lgb.update()
    print(lgb.eval_valid())
    print(lgb.eval_train())
print(lgb.predict('../../examples/binary_classification/binary.train'))