add feature importance in python (#109)

* add feature importances in python; add pandas support * solve best_iteration issue

add feature importance in python (#109)
* add feature importances in python; add pandas support * solve best_iteration issue
2cd024e9 · wxchan · Guolin Ke · 6f7669df · 2cd024e9 · 2cd024e9
Commit 2cd024e9 authored Dec 06, 2016 by wxchan Committed by Guolin Ke Dec 06, 2016
9 changed files
--- a/examples/python-guide/simple_example.py
+++ b/examples/python-guide/simple_example.py
@@ -17,7 +17,11 @@ X_test = df_test.drop(0, axis=1)
 # create dataset for lightgbm
 lgb_train = lgb.Dataset(X_train, y_train)
 lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
-# or you can simply use a tuple of length=2 here
+# ATTENTION: you should carefully use lightgbm.Dataset
+# it requires setting up categorical_feature when you init it
+# rather than passing from lightgbm.train
+# instead, you can simply use a tuple of length=2 like below
+# it will help you construct Datasets with parameters in lightgbm.train
 lgb_train = (X_train, y_train)
 lgb_eval = (X_test, y_test)
@@ -26,14 +30,12 @@ params = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'regression',
-    'metric' : 'l2',
+    'metric' : {'l2', 'auc'},
    'num_leaves' : 31,
    'learning_rate' : 0.05,
    'feature_fraction' : 0.9,
    'bagging_fraction' : 0.8,
    'bagging_freq': 5,
-    # 'ndcg_eval_at' : [1, 3, 5, 10],
-    # this metric is not needed in this task, show as an example
    'verbose' : 0
 }
@@ -49,9 +51,6 @@ gbm = lgb.train(params,
 # save model to file
 gbm.save_model('model.txt')
-# load model from file
-gbm = lgb.Booster(model_file='model.txt')
 # predict
 y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
 # eval
@@ -62,3 +61,7 @@ model_json = gbm.dump_model()
 with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)
+# feature importances
+print('Feature importances:', gbm.feature_importance())
+print('Feature importances:', gbm.feature_importance("gain"))
--- a/examples/python-guide/sklearn_example.py
+++ b/examples/python-guide/sklearn_example.py
@@ -26,3 +26,6 @@ gbm.fit(X_train, y_train,
 y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
 # eval
 print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
+# feature importances
+print('Feature importances:', gbm.feature_importance())
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
 # coding: utf-8
-# pylint: disable = invalid-name, C0111, R0912, R0913, R0914, W0105
+# pylint: disable = invalid-name, C0111, C0301, R0912, R0913, R0914, W0105
+# pylint: disable = E1101
 """Wrapper c_api of LightGBM"""
 from __future__ import absolute_import
 import sys
-import os
 import ctypes
 import tempfile
 import json
@@ -19,11 +19,11 @@ try:
    from pandas import Series, DataFrame
    IS_PANDAS_INSTALLED = True
 except ImportError:
-    IS_PANDAS_INSTALLED = False
    class Series(object):
        pass
    class DataFrame(object):
        pass
+    IS_PANDAS_INSTALLED = False
 IS_PY3 = (sys.version_info[0] == 3)
@@ -62,18 +62,11 @@ def is_numpy_object(data):
    return type(data).__module__ == np.__name__
 def is_numpy_1d_array(data):
-    if isinstance(data, np.ndarray) and len(data.shape) == 1:
+    return isinstance(data, np.ndarray) and len(data.shape) == 1
-        return True
-    else:
-        return False
 def is_1d_list(data):
-    if not isinstance(data, list):
+    return isinstance(data, list) and \
-        return False
+        (not data or isinstance(data[0], (int, float, bool)))
-    if len(data) > 0:
-        if not isinstance(data[0], (int, float, bool)):
-            return False
-    return True
 def list_to_1d_numpy(data, dtype):
    if is_numpy_1d_array(data):
@@ -115,20 +108,19 @@ def c_array(ctype, values):
    return (ctype * len(values))(*values)
 def param_dict_to_str(data):
-    if data is None or len(data) == 0:
+    if not data:
        return ""
    pairs = []
    for key, val in data.items():
-        if is_str(val):
+        if is_str(val) or isinstance(val, (int, float, bool)):
            pairs.append(str(key)+'='+str(val))
-        elif isinstance(val, (list, tuple)):
+        elif isinstance(val, (list, tuple, set)):
            pairs.append(str(key)+'='+','.join(map(str, val)))
-        elif isinstance(val, (int, float, bool)):
-            pairs.append(str(key)+'='+str(val))
        else:
            raise TypeError('unknow type of parameter:%s , got:%s'
                            % (key, type(val).__name__))
    return ' '.join(pairs)
 """macro definition of data type in c_api of LightGBM"""
 C_API_DTYPE_FLOAT32 = 0
 C_API_DTYPE_FLOAT64 = 1
@@ -207,7 +199,7 @@ class Predictor(object):
                self.handle,
                ctypes.byref(out_num_class)))
            self.num_class = out_num_class.value
-            self.__num_total_iteration = out_num_iterations.value
+            self.num_total_iteration = out_num_iterations.value
        elif booster_handle is not None:
            self.__is_manage_handle = is_manage_handle
            self.handle = booster_handle
@@ -220,7 +212,7 @@ class Predictor(object):
            _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
                self.handle,
                ctypes.byref(out_num_iterations)))
-            self.__num_total_iteration = out_num_iterations.value
+            self.num_total_iteration = out_num_iterations.value
        else:
            raise TypeError('Need Model file to create a booster')
@@ -239,9 +231,9 @@ class Predictor(object):
        ----------
        data : string/numpy array/scipy.sparse
            Data source for prediction
-            When data is string type, it represents the path of txt file,
+            When data type is string, it represents the path of txt file
        num_iteration : int
-            used iteration for prediction
+            Used iteration for prediction
        raw_score : bool
            True for predict raw score
        pred_leaf : bool
@@ -249,23 +241,22 @@ class Predictor(object):
        data_has_header : bool
            Used for txt data
        is_reshape : bool
-            True for reshape to [nrow, ...]
+            Reshape to (nrow, ncol) if true
        Returns
        -------
        Prediction result
        """
        if isinstance(data, Dataset):
-            raise TypeError("cannot use Dataset instance for prediction, \
+            raise TypeError("cannot use Dataset instance for prediction, please use raw data instead")
-                            please use raw data instead")
        predict_type = C_API_PREDICT_NORMAL
        if raw_score:
            predict_type = C_API_PREDICT_RAW_SCORE
        if pred_leaf:
            predict_type = C_API_PREDICT_LEAF_INDEX
        int_data_has_header = 1 if data_has_header else 0
-        if num_iteration > self.__num_total_iteration:
+        if num_iteration > self.num_total_iteration:
-            num_iteration = self.__num_total_iteration
+            num_iteration = self.num_total_iteration
        if is_str(data):
            tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name
            _safe_call(_LIB.LGBM_BoosterPredictForFile(
@@ -275,22 +266,20 @@ class Predictor(object):
                predict_type,
                num_iteration,
                c_str(tmp_pred_fname)))
-            tmp_file = open(tmp_pred_fname, "r")
+            with open(tmp_pred_fname, "r") as tmp_file:
                lines = tmp_file.readlines()
-            tmp_file.close()
                nrow = len(lines)
-            preds = []
+                preds = [float(token) for line in lines for token in line.split('\t')]
-            for line in lines:
+                preds = np.array(preds, dtype=np.float32, copy=False)
-                for token in line.split('\t'):
-                    preds.append(float(token))
-            preds = np.array(preds, copy=False)
-            os.remove(tmp_pred_fname)
        elif isinstance(data, scipy.sparse.csr_matrix):
            preds, nrow = self.__pred_for_csr(data, num_iteration,
                                              predict_type)
        elif isinstance(data, np.ndarray):
            preds, nrow = self.__pred_for_np2d(data, num_iteration,
                                               predict_type)
+        elif IS_PANDAS_INSTALLED and isinstance(data, DataFrame):
+            preds, nrow = self.__pred_for_np2d(data.values, num_iteration,
+                                               predict_type)
        else:
            try:
                csr = scipy.sparse.csr_matrix(data)
@@ -301,12 +290,11 @@ class Predictor(object):
                                format(type(data).__name__))
        if pred_leaf:
            preds = preds.astype(np.int32)
-        if preds.size != nrow and is_reshape:
+        if is_reshape and preds.size != nrow:
            if preds.size % nrow == 0:
-                ncol = int(preds.size / nrow)
+                preds = preds.reshape(nrow, -1)
-                preds = preds.reshape(nrow, ncol)
            else:
-                raise ValueError('len of predict result(%d) cannot be divide nrow (%d)'
+                raise ValueError('length of predict result (%d) cannot be divide nrow (%d)'
                                 % (preds.size, nrow))
        return preds
@@ -314,9 +302,9 @@ class Predictor(object):
        n_preds = self.num_class * nrow
        if predict_type == C_API_PREDICT_LEAF_INDEX:
            if num_iteration > 0:
-                n_preds *= min(num_iteration, self.__num_total_iteration)
+                n_preds *= min(num_iteration, self.num_total_iteration)
            else:
-                n_preds *= self.__num_total_iteration
+                n_preds *= self.num_total_iteration
        return n_preds
    def __pred_for_np2d(self, mat, num_iteration, predict_type):
@@ -386,7 +374,7 @@ class Predictor(object):
 PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
                       'int64': 'int', 'uint8': 'int', 'uint16': 'int',
                       'uint32': 'int', 'uint64': 'int', 'float16': 'float',
-                       'float32': 'float', 'float64': 'float', 'bool': 'i'}
+                       'float32': 'float', 'float64': 'float', 'bool': 'int'}
 def _data_from_pandas(data):
    if isinstance(data, DataFrame):
@@ -427,26 +415,26 @@ class Dataset(object):
        ----------
        data : string/numpy array/scipy.sparse
            Data source of Dataset.
-            When data is string type, it represents the path of txt file,
+            When data type is string, it represents the path of txt file
        label : list or numpy 1-D array, optional
            Label of the data
        max_bin : int, required
-            max number of discrete bin for features
+            Max number of discrete bin for features
        reference : Other Dataset, optional
            If this dataset is used for validation, the training data must be given as reference
        weight : list or numpy 1-D array , optional
            Weight for each instance.
        group : list or numpy 1-D array , optional
-            group/query size for dataset
+            Group/query size for dataset
        silent : boolean, optional
            Whether print messages during construction
        feature_name : list of str
-            feature names
+            Feature names
-        categorical_feature : list of str/int
+        categorical_feature : list of str or int
-            categorical features , int type to use index, 
+            Categorical features, type int represents index, \
-            str type to use feature names (feature_name cannot be None)
+            type str represents feature names (need to specify feature_name as well)
        params: dict, optional
-            other parameters
+            Other parameters
        """
        self.__label = None
        self.__weight = None
@@ -469,17 +457,17 @@ class Dataset(object):
            params["verbose"] = 1
        """get categorical features"""
        if categorical_feature is not None:
-            categorical_indices = []
+            categorical_indices = set()
            feature_dict = {}
            if feature_name is not None:
-                feature_dict =dict((name, i) for i, name in enumerate(feature_name))
+                feature_dict = {name: i for i, name in enumerate(feature_name)}
            for name in categorical_feature:
                if is_str(name) and name in feature_dict:
-                    categorical_indices.append(feature_dict[name])
+                    categorical_indices.add(feature_dict[name])
                elif isinstance(name, int):
-                    categorical_indices.append(name)
+                    categorical_indices.add(name)
                else:
-                    raise TypeError("unknown type({}) or unknown name({}) in categorical_feature"
+                    raise TypeError("unknown type({}) or unknown name({}) in categorical_feature" \
                        .format(type(name).__name__, name))
            params['categorical_column'] = categorical_indices
@@ -494,8 +482,8 @@ class Dataset(object):
        """start construct data"""
        if is_str(data):
            """check data has header or not"""
-            if "has_header" in params or "header" in params:
+            if params.get("has_header", "").lower() == "true" \
-                if params["has_header"].lower() == "true" or params["header"].lower() == "true":
+                or params.get("header", "").lower() == "true":
                self.data_has_header = True
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_DatasetCreateFromFile(
@@ -548,17 +536,17 @@ class Dataset(object):
        ----------
        data : string/numpy array/scipy.sparse
            Data source of Dataset.
-            When data is string type, it represents the path of txt file,
+            When data type is string, it represents the path of txt file
        label : list or numpy 1-D array, optional
            Label of the training data.
        weight : list or numpy 1-D array , optional
            Weight for each instance.
        group : list or numpy 1-D array , optional
-            group/query size for dataset
+            Group/query size for dataset
        silent : boolean, optional
            Whether print messages during construction
        params: dict, optional
-            other parameters
+            Other parameters
        """
        return Dataset(data, label=label, max_bin=self.max_bin, reference=self,
                       weight=weight, group=group, predictor=self.predictor,
@@ -658,7 +646,7 @@ class Dataset(object):
        Returns
        -------
        info : array
-            a numpy array of information of the data
+            A numpy array of information of the data
        """
        tmp_out_len = ctypes.c_int64()
        out_type = ctypes.c_int32()
@@ -700,6 +688,9 @@ class Dataset(object):
                0,
                FIELD_TYPE_MAPPER[field_name]))
            return
+        if IS_PANDAS_INSTALLED and isinstance(data, Series):
+            dtype = np.int32 if field_name == 'group' else np.float32
+            data = data.astype(dtype).values
        if not is_numpy_1d_array(data):
            raise TypeError("Unknow type({})".format(type(data).__name__))
        if data.dtype == np.float32:
@@ -719,7 +710,6 @@ class Dataset(object):
            len(data),
            type_data))
    def save_binary(self, filename):
        """Save Dataset to binary file
@@ -737,7 +727,7 @@ class Dataset(object):
        Parameters
        ----------
-        label: array like
+        label: numpy array or list or None
            The label information to be set into Dataset
        """
        label = list_to_1d_numpy(label, np.float32)
@@ -749,7 +739,7 @@ class Dataset(object):
        Parameters
        ----------
-        weight : array like
+        weight : numpy array or list or None
            Weight for each data point
        """
        if weight is not None:
@@ -759,10 +749,11 @@ class Dataset(object):
    def set_init_score(self, score):
        """ Set init score of booster to start from.
        Parameters
        ----------
-        score: array like
+        score: numpy array or list or None
+            Init score for booster
        """
        if score is not None:
            score = list_to_1d_numpy(score, np.float32)
@@ -774,7 +765,7 @@ class Dataset(object):
        Parameters
        ----------
-        group : array like
+        group : numpy array or list or None
            Group size of each group
        """
        if group is not None:
@@ -782,7 +773,6 @@ class Dataset(object):
        self.__group = group
        self.set_field('group', group)
    def get_label(self):
        """Get the label of the Dataset.
@@ -854,7 +844,7 @@ class Dataset(object):
        return ret.value
 class Booster(object):
-    """"A Booster of of LightGBM.
+    """A Booster of LightGBM.
    """
    def __init__(self, params=None, train_set=None, model_file=None, silent=False):
        """Initialize the Booster.
@@ -864,7 +854,7 @@ class Booster(object):
        params : dict
            Parameters for boosters.
        train_set : Dataset
-            training dataset
+            Training dataset
        model_file : string
            Path to the model file.
        silent : boolean, optional
@@ -884,7 +874,7 @@ class Booster(object):
        if train_set is not None:
            """Training task"""
            if not isinstance(train_set, Dataset):
-                raise TypeError('training data should be Dataset instance, met{}'.format(type(train_set).__name__))
+                raise TypeError('training data should be Dataset instance, met {}'.format(type(train_set).__name__))
            params_str = param_dict_to_str(params)
            """construct booster object"""
            _safe_call(_LIB.LGBM_BoosterCreate(
@@ -938,9 +928,9 @@ class Booster(object):
        Parameters
        ----------
        data : Dataset
-            validation data
+            Validation data
        name : String
-            name of validation data
+            Name of validation data
        """
        if data.predictor is not self.init_predictor:
            raise Exception("Add validation data failed, you should use same predictor for these data")
@@ -959,7 +949,7 @@ class Booster(object):
        Parameters
        ----------
        params : dict
-            params
+            New parameters for boosters
        silent : boolean, optional
            Whether print messages during construction
        """
@@ -977,9 +967,11 @@ class Booster(object):
        Note: for multi-class task, the score is group by class_id first, then group by row_id
              if you want to get i-th row score in j-th class, the access way is score[j*num_data+i]
              and you should group grad and hess in this way as well
        Parameters
        ----------
-        train_set : training data, None means use last training data
+        train_set :
+            Training data, None means use last training data
        fobj : function
            Customized objective function.
@@ -1014,6 +1006,7 @@ class Booster(object):
        Note: for multi-class task, the score is group by class_id first, then group by row_id
              if you want to get i-th row score in j-th class, the access way is score[j*num_data+i]
              and you should group grad and hess in this way as well
        Parameters
        ----------
        grad : 1d numpy or 1d list
@@ -1036,7 +1029,7 @@ class Booster(object):
            else:
                raise TypeError("hess should be numpy 1d array or 1d list")
        if len(grad) != len(hess):
-            raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
+            raise ValueError('grad / hess lengths mismatch: {} / {}'.format(len(grad), len(hess)))
        if grad.dtype != np.float32:
            grad = grad.astype(np.float32, copy=False)
        if hess.dtype != np.float32:
@@ -1071,7 +1064,8 @@ class Booster(object):
        Parameters
        ----------
        data : Dataset object
-        name : name of data
+        name :
+            Name of data
        feval : function
            Custom evaluation function.
        Returns
@@ -1124,10 +1118,8 @@ class Booster(object):
        result: str
            Evaluation result list.
        """
-        ret = []
+        return [item for i in range(1, self.__num_dataset) \
-        for i in range(1, self.__num_dataset):
+            for item in self.__inner_eval(self.name_valid_sets[i-1], i, feval)]
-            ret.extend(self.__inner_eval(self.name_valid_sets[i-1], i, feval))
-        return ret
    def save_model(self, filename, num_iteration=-1):
        """Save model of booster to file
@@ -1135,9 +1127,9 @@ class Booster(object):
        Parameters
        ----------
        filename : str
-            filename to save
+            Filename to save
        num_iteration: int
-            number of iteration that want to save. < 0 means save all
+            Number of iterations to save. < 0 means save all
        """
        _safe_call(_LIB.LGBM_BoosterSaveModel(
            self.handle,
@@ -1145,8 +1137,7 @@ class Booster(object):
            c_str(filename)))
    def dump_model(self):
-        """
+        """Dump model to json format
-        Dump model to json format
        Returns
        -------
@@ -1162,6 +1153,7 @@ class Booster(object):
            ctypes.byref(tmp_out_len),
            ctypes.byref(ptr_string_buffer)))
        actual_len = tmp_out_len.value
+        '''if buffer length is not long enough, reallocate a buffer'''
        if actual_len > buffer_len:
            string_buffer = ctypes.create_string_buffer(actual_len)
            ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
@@ -1173,16 +1165,15 @@ class Booster(object):
        return json.loads(string_buffer.value.decode())
    def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
-        """
+        """Predict logic
-        Predict logic
        Parameters
        ----------
        data : string/numpy array/scipy.sparse
            Data source for prediction
-            When data is string type, it represents the path of txt file,
+            When data type is string, it represents the path of txt file
        num_iteration : int
-            used iteration for prediction
+            Used iteration for prediction
        raw_score : bool
            True for predict raw score
        pred_leaf : bool
@@ -1190,7 +1181,7 @@ class Booster(object):
        data_has_header : bool
            Used for txt data
        is_reshape : bool
-            True for reshape to [nrow, ...]
+            Reshape to (nrow, ncol) if true
        Returns
        -------
@@ -1207,6 +1198,29 @@ class Booster(object):
        self.__is_manage_handle = False
        return predictor
+    def feature_importance(self, importance_type='split'):
+        """Feature importances
+        Returns
+        -------
+        Array of feature importances
+        """
+        if importance_type not in ["split", "gain"]:
+            raise KeyError("importance_type must be split or gain")
+        dump_model = self.dump_model()
+        ret = [0] * (dump_model["max_feature_idx"] + 1)
+        def dfs(root):
+            if "split_feature" in root:
+                if importance_type == 'split':
+                    ret[root["split_feature"]] += 1
+                elif importance_type == 'gain':
+                    ret[root["split_feature"]] += root["split_gain"]
+                dfs(root["left_child"])
+                dfs(root["right_child"])
+        for tree in dump_model["tree_info"]:
+            dfs(tree["tree_structure"])
+        return np.array(ret)
    def __inner_eval(self, data_name, data_idx, feval=None):
        """
        Evaluate training or validation data
@@ -1291,16 +1305,11 @@ class Booster(object):
                    ptr_string_buffers))
                if self.__num_inner_eval != tmp_out_len.value:
                    raise ValueError("size of eval names doesn't equal with num_evals")
-                self.__name_inner_eval = []
+                self.__name_inner_eval = \
-                for i in range(self.__num_inner_eval):
+                    [string_buffers[i].value.decode() for i in range(self.__num_inner_eval)]
-                    self.__name_inner_eval.append(string_buffers[i].value.decode())
+                self.__higher_better_inner_eval = \
-                self.__higher_better_inner_eval = []
+                    [name.startswith(('auc', 'ndcg')) for name in self.__name_inner_eval]
-                higher_better_metric = ['auc', 'ndcg']
-                for name in self.__name_inner_eval:
-                    if any(name.startswith(x) for x in higher_better_metric):
-                        self.__higher_better_inner_eval.append(True)
-                    else:
-                        self.__higher_better_inner_eval.append(False)
    def attr(self, key):
        """Get attribute string from the Booster.
@@ -1314,10 +1323,7 @@ class Booster(object):
        value : str
            The attribute value of the key, returns None if attribute do not exist.
        """
-        if key in self.__attr:
+        return self.__attr.get(key, None)
-            return self.__attr[key]
-        else:
-            return None
    def set_attr(self, **kwargs):
        """Set the attribute of the Booster.
@@ -1330,7 +1336,7 @@ class Booster(object):
        for key, value in kwargs.items():
            if value is not None:
                if not is_str(value):
-                    raise ValueError("Set Attr only accepts string values")
+                    raise ValueError("set_attr only accepts string values")
                self.__attr[key] = value
            else:
                self.__attr.pop(key, None)
--- a/python-package/lightgbm/callback.py
+++ b/python-package/lightgbm/callback.py
@@ -55,7 +55,7 @@ def print_evaluation(period=1, show_stdv=True):
    """
    def callback(env):
        """internal function"""
-        if len(env.evaluation_result_list) == 0 or period is False:
+        if not env.evaluation_result_list or period <= 0:
            return
        if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
            result = '\t'.join([_format_eval_result(x, show_stdv) \
@@ -83,15 +83,12 @@ def record_evaluation(eval_result):
    def init(env):
        """internal function"""
-        for data_name, eval_name, _, _ in env.evaluation_result_list:
+        for data_name, _, _, _ in env.evaluation_result_list:
-            if data_name not in eval_result:
+            eval_result.setdefault(data_name, collections.defaultdict(list))
-                eval_result[data_name] = {}
-            if eval_name not in eval_result[data_name]:
-                eval_result[data_name][eval_name] = []
    def callback(env):
        """internal function"""
-        if len(eval_result) == 0:
+        if not eval_result:
            init(env)
        for data_name, eval_name, result, _ in env.evaluation_result_list:
            eval_result[data_name][eval_name].append(result)
@@ -99,17 +96,17 @@ def record_evaluation(eval_result):
 def reset_learning_rate(learning_rates):
-    """Reset learning rate after iteration 1
+    """Reset learning rate after first iteration
    NOTE: the initial learning rate will still take effect on the first iteration.
    Parameters
    ----------
    learning_rates: list or function
-        List of learning rate for each boosting round
+        List of learning rate for each boosting round \
-        or a customized function that calculates learning_rate in terms of
+        or a customized function that calculates learning_rate in terms of \
-        current number of round and the total number of boosting round (e.g. yields
+        current number of round and the total number of boosting round \
-        learning rate decay)
+        (e.g. yields learning rate decay)
        - list l: learning_rate = l[current_round]
        - function f: learning_rate = f(current_round, total_boost_round)
@@ -121,13 +118,13 @@ def reset_learning_rate(learning_rates):
    def callback(env):
        """internal function"""
        booster = env.model
-        i = env.iteration
+        iteration = env.iteration
        if isinstance(learning_rates, list):
            if len(learning_rates) != env.end_iteration:
                raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
-            booster.reset_parameter({'learning_rate':learning_rates[i]})
+            booster.reset_parameter({'learning_rate':learning_rates[iteration]})
        else:
-            booster.reset_parameter({'learning_rate':learning_rates(i, env.end_iteration)})
+            booster.reset_parameter({'learning_rate':learning_rates(iteration, env.end_iteration)})
    callback.before_iteration = True
    return callback
@@ -157,7 +154,7 @@ def early_stop(stopping_rounds, verbose=True):
    best_msg = {}
    def init(env):
        """internal function"""
-        if len(env.evaluation_result_list) == 0:
+        if not env.evaluation_result_list:
            raise ValueError('For early stopping you need at least one set in evals.')
        if verbose:
@@ -169,13 +166,11 @@ def early_stop(stopping_rounds, verbose=True):
            best_iter[i] = 0
            if verbose:
                best_msg[i] = ""
-            factor_to_bigger_better[i] = -1.0
+            factor_to_bigger_better[i] = 1.0 if env.evaluation_result_list[i][3] else -1.0
-            if env.evaluation_result_list[i][3]:
-                factor_to_bigger_better[i] = 1.0
    def callback(env):
        """internal function"""
-        if len(best_score) == 0:
+        if not best_score:
            init(env)
        for i in range(len(env.evaluation_result_list)):
            score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i]
@@ -190,6 +185,7 @@ def early_stop(stopping_rounds, verbose=True):
                    if env.model is not None:
                        env.model.set_attr(best_iteration=str(best_iter[i]))
                    if verbose:
-                        print('early stopping, best iteration is:\n{}'.format(best_msg[i]))
+                        print('early stopping, best iteration is:')
+                        print(best_msg[i])
                    raise EarlyStopException(best_iter[i])
    return callback
--- a/python-package/lightgbm/engine.py
+++ b/python-package/lightgbm/engine.py
@@ -21,9 +21,9 @@ def _construct_dataset(X_y, reference=None,
    if other_fields is not None:
        if not isinstance(other_fields, dict):
            raise TypeError("other filed data should be dict type")
-        weight = None if 'weight' not in other_fields else other_fields['weight']
+        weight = other_fields.get('weight', None)
-        group = None if 'group' not in other_fields else other_fields['group']
+        group = other_fields.get('group', None)
-        init_score = None if 'init_score' not in other_fields else other_fields['init_score']
+        init_score = other_fields.get('init_score', None)
    if is_str(X_y):
        data = X_y
        label = None
@@ -58,15 +58,15 @@ def train(params, train_data, num_boost_round=100,
    Parameters
    ----------
    params : dict
-         params.
+        Parameters for training.
    train_data : Dataset, tuple (X, y) or filename of data
        Data to be trained.
    num_boost_round: int
        Number of boosting iterations.
-    valid_datas: list of Datasets, tuples (valid_X, valid_y) or filename of data
+    valid_datas: list of Datasets, tuples (valid_X, valid_y) or filenames of data
        List of data to be evaluated during training
    valid_names: list of string
-        names of valid_datas
+        Names of valid_datas
    fobj : function
        Customized objective function.
    feval : function
@@ -75,17 +75,17 @@ def train(params, train_data, num_boost_round=100,
    init_model : file name of lightgbm model or 'Booster' instance
        model used for continued train
    train_fields : dict
-        other data file in training data. e.g. train_fields['weight'] is weight data
+        Other data file in training data. e.g. train_fields['weight'] is weight data
-        support fields: weight, group, init_score
+        Support fields: weight, group, init_score
    valid_fields : dict
-        other data file in training data. \
+        Other data file in validation data. \
        e.g. valid_fields[0]['weight'] is weight data for first valid data
-        support fields: weight, group, init_score
+        Support fields: weight, group, init_score
    feature_name : list of str
-        feature names
+        Feature names
-    categorical_feature : list of str/int
+    categorical_feature : list of str or int
-        categorical features , int type to use index, 
+        Categorical features, type int represents index, \
-        str type to use feature names (feature_name cannot be None)
+        type str represents feature names (need to specify feature_name as well)
    early_stopping_rounds: int
        Activates early stopping.
        Requires at least one validation data and one metric
@@ -101,18 +101,18 @@ def train(params, train_data, num_boost_round=100,
        passed with None means no using this function
    verbose_eval : bool or int
        Requires at least one item in evals.
-        If `verbose_eval` is True then the evaluation metric on the validation set is
+        If `verbose_eval` is True then the evaluation metric on the validation set is \
        printed at each boosting stage.
-        If `verbose_eval` is an integer then the evaluation metric on the validation set
+        If `verbose_eval` is an integer then the evaluation metric on the validation set \
-        is printed at every given `verbose_eval` boosting stage. The last boosting stage
+        is printed at every given `verbose_eval` boosting stage. The last boosting stage \
        / the boosting stage found by using `early_stopping_rounds` is also printed.
-        Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
+        Example: with verbose_eval=4 and at least one item in evals, an evaluation metric \
        is printed every 4 boosting stages, instead of every boosting stage.
    learning_rates: list or function
-        List of learning rate for each boosting round
+        List of learning rate for each boosting round \
-        or a customized function that calculates learning_rate in terms of
+        or a customized function that calculates learning_rate in terms of \
-        current number of round and the total number of boosting round (e.g. yields
+        current number of round and the total number of boosting round \
-        learning rate decay)
+        (e.g. yields learning rate decay)
        - list l: learning_rate = l[current_round]
        - function f: learning_rate = f(current_round, total_boost_round)
    callbacks : list of callback functions
@@ -131,9 +131,13 @@ def train(params, train_data, num_boost_round=100,
        predictor = init_model
    else:
        predictor = None
+    init_iteration = predictor.num_total_iteration if predictor else 0
    """create dataset"""
    if isinstance(train_data, Dataset):
        train_set = train_data
+        if train_fields is not None:
+            for field, data in train_fields.items():
+                train_set.set_field(field, data)
    else:
        train_set = _construct_dataset(train_data, None, params,
                                       other_fields=train_fields,
@@ -150,7 +154,7 @@ def train(params, train_data, num_boost_round=100,
        if isinstance(valid_names, str):
            valid_names = [valid_names]
        for i, valid_data in enumerate(valid_datas):
-            other_fields = None if valid_fields is None else valid_fields[i]
+            other_fields = None if valid_fields is None else valid_fields.get(i, None)
            """reduce cost for prediction training data"""
            if valid_data[0] is train_data[0] and valid_data[1] is train_data[1]:
                is_valid_contain_train = True
@@ -159,6 +163,9 @@ def train(params, train_data, num_boost_round=100,
                continue
            if isinstance(valid_data, Dataset):
                valid_set = valid_data
+                if other_fields is not None:
+                    for field, data in other_fields.items():
+                        valid_set.set_field(field, data)
            else:
                valid_set = _construct_dataset(
                    valid_data,
@@ -169,7 +176,7 @@ def train(params, train_data, num_boost_round=100,
                    categorical_feature=categorical_feature,
                    predictor=predictor)
            valid_sets.append(valid_set)
-            if valid_names is not None:
+            if valid_names is not None and len(valid_names) > i:
                name_valid_sets.append(valid_names[i])
            else:
                name_valid_sets.append('valid_'+str(i))
@@ -179,13 +186,13 @@ def train(params, train_data, num_boost_round=100,
    # Most of legacy advanced options becomes callbacks
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation())
-    else:
+    elif isinstance(verbose_eval, int):
-        if isinstance(verbose_eval, int):
        callbacks.append(callback.print_evaluation(verbose_eval))
    if early_stopping_rounds is not None:
        callbacks.append(callback.early_stop(early_stopping_rounds,
                                             verbose=bool(verbose_eval)))
    if learning_rates is not None:
        callbacks.append(callback.reset_learning_rate(learning_rates))
@@ -197,32 +204,26 @@ def train(params, train_data, num_boost_round=100,
    callbacks_after_iter = [
        cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
    """construct booster"""
-    if 'metric' in params:
-        if is_str(params['metric']):
-            params['metric'] = params['metric'].split(',')
-        else:
-            params['metric'] = list(params['metric'])
    booster = Booster(params=params, train_set=train_set)
    if is_valid_contain_train:
        booster.set_train_data_name(train_data_name)
    for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
        booster.add_valid(valid_set, name_valid_set)
    """start training"""
-    for i in range(num_boost_round):
+    for i in range(init_iteration, init_iteration + num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=booster,
                                    cvfolds=None,
                                    iteration=i,
-                                    begin_iteration=0,
+                                    begin_iteration=init_iteration,
-                                    end_iteration=num_boost_round,
+                                    end_iteration=init_iteration + num_boost_round,
                                    evaluation_result_list=None))
        booster.update(fobj=fobj)
        evaluation_result_list = []
        # check evaluation result.
-        if len(valid_sets) != 0:
+        if valid_sets:
            if is_valid_contain_train:
                evaluation_result_list.extend(booster.eval_train(feval))
            evaluation_result_list.extend(booster.eval_valid(feval))
@@ -231,8 +232,8 @@ def train(params, train_data, num_boost_round=100,
                cb(callback.CallbackEnv(model=booster,
                                        cvfolds=None,
                                        iteration=i,
-                                        begin_iteration=0,
+                                        begin_iteration=init_iteration,
-                                        end_iteration=num_boost_round,
+                                        end_iteration=init_iteration + num_boost_round,
                                        evaluation_result_list=evaluation_result_list))
        except callback.EarlyStopException:
            break
@@ -347,24 +348,24 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
    feval : function
        Custom evaluation function.
    train_fields : dict
-        other data file in training data. e.g. train_fields['weight'] is weight data
+        Other data file in training data. e.g. train_fields['weight'] is weight data
-        support fields: weight, group, init_score
+        Support fields: weight, group, init_score
    feature_name : list of str
-        feature names
+        Feature names
-    categorical_feature : list of str/int
+    categorical_feature : list of str or int
-        categorical features , int type to use index, 
+        Categorical features, type int represents index, \
-        str type to use feature names (feature_name cannot be None)
+        type str represents feature names (need to specify feature_name as well)
    early_stopping_rounds: int
-        Activates early stopping. CV error needs to decrease at least
+        Activates early stopping. CV error needs to decrease at least \
        every <early_stopping_rounds> round(s) to continue.
        Last entry in evaluation history is the one from best iteration.
    fpreproc : function
-        Preprocessing function that takes (dtrain, dtest, param) and returns
+        Preprocessing function that takes (dtrain, dtest, param) and returns \
        transformed versions of those.
    verbose_eval : bool, int, or None, default None
-        Whether to display the progress. If None, progress will be displayed
+        Whether to display the progress. If None, progress will be displayed \
-        when np.ndarray is returned. If True, progress will be displayed at
+        when np.ndarray is returned. If True, progress will be displayed at \
-        boosting stage. If an integer is given, progress will be displayed
+        boosting stage. If an integer is given, progress will be displayed \
        at every given `verbose_eval` boosting stage.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
@@ -378,22 +379,11 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
    -------
    evaluation history : list(string)
    """
+    if metrics:
-    if isinstance(metrics, str):
+        params.setdefault('metric', [])
-        metrics = [metrics]
+        if is_str(metrics):
+            params['metric'].append(metrics)
-    if isinstance(params, list):
-        params = dict(params)
-    if 'metric' not in params:
-        params['metric'] = []
-    else:
-        if is_str(params['metric']):
-            params['metric'] = params['metric'].split(',')
        else:
-            params['metric'] = list(params['metric'])
-    if metrics is not None and len(metrics) > 0:
            params['metric'].extend(metrics)
    train_set = _construct_dataset(train_data, None, params,
@@ -411,8 +401,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
                                             verbose=False))
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
-    else:
+    elif isinstance(verbose_eval, int):
-        if isinstance(verbose_eval, int):
        callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
    callbacks_before_iter = [

--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
 # coding: utf-8
-# pylint: disable = invalid-name, W0105
+# pylint: disable = invalid-name, W0105, C0111
 """Scikit-Learn Wrapper interface for LightGBM."""
 from __future__ import absolute_import
 import numpy as np
-from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
+from .basic import LightGBMError, is_str
 from .engine import train
 # sklearn
 try:
@@ -66,7 +66,7 @@ def _point_wise_objective(func):
                num_data = len(weight)
                num_class = len(grad) // num_data
                if num_class * num_data != len(grad):
-                    raise ValueError("length of grad and hess should equal with num_class * num_data")
+                    raise ValueError("length of grad and hess should equal to num_class * num_data")
                for k in range(num_class):
                    for i in range(num_data):
                        idx = k * num_data + i
@@ -169,6 +169,7 @@ class LGBMModel(LGBMModelBase):
        self.is_unbalance = is_unbalance
        self.seed = seed
        self._Booster = None
+        self.best_iteration = -1
        if callable(self.objective):
            self.fobj = _point_wise_objective(self.objective)
        else:
@@ -190,7 +191,6 @@ class LGBMModel(LGBMModelBase):
    def get_params(self, deep=False):
        """Get parameters"""
        params = super(LGBMModel, self).get_params(deep=deep)
-        params['verbose'] = 0 if self.silent else 1
        if self.nthread <= 0:
            params.pop('nthread', None)
        return params
@@ -213,30 +213,31 @@ class LGBMModel(LGBMModelBase):
            A list of (X, y) tuple pairs to use as a validation set for early-stopping
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use.
-            If callable, a custom evaluation metric. The call
+            If callable, a custom evaluation metric. The call \
-            signature is func(y_predicted, dataset) where dataset will be a
+            signature is func(y_predicted, dataset) where dataset will be a \
-            Dataset fobject such that you may need to call the get_label
+            Dataset object such that you may need to call the get_label \
            method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
        early_stopping_rounds : int
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation
        train_fields : dict
-            other data file in training data. e.g. train_fields['weight'] is weight data
+            Other data file in training data. e.g. train_fields['weight'] is weight data
-            support fields: weight, group, init_score
+            Support fields: weight, group, init_score
        valid_fields : dict
-            other data file in training data. \
+            Other data file in validation data. \
            e.g. valid_fields[0]['weight'] is weight data for first valid data
-            support fields: weight, group, init_score
+            Support fields: weight, group, init_score
        feature_name : list of str
-            feature names
+            Feature names
-        categorical_feature : list of str/int
+        categorical_feature : list of str or int
-            categorical features , int type to use index, 
+            Categorical features, type int represents index, \
-            str type to use feature names (feature_name cannot be None)
+            type str represents feature names (need to specify feature_name as well)
        other_params: dict
-            other parameters
+            Other parameters
        """
        evals_result = {}
        params = self.get_params()
+        params['verbose'] = 0 if self.silent else 1
        if other_params is not None:
            params.update(other_params)
@@ -317,6 +318,14 @@ class LGBMModel(LGBMModelBase):
        return evals_result
+    def feature_importance(self):
+        """Feature importances
+        Returns
+        -------
+        Array of normalized feature importances
+        """
+        importace_array = self._Booster.feature_importance().astype(np.float32)
+        return importace_array / importace_array.sum()
 class LGBMRegressor(LGBMModel, LGBMRegressorBase):
    __doc__ = """Implementation of the scikit-learn API for LightGBM regression.
@@ -394,7 +403,7 @@ def _group_wise_objective(func):
        y_true: array_like of shape [n_samples]
            The target values
        group : array_like of shape
-            group size data of data
+            Group size data of data
        y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
            The predicted values
    Returns

--- a/python-package/setup.py
+++ b/python-package/setup.py
@@ -5,7 +5,7 @@ from __future__ import absolute_import
 import sys
 import os
 from setuptools import setup, find_packages
-# import subprocess
 sys.path.insert(0, '.')
 CURRENT_DIR = os.path.dirname(__file__)

--- a/src/io/tree.cpp
+++ b/src/io/tree.cpp
@@ -227,8 +227,6 @@ Tree::Tree(const std::string& str) {
  leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_);
  leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);
 }
 }  // namespace LightGBM
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -101,6 +101,7 @@ def test_early_stopping():
    from sklearn.datasets import load_boston
    from sklearn.cross_validation import KFold
    from sklearn import datasets, metrics, model_selection
+    from sklearn.base import clone
    boston = load_boston()
    y = boston['target']
@@ -111,6 +112,7 @@ def test_early_stopping():
                eval_metric='l2', 
                early_stopping_rounds=10,
                verbose=10)
+    lgb_model_clone = clone(lgb_model)
    print(lgb_model.best_iteration)
 test_binary_classification()