Commit ebfc8521 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

add an advanced example; add guide-python README.md details; clean error messages (#117)

parent b51c7be4
......@@ -16,3 +16,23 @@ Now you can run examples in this folder, for example:
```
python simple_example.py
```
Examples include:
- [simple_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py)
- Construct Dataset
- Basic train and predict
- Eval during training
- Early stopping
- Save model to file
- Dump model to json format
- Feature importances
- [sklearn_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py)
- Basic train and predict with sklearn interface
- Feature importances with sklearn interface
- [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
- Set feature names
- Directly use categorical features without one-hot encoding
- Load model file to continue training
- Change learning rates during training
- Self-defined objective function
- Self-defined eval metric
- Callback function
\ No newline at end of file
# coding: utf-8
# pylint: disable = invalid-name, C0111
import lightgbm as lgb
import pandas as pd
import numpy as np
# load or create your dataset
# binary.train / binary.test are tab-separated with the label in column 0;
# the .weight files hold one per-sample weight per row
print('Load data...')
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]
# split label (column 0) from the feature columns
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
num_train, num_feature = X_train.shape
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
weight=W_train, free_raw_data=False)
# reference=lgb_train makes the eval set share the training set's bin mappers
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
weight=W_test, free_raw_data=False)
# specify your configurations as a dict
params = {
'boosting_type' : 'gbdt',
'objective' : 'binary',
'metric' : 'binary_logloss',
'num_leaves' : 31,
'learning_rate' : 0.05,
'feature_fraction' : 0.9,
'bagging_fraction' : 0.8,
'bagging_freq': 5,
'verbose' : 0
}
# generate a feature name for each column (feature_0, feature_1, ...)
feature_name = ['feature_' + str(col) for col in range(num_feature)]
print('Start training...')
# feature_name and categorical_feature
# column index 21 is treated as categorical without one-hot encoding
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21])
# check feature name
print('Finish first 10 rounds...')
print('7th feature name is:', repr(lgb_train.feature_name[6]))
# save model to file
gbm.save_model('model.txt')
# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model='model.txt',
valid_sets=lgb_eval)
print('Finish 10 - 20 rounds with model file...')
# decay learning rates
# learning_rates accepts:
# 1. list/tuple with length = num_boost_round
# 2. function(curr_iter)
# 3. function(curr_iter, total_iter)
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
learning_rates=lambda iter: 0.05 * (0.99 ** iter),
valid_sets=lgb_eval)
print('Finish 20 - 30 rounds with decay learning rates...')
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelood(preds, train_data):
    """Self-defined objective: log likelihood loss for binary classification.

    f(preds: array, train_data: Dataset) -> grad: array, hess: array
    `preds` are raw scores; they are mapped through the sigmoid first.
    """
    y_true = train_data.get_label()
    prob = 1. / (1. + np.exp(-preds))
    # gradient and hessian of the logistic loss w.r.t. the raw score
    return prob - y_true, prob * (1. - prob)
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
# binary error
def binary_error(preds, train_data):
    """Self-defined eval metric: binary classification error rate.

    f(preds: array, train_data: Dataset) -> name: string, value: float, is_higher_better: bool
    Predictions above 0.5 are counted as the positive class.
    """
    y_true = train_data.get_label()
    hard_preds = preds > 0.5
    return 'error', np.mean(y_true != hard_preds), False
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
fobj=loglikelood,
feval=binary_error,
valid_sets=lgb_eval)
print('Finish 30 - 40 rounds with self-defined objective function and eval metric...')
print('Start a new training job...')
# callback
def reset_metrics():
    """Build a training callback that registers an extra validation set.

    Returns a callable run before each boosting iteration; at the 5th
    iteration (relative to begin_iteration) it attaches a fresh eval
    Dataset named 'new valid' to the running Booster.
    """
    def callback(env):
        # a new Dataset is constructed on every call, matching the
        # original example's behavior
        fresh_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(fresh_valid, 'new valid')
    # run before (not after) each iteration, at default ordering
    callback.before_iteration = True
    callback.order = 0
    return callback
# train a fresh model, exercising the reset_metrics() callback defined above
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train,
callbacks=[reset_metrics()])
print('Finish first 10 rounds with callback function...')
......@@ -6,6 +6,7 @@ import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
......@@ -18,7 +19,6 @@ X_test = df_test.drop(0, axis=1)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
'task' : 'train',
......@@ -33,27 +33,32 @@ params = {
'verbose' : 0
}
print('Start training...')
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
num_boost_round=20,
valid_sets=lgb_eval,
early_stopping_rounds=10)
early_stopping_rounds=5)
print('Save model...')
# save model to file
gbm.save_model('model.txt')
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Dump model to JSON...')
# dump model to json (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4)
print('Calculate feature importances...')
# feature importances
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', gbm.feature_importance("gain"))
print('Feature importances:', list(gbm.feature_importance()))
# print('Feature importances:', list(gbm.feature_importance("gain")))
......@@ -5,6 +5,7 @@ import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
......@@ -13,19 +14,23 @@ y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
print('Start training...')
# train
gbm = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
learning_rate=0.05,
n_estimators=100)
n_estimators=20)
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
early_stopping_rounds=10)
eval_metric='l1',
early_stopping_rounds=5)
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Calculate feature importances...')
# feature importances
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', list(gbm.feature_importance()))
# coding: utf-8
# pylint: disable = invalid-name, C0111, C0301, R0912, R0913, R0914, W0105
# pylint: disable = invalid-name, C0111, C0301
# pylint: disable = R0912, R0913, R0914, W0105, W0201, W0212
# pylint: disable = E1101
"""Wrapper c_api of LightGBM"""
from __future__ import absolute_import
......@@ -17,13 +18,11 @@ from .libpath import find_lib_path
"""pandas"""
try:
from pandas import Series, DataFrame
IS_PANDAS_INSTALLED = True
except ImportError:
class Series(object):
pass
class DataFrame(object):
pass
IS_PANDAS_INSTALLED = False
IS_PY3 = (sys.version_info[0] == 3)
......@@ -72,7 +71,7 @@ def is_1d_list(data):
return isinstance(data, list) and \
(not data or isinstance(data[0], (int, float, bool)))
def list_to_1d_numpy(data, dtype):
def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"""convert to 1d numpy array"""
if is_numpy_1d_array(data):
if data.dtype == dtype:
......@@ -81,28 +80,26 @@ def list_to_1d_numpy(data, dtype):
return data.astype(dtype=dtype, copy=False)
elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False)
elif IS_PANDAS_INSTALLED and isinstance(data, Series):
return data.astype(dtype).values
elif isinstance(data, Series):
return data.values.astype(dtype)
else:
raise TypeError("Unknow type({})".format(type(data).__name__))
raise TypeError("Wrong type({}) for {}, should be list or numpy array".format(type(data).__name__, name))
def cfloat32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array.
"""
if isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
res = np.fromiter(cptr, dtype=np.float32, count=length)
return res
return np.fromiter(cptr, dtype=np.float32, count=length)
else:
raise RuntimeError('expected float pointer')
raise RuntimeError('Expected float pointer')
def cint32_array_to_numpy(cptr, length):
"""Convert a ctypes float pointer array to a numpy array.
"""
if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
res = np.fromiter(cptr, dtype=np.int32, count=length)
return res
return np.fromiter(cptr, dtype=np.int32, count=length)
else:
raise RuntimeError('expected int pointer')
raise RuntimeError('Expected int pointer')
def c_str(string):
"""Convert a python string to cstring."""
......@@ -113,7 +110,7 @@ def c_array(ctype, values):
return (ctype * len(values))(*values)
def param_dict_to_str(data):
if not data:
if data is None or not data:
return ""
pairs = []
for key, val in data.items():
......@@ -122,7 +119,7 @@ def param_dict_to_str(data):
elif isinstance(val, (list, tuple, set)):
pairs.append(str(key)+'='+','.join(map(str, val)))
else:
raise TypeError('unknow type of parameter:%s , got:%s'
raise TypeError('Unknown type of parameter:%s, got:%s'
% (key, type(val).__name__))
return ' '.join(pairs)
......@@ -158,10 +155,10 @@ def c_float_array(data):
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
type_data = C_API_DTYPE_FLOAT64
else:
raise TypeError("expected np.float32 or np.float64, met type({})"
raise TypeError("Expected np.float32 or np.float64, met type({})"
.format(data.dtype))
else:
raise TypeError("Unknow type({})".format(type(data).__name__))
raise TypeError("Unknown type({})".format(type(data).__name__))
return (ptr_data, type_data)
def c_int_array(data):
......@@ -176,10 +173,10 @@ def c_int_array(data):
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
type_data = C_API_DTYPE_INT64
else:
raise TypeError("expected np.int32 or np.int64, met type({})"
raise TypeError("Expected np.int32 or np.int64, met type({})"
.format(data.dtype))
else:
raise TypeError("Unknow type({})".format(type(data).__name__))
raise TypeError("Unknown type({})".format(type(data).__name__))
return (ptr_data, type_data)
class _InnerPredictor(object):
......@@ -261,7 +258,7 @@ class _InnerPredictor(object):
Prediction result
"""
if isinstance(data, (_InnerDataset, Dataset)):
raise TypeError("cannot use Dataset instance for prediction, please use raw data instead")
raise TypeError("Cannot use Dataset instance for prediction, please use raw data instead")
predict_type = C_API_PREDICT_NORMAL
if raw_score:
predict_type = C_API_PREDICT_RAW_SCORE
......@@ -290,7 +287,7 @@ class _InnerPredictor(object):
elif isinstance(data, np.ndarray):
preds, nrow = self.__pred_for_np2d(data, num_iteration,
predict_type)
elif IS_PANDAS_INSTALLED and isinstance(data, DataFrame):
elif isinstance(data, DataFrame):
preds, nrow = self.__pred_for_np2d(data.values, num_iteration,
predict_type)
else:
......@@ -299,15 +296,14 @@ class _InnerPredictor(object):
preds, nrow = self.__pred_for_csr(csr, num_iteration,
predict_type)
except:
raise TypeError('can not predict data for type {}'.
format(type(data).__name__))
raise TypeError('Cannot predict data for type {}'.format(type(data).__name__))
if pred_leaf:
preds = preds.astype(np.int32)
if is_reshape and preds.size != nrow:
if preds.size % nrow == 0:
preds = preds.reshape(nrow, -1)
else:
raise ValueError('length of predict result (%d) cannot be divide nrow (%d)'
raise ValueError('Length of predict result (%d) cannot be divide nrow (%d)'
% (preds.size, nrow))
return preds
......@@ -353,7 +349,7 @@ class _InnerPredictor(object):
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
))
if n_preds != out_num_preds.value:
raise ValueError("incorrect number for predict result")
raise ValueError("Wrong length for predict results")
return preds, mat.shape[0]
def __pred_for_csr(self, csr, num_iteration, predict_type):
......@@ -384,7 +380,7 @@ class _InnerPredictor(object):
preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
))
if n_preds != out_num_preds.value:
raise ValueError("incorrect number for predict result")
raise ValueError("Wrong length for predict results")
return preds, nrow
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
......@@ -481,10 +477,10 @@ class _InnerDataset(object):
elif isinstance(name, int):
categorical_indices.add(name)
else:
raise TypeError("unknown type({}) or unknown name({}) in categorical_feature" \
raise TypeError("Wrong type({}) or unknown name({}) in categorical_feature" \
.format(type(name).__name__, name))
params['categorical_column'] = categorical_indices
params['categorical_column'] = sorted(categorical_indices)
params_str = param_dict_to_str(params)
"""process for reference dataset"""
......@@ -514,11 +510,11 @@ class _InnerDataset(object):
csr = scipy.sparse.csr_matrix(data)
self.__init_from_csr(csr, params_str, ref_dataset)
except:
raise TypeError('can not initialize _InnerDataset from {}'.format(type(data).__name__))
raise TypeError('Cannot initialize _InnerDataset from {}'.format(type(data).__name__))
if label is not None:
self.set_label(label)
if self.get_label() is None:
raise ValueError("label should not be None")
raise ValueError("Label should not be None")
if weight is not None:
self.set_weight(weight)
if group is not None:
......@@ -572,7 +568,7 @@ class _InnerDataset(object):
"""
Get subset of current dataset
"""
used_indices = list_to_1d_numpy(used_indices, np.int32)
used_indices = list_to_1d_numpy(used_indices, np.int32, name='used_indices')
ret = _InnerDataset(None)
ret.handle = ctypes.c_void_p()
params_str = param_dict_to_str(params)
......@@ -585,7 +581,7 @@ class _InnerDataset(object):
ret.max_bin = self.max_bin
ret.predictor = self.predictor
if ret.get_label() is None:
raise ValueError("label should not be None")
raise ValueError("Label should not be None")
return ret
def set_feature_name(self, feature_name):
......@@ -595,7 +591,7 @@ class _InnerDataset(object):
if feature_name is None:
return
if len(feature_name) != self.num_feature():
raise ValueError("size of feature_name error")
raise ValueError("Length of feature_name({}) and num_feature({}) don't match".format(len(feature_name), self.num_feature()))
c_feature_name = [c_str(name) for name in feature_name]
_safe_call(_LIB.LGBM_DatasetSetFeatureNames(
self.handle,
......@@ -632,7 +628,7 @@ class _InnerDataset(object):
Initialize data from a CSR matrix.
"""
if len(csr.indices) != len(csr.data):
raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
raise ValueError('Length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
self.handle = ctypes.c_void_p()
ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
......@@ -685,7 +681,7 @@ class _InnerDataset(object):
elif out_type.value == C_API_DTYPE_FLOAT32:
return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
else:
raise TypeError("unknow type")
raise TypeError("Unknown type")
def set_field(self, field_name, data):
"""Set property into the _InnerDataset.
......@@ -707,11 +703,8 @@ class _InnerDataset(object):
0,
FIELD_TYPE_MAPPER[field_name]))
return
if IS_PANDAS_INSTALLED and isinstance(data, Series):
dtype = np.int32 if field_name == 'group' else np.float32
data = data.astype(dtype).values
if not is_numpy_1d_array(data):
raise TypeError("Unknow type({})".format(type(data).__name__))
data = list_to_1d_numpy(data, dtype, name=field_name)
if data.dtype == np.float32:
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
type_data = C_API_DTYPE_FLOAT32
......@@ -719,9 +712,9 @@ class _InnerDataset(object):
ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
type_data = C_API_DTYPE_INT32
else:
raise TypeError("expected np.float32 or np.int32, met type({})".format(data.dtype))
raise TypeError("Expected np.float32 or np.int32, met type({})".format(data.dtype))
if type_data != FIELD_TYPE_MAPPER[field_name]:
raise TypeError("type error for set_field")
raise TypeError("Input type error for set_field")
_safe_call(_LIB.LGBM_DatasetSetField(
self.handle,
c_str(field_name),
......@@ -749,7 +742,7 @@ class _InnerDataset(object):
label: numpy array or list or None
The label information to be set into _InnerDataset
"""
label = list_to_1d_numpy(label, np.float32)
label = list_to_1d_numpy(label, name='label')
self.set_field('label', label)
def set_weight(self, weight):
......@@ -761,7 +754,7 @@ class _InnerDataset(object):
Weight for each data point
"""
if weight is not None:
weight = list_to_1d_numpy(weight, np.float32)
weight = list_to_1d_numpy(weight, name='weight')
self.set_field('weight', weight)
def set_init_score(self, score):
......@@ -773,7 +766,7 @@ class _InnerDataset(object):
Init score for booster
"""
if score is not None:
score = list_to_1d_numpy(score, np.float32)
score = list_to_1d_numpy(score, name='init score')
self.set_field('init_score', score)
def set_group(self, group):
......@@ -785,7 +778,7 @@ class _InnerDataset(object):
Group size of each group
"""
if group is not None:
group = list_to_1d_numpy(group, np.int32)
group = list_to_1d_numpy(group, np.int32, name='group')
self.set_field('group', group)
def get_label(self):
......@@ -941,7 +934,8 @@ class Dataset(object):
else:
self.inner_dataset = _InnerDataset(self.data, self.label, self.max_bin,
None, self.weight, self.group, self._predictor,
self.silent, self.feature_name, self.categorical_feature, self.params)
self.silent, self.feature_name,
self.categorical_feature, self.params)
if self.free_raw_data:
self.data = None
......@@ -994,7 +988,7 @@ class Dataset(object):
Parameters
----------
reference : Dataset
will use reference as template to construct current dataset
Will use reference as template to construct current dataset
"""
self.set_categorical_feature(reference.categorical_feature)
self.set_feature_name(reference.feature_name)
......@@ -1015,7 +1009,7 @@ class Dataset(object):
Parameters
----------
feature_name : list of str
feature names
Feature names
"""
self.feature_name = feature_name
if self.__is_constructed():
......@@ -1028,9 +1022,9 @@ class Dataset(object):
Parameters
----------
used_indices : list of int
use indices of this subset
Used indices of this subset
params : dict
other parameters
Other parameters
"""
ret = Dataset(None)
ret.feature_name = self.feature_name
......@@ -1198,7 +1192,7 @@ class Booster(object):
if train_set is not None:
"""Training task"""
if not isinstance(train_set, Dataset):
raise TypeError('training data should be Dataset instance, met {}'.format(type(train_set).__name__))
raise TypeError('Training data should be Dataset instance, met {}'.format(type(train_set).__name__))
params_str = param_dict_to_str(params)
"""construct booster object"""
_safe_call(_LIB.LGBM_BoosterCreate(
......@@ -1237,7 +1231,7 @@ class Booster(object):
ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value
else:
raise TypeError('At least need training dataset or model file to create booster instance')
raise TypeError('Need at least one training dataset or model file to create booster instance')
def __del__(self):
if self.handle is not None:
......@@ -1342,22 +1336,10 @@ class Booster(object):
-------
is_finished, bool
"""
if not is_numpy_1d_array(grad):
if is_1d_list(grad):
grad = np.array(grad, dtype=np.float32, copy=False)
else:
raise TypeError("grad should be numpy 1d array or 1d list")
if not is_numpy_1d_array(hess):
if is_1d_list(hess):
hess = np.array(hess, dtype=np.float32, copy=False)
else:
raise TypeError("hess should be numpy 1d array or 1d list")
grad = list_to_1d_numpy(grad, name='gradient')
hess = list_to_1d_numpy(hess, name='hessian')
if len(grad) != len(hess):
raise ValueError('grad / hess lengths mismatch: {} / {}'.format(len(grad), len(hess)))
if grad.dtype != np.float32:
grad = grad.astype(np.float32, copy=False)
if hess.dtype != np.float32:
hess = hess.astype(np.float32, copy=False)
raise ValueError("Lengths of gradient({}) and hessian({}) don't match".format(len(grad), len(hess)))
is_finished = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
self.handle,
......@@ -1548,7 +1530,7 @@ class Booster(object):
Evaulate training or validation data
"""
if data_idx >= self.__num_dataset:
raise ValueError("data_idx should be smaller than number of dataset")
raise ValueError("Data_idx should be smaller than number of dataset")
self.__get_eval_info()
ret = []
if self.__num_inner_eval > 0:
......@@ -1560,7 +1542,7 @@ class Booster(object):
ctypes.byref(tmp_out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
if tmp_out_len.value != self.__num_inner_eval:
raise ValueError("incorrect number of eval results")
raise ValueError("Wrong length of eval results")
for i in range(self.__num_inner_eval):
ret.append((data_name, self.__name_inner_eval[i], result[i], self.__higher_better_inner_eval[i]))
if feval is not None:
......@@ -1582,7 +1564,7 @@ class Booster(object):
Predict for training and validation dataset
"""
if data_idx >= self.__num_dataset:
raise ValueError("data_idx should be smaller than number of dataset")
raise ValueError("Data_idx should be smaller than number of dataset")
if self.__inner_predict_buffer[data_idx] is None:
if data_idx == 0:
n_preds = self.train_set.num_data() * self.__num_class
......@@ -1600,7 +1582,7 @@ class Booster(object):
ctypes.byref(tmp_out_len),
data_ptr))
if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError("incorrect number of predict results for data %d" % (data_idx))
raise ValueError("Wrong length of predict results for data %d" % (data_idx))
self.__is_predicted_cur_iter[data_idx] = True
return self.__inner_predict_buffer[data_idx]
......@@ -1626,7 +1608,7 @@ class Booster(object):
ctypes.byref(tmp_out_len),
ptr_string_buffers))
if self.__num_inner_eval != tmp_out_len.value:
raise ValueError("size of eval names doesn't equal with num_evals")
raise ValueError("Length of eval names doesn't equal with num_evals")
self.__name_inner_eval = \
[string_buffers[i].value.decode() for i in range(self.__num_inner_eval)]
self.__higher_better_inner_eval = \
......@@ -1658,7 +1640,7 @@ class Booster(object):
for key, value in kwargs.items():
if value is not None:
if not is_str(value):
raise ValueError("set_attr only accepts string values")
raise ValueError("Set attr only accepts strings")
self.__attr[key] = value
else:
self.__attr.pop(key, None)
......@@ -35,7 +35,7 @@ def _format_eval_result(value, show_stdv=True):
else:
return '%s\'s %s:%g' % (value[0], value[1], value[2])
else:
raise ValueError("wrong metric value")
raise ValueError("Wrong metric value")
def print_evaluation(period=1, show_stdv=True):
......@@ -80,7 +80,7 @@ def record_evaluation(eval_result):
The requested callback function.
"""
if not isinstance(eval_result, dict):
raise TypeError('eval_result has to be a dictionary')
raise TypeError('Eval_result should be a dictionary')
eval_result.clear()
def init(env):
......@@ -164,7 +164,7 @@ def early_stop(stopping_rounds, verbose=True):
def init(env):
"""internal function"""
if not env.evaluation_result_list:
raise ValueError('For early stopping you need at least one set in evals.')
raise ValueError('For early stopping, at least one dataset is required for evaluation')
if verbose:
msg = "Train until valid scores didn't improve in {} rounds."
......@@ -194,7 +194,7 @@ def early_stop(stopping_rounds, verbose=True):
if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose:
print('early stopping, best iteration is:')
print('Early stopping, best iteration is:')
print(best_msg[i])
raise EarlyStopException(best_iter[i])
callback.order = 30
......
......@@ -85,10 +85,10 @@ def train(params, train_set, num_boost_round=100,
predictor = init_model._to_predictor()
else:
predictor = None
init_iteration = predictor.num_total_iteration if predictor else 0
init_iteration = predictor.num_total_iteration if predictor is not None else 0
"""check dataset"""
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for training")
raise TypeError("Training only accepts Dataset object")
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
......@@ -98,7 +98,7 @@ def train(params, train_set, num_boost_round=100,
train_data_name = "training"
reduced_valid_sets = []
name_valid_sets = []
if valid_sets:
if valid_sets is not None:
if isinstance(valid_sets, Dataset):
valid_sets = [valid_sets]
if isinstance(valid_names, str):
......@@ -111,7 +111,7 @@ def train(params, train_set, num_boost_round=100,
train_data_name = valid_names[i]
continue
if not isinstance(valid_data, Dataset):
raise TypeError("only can accept Dataset instance for training")
raise TypeError("Training only accepts Dataset object")
valid_data.set_reference(train_set)
reduced_valid_sets.append(valid_data)
if valid_names is not None and len(valid_names) > i:
......@@ -120,7 +120,7 @@ def train(params, train_set, num_boost_round=100,
name_valid_sets.append('valid_'+str(i))
"""process callbacks"""
if not callbacks:
if callbacks is None:
callbacks = set()
else:
for i, cb in enumerate(callbacks):
......@@ -133,7 +133,7 @@ def train(params, train_set, num_boost_round=100,
elif isinstance(verbose_eval, int):
callbacks.add(callback.print_evaluation(verbose_eval))
if early_stopping_rounds:
if early_stopping_rounds is not None:
callbacks.add(callback.early_stop(early_stopping_rounds,
verbose=bool(verbose_eval)))
......@@ -169,7 +169,7 @@ def train(params, train_set, num_boost_round=100,
evaluation_result_list = []
# check evaluation result.
if valid_sets:
if valid_sets is not None:
if is_valid_contain_train:
evaluation_result_list.extend(booster.eval_train(feval))
evaluation_result_list.extend(booster.eval_valid(feval))
......@@ -227,7 +227,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
sfk = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
idset = [x[1] for x in sfk.split(X=full_data.get_label(), y=full_data.get_label())]
else:
raise LightGBMError('sklearn needs to be installed in order to use stratified cv')
raise LightGBMError('Scikit-learn is required for stratified cv')
else:
full_data.construct()
randidx = np.random.permutation(full_data.num_data())
......@@ -318,7 +318,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
evaluation history : list(string)
"""
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for training")
raise TypeError("Training only accepts Dataset object")
if is_str(init_model):
predictor = _InnerPredictor(model_file=init_model)
......@@ -342,13 +342,13 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)
# setup callbacks
if not callbacks:
if callbacks is None:
callbacks = set()
else:
for i, cb in enumerate(callbacks):
cb.__dict__.setdefault('order', i - len(callbacks))
callbacks = set(callbacks)
if early_stopping_rounds:
if early_stopping_rounds is not None:
callbacks.add(callback.early_stop(early_stopping_rounds, verbose=False))
if verbose_eval is True:
callbacks.add(callback.print_evaluation(show_stdv=show_stdv))
......
......@@ -6,7 +6,7 @@ from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Dataset, is_str
from .engine import train
# sklearn
'''sklearn'''
try:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin
......@@ -38,7 +38,6 @@ def _point_wise_objective(func):
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values
Returns
-------
new_func: callable
......@@ -66,7 +65,7 @@ def _point_wise_objective(func):
num_data = len(weight)
num_class = len(grad) // num_data
if num_class * num_data != len(grad):
raise ValueError("length of grad and hess should equal to num_class * num_data")
raise ValueError("Length of grad and hess should equal to num_class * num_data")
for k in range(num_class):
for i in range(num_data):
idx = k * num_data + i
......@@ -147,7 +146,7 @@ class LGBMModel(LGBMModelBase):
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
if not SKLEARN_INSTALLED:
raise LightGBMError('sklearn needs to be installed in order to use this module')
raise LightGBMError('Scikit-learn is required for this module')
self.num_leaves = num_leaves
self.max_depth = max_depth
......@@ -185,7 +184,7 @@ class LGBMModel(LGBMModelBase):
booster : a lightgbm booster of underlying model
"""
if self._Booster is None:
raise LightGBMError('need to call fit beforehand')
raise LightGBMError('Need to call fit beforehand')
return self._Booster
def get_params(self, deep=False):
......@@ -343,7 +342,7 @@ class LGBMModel(LGBMModelBase):
if self.evals_result_:
evals_result = self.evals_result_
else:
raise LightGBMError('No results.')
raise LightGBMError('No results found.')
return evals_result
......@@ -390,8 +389,8 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
is_unbalance=False, seed=0):
super(LGBMClassifier, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
silent, objective, nthread,
min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
......@@ -480,7 +479,7 @@ def _group_wise_objective(func):
labels = dataset.get_label()
group = dataset.get_group()
if group is None:
raise ValueError("group should not be None for ranking task")
raise ValueError("Group should not be None for ranking task")
grad, hess = func(labels, group, preds)
"""weighted for objective"""
weight = dataset.get_weight()
......@@ -490,7 +489,7 @@ def _group_wise_objective(func):
grad = np.multiply(grad, weight)
hess = np.multiply(hess, weight)
else:
raise ValueError("lenght of grad and hess should equal with num_data")
raise ValueError("Length of grad and hess should equal with num_data")
return grad, hess
return inner
......@@ -508,8 +507,8 @@ class LGBMRanker(LGBMModel):
is_unbalance=False, seed=0):
super(LGBMRanker, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
silent, objective, nthread,
min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
......@@ -535,17 +534,18 @@ class LGBMRanker(LGBMModel):
"""check group data"""
if group is None:
raise ValueError("should use group for ranking task")
raise ValueError("Should set group for ranking task")
if eval_set is not None:
if eval_group is None:
raise ValueError("eval_group cannot be None when eval_set is not None")
raise ValueError("Eval_group cannot be None when eval_set is not None")
elif len(eval_group) != len(eval_set):
raise ValueError("length of eval_group should equal with eval_set")
raise ValueError("Length of eval_group should equal to eval_set")
else:
for inner_group in eval_group:
if inner_group is None:
raise ValueError("should set group for all eval data for ranking task")
raise ValueError("Should set group for all eval dataset for ranking task")
if eval_at is not None:
other_params = {} if other_params is None else other_params
other_params['ndcg_eval_at'] = list(eval_at)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment