Commit 9f4849b3 authored by Guolin Ke's avatar Guolin Ke Committed by GitHub

Merge pull request #97 from wxchan/dev

Clean codes for python-package; dump model to JSON
parents fa51a676 69114525
......@@ -17,7 +17,7 @@ For more details, please refer to [Features](https://github.com/Microsoft/LightG
News
----
12/02/2012 : Release [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) beta version, welcome to have a try and provide issues and feedback.
12/02/2016 : Release [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) beta version, welcome to have a try and provide issues and feedback.
Get Started
------------
......
Python Package Example
======================
Here is an example of how to use the LightGBM Python package.
***You should install LightGBM (both the C++ library and the Python package) first.***
For the installation, check the wiki [here](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide).
You also need scikit-learn and pandas to run the examples, but they are not required for the package itself. You can install them with pip:
```
pip install -U scikit-learn
pip install -U pandas
```
Now you can run examples in this folder, for example:
```
python simple_example.py
```
# coding: utf-8
# pylint: disable = invalid-name, C0111
import json
import random
import numpy as np
import lightgbm as lgb
from sklearn import datasets, metrics, model_selection
rng = np.random.RandomState(2016)
X, y = datasets.make_classification(n_samples=10000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMClassifier(n_estimators=100).fit(x_train, y_train, [(x_test, y_test)], eval_metric="auc")
lgb_model.predict(x_test)
# save model
lgb_model.booster().save_model('model.txt')
# load model
booster = lgb.Booster(model_file='model.txt')
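As a quick sanity check (a sketch, not part of the original example), the loaded booster should reproduce the original model's raw predictions:
```
import numpy as np
# both calls use Booster.predict with default arguments, so they should agree
np.testing.assert_allclose(lgb_model.booster().predict(x_test),
                           booster.predict(x_test))
```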
import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# or you can simply use a tuple of length=2 here
lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test)
# specify your configurations as a dict
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': 'l2',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
# 'ndcg_eval_at': [1, 3, 5, 10],
# this metric is not needed for this task; it is shown only as an example
'verbose': 0
}
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
valid_datas=lgb_eval,
# you can use a list to represent multiple valid_datas/valid_names
# don't use a tuple: a tuple is used to represent one dataset
early_stopping_rounds=10)
# save model to file
gbm.save_model('model.txt')
# load model from file
gbm = lgb.Booster(model_file='model.txt')
# predict with the loaded model
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
print(y_pred)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# dump model to json (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)
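Since the dump is plain JSON, it is easy to inspect programmatically. Below is a small sketch (not part of the original example); the key names follow the C++ `DumpModel()`/`ToJSON()` code later in this commit, where leaf nodes carry `leaf_index` and internal nodes carry `left_child`/`right_child`:
```
def count_leaves(node):
    """Recursively count the leaves of one dumped tree."""
    if 'leaf_index' in node:
        return 1
    return count_leaves(node['left_child']) + count_leaves(node['right_child'])

print('leaves in tree 0:',
      count_leaves(model_json['tree_info'][0]['tree_structure']))
```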
# coding: utf-8
# pylint: disable = invalid-name, C0111
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
# train
gbm = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
learning_rate=0.05,
n_estimators=100)
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
early_stopping_rounds=10)
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
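The new `dump_model()` is also reachable from the sklearn wrapper through the `booster()` accessor used earlier; a minimal sketch:
```
# dump the fitted sklearn-style model via its underlying Booster
model_json = gbm.booster().dump_model()
print('number of trees:', len(model_json['tree_info']))
```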
......@@ -133,9 +133,16 @@ public:
const double* feature_values) const = 0;
/*!
* \brief save model to file
* \param num_iterations Iterations that want to save, -1 means save all
* \param filename filename that want to save to
* \brief Dump model to json format string
* \return Json format string of model
*/
virtual std::string DumpModel() const = 0;
/*!
* \brief Save model to file
* \param num_iterations Number of iterations to save, -1 means save all
* \param filename Filename to save to
*/
virtual void SaveModelToFile(int num_iterations, const char* filename) const = 0;
......
......@@ -474,7 +474,18 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
int num_iteration,
const char* filename);
/*!
* \brief dump model to json
* \param handle handle
* \param buffer_len string buffer length, if buffer_len < out_len, re-allocate buffer
* \param out_len actual output length
* \param out_str json format string of model
* \return 0 when successful, -1 when a failure happens
*/
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
int buffer_len,
int64_t* out_len,
char** out_str);
// some help functions used to convert data
......
......@@ -98,9 +98,12 @@ public:
}
}
/*! \brief Serialize this object by string*/
/*! \brief Serialize this object to string*/
std::string ToString();
/*! \brief Serialize this object to json*/
std::string ToJSON();
private:
/*!
* \brief Find the index of the leaf that a record falls into, given its feature values
......@@ -118,6 +121,9 @@ private:
*/
inline int GetLeaf(const double* feature_values) const;
/*! \brief Serialize one node to json*/
inline std::string NodeToJSON(int index);
/*! \brief Number of max leaves*/
int max_leaves_;
/*! \brief Number of current leaves*/
......@@ -137,13 +143,13 @@ private:
std::vector<double> threshold_;
/*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_;
/*! \brief Output of internal nodes(save internal output for per inference feature importance calc) */
std::vector<double> internal_value_;
// used for leaf node
/*! \brief The parent of leaf */
std::vector<int> leaf_parent_;
/*! \brief Output of leaves */
std::vector<double> leaf_value_;
/*! \brief Output of internal nodes(save internal output for per inference feature importance calc) */
std::vector<double> internal_value_;
/*! \brief Depth for leaves */
std::vector<int> leaf_depth_;
};
......
......@@ -20,4 +20,5 @@ __version__ = 0.1
__all__ = ['Dataset', 'Booster',
'train', 'cv',
'LGBMModel','LGBMRegressor', 'LGBMClassifier', 'LGBMRanker']
\ No newline at end of file
'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker']
# coding: utf-8
# pylint: disable = invalid-name, C0111, R0912, R0913, R0914, W0105
"""Wrapper c_api of LightGBM"""
from __future__ import absolute_import
......@@ -5,12 +7,24 @@ import sys
import os
import ctypes
import tempfile
import json
import numpy as np
import scipy.sparse
from .libpath import find_lib_path
# pandas
try:
    from pandas import Series, DataFrame
    IS_PANDAS_INSTALLED = True
except ImportError:
    IS_PANDAS_INSTALLED = False

    class Series(object):
        pass

    class DataFrame(object):
        pass
IS_PY3 = (sys.version_info[0] == 3)
def _load_lib():
......@@ -69,6 +83,8 @@ def list_to_1d_numpy(data, dtype):
return data.astype(dtype=dtype, copy=False)
elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False)
elif IS_PANDAS_INSTALLED and isinstance(data, Series):
return data.astype(dtype).values
else:
raise TypeError("Unknow type({})".format(type(data).__name__))
......@@ -110,7 +126,7 @@ def param_dict_to_str(data):
elif isinstance(val, (int, float, bool)):
pairs.append(str(key)+'='+str(val))
else:
raise TypeError('unknown type of parameter:%s, got:%s'
% (key, type(val).__name__))
return ' '.join(pairs)
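For the scalar branches shown above, a quick illustration of `param_dict_to_str` (a sketch; the pair order follows dict iteration order, so it may vary):
```
print(param_dict_to_str({'num_leaves': 31, 'learning_rate': 0.05, 'is_unbalance': True}))
# e.g. -> num_leaves=31 learning_rate=0.05 is_unbalance=True
```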
"""marco definition of data type in c_api of LightGBM"""
......@@ -183,7 +199,7 @@ class Predictor(object):
"""Prediction task"""
out_num_iterations = ctypes.c_int64(0)
_safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
c_str(model_file),
ctypes.byref(out_num_iterations),
ctypes.byref(self.handle)))
out_num_class = ctypes.c_int64(0)
......@@ -357,7 +373,7 @@ class Predictor(object):
type_ptr_data,
len(csr.indptr),
len(csr.data),
csr.shape[1],
predict_type,
num_iteration,
ctypes.byref(out_num_preds),
......@@ -367,13 +383,6 @@ class Predictor(object):
raise ValueError("incorrect number for predict result")
return preds, nrow
# pandas
try:
from pandas import DataFrame
except ImportError:
class DataFrame(object):
pass
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'float16': 'float',
......@@ -467,8 +476,8 @@ class Dataset(object):
self.data_has_header = True
self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_DatasetCreateFromFile(
c_str(data),
c_str(params_str),
ref_dataset,
ctypes.byref(self.handle)))
elif isinstance(data, scipy.sparse.csr_matrix):
......@@ -830,6 +839,7 @@ class Booster(object):
self.__is_manage_handle = True
self.__train_data_name = "training"
self.__attr = {}
self.best_iteration = -1
params = {} if params is None else params
if silent:
params["verbose"] = 0
......@@ -1018,7 +1028,7 @@ class Booster(object):
self.handle,
ctypes.byref(out_cur_iter)))
return out_cur_iter.value
def eval(self, data, name, feval=None):
"""Evaluate for data
......@@ -1098,6 +1108,34 @@ class Booster(object):
num_iteration,
c_str(filename)))
def dump_model(self):
"""
Dump model to json format
Returns
-------
Json format of model
"""
buffer_len = 1 << 20
tmp_out_len = ctypes.c_int64(0)
string_buffer = ctypes.create_string_buffer(buffer_len)
ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
_safe_call(_LIB.LGBM_BoosterDumpModel(
self.handle,
buffer_len,
ctypes.byref(tmp_out_len),
ctypes.byref(ptr_string_buffer)))
actual_len = tmp_out_len.value
if actual_len > buffer_len:
string_buffer = ctypes.create_string_buffer(actual_len)
ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
_safe_call(_LIB.LGBM_BoosterDumpModel(
self.handle,
actual_len,
ctypes.byref(tmp_out_len),
ctypes.byref(ptr_string_buffer)))
return json.loads(string_buffer.value.decode())
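A usage sketch for the method above, assuming a saved model file from earlier in the examples (the top-level keys come from the C++ `DumpModel()` in this commit):
```
bst = Booster(model_file='model.txt')
model_json = bst.dump_model()  # already parsed from the JSON string
print(model_json['name'], model_json['num_class'], len(model_json['tree_info']))
```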
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
"""
Predict logic
......@@ -1147,7 +1185,7 @@ class Booster(object):
_safe_call(_LIB.LGBM_BoosterGetEval(
self.handle,
data_idx,
ctypes.byref(tmp_out_len),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
if tmp_out_len.value != self.__num_inner_eval:
raise ValueError("incorrect number of eval results")
......@@ -1190,7 +1228,7 @@ class Booster(object):
ctypes.byref(tmp_out_len),
data_ptr))
if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
raise ValueError("incorrect number of predict results for data %d" % (data_idx) )
raise ValueError("incorrect number of predict results for data %d" % (data_idx))
self.__is_predicted_cur_iter[data_idx] = True
return self.__inner_predict_buffer[data_idx]
......
# coding: utf-8
# pylint: disable = invalid-name, W0105
from __future__ import absolute_import
import collections
......@@ -25,12 +27,12 @@ CallbackEnv = collections.namedtuple(
def _format_eval_result(value, show_stdv=True):
"""format metric string"""
if len(value) == 4:
return '%s_%s:%g' % (value[0], value[1], value[2])
return '%s\'s %s:%g' % (value[0], value[1], value[2])
elif len(value) == 5:
if show_stdv:
return '%s_%s:%g+%g' % (value[0], value[1], value[2], value[4])
return '%s\'s %s:%g+%g' % (value[0], value[1], value[2], value[4])
else:
return '%s_%s:%g' % (value[0], value[1], value[2])
return '%s\'s %s:%g' % (value[0], value[1], value[2])
else:
raise ValueError("wrong metric value")
......@@ -55,9 +57,10 @@ def print_evaluation(period=1, show_stdv=True):
"""internal function"""
if len(env.evaluation_result_list) == 0 or period is False:
return
if (env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration):
result = '\t'.join([_format_eval_result(x, show_stdv) for x in env.evaluation_result_list])
print('[%d]\t%s\n' % (env.iteration, result))
if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
result = '\t'.join([_format_eval_result(x, show_stdv) \
for x in env.evaluation_result_list])
print('[%d]\t%s' % (env.iteration, result))
return callback
......@@ -131,13 +134,13 @@ def reset_learning_rate(learning_rates):
def early_stop(stopping_rounds, verbose=True):
"""Create a callback that activates early stopping.
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them
Parameters
----------
stopp_rounds : int
stopping_rounds : int
Number of rounds without any improvement after which training stops.
verbose : optional, bool
......@@ -154,13 +157,11 @@ def early_stop(stopping_rounds, verbose=True):
best_msg = {}
def init(env):
"""internal function"""
bst = env.model
if len(env.evaluation_result_list) == 0:
raise ValueError('For early stopping you need at least one set in evals.')
if verbose:
msg = "Will train until hasn't improved in {} rounds.\n"
msg = "Train until valid scores didn't improve in {} rounds."
print(msg.format(stopping_rounds))
for i in range(len(env.evaluation_result_list)):
......@@ -182,13 +183,13 @@ def early_stop(stopping_rounds, verbose=True):
best_score[i] = score
best_iter[i] = env.iteration
if verbose:
best_msg[i] = '[%d]\t%s' % ( env.iteration,
best_msg[i] = '[%d]\t%s' % (env.iteration, \
'\t'.join([_format_eval_result(x) for x in env.evaluation_result_list]))
else:
if env.iteration - best_iter[i] >= stopping_rounds:
if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose:
print('early stopping, best message is:\n {} '.format(best_msg[i]))
print('early stopping, best iteration is:\n{}'.format(best_msg[i]))
raise EarlyStopException(best_iter[i])
return callback
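A sketch of wiring these callbacks in explicitly (assuming `train()` accepts a `callbacks` list the way `cv()` does, and that `params`/`lgb_train`/`lgb_eval` come from the earlier examples):
```
gbm = lgb.train(params, lgb_train,
                num_boost_round=100,
                valid_datas=lgb_eval,
                callbacks=[print_evaluation(period=10),
                           early_stop(10, verbose=True)])
```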
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Training Library containing training routines of LightGBM."""
from __future__ import absolute_import
......@@ -6,7 +8,7 @@ from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from . import callback
def _construct_dataset(X_y, reference=None,
params=None, other_fields=None,
predictor=None):
if 'max_bin' in params:
max_bin = int(params['max_bin'])
......@@ -30,10 +32,9 @@ def _construct_dataset(X_y, reference=None,
data = X_y[0]
label = X_y[1]
if reference is None:
ret = Dataset(data, label=label, max_bin=max_bin,
weight=weight, group=group,
predictor=predictor, params=params)
else:
ret = reference.create_valid(data, label=label, weight=weight,
group=group, params=params)
......@@ -53,11 +54,11 @@ def train(params, train_data, num_boost_round=100,
----------
params : dict
Parameters for training.
train_data : pair, (X, y) or filename of data
train_data : Dataset, tuple (X, y) or filename of data
Data to be trained.
num_boost_round: int
Number of boosting iterations.
valid_datas: list of pairs (valid_X, valid_y) or filename of data
valid_datas: list of Datasets, tuples (valid_X, valid_y) or filename of data
List of data to be evaluated during training
valid_names: list of string
names of valid_datas
......@@ -72,18 +73,19 @@ def train(params, train_data, num_boost_round=100,
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
valid_fields : dict
other data file in training data. e.g. valid_fields[0]['weight'] is weight data for first valid data
other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None
This dictionary is used to store all evaluation results of all the items in valid_datas.
Example: with a valid_datas containing [valid_set, train_set] \
and valid_names containing ['eval', 'train'] and a parameter containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}
Passing None means this function is not used.
......@@ -120,26 +122,36 @@ def train(params, train_data, num_boost_round=100,
else:
predictor = None
"""create dataset"""
train_set = _construct_dataset(train_data, None, params, train_fields, predictor)
if isinstance(train_data, Dataset):
train_set = train_data
else:
train_set = _construct_dataset(train_data, None, params, train_fields, predictor)
is_valid_contain_train = False
train_data_name = "training"
valid_sets = []
name_valid_sets = []
if valid_datas is not None:
for i in range(len(valid_datas)):
if isinstance(valid_datas, (Dataset, tuple)):
valid_datas = [valid_datas]
if isinstance(valid_names, str):
valid_names = [valid_names]
for i, valid_data in enumerate(valid_datas):
other_fields = None if valid_fields is None else valid_fields[i]
"""reduce cost for prediction training data"""
if valid_datas[i] is train_data:
if valid_data is train_data:
is_valid_contain_train = True
if valid_names is not None:
train_data_name = valid_names[i]
continue
valid_set = _construct_dataset(
valid_datas[i],
train_set,
params,
other_fields,
predictor)
if isinstance(valid_data, Dataset):
valid_set = valid_data
else:
valid_set = _construct_dataset(
valid_data,
train_set,
params,
other_fields,
predictor)
valid_sets.append(valid_set)
if valid_names is not None:
name_valid_sets.append(valid_names[i])
......@@ -178,8 +190,8 @@ def train(params, train_data, num_boost_round=100,
booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train:
booster.set_train_data_name(train_data_name)
for i in range(len(valid_sets)):
booster.add_valid(valid_sets[i], name_valid_sets[i])
for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
booster.add_valid(valid_set, name_valid_set)
"""start training"""
for i in range(num_boost_round):
for cb in callbacks_before_iter:
......@@ -209,9 +221,9 @@ def train(params, train_data, num_boost_round=100,
except callback.EarlyStopException:
break
if booster.attr('best_iteration') is not None:
booster.best_iteration = int(booster.attr('best_iteration'))
booster.best_iteration = int(booster.attr('best_iteration')) + 1
else:
booster.best_iteration = num_boost_round - 1
booster.best_iteration = num_boost_round
return booster
......@@ -233,13 +245,14 @@ class CVBooster(object):
return self.booster.eval_valid(feval)
try:
    from sklearn.model_selection import KFold, StratifiedKFold
except ImportError:
    from sklearn.cross_validation import KFold, StratifiedKFold
try:
    from sklearn.model_selection import StratifiedKFold
    SKLEARN_StratifiedKFold = True
except ImportError:
    try:
        from sklearn.cross_validation import StratifiedKFold
        SKLEARN_StratifiedKFold = True
    except ImportError:
        SKLEARN_StratifiedKFold = False
def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False):
"""
......@@ -270,7 +283,6 @@ def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False
return ret
def _agg_cv_result(raw_results):
# pylint: disable=invalid-name
"""
Aggregate cross-validation results.
"""
......@@ -294,7 +306,6 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
metrics=(), fobj=None, feval=None, train_fields=None, early_stopping_rounds=None,
fpreproc=None, verbose_eval=None, show_stdv=True, seed=0,
callbacks=None):
# pylint: disable = invalid-name
"""Cross-validation with given paramaters.
Parameters
......@@ -351,7 +362,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
if isinstance(params, list):
params = dict(params)
if not 'metric' in params:
if 'metric' not in params:
params['metric'] = []
else:
if is_str(params['metric']):
......@@ -410,7 +421,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
end_iteration=num_boost_round,
evaluation_result_list=res))
except callback.EarlyStopException as e:
for k in results.keys():
for k in results:
results[k] = results[k][:(e.best_iteration + 1)]
break
return results
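A usage sketch against the `cv()` signature above (illustrative values; `X_train`/`y_train` as in the earlier examples); on early stop, the handler just shown truncates each metric's history to the best iteration:
```
cv_results = cv(params, (X_train, y_train), num_boost_round=100,
                nfold=5, stratified=False, early_stopping_rounds=10,
                seed=0, show_stdv=True)
```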
# coding: utf-8
"""Find the path to lightgbm dynamic library files."""
import os
import platform
import sys
......
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
......@@ -81,9 +83,9 @@ class LGBMModel(LGBMModelBase):
num_leaves : int
Maximum tree leaves for base learners.
max_depth : int
Maximum tree depth for base learners, -1 means no limit.
learning_rate : float
Boosting learning rate
n_estimators : int
Number of boosted trees to fit.
silent : boolean
......@@ -92,7 +94,7 @@ class LGBMModel(LGBMModelBase):
Specify the learning task and the corresponding learning objective or
a custom objective function to be used (see note below).
nthread : int
Number of parallel threads
min_split_gain : float
Minimum loss reduction required to make a further partition on a leaf node of the tree.
min_child_weight : int
......@@ -105,9 +107,9 @@ class LGBMModel(LGBMModelBase):
Frequency of subsampling; <=0 means subsampling is disabled.
colsample_bytree : float
Subsample ratio of columns when constructing each tree.
reg_alpha : float
L1 regularization term on weights
reg_lambda : float
L2 regularization term on weights
scale_pos_weight : float
Balancing of positive and negative weights.
......@@ -122,7 +124,7 @@ class LGBMModel(LGBMModelBase):
parameter. In this case, it should have the signature
``objective(y_true, y_pred) -> grad, hess``:
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples] or shape [n_samples * n_class]
The predicted values
......@@ -137,12 +139,12 @@ class LGBMModel(LGBMModelBase):
and you should group grad and hess in this way as well
"""
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="regression",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
if not SKLEARN_INSTALLED:
raise LightGBMError('sklearn needs to be installed in order to use this module')
......@@ -220,7 +222,8 @@ class LGBMModel(LGBMModelBase):
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
valid_fields : dict
other data file in training data. e.g. valid_fields[0]['weight'] is weight data for first valid data
other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
other_params: dict
other parameters
......@@ -235,6 +238,13 @@ class LGBMModel(LGBMModelBase):
params["objective"] = "None"
else:
params["objective"] = self.objective
if eval_metric is None and eval_set is not None:
eval_metric = {
'regression': 'l2',
'binary': 'binary_logloss',
'lambdarank': 'ndcg',
'multiclass': 'multi_logloss'
}.get(self.objective, None)
if callable(eval_metric):
feval = eval_metric
......@@ -249,7 +259,8 @@ class LGBMModel(LGBMModelBase):
self.n_estimators, valid_datas=eval_set,
early_stopping_rounds=early_stopping_rounds,
evals_result=evals_result, fobj=self.fobj, feval=feval,
verbose_eval=verbose, train_fields=train_fields, valid_fields=valid_fields)
verbose_eval=verbose, train_fields=train_fields,
valid_fields=valid_fields)
if evals_result:
for val in evals_result.items():
......@@ -320,14 +331,18 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
# Switch to using a multiclass objective in the underlying LGBM instance
self.objective = "multiclass"
other_params['num_class'] = self.n_classes_
if eval_metric is None and eval_set is not None:
eval_metric = "multi_logloss"
else:
self.objective = "binary"
if eval_metric is None and eval_set is not None:
eval_metric = "binary_logloss"
self._le = LGBMLabelEncoder().fit(y)
training_labels = self._le.transform(y)
if eval_set is not None:
eval_set = list( (x[0], self._le.transform(x[1])) for x in eval_set )
eval_set = list((x[0], self._le.transform(x[1])) for x in eval_set)
super(LGBMClassifier, self).fit(X, training_labels, eval_set,
eval_metric, early_stopping_rounds,
......@@ -430,6 +445,8 @@ class LGBMRanker(LGBMModel):
else:
self.objective = "lambdarank"
self.fobj = None
if eval_metric is None and eval_set is not None:
eval_metric = "ndcg"
super(LGBMRanker, self).fit(X, y, eval_set, eval_metric,
early_stopping_rounds, verbose,
......
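To see the new default-metric fallback in action (a sketch; the mapping lives in `LGBMModel.fit()` above, with overrides in the classifier and ranker): when an `eval_set` is given but no `eval_metric`, a regressor now evaluates `l2`, a binary classifier `binary_logloss`, a multiclass classifier `multi_logloss`, and a ranker `ndcg`. Assuming `X_train`/`X_test` from the earlier examples:
```
gbm = lgb.LGBMRegressor(n_estimators=50)
# no eval_metric passed: with objective='regression' the wrapper falls back to 'l2'
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])
```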
# coding: utf-8
# pylint: disable=invalid-name, exec-used
"""Setup lightgbm package."""
from __future__ import absolute_import
......
......@@ -393,6 +393,36 @@ void GBDT::Boosting() {
GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
}
std::string GBDT::DumpModel() const {
std::stringstream ss;
ss << "{";
ss << "\"name\":\"" << Name() << "\"," << std::endl;
ss << "\"num_class\":" << num_class_ << "," << std::endl;
ss << "\"label_index\":" << label_idx_ << "," << std::endl;
ss << "\"max_feature_idx\":" << max_feature_idx_ << "," << std::endl;
if (object_function_ != nullptr) {
ss << "\"objective\":\"" << object_function_->GetName() << "\"," << std::endl;
}
ss << "\"sigmoid\":" << sigmoid_ << "," << std::endl;
ss << "\"tree_info\":[";
for (int i = 0; i < static_cast<int>(models_.size()); ++i) {
if (i > 0) {
ss << ",";
}
ss << "{";
ss << "\"tree_index\":" << i << ",";
ss << models_[i]->ToJSON();
ss << "}";
}
ss << "]" << std::endl;
ss << "}" << std::endl;
return ss.str();
}
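For reference, a sketch of what the assembled string parses to on the Python side (field names are taken from the code above; the values and the `dump_str` name are only illustrative):
```
import json
model_json = json.loads(dump_str)  # dump_str: the string returned by DumpModel()
# model_json == {
#     'name': 'gbdt',
#     'num_class': 1,
#     'label_index': 0,
#     'max_feature_idx': 27,
#     'objective': 'regression',
#     'sigmoid': 1.0,
#     'tree_info': [{'tree_index': 0, 'num_leaves': 31, 'tree_structure': {...}}],
# }
```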
void GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
/*! \brief File to write models */
std::ofstream output_file;
......@@ -426,7 +456,11 @@ void GBDT::SaveModelToFile(int num_iteration, const char* filename) const {
output_file << models_[i]->ToString() << std::endl;
}
output_file << std::endl << FeatureImportance() << std::endl;
std::vector<std::pair<size_t, std::string>> pairs = FeatureImportance();
output_file << std::endl << "feature importances:" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) {
output_file << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
}
output_file.close();
}
......@@ -487,7 +521,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
num_init_iteration_ = num_iteration_for_pred_;
}
std::string GBDT::FeatureImportance() const {
std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
std::vector<size_t> feature_importances(max_feature_idx_ + 1, 0);
for (size_t iter = 0; iter < models_.size(); ++iter) {
for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) {
......@@ -507,13 +541,7 @@ std::string GBDT::FeatureImportance() const {
const std::pair<size_t, std::string>& rhs) {
return lhs.first > rhs.first;
});
std::stringstream str_buf;
// write to model file
str_buf << std::endl << "feature importances:" << std::endl;
for (size_t i = 0; i < pairs.size(); ++i) {
str_buf << pairs[i].second << "=" << std::to_string(pairs[i].first) << std::endl;
}
return str_buf.str();
return pairs;
}
std::vector<double> GBDT::PredictRaw(const double* value) const {
......
......@@ -145,9 +145,16 @@ public:
std::vector<int> PredictLeafIndex(const double* value) const override;
/*!
* \brief save model to file
* \param num_iterations Iterations that want to save, -1 means save all
* \param filename filename that want to save to
* \brief Dump model to json format string
* \return Json format string of model
*/
std::string DumpModel() const override;
/*!
* \brief Save model to file
* \param num_iterations Number of iterations to save, -1 means save all
* \param filename Filename to save to
*/
virtual void SaveModelToFile(int num_iterations, const char* filename) const override;
......@@ -155,6 +162,7 @@ public:
* \brief Restore from a serialized string
*/
void LoadModelFromString(const std::string& model_str) override;
/*!
* \brief Get max feature index of this model
* \return Max feature index of this model
......@@ -231,7 +239,7 @@ protected:
* \brief Calculate feature importances
* \param last_iter Last tree use to calculate
*/
std::string FeatureImportance() const;
std::vector<std::pair<size_t, std::string>> FeatureImportance() const;
/*! \brief current iteration */
int iter_;
/*! \brief Pointer to training data */
......
......@@ -139,6 +139,10 @@ public:
boosting_->SaveModelToFile(num_iteration, filename);
}
std::string DumpModel() {
return boosting_->DumpModel();
}
int GetEvalCounts() const {
int ret = 0;
for (const auto& metric : train_metric_) {
......@@ -733,6 +737,20 @@ DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
API_END();
}
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
int buffer_len,
int64_t* out_len,
char** out_str) {
API_BEGIN();
Booster* ref_booster = reinterpret_cast<Booster*>(handle);
std::string model = ref_booster->DumpModel();
*out_len = static_cast<int64_t>(model.size());
if (*out_len <= buffer_len) {
std::strcpy(*out_str, model.c_str());
}
API_END();
}
// ---- start of some help functions
std::function<std::vector<double>(int row_idx)>
......
......@@ -125,6 +125,43 @@ std::string Tree::ToString() {
return ss.str();
}
std::string Tree::ToJSON() {
std::stringstream ss;
ss << "\"num_leaves\":" << num_leaves_ << "," << std::endl;
ss << "\"tree_structure\":" << NodeToJSON(0) << std::endl;
return ss.str();
}
std::string Tree::NodeToJSON(int index) {
std::stringstream ss;
if (index >= 0) {
// non-leaf
ss << "{" << std::endl;
ss << "\"split_index\":" << index << "," << std::endl;
ss << "\"split_feature\":" << split_feature_real_.data()[index] << "," << std::endl;
ss << "\"split_gain\":" << split_gain_.data()[index] << "," << std::endl;
ss << "\"threshold\":" << threshold_.data()[index] << "," << std::endl;
ss << "\"internal_value\":" << internal_value_.data()[index] << "," << std::endl;
ss << "\"left_child\":" << NodeToJSON(left_child_.data()[index]) << "," << std::endl;
ss << "\"right_child\":" << NodeToJSON(right_child_.data()[index]) << std::endl;
ss << "}";
} else {
// leaf
index = ~index;
ss << "{" << std::endl;
ss << "\"leaf_index\":" << index << "," << std::endl;
ss << "\"leaf_parent\":" << leaf_parent_.data()[index] << "," << std::endl;
ss << "\"leaf_value\":" << leaf_value_.data()[index] << std::endl;
ss << "}";
}
return ss.str();
}
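The index convention above deserves a note: children with index >= 0 are internal nodes, while leaf `k` is stored as its bitwise complement `~k` (that is, `-k-1`), which the `index = ~index` line inverts. A minimal Python illustration of the encoding:
```
leaf = 5
encoded = ~leaf          # -6: a negative child index marks a leaf
assert ~encoded == leaf  # the complement is its own inverse
```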
Tree::Tree(const std::string& str) {
std::vector<std::string> lines = Common::Split(str.c_str(), '\n');
std::unordered_map<std::string, std::string> key_vals;
......
# coding: utf-8
import numpy as np
from sklearn import datasets, metrics, model_selection
import lightgbm as lgb
X, Y = datasets.make_classification(n_samples=100000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.1)
......