Commit eba6d200 authored by wxchan's avatar wxchan
Browse files

Squash into one commit:

1. merge python-package
2. add dump model to json
3. fix bugs
4. clean code with pylint
5. update python examples
parent 19e085c9
...@@ -21,9 +21,13 @@ script: ...@@ -21,9 +21,13 @@ script:
- cd $TRAVIS_BUILD_DIR - cd $TRAVIS_BUILD_DIR
- mkdir build && cd build && cmake .. && make -j - mkdir build && cd build && cmake .. && make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR - cd $TRAVIS_BUILD_DIR
- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j - rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
notifications: notifications:
email: false email: false
......
LightGBM, Light Gradient Boosting Machine LightGBM, Light Gradient Boosting Machine
========== =========================================
[![Build Status](https://travis-ci.org/Microsoft/LightGBM.svg?branch=master)](https://travis-ci.org/Microsoft/LightGBM) [![Build Status](https://travis-ci.org/Microsoft/LightGBM.svg?branch=master)](https://travis-ci.org/Microsoft/LightGBM)
LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages: LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:
...@@ -14,6 +14,11 @@ For more details, please refer to [Features](https://github.com/Microsoft/LightG ...@@ -14,6 +14,11 @@ For more details, please refer to [Features](https://github.com/Microsoft/LightG
[Experiments](https://github.com/Microsoft/LightGBM/wiki/Experiments#comparison-experiment) on public datasets show that LightGBM can outperform other existing boosting framework on both efficiency and accuracy, with significant lower memory consumption. What's more, the [experiments](https://github.com/Microsoft/LightGBM/wiki/Experiments#parallel-experiment) show that LightGBM can achieve a linear speed-up by using multiple machines for training in specific settings. [Experiments](https://github.com/Microsoft/LightGBM/wiki/Experiments#comparison-experiment) on public datasets show that LightGBM can outperform other existing boosting framework on both efficiency and accuracy, with significant lower memory consumption. What's more, the [experiments](https://github.com/Microsoft/LightGBM/wiki/Experiments#parallel-experiment) show that LightGBM can achieve a linear speed-up by using multiple machines for training in specific settings.
News
----
12/02/2016 : Release [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) beta version. You are welcome to try it out and provide feedback via issues.
Get Started Get Started
------------ ------------
To get started, please follow the [Installation Guide](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide) and [Quick Start](https://github.com/Microsoft/LightGBM/wiki/Quick-Start). To get started, please follow the [Installation Guide](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide) and [Quick Start](https://github.com/Microsoft/LightGBM/wiki/Quick-Start).
......
# coding: utf-8
# pylint: disable = invalid-name, C0111
"""Walk-through example for the plain lightgbm training API.

Loads a tab-separated regression dataset, trains a GBDT model with early
stopping, saves/reloads the model, evaluates RMSE, and dumps the model to
JSON. NOTE(review): indentation of this view was mangled by the diff paste;
only comments/docstrings were added here, all code tokens are unchanged.
"""
import json
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

# load or create your dataset
# column 0 is the label, the remaining columns are features
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# or you can simply use a tuple of length=2 here
# (the Dataset objects above are deliberately replaced to demonstrate this)
lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test)

# specify your configurations as a dict
params = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'regression',
    'metric' : 'l2',
    'num_leaves' : 31,
    'learning_rate' : 0.05,
    'feature_fraction' : 0.9,
    'bagging_fraction' : 0.8,
    'bagging_freq': 5,
    # 'ndcg_eval_at' : [1, 3, 5, 10],
    # this metric is not needed in this task, show as an example
    'verbose' : 0
}

# train
# NOTE(review): `valid_datas` is the keyword of this historical lightgbm
# version; later releases renamed it — confirm against the bundled engine.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_datas=lgb_eval,
                # you can use a list to represent multiple valid_datas/valid_names
                # don't use tuple, tuple is used to represent one dataset
                early_stopping_rounds=10)

# save model to file
gbm.save_model('model.txt')

# load model from file
gbm = lgb.Booster(model_file='model.txt')

# predict
# num_iteration=gbm.best_iteration restricts prediction to the best round
# found by early stopping
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# dump model to json (and save to file)
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)
# coding: utf-8
# pylint: disable = invalid-name, C0111
"""Walk-through example for the lightgbm scikit-learn wrapper (LGBMRegressor).

NOTE(review): indentation of this view was mangled by the diff paste; only
comments/docstrings were added here, all code tokens are unchanged.
"""
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

# load or create your dataset
# column 0 is the label, the remaining columns are features
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# train with early stopping on the held-out set
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=100)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=10)

# predict
# num_iteration=gbm.best_iteration restricts prediction to the best round
# found by early stopping
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
...@@ -51,6 +51,18 @@ public: ...@@ -51,6 +51,18 @@ public:
explicit BinMapper(const void* memory); explicit BinMapper(const void* memory);
~BinMapper(); ~BinMapper();
/*!
* \brief Check whether another BinMapper uses exactly the same binning.
* \param other BinMapper to compare against
* \return true if both mappers have the same number of bins and identical
*         bin upper bounds (element-wise), false otherwise
*/
bool CheckAlign(const BinMapper& other) const {
  if (num_bin_ != other.num_bin_) {
    return false;
  }
  // NOTE(review): exact floating-point equality — presumably aligned mappers
  // are expected to be built from identical data; confirm no tolerance needed
  for (int i = 0; i < num_bin_; ++i) {
    if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
      return false;
    }
  }
  return true;
}
/*! \brief Get number of bins */ /*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; } inline int num_bin() const { return num_bin_; }
/*! \brief True if bin is trival (contains only one bin) */ /*! \brief True if bin is trival (contains only one bin) */
......
...@@ -35,12 +35,34 @@ public: ...@@ -35,12 +35,34 @@ public:
const ObjectiveFunction* object_function, const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) = 0; const std::vector<const Metric*>& training_metrics) = 0;
/*!
* \brief Merge model from other boosting object
Will insert to the front of current boosting object
* \param other
*/
virtual void MergeFrom(const Boosting* other) = 0;
/*!
* \brief Reset training data for current boosting
* \param config Configs for boosting
* \param train_data Training data
* \param object_function Training objective function
* \param training_metrics Training metric
*/
virtual void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function, const std::vector<const Metric*>& training_metrics) = 0;
/*!
* \brief Reset shrinkage_rate data for current boosting
* \param shrinkage_rate Configs for boosting
*/
virtual void ResetShrinkageRate(double shrinkage_rate) = 0;
/*! /*!
* \brief Add a validation data * \brief Add a validation data
* \param valid_data Validation data * \param valid_data Validation data
* \param valid_metrics Metric for validation data * \param valid_metrics Metric for validation data
*/ */
virtual void AddDataset(const Dataset* valid_data, virtual void AddValidDataset(const Dataset* valid_data,
const std::vector<const Metric*>& valid_metrics) = 0; const std::vector<const Metric*>& valid_metrics) = 0;
/*! /*!
...@@ -52,6 +74,19 @@ public: ...@@ -52,6 +74,19 @@ public:
*/ */
virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) = 0; virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) = 0;
/*!
* \brief Rollback one iteration
*/
virtual void RollbackOneIter() = 0;
/*!
* \brief return current iteration
*/
virtual int GetCurrentIteration() const = 0;
/*!
* \brief Eval metrics and check is met early stopping or not
*/
virtual bool EvalAndCheckEarlyStopping() = 0; virtual bool EvalAndCheckEarlyStopping() = 0;
/*! /*!
* \brief Get evaluation result at data_idx data * \brief Get evaluation result at data_idx data
...@@ -73,7 +108,7 @@ public: ...@@ -73,7 +108,7 @@ public:
* \param result used to store prediction result, should allocate memory before call this function * \param result used to store prediction result, should allocate memory before call this function
* \param out_len lenght of returned score * \param out_len lenght of returned score
*/ */
virtual void GetPredictAt(int data_idx, score_t* result, data_size_t* out_len) const = 0; virtual void GetPredictAt(int data_idx, score_t* result, data_size_t* out_len) = 0;
/*! /*!
* \brief Prediction for one record, not sigmoid transform * \brief Prediction for one record, not sigmoid transform
...@@ -98,12 +133,18 @@ public: ...@@ -98,12 +133,18 @@ public:
const double* feature_values) const = 0; const double* feature_values) const = 0;
/*! /*!
* \brief save model to file * \brief Dump model to json format string
* \param num_used_model number of model that want to save, -1 means save all * \return Json format string of model
* \param is_finish is training finished or not */
* \param filename filename that want to save to virtual std::string DumpModel() const = 0;
/*!
* \brief Save model to file
* \param num_used_model Number of model that want to save, -1 means save all
* \param is_finish Is training finished or not
* \param filename Filename that want to save to
*/ */
virtual void SaveModelToFile(int num_used_model, bool is_finish, const char* filename) = 0; virtual void SaveModelToFile(int num_iterations, const char* filename) const = 0;
/*! /*!
* \brief Restore from a serialized string * \brief Restore from a serialized string
...@@ -127,7 +168,7 @@ public: ...@@ -127,7 +168,7 @@ public:
* \brief Get number of weak sub-models * \brief Get number of weak sub-models
* \return Number of weak sub-models * \return Number of weak sub-models
*/ */
virtual int NumberOfSubModels() const = 0; virtual int NumberOfTotalModel() const = 0;
/*! /*!
* \brief Get number of classes * \brief Get number of classes
...@@ -138,7 +179,7 @@ public: ...@@ -138,7 +179,7 @@ public:
/*! /*!
* \brief Set number of used model for prediction * \brief Set number of used model for prediction
*/ */
virtual void SetNumUsedModel(int num_used_model) = 0; virtual void SetNumIterationForPred(int num_iteration) = 0;
/*! /*!
* \brief Get Type name of this boosting object * \brief Get Type name of this boosting object
...@@ -151,6 +192,8 @@ public: ...@@ -151,6 +192,8 @@ public:
/*! \brief Disable copy */ /*! \brief Disable copy */
Boosting(const Boosting&) = delete; Boosting(const Boosting&) = delete;
static void LoadFileToBoosting(Boosting* boosting, const char* filename);
/*! /*!
* \brief Create boosting object * \brief Create boosting object
* \param type Type of boosting * \param type Type of boosting
......
This diff is collapsed.
...@@ -72,6 +72,8 @@ public: ...@@ -72,6 +72,8 @@ public:
inline bool GetBool( inline bool GetBool(
const std::unordered_map<std::string, std::string>& params, const std::unordered_map<std::string, std::string>& params,
const std::string& name, bool* out); const std::string& name, bool* out);
static std::unordered_map<std::string, std::string> Str2Map(const char* parameters);
}; };
/*! \brief Types of boosting */ /*! \brief Types of boosting */
...@@ -97,7 +99,7 @@ public: ...@@ -97,7 +99,7 @@ public:
std::string output_result = "LightGBM_predict_result.txt"; std::string output_result = "LightGBM_predict_result.txt";
std::string input_model = ""; std::string input_model = "";
int verbosity = 1; int verbosity = 1;
int num_model_predict = NO_LIMIT; int num_iteration_predict = -1;
bool is_pre_partition = false; bool is_pre_partition = false;
bool is_enable_sparse = true; bool is_enable_sparse = true;
bool use_two_round_loading = false; bool use_two_round_loading = false;
...@@ -136,6 +138,8 @@ public: ...@@ -136,6 +138,8 @@ public:
bool is_unbalance = false; bool is_unbalance = false;
// for multiclass // for multiclass
int num_class = 1; int num_class = 1;
// Balancing of positive and negative weights
double scale_pos_weight = 1.0f;
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -164,12 +168,12 @@ public: ...@@ -164,12 +168,12 @@ public:
int feature_fraction_seed = 2; int feature_fraction_seed = 2;
double feature_fraction = 1.0f; double feature_fraction = 1.0f;
// max cache size(unit:MB) for historical histogram. < 0 means not limit // max cache size(unit:MB) for historical histogram. < 0 means not limit
double histogram_pool_size = NO_LIMIT; double histogram_pool_size = -1.0f;
// max depth of tree model. // max depth of tree model.
// Still grow tree by leaf-wise, but limit the max depth to avoid over-fitting // Still grow tree by leaf-wise, but limit the max depth to avoid over-fitting
// And the max leaves will be min(num_leaves, pow(2, max_depth - 1)) // And the max leaves will be min(num_leaves, pow(2, max_depth - 1))
// max_depth < 0 means not limit // max_depth < 0 means not limit
int max_depth = NO_LIMIT; int max_depth = -1;
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -231,7 +235,7 @@ public: ...@@ -231,7 +235,7 @@ public:
MetricConfig metric_config; MetricConfig metric_config;
void Set(const std::unordered_map<std::string, std::string>& params) override; void Set(const std::unordered_map<std::string, std::string>& params) override;
void LoadFromString(const char* str);
private: private:
void GetBoostingType(const std::unordered_map<std::string, std::string>& params); void GetBoostingType(const std::unordered_map<std::string, std::string>& params);
...@@ -328,17 +332,22 @@ struct ParameterAlias { ...@@ -328,17 +332,22 @@ struct ParameterAlias {
{ "ndcg_at", "ndcg_eval_at" }, { "ndcg_at", "ndcg_eval_at" },
{ "min_data_per_leaf", "min_data_in_leaf" }, { "min_data_per_leaf", "min_data_in_leaf" },
{ "min_data", "min_data_in_leaf" }, { "min_data", "min_data_in_leaf" },
{ "min_child_samples", "min_data_in_leaf" },
{ "min_sum_hessian_per_leaf", "min_sum_hessian_in_leaf" }, { "min_sum_hessian_per_leaf", "min_sum_hessian_in_leaf" },
{ "min_sum_hessian", "min_sum_hessian_in_leaf" }, { "min_sum_hessian", "min_sum_hessian_in_leaf" },
{ "min_hessian", "min_sum_hessian_in_leaf" }, { "min_hessian", "min_sum_hessian_in_leaf" },
{ "min_child_weight", "min_sum_hessian_in_leaf" },
{ "num_leaf", "num_leaves" }, { "num_leaf", "num_leaves" },
{ "sub_feature", "feature_fraction" }, { "sub_feature", "feature_fraction" },
{ "colsample_bytree", "feature_fraction" },
{ "num_iteration", "num_iterations" }, { "num_iteration", "num_iterations" },
{ "num_tree", "num_iterations" }, { "num_tree", "num_iterations" },
{ "num_round", "num_iterations" }, { "num_round", "num_iterations" },
{ "num_trees", "num_iterations" }, { "num_trees", "num_iterations" },
{ "num_rounds", "num_iterations" }, { "num_rounds", "num_iterations" },
{ "sub_row", "bagging_fraction" }, { "sub_row", "bagging_fraction" },
{ "subsample", "bagging_fraction" },
{ "subsample_freq", "bagging_freq" },
{ "shrinkage_rate", "learning_rate" }, { "shrinkage_rate", "learning_rate" },
{ "tree", "tree_learner" }, { "tree", "tree_learner" },
{ "num_machine", "num_machines" }, { "num_machine", "num_machines" },
...@@ -361,6 +370,9 @@ struct ParameterAlias { ...@@ -361,6 +370,9 @@ struct ParameterAlias {
{ "blacklist", "ignore_column" }, { "blacklist", "ignore_column" },
{ "predict_raw_score", "is_predict_raw_score" }, { "predict_raw_score", "is_predict_raw_score" },
{ "predict_leaf_index", "is_predict_leaf_index" }, { "predict_leaf_index", "is_predict_leaf_index" },
{ "min_split_gain", "min_gain_to_split" },
{ "reg_alpha", "lambda_l1" },
{ "reg_lambda", "lambda_l2" },
{ "num_classes", "num_class" } { "num_classes", "num_class" }
}); });
std::unordered_map<std::string, std::string> tmp_map; std::unordered_map<std::string, std::string> tmp_map;
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <functional> #include <functional>
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include <mutex>
namespace LightGBM { namespace LightGBM {
...@@ -46,6 +47,13 @@ public: ...@@ -46,6 +47,13 @@ public:
*/ */
void Init(const char* data_filename, const int num_class); void Init(const char* data_filename, const int num_class);
/*! /*!
* \brief init as subset
* \param metadata Filename of data
* \param used_indices
* \param num_used_indices
*/
void Init(const Metadata& metadata, const data_size_t* used_indices, data_size_t num_used_indices);
/*!
* \brief Initial with binary memory * \brief Initial with binary memory
* \param memory Pointer to memory * \param memory Pointer to memory
*/ */
...@@ -76,13 +84,14 @@ public: ...@@ -76,13 +84,14 @@ public:
void CheckOrPartition(data_size_t num_all_data, void CheckOrPartition(data_size_t num_all_data,
const std::vector<data_size_t>& used_data_indices); const std::vector<data_size_t>& used_data_indices);
void SetLabel(const float* label, data_size_t len); void SetLabel(const float* label, data_size_t len);
void SetWeights(const float* weights, data_size_t len); void SetWeights(const float* weights, data_size_t len);
void SetQueryBoundaries(const data_size_t* query_boundaries, data_size_t len); void SetQueryBoundaries(const data_size_t* query_boundaries, data_size_t len);
void SetQueryId(const data_size_t* query_id, data_size_t len);
/*! /*!
* \brief Set initial scores * \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score. * \param init_score Initial scores, this class will manage memory for init_score.
...@@ -141,8 +150,13 @@ public: ...@@ -141,8 +150,13 @@ public:
* \brief Get weights, if not exists, will return nullptr * \brief Get weights, if not exists, will return nullptr
* \return Pointer of weights * \return Pointer of weights
*/ */
inline const float* weights() inline const float* weights() const {
const { return weights_.data(); } if (weights_.size() > 0) {
return weights_.data();
} else {
return nullptr;
}
}
/*! /*!
* \brief Get data boundaries on queries, if not exists, will return nullptr * \brief Get data boundaries on queries, if not exists, will return nullptr
...@@ -151,8 +165,13 @@ public: ...@@ -151,8 +165,13 @@ public:
* is the data indices for query i. * is the data indices for query i.
* \return Pointer of data boundaries on queries * \return Pointer of data boundaries on queries
*/ */
inline const data_size_t* query_boundaries() inline const data_size_t* query_boundaries() const {
const { return query_boundaries_.data(); } if (query_boundaries_.size() > 0) {
return query_boundaries_.data();
} else {
return nullptr;
}
}
/*! /*!
* \brief Get Number of queries * \brief Get Number of queries
...@@ -164,13 +183,25 @@ public: ...@@ -164,13 +183,25 @@ public:
* \brief Get weights for queries, if not exists, will return nullptr * \brief Get weights for queries, if not exists, will return nullptr
* \return Pointer of weights for queries * \return Pointer of weights for queries
*/ */
inline const float* query_weights() const { return query_weights_.data(); } inline const float* query_weights() const {
if (query_weights_.size() > 0) {
return query_weights_.data();
} else {
return nullptr;
}
}
/*! /*!
* \brief Get initial scores, if not exists, will return nullptr * \brief Get initial scores, if not exists, will return nullptr
* \return Pointer of initial scores * \return Pointer of initial scores
*/ */
inline const float* init_score() const { return init_score_.data(); } inline const float* init_score() const {
if (init_score_.size() > 0) {
return init_score_.data();
} else {
return nullptr;
}
}
/*! \brief Disable copy */ /*! \brief Disable copy */
Metadata& operator=(const Metadata&) = delete; Metadata& operator=(const Metadata&) = delete;
...@@ -210,6 +241,8 @@ private: ...@@ -210,6 +241,8 @@ private:
std::vector<float> init_score_; std::vector<float> init_score_;
/*! \brief Queries data */ /*! \brief Queries data */
std::vector<data_size_t> queries_; std::vector<data_size_t> queries_;
/*! \brief mutex for threading safe call */
std::mutex mutex_;
}; };
...@@ -253,6 +286,27 @@ public: ...@@ -253,6 +286,27 @@ public:
/*! \brief Destructor */ /*! \brief Destructor */
~Dataset(); ~Dataset();
/*!
* \brief Check whether another Dataset has an aligned feature layout.
* \param other Dataset to compare against
* \return true if the used/total feature counts, class count, label index
*         and every per-feature bin mapping match, false otherwise
*/
bool CheckAlign(const Dataset& other) const {
  if (num_features_ != other.num_features_) {
    return false;
  }
  if (num_total_features_ != other.num_total_features_) {
    return false;
  }
  if (num_class_ != other.num_class_) {
    return false;
  }
  if (label_idx_ != other.label_idx_) {
    return false;
  }
  // per-feature comparison is delegated to Feature::CheckAlign
  for (int i = 0; i < num_features_; ++i) {
    if (!features_[i]->CheckAlign(*(other.features_[i].get()))) {
      return false;
    }
  }
  return true;
}
inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) { inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) { for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
int feature_idx = used_feature_map_[i]; int feature_idx = used_feature_map_[i];
...@@ -282,6 +336,8 @@ public: ...@@ -282,6 +336,8 @@ public:
} }
} }
Dataset* Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const;
void FinishLoad(); void FinishLoad();
bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element); bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
...@@ -348,12 +404,12 @@ private: ...@@ -348,12 +404,12 @@ private:
int num_class_; int num_class_;
/*! \brief Store some label level data*/ /*! \brief Store some label level data*/
Metadata metadata_; Metadata metadata_;
/*! \brief True if dataset is loaded from binary file */
bool is_loading_from_binfile_;
/*! \brief index of label column */ /*! \brief index of label column */
int label_idx_ = 0; int label_idx_ = 0;
/*! \brief store feature names */ /*! \brief store feature names */
std::vector<std::string> feature_names_; std::vector<std::string> feature_names_;
/*! \brief store feature names */
static const char* binary_file_token;
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -49,7 +49,7 @@ private: ...@@ -49,7 +49,7 @@ private:
void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset); void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);
/*! \brief Check can load from binary file */ /*! \brief Check can load from binary file */
bool CheckCanLoadFromBin(const char* filename); std::string CheckCanLoadFromBin(const char* filename);
const IOConfig& io_config_; const IOConfig& io_config_;
/*! \brief Random generator*/ /*! \brief Random generator*/
......
...@@ -63,6 +63,13 @@ public: ...@@ -63,6 +63,13 @@ public:
~Feature() { ~Feature() {
} }
/*!
* \brief Check whether another Feature shares this feature's index and binning.
* \param other Feature to compare against
* \return true if the feature index matches and the bin mappers align
*/
bool CheckAlign(const Feature& other) const {
  if (feature_index_ != other.feature_index_) {
    return false;
  }
  return bin_mapper_->CheckAlign(*(other.bin_mapper_.get()));
}
/*! /*!
* \brief Push one record, will auto convert to bin and push to bin data * \brief Push one record, will auto convert to bin and push to bin data
* \param tid Thread id * \param tid Thread id
...@@ -73,6 +80,9 @@ public: ...@@ -73,6 +80,9 @@ public:
unsigned int bin = bin_mapper_->ValueToBin(value); unsigned int bin = bin_mapper_->ValueToBin(value);
bin_data_->Push(tid, line_idx, bin); bin_data_->Push(tid, line_idx, bin);
} }
/*!
* \brief Push an already-binned value directly into the bin data,
*        skipping the ValueToBin conversion done by the raw-value push above.
* \param tid Thread id
* \param line_idx Index of record
* \param bin Pre-computed bin value
*/
inline void PushBin(int tid, data_size_t line_idx, unsigned int bin) {
  bin_data_->Push(tid, line_idx, bin);
}
inline void FinishLoad() { bin_data_->FinishLoad(); } inline void FinishLoad() { bin_data_->FinishLoad(); }
/*! \brief Index of this feature */ /*! \brief Index of this feature */
inline int feature_index() const { return feature_index_; } inline int feature_index() const { return feature_index_; }
......
...@@ -24,7 +24,6 @@ using ReduceFunction = std::function<void(const char*, char*, int)>; ...@@ -24,7 +24,6 @@ using ReduceFunction = std::function<void(const char*, char*, int)>;
using PredictFunction = using PredictFunction =
std::function<std::vector<double>(const std::vector<std::pair<int, double>>&)>; std::function<std::vector<double>(const std::vector<std::pair<int, double>>&)>;
#define NO_LIMIT (-1)
#define NO_SPECIFIC (-1) #define NO_SPECIFIC (-1)
} // namespace LightGBM } // namespace LightGBM
......
...@@ -24,8 +24,7 @@ public: ...@@ -24,8 +24,7 @@ public:
* \param metadata Label data * \param metadata Label data
* \param num_data Number of data * \param num_data Number of data
*/ */
virtual void Init(const char* test_name, virtual void Init(const Metadata& metadata, data_size_t num_data) = 0;
const Metadata& metadata, data_size_t num_data) = 0;
virtual const std::vector<std::string>& GetName() const = 0; virtual const std::vector<std::string>& GetName() const = 0;
......
...@@ -98,13 +98,12 @@ public: ...@@ -98,13 +98,12 @@ public:
} }
} }
/*! \brief Serialize this object by string*/ /*! \brief Serialize this object to string*/
std::string ToString(); std::string ToString();
/*! \brief Disable copy */ /*! \brief Serialize this object to json*/
Tree& operator=(const Tree&) = delete; std::string ToJSON();
/*! \brief Disable copy */
Tree(const Tree&) = delete;
private: private:
/*! /*!
* \brief Find leaf index of which record belongs by data * \brief Find leaf index of which record belongs by data
...@@ -122,6 +121,9 @@ private: ...@@ -122,6 +121,9 @@ private:
*/ */
inline int GetLeaf(const double* feature_values) const; inline int GetLeaf(const double* feature_values) const;
/*! \brief Serialize one node to json*/
inline std::string NodeToJSON(int index);
/*! \brief Number of max leaves*/ /*! \brief Number of max leaves*/
int max_leaves_; int max_leaves_;
/*! \brief Number of current levas*/ /*! \brief Number of current levas*/
...@@ -141,13 +143,13 @@ private: ...@@ -141,13 +143,13 @@ private:
std::vector<double> threshold_; std::vector<double> threshold_;
/*! \brief A non-leaf node's split gain */ /*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_; std::vector<double> split_gain_;
/*! \brief Output of internal nodes(save internal output for per inference feature importance calc) */
std::vector<double> internal_value_;
// used for leaf node // used for leaf node
/*! \brief The parent of leaf */ /*! \brief The parent of leaf */
std::vector<int> leaf_parent_; std::vector<int> leaf_parent_;
/*! \brief Output of leaves */ /*! \brief Output of leaves */
std::vector<double> leaf_value_; std::vector<double> leaf_value_;
/*! \brief Output of internal nodes(save internal output for per inference feature importance calc) */
std::vector<double> internal_value_;
/*! \brief Depth for leaves */ /*! \brief Depth for leaves */
std::vector<int> leaf_depth_; std::vector<int> leaf_depth_;
}; };
......
...@@ -89,7 +89,11 @@ private: ...@@ -89,7 +89,11 @@ private:
// a trick to use static variable in header file. // a trick to use static variable in header file.
// May be not good, but avoid to use an additional cpp file // May be not good, but avoid to use an additional cpp file
static LogLevel& GetLevel() { static LogLevel level; return level; } #if defined(_MSC_VER)
static LogLevel& GetLevel() { static __declspec(thread) LogLevel level = LogLevel::Info; return level; }
#else
static LogLevel& GetLevel() { static thread_local LogLevel level = LogLevel::Info; return level; }
#endif
}; };
......
LightGBM Python Package
=======================
Installation
------------
1. Follow the `Installation Guide <https://github.com/Microsoft/LightGBM/wiki/Installation-Guide>`__ to build LightGBM first.
For the windows user, please change the build config to ``DLL``.
2. Install with ``cd python-package; python setup.py install``
Note: Make sure you have `setuptools <https://pypi.python.org/pypi/setuptools>`__ installed.
Examples
--------
- Refer also to the walk through examples in `python-guide
folder <https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide>`__
# coding: utf-8
"""LightGBM, Light Gradient Boosting Machine.
Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
"""
from __future__ import absolute_import
import os
from .basic import Predictor, Dataset, Booster
from .engine import train, cv
try:
from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
except ImportError:
pass
__version__ = 0.1
__all__ = ['Dataset', 'Booster',
'train', 'cv',
'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker']
This diff is collapsed.
# coding: utf-8
# pylint: disable = invalid-name, W0105
from __future__ import absolute_import
import collections
class EarlyStopException(Exception):
    """Raised internally to signal that training should stop early.

    Parameters
    ----------
    best_iteration : int
        Index of the best iteration at the moment training was stopped.
    """

    def __init__(self, best_iteration):
        super(EarlyStopException, self).__init__()
        # remember where the best score was seen so callers can truncate
        self.best_iteration = best_iteration
# Immutable snapshot of training state handed to every callback each round.
CallbackEnv = collections.namedtuple(
    "LightGBMCallbackEnv",
    ["model",
     "cvfolds",
     "iteration",
     "begin_iteration",
     "end_iteration",
     "evaluation_result_list"])
def _format_eval_result(value, show_stdv=True):
"""format metric string"""
if len(value) == 4:
return '%s\'s %s:%g' % (value[0], value[1], value[2])
elif len(value) == 5:
if show_stdv:
return '%s\'s %s:%g+%g' % (value[0], value[1], value[2], value[4])
else:
return '%s\'s %s:%g' % (value[0], value[1], value[2])
else:
raise ValueError("wrong metric value")
def print_evaluation(period=1, show_stdv=True):
    """Create a callback that print evaluation result.

    Parameters
    ----------
    period : int
        The period to log the evaluation results
    show_stdv : bool, optional
        Whether show stdv if provided

    Returns
    -------
    callback : function
        A callback that print evaluation every period iterations.
    """
    def callback(env):
        """internal function"""
        # period < 1 covers period=False (the old guard) AND period=0, which
        # previously reached the modulo below and raised ZeroDivisionError.
        if not env.evaluation_result_list or period < 1:
            return
        # NOTE(review): `env.iteration + 1 == env.begin_iteration` can only
        # match one round *before* begin_iteration — looks like it was meant
        # to force printing the first round; confirm intended.
        if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
            result = '\t'.join([_format_eval_result(x, show_stdv)
                                for x in env.evaluation_result_list])
            print('[%d]\t%s' % (env.iteration, result))
    return callback
def record_evaluation(eval_result):
    """Create a call back that records the evaluation history into eval_result.

    Parameters
    ----------
    eval_result : dict
        A dictionary to store the evaluation results.

    Returns
    -------
    callback : function
        The requested callback function.
    """
    if not isinstance(eval_result, dict):
        raise TypeError('eval_result has to be a dictionary')
    eval_result.clear()

    def init(env):
        """internal function"""
        # lay out one nested list per (dataset, metric) pair
        for data_name, eval_name, _, _ in env.evaluation_result_list:
            eval_result.setdefault(data_name, {}).setdefault(eval_name, [])

    def callback(env):
        """internal function"""
        if not eval_result:
            init(env)
        for data_name, eval_name, result, _ in env.evaluation_result_list:
            eval_result[data_name][eval_name].append(result)
    return callback
def reset_learning_rate(learning_rates):
    """Reset the learning rate before each iteration.

    NOTE: the initial learning rate still takes effect on the first iteration.

    Parameters
    ----------
    learning_rates : list or function
        List of learning rates, one per boosting round, or a function
        mapping (current_round, total_boost_round) to a learning rate
        (e.g. to implement learning-rate decay).
        - list l: learning_rate = l[current_round]
        - function f: learning_rate = f(current_round, total_boost_round)

    Returns
    -------
    callback : function
        The requested callback function (runs before each iteration).
    """
    def callback(env):
        """Compute and apply the learning rate for the upcoming round."""
        if isinstance(learning_rates, list):
            if len(learning_rates) != env.end_iteration:
                raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
            new_rate = learning_rates[env.iteration]
        else:
            new_rate = learning_rates(env.iteration, env.end_iteration)
        env.model.reset_parameter({'learning_rate': new_rate})
    callback.before_iteration = True
    return callback
def early_stop(stopping_rounds, verbose=True):
    """Create a callback that activates early stopping.

    Requires at least one validation set and one metric. When several
    (dataset, metric) pairs are present, each is tracked independently and
    training stops as soon as any of them fails to improve for
    ``stopping_rounds`` consecutive rounds.

    Parameters
    ----------
    stopping_rounds : int
        Number of rounds without improvement before training is stopped.
    verbose : optional, bool
        Whether to print message about early stopping information.

    Returns
    -------
    callback : function
        The requested callback function; stops training by raising
        EarlyStopException.
    """
    # Per-metric closure state, keyed by the metric's position in
    # env.evaluation_result_list; populated lazily on the first call.
    factor_to_bigger_better = {}
    best_score = {}
    best_iter = {}
    best_msg = {}
    def init(env):
        """internal function"""
        if len(env.evaluation_result_list) == 0:
            raise ValueError('For early stopping you need at least one set in evals.')
        if verbose:
            msg = "Train until valid scores didn't improve in {} rounds."
            print(msg.format(stopping_rounds))
        for i in range(len(env.evaluation_result_list)):
            # scores are compared after multiplying by +/-1 so that
            # "bigger is better" holds uniformly in callback() below
            best_score[i] = float('-inf')
            best_iter[i] = 0
            if verbose:
                best_msg[i] = ""
            factor_to_bigger_better[i] = -1.0
            # entry[3] is the metric's is_higher_better flag
            if env.evaluation_result_list[i][3]:
                factor_to_bigger_better[i] = 1.0
    def callback(env):
        """internal function"""
        if len(best_score) == 0:
            init(env)
        for i in range(len(env.evaluation_result_list)):
            score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i]
            if score > best_score[i]:
                best_score[i] = score
                best_iter[i] = env.iteration
                if verbose:
                    best_msg[i] = '[%d]\t%s' % (env.iteration, \
                        '\t'.join([_format_eval_result(x) for x in env.evaluation_result_list]))
            else:
                # no improvement for this metric: stop once patience runs out
                if env.iteration - best_iter[i] >= stopping_rounds:
                    # record best_iteration on the model when one is attached
                    # (train(); cv() passes model=None)
                    if env.model is not None:
                        env.model.set_attr(best_iteration=str(best_iter[i]))
                    if verbose:
                        print('early stopping, best iteration is:\n{}'.format(best_msg[i]))
                    raise EarlyStopException(best_iter[i])
    return callback
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Training Library containing training routines of LightGBM."""
from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from . import callback
def _construct_dataset(X_y, reference=None,
                       params=None, other_fields=None,
                       predictor=None):
    """Build a Dataset (or a validation set tied to ``reference``) from raw input.

    Parameters
    ----------
    X_y : string or (data, label) pair
        Filename of the data, or a (data, label) tuple.
    reference : Dataset or None
        When given, the result is created with ``reference.create_valid`` so it
        shares the training set's bin mappers.
    params : dict or None
        Dataset parameters; ``max_bin`` is read from here (default 255).
    other_fields : dict or None
        Optional extra fields: 'weight', 'group' and/or 'init_score'.
    predictor : Predictor or None
        Predictor used for continued training, forwarded to Dataset.

    Returns
    -------
    Dataset

    Raises
    ------
    TypeError
        If ``other_fields`` is not a dict, or ``X_y`` is a tuple whose
        length is not 2.
    """
    # guard the None default: `'max_bin' in None` raised TypeError before
    if params is not None and 'max_bin' in params:
        max_bin = int(params['max_bin'])
    else:
        max_bin = 255
    weight = None
    group = None
    init_score = None
    if other_fields is not None:
        if not isinstance(other_fields, dict):
            raise TypeError("other fields data should be dict type")
        weight = other_fields.get('weight')
        group = other_fields.get('group')
        init_score = other_fields.get('init_score')
    if is_str(X_y):
        # a plain string is taken as a data filename; label comes from the file
        data = X_y
        label = None
    else:
        if len(X_y) != 2:
            raise TypeError("should pass (data, label) pair")
        data, label = X_y
    if reference is None:
        ret = Dataset(data, label=label, max_bin=max_bin,
                      weight=weight, group=group,
                      predictor=predictor, params=params)
    else:
        ret = reference.create_valid(data, label=label, weight=weight,
                                     group=group, params=params)
    if init_score is not None:
        ret.set_init_score(init_score)
    return ret
def train(params, train_data, num_boost_round=100,
          valid_datas=None, valid_names=None,
          fobj=None, feval=None, init_model=None,
          train_fields=None, valid_fields=None,
          early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, learning_rates=None, callbacks=None):
    """Train with given parameters.

    Parameters
    ----------
    params : dict
        Parameters for training.
    train_data : Dataset, tuple (X, y) or filename of data
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    valid_datas : list of Datasets, tuples (valid_X, valid_y) or filename of data
        List of data to be evaluated during training.
    valid_names : list of string
        Names of valid_datas.
    fobj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
        Note: should return (eval_name, eval_result, is_higher_better) or list of this.
    init_model : file name of lightgbm model or 'Booster' instance
        Model used for continued training.
    train_fields : dict
        Other data file in training data. e.g. train_fields['weight'] is weight data.
        Supported fields: weight, group, init_score.
    valid_fields : dict
        Other data file in validation data.
        e.g. valid_fields[0]['weight'] is weight data for the first valid data.
        Supported fields: weight, group, init_score.
    early_stopping_rounds : int
        Activates early stopping.
        Requires at least one validation data and one metric.
        If there's more than one, will check all of them.
        Returns the model with (best_iter + early_stopping_rounds).
        If early stopping occurs, the model will add 'best_iteration' field.
    evals_result : dict or None
        Dictionary used to store all evaluation results of the items in valid_datas.
        Example: with valid_datas = [valid_set, train_set],
        valid_names = ['eval', 'train'] and params containing ('metric':'logloss'),
        it becomes {'train': {'logloss': ['0.48253', '0.35953', ...]},
                    'eval': {'logloss': ['0.480385', '0.357756', ...]}}.
        Passing None disables recording.
    verbose_eval : bool or int
        Requires at least one item in evals.
        If `verbose_eval` is True then the evaluation metric on the validation set is
        printed at each boosting stage.
        If `verbose_eval` is an integer then the evaluation metric on the validation set
        is printed at every given `verbose_eval` boosting stage. The last boosting stage
        / the boosting stage found by using `early_stopping_rounds` is also printed.
        Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
        is printed every 4 boosting stages, instead of every boosting stage.
    learning_rates : list or function
        List of learning rate for each boosting round,
        or a customized function that calculates the learning rate in terms of the
        current round and the total number of boosting rounds (e.g. yields
        learning rate decay).
        - list l: learning_rate = l[current_round]
        - function f: learning_rate = f(current_round, total_boost_round)
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.

    Returns
    -------
    booster : a trained booster model
    """
    # create predictor first (for continued training)
    if is_str(init_model):
        predictor = Predictor(model_file=init_model)
    elif isinstance(init_model, Booster):
        predictor = init_model.to_predictor()
    elif isinstance(init_model, Predictor):
        predictor = init_model
    else:
        predictor = None
    # create dataset
    if isinstance(train_data, Dataset):
        train_set = train_data
    else:
        train_set = _construct_dataset(train_data, None, params, train_fields, predictor)
    is_valid_contain_train = False
    train_data_name = "training"
    valid_sets = []
    name_valid_sets = []
    if valid_datas is not None:
        # a single validation input (Dataset, (X, y) pair or filename) is
        # wrapped into a list; without the is_str check a lone filename
        # string would be iterated character by character below
        if isinstance(valid_datas, (Dataset, tuple)) or is_str(valid_datas):
            valid_datas = [valid_datas]
        if isinstance(valid_names, str):
            valid_names = [valid_names]
        for i, valid_data in enumerate(valid_datas):
            other_fields = None if valid_fields is None else valid_fields[i]
            # reduce cost for prediction training data
            if valid_data is train_data:
                is_valid_contain_train = True
                if valid_names is not None:
                    train_data_name = valid_names[i]
                continue
            if isinstance(valid_data, Dataset):
                valid_set = valid_data
            else:
                valid_set = _construct_dataset(
                    valid_data,
                    train_set,
                    params,
                    other_fields,
                    predictor)
            valid_sets.append(valid_set)
            if valid_names is not None:
                name_valid_sets.append(valid_names[i])
            else:
                name_valid_sets.append('valid_' + str(i))
    # process callbacks: copy so the caller's list is not mutated by the
    # internal appends below
    callbacks = [] if callbacks is None else list(callbacks)
    # Most of legacy advanced options becomes callbacks
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation())
    elif isinstance(verbose_eval, int):
        callbacks.append(callback.print_evaluation(verbose_eval))
    if early_stopping_rounds is not None:
        callbacks.append(callback.early_stop(early_stopping_rounds,
                                             verbose=bool(verbose_eval)))
    if learning_rates is not None:
        callbacks.append(callback.reset_learning_rate(learning_rates))
    if evals_result is not None:
        callbacks.append(callback.record_evaluation(evals_result))
    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
    callbacks_after_iter = [
        cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
    # construct booster; normalize 'metric' to a list of names
    if 'metric' in params:
        if is_str(params['metric']):
            params['metric'] = params['metric'].split(',')
        else:
            params['metric'] = list(params['metric'])
    booster = Booster(params=params, train_set=train_set)
    if is_valid_contain_train:
        booster.set_train_data_name(train_data_name)
    for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
        booster.add_valid(valid_set, name_valid_set)
    # start training
    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=booster,
                                    cvfolds=None,
                                    iteration=i,
                                    begin_iteration=0,
                                    end_iteration=num_boost_round,
                                    evaluation_result_list=None))
        booster.update(fobj=fobj)
        evaluation_result_list = []
        # check evaluation result.
        if len(valid_sets) != 0:
            if is_valid_contain_train:
                evaluation_result_list.extend(booster.eval_train(feval))
            evaluation_result_list.extend(booster.eval_valid(feval))
        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(model=booster,
                                        cvfolds=None,
                                        iteration=i,
                                        begin_iteration=0,
                                        end_iteration=num_boost_round,
                                        evaluation_result_list=evaluation_result_list))
        except callback.EarlyStopException:
            break
    # expose the best iteration (1-based, as stored by the early-stop callback)
    if booster.attr('best_iteration') is not None:
        booster.best_iteration = int(booster.attr('best_iteration')) + 1
    else:
        booster.best_iteration = num_boost_round
    return booster
class CVBooster(object):
    """Auxiliary data structure to hold one fold of CV."""
    def __init__(self, train_set, valid_test, params):
        """Build the booster for this fold and register its validation set."""
        self.train_set = train_set
        self.valid_test = valid_test
        self.booster = Booster(params=params, train_set=train_set)
        self.booster.add_valid(valid_test, 'valid')
    def update(self, fobj):
        """Update this fold's booster for one iteration."""
        self.booster.update(fobj=fobj)
    def eval(self, feval):
        """Evaluate this fold's booster on its validation set."""
        return self.booster.eval_valid(feval)
# Optional sklearn dependency for stratified CV: try the newer
# sklearn.model_selection location first, then the legacy
# sklearn.cross_validation one; record availability in a flag so
# _make_n_folds can fail with a clear error when sklearn is missing.
try:
    try:
        from sklearn.model_selection import StratifiedKFold
    except ImportError:
        from sklearn.cross_validation import StratifiedKFold
    SKLEARN_StratifiedKFold = True
except ImportError:
    SKLEARN_StratifiedKFold = False
def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False):
    """
    Make an n-fold list of CVBooster from random indices.

    Parameters
    ----------
    full_data : Dataset
        Full training data to be split into folds.
    nfold : int
        Number of folds.
    param : dict
        Booster params; copied per fold when ``fpreproc`` is given.
    seed : int
        Seed for the numpy RNG / StratifiedKFold shuffling.
    fpreproc : function or None
        Optional preprocessing hook (train, valid, params) -> same triple.
    stratified : bool
        Use sklearn's StratifiedKFold (requires sklearn).

    Returns
    -------
    list of CVBooster
    """
    np.random.seed(seed)
    if stratified:
        if not SKLEARN_StratifiedKFold:
            raise LightGBMError('sklearn needs to be installed in order to use stratified cv')
        sfk = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
        # only the per-fold index sets are kept; the labels stand in for X
        # since split() just needs something of the right length
        idset = [x[1] for x in sfk.split(X=full_data.get_label(), y=full_data.get_label())]
    else:
        randidx = np.random.permutation(full_data.num_data())
        # array_split spreads any remainder over the first folds, so no sample
        # is silently dropped when num_data is not divisible by nfold (the
        # previous fixed-step slicing discarded the trailing remainder rows
        # from every fold, train and valid alike)
        idset = np.array_split(randidx, nfold)
    ret = []
    for k in range(nfold):
        # train on the union of all other folds, validate on fold k
        train_idx = np.concatenate([idset[i] for i in range(nfold) if k != i])
        train_set = full_data.subset(train_idx)
        valid_set = full_data.subset(idset[k])
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            train_set, valid_set, tparam = fpreproc(train_set, valid_set, param.copy())
        else:
            tparam = param
        ret.append(CVBooster(train_set, valid_set, tparam))
    return ret
def _agg_cv_result(raw_results):
"""
Aggregate cross-validation results.
"""
cvmap = {}
metric_type = {}
for one_result in raw_results:
for one_line in one_result:
key = one_line[1]
metric_type[key] = one_line[3]
if key not in cvmap:
cvmap[key] = []
cvmap[key].append(one_line[2])
results = []
for k, v in cvmap.items():
v = np.array(v)
mean, std = np.mean(v), np.std(v)
results.append(('cv_agg', k, mean, metric_type[k], std))
return results
def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
       metrics=(), fobj=None, feval=None, train_fields=None, early_stopping_rounds=None,
       fpreproc=None, verbose_eval=None, show_stdv=True, seed=0,
       callbacks=None):
    """Cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    train_data : pair, (X, y) or filename of data
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    nfold : int
        Number of folds in CV.
    stratified : bool
        Perform stratified sampling (requires sklearn).
    metrics : string or list of strings
        Evaluation metrics to be watched in CV, appended to params['metric'].
    fobj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    train_fields : dict
        Other data file in training data. e.g. train_fields['weight'] is weight data.
        Supported fields: weight, group, init_score.
    early_stopping_rounds : int
        Activates early stopping. CV error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue.
        Last entry in evaluation history is the one from best iteration.
    fpreproc : function
        Preprocessing function that takes (dtrain, dtest, param) and returns
        transformed versions of those.
    verbose_eval : bool, int, or None, default None
        Whether to display the progress. If None or False, progress is not
        displayed. If True, progress is displayed at every boosting stage.
        If an integer is given, progress is displayed at every given
        `verbose_eval` boosting stage.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contain std.
    seed : int
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.

    Returns
    -------
    results : dict
        Evaluation history: {metric-mean: [...], metric-std: [...]}.
    """
    if isinstance(metrics, str):
        metrics = [metrics]
    if isinstance(params, list):
        params = dict(params)
    # normalize params['metric'] to a list, then add any requested metrics
    if 'metric' not in params:
        params['metric'] = []
    elif is_str(params['metric']):
        params['metric'] = params['metric'].split(',')
    else:
        params['metric'] = list(params['metric'])
    if metrics:
        params['metric'].extend(metrics)
    train_set = _construct_dataset(train_data, None, params, train_fields)
    results = {}
    cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)
    # setup callbacks: copy so the caller's list is not mutated by the
    # internal appends below
    callbacks = [] if callbacks is None else list(callbacks)
    if early_stopping_rounds is not None:
        callbacks.append(callback.early_stop(early_stopping_rounds,
                                             verbose=False))
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
    elif isinstance(verbose_eval, int):
        callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
    callbacks_after_iter = [
        cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=None,
                                    cvfolds=cvfolds,
                                    iteration=i,
                                    begin_iteration=0,
                                    end_iteration=num_boost_round,
                                    evaluation_result_list=None))
        for fold in cvfolds:
            fold.update(fobj)
        res = _agg_cv_result([f.eval(feval) for f in cvfolds])
        for _, key, mean, _, std in res:
            results.setdefault(key + '-mean', []).append(mean)
            results.setdefault(key + '-std', []).append(std)
        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(model=None,
                                        cvfolds=cvfolds,
                                        iteration=i,
                                        begin_iteration=0,
                                        end_iteration=num_boost_round,
                                        evaluation_result_list=res))
        except callback.EarlyStopException as e:
            # truncate the recorded history to the best iteration
            for k in results:
                results[k] = results[k][:(e.best_iteration + 1)]
            break
    return results
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment